From 10742fee98eb3b3e8453ef27a33dee314b15f7bd Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 3 Jul 2019 14:07:54 +0200 Subject: [PATCH 001/721] eeprom: at24: remove unneeded include We used to have a call to ilog2() in AT24_DEVICE_MAGIC(). That's long gone so this header is no longer needed. Signed-off-by: Bartosz Golaszewski Acked-by: Wolfram Sang --- drivers/misc/eeprom/at24.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/misc/eeprom/at24.c b/drivers/misc/eeprom/at24.c index 35bf2477693d..fff5b340580c 100644 --- a/drivers/misc/eeprom/at24.c +++ b/drivers/misc/eeprom/at24.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include From 4f77e0956bd99901af2bfc6641437ff8bff8d4a4 Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Mon, 15 Jul 2019 14:34:16 -0600 Subject: [PATCH 002/721] PCI: Remove pci_block_cfg_access() et al (unused) Remove the following unused functions from include/linux/pci.h: pci_block_cfg_access() pci_block_cfg_access_in_atomic() pci_unblock_cfg_access() These were added by fb51ccbf217c ("PCI: Rework config space blocking services"), though no callers were added. Link: https://lore.kernel.org/r/20190715203416.37547-1-skunberg.kelsey@gmail.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas Reviewed-by: Lukas Bulwahn --- include/linux/pci.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/include/linux/pci.h b/include/linux/pci.h index 9e700d9f9f28..61b64fa1b0b1 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1747,11 +1747,6 @@ static inline void pci_release_regions(struct pci_dev *dev) { } static inline unsigned long pci_address_to_pio(phys_addr_t addr) { return -1; } -static inline void pci_block_cfg_access(struct pci_dev *dev) { } -static inline int pci_block_cfg_access_in_atomic(struct pci_dev *dev) -{ return 0; } -static inline void pci_unblock_cfg_access(struct pci_dev *dev) { } - static inline struct pci_bus *pci_find_next_bus(const struct pci_bus *from) { return NULL; } static inline struct pci_dev *pci_get_slot(struct pci_bus *bus, From fae6b93b19b42fcae0ef71e669b643f3366f6824 Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Wed, 17 Jul 2019 12:23:53 -0600 Subject: [PATCH 003/721] PCI: Unexport pci_bus_get() and pci_bus_put() pci_bus_get() and pci_bus_put() are not used by a loadable kernel module and do not need to be exported. These were exported by fe830ef62ac6 ("PCI: Introduce pci_bus_{get|put}() to manage PCI bus reference count"), but there are no loadable modules in the tree that use them. Link: https://lore.kernel.org/r/20190717182353.45557-1-skunberg.kelsey@gmail.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas Acked-by: Greg Kroah-Hartman --- drivers/pci/bus.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c index 495059d923f7..8e40b3e6da77 100644 --- a/drivers/pci/bus.c +++ b/drivers/pci/bus.c @@ -417,11 +417,9 @@ struct pci_bus *pci_bus_get(struct pci_bus *bus) get_device(&bus->dev); return bus; } -EXPORT_SYMBOL(pci_bus_get); void pci_bus_put(struct pci_bus *bus) { if (bus) put_device(&bus->dev); } -EXPORT_SYMBOL(pci_bus_put); From 70a658073726ae79cf9747b651f577d1fdf45c3a Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Wed, 17 Jul 2019 21:29:52 -0600 Subject: [PATCH 004/721] PCI: Unexport pci_bus_sem pci_bus_sem is not used by a loadable kernel module and does not need to be exported. It was exported by ce29ca3ea407 ("PCI: acpiphp: remove all functions in slot, even without ACPI _EJx"), which added a use of pci_bus_sem in acpiphp, which could be built as a module at that time. But since 6037a803b05e ("PCI: acpiphp: Convert acpiphp to be builtin only, not modular"), it can no longer be built as a module. Link: https://lore.kernel.org/r/20190718032951.40188-1-skunberg.kelsey@gmail.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas --- drivers/pci/search.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/pci/search.c b/drivers/pci/search.c index 7f4e65872b8d..bade14002fd8 100644 --- a/drivers/pci/search.c +++ b/drivers/pci/search.c @@ -15,7 +15,6 @@ #include "pci.h" DECLARE_RWSEM(pci_bus_sem); -EXPORT_SYMBOL_GPL(pci_bus_sem); /* * pci_for_each_dma_alias - Iterate over DMA aliases for a device From befa45fb5bdd514a40a19e659491193f7fd022f2 Mon Sep 17 00:00:00 2001 From: Fuqian Huang Date: Mon, 8 Jul 2019 20:33:54 +0800 Subject: [PATCH 005/721] PCI: Use devm_add_action_or_reset() devm_add_action_or_reset() is a helper function which internally calls devm_add_action(). If the devm_add_action() fails, it will execute the action mentioned and return the error code. Use devm_add_action_or_reset() to reduce source code size (avoid writing the action twice) and reduce the likelihood of bugs. Link: https://lore.kernel.org/r/20190708123354.12127-1-huangfq.daxian@gmail.com Signed-off-by: Fuqian Huang Signed-off-by: Bjorn Helgaas --- drivers/pci/controller/pci-host-common.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/pci/controller/pci-host-common.c b/drivers/pci/controller/pci-host-common.c index c742881b5061..c8cb9c5188a4 100644 --- a/drivers/pci/controller/pci-host-common.c +++ b/drivers/pci/controller/pci-host-common.c @@ -43,9 +43,8 @@ static struct pci_config_window *gen_pci_init(struct device *dev, goto err_out; } - err = devm_add_action(dev, gen_pci_unmap_cfg, cfg); + err = devm_add_action_or_reset(dev, gen_pci_unmap_cfg, cfg); if (err) { - gen_pci_unmap_cfg(cfg); goto err_out; } return cfg; From 197df18f7038178ef9c5806db1eff8e494381fa6 Mon Sep 17 00:00:00 2001 From: Nishka Dasgupta Date: Tue, 9 Jul 2019 23:01:32 +0530 Subject: [PATCH 006/721] mfd: max77620: Add of_node_put() before return Each iteration of for_each_child_of_node puts the previous node, but in the case of a return from the middle of the loop, there is no put, thus causing a memory leak. Hence add an of_node_put before the return. Issue found with Coccinelle. Signed-off-by: Nishka Dasgupta Signed-off-by: Lee Jones --- drivers/mfd/max77620.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/mfd/max77620.c b/drivers/mfd/max77620.c index 0c28965fcc6a..a851ff473a44 100644 --- a/drivers/mfd/max77620.c +++ b/drivers/mfd/max77620.c @@ -416,8 +416,10 @@ static int max77620_initialise_fps(struct max77620_chip *chip) for_each_child_of_node(fps_np, fps_child) { ret = max77620_config_fps(chip, fps_child); - if (ret < 0) + if (ret < 0) { + of_node_put(fps_child); return ret; + } } config = chip->enable_global_lpm ? MAX77620_ONOFFCNFG2_SLP_LPM_MSK : 0; From b5e29aa880be84c271be8d0726cec4018bfbfd74 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 22 Jul 2019 13:47:13 +0200 Subject: [PATCH 007/721] mfd: davinci_voicecodec: Remove pointless #include When building davinci as multiplatform, we get a build error in this file: drivers/mfd/davinci_voicecodec.c:22:10: fatal error: 'mach/hardware.h' file not found The header is only used to access the io_v2p() macro, but the result is already known because that comes from the resource a little bit earlier. Signed-off-by: Arnd Bergmann Acked-by: Sekhar Nori Signed-off-by: Lee Jones --- drivers/mfd/davinci_voicecodec.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/mfd/davinci_voicecodec.c b/drivers/mfd/davinci_voicecodec.c index 13ca7203e193..e5c8bc998eb4 100644 --- a/drivers/mfd/davinci_voicecodec.c +++ b/drivers/mfd/davinci_voicecodec.c @@ -19,7 +19,6 @@ #include #include -#include static const struct regmap_config davinci_vc_regmap = { .reg_bits = 32, @@ -31,6 +30,7 @@ static int __init davinci_vc_probe(struct platform_device *pdev) struct davinci_vc *davinci_vc; struct resource *res; struct mfd_cell *cell = NULL; + dma_addr_t fifo_base; int ret; davinci_vc = devm_kzalloc(&pdev->dev, @@ -48,6 +48,7 @@ static int __init davinci_vc_probe(struct platform_device *pdev) res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + fifo_base = (dma_addr_t)res->start; davinci_vc->base = devm_ioremap_resource(&pdev->dev, res); if (IS_ERR(davinci_vc->base)) { ret = PTR_ERR(davinci_vc->base); @@ -70,8 +71,7 @@ static int __init davinci_vc_probe(struct platform_device *pdev) } davinci_vc->davinci_vcif.dma_tx_channel = res->start; - davinci_vc->davinci_vcif.dma_tx_addr = - (dma_addr_t)(io_v2p(davinci_vc->base) + DAVINCI_VC_WFIFO); + davinci_vc->davinci_vcif.dma_tx_addr = fifo_base + DAVINCI_VC_WFIFO; res = platform_get_resource(pdev, IORESOURCE_DMA, 1); if (!res) { @@ -81,8 +81,7 @@ static int __init davinci_vc_probe(struct platform_device *pdev) } davinci_vc->davinci_vcif.dma_rx_channel = res->start; - davinci_vc->davinci_vcif.dma_rx_addr = - (dma_addr_t)(io_v2p(davinci_vc->base) + DAVINCI_VC_RFIFO); + davinci_vc->davinci_vcif.dma_rx_addr = fifo_base + DAVINCI_VC_RFIFO; davinci_vc->dev = &pdev->dev; davinci_vc->pdev = pdev; From c776dd50196a07a0ef943fcb74e8c86f9a5a20a8 Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Wed, 24 Jul 2019 17:38:38 -0600 Subject: [PATCH 008/721] PCI: Make PCI_PM_* delay times private These delay time definitions: #define PCI_PM_D2_DELAY 200 #define PCI_PM_D3_WAIT 10 #define PCI_PM_D3COLD_WAIT 100 #define PCI_PM_BUS_WAIT 50 are only used in drivers/pci/ and do not need to be seen by the rest of the kernel. Move them to drivers/pci/pci.h so they're private to the PCI subsystem. Link: https://lore.kernel.org/r/20190724233848.73327-2-skunberg.kelsey@gmail.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.h | 5 +++++ include/linux/pci.h | 5 ----- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 1be03a97cb92..708413632429 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -39,6 +39,11 @@ int pci_probe_reset_function(struct pci_dev *dev); int pci_bridge_secondary_bus_reset(struct pci_dev *dev); int pci_bus_error_reset(struct pci_dev *dev); +#define PCI_PM_D2_DELAY 200 +#define PCI_PM_D3_WAIT 10 +#define PCI_PM_D3COLD_WAIT 100 +#define PCI_PM_BUS_WAIT 50 + /** * struct pci_platform_pm_ops - Firmware PM callbacks * diff --git a/include/linux/pci.h b/include/linux/pci.h index 61b64fa1b0b1..9c2c6e161cfd 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -145,11 +145,6 @@ static inline const char *pci_power_name(pci_power_t state) return pci_power_names[1 + (__force int) state]; } -#define PCI_PM_D2_DELAY 200 -#define PCI_PM_D3_WAIT 10 -#define PCI_PM_D3COLD_WAIT 100 -#define PCI_PM_BUS_WAIT 50 - /** * typedef pci_channel_state_t * From 669696ebbccc40e8096a2ee9809c028fc1538001 Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Wed, 24 Jul 2019 17:38:39 -0600 Subject: [PATCH 009/721] PCI: Make pci_check_pme_status(), pci_pme_wakeup_bus() private These interfaces: bool pci_check_pme_status(struct pci_dev *dev); void pci_pme_wakeup_bus(struct pci_bus *bus); are only used in drivers/pci/ and do not need to be seen by the rest of the kernel. Move them to drivers/pci/pci.h so they're private to the PCI subsystem. Link: https://lore.kernel.org/r/20190724233848.73327-3-skunberg.kelsey@gmail.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.h | 2 ++ include/linux/pci.h | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 708413632429..ad1fe54ab8ee 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -89,6 +89,8 @@ void pci_power_up(struct pci_dev *dev); void pci_disable_enabled_device(struct pci_dev *dev); int pci_finish_runtime_suspend(struct pci_dev *dev); void pcie_clear_root_pme_status(struct pci_dev *dev); +bool pci_check_pme_status(struct pci_dev *dev); +void pci_pme_wakeup_bus(struct pci_bus *bus); int __pci_pme_wakeup(struct pci_dev *dev, void *ign); void pci_pme_restore(struct pci_dev *dev); bool pci_dev_need_resume(struct pci_dev *dev); diff --git a/include/linux/pci.h b/include/linux/pci.h index 9c2c6e161cfd..b13fb829171e 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1236,8 +1236,6 @@ int pci_wake_from_d3(struct pci_dev *dev, bool enable); int pci_prepare_to_sleep(struct pci_dev *dev); int pci_back_from_sleep(struct pci_dev *dev); bool pci_dev_run_wake(struct pci_dev *dev); -bool pci_check_pme_status(struct pci_dev *dev); -void pci_pme_wakeup_bus(struct pci_bus *bus); void pci_d3cold_enable(struct pci_dev *dev); void pci_d3cold_disable(struct pci_dev *dev); bool pcie_relaxed_ordering_enabled(struct pci_dev *dev); From 975e1ac173058b8710e5979e97fc1397233301f3 Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Wed, 24 Jul 2019 17:38:40 -0600 Subject: [PATCH 010/721] PCI: Make pci_get_host_bridge_device(), pci_put_host_bridge_device() private These interfaces: struct device *pci_get_host_bridge_device(struct pci_dev *dev); void pci_put_host_bridge_device(struct device *dev); are only used in drivers/pci/ and do not need to be seen by the rest of the kernel. Move them to drivers/pci/pci.h so they're private to the PCI subsystem. Link: https://lore.kernel.org/r/20190724233848.73327-4-skunberg.kelsey@gmail.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.h | 3 +++ include/linux/pci.h | 3 --- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index ad1fe54ab8ee..b06c750e08d7 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -243,6 +243,9 @@ enum pci_bar_type { pci_bar_mem64, /* A 64-bit memory BAR */ }; +struct device *pci_get_host_bridge_device(struct pci_dev *dev); +void pci_put_host_bridge_device(struct device *dev); + int pci_configure_extended_tags(struct pci_dev *dev, void *ign); bool pci_bus_read_dev_vendor_id(struct pci_bus *bus, int devfn, u32 *pl, int crs_timeout); diff --git a/include/linux/pci.h b/include/linux/pci.h index b13fb829171e..ffb904e7895d 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -644,9 +644,6 @@ static inline struct pci_dev *pci_upstream_bridge(struct pci_dev *dev) return dev->bus->self; } -struct device *pci_get_host_bridge_device(struct pci_dev *dev); -void pci_put_host_bridge_device(struct device *dev); - #ifdef CONFIG_PCI_MSI static inline bool pci_dev_msi_enabled(struct pci_dev *pci_dev) { From 440589dd1068fb36b2fe10ccddf97e43267e3b8d Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Wed, 24 Jul 2019 17:38:41 -0600 Subject: [PATCH 011/721] PCI: Make pci_save_vc_state(), pci_restore_vc_state(), etc private These Virtual Channel interfaces: int pci_save_vc_state(struct pci_dev *dev); void pci_restore_vc_state(struct pci_dev *dev); void pci_allocate_vc_save_buffers(struct pci_dev *dev); are only used in drivers/pci/ and do not need to be seen by the rest of the kernel. Move them to drivers/pci/pci.h so they're private to the PCI subsystem. Link: https://lore.kernel.org/r/20190724233848.73327-5-skunberg.kelsey@gmail.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.h | 5 +++++ include/linux/pci.h | 5 ----- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index b06c750e08d7..19cda6e6fccd 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -130,6 +130,11 @@ void pci_vpd_release(struct pci_dev *dev); void pcie_vpd_create_sysfs_dev_files(struct pci_dev *dev); void pcie_vpd_remove_sysfs_dev_files(struct pci_dev *dev); +/* PCI Virtual Channel */ +int pci_save_vc_state(struct pci_dev *dev); +void pci_restore_vc_state(struct pci_dev *dev); +void pci_allocate_vc_save_buffers(struct pci_dev *dev); + /* PCI /proc functions */ #ifdef CONFIG_PROC_FS int pci_proc_attach_device(struct pci_dev *dev); diff --git a/include/linux/pci.h b/include/linux/pci.h index ffb904e7895d..3ff458f89190 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1239,11 +1239,6 @@ bool pcie_relaxed_ordering_enabled(struct pci_dev *dev); void pci_wakeup_bus(struct pci_bus *bus); void pci_bus_set_current_state(struct pci_bus *bus, pci_power_t state); -/* PCI Virtual Channel */ -int pci_save_vc_state(struct pci_dev *dev); -void pci_restore_vc_state(struct pci_dev *dev); -void pci_allocate_vc_save_buffers(struct pci_dev *dev); - /* For use by arch with custom probe code */ void set_pcie_port_type(struct pci_dev *pdev); void set_pcie_hotplug_bridge(struct pci_dev *pdev); From 003d3b2c5f835d897b3b10a13a9e1340630e93f1 Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Wed, 24 Jul 2019 17:38:42 -0600 Subject: [PATCH 012/721] PCI: Make pci_hotplug_io_size, mem_size, and bus_size private These symbols: extern unsigned long pci_hotplug_io_size; extern unsigned long pci_hotplug_mem_size; extern unsigned long pci_hotplug_bus_size; are only used in drivers/pci/ and do not need to be seen by the rest of the kernel. Move them to drivers/pci/pci.h so they're private to the PCI subsystem. Link: https://lore.kernel.org/r/20190724233848.73327-6-skunberg.kelsey@gmail.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.h | 3 +++ include/linux/pci.h | 4 ---- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 19cda6e6fccd..9698fa805e28 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -208,6 +208,9 @@ extern const struct attribute_group *pcibus_groups[]; extern const struct device_type pci_dev_type; extern const struct attribute_group *pci_bus_groups[]; +extern unsigned long pci_hotplug_io_size; +extern unsigned long pci_hotplug_mem_size; +extern unsigned long pci_hotplug_bus_size; /** * pci_match_one_device - Tell if a PCI device structure has a matching diff --git a/include/linux/pci.h b/include/linux/pci.h index 3ff458f89190..b5592b9b7f38 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2010,10 +2010,6 @@ extern unsigned long pci_cardbus_mem_size; extern u8 pci_dfl_cache_line_size; extern u8 pci_cache_line_size; -extern unsigned long pci_hotplug_io_size; -extern unsigned long pci_hotplug_mem_size; -extern unsigned long pci_hotplug_bus_size; - /* Architecture-specific versions may override these (weak) */ void pcibios_disable_device(struct pci_dev *dev); void pcibios_set_master(struct pci_dev *dev); From ecd29c1a38af4ba6e61382591dfe93f06f14ba25 Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Wed, 24 Jul 2019 17:38:43 -0600 Subject: [PATCH 013/721] PCI: Make pci_bus_get(), pci_bus_put() private These interfaces: struct pci_bus *pci_bus_get(struct pci_bus *bus); void pci_bus_put(struct pci_bus *bus); are only used in drivers/pci/ and do not need to be seen by the rest of the kernel. Move them to drivers/pci/pci.h so they're private to the PCI subsystem. Link: https://lore.kernel.org/r/20190724233848.73327-7-skunberg.kelsey@gmail.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.h | 2 ++ include/linux/pci.h | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 9698fa805e28..81e94c9f1c12 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -274,6 +274,8 @@ bool pci_bus_clip_resource(struct pci_dev *dev, int idx); void pci_reassigndev_resource_alignment(struct pci_dev *dev); void pci_disable_bridge_window(struct pci_dev *dev); +struct pci_bus *pci_bus_get(struct pci_bus *bus); +void pci_bus_put(struct pci_bus *bus); /* PCIe link information */ #define PCIE_SPEED2STR(speed) \ diff --git a/include/linux/pci.h b/include/linux/pci.h index b5592b9b7f38..4d1b6f07f8f1 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1282,8 +1282,6 @@ int pci_request_selected_regions_exclusive(struct pci_dev *, int, const char *); void pci_release_selected_regions(struct pci_dev *, int); /* drivers/pci/bus.c */ -struct pci_bus *pci_bus_get(struct pci_bus *bus); -void pci_bus_put(struct pci_bus *bus); void pci_add_resource(struct list_head *resources, struct resource *res); void pci_add_resource_offset(struct list_head *resources, struct resource *res, resource_size_t offset); From 5da78d95785daa9454336a88b2f86a8998f6c739 Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Wed, 24 Jul 2019 17:38:44 -0600 Subject: [PATCH 014/721] PCI: Make pcie_update_link_speed() private This interface: void pcie_update_link_speed(struct pci_bus *bus, u16 link_status); is only used in drivers/pci/ and does not need to be seen by the rest of the kernel. Move it to drivers/pci/pci.h so it's private to the PCI subsystem. Link: https://lore.kernel.org/r/20190724233848.73327-8-skunberg.kelsey@gmail.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.h | 1 + include/linux/pci.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 81e94c9f1c12..8459663358c5 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -299,6 +299,7 @@ u32 pcie_bandwidth_capable(struct pci_dev *dev, enum pci_bus_speed *speed, enum pcie_link_width *width); void __pcie_print_link_status(struct pci_dev *dev, bool verbose); void pcie_report_downtraining(struct pci_dev *dev); +void pcie_update_link_speed(struct pci_bus *bus, u16 link_status); /* Single Root I/O Virtualization */ struct pci_sriov { diff --git a/include/linux/pci.h b/include/linux/pci.h index 4d1b6f07f8f1..8301352a937f 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -987,7 +987,6 @@ struct pci_bus *pci_scan_root_bus(struct device *parent, int bus, int pci_scan_root_bus_bridge(struct pci_host_bridge *bridge); struct pci_bus *pci_add_new_bus(struct pci_bus *parent, struct pci_dev *dev, int busnr); -void pcie_update_link_speed(struct pci_bus *bus, u16 link_status); struct pci_slot *pci_create_slot(struct pci_bus *parent, int slot_nr, const char *name, struct hotplug_slot *hotplug); From b92b512a435da01b52de07e3dcc2f07a4ad404de Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Wed, 24 Jul 2019 17:38:45 -0600 Subject: [PATCH 015/721] PCI: Make pci_ats_init() private This interface: void pci_ats_init(struct pci_dev *dev); is only used in drivers/pci/ and does not need to be seen by the rest of the kernel. Move it to drivers/pci/pci.h so it's private to the PCI subsystem. Link: https://lore.kernel.org/r/20190724233848.73327-9-skunberg.kelsey@gmail.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.h | 7 ++++--- include/linux/pci.h | 2 -- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 8459663358c5..2bb59afb540e 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -439,11 +439,12 @@ static inline void pci_restore_dpc_state(struct pci_dev *dev) {} #endif #ifdef CONFIG_PCI_ATS +/* Address Translation Service */ +void pci_ats_init(struct pci_dev *dev); void pci_restore_ats_state(struct pci_dev *dev); #else -static inline void pci_restore_ats_state(struct pci_dev *dev) -{ -} +static inline void pci_ats_init(struct pci_dev *d) { } +static inline void pci_restore_ats_state(struct pci_dev *dev) { } #endif /* CONFIG_PCI_ATS */ #ifdef CONFIG_PCI_IOV diff --git a/include/linux/pci.h b/include/linux/pci.h index 8301352a937f..9b8d0c2a1c9a 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1761,13 +1761,11 @@ static inline bool pci_ats_disabled(void) { return true; } #ifdef CONFIG_PCI_ATS /* Address Translation Service */ -void pci_ats_init(struct pci_dev *dev); int pci_enable_ats(struct pci_dev *dev, int ps); void pci_disable_ats(struct pci_dev *dev); int pci_ats_queue_depth(struct pci_dev *dev); int pci_ats_page_aligned(struct pci_dev *dev); #else -static inline void pci_ats_init(struct pci_dev *d) { } static inline int pci_enable_ats(struct pci_dev *d, int ps) { return -ENODEV; } static inline void pci_disable_ats(struct pci_dev *d) { } static inline int pci_ats_queue_depth(struct pci_dev *d) { return -ENODEV; } From 72bde9ced373ca1e27332fd020d248151319a3d6 Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Wed, 24 Jul 2019 17:38:46 -0600 Subject: [PATCH 016/721] PCI: Make pcie_set_ecrc_checking(), pcie_ecrc_get_policy() private These interfaces: void pcie_set_ecrc_checking(struct pci_dev *dev); void pcie_ecrc_get_policy(char *str); are only used in drivers/pci/ and do not need to be seen by the rest of the kernel. Move them to drivers/pci/pci.h so they're private to the PCI subsystem. Link: https://lore.kernel.org/r/20190724233848.73327-10-skunberg.kelsey@gmail.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.h | 8 ++++++++ include/linux/pci.h | 8 -------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 2bb59afb540e..d3d006c95041 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -541,6 +541,14 @@ static inline void pcie_aspm_create_sysfs_dev_files(struct pci_dev *pdev) { } static inline void pcie_aspm_remove_sysfs_dev_files(struct pci_dev *pdev) { } #endif +#ifdef CONFIG_PCIE_ECRC +void pcie_set_ecrc_checking(struct pci_dev *dev); +void pcie_ecrc_get_policy(char *str); +#else +static inline void pcie_set_ecrc_checking(struct pci_dev *dev) { } +static inline void pcie_ecrc_get_policy(char *str) { } +#endif + #ifdef CONFIG_PCIE_PTM void pci_ptm_init(struct pci_dev *dev); #else diff --git a/include/linux/pci.h b/include/linux/pci.h index 9b8d0c2a1c9a..65502c564661 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1559,14 +1559,6 @@ bool pci_aer_available(void); static inline bool pci_aer_available(void) { return false; } #endif -#ifdef CONFIG_PCIE_ECRC -void pcie_set_ecrc_checking(struct pci_dev *dev); -void pcie_ecrc_get_policy(char *str); -#else -static inline void pcie_set_ecrc_checking(struct pci_dev *dev) { } -static inline void pcie_ecrc_get_policy(char *str) { } -#endif - bool pci_ats_disabled(void); #ifdef CONFIG_PCIE_PTM From ac6c26da29c12fa511c877c273ed5c939dc9e96c Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Wed, 24 Jul 2019 17:38:47 -0600 Subject: [PATCH 017/721] PCI: Make pci_enable_ptm() private This interface: int pci_enable_ptm(struct pci_dev *dev, u8 *granularity); is only used in drivers/pci/ and does not need to be seen by the rest of the kernel. Move it to drivers/pci/pci.h so it's private to the PCI subsystem. Link: https://lore.kernel.org/r/20190724233848.73327-11-skunberg.kelsey@gmail.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.h | 3 +++ include/linux/pci.h | 7 ------- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index d3d006c95041..8935fc1ff446 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -551,8 +551,11 @@ static inline void pcie_ecrc_get_policy(char *str) { } #ifdef CONFIG_PCIE_PTM void pci_ptm_init(struct pci_dev *dev); +int pci_enable_ptm(struct pci_dev *dev, u8 *granularity); #else static inline void pci_ptm_init(struct pci_dev *dev) { } +static inline int pci_enable_ptm(struct pci_dev *dev, u8 *granularity) +{ return -EINVAL; } #endif struct pci_dev_reset_methods { diff --git a/include/linux/pci.h b/include/linux/pci.h index 65502c564661..ef73b0c87f01 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1561,13 +1561,6 @@ static inline bool pci_aer_available(void) { return false; } bool pci_ats_disabled(void); -#ifdef CONFIG_PCIE_PTM -int pci_enable_ptm(struct pci_dev *dev, u8 *granularity); -#else -static inline int pci_enable_ptm(struct pci_dev *dev, u8 *granularity) -{ return -EINVAL; } -#endif - void pci_cfg_access_lock(struct pci_dev *dev); bool pci_cfg_access_trylock(struct pci_dev *dev); void pci_cfg_access_unlock(struct pci_dev *dev); From 621f7e354fd8f99db0e3f7a0e8cda238caee9f3a Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Wed, 24 Jul 2019 17:38:48 -0600 Subject: [PATCH 018/721] PCI: Make pci_set_of_node(), etc private These interfaces: void pci_set_of_node(struct pci_dev *dev); void pci_release_of_node(struct pci_dev *dev); void pci_set_bus_of_node(struct pci_bus *bus); void pci_release_bus_of_node(struct pci_bus *bus); are only used in drivers/pci/ and do not need to be seen by the rest of the kernel. Move them to drivers/pci/pci.h so they're private to the PCI subsystem. Link: https://lore.kernel.org/r/20190724233848.73327-12-skunberg.kelsey@gmail.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.h | 9 +++++++++ include/linux/pci.h | 8 -------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 8935fc1ff446..61bbfd611140 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -592,6 +592,10 @@ struct device_node; int of_pci_parse_bus_range(struct device_node *node, struct resource *res); int of_get_pci_domain_nr(struct device_node *node); int of_pci_get_max_link_speed(struct device_node *node); +void pci_set_of_node(struct pci_dev *dev); +void pci_release_of_node(struct pci_dev *dev); +void pci_set_bus_of_node(struct pci_bus *bus); +void pci_release_bus_of_node(struct pci_bus *bus); #else static inline int @@ -611,6 +615,11 @@ of_pci_get_max_link_speed(struct device_node *node) { return -EINVAL; } + +static inline void pci_set_of_node(struct pci_dev *dev) { } +static inline void pci_release_of_node(struct pci_dev *dev) { } +static inline void pci_set_bus_of_node(struct pci_bus *bus) { } +static inline void pci_release_bus_of_node(struct pci_bus *bus) { } #endif /* CONFIG_OF */ #if defined(CONFIG_OF_ADDRESS) diff --git a/include/linux/pci.h b/include/linux/pci.h index ef73b0c87f01..5a26b7db71b3 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2259,10 +2259,6 @@ int pci_vpd_find_info_keyword(const u8 *buf, unsigned int off, #ifdef CONFIG_OF struct device_node; struct irq_domain; -void pci_set_of_node(struct pci_dev *dev); -void pci_release_of_node(struct pci_dev *dev); -void pci_set_bus_of_node(struct pci_bus *bus); -void pci_release_bus_of_node(struct pci_bus *bus); struct irq_domain *pci_host_bridge_of_msi_domain(struct pci_bus *bus); int pci_parse_request_of_pci_ranges(struct device *dev, struct list_head *resources, @@ -2272,10 +2268,6 @@ int pci_parse_request_of_pci_ranges(struct device *dev, struct device_node *pcibios_get_phb_of_node(struct pci_bus *bus); #else /* CONFIG_OF */ -static inline void pci_set_of_node(struct pci_dev *dev) { } -static inline void pci_release_of_node(struct pci_dev *dev) { } -static inline void pci_set_bus_of_node(struct pci_bus *bus) { } -static inline void pci_release_bus_of_node(struct pci_bus *bus) { } static inline struct irq_domain * pci_host_bridge_of_msi_domain(struct pci_bus *bus) { return NULL; } static inline int pci_parse_request_of_pci_ranges(struct device *dev, From 1fed17df7e501b170f876b87eec11f930e3f1df5 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Fri, 14 Jun 2019 18:42:17 +0000 Subject: [PATCH 019/721] hv_balloon: Use a static page for the balloon_up send buffer It's unnecessary to dynamically allocate the buffer. Signed-off-by: Dexuan Cui Reviewed-by: Michael Kelley Signed-off-by: Sasha Levin --- drivers/hv/hv_balloon.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index 6fb4ea5f0304..4b06f076374a 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -494,7 +494,7 @@ enum hv_dm_state { static __u8 recv_buffer[PAGE_SIZE]; -static __u8 *send_buffer; +static __u8 balloon_up_send_buffer[PAGE_SIZE]; #define PAGES_IN_2M 512 #define HA_CHUNK (32 * 1024) @@ -1292,8 +1292,8 @@ static void balloon_up(struct work_struct *dummy) } while (!done) { - bl_resp = (struct dm_balloon_response *)send_buffer; - memset(send_buffer, 0, PAGE_SIZE); + memset(balloon_up_send_buffer, 0, PAGE_SIZE); + bl_resp = (struct dm_balloon_response *)balloon_up_send_buffer; bl_resp->hdr.type = DM_BALLOON_RESPONSE; bl_resp->hdr.size = sizeof(struct dm_balloon_response); bl_resp->more_pages = 1; @@ -1578,19 +1578,11 @@ static int balloon_probe(struct hv_device *dev, do_hot_add = false; #endif - /* - * First allocate a send buffer. - */ - - send_buffer = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (!send_buffer) - return -ENOMEM; - ret = vmbus_open(dev->channel, dm_ring_size, dm_ring_size, NULL, 0, balloon_onchannelcallback, dev); if (ret) - goto probe_error0; + return ret; dm_device.dev = dev; dm_device.state = DM_INITIALIZING; @@ -1716,8 +1708,6 @@ static int balloon_probe(struct hv_device *dev, probe_error1: vmbus_close(dev->channel); -probe_error0: - kfree(send_buffer); return ret; } @@ -1736,7 +1726,6 @@ static int balloon_remove(struct hv_device *dev) vmbus_close(dev->channel); kthread_stop(dm->thread); - kfree(send_buffer); #ifdef CONFIG_MEMORY_HOTPLUG restore_online_page_callback(&hv_online_page); unregister_memory_notifier(&hv_memory_nb); From 221f6df008ab39151600d4b09c85fcc730716bcb Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Fri, 14 Jun 2019 18:42:30 +0000 Subject: [PATCH 020/721] hv_balloon: Reorganize the probe function Move the code that negotiates with the host to a new function balloon_connect_vsp() and improve the error handling. This makes the code more readable and paves the way for the support of hibernation in future. Makes no real logic change here. Signed-off-by: Dexuan Cui Reviewed-by: Michael Kelley Signed-off-by: Sasha Levin --- drivers/hv/hv_balloon.c | 124 +++++++++++++++++++++------------------- 1 file changed, 66 insertions(+), 58 deletions(-) diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index 4b06f076374a..34bd73526afd 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -1564,50 +1564,18 @@ static void balloon_onchannelcallback(void *context) } -static int balloon_probe(struct hv_device *dev, - const struct hv_vmbus_device_id *dev_id) +static int balloon_connect_vsp(struct hv_device *dev) { - int ret; - unsigned long t; struct dm_version_request version_req; struct dm_capabilities cap_msg; - -#ifdef CONFIG_MEMORY_HOTPLUG - do_hot_add = hot_add; -#else - do_hot_add = false; -#endif + unsigned long t; + int ret; ret = vmbus_open(dev->channel, dm_ring_size, dm_ring_size, NULL, 0, - balloon_onchannelcallback, dev); - + balloon_onchannelcallback, dev); if (ret) return ret; - dm_device.dev = dev; - dm_device.state = DM_INITIALIZING; - dm_device.next_version = DYNMEM_PROTOCOL_VERSION_WIN8; - init_completion(&dm_device.host_event); - init_completion(&dm_device.config_event); - INIT_LIST_HEAD(&dm_device.ha_region_list); - spin_lock_init(&dm_device.ha_lock); - INIT_WORK(&dm_device.balloon_wrk.wrk, balloon_up); - INIT_WORK(&dm_device.ha_wrk.wrk, hot_add_req); - dm_device.host_specified_ha_region = false; - - dm_device.thread = - kthread_run(dm_thread_func, &dm_device, "hv_balloon"); - if (IS_ERR(dm_device.thread)) { - ret = PTR_ERR(dm_device.thread); - goto probe_error1; - } - -#ifdef CONFIG_MEMORY_HOTPLUG - set_online_page_callback(&hv_online_page); - register_memory_notifier(&hv_memory_nb); -#endif - - hv_set_drvdata(dev, &dm_device); /* * Initiate the hand shake with the host and negotiate * a version that the host can support. We start with the @@ -1623,16 +1591,15 @@ static int balloon_probe(struct hv_device *dev, dm_device.version = version_req.version.version; ret = vmbus_sendpacket(dev->channel, &version_req, - sizeof(struct dm_version_request), - (unsigned long)NULL, - VM_PKT_DATA_INBAND, 0); + sizeof(struct dm_version_request), + (unsigned long)NULL, VM_PKT_DATA_INBAND, 0); if (ret) - goto probe_error2; + goto out; t = wait_for_completion_timeout(&dm_device.host_event, 5*HZ); if (t == 0) { ret = -ETIMEDOUT; - goto probe_error2; + goto out; } /* @@ -1640,8 +1607,8 @@ static int balloon_probe(struct hv_device *dev, * fail the probe function. */ if (dm_device.state == DM_INIT_ERROR) { - ret = -ETIMEDOUT; - goto probe_error2; + ret = -EPROTO; + goto out; } pr_info("Using Dynamic Memory protocol version %u.%u\n", @@ -1674,16 +1641,15 @@ static int balloon_probe(struct hv_device *dev, cap_msg.max_page_number = -1; ret = vmbus_sendpacket(dev->channel, &cap_msg, - sizeof(struct dm_capabilities), - (unsigned long)NULL, - VM_PKT_DATA_INBAND, 0); + sizeof(struct dm_capabilities), + (unsigned long)NULL, VM_PKT_DATA_INBAND, 0); if (ret) - goto probe_error2; + goto out; t = wait_for_completion_timeout(&dm_device.host_event, 5*HZ); if (t == 0) { ret = -ETIMEDOUT; - goto probe_error2; + goto out; } /* @@ -1691,23 +1657,65 @@ static int balloon_probe(struct hv_device *dev, * fail the probe function. */ if (dm_device.state == DM_INIT_ERROR) { - ret = -ETIMEDOUT; - goto probe_error2; + ret = -EPROTO; + goto out; } + return 0; +out: + vmbus_close(dev->channel); + return ret; +} + +static int balloon_probe(struct hv_device *dev, + const struct hv_vmbus_device_id *dev_id) +{ + int ret; + +#ifdef CONFIG_MEMORY_HOTPLUG + do_hot_add = hot_add; +#else + do_hot_add = false; +#endif + dm_device.dev = dev; + dm_device.state = DM_INITIALIZING; + dm_device.next_version = DYNMEM_PROTOCOL_VERSION_WIN8; + init_completion(&dm_device.host_event); + init_completion(&dm_device.config_event); + INIT_LIST_HEAD(&dm_device.ha_region_list); + spin_lock_init(&dm_device.ha_lock); + INIT_WORK(&dm_device.balloon_wrk.wrk, balloon_up); + INIT_WORK(&dm_device.ha_wrk.wrk, hot_add_req); + dm_device.host_specified_ha_region = false; + +#ifdef CONFIG_MEMORY_HOTPLUG + set_online_page_callback(&hv_online_page); + register_memory_notifier(&hv_memory_nb); +#endif + + hv_set_drvdata(dev, &dm_device); + + ret = balloon_connect_vsp(dev); + if (ret != 0) + return ret; + dm_device.state = DM_INITIALIZED; - last_post_time = jiffies; + + dm_device.thread = + kthread_run(dm_thread_func, &dm_device, "hv_balloon"); + if (IS_ERR(dm_device.thread)) { + ret = PTR_ERR(dm_device.thread); + goto probe_error; + } return 0; -probe_error2: +probe_error: + vmbus_close(dev->channel); #ifdef CONFIG_MEMORY_HOTPLUG + unregister_memory_notifier(&hv_memory_nb); restore_online_page_callback(&hv_online_page); #endif - kthread_stop(dm_device.thread); - -probe_error1: - vmbus_close(dev->channel); return ret; } @@ -1724,11 +1732,11 @@ static int balloon_remove(struct hv_device *dev) cancel_work_sync(&dm->balloon_wrk.wrk); cancel_work_sync(&dm->ha_wrk.wrk); - vmbus_close(dev->channel); kthread_stop(dm->thread); + vmbus_close(dev->channel); #ifdef CONFIG_MEMORY_HOTPLUG - restore_online_page_callback(&hv_online_page); unregister_memory_notifier(&hv_memory_nb); + restore_online_page_callback(&hv_online_page); #endif spin_lock_irqsave(&dm_device.ha_lock, flags); list_for_each_entry_safe(has, tmp, &dm->ha_region_list, list) { From e5738bc46d49642dc4804bad3656b23016f8e1f1 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Mon, 8 Jul 2019 02:12:34 +0300 Subject: [PATCH 021/721] i2c: tegra: Compile PM functions unconditionally The I2C driver fails to probe if CONFIG_PM_SLEEP=n because runtime PM doesn't depend on the PM sleep and in this case the runtime PM ops are not included in the driver, resulting in I2C clock not being enabled. It's much cleaner to simply allow compiler to remove the dead code instead of messing with the #ifdefs. This patch fixes such errors when CONFIG_PM_SLEEP=n: tegra-i2c 7000c400.i2c: timeout waiting for fifo flush tegra-i2c 7000c400.i2c: Failed to initialize i2c controller Signed-off-by: Dmitry Osipenko Acked-by: Jon Hunter Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-tegra.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/drivers/i2c/busses/i2c-tegra.c b/drivers/i2c/busses/i2c-tegra.c index 9fcb13beeb8f..18f0ceed9f1b 100644 --- a/drivers/i2c/busses/i2c-tegra.c +++ b/drivers/i2c/busses/i2c-tegra.c @@ -636,7 +636,7 @@ static void tegra_dvc_init(struct tegra_i2c_dev *i2c_dev) dvc_writel(i2c_dev, val, DVC_CTRL_REG1); } -static int tegra_i2c_runtime_resume(struct device *dev) +static int __maybe_unused tegra_i2c_runtime_resume(struct device *dev) { struct tegra_i2c_dev *i2c_dev = dev_get_drvdata(dev); int ret; @@ -665,7 +665,7 @@ static int tegra_i2c_runtime_resume(struct device *dev) return 0; } -static int tegra_i2c_runtime_suspend(struct device *dev) +static int __maybe_unused tegra_i2c_runtime_suspend(struct device *dev) { struct tegra_i2c_dev *i2c_dev = dev_get_drvdata(dev); @@ -1711,8 +1711,7 @@ static int tegra_i2c_remove(struct platform_device *pdev) return 0; } -#ifdef CONFIG_PM_SLEEP -static int tegra_i2c_suspend(struct device *dev) +static int __maybe_unused tegra_i2c_suspend(struct device *dev) { struct tegra_i2c_dev *i2c_dev = dev_get_drvdata(dev); @@ -1721,7 +1720,7 @@ static int tegra_i2c_suspend(struct device *dev) return 0; } -static int tegra_i2c_resume(struct device *dev) +static int __maybe_unused tegra_i2c_resume(struct device *dev) { struct tegra_i2c_dev *i2c_dev = dev_get_drvdata(dev); int err; @@ -1741,18 +1740,13 @@ static const struct dev_pm_ops tegra_i2c_pm = { NULL) }; -#define TEGRA_I2C_PM (&tegra_i2c_pm) -#else -#define TEGRA_I2C_PM NULL -#endif - static struct platform_driver tegra_i2c_driver = { .probe = tegra_i2c_probe, .remove = tegra_i2c_remove, .driver = { .name = "tegra-i2c", .of_match_table = tegra_i2c_of_match, - .pm = TEGRA_I2C_PM, + .pm = &tegra_i2c_pm, }, }; From 34de3513e668d5ad46bd32ebee63f9e3dc57e1db Mon Sep 17 00:00:00 2001 From: Fuqian Huang Date: Mon, 15 Jul 2019 11:17:39 +0800 Subject: [PATCH 022/721] i2c: ismt: Remove call to memset after dmam_alloc_coherent In commit 518a2f1925c3 ("dma-mapping: zero memory returned from dma_alloc_*"), dma_alloc_coherent has already zeroed the memory. So memset is not needed. Acked-by: Neil Horman Signed-off-by: Fuqian Huang Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-ismt.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/i2c/busses/i2c-ismt.c b/drivers/i2c/busses/i2c-ismt.c index 02d23edb2fb1..2f95e25a10f7 100644 --- a/drivers/i2c/busses/i2c-ismt.c +++ b/drivers/i2c/busses/i2c-ismt.c @@ -781,8 +781,6 @@ static int ismt_dev_init(struct ismt_priv *priv) if (!priv->hw) return -ENOMEM; - memset(priv->hw, 0, (ISMT_DESC_ENTRIES * sizeof(struct ismt_desc))); - priv->head = 0; init_completion(&priv->cmp); From b17e6d19dcd3fcb2b93166034e739e82b49d2a51 Mon Sep 17 00:00:00 2001 From: Anson Huang Date: Wed, 17 Jul 2019 16:40:16 +0800 Subject: [PATCH 023/721] i2c: mxs: use devm_platform_ioremap_resource() to simplify code Use the new helper devm_platform_ioremap_resource() which wraps the platform_get_resource() and devm_ioremap_resource() together, to simplify the code. Signed-off-by: Anson Huang Reviewed-by: Dong Aisheng Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-mxs.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/i2c/busses/i2c-mxs.c b/drivers/i2c/busses/i2c-mxs.c index 7d79317a1046..89224913f578 100644 --- a/drivers/i2c/busses/i2c-mxs.c +++ b/drivers/i2c/busses/i2c-mxs.c @@ -802,7 +802,6 @@ static int mxs_i2c_probe(struct platform_device *pdev) struct device *dev = &pdev->dev; struct mxs_i2c_dev *i2c; struct i2c_adapter *adap; - struct resource *res; int err, irq; i2c = devm_kzalloc(dev, sizeof(*i2c), GFP_KERNEL); @@ -814,8 +813,7 @@ static int mxs_i2c_probe(struct platform_device *pdev) i2c->dev_type = device_id->driver_data; } - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - i2c->regs = devm_ioremap_resource(&pdev->dev, res); + i2c->regs = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(i2c->regs)) return PTR_ERR(i2c->regs); From 5667b5b59f45bd0bebadfae9ed55745b4cff2198 Mon Sep 17 00:00:00 2001 From: Anson Huang Date: Wed, 17 Jul 2019 16:40:17 +0800 Subject: [PATCH 024/721] i2c: imx-lpi2c: use devm_platform_ioremap_resource() to simplify code Use the new helper devm_platform_ioremap_resource() which wraps the platform_get_resource() and devm_ioremap_resource() together, to simplify the code. Signed-off-by: Anson Huang Acked-by: Dong Aisheng Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-imx-lpi2c.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/i2c/busses/i2c-imx-lpi2c.c b/drivers/i2c/busses/i2c-imx-lpi2c.c index dc00fabc919a..c92b56485fa6 100644 --- a/drivers/i2c/busses/i2c-imx-lpi2c.c +++ b/drivers/i2c/busses/i2c-imx-lpi2c.c @@ -545,7 +545,6 @@ MODULE_DEVICE_TABLE(of, lpi2c_imx_of_match); static int lpi2c_imx_probe(struct platform_device *pdev) { struct lpi2c_imx_struct *lpi2c_imx; - struct resource *res; unsigned int temp; int irq, ret; @@ -553,8 +552,7 @@ static int lpi2c_imx_probe(struct platform_device *pdev) if (!lpi2c_imx) return -ENOMEM; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - lpi2c_imx->base = devm_ioremap_resource(&pdev->dev, res); + lpi2c_imx->base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(lpi2c_imx->base)) return PTR_ERR(lpi2c_imx->base); From 7735eeebd2be26d6d3fa151c85d839d62d710d93 Mon Sep 17 00:00:00 2001 From: Chuhong Yuan Date: Tue, 23 Jul 2019 19:11:10 +0800 Subject: [PATCH 025/721] i2c: busses: Use dev_get_drvdata where possible Instead of using to_pci_dev + pci_get_drvdata, use dev_get_drvdata to make code simpler. Signed-off-by: Chuhong Yuan Reviewed-by: Jean Delvare Acked-by: Jarkko Nikula Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-designware-pcidrv.c | 6 ++---- drivers/i2c/busses/i2c-i801.c | 3 +-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/i2c/busses/i2c-designware-pcidrv.c b/drivers/i2c/busses/i2c-designware-pcidrv.c index 76810deb2de6..7d2e6959679c 100644 --- a/drivers/i2c/busses/i2c-designware-pcidrv.c +++ b/drivers/i2c/busses/i2c-designware-pcidrv.c @@ -173,8 +173,7 @@ static struct dw_pci_controller dw_pci_controllers[] = { #ifdef CONFIG_PM static int i2c_dw_pci_suspend(struct device *dev) { - struct pci_dev *pdev = to_pci_dev(dev); - struct dw_i2c_dev *i_dev = pci_get_drvdata(pdev); + struct dw_i2c_dev *i_dev = dev_get_drvdata(dev); i_dev->suspended = true; i_dev->disable(i_dev); @@ -184,8 +183,7 @@ static int i2c_dw_pci_suspend(struct device *dev) static int i2c_dw_pci_resume(struct device *dev) { - struct pci_dev *pdev = to_pci_dev(dev); - struct dw_i2c_dev *i_dev = pci_get_drvdata(pdev); + struct dw_i2c_dev *i_dev = dev_get_drvdata(dev); int ret; ret = i_dev->init(i_dev); diff --git a/drivers/i2c/busses/i2c-i801.c b/drivers/i2c/busses/i2c-i801.c index f2956936c3f2..a6469978e735 100644 --- a/drivers/i2c/busses/i2c-i801.c +++ b/drivers/i2c/busses/i2c-i801.c @@ -1912,8 +1912,7 @@ static int i801_suspend(struct device *dev) static int i801_resume(struct device *dev) { - struct pci_dev *pci_dev = to_pci_dev(dev); - struct i801_priv *priv = pci_get_drvdata(pci_dev); + struct i801_priv *priv = dev_get_drvdata(dev); i801_enable_host_notify(&priv->adapter); From 23c2556d8fbec57e72e2c8fd8b7be2715c192b25 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Wed, 24 Jul 2019 14:15:56 +0200 Subject: [PATCH 026/721] dt-bindings: i2c: sh_mobile: Rename bindings documentation file Rename the bindings documentation file for sh_mobile I2C controller from i2c-sh_mobile.txt to renesas,iic.txt. This is part of an ongoing effort to name bindings documentation files for Renesas IP blocks consistently, in line with the compat strings they document. Signed-off-by: Simon Horman Reviewed-by: Geert Uytterhoeven Signed-off-by: Wolfram Sang --- .../bindings/i2c/{i2c-sh_mobile.txt => renesas,iic.txt} | 0 MAINTAINERS | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename Documentation/devicetree/bindings/i2c/{i2c-sh_mobile.txt => renesas,iic.txt} (100%) diff --git a/Documentation/devicetree/bindings/i2c/i2c-sh_mobile.txt b/Documentation/devicetree/bindings/i2c/renesas,iic.txt similarity index 100% rename from Documentation/devicetree/bindings/i2c/i2c-sh_mobile.txt rename to Documentation/devicetree/bindings/i2c/renesas,iic.txt diff --git a/MAINTAINERS b/MAINTAINERS index 6426db5198f0..c8a78239447a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13651,7 +13651,7 @@ RENESAS R-CAR I2C DRIVERS M: Wolfram Sang S: Supported F: Documentation/devicetree/bindings/i2c/i2c-rcar.txt -F: Documentation/devicetree/bindings/i2c/i2c-sh_mobile.txt +F: Documentation/devicetree/bindings/i2c/renesas,iic.txt F: drivers/i2c/busses/i2c-rcar.c F: drivers/i2c/busses/i2c-sh_mobile.c From d13ed84b195cc6e5789b446f07aede357939f7ad Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Wed, 24 Jul 2019 14:15:57 +0200 Subject: [PATCH 027/721] dt-bindings: i2c: rcar: Rename bindings documentation file Rename the bindings documentation file for R-Car I2C controller from i2c-rcar.txt to renesas,rcar.txt. This is part of an ongoing effort to name bindings documentation files for Renesas IP blocks consistently, in line with the compat strings they document. Signed-off-by: Simon Horman Reviewed-by: Geert Uytterhoeven Signed-off-by: Wolfram Sang --- .../devicetree/bindings/i2c/{i2c-rcar.txt => renesas,i2c.txt} | 0 MAINTAINERS | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename Documentation/devicetree/bindings/i2c/{i2c-rcar.txt => renesas,i2c.txt} (100%) diff --git a/Documentation/devicetree/bindings/i2c/i2c-rcar.txt b/Documentation/devicetree/bindings/i2c/renesas,i2c.txt similarity index 100% rename from Documentation/devicetree/bindings/i2c/i2c-rcar.txt rename to Documentation/devicetree/bindings/i2c/renesas,i2c.txt diff --git a/MAINTAINERS b/MAINTAINERS index c8a78239447a..f92a88594dfc 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13650,7 +13650,7 @@ F: drivers/iio/adc/rcar-gyroadc.c RENESAS R-CAR I2C DRIVERS M: Wolfram Sang S: Supported -F: Documentation/devicetree/bindings/i2c/i2c-rcar.txt +F: Documentation/devicetree/bindings/i2c/renesas,i2c.txt F: Documentation/devicetree/bindings/i2c/renesas,iic.txt F: drivers/i2c/busses/i2c-rcar.c F: drivers/i2c/busses/i2c-sh_mobile.c From 1d583590514a4f1c9aa582d1234fca7aa3d627e9 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Wed, 24 Jul 2019 14:15:58 +0200 Subject: [PATCH 028/721] dt-bindings: i2c: riic: Rename bindings documentation file Rename the bindings documentation file for RIIC controller from i2c-riic.txt to renesas,riic.txt. This is part of an ongoing effort to name bindings documentation files for Renesas IP blocks consistently, in line with the compat strings they document. Signed-off-by: Simon Horman Reviewed-by: Geert Uytterhoeven Signed-off-by: Wolfram Sang --- .../devicetree/bindings/i2c/{i2c-riic.txt => renesas,riic.txt} | 0 MAINTAINERS | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename Documentation/devicetree/bindings/i2c/{i2c-riic.txt => renesas,riic.txt} (100%) diff --git a/Documentation/devicetree/bindings/i2c/i2c-riic.txt b/Documentation/devicetree/bindings/i2c/renesas,riic.txt similarity index 100% rename from Documentation/devicetree/bindings/i2c/i2c-riic.txt rename to Documentation/devicetree/bindings/i2c/renesas,riic.txt diff --git a/MAINTAINERS b/MAINTAINERS index f92a88594dfc..a14ce44c33b3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13658,7 +13658,7 @@ F: drivers/i2c/busses/i2c-sh_mobile.c RENESAS RIIC DRIVER M: Chris Brandt S: Supported -F: Documentation/devicetree/bindings/i2c/i2c-riic.txt +F: Documentation/devicetree/bindings/i2c/renesas,riic.txt F: drivers/i2c/busses/i2c-riic.c RENESAS USB PHY DRIVER From 684ca71259a69c5a3019da72fe718bf983841926 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Wed, 24 Jul 2019 14:15:59 +0200 Subject: [PATCH 029/721] dt-bindings: i2c: riic: Rename bindings documentation file Rename the bindings documentation file for Renesas EMEV2 IIC controller from i2c-emev2.txt to renesas,iic-emev2.txt. This is part of an ongoing effort to name bindings documentation files for Renesas IP blocks consistently, in line with the compat strings they document. Signed-off-by: Simon Horman Reviewed-by: Geert Uytterhoeven Signed-off-by: Wolfram Sang --- .../bindings/i2c/{i2c-emev2.txt => renesas,iic-emev2.txt} | 0 MAINTAINERS | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename Documentation/devicetree/bindings/i2c/{i2c-emev2.txt => renesas,iic-emev2.txt} (100%) diff --git a/Documentation/devicetree/bindings/i2c/i2c-emev2.txt b/Documentation/devicetree/bindings/i2c/renesas,iic-emev2.txt similarity index 100% rename from Documentation/devicetree/bindings/i2c/i2c-emev2.txt rename to Documentation/devicetree/bindings/i2c/renesas,iic-emev2.txt diff --git a/MAINTAINERS b/MAINTAINERS index a14ce44c33b3..f89f27a2d09a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13628,7 +13628,7 @@ F: drivers/clk/renesas/ RENESAS EMEV2 I2C DRIVER M: Wolfram Sang S: Supported -F: Documentation/devicetree/bindings/i2c/i2c-emev2.txt +F: Documentation/devicetree/bindings/i2c/renesas,iic-emev2.txt F: drivers/i2c/busses/i2c-emev2.c RENESAS ETHERNET DRIVERS From 33eb09a02e8d8fad976019694196eec2cc210d62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Thu, 25 Jul 2019 22:21:36 +0200 Subject: [PATCH 030/721] i2c: designware: make use of devm_gpiod_get_optional MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a semantical change: if devm_gpiod_get_optional returns -ENOSYS this is passed as error to the caller. This effectively reverts commit d1fa74520dcd ("i2c: designware: Consider SCL GPIO optional") which shouldn't be necessary any more since gpiod_get_optional doesn't return -ENOSYS any more with GPIOLIB=n. Signed-off-by: Uwe Kleine-König Reviewed-by: Andy Shevchenko Acked-by: Jarkko Nikula Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-designware-master.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/i2c/busses/i2c-designware-master.c b/drivers/i2c/busses/i2c-designware-master.c index d464799e40a3..867787dade43 100644 --- a/drivers/i2c/busses/i2c-designware-master.c +++ b/drivers/i2c/busses/i2c-designware-master.c @@ -657,13 +657,10 @@ static int i2c_dw_init_recovery_info(struct dw_i2c_dev *dev) struct gpio_desc *gpio; int r; - gpio = devm_gpiod_get(dev->dev, "scl", GPIOD_OUT_HIGH); - if (IS_ERR(gpio)) { - r = PTR_ERR(gpio); - if (r == -ENOENT || r == -ENOSYS) - return 0; - return r; - } + gpio = devm_gpiod_get_optional(dev->dev, "scl", GPIOD_OUT_HIGH); + if (IS_ERR_OR_NULL(gpio)) + return PTR_ERR_OR_ZERO(gpio); + rinfo->scl_gpiod = gpio; gpio = devm_gpiod_get_optional(dev->dev, "sda", GPIOD_IN); From 378b80370aa1fe50f9c48a3ac8af3e416e73b89f Mon Sep 17 00:00:00 2001 From: Fabian Henneke Date: Tue, 9 Jul 2019 13:03:37 +0200 Subject: [PATCH 031/721] hidraw: Return EPOLLOUT from hidraw_poll Always return EPOLLOUT from hidraw_poll when a device is connected. This is safe since writes are always possible (but will always block). hidraw does not support non-blocking writes and instead always calls blocking backend functions on write requests. Hence, so far, a call to poll never returned EPOLLOUT, which confuses tools like socat. Signed-off-by: Fabian Henneke In-reply-to: Signed-off-by: Jiri Kosina --- drivers/hid/hidraw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hid/hidraw.c b/drivers/hid/hidraw.c index 006bd6f4f653..24f40a3ddded 100644 --- a/drivers/hid/hidraw.c +++ b/drivers/hid/hidraw.c @@ -252,7 +252,7 @@ static __poll_t hidraw_poll(struct file *file, poll_table *wait) poll_wait(file, &list->hidraw->wait, wait); if (list->head != list->tail) - return EPOLLIN | EPOLLRDNORM; + return EPOLLIN | EPOLLRDNORM | EPOLLOUT; if (!list->hidraw->exist) return EPOLLERR | EPOLLHUP; return 0; From c801aff1a57697d79618ea3e84dc8ff79617669d Mon Sep 17 00:00:00 2001 From: Fabian Henneke Date: Thu, 18 Jul 2019 22:50:58 +0200 Subject: [PATCH 032/721] hiddev: Return EPOLLOUT from hiddev_poll Always return EPOLLOUT from hiddev_poll when a device is connected. This is safe since hiddev_write always fails and improves compatibility with tools like socat. Signed-off-by: Fabian Henneke In-reply-to: Signed-off-by: Jiri Kosina --- drivers/hid/usbhid/hiddev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hid/usbhid/hiddev.c b/drivers/hid/usbhid/hiddev.c index 55b72573066b..73a76d9af974 100644 --- a/drivers/hid/usbhid/hiddev.c +++ b/drivers/hid/usbhid/hiddev.c @@ -416,7 +416,7 @@ static __poll_t hiddev_poll(struct file *file, poll_table *wait) poll_wait(file, &list->hiddev->wait, wait); if (list->head != list->tail) - return EPOLLIN | EPOLLRDNORM; + return EPOLLIN | EPOLLRDNORM | EPOLLOUT; if (!list->hiddev->exist) return EPOLLERR | EPOLLHUP; return 0; From 5b6cc1277a0536dae20f37d531f29b3416cee670 Mon Sep 17 00:00:00 2001 From: Olivier Gay Date: Mon, 29 Jul 2019 19:21:52 +0200 Subject: [PATCH 033/721] HID: logitech-dj: extend consumer usages range Extend the range of usage codes in the consumer page descriptor of the driver. Some Logitech HID devices send usages in that upper range. Signed-off-by: Olivier Gay Tested-by: Benson Leung Signed-off-by: Jiri Kosina --- drivers/hid/hid-logitech-dj.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/hid/hid-logitech-dj.c b/drivers/hid/hid-logitech-dj.c index 6196217a7d93..0e058ddb8e1c 100644 --- a/drivers/hid/hid-logitech-dj.c +++ b/drivers/hid/hid-logitech-dj.c @@ -380,9 +380,9 @@ static const char consumer_descriptor[] = { 0x75, 0x10, /* REPORT_SIZE (16) */ 0x95, 0x02, /* REPORT_COUNT (2) */ 0x15, 0x01, /* LOGICAL_MIN (1) */ - 0x26, 0x8C, 0x02, /* LOGICAL_MAX (652) */ + 0x26, 0xFF, 0x02, /* LOGICAL_MAX (767) */ 0x19, 0x01, /* USAGE_MIN (1) */ - 0x2A, 0x8C, 0x02, /* USAGE_MAX (652) */ + 0x2A, 0xFF, 0x02, /* USAGE_MAX (767) */ 0x81, 0x00, /* INPUT (Data Ary Abs) */ 0xC0, /* END_COLLECTION */ }; /* */ From 0eb2f2962489edb6943fe8a853de024c53aa91b2 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Wed, 31 Jul 2019 19:29:14 -0400 Subject: [PATCH 034/721] selinux: shuffle around policydb.c to get rid of forward declarations No code changes, but move a lot of the policydb destructors higher up so we can get rid of a forward declaration. This patch does expose a few old checkpatch.pl errors, but those will be dealt with in a separate (set of) patches. Signed-off-by: Paul Moore --- security/selinux/ss/policydb.c | 376 ++++++++++++++++----------------- 1 file changed, 187 insertions(+), 189 deletions(-) diff --git a/security/selinux/ss/policydb.c b/security/selinux/ss/policydb.c index 38d0083204f1..d5effce86304 100644 --- a/security/selinux/ss/policydb.c +++ b/security/selinux/ss/policydb.c @@ -178,6 +178,193 @@ static struct policydb_compat_info *policydb_lookup_compat(int version) return info; } +/* + * The following *_destroy functions are used to + * free any memory allocated for each kind of + * symbol data in the policy database. + */ + +static int perm_destroy(void *key, void *datum, void *p) +{ + kfree(key); + kfree(datum); + return 0; +} + +static int common_destroy(void *key, void *datum, void *p) +{ + struct common_datum *comdatum; + + kfree(key); + if (datum) { + comdatum = datum; + hashtab_map(comdatum->permissions.table, perm_destroy, NULL); + hashtab_destroy(comdatum->permissions.table); + } + kfree(datum); + return 0; +} + +static void constraint_expr_destroy(struct constraint_expr *expr) +{ + if (expr) { + ebitmap_destroy(&expr->names); + if (expr->type_names) { + ebitmap_destroy(&expr->type_names->types); + ebitmap_destroy(&expr->type_names->negset); + kfree(expr->type_names); + } + kfree(expr); + } +} + +static int cls_destroy(void *key, void *datum, void *p) +{ + struct class_datum *cladatum; + struct constraint_node *constraint, *ctemp; + struct constraint_expr *e, *etmp; + + kfree(key); + if (datum) { + cladatum = datum; + hashtab_map(cladatum->permissions.table, perm_destroy, NULL); + hashtab_destroy(cladatum->permissions.table); + constraint = cladatum->constraints; + while (constraint) { + e = constraint->expr; + while (e) { + etmp = e; + e = e->next; + constraint_expr_destroy(etmp); + } + ctemp = constraint; + constraint = constraint->next; + kfree(ctemp); + } + + constraint = cladatum->validatetrans; + while (constraint) { + e = constraint->expr; + while (e) { + etmp = e; + e = e->next; + constraint_expr_destroy(etmp); + } + ctemp = constraint; + constraint = constraint->next; + kfree(ctemp); + } + kfree(cladatum->comkey); + } + kfree(datum); + return 0; +} + +static int role_destroy(void *key, void *datum, void *p) +{ + struct role_datum *role; + + kfree(key); + if (datum) { + role = datum; + ebitmap_destroy(&role->dominates); + ebitmap_destroy(&role->types); + } + kfree(datum); + return 0; +} + +static int type_destroy(void *key, void *datum, void *p) +{ + kfree(key); + kfree(datum); + return 0; +} + +static int user_destroy(void *key, void *datum, void *p) +{ + struct user_datum *usrdatum; + + kfree(key); + if (datum) { + usrdatum = datum; + ebitmap_destroy(&usrdatum->roles); + ebitmap_destroy(&usrdatum->range.level[0].cat); + ebitmap_destroy(&usrdatum->range.level[1].cat); + ebitmap_destroy(&usrdatum->dfltlevel.cat); + } + kfree(datum); + return 0; +} + +static int sens_destroy(void *key, void *datum, void *p) +{ + struct level_datum *levdatum; + + kfree(key); + if (datum) { + levdatum = datum; + if (levdatum->level) + ebitmap_destroy(&levdatum->level->cat); + kfree(levdatum->level); + } + kfree(datum); + return 0; +} + +static int cat_destroy(void *key, void *datum, void *p) +{ + kfree(key); + kfree(datum); + return 0; +} + +static int (*destroy_f[SYM_NUM]) (void *key, void *datum, void *datap) = +{ + common_destroy, + cls_destroy, + role_destroy, + type_destroy, + user_destroy, + cond_destroy_bool, + sens_destroy, + cat_destroy, +}; + +static int filenametr_destroy(void *key, void *datum, void *p) +{ + struct filename_trans *ft = key; + kfree(ft->name); + kfree(key); + kfree(datum); + cond_resched(); + return 0; +} + +static int range_tr_destroy(void *key, void *datum, void *p) +{ + struct mls_range *rt = datum; + kfree(key); + ebitmap_destroy(&rt->level[0].cat); + ebitmap_destroy(&rt->level[1].cat); + kfree(datum); + cond_resched(); + return 0; +} + +static void ocontext_destroy(struct ocontext *c, int i) +{ + if (!c) + return; + + context_destroy(&c->context[0]); + context_destroy(&c->context[1]); + if (i == OCON_ISID || i == OCON_FS || + i == OCON_NETIF || i == OCON_FSUSE) + kfree(c->u.name); + kfree(c); +} + /* * Initialize the role table. */ @@ -274,8 +461,6 @@ static int rangetr_cmp(struct hashtab *h, const void *k1, const void *k2) return v; } -static int (*destroy_f[SYM_NUM]) (void *key, void *datum, void *datap); - /* * Initialize a policy database structure. */ @@ -569,193 +754,6 @@ static int policydb_index(struct policydb *p) return rc; } -/* - * The following *_destroy functions are used to - * free any memory allocated for each kind of - * symbol data in the policy database. - */ - -static int perm_destroy(void *key, void *datum, void *p) -{ - kfree(key); - kfree(datum); - return 0; -} - -static int common_destroy(void *key, void *datum, void *p) -{ - struct common_datum *comdatum; - - kfree(key); - if (datum) { - comdatum = datum; - hashtab_map(comdatum->permissions.table, perm_destroy, NULL); - hashtab_destroy(comdatum->permissions.table); - } - kfree(datum); - return 0; -} - -static void constraint_expr_destroy(struct constraint_expr *expr) -{ - if (expr) { - ebitmap_destroy(&expr->names); - if (expr->type_names) { - ebitmap_destroy(&expr->type_names->types); - ebitmap_destroy(&expr->type_names->negset); - kfree(expr->type_names); - } - kfree(expr); - } -} - -static int cls_destroy(void *key, void *datum, void *p) -{ - struct class_datum *cladatum; - struct constraint_node *constraint, *ctemp; - struct constraint_expr *e, *etmp; - - kfree(key); - if (datum) { - cladatum = datum; - hashtab_map(cladatum->permissions.table, perm_destroy, NULL); - hashtab_destroy(cladatum->permissions.table); - constraint = cladatum->constraints; - while (constraint) { - e = constraint->expr; - while (e) { - etmp = e; - e = e->next; - constraint_expr_destroy(etmp); - } - ctemp = constraint; - constraint = constraint->next; - kfree(ctemp); - } - - constraint = cladatum->validatetrans; - while (constraint) { - e = constraint->expr; - while (e) { - etmp = e; - e = e->next; - constraint_expr_destroy(etmp); - } - ctemp = constraint; - constraint = constraint->next; - kfree(ctemp); - } - kfree(cladatum->comkey); - } - kfree(datum); - return 0; -} - -static int role_destroy(void *key, void *datum, void *p) -{ - struct role_datum *role; - - kfree(key); - if (datum) { - role = datum; - ebitmap_destroy(&role->dominates); - ebitmap_destroy(&role->types); - } - kfree(datum); - return 0; -} - -static int type_destroy(void *key, void *datum, void *p) -{ - kfree(key); - kfree(datum); - return 0; -} - -static int user_destroy(void *key, void *datum, void *p) -{ - struct user_datum *usrdatum; - - kfree(key); - if (datum) { - usrdatum = datum; - ebitmap_destroy(&usrdatum->roles); - ebitmap_destroy(&usrdatum->range.level[0].cat); - ebitmap_destroy(&usrdatum->range.level[1].cat); - ebitmap_destroy(&usrdatum->dfltlevel.cat); - } - kfree(datum); - return 0; -} - -static int sens_destroy(void *key, void *datum, void *p) -{ - struct level_datum *levdatum; - - kfree(key); - if (datum) { - levdatum = datum; - if (levdatum->level) - ebitmap_destroy(&levdatum->level->cat); - kfree(levdatum->level); - } - kfree(datum); - return 0; -} - -static int cat_destroy(void *key, void *datum, void *p) -{ - kfree(key); - kfree(datum); - return 0; -} - -static int (*destroy_f[SYM_NUM]) (void *key, void *datum, void *datap) = -{ - common_destroy, - cls_destroy, - role_destroy, - type_destroy, - user_destroy, - cond_destroy_bool, - sens_destroy, - cat_destroy, -}; - -static int filenametr_destroy(void *key, void *datum, void *p) -{ - struct filename_trans *ft = key; - kfree(ft->name); - kfree(key); - kfree(datum); - cond_resched(); - return 0; -} - -static int range_tr_destroy(void *key, void *datum, void *p) -{ - struct mls_range *rt = datum; - kfree(key); - ebitmap_destroy(&rt->level[0].cat); - ebitmap_destroy(&rt->level[1].cat); - kfree(datum); - cond_resched(); - return 0; -} - -static void ocontext_destroy(struct ocontext *c, int i) -{ - if (!c) - return; - - context_destroy(&c->context[0]); - context_destroy(&c->context[1]); - if (i == OCON_ISID || i == OCON_FS || - i == OCON_NETIF || i == OCON_FSUSE) - kfree(c->u.name); - kfree(c); -} - /* * Free any memory allocated by a policy database structure. */ From 2492acaf1e53a13a528e31769aa3df15b4d9f6ef Mon Sep 17 00:00:00 2001 From: Ondrej Mosnacek Date: Mon, 29 Jul 2019 10:41:16 +0200 Subject: [PATCH 035/721] selinux: policydb - fix some checkpatch.pl warnings Fix most of the code style warnings discovered when moving code around. Signed-off-by: Ondrej Mosnacek Signed-off-by: Paul Moore --- security/selinux/ss/policydb.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/security/selinux/ss/policydb.c b/security/selinux/ss/policydb.c index d5effce86304..943c39b07bbb 100644 --- a/security/selinux/ss/policydb.c +++ b/security/selinux/ss/policydb.c @@ -334,6 +334,7 @@ static int (*destroy_f[SYM_NUM]) (void *key, void *datum, void *datap) = static int filenametr_destroy(void *key, void *datum, void *p) { struct filename_trans *ft = key; + kfree(ft->name); kfree(key); kfree(datum); @@ -344,6 +345,7 @@ static int filenametr_destroy(void *key, void *datum, void *p) static int range_tr_destroy(void *key, void *datum, void *p) { struct mls_range *rt = datum; + kfree(key); ebitmap_destroy(&rt->level[0].cat); ebitmap_destroy(&rt->level[1].cat); @@ -439,6 +441,7 @@ static int filenametr_cmp(struct hashtab *h, const void *k1, const void *k2) static u32 rangetr_hash(struct hashtab *h, const void *k) { const struct range_trans *key = k; + return (key->source_type + (key->target_type << 3) + (key->target_class << 5)) & (h->size - 1); } @@ -488,7 +491,8 @@ static int policydb_init(struct policydb *p) if (rc) goto out; - p->filename_trans = hashtab_create(filenametr_hash, filenametr_cmp, (1 << 10)); + p->filename_trans = hashtab_create(filenametr_hash, filenametr_cmp, + (1 << 10)); if (!p->filename_trans) { rc = -ENOMEM; goto out; @@ -664,9 +668,9 @@ static void hash_eval(struct hashtab *h, const char *hash_name) struct hashtab_info info; hashtab_stat(h, &info); - pr_debug("SELinux: %s: %d entries and %d/%d buckets used, " - "longest chain length %d\n", hash_name, h->nel, - info.slots_used, h->size, info.max_chain_len); + pr_debug("SELinux: %s: %d entries and %d/%d buckets used, longest chain length %d\n", + hash_name, h->nel, info.slots_used, h->size, + info.max_chain_len); } static void symtab_hash_eval(struct symtab *s) From f07ea1d4eda2574c6b0f99576db61c86ec27ff5b Mon Sep 17 00:00:00 2001 From: Ondrej Mosnacek Date: Mon, 29 Jul 2019 10:41:17 +0200 Subject: [PATCH 036/721] selinux: policydb - rename type_val_to_struct_array The name is overly long and inconsistent with the other *_val_to_struct members. Dropping the "_array" prefix makes the code easier to read and gets rid of one line over 80 characters warning. Signed-off-by: Ondrej Mosnacek Signed-off-by: Paul Moore --- security/selinux/ss/policydb.c | 14 +++++++------- security/selinux/ss/policydb.h | 2 +- security/selinux/ss/services.c | 6 +++--- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/security/selinux/ss/policydb.c b/security/selinux/ss/policydb.c index 943c39b07bbb..a9dabe563ea6 100644 --- a/security/selinux/ss/policydb.c +++ b/security/selinux/ss/policydb.c @@ -590,7 +590,7 @@ static int type_index(void *key, void *datum, void *datap) || typdatum->bounds > p->p_types.nprim) return -EINVAL; p->sym_val_to_name[SYM_TYPES][typdatum->value - 1] = key; - p->type_val_to_struct_array[typdatum->value - 1] = typdatum; + p->type_val_to_struct[typdatum->value - 1] = typdatum; } return 0; @@ -732,10 +732,10 @@ static int policydb_index(struct policydb *p) if (!p->user_val_to_struct) return -ENOMEM; - p->type_val_to_struct_array = kvcalloc(p->p_types.nprim, - sizeof(*p->type_val_to_struct_array), - GFP_KERNEL); - if (!p->type_val_to_struct_array) + p->type_val_to_struct = kvcalloc(p->p_types.nprim, + sizeof(*p->type_val_to_struct), + GFP_KERNEL); + if (!p->type_val_to_struct) return -ENOMEM; rc = cond_init_bool_indexes(p); @@ -781,7 +781,7 @@ void policydb_destroy(struct policydb *p) kfree(p->class_val_to_struct); kfree(p->role_val_to_struct); kfree(p->user_val_to_struct); - kvfree(p->type_val_to_struct_array); + kvfree(p->type_val_to_struct); avtab_destroy(&p->te_avtab); @@ -1726,7 +1726,7 @@ static int type_bounds_sanity_check(void *key, void *datum, void *datap) return -EINVAL; } - upper = p->type_val_to_struct_array[upper->bounds - 1]; + upper = p->type_val_to_struct[upper->bounds - 1]; BUG_ON(!upper); if (upper->attribute) { diff --git a/security/selinux/ss/policydb.h b/security/selinux/ss/policydb.h index 27039149ff0a..05fc672831aa 100644 --- a/security/selinux/ss/policydb.h +++ b/security/selinux/ss/policydb.h @@ -255,7 +255,7 @@ struct policydb { struct class_datum **class_val_to_struct; struct role_datum **role_val_to_struct; struct user_datum **user_val_to_struct; - struct type_datum **type_val_to_struct_array; + struct type_datum **type_val_to_struct; /* type enforcement access vectors and transitions */ struct avtab te_avtab; diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index d3a8f6fbc552..ca56cd045bf9 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -544,13 +544,13 @@ static void type_attribute_bounds_av(struct policydb *policydb, struct type_datum *target; u32 masked = 0; - source = policydb->type_val_to_struct_array[scontext->type - 1]; + source = policydb->type_val_to_struct[scontext->type - 1]; BUG_ON(!source); if (!source->bounds) return; - target = policydb->type_val_to_struct_array[tcontext->type - 1]; + target = policydb->type_val_to_struct[tcontext->type - 1]; BUG_ON(!target); memset(&lo_avd, 0, sizeof(lo_avd)); @@ -893,7 +893,7 @@ int security_bounded_transition(struct selinux_state *state, index = new_context->type; while (true) { - type = policydb->type_val_to_struct_array[index - 1]; + type = policydb->type_val_to_struct[index - 1]; BUG_ON(!type); /* not bounded anymore */ From 9b80c36353ed4cce324af21244a65984db21991b Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Thu, 1 Aug 2019 17:55:06 -0400 Subject: [PATCH 037/721] selinux: always return a secid from the network caches if we find one Previously if we couldn't find an entry in the cache and we failed to allocate memory for a new cache entry we would fail the network object label lookup; this is obviously not ideal. This patch fixes this so that we return the object label even if we can't cache the object at this point in time due to memory pressure. The GitHub issue tracker is below: * https://github.com/SELinuxProject/selinux-kernel/issues/3 Signed-off-by: Paul Moore --- security/selinux/netif.c | 31 +++++++++++++------------------ security/selinux/netnode.c | 30 ++++++++++++++---------------- security/selinux/netport.c | 24 +++++++++++------------- 3 files changed, 38 insertions(+), 47 deletions(-) diff --git a/security/selinux/netif.c b/security/selinux/netif.c index 8c738c189942..cbe6ec246412 100644 --- a/security/selinux/netif.c +++ b/security/selinux/netif.c @@ -135,9 +135,9 @@ static void sel_netif_destroy(struct sel_netif *netif) */ static int sel_netif_sid_slow(struct net *ns, int ifindex, u32 *sid) { - int ret; + int ret = 0; struct sel_netif *netif; - struct sel_netif *new = NULL; + struct sel_netif *new; struct net_device *dev; /* NOTE: we always use init's network namespace since we don't @@ -154,32 +154,27 @@ static int sel_netif_sid_slow(struct net *ns, int ifindex, u32 *sid) netif = sel_netif_find(ns, ifindex); if (netif != NULL) { *sid = netif->nsec.sid; - ret = 0; goto out; } + + ret = security_netif_sid(&selinux_state, dev->name, sid); + if (ret != 0) + goto out; new = kzalloc(sizeof(*new), GFP_ATOMIC); - if (new == NULL) { - ret = -ENOMEM; - goto out; + if (new) { + new->nsec.ns = ns; + new->nsec.ifindex = ifindex; + new->nsec.sid = *sid; + if (sel_netif_insert(new)) + kfree(new); } - ret = security_netif_sid(&selinux_state, dev->name, &new->nsec.sid); - if (ret != 0) - goto out; - new->nsec.ns = ns; - new->nsec.ifindex = ifindex; - ret = sel_netif_insert(new); - if (ret != 0) - goto out; - *sid = new->nsec.sid; out: spin_unlock_bh(&sel_netif_lock); dev_put(dev); - if (unlikely(ret)) { + if (unlikely(ret)) pr_warn("SELinux: failure in %s(), unable to determine network interface label (%d)\n", __func__, ifindex); - kfree(new); - } return ret; } diff --git a/security/selinux/netnode.c b/security/selinux/netnode.c index afa0d432436b..df590aaa33f4 100644 --- a/security/selinux/netnode.c +++ b/security/selinux/netnode.c @@ -199,9 +199,9 @@ static void sel_netnode_insert(struct sel_netnode *node) */ static int sel_netnode_sid_slow(void *addr, u16 family, u32 *sid) { - int ret = -ENOMEM; + int ret; struct sel_netnode *node; - struct sel_netnode *new = NULL; + struct sel_netnode *new; spin_lock_bh(&sel_netnode_lock); node = sel_netnode_find(addr, family); @@ -210,38 +210,36 @@ static int sel_netnode_sid_slow(void *addr, u16 family, u32 *sid) spin_unlock_bh(&sel_netnode_lock); return 0; } + new = kzalloc(sizeof(*new), GFP_ATOMIC); - if (new == NULL) - goto out; switch (family) { case PF_INET: ret = security_node_sid(&selinux_state, PF_INET, addr, sizeof(struct in_addr), sid); - new->nsec.addr.ipv4 = *(__be32 *)addr; + if (new) + new->nsec.addr.ipv4 = *(__be32 *)addr; break; case PF_INET6: ret = security_node_sid(&selinux_state, PF_INET6, addr, sizeof(struct in6_addr), sid); - new->nsec.addr.ipv6 = *(struct in6_addr *)addr; + if (new) + new->nsec.addr.ipv6 = *(struct in6_addr *)addr; break; default: BUG(); ret = -EINVAL; } - if (ret != 0) - goto out; + if (ret == 0 && new) { + new->nsec.family = family; + new->nsec.sid = *sid; + sel_netnode_insert(new); + } else + kfree(new); - new->nsec.family = family; - new->nsec.sid = *sid; - sel_netnode_insert(new); - -out: spin_unlock_bh(&sel_netnode_lock); - if (unlikely(ret)) { + if (unlikely(ret)) pr_warn("SELinux: failure in %s(), unable to determine network node label\n", __func__); - kfree(new); - } return ret; } diff --git a/security/selinux/netport.c b/security/selinux/netport.c index 7a141cadbffc..936d630a938d 100644 --- a/security/selinux/netport.c +++ b/security/selinux/netport.c @@ -147,9 +147,9 @@ static void sel_netport_insert(struct sel_netport *port) */ static int sel_netport_sid_slow(u8 protocol, u16 pnum, u32 *sid) { - int ret = -ENOMEM; + int ret; struct sel_netport *port; - struct sel_netport *new = NULL; + struct sel_netport *new; spin_lock_bh(&sel_netport_lock); port = sel_netport_find(protocol, pnum); @@ -158,25 +158,23 @@ static int sel_netport_sid_slow(u8 protocol, u16 pnum, u32 *sid) spin_unlock_bh(&sel_netport_lock); return 0; } - new = kzalloc(sizeof(*new), GFP_ATOMIC); - if (new == NULL) - goto out; + ret = security_port_sid(&selinux_state, protocol, pnum, sid); if (ret != 0) goto out; - - new->psec.port = pnum; - new->psec.protocol = protocol; - new->psec.sid = *sid; - sel_netport_insert(new); + new = kzalloc(sizeof(*new), GFP_ATOMIC); + if (new) { + new->psec.port = pnum; + new->psec.protocol = protocol; + new->psec.sid = *sid; + sel_netport_insert(new); + } out: spin_unlock_bh(&sel_netport_lock); - if (unlikely(ret)) { + if (unlikely(ret)) pr_warn("SELinux: failure in %s(), unable to determine network port label\n", __func__); - kfree(new); - } return ret; } From f29b7f39c0acbc66f18309227719460adab6851d Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 6 Aug 2019 22:17:51 +0200 Subject: [PATCH 038/721] Revert "dt-bindings: i2c: rcar: Rename bindings documentation file" This reverts commit d13ed84b195cc6e5789b446f07aede357939f7ad. I overlooked that the Rev-by tag was given in advance assuming a fix was made for the next revision. Signed-off-by: Wolfram Sang --- .../devicetree/bindings/i2c/{renesas,i2c.txt => i2c-rcar.txt} | 0 MAINTAINERS | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename Documentation/devicetree/bindings/i2c/{renesas,i2c.txt => i2c-rcar.txt} (100%) diff --git a/Documentation/devicetree/bindings/i2c/renesas,i2c.txt b/Documentation/devicetree/bindings/i2c/i2c-rcar.txt similarity index 100% rename from Documentation/devicetree/bindings/i2c/renesas,i2c.txt rename to Documentation/devicetree/bindings/i2c/i2c-rcar.txt diff --git a/MAINTAINERS b/MAINTAINERS index f89f27a2d09a..119d2542c897 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13650,7 +13650,7 @@ F: drivers/iio/adc/rcar-gyroadc.c RENESAS R-CAR I2C DRIVERS M: Wolfram Sang S: Supported -F: Documentation/devicetree/bindings/i2c/renesas,i2c.txt +F: Documentation/devicetree/bindings/i2c/i2c-rcar.txt F: Documentation/devicetree/bindings/i2c/renesas,iic.txt F: drivers/i2c/busses/i2c-rcar.c F: drivers/i2c/busses/i2c-sh_mobile.c From e611ee0b3b06b4d28216567a251e4de46e595b5d Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 6 Aug 2019 22:19:32 +0200 Subject: [PATCH 039/721] Revert "dt-bindings: i2c: riic: Rename bindings documentation file" This reverts commit 684ca71259a69c5a3019da72fe718bf983841926. I overlooked that the Rev-by tag was given in advance assuming a fix was made for the next revision. Signed-off-by: Wolfram Sang --- .../bindings/i2c/{renesas,iic-emev2.txt => i2c-emev2.txt} | 0 MAINTAINERS | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename Documentation/devicetree/bindings/i2c/{renesas,iic-emev2.txt => i2c-emev2.txt} (100%) diff --git a/Documentation/devicetree/bindings/i2c/renesas,iic-emev2.txt b/Documentation/devicetree/bindings/i2c/i2c-emev2.txt similarity index 100% rename from Documentation/devicetree/bindings/i2c/renesas,iic-emev2.txt rename to Documentation/devicetree/bindings/i2c/i2c-emev2.txt diff --git a/MAINTAINERS b/MAINTAINERS index 119d2542c897..b49db5ad0b64 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13628,7 +13628,7 @@ F: drivers/clk/renesas/ RENESAS EMEV2 I2C DRIVER M: Wolfram Sang S: Supported -F: Documentation/devicetree/bindings/i2c/renesas,iic-emev2.txt +F: Documentation/devicetree/bindings/i2c/i2c-emev2.txt F: drivers/i2c/busses/i2c-emev2.c RENESAS ETHERNET DRIVERS From f91b2ab0e0c5f7cb4b11012748a184b91a396f0f Mon Sep 17 00:00:00 2001 From: Shaokun Zhang Date: Mon, 5 Aug 2019 17:31:08 +0800 Subject: [PATCH 040/721] i2c: designware: Fix unused variable warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit drivers/i2c/busses/i2c-designware-master.c: In function ‘i2c_dw_init_recovery_info’: drivers/i2c/busses/i2c-designware-master.c:658:6: warning: unused variable ‘r’ [-Wunused-variable] int r; ^ Fixes: 33eb09a02e8d ("i2c: designware: make use of devm_gpiod_get_optional") Signed-off-by: Shaokun Zhang Acked-by: Jarkko Nikula Reviewed-by: Andy Shevchenko Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-designware-master.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-designware-master.c b/drivers/i2c/busses/i2c-designware-master.c index 867787dade43..e8b328242256 100644 --- a/drivers/i2c/busses/i2c-designware-master.c +++ b/drivers/i2c/busses/i2c-designware-master.c @@ -655,7 +655,6 @@ static int i2c_dw_init_recovery_info(struct dw_i2c_dev *dev) struct i2c_bus_recovery_info *rinfo = &dev->rinfo; struct i2c_adapter *adap = &dev->adapter; struct gpio_desc *gpio; - int r; gpio = devm_gpiod_get_optional(dev->dev, "scl", GPIOD_OUT_HIGH); if (IS_ERR_OR_NULL(gpio)) From 3e99834cc0c7f0612dc790ad7342f18c375d285b Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 26 Jul 2019 16:14:21 +0300 Subject: [PATCH 041/721] i2c: Drop unneeded check for of_node of_find_property() will return NULL if of_node is NULL, thus of_irq_get_by_name() returns -EINVAL which we ignore, so no need to double check. Signed-off-by: Andy Shevchenko Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-core-base.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/i2c/i2c-core-base.c b/drivers/i2c/i2c-core-base.c index f26ed495d384..b4d84bd475da 100644 --- a/drivers/i2c/i2c-core-base.c +++ b/drivers/i2c/i2c-core-base.c @@ -350,13 +350,11 @@ static int i2c_device_probe(struct device *dev) return -ENODEV; if (client->flags & I2C_CLIENT_WAKE) { - int wakeirq = -ENOENT; + int wakeirq; - if (dev->of_node) { - wakeirq = of_irq_get_byname(dev->of_node, "wakeup"); - if (wakeirq == -EPROBE_DEFER) - return wakeirq; - } + wakeirq = of_irq_get_byname(dev->of_node, "wakeup"); + if (wakeirq == -EPROBE_DEFER) + return wakeirq; device_init_wakeup(&client->dev, true); From 4d7802aa434a39921c699683c4ca2ad9068b0033 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 25 Jul 2019 15:56:16 +0800 Subject: [PATCH 042/721] i2c: sprd: Make I2C driver can be built as a module Now there is no need to keep our I2C driver to be initialized so early, thus changing to module level and let it can be built as a module, meanwhile adding some module information. Signed-off-by: Baolin Wang Signed-off-by: Wolfram Sang --- drivers/i2c/busses/Kconfig | 2 +- drivers/i2c/busses/i2c-sprd.c | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index 09367fc014c3..69f19312a574 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -977,7 +977,7 @@ config I2C_SIRF will be called i2c-sirf. config I2C_SPRD - bool "Spreadtrum I2C interface" + tristate "Spreadtrum I2C interface" depends on I2C=y && ARCH_SPRD help If you say yes to this option, support will be included for the diff --git a/drivers/i2c/busses/i2c-sprd.c b/drivers/i2c/busses/i2c-sprd.c index 961123529678..8002835d1543 100644 --- a/drivers/i2c/busses/i2c-sprd.c +++ b/drivers/i2c/busses/i2c-sprd.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -644,8 +645,7 @@ static struct platform_driver sprd_i2c_driver = { }, }; -static int sprd_i2c_init(void) -{ - return platform_driver_register(&sprd_i2c_driver); -} -arch_initcall_sync(sprd_i2c_init); +module_platform_driver(sprd_i2c_driver); + +MODULE_DESCRIPTION("Spreadtrum I2C master controller driver"); +MODULE_LICENSE("GPL v2"); From 3c2588fab65fb083873c6c1e74a1be54345615eb Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 25 Jul 2019 15:56:17 +0800 Subject: [PATCH 043/721] i2c: sprd: Change to use devm_platform_ioremap_resource() Use the new helper that wraps the calls to platform_get_resource() and devm_ioremap_resource() together. Signed-off-by: Baolin Wang Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-sprd.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/i2c/busses/i2c-sprd.c b/drivers/i2c/busses/i2c-sprd.c index 8002835d1543..bbcb0569522d 100644 --- a/drivers/i2c/busses/i2c-sprd.c +++ b/drivers/i2c/busses/i2c-sprd.c @@ -478,7 +478,6 @@ static int sprd_i2c_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; struct sprd_i2c *i2c_dev; - struct resource *res; u32 prop; int ret; @@ -488,8 +487,7 @@ static int sprd_i2c_probe(struct platform_device *pdev) if (!i2c_dev) return -ENOMEM; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - i2c_dev->base = devm_ioremap_resource(dev, res); + i2c_dev->base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(i2c_dev->base)) return PTR_ERR(i2c_dev->base); From bbeb6b6c07960b8d33bed5352ef462228110c5ab Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 25 Jul 2019 15:56:18 +0800 Subject: [PATCH 044/721] i2c: sprd: Validate the return value of clock initialization The 'enable' clock of I2C master is required, we should return an error if failed to get the 'enable' clock, to make sure the I2C driver can be defer probe if the clock resource is not ready. Signed-off-by: Baolin Wang Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-sprd.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/i2c/busses/i2c-sprd.c b/drivers/i2c/busses/i2c-sprd.c index bbcb0569522d..b432e7580458 100644 --- a/drivers/i2c/busses/i2c-sprd.c +++ b/drivers/i2c/busses/i2c-sprd.c @@ -466,9 +466,9 @@ static int sprd_i2c_clk_init(struct sprd_i2c *i2c_dev) i2c_dev->clk = devm_clk_get(i2c_dev->dev, "enable"); if (IS_ERR(i2c_dev->clk)) { - dev_warn(i2c_dev->dev, "i2c%d can't get the enable clock\n", - i2c_dev->adap.nr); - i2c_dev->clk = NULL; + dev_err(i2c_dev->dev, "i2c%d can't get the enable clock\n", + i2c_dev->adap.nr); + return PTR_ERR(i2c_dev->clk); } return 0; @@ -519,7 +519,10 @@ static int sprd_i2c_probe(struct platform_device *pdev) if (i2c_dev->bus_freq != 100000 && i2c_dev->bus_freq != 400000) return -EINVAL; - sprd_i2c_clk_init(i2c_dev); + ret = sprd_i2c_clk_init(i2c_dev); + if (ret) + return ret; + platform_set_drvdata(pdev, i2c_dev); ret = clk_prepare_enable(i2c_dev->clk); From f4c737d6194f509733641017b50548abc01f4b0d Mon Sep 17 00:00:00 2001 From: Jianjun Wang Date: Fri, 28 Jun 2019 15:34:24 +0800 Subject: [PATCH 045/721] dt-bindings: PCI: Add support for MT7629 MT7629 is an ARM based platform SoC integrating the same PCIe IP as MT7622, add a binding for it. Signed-off-by: Jianjun Wang Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring Acked-by: Ryder Lee --- Documentation/devicetree/bindings/pci/mediatek-pcie.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/pci/mediatek-pcie.txt b/Documentation/devicetree/bindings/pci/mediatek-pcie.txt index 92437a366e5f..7468d666763a 100644 --- a/Documentation/devicetree/bindings/pci/mediatek-pcie.txt +++ b/Documentation/devicetree/bindings/pci/mediatek-pcie.txt @@ -6,6 +6,7 @@ Required properties: "mediatek,mt2712-pcie" "mediatek,mt7622-pcie" "mediatek,mt7623-pcie" + "mediatek,mt7629-pcie" - device_type: Must be "pci" - reg: Base addresses and lengths of the PCIe subsys and root ports. - reg-names: Names of the above areas to use during resource lookup. From 0cccd42e6193e168cbecc271dae464e4a53fd7b3 Mon Sep 17 00:00:00 2001 From: Jianjun Wang Date: Fri, 28 Jun 2019 15:34:25 +0800 Subject: [PATCH 046/721] PCI: mediatek: Add controller support for MT7629 MT7629 is an ARM platform SoC which has the same PCIe IP as MT7622. The HW default value of its PCI host controller Device ID is invalid, fix it to match the hardware implementation. Signed-off-by: Jianjun Wang [lorenzo.pieralisi@arm.com: commit log/minor spelling update] Signed-off-by: Lorenzo Pieralisi Reviewed-by: Andrew Murray Acked-by: Ryder Lee --- drivers/pci/controller/pcie-mediatek.c | 18 ++++++++++++++++++ include/linux/pci_ids.h | 1 + 2 files changed, 19 insertions(+) diff --git a/drivers/pci/controller/pcie-mediatek.c b/drivers/pci/controller/pcie-mediatek.c index 80601e1b939e..3eaa7081ab2a 100644 --- a/drivers/pci/controller/pcie-mediatek.c +++ b/drivers/pci/controller/pcie-mediatek.c @@ -73,6 +73,7 @@ #define PCIE_MSI_VECTOR 0x0c0 #define PCIE_CONF_VEND_ID 0x100 +#define PCIE_CONF_DEVICE_ID 0x102 #define PCIE_CONF_CLASS_ID 0x106 #define PCIE_INT_MASK 0x420 @@ -141,12 +142,16 @@ struct mtk_pcie_port; /** * struct mtk_pcie_soc - differentiate between host generations * @need_fix_class_id: whether this host's class ID needed to be fixed or not + * @need_fix_device_id: whether this host's device ID needed to be fixed or not + * @device_id: device ID which this host need to be fixed * @ops: pointer to configuration access functions * @startup: pointer to controller setting functions * @setup_irq: pointer to initialize IRQ functions */ struct mtk_pcie_soc { bool need_fix_class_id; + bool need_fix_device_id; + unsigned int device_id; struct pci_ops *ops; int (*startup)(struct mtk_pcie_port *port); int (*setup_irq)(struct mtk_pcie_port *port, struct device_node *node); @@ -696,6 +701,9 @@ static int mtk_pcie_startup_port_v2(struct mtk_pcie_port *port) writew(val, port->base + PCIE_CONF_CLASS_ID); } + if (soc->need_fix_device_id) + writew(soc->device_id, port->base + PCIE_CONF_DEVICE_ID); + /* 100ms timeout value should be enough for Gen1/2 training */ err = readl_poll_timeout(port->base + PCIE_LINK_STATUS_V2, val, !!(val & PCIE_PORT_LINKUP_V2), 20, @@ -1216,11 +1224,21 @@ static const struct mtk_pcie_soc mtk_pcie_soc_mt7622 = { .setup_irq = mtk_pcie_setup_irq, }; +static const struct mtk_pcie_soc mtk_pcie_soc_mt7629 = { + .need_fix_class_id = true, + .need_fix_device_id = true, + .device_id = PCI_DEVICE_ID_MEDIATEK_7629, + .ops = &mtk_pcie_ops_v2, + .startup = mtk_pcie_startup_port_v2, + .setup_irq = mtk_pcie_setup_irq, +}; + static const struct of_device_id mtk_pcie_ids[] = { { .compatible = "mediatek,mt2701-pcie", .data = &mtk_pcie_soc_v1 }, { .compatible = "mediatek,mt7623-pcie", .data = &mtk_pcie_soc_v1 }, { .compatible = "mediatek,mt2712-pcie", .data = &mtk_pcie_soc_mt2712 }, { .compatible = "mediatek,mt7622-pcie", .data = &mtk_pcie_soc_mt7622 }, + { .compatible = "mediatek,mt7629-pcie", .data = &mtk_pcie_soc_mt7629 }, {}, }; diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index c842735a4f45..f59a6f98900c 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2132,6 +2132,7 @@ #define PCI_VENDOR_ID_MYRICOM 0x14c1 #define PCI_VENDOR_ID_MEDIATEK 0x14c3 +#define PCI_DEVICE_ID_MEDIATEK_7629 0x7629 #define PCI_VENDOR_ID_TITAN 0x14D2 #define PCI_DEVICE_ID_TITAN_010L 0x8001 From b8074aa2460b535915e8f65bf83c4bcb4220f804 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Mon, 29 Jul 2019 13:13:57 +0300 Subject: [PATCH 047/721] PCI: Convert pci_resource_to_user() to a weak function Convert pci_resource_to_user() to a weak function so the existing architecture-specific implementations will automatically override the generic one. This allows us to remove HAVE_ARCH_PCI_RESOURCE_TO_USER definitions and avoid the conditional compilation for this single function. Link: https://lore.kernel.org/r/20190729101401.28068-1-efremov@linux.com Link: https://lore.kernel.org/r/20190729101401.28068-2-efremov@linux.com Link: https://lore.kernel.org/r/20190729101401.28068-3-efremov@linux.com Link: https://lore.kernel.org/r/20190729101401.28068-4-efremov@linux.com Link: https://lore.kernel.org/r/20190729101401.28068-5-efremov@linux.com Link: https://lore.kernel.org/r/20190729101401.28068-6-efremov@linux.com Signed-off-by: Denis Efremov [bhelgaas: squash into one commit] Signed-off-by: Bjorn Helgaas Acked-by: Paul Burton # MIPS --- arch/microblaze/include/asm/pci.h | 2 -- arch/mips/include/asm/pci.h | 1 - arch/powerpc/include/asm/pci.h | 2 -- arch/sparc/include/asm/pci.h | 2 -- drivers/pci/pci.c | 12 ++++++++++++ include/linux/pci.h | 16 ---------------- 6 files changed, 12 insertions(+), 23 deletions(-) diff --git a/arch/microblaze/include/asm/pci.h b/arch/microblaze/include/asm/pci.h index 21ddba9188b2..7c4dc5d85f53 100644 --- a/arch/microblaze/include/asm/pci.h +++ b/arch/microblaze/include/asm/pci.h @@ -66,8 +66,6 @@ extern pgprot_t pci_phys_mem_access_prot(struct file *file, unsigned long size, pgprot_t prot); -#define HAVE_ARCH_PCI_RESOURCE_TO_USER - /* This part of code was originally in xilinx-pci.h */ #ifdef CONFIG_PCI_XILINX extern void __init xilinx_pci_init(void); diff --git a/arch/mips/include/asm/pci.h b/arch/mips/include/asm/pci.h index 436099883022..6f48649201c5 100644 --- a/arch/mips/include/asm/pci.h +++ b/arch/mips/include/asm/pci.h @@ -108,7 +108,6 @@ extern unsigned long PCIBIOS_MIN_MEM; #define HAVE_PCI_MMAP #define ARCH_GENERIC_PCI_MMAP_RESOURCE -#define HAVE_ARCH_PCI_RESOURCE_TO_USER /* * Dynamic DMA mapping stuff. diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h index 2372d35533ad..327567b8f7d6 100644 --- a/arch/powerpc/include/asm/pci.h +++ b/arch/powerpc/include/asm/pci.h @@ -112,8 +112,6 @@ extern pgprot_t pci_phys_mem_access_prot(struct file *file, unsigned long size, pgprot_t prot); -#define HAVE_ARCH_PCI_RESOURCE_TO_USER - extern resource_size_t pcibios_io_space_offset(struct pci_controller *hose); extern void pcibios_setup_bus_devices(struct pci_bus *bus); extern void pcibios_setup_bus_self(struct pci_bus *bus); diff --git a/arch/sparc/include/asm/pci.h b/arch/sparc/include/asm/pci.h index cfec79bb1831..4deddf430e5d 100644 --- a/arch/sparc/include/asm/pci.h +++ b/arch/sparc/include/asm/pci.h @@ -38,8 +38,6 @@ static inline int pci_proc_domain(struct pci_bus *bus) #define arch_can_pci_mmap_io() 1 #define HAVE_ARCH_PCI_GET_UNMAPPED_AREA #define get_pci_unmapped_area get_fb_unmapped_area - -#define HAVE_ARCH_PCI_RESOURCE_TO_USER #endif /* CONFIG_SPARC64 */ #if defined(CONFIG_SPARC64) || defined(CONFIG_LEON_PCI) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 29ed5ec1ac27..da3241bb4479 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -5932,6 +5932,18 @@ resource_size_t __weak pcibios_default_alignment(void) return 0; } +/* + * Arches that don't want to expose struct resource to userland as-is in + * sysfs and /proc can implement their own pci_resource_to_user(). + */ +void __weak pci_resource_to_user(const struct pci_dev *dev, int bar, + const struct resource *rsrc, + resource_size_t *start, resource_size_t *end) +{ + *start = rsrc->start; + *end = rsrc->end; +} + #define RESOURCE_ALIGNMENT_PARAM_SIZE COMMAND_LINE_SIZE static char resource_alignment_param[RESOURCE_ALIGNMENT_PARAM_SIZE] = {0}; static DEFINE_SPINLOCK(resource_alignment_lock); diff --git a/include/linux/pci.h b/include/linux/pci.h index 9e700d9f9f28..dcc5dd310c14 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1870,25 +1870,9 @@ static inline const char *pci_name(const struct pci_dev *pdev) return dev_name(&pdev->dev); } - -/* - * Some archs don't want to expose struct resource to userland as-is - * in sysfs and /proc - */ -#ifdef HAVE_ARCH_PCI_RESOURCE_TO_USER void pci_resource_to_user(const struct pci_dev *dev, int bar, const struct resource *rsrc, resource_size_t *start, resource_size_t *end); -#else -static inline void pci_resource_to_user(const struct pci_dev *dev, int bar, - const struct resource *rsrc, resource_size_t *start, - resource_size_t *end) -{ - *start = rsrc->start; - *end = rsrc->end; -} -#endif /* HAVE_ARCH_PCI_RESOURCE_TO_USER */ - /* * The world is not perfect and supplies us with broken PCI devices. From 39098edbd79e5c9a4357eb924cb259d1c8a11346 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Tue, 6 Aug 2019 17:07:15 +0300 Subject: [PATCH 048/721] PCI: Use PCI_SRIOV_NUM_BARS in loops instead of PCI_IOV_RESOURCE_END Writing loop conditions as "i < NUM" is a common C idiom; using "i <= END" is unusual and thus prone to errors. Change loops to use the former. Link: https://lore.kernel.org/r/20190806140715.19847-1-efremov@linux.com Signed-off-by: Denis Efremov Signed-off-by: Bjorn Helgaas Reviewed-by: Kuppuswamy Sathyanarayanan --- drivers/pci/iov.c | 4 ++-- drivers/pci/setup-bus.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c index 525fd3f272b3..9b48818ced01 100644 --- a/drivers/pci/iov.c +++ b/drivers/pci/iov.c @@ -557,8 +557,8 @@ static void sriov_restore_state(struct pci_dev *dev) ctrl |= iov->ctrl & PCI_SRIOV_CTRL_ARI; pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, ctrl); - for (i = PCI_IOV_RESOURCES; i <= PCI_IOV_RESOURCE_END; i++) - pci_update_resource(dev, i); + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) + pci_update_resource(dev, i + PCI_IOV_RESOURCES); pci_write_config_dword(dev, iov->pos + PCI_SRIOV_SYS_PGSIZE, iov->pgsz); pci_iov_set_numvfs(dev, iov->num_VFs); diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 79b1fa6519be..e7dbe21705ba 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -1662,8 +1662,8 @@ static int iov_resources_unassigned(struct pci_dev *dev, void *data) int i; bool *unassigned = data; - for (i = PCI_IOV_RESOURCES; i <= PCI_IOV_RESOURCE_END; i++) { - struct resource *r = &dev->resource[i]; + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + struct resource *r = &dev->resource[i + PCI_IOV_RESOURCES]; struct pci_bus_region region; /* Not assigned or rejected by kernel? */ From d2182b2d4b71ff0549a07f414d921525fade707b Mon Sep 17 00:00:00 2001 From: Sumit Saxena Date: Fri, 26 Jul 2019 00:55:52 +0530 Subject: [PATCH 049/721] PCI: Restore Resizable BAR size bits correctly for 1MB BARs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In a Resizable BAR Control Register, bits 13:8 control the size of the BAR. The encoded values of these bits are as follows (see PCIe r5.0, sec 7.8.6.3): Value BAR size 0 1 MB (2^20 bytes) 1 2 MB (2^21 bytes) 2 4 MB (2^22 bytes) ... 43 8 EB (2^63 bytes) Previously we incorrectly set the BAR size bits for a 1 MB BAR to 0x1f instead of 0, so devices that support that size, e.g., new megaraid_sas and mpt3sas adapters, fail to initialize during resume from S3 sleep. Correctly calculate the BAR size bits for Resizable BAR control registers. Link: https://lore.kernel.org/r/20190725192552.24295-1-sumit.saxena@broadcom.com Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=203939 Fixes: d3252ace0bc6 ("PCI: Restore resized BAR state on resume") Signed-off-by: Sumit Saxena Signed-off-by: Bjorn Helgaas Reviewed-by: Christian König Cc: stable@vger.kernel.org # v4.19+ --- drivers/pci/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index da3241bb4479..5836eb576d96 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -1438,7 +1438,7 @@ static void pci_restore_rebar_state(struct pci_dev *pdev) pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl); bar_idx = ctrl & PCI_REBAR_CTRL_BAR_IDX; res = pdev->resource + bar_idx; - size = order_base_2((resource_size(res) >> 20) | 1) - 1; + size = ilog2(resource_size(res)) - 20; ctrl &= ~PCI_REBAR_CTRL_BAR_SIZE; ctrl |= size << PCI_REBAR_CTRL_BAR_SHIFT; pci_write_config_dword(pdev, pos + PCI_REBAR_CTRL, ctrl); From 3b1b1ce3596458de0d5c28309954886a9cb6d5cc Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 6 Jun 2019 13:25:57 +1000 Subject: [PATCH 050/721] PCI: Correct pci=resource_alignment parameter example The "pci=resource_alignment" parameter is described as requiring an order (not a size) and the code in pci_specified_resource_alignment() expects an order. But the example wrongly shows a size. Convert the example to an order. Fixes: 8b078c603249 ("PCI: Update "pci=resource_alignment" documentation") Link: https://lore.kernel.org/r/20190606032557.107542-1-aik@ozlabs.ru Signed-off-by: Alexey Kardashevskiy Signed-off-by: Bjorn Helgaas --- Documentation/admin-guide/kernel-parameters.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 46b826fcb5ad..7d407bdded99 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3452,12 +3452,13 @@ specify the device is described above. If is not specified, PAGE_SIZE is used as alignment. - PCI-PCI bridge can be specified, if resource + A PCI-PCI bridge can be specified if resource windows need to be expanded. To specify the alignment for several instances of a device, the PCI vendor, device, subvendor, and subdevice may be - specified, e.g., 4096@pci:8086:9c22:103c:198f + specified, e.g., 12@pci:8086:9c22:103c:198f + for 4096-byte alignment. ecrc= Enable/disable PCIe ECRC (transaction layer end-to-end CRC checking). bios: Use BIOS/firmware settings. This is the From 5a2e340690f2908371eb9c1b4cfb82ba0b235b51 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 1 Aug 2019 20:22:48 -0500 Subject: [PATCH 051/721] PCI: Mark expected switch fall-through MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mark switch cases where we are expecting to fall through. This fixes the following warning (Building: allmodconfig i386): drivers/pci/hotplug/ibmphp_res.c: In function ‘update_bridge_ranges’: drivers/pci/hotplug/ibmphp_res.c:1943:16: warning: this statement may fall through [-Wimplicit-fallthrough=] function = 0x8; ~~~~~~~~~^~~~~ drivers/pci/hotplug/ibmphp_res.c:1944:6: note: here case PCI_HEADER_TYPE_MULTIBRIDGE: ^~~~ Link: https://lore.kernel.org/r/20190802012248.GA22622@embeddedor Signed-off-by: Gustavo A. R. Silva Signed-off-by: Bjorn Helgaas Reviewed-by: Kees Cook --- drivers/pci/hotplug/ibmphp_res.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pci/hotplug/ibmphp_res.c b/drivers/pci/hotplug/ibmphp_res.c index 5e8caf7a4452..5c93aa14f0de 100644 --- a/drivers/pci/hotplug/ibmphp_res.c +++ b/drivers/pci/hotplug/ibmphp_res.c @@ -1941,6 +1941,7 @@ static int __init update_bridge_ranges(struct bus_node **bus) break; case PCI_HEADER_TYPE_BRIDGE: function = 0x8; + /* fall through */ case PCI_HEADER_TYPE_MULTIBRIDGE: /* We assume here that only 1 bus behind the bridge TO DO: add functionality for several: From 2a9af0273c1cd6647df4f6475b45d823494bf13a Mon Sep 17 00:00:00 2001 From: Wesley Terpstra Date: Thu, 25 Jul 2019 14:28:07 -0700 Subject: [PATCH 052/721] PCI/MSI: Enable PCI_MSI_IRQ_DOMAIN support for RISC-V Add RISC-V as an arch that supports PCI_MSI_IRQ_DOMAIN. The related change to generate asm/msi.h is 251a44888183 ("riscv: include generic support for MSI irqdomains"). Link: https://lore.kernel.org/r/alpine.DEB.2.21.9999.1907251426450.32766@viisi.sifive.com Signed-off-by: Wesley Terpstra [paul.walmsley@sifive.com: wrote patch description; split this patch from the arch/riscv patch] Signed-off-by: Paul Walmsley Signed-off-by: Bjorn Helgaas Reviewed-by: Bin Meng --- drivers/pci/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig index 2ab92409210a..beb3408a0272 100644 --- a/drivers/pci/Kconfig +++ b/drivers/pci/Kconfig @@ -52,7 +52,7 @@ config PCI_MSI If you don't know what to do here, say Y. config PCI_MSI_IRQ_DOMAIN - def_bool ARC || ARM || ARM64 || X86 + def_bool ARC || ARM || ARM64 || X86 || RISCV depends on PCI_MSI select GENERIC_MSI_IRQ_DOMAIN From efecc3b531a30004d0993907897ab977747af108 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 6 Jul 2019 18:47:20 +0200 Subject: [PATCH 053/721] mfd: ab3100: No need to check return value of debugfs_create functions When calling debugfs functions, there is no need to ever check the return value. The function can work or not, but the code logic should never do something different based on this. Signed-off-by: Greg Kroah-Hartman Reviewed-by: Linus Walleij Signed-off-by: Lee Jones --- drivers/mfd/ab3100-core.c | 45 ++++++--------------------------------- drivers/mfd/ab3100-otp.c | 21 ++++++------------ 2 files changed, 13 insertions(+), 53 deletions(-) diff --git a/drivers/mfd/ab3100-core.c b/drivers/mfd/ab3100-core.c index e350ab64238e..9f3dbc31d3e9 100644 --- a/drivers/mfd/ab3100-core.c +++ b/drivers/mfd/ab3100-core.c @@ -575,58 +575,27 @@ static const struct file_operations ab3100_get_set_reg_fops = { .llseek = noop_llseek, }; -static struct dentry *ab3100_dir; -static struct dentry *ab3100_reg_file; static struct ab3100_get_set_reg_priv ab3100_get_priv; -static struct dentry *ab3100_get_reg_file; static struct ab3100_get_set_reg_priv ab3100_set_priv; -static struct dentry *ab3100_set_reg_file; static void ab3100_setup_debugfs(struct ab3100 *ab3100) { - int err; + struct dentry *ab3100_dir; ab3100_dir = debugfs_create_dir("ab3100", NULL); - if (!ab3100_dir) - goto exit_no_debugfs; - ab3100_reg_file = debugfs_create_file("registers", - S_IRUGO, ab3100_dir, ab3100, - &ab3100_registers_fops); - if (!ab3100_reg_file) { - err = -ENOMEM; - goto exit_destroy_dir; - } + debugfs_create_file("registers", S_IRUGO, ab3100_dir, ab3100, + &ab3100_registers_fops); ab3100_get_priv.ab3100 = ab3100; ab3100_get_priv.mode = false; - ab3100_get_reg_file = debugfs_create_file("get_reg", - S_IWUSR, ab3100_dir, &ab3100_get_priv, - &ab3100_get_set_reg_fops); - if (!ab3100_get_reg_file) { - err = -ENOMEM; - goto exit_destroy_reg; - } + debugfs_create_file("get_reg", S_IWUSR, ab3100_dir, &ab3100_get_priv, + &ab3100_get_set_reg_fops); ab3100_set_priv.ab3100 = ab3100; ab3100_set_priv.mode = true; - ab3100_set_reg_file = debugfs_create_file("set_reg", - S_IWUSR, ab3100_dir, &ab3100_set_priv, - &ab3100_get_set_reg_fops); - if (!ab3100_set_reg_file) { - err = -ENOMEM; - goto exit_destroy_get_reg; - } - return; - - exit_destroy_get_reg: - debugfs_remove(ab3100_get_reg_file); - exit_destroy_reg: - debugfs_remove(ab3100_reg_file); - exit_destroy_dir: - debugfs_remove(ab3100_dir); - exit_no_debugfs: - return; + debugfs_create_file("set_reg", S_IWUSR, ab3100_dir, &ab3100_set_priv, + &ab3100_get_set_reg_fops); } #else static inline void ab3100_setup_debugfs(struct ab3100 *ab3100) diff --git a/drivers/mfd/ab3100-otp.c b/drivers/mfd/ab3100-otp.c index b3f8d359f409..c4751fb9bc22 100644 --- a/drivers/mfd/ab3100-otp.c +++ b/drivers/mfd/ab3100-otp.c @@ -122,17 +122,11 @@ static const struct file_operations ab3100_otp_operations = { .release = single_release, }; -static int __init ab3100_otp_init_debugfs(struct device *dev, - struct ab3100_otp *otp) +static void __init ab3100_otp_init_debugfs(struct device *dev, + struct ab3100_otp *otp) { otp->debugfs = debugfs_create_file("ab3100_otp", S_IFREG | S_IRUGO, - NULL, otp, - &ab3100_otp_operations); - if (!otp->debugfs) { - dev_err(dev, "AB3100 debugfs OTP file registration failed!\n"); - return -ENOENT; - } - return 0; + NULL, otp, &ab3100_otp_operations); } static void __exit ab3100_otp_exit_debugfs(struct ab3100_otp *otp) @@ -141,10 +135,9 @@ static void __exit ab3100_otp_exit_debugfs(struct ab3100_otp *otp) } #else /* Compile this out if debugfs not selected */ -static inline int __init ab3100_otp_init_debugfs(struct device *dev, - struct ab3100_otp *otp) +static inline void __init ab3100_otp_init_debugfs(struct device *dev, + struct ab3100_otp *otp) { - return 0; } static inline void __exit ab3100_otp_exit_debugfs(struct ab3100_otp *otp) @@ -211,9 +204,7 @@ static int __init ab3100_otp_probe(struct platform_device *pdev) } /* debugfs entries */ - err = ab3100_otp_init_debugfs(&pdev->dev, otp); - if (err) - goto err; + ab3100_otp_init_debugfs(&pdev->dev, otp); return 0; From 64e8a9bacadb58e04ab517d8fb5735302b96db5f Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 6 Jul 2019 18:47:21 +0200 Subject: [PATCH 054/721] mfd: ab8500: No need to check return value of debugfs_create functions When calling debugfs functions, there is no need to ever check the return value. The function can work or not, but the code logic should never do something different based on this. Signed-off-by: Greg Kroah-Hartman Reviewed-by: Linus Walleij Signed-off-by: Lee Jones --- drivers/mfd/ab8500-debugfs.c | 322 +++++++++++------------------------ 1 file changed, 97 insertions(+), 225 deletions(-) diff --git a/drivers/mfd/ab8500-debugfs.c b/drivers/mfd/ab8500-debugfs.c index d24c6ecccb88..567a34b073dd 100644 --- a/drivers/mfd/ab8500-debugfs.c +++ b/drivers/mfd/ab8500-debugfs.c @@ -2644,12 +2644,10 @@ static const struct file_operations ab8500_hwreg_fops = { .owner = THIS_MODULE, }; -static struct dentry *ab8500_dir; -static struct dentry *ab8500_gpadc_dir; - static int ab8500_debug_probe(struct platform_device *plf) { - struct dentry *file; + struct dentry *ab8500_dir; + struct dentry *ab8500_gpadc_dir; struct ab8500 *ab8500; struct resource *res; @@ -2694,47 +2692,22 @@ static int ab8500_debug_probe(struct platform_device *plf) } ab8500_dir = debugfs_create_dir(AB8500_NAME_STRING, NULL); - if (!ab8500_dir) - goto err; ab8500_gpadc_dir = debugfs_create_dir(AB8500_ADC_NAME_STRING, ab8500_dir); - if (!ab8500_gpadc_dir) - goto err; - file = debugfs_create_file("all-bank-registers", S_IRUGO, ab8500_dir, - &plf->dev, &ab8500_bank_registers_fops); - if (!file) - goto err; - - file = debugfs_create_file("all-banks", S_IRUGO, ab8500_dir, - &plf->dev, &ab8500_all_banks_fops); - if (!file) - goto err; - - file = debugfs_create_file("register-bank", - (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_dir, &plf->dev, &ab8500_bank_fops); - if (!file) - goto err; - - file = debugfs_create_file("register-address", - (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_dir, &plf->dev, &ab8500_address_fops); - if (!file) - goto err; - - file = debugfs_create_file("register-value", - (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_dir, &plf->dev, &ab8500_val_fops); - if (!file) - goto err; - - file = debugfs_create_file("irq-subscribe", - (S_IRUGO | S_IWUSR | S_IWGRP), ab8500_dir, - &plf->dev, &ab8500_subscribe_fops); - if (!file) - goto err; + debugfs_create_file("all-bank-registers", S_IRUGO, ab8500_dir, + &plf->dev, &ab8500_bank_registers_fops); + debugfs_create_file("all-banks", S_IRUGO, ab8500_dir, + &plf->dev, &ab8500_all_banks_fops); + debugfs_create_file("register-bank", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_dir, &plf->dev, &ab8500_bank_fops); + debugfs_create_file("register-address", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_dir, &plf->dev, &ab8500_address_fops); + debugfs_create_file("register-value", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_dir, &plf->dev, &ab8500_val_fops); + debugfs_create_file("irq-subscribe", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_dir, &plf->dev, &ab8500_subscribe_fops); if (is_ab8500(ab8500)) { debug_ranges = ab8500_debug_ranges; @@ -2750,194 +2723,93 @@ static int ab8500_debug_probe(struct platform_device *plf) num_interrupt_lines = AB8540_NR_IRQS; } - file = debugfs_create_file("interrupts", (S_IRUGO), ab8500_dir, - &plf->dev, &ab8500_interrupts_fops); - if (!file) - goto err; - - file = debugfs_create_file("irq-unsubscribe", - (S_IRUGO | S_IWUSR | S_IWGRP), ab8500_dir, - &plf->dev, &ab8500_unsubscribe_fops); - if (!file) - goto err; - - file = debugfs_create_file("hwreg", (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_dir, &plf->dev, &ab8500_hwreg_fops); - if (!file) - goto err; - - file = debugfs_create_file("all-modem-registers", - (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_dir, &plf->dev, &ab8500_modem_fops); - if (!file) - goto err; - - file = debugfs_create_file("bat_ctrl", (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_gpadc_dir, &plf->dev, - &ab8500_gpadc_bat_ctrl_fops); - if (!file) - goto err; - - file = debugfs_create_file("btemp_ball", (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_gpadc_dir, - &plf->dev, &ab8500_gpadc_btemp_ball_fops); - if (!file) - goto err; - - file = debugfs_create_file("main_charger_v", - (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_gpadc_dir, &plf->dev, - &ab8500_gpadc_main_charger_v_fops); - if (!file) - goto err; - - file = debugfs_create_file("acc_detect1", - (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_gpadc_dir, &plf->dev, - &ab8500_gpadc_acc_detect1_fops); - if (!file) - goto err; - - file = debugfs_create_file("acc_detect2", - (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_gpadc_dir, &plf->dev, - &ab8500_gpadc_acc_detect2_fops); - if (!file) - goto err; - - file = debugfs_create_file("adc_aux1", (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_gpadc_dir, &plf->dev, - &ab8500_gpadc_aux1_fops); - if (!file) - goto err; - - file = debugfs_create_file("adc_aux2", (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_gpadc_dir, &plf->dev, - &ab8500_gpadc_aux2_fops); - if (!file) - goto err; - - file = debugfs_create_file("main_bat_v", (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_gpadc_dir, &plf->dev, - &ab8500_gpadc_main_bat_v_fops); - if (!file) - goto err; - - file = debugfs_create_file("vbus_v", (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_gpadc_dir, &plf->dev, - &ab8500_gpadc_vbus_v_fops); - if (!file) - goto err; - - file = debugfs_create_file("main_charger_c", - (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_gpadc_dir, &plf->dev, - &ab8500_gpadc_main_charger_c_fops); - if (!file) - goto err; - - file = debugfs_create_file("usb_charger_c", - (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_gpadc_dir, - &plf->dev, &ab8500_gpadc_usb_charger_c_fops); - if (!file) - goto err; - - file = debugfs_create_file("bk_bat_v", (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_gpadc_dir, &plf->dev, - &ab8500_gpadc_bk_bat_v_fops); - if (!file) - goto err; - - file = debugfs_create_file("die_temp", (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_gpadc_dir, &plf->dev, - &ab8500_gpadc_die_temp_fops); - if (!file) - goto err; - - file = debugfs_create_file("usb_id", (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_gpadc_dir, &plf->dev, - &ab8500_gpadc_usb_id_fops); - if (!file) - goto err; - + debugfs_create_file("interrupts", (S_IRUGO), ab8500_dir, &plf->dev, + &ab8500_interrupts_fops); + debugfs_create_file("irq-unsubscribe", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_dir, &plf->dev, &ab8500_unsubscribe_fops); + debugfs_create_file("hwreg", (S_IRUGO | S_IWUSR | S_IWGRP), ab8500_dir, + &plf->dev, &ab8500_hwreg_fops); + debugfs_create_file("all-modem-registers", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_dir, &plf->dev, &ab8500_modem_fops); + debugfs_create_file("bat_ctrl", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_gpadc_dir, &plf->dev, + &ab8500_gpadc_bat_ctrl_fops); + debugfs_create_file("btemp_ball", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_gpadc_dir, &plf->dev, + &ab8500_gpadc_btemp_ball_fops); + debugfs_create_file("main_charger_v", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_gpadc_dir, &plf->dev, + &ab8500_gpadc_main_charger_v_fops); + debugfs_create_file("acc_detect1", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_gpadc_dir, &plf->dev, + &ab8500_gpadc_acc_detect1_fops); + debugfs_create_file("acc_detect2", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_gpadc_dir, &plf->dev, + &ab8500_gpadc_acc_detect2_fops); + debugfs_create_file("adc_aux1", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_gpadc_dir, &plf->dev, + &ab8500_gpadc_aux1_fops); + debugfs_create_file("adc_aux2", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_gpadc_dir, &plf->dev, + &ab8500_gpadc_aux2_fops); + debugfs_create_file("main_bat_v", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_gpadc_dir, &plf->dev, + &ab8500_gpadc_main_bat_v_fops); + debugfs_create_file("vbus_v", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_gpadc_dir, &plf->dev, + &ab8500_gpadc_vbus_v_fops); + debugfs_create_file("main_charger_c", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_gpadc_dir, &plf->dev, + &ab8500_gpadc_main_charger_c_fops); + debugfs_create_file("usb_charger_c", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_gpadc_dir, &plf->dev, + &ab8500_gpadc_usb_charger_c_fops); + debugfs_create_file("bk_bat_v", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_gpadc_dir, &plf->dev, + &ab8500_gpadc_bk_bat_v_fops); + debugfs_create_file("die_temp", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_gpadc_dir, &plf->dev, + &ab8500_gpadc_die_temp_fops); + debugfs_create_file("usb_id", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_gpadc_dir, &plf->dev, + &ab8500_gpadc_usb_id_fops); if (is_ab8540(ab8500)) { - file = debugfs_create_file("xtal_temp", - (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_gpadc_dir, &plf->dev, - &ab8540_gpadc_xtal_temp_fops); - if (!file) - goto err; - file = debugfs_create_file("vbattruemeas", - (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_gpadc_dir, &plf->dev, - &ab8540_gpadc_vbat_true_meas_fops); - if (!file) - goto err; - file = debugfs_create_file("batctrl_and_ibat", - (S_IRUGO | S_IWUGO), - ab8500_gpadc_dir, - &plf->dev, - &ab8540_gpadc_bat_ctrl_and_ibat_fops); - if (!file) - goto err; - file = debugfs_create_file("vbatmeas_and_ibat", - (S_IRUGO | S_IWUGO), - ab8500_gpadc_dir, &plf->dev, - &ab8540_gpadc_vbat_meas_and_ibat_fops); - if (!file) - goto err; - file = debugfs_create_file("vbattruemeas_and_ibat", - (S_IRUGO | S_IWUGO), - ab8500_gpadc_dir, - &plf->dev, - &ab8540_gpadc_vbat_true_meas_and_ibat_fops); - if (!file) - goto err; - file = debugfs_create_file("battemp_and_ibat", - (S_IRUGO | S_IWUGO), - ab8500_gpadc_dir, - &plf->dev, &ab8540_gpadc_bat_temp_and_ibat_fops); - if (!file) - goto err; - file = debugfs_create_file("otp_calib", - (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_gpadc_dir, - &plf->dev, &ab8540_gpadc_otp_calib_fops); - if (!file) - goto err; + debugfs_create_file("xtal_temp", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_gpadc_dir, &plf->dev, + &ab8540_gpadc_xtal_temp_fops); + debugfs_create_file("vbattruemeas", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_gpadc_dir, &plf->dev, + &ab8540_gpadc_vbat_true_meas_fops); + debugfs_create_file("batctrl_and_ibat", (S_IRUGO | S_IWUGO), + ab8500_gpadc_dir, &plf->dev, + &ab8540_gpadc_bat_ctrl_and_ibat_fops); + debugfs_create_file("vbatmeas_and_ibat", (S_IRUGO | S_IWUGO), + ab8500_gpadc_dir, &plf->dev, + &ab8540_gpadc_vbat_meas_and_ibat_fops); + debugfs_create_file("vbattruemeas_and_ibat", (S_IRUGO | S_IWUGO), + ab8500_gpadc_dir, &plf->dev, + &ab8540_gpadc_vbat_true_meas_and_ibat_fops); + debugfs_create_file("battemp_and_ibat", (S_IRUGO | S_IWUGO), + ab8500_gpadc_dir, &plf->dev, + &ab8540_gpadc_bat_temp_and_ibat_fops); + debugfs_create_file("otp_calib", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_gpadc_dir, &plf->dev, + &ab8540_gpadc_otp_calib_fops); } - file = debugfs_create_file("avg_sample", (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_gpadc_dir, &plf->dev, - &ab8500_gpadc_avg_sample_fops); - if (!file) - goto err; - - file = debugfs_create_file("trig_edge", (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_gpadc_dir, &plf->dev, - &ab8500_gpadc_trig_edge_fops); - if (!file) - goto err; - - file = debugfs_create_file("trig_timer", (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_gpadc_dir, &plf->dev, - &ab8500_gpadc_trig_timer_fops); - if (!file) - goto err; - - file = debugfs_create_file("conv_type", (S_IRUGO | S_IWUSR | S_IWGRP), - ab8500_gpadc_dir, &plf->dev, - &ab8500_gpadc_conv_type_fops); - if (!file) - goto err; + debugfs_create_file("avg_sample", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_gpadc_dir, &plf->dev, + &ab8500_gpadc_avg_sample_fops); + debugfs_create_file("trig_edge", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_gpadc_dir, &plf->dev, + &ab8500_gpadc_trig_edge_fops); + debugfs_create_file("trig_timer", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_gpadc_dir, &plf->dev, + &ab8500_gpadc_trig_timer_fops); + debugfs_create_file("conv_type", (S_IRUGO | S_IWUSR | S_IWGRP), + ab8500_gpadc_dir, &plf->dev, + &ab8500_gpadc_conv_type_fops); return 0; - -err: - debugfs_remove_recursive(ab8500_dir); - dev_err(&plf->dev, "failed to create debugfs entries.\n"); - - return -ENOMEM; } static struct platform_driver ab8500_debug_driver = { From cbfe612d471fc3eda048a6a70c5c8f5ee34026a4 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 6 Jul 2019 18:47:22 +0200 Subject: [PATCH 055/721] mfd: aat2870: No need to check return value of debugfs_create functions When calling debugfs functions, there is no need to ever check the return value. The function can work or not, but the code logic should never do something different based on this. Signed-off-by: Greg Kroah-Hartman Signed-off-by: Lee Jones --- drivers/mfd/aat2870-core.c | 13 ++----------- include/linux/mfd/aat2870.h | 1 - 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/drivers/mfd/aat2870-core.c b/drivers/mfd/aat2870-core.c index 9f58cb2d3789..78ee4b28fca2 100644 --- a/drivers/mfd/aat2870-core.c +++ b/drivers/mfd/aat2870-core.c @@ -321,18 +321,9 @@ static const struct file_operations aat2870_reg_fops = { static void aat2870_init_debugfs(struct aat2870_data *aat2870) { aat2870->dentry_root = debugfs_create_dir("aat2870", NULL); - if (!aat2870->dentry_root) { - dev_warn(aat2870->dev, - "Failed to create debugfs root directory\n"); - return; - } - aat2870->dentry_reg = debugfs_create_file("regs", 0644, - aat2870->dentry_root, - aat2870, &aat2870_reg_fops); - if (!aat2870->dentry_reg) - dev_warn(aat2870->dev, - "Failed to create debugfs register file\n"); + debugfs_create_file("regs", 0644, aat2870->dentry_root, aat2870, + &aat2870_reg_fops); } #else diff --git a/include/linux/mfd/aat2870.h b/include/linux/mfd/aat2870.h index af7267c480ee..2445842d482d 100644 --- a/include/linux/mfd/aat2870.h +++ b/include/linux/mfd/aat2870.h @@ -136,7 +136,6 @@ struct aat2870_data { /* for debugfs */ struct dentry *dentry_root; - struct dentry *dentry_reg; }; struct aat2870_subdev_info { From ed075453d527a2635f245e0a91b92252ad3d1c8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonathan=20Neusch=C3=A4fer?= Date: Tue, 29 Jan 2019 14:59:17 +0100 Subject: [PATCH 056/721] dt-bindings: mfd: rn5t618: Document optional property system-power-controller MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The RN5T618 family of PMICs can be used as system management controllers, in which case they handle poweroff and restart. Document this capability by referring to the corresponding generic DT binding. Signed-off-by: Jonathan Neuschäfer Signed-off-by: Lee Jones --- Documentation/devicetree/bindings/mfd/rn5t618.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Documentation/devicetree/bindings/mfd/rn5t618.txt b/Documentation/devicetree/bindings/mfd/rn5t618.txt index 65c23263cc54..b74e5e94d1cb 100644 --- a/Documentation/devicetree/bindings/mfd/rn5t618.txt +++ b/Documentation/devicetree/bindings/mfd/rn5t618.txt @@ -14,6 +14,10 @@ Required properties: "ricoh,rc5t619" - reg: the I2C slave address of the device +Optional properties: + - system-power-controller: + See Documentation/devicetree/bindings/power/power-controller.txt + Sub-nodes: - regulators: the node is required if the regulator functionality is needed. The valid regulator names are: DCDC1, DCDC2, DCDC3, DCDC4 @@ -28,6 +32,7 @@ Example: pmic@32 { compatible = "ricoh,rn5t618"; reg = <0x32>; + system-power-controller; regulators { DCDC1 { From 624e3fceb533b33927535aaec713bd91ec80e99b Mon Sep 17 00:00:00 2001 From: Yicheng Li Date: Mon, 8 Jul 2019 11:15:37 -0700 Subject: [PATCH 057/721] mfd: cros_ec: Update cros_ec_commands.h Update cros_ec_commands.h to match the fingerprint MCU section in the current ec_commands.h Signed-off-by: Yicheng Li Reviewed-by: Gwendal Grignou Reviewed-by: Enric Balletbo i Serra Signed-off-by: Lee Jones --- include/linux/mfd/cros_ec_commands.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/include/linux/mfd/cros_ec_commands.h b/include/linux/mfd/cros_ec_commands.h index 7ccb8757b79d..98415686cbfa 100644 --- a/include/linux/mfd/cros_ec_commands.h +++ b/include/linux/mfd/cros_ec_commands.h @@ -5513,6 +5513,18 @@ struct ec_params_fp_seed { uint8_t seed[FP_CONTEXT_TPM_BYTES]; } __ec_align4; +#define EC_CMD_FP_ENC_STATUS 0x0409 + +/* FP TPM seed has been set or not */ +#define FP_ENC_STATUS_SEED_SET BIT(0) + +struct ec_response_fp_encryption_status { + /* Used bits in encryption engine status */ + uint32_t valid_flags; + /* Encryption engine status */ + uint32_t status; +} __ec_align4; + /*****************************************************************************/ /* Touchpad MCU commands: range 0x0500-0x05FF */ From a604e5b29ce6cf5ec2950df7a5562ac2dd8d70e2 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 22 Jul 2019 19:26:35 +0200 Subject: [PATCH 058/721] mfd: tps80031: Convert to devm_i2c_new_dummy_device Move from i2c_new_dummy() to devm_i2c_new_dummy_device(). So, we now get an ERRPTR which we use in error handling and we can skip removal of the created devices. Signed-off-by: Wolfram Sang Signed-off-by: Lee Jones --- drivers/mfd/tps80031.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/drivers/mfd/tps80031.c b/drivers/mfd/tps80031.c index 865257ade8ac..907452b86e32 100644 --- a/drivers/mfd/tps80031.c +++ b/drivers/mfd/tps80031.c @@ -437,12 +437,11 @@ static int tps80031_probe(struct i2c_client *client, if (tps80031_slave_address[i] == client->addr) tps80031->clients[i] = client; else - tps80031->clients[i] = i2c_new_dummy(client->adapter, - tps80031_slave_address[i]); - if (!tps80031->clients[i]) { + tps80031->clients[i] = devm_i2c_new_dummy_device(&client->dev, + client->adapter, tps80031_slave_address[i]); + if (IS_ERR(tps80031->clients[i])) { dev_err(&client->dev, "can't attach client %d\n", i); - ret = -ENOMEM; - goto fail_client_reg; + return PTR_ERR(tps80031->clients[i]); } i2c_set_clientdata(tps80031->clients[i], tps80031); @@ -452,7 +451,7 @@ static int tps80031_probe(struct i2c_client *client, ret = PTR_ERR(tps80031->regmap[i]); dev_err(&client->dev, "regmap %d init failed, err %d\n", i, ret); - goto fail_client_reg; + return ret; } } @@ -461,7 +460,7 @@ static int tps80031_probe(struct i2c_client *client, if (ret < 0) { dev_err(&client->dev, "Silicon version number read failed: %d\n", ret); - goto fail_client_reg; + return ret; } ret = tps80031_read(&client->dev, TPS80031_SLAVE_ID3, @@ -469,7 +468,7 @@ static int tps80031_probe(struct i2c_client *client, if (ret < 0) { dev_err(&client->dev, "Silicon eeprom version read failed: %d\n", ret); - goto fail_client_reg; + return ret; } dev_info(&client->dev, "ES version 0x%02x and EPROM version 0x%02x\n", @@ -482,7 +481,7 @@ static int tps80031_probe(struct i2c_client *client, ret = tps80031_irq_init(tps80031, client->irq, pdata->irq_base); if (ret) { dev_err(&client->dev, "IRQ init failed: %d\n", ret); - goto fail_client_reg; + return ret; } tps80031_pupd_init(tps80031, pdata); @@ -506,12 +505,6 @@ static int tps80031_probe(struct i2c_client *client, fail_mfd_add: regmap_del_irq_chip(client->irq, tps80031->irq_data); - -fail_client_reg: - for (i = 0; i < TPS80031_NUM_SLAVES; i++) { - if (tps80031->clients[i] && (tps80031->clients[i] != client)) - i2c_unregister_device(tps80031->clients[i]); - } return ret; } From e406b832d89d63b9eb525fa8fea18eb7a1598c60 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 22 Jul 2019 20:26:28 +0200 Subject: [PATCH 059/721] mfd: da9063: Remove now unused platform_data All preparational patches have been applied, we can now remove the include file for platform_data. Yiha! Signed-off-by: Wolfram Sang Signed-off-by: Lee Jones --- include/Kbuild | 1 - include/linux/mfd/da9063/pdata.h | 60 -------------------------------- 2 files changed, 61 deletions(-) delete mode 100644 include/linux/mfd/da9063/pdata.h diff --git a/include/Kbuild b/include/Kbuild index c38f0d46b267..68aa094fe86e 100644 --- a/include/Kbuild +++ b/include/Kbuild @@ -313,7 +313,6 @@ header-test- += linux/mfd/as3722.h header-test- += linux/mfd/cros_ec_commands.h header-test- += linux/mfd/da903x.h header-test- += linux/mfd/da9055/pdata.h -header-test- += linux/mfd/da9063/pdata.h header-test- += linux/mfd/db8500-prcmu.h header-test- += linux/mfd/dbx500-prcmu.h header-test- += linux/mfd/dln2.h diff --git a/include/linux/mfd/da9063/pdata.h b/include/linux/mfd/da9063/pdata.h deleted file mode 100644 index 085edbf7601b..000000000000 --- a/include/linux/mfd/da9063/pdata.h +++ /dev/null @@ -1,60 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Platform configuration options for DA9063 - * - * Copyright 2012 Dialog Semiconductor Ltd. - * - * Author: Michal Hajduk, Dialog Semiconductor - * Author: Krystian Garbaciak, Dialog Semiconductor - */ - -#ifndef __MFD_DA9063_PDATA_H__ -#define __MFD_DA9063_PDATA_H__ - -/* - * RGB LED configuration - */ -/* LED IDs for flags in struct led_info. */ -enum { - DA9063_GPIO11_LED, - DA9063_GPIO14_LED, - DA9063_GPIO15_LED, - - DA9063_LED_NUM -}; -#define DA9063_LED_ID_MASK 0x3 - -/* LED polarity for flags in struct led_info. */ -#define DA9063_LED_HIGH_LEVEL_ACTIVE 0x0 -#define DA9063_LED_LOW_LEVEL_ACTIVE 0x4 - - -/* - * General PMIC configuration - */ -/* HWMON ADC channels configuration */ -#define DA9063_FLG_FORCE_IN0_MANUAL_MODE 0x0010 -#define DA9063_FLG_FORCE_IN0_AUTO_MODE 0x0020 -#define DA9063_FLG_FORCE_IN1_MANUAL_MODE 0x0040 -#define DA9063_FLG_FORCE_IN1_AUTO_MODE 0x0080 -#define DA9063_FLG_FORCE_IN2_MANUAL_MODE 0x0100 -#define DA9063_FLG_FORCE_IN2_AUTO_MODE 0x0200 -#define DA9063_FLG_FORCE_IN3_MANUAL_MODE 0x0400 -#define DA9063_FLG_FORCE_IN3_AUTO_MODE 0x0800 - -/* Disable register caching. */ -#define DA9063_FLG_NO_CACHE 0x0008 - -struct da9063; - -/* DA9063 platform data */ -struct da9063_pdata { - int (*init)(struct da9063 *da9063); - int irq_base; - bool key_power; - unsigned flags; - struct da9063_regulators_pdata *regulators_pdata; - struct led_platform_data *leds_pdata; -}; - -#endif /* __MFD_DA9063_PDATA_H__ */ From aee36174b22d57775b3445399491777843f294ed Mon Sep 17 00:00:00 2001 From: Chuhong Yuan Date: Tue, 23 Jul 2019 19:51:03 +0800 Subject: [PATCH 060/721] mfd: timberdale: Use dev_get_drvdata Instead of using to_pci_dev + pci_get_drvdata, use dev_get_drvdata to make code simpler. Signed-off-by: Chuhong Yuan Signed-off-by: Lee Jones --- drivers/mfd/timberdale.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/mfd/timberdale.c b/drivers/mfd/timberdale.c index 60c122e9b39f..faecbca6dba3 100644 --- a/drivers/mfd/timberdale.c +++ b/drivers/mfd/timberdale.c @@ -626,8 +626,7 @@ static const struct mfd_cell timberdale_cells_bar2[] = { static ssize_t show_fw_ver(struct device *dev, struct device_attribute *attr, char *buf) { - struct pci_dev *pdev = to_pci_dev(dev); - struct timberdale_device *priv = pci_get_drvdata(pdev); + struct timberdale_device *priv = dev_get_drvdata(dev); return sprintf(buf, "%d.%d.%d\n", priv->fw.major, priv->fw.minor, priv->fw.config); From 83215897356fb54fdb7b9e399d5256421d4cfe4a Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 22 Jul 2019 19:26:08 +0200 Subject: [PATCH 061/721] mfd: 88pm800: Convert to i2c_new_dummy_device Move from i2c_new_dummy() to i2c_new_dummy_device(), so we now get an ERRPTR which we use in error handling. Signed-off-by: Wolfram Sang Signed-off-by: Lee Jones --- drivers/mfd/88pm800.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/mfd/88pm800.c b/drivers/mfd/88pm800.c index f2d9fb4c4e8e..4e8d0d6b9b5c 100644 --- a/drivers/mfd/88pm800.c +++ b/drivers/mfd/88pm800.c @@ -425,10 +425,10 @@ static int pm800_pages_init(struct pm80x_chip *chip) return -ENODEV; /* PM800 block power page */ - subchip->power_page = i2c_new_dummy(client->adapter, + subchip->power_page = i2c_new_dummy_device(client->adapter, subchip->power_page_addr); - if (subchip->power_page == NULL) { - ret = -ENODEV; + if (IS_ERR(subchip->power_page)) { + ret = PTR_ERR(subchip->power_page); goto out; } @@ -444,10 +444,10 @@ static int pm800_pages_init(struct pm80x_chip *chip) i2c_set_clientdata(subchip->power_page, chip); /* PM800 block GPADC */ - subchip->gpadc_page = i2c_new_dummy(client->adapter, + subchip->gpadc_page = i2c_new_dummy_device(client->adapter, subchip->gpadc_page_addr); - if (subchip->gpadc_page == NULL) { - ret = -ENODEV; + if (IS_ERR(subchip->gpadc_page)) { + ret = PTR_ERR(subchip->gpadc_page); goto out; } From 9520b835ffda85cdc1c84b1946c4dd3b6d00184c Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 22 Jul 2019 19:26:09 +0200 Subject: [PATCH 062/721] mfd: 88pm860x-core: Convert to i2c_new_dummy_device Move from i2c_new_dummy() to i2c_new_dummy_device(), so we now get an ERRPTR which we use in error handling. Signed-off-by: Wolfram Sang Signed-off-by: Lee Jones --- drivers/mfd/88pm860x-core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/mfd/88pm860x-core.c b/drivers/mfd/88pm860x-core.c index 9e0bd135730f..c9bae71f643a 100644 --- a/drivers/mfd/88pm860x-core.c +++ b/drivers/mfd/88pm860x-core.c @@ -1178,12 +1178,12 @@ static int pm860x_probe(struct i2c_client *client) */ if (pdata->companion_addr && (pdata->companion_addr != client->addr)) { chip->companion_addr = pdata->companion_addr; - chip->companion = i2c_new_dummy(chip->client->adapter, + chip->companion = i2c_new_dummy_device(chip->client->adapter, chip->companion_addr); - if (!chip->companion) { + if (IS_ERR(chip->companion)) { dev_err(&client->dev, "Failed to allocate I2C companion device\n"); - return -ENODEV; + return PTR_ERR(chip->companion); } chip->regmap_companion = regmap_init_i2c(chip->companion, &pm860x_regmap_config); From f5d5d193c5f777fb9c03298992668a34c575abdd Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 22 Jul 2019 19:26:10 +0200 Subject: [PATCH 063/721] mfd: ab3100-core: Convert to i2c_new_dummy_device Move from i2c_new_dummy() to i2c_new_dummy_device(), so we now get an ERRPTR which we use in error handling. Signed-off-by: Wolfram Sang Reviewed-by: Linus Walleij Signed-off-by: Lee Jones --- drivers/mfd/ab3100-core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/mfd/ab3100-core.c b/drivers/mfd/ab3100-core.c index 9f3dbc31d3e9..57723f116bb5 100644 --- a/drivers/mfd/ab3100-core.c +++ b/drivers/mfd/ab3100-core.c @@ -865,10 +865,10 @@ static int ab3100_probe(struct i2c_client *client, &ab3100->chip_name[0]); /* Attach a second dummy i2c_client to the test register address */ - ab3100->testreg_client = i2c_new_dummy(client->adapter, + ab3100->testreg_client = i2c_new_dummy_device(client->adapter, client->addr + 1); - if (!ab3100->testreg_client) { - err = -ENOMEM; + if (IS_ERR(ab3100->testreg_client)) { + err = PTR_ERR(ab3100->testreg_client); goto exit_no_testreg_client; } From 98f0c05f409e4fbf186b11dcb2baed84a253c224 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 22 Jul 2019 19:26:11 +0200 Subject: [PATCH 064/721] mfd: bcm590xx: Convert to i2c_new_dummy_device Move from i2c_new_dummy() to i2c_new_dummy_device(), so we now get an ERRPTR which we use in error handling. Signed-off-by: Wolfram Sang Signed-off-by: Lee Jones --- drivers/mfd/bcm590xx.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/mfd/bcm590xx.c b/drivers/mfd/bcm590xx.c index 1aeb5e498d91..bfac5dc091ca 100644 --- a/drivers/mfd/bcm590xx.c +++ b/drivers/mfd/bcm590xx.c @@ -61,11 +61,11 @@ static int bcm590xx_i2c_probe(struct i2c_client *i2c_pri, } /* Secondary I2C slave address is the base address with A(2) asserted */ - bcm590xx->i2c_sec = i2c_new_dummy(i2c_pri->adapter, + bcm590xx->i2c_sec = i2c_new_dummy_device(i2c_pri->adapter, i2c_pri->addr | BIT(2)); - if (!bcm590xx->i2c_sec) { + if (IS_ERR(bcm590xx->i2c_sec)) { dev_err(&i2c_pri->dev, "failed to add secondary I2C device\n"); - return -ENODEV; + return PTR_ERR(bcm590xx->i2c_sec); } i2c_set_clientdata(bcm590xx->i2c_sec, bcm590xx); From e310ee86f9efb92104ac84c178fd11bdf83216eb Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 22 Jul 2019 19:26:12 +0200 Subject: [PATCH 065/721] mfd: da9150-core: Convert to i2c_new_dummy_device Move from i2c_new_dummy() to i2c_new_dummy_device(), so we now get an ERRPTR which we use in error handling. Signed-off-by: Wolfram Sang Signed-off-by: Lee Jones --- drivers/mfd/da9150-core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/mfd/da9150-core.c b/drivers/mfd/da9150-core.c index 13033068721a..7f0aa1e8db96 100644 --- a/drivers/mfd/da9150-core.c +++ b/drivers/mfd/da9150-core.c @@ -420,10 +420,10 @@ static int da9150_probe(struct i2c_client *client, qif_addr = da9150_reg_read(da9150, DA9150_CORE2WIRE_CTRL_A); qif_addr = (qif_addr & DA9150_CORE_BASE_ADDR_MASK) >> 1; qif_addr |= DA9150_QIF_I2C_ADDR_LSB; - da9150->core_qif = i2c_new_dummy(client->adapter, qif_addr); - if (!da9150->core_qif) { + da9150->core_qif = i2c_new_dummy_device(client->adapter, qif_addr); + if (IS_ERR(da9150->core_qif)) { dev_err(da9150->dev, "Failed to attach QIF client\n"); - return -ENODEV; + return PTR_ERR(da9150->core_qif); } i2c_set_clientdata(da9150->core_qif, da9150); From f6ae8129631f6a3e072ec8d3ab60a65aa825f5f0 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 22 Jul 2019 19:26:13 +0200 Subject: [PATCH 066/721] mfd: max14577: Convert to i2c_new_dummy_device Move from i2c_new_dummy() to i2c_new_dummy_device(), so we now get an ERRPTR which we use in error handling. Signed-off-by: Wolfram Sang Reviewed-by: Krzysztof Kozlowski Signed-off-by: Lee Jones --- drivers/mfd/max14577.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/mfd/max14577.c b/drivers/mfd/max14577.c index ebb13d5de530..fd8864cafd25 100644 --- a/drivers/mfd/max14577.c +++ b/drivers/mfd/max14577.c @@ -297,11 +297,11 @@ static int max77836_init(struct max14577 *max14577) int ret; u8 intsrc_mask; - max14577->i2c_pmic = i2c_new_dummy(max14577->i2c->adapter, + max14577->i2c_pmic = i2c_new_dummy_device(max14577->i2c->adapter, I2C_ADDR_PMIC); - if (!max14577->i2c_pmic) { + if (IS_ERR(max14577->i2c_pmic)) { dev_err(max14577->dev, "Failed to register PMIC I2C device\n"); - return -ENODEV; + return PTR_ERR(max14577->i2c_pmic); } i2c_set_clientdata(max14577->i2c_pmic, max14577); From ad28edcb87337a1c9e52cae4694eb419883b16a9 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 22 Jul 2019 19:26:14 +0200 Subject: [PATCH 067/721] mfd: max77693: Convert to i2c_new_dummy_device Move from i2c_new_dummy() to i2c_new_dummy_device(), so we now get an ERRPTR which we use in error handling. Signed-off-by: Wolfram Sang Reviewed-by: Krzysztof Kozlowski Signed-off-by: Lee Jones --- drivers/mfd/max77693.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/mfd/max77693.c b/drivers/mfd/max77693.c index 901d99d65924..596ed85cab3b 100644 --- a/drivers/mfd/max77693.c +++ b/drivers/mfd/max77693.c @@ -183,17 +183,17 @@ static int max77693_i2c_probe(struct i2c_client *i2c, } else dev_info(max77693->dev, "device ID: 0x%x\n", reg_data); - max77693->i2c_muic = i2c_new_dummy(i2c->adapter, I2C_ADDR_MUIC); - if (!max77693->i2c_muic) { + max77693->i2c_muic = i2c_new_dummy_device(i2c->adapter, I2C_ADDR_MUIC); + if (IS_ERR(max77693->i2c_muic)) { dev_err(max77693->dev, "Failed to allocate I2C device for MUIC\n"); - return -ENODEV; + return PTR_ERR(max77693->i2c_muic); } i2c_set_clientdata(max77693->i2c_muic, max77693); - max77693->i2c_haptic = i2c_new_dummy(i2c->adapter, I2C_ADDR_HAPTIC); - if (!max77693->i2c_haptic) { + max77693->i2c_haptic = i2c_new_dummy_device(i2c->adapter, I2C_ADDR_HAPTIC); + if (IS_ERR(max77693->i2c_haptic)) { dev_err(max77693->dev, "Failed to allocate I2C device for Haptic\n"); - ret = -ENODEV; + ret = PTR_ERR(max77693->i2c_haptic); goto err_i2c_haptic; } i2c_set_clientdata(max77693->i2c_haptic, max77693); From 0005a9e1bab7d2d46538d3db16ef1fde3433c710 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 22 Jul 2019 19:26:15 +0200 Subject: [PATCH 068/721] mfd: max77843: Convert to i2c_new_dummy_device Move from i2c_new_dummy() to i2c_new_dummy_device(), so we now get an ERRPTR which we use in error handling. Signed-off-by: Wolfram Sang Signed-off-by: Lee Jones --- drivers/mfd/max77843.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/mfd/max77843.c b/drivers/mfd/max77843.c index 25cbb2242b26..209ee24d9ce1 100644 --- a/drivers/mfd/max77843.c +++ b/drivers/mfd/max77843.c @@ -70,11 +70,11 @@ static int max77843_chg_init(struct max77693_dev *max77843) { int ret; - max77843->i2c_chg = i2c_new_dummy(max77843->i2c->adapter, I2C_ADDR_CHG); - if (!max77843->i2c_chg) { + max77843->i2c_chg = i2c_new_dummy_device(max77843->i2c->adapter, I2C_ADDR_CHG); + if (IS_ERR(max77843->i2c_chg)) { dev_err(&max77843->i2c->dev, "Cannot allocate I2C device for Charger\n"); - return -ENODEV; + return PTR_ERR(max77843->i2c_chg); } i2c_set_clientdata(max77843->i2c_chg, max77843); From b8afcd54db8aae362095c8b490f6240520c4e9a4 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 22 Jul 2019 19:26:16 +0200 Subject: [PATCH 069/721] mfd: max8907: Convert to i2c_new_dummy_device Move from i2c_new_dummy() to i2c_new_dummy_device(), so we now get an ERRPTR which we use in error handling. Signed-off-by: Wolfram Sang Signed-off-by: Lee Jones --- drivers/mfd/max8907.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/mfd/max8907.c b/drivers/mfd/max8907.c index cc01f706cb32..d44baafd9d14 100644 --- a/drivers/mfd/max8907.c +++ b/drivers/mfd/max8907.c @@ -214,9 +214,9 @@ static int max8907_i2c_probe(struct i2c_client *i2c, goto err_regmap_gen; } - max8907->i2c_rtc = i2c_new_dummy(i2c->adapter, MAX8907_RTC_I2C_ADDR); - if (!max8907->i2c_rtc) { - ret = -ENOMEM; + max8907->i2c_rtc = i2c_new_dummy_device(i2c->adapter, MAX8907_RTC_I2C_ADDR); + if (IS_ERR(max8907->i2c_rtc)) { + ret = PTR_ERR(max8907->i2c_rtc); goto err_dummy_rtc; } i2c_set_clientdata(max8907->i2c_rtc, max8907); From ddbf6ffeb63cf06cfc9707a550ece773bbd2a925 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 22 Jul 2019 19:26:17 +0200 Subject: [PATCH 070/721] mfd: max8925-i2c: Convert to i2c_new_dummy_device Move from i2c_new_dummy() to i2c_new_dummy_device(), so we now get an ERRPTR which we use in error handling. Signed-off-by: Wolfram Sang Signed-off-by: Lee Jones --- drivers/mfd/max8925-i2c.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/mfd/max8925-i2c.c b/drivers/mfd/max8925-i2c.c index 20bb19b71109..114e905bef25 100644 --- a/drivers/mfd/max8925-i2c.c +++ b/drivers/mfd/max8925-i2c.c @@ -176,18 +176,18 @@ static int max8925_probe(struct i2c_client *client, dev_set_drvdata(chip->dev, chip); mutex_init(&chip->io_lock); - chip->rtc = i2c_new_dummy(chip->i2c->adapter, RTC_I2C_ADDR); - if (!chip->rtc) { + chip->rtc = i2c_new_dummy_device(chip->i2c->adapter, RTC_I2C_ADDR); + if (IS_ERR(chip->rtc)) { dev_err(chip->dev, "Failed to allocate I2C device for RTC\n"); - return -ENODEV; + return PTR_ERR(chip->rtc); } i2c_set_clientdata(chip->rtc, chip); - chip->adc = i2c_new_dummy(chip->i2c->adapter, ADC_I2C_ADDR); - if (!chip->adc) { + chip->adc = i2c_new_dummy_device(chip->i2c->adapter, ADC_I2C_ADDR); + if (IS_ERR(chip->adc)) { dev_err(chip->dev, "Failed to allocate I2C device for ADC\n"); i2c_unregister_device(chip->rtc); - return -ENODEV; + return PTR_ERR(chip->adc); } i2c_set_clientdata(chip->adc, chip); From 4e32bff681fb4a7eb552af7187735a68c5afeab1 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 22 Jul 2019 19:26:18 +0200 Subject: [PATCH 071/721] mfd: max8997: Convert to i2c_new_dummy_device Move from i2c_new_dummy() to i2c_new_dummy_device(), so we now get an ERRPTR which we use in error handling. Signed-off-by: Wolfram Sang Signed-off-by: Lee Jones --- drivers/mfd/max8997.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/mfd/max8997.c b/drivers/mfd/max8997.c index 8c06c09e36d1..68d8f2b95287 100644 --- a/drivers/mfd/max8997.c +++ b/drivers/mfd/max8997.c @@ -185,25 +185,25 @@ static int max8997_i2c_probe(struct i2c_client *i2c, mutex_init(&max8997->iolock); - max8997->rtc = i2c_new_dummy(i2c->adapter, I2C_ADDR_RTC); - if (!max8997->rtc) { + max8997->rtc = i2c_new_dummy_device(i2c->adapter, I2C_ADDR_RTC); + if (IS_ERR(max8997->rtc)) { dev_err(max8997->dev, "Failed to allocate I2C device for RTC\n"); - return -ENODEV; + return PTR_ERR(max8997->rtc); } i2c_set_clientdata(max8997->rtc, max8997); - max8997->haptic = i2c_new_dummy(i2c->adapter, I2C_ADDR_HAPTIC); - if (!max8997->haptic) { + max8997->haptic = i2c_new_dummy_device(i2c->adapter, I2C_ADDR_HAPTIC); + if (IS_ERR(max8997->haptic)) { dev_err(max8997->dev, "Failed to allocate I2C device for Haptic\n"); - ret = -ENODEV; + ret = PTR_ERR(max8997->haptic); goto err_i2c_haptic; } i2c_set_clientdata(max8997->haptic, max8997); - max8997->muic = i2c_new_dummy(i2c->adapter, I2C_ADDR_MUIC); - if (!max8997->muic) { + max8997->muic = i2c_new_dummy_device(i2c->adapter, I2C_ADDR_MUIC); + if (IS_ERR(max8997->muic)) { dev_err(max8997->dev, "Failed to allocate I2C device for MUIC\n"); - ret = -ENODEV; + ret = PTR_ERR(max8997->muic); goto err_i2c_muic; } i2c_set_clientdata(max8997->muic, max8997); From 7a99c8f3310b13c87ec87114e6fa3e915005b187 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 22 Jul 2019 19:26:19 +0200 Subject: [PATCH 072/721] mfd: max8998: Convert to i2c_new_dummy_device Move from i2c_new_dummy() to i2c_new_dummy_device(), so we now get an ERRPTR which we use in error handling. Signed-off-by: Wolfram Sang Signed-off-by: Lee Jones --- drivers/mfd/max8998.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/mfd/max8998.c b/drivers/mfd/max8998.c index 56409df120f8..785f8e9841b7 100644 --- a/drivers/mfd/max8998.c +++ b/drivers/mfd/max8998.c @@ -195,10 +195,10 @@ static int max8998_i2c_probe(struct i2c_client *i2c, } mutex_init(&max8998->iolock); - max8998->rtc = i2c_new_dummy(i2c->adapter, RTC_I2C_ADDR); - if (!max8998->rtc) { + max8998->rtc = i2c_new_dummy_device(i2c->adapter, RTC_I2C_ADDR); + if (IS_ERR(max8998->rtc)) { dev_err(&i2c->dev, "Failed to allocate I2C device for RTC\n"); - return -ENODEV; + return PTR_ERR(max8998->rtc); } i2c_set_clientdata(max8998->rtc, max8998); From ad9fc1f4229ed390590c3ea5be23addf84d54672 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 22 Jul 2019 19:26:20 +0200 Subject: [PATCH 073/721] mfd: palmas: Convert to i2c_new_dummy_device Move from i2c_new_dummy() to i2c_new_dummy_device(), so we now get an ERRPTR which we use in error handling. Signed-off-by: Wolfram Sang Signed-off-by: Lee Jones --- drivers/mfd/palmas.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/mfd/palmas.c b/drivers/mfd/palmas.c index 6818ff34837c..f5b3fa973b13 100644 --- a/drivers/mfd/palmas.c +++ b/drivers/mfd/palmas.c @@ -549,12 +549,12 @@ static int palmas_i2c_probe(struct i2c_client *i2c, palmas->i2c_clients[i] = i2c; else { palmas->i2c_clients[i] = - i2c_new_dummy(i2c->adapter, + i2c_new_dummy_device(i2c->adapter, i2c->addr + i); - if (!palmas->i2c_clients[i]) { + if (IS_ERR(palmas->i2c_clients[i])) { dev_err(palmas->dev, "can't attach client %d\n", i); - ret = -ENOMEM; + ret = PTR_ERR(palmas->i2c_clients[i]); goto err_i2c; } palmas->i2c_clients[i]->dev.of_node = of_node_get(node); From ba972dac9854a76202c4e798fa618e6d56ca2f81 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 22 Jul 2019 19:26:21 +0200 Subject: [PATCH 074/721] mfd: twl-core: Convert to i2c_new_dummy_device Move from i2c_new_dummy() to i2c_new_dummy_device(), so we now get an ERRPTR which we use in error handling. Signed-off-by: Wolfram Sang Signed-off-by: Lee Jones --- drivers/mfd/twl-core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/mfd/twl-core.c b/drivers/mfd/twl-core.c index 448d9397ff04..20cf8cfe4f3b 100644 --- a/drivers/mfd/twl-core.c +++ b/drivers/mfd/twl-core.c @@ -1141,12 +1141,12 @@ twl_probe(struct i2c_client *client, const struct i2c_device_id *id) if (i == 0) { twl->client = client; } else { - twl->client = i2c_new_dummy(client->adapter, + twl->client = i2c_new_dummy_device(client->adapter, client->addr + i); - if (!twl->client) { + if (IS_ERR(twl->client)) { dev_err(&client->dev, "can't attach client %d\n", i); - status = -ENOMEM; + status = PTR_ERR(twl->client); goto fail; } } From 865cac14c2dac2fa5f16d1781ca2746b0d323796 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Mon, 8 Jul 2019 14:41:29 +0200 Subject: [PATCH 075/721] backlight: rave-sp: Leave initial state and register with correct device This way the backlight can be referenced through its device node and enabling/disabling can be managed through the panel driver. Signed-off-by: Lucas Stach Reviewed-by: Daniel Thompson Signed-off-by: Lee Jones --- drivers/video/backlight/rave-sp-backlight.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/video/backlight/rave-sp-backlight.c b/drivers/video/backlight/rave-sp-backlight.c index 462f14a1b19d..05b5f003a3d1 100644 --- a/drivers/video/backlight/rave-sp-backlight.c +++ b/drivers/video/backlight/rave-sp-backlight.c @@ -48,14 +48,20 @@ static int rave_sp_backlight_probe(struct platform_device *pdev) struct device *dev = &pdev->dev; struct backlight_device *bd; - bd = devm_backlight_device_register(dev, pdev->name, dev->parent, + bd = devm_backlight_device_register(dev, pdev->name, dev, dev_get_drvdata(dev->parent), &rave_sp_backlight_ops, &rave_sp_backlight_props); if (IS_ERR(bd)) return PTR_ERR(bd); - backlight_update_status(bd); + /* + * If there is a phandle pointing to the device node we can + * assume that another device will manage the status changes. + * If not we make sure the backlight is in a consistent state. + */ + if (!dev->of_node->phandle) + backlight_update_status(bd); return 0; } From 072e2c8192cfc6ae1de1a2aca02d4512b69bdeed Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 23 Jul 2019 22:34:00 +0300 Subject: [PATCH 076/721] backlight: lm3630a: Switch to use fwnode_property_count_uXX() Use use fwnode_property_count_uXX() directly, that makes code neater. Signed-off-by: Andy Shevchenko Reviewed-by: Daniel Thompson Signed-off-by: Lee Jones --- drivers/video/backlight/lm3630a_bl.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/video/backlight/lm3630a_bl.c b/drivers/video/backlight/lm3630a_bl.c index b04b35d007a2..2d8e8192e4e2 100644 --- a/drivers/video/backlight/lm3630a_bl.c +++ b/drivers/video/backlight/lm3630a_bl.c @@ -377,8 +377,7 @@ static int lm3630a_parse_led_sources(struct fwnode_handle *node, u32 sources[LM3630A_NUM_SINKS]; int ret, num_sources, i; - num_sources = fwnode_property_read_u32_array(node, "led-sources", NULL, - 0); + num_sources = fwnode_property_count_u32(node, "led-sources"); if (num_sources < 0) return default_led_sources; else if (num_sources > ARRAY_SIZE(sources)) From 30f644fd6ec9a0a241adb91579aaca7404132d7b Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Wed, 24 Jul 2019 23:38:28 +0200 Subject: [PATCH 077/721] backlight: lms283gf05: Fix a typo in the description passed to 'devm_gpio_request_one()' The description passed to 'devm_gpio_request_one()' should be related to LMS283GF05, not LMS285GF05. Signed-off-by: Christophe JAILLET Reviewed-by: Daniel Thompson Signed-off-by: Lee Jones --- drivers/video/backlight/lms283gf05.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/video/backlight/lms283gf05.c b/drivers/video/backlight/lms283gf05.c index 35bc012b22cc..0e45685bcc1c 100644 --- a/drivers/video/backlight/lms283gf05.c +++ b/drivers/video/backlight/lms283gf05.c @@ -158,7 +158,7 @@ static int lms283gf05_probe(struct spi_device *spi) ret = devm_gpio_request_one(&spi->dev, pdata->reset_gpio, GPIOF_DIR_OUT | (!pdata->reset_inverted ? GPIOF_INIT_HIGH : GPIOF_INIT_LOW), - "LMS285GF05 RESET"); + "LMS283GF05 RESET"); if (ret) return ret; } From 76380a607ba0b28627c9b4b55cd47a079a59624b Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Fri, 5 Jul 2019 12:55:03 +0800 Subject: [PATCH 078/721] mfd: intel-lpss: Remove D3cold delay Goodix touchpad may drop its first couple input events when i2c-designware-platdrv and intel-lpss it connects to took too long to runtime resume from runtime suspended state. This issue happens becuase the touchpad has a rather small buffer to store up to 13 input events, so if the host doesn't read those events in time (i.e. runtime resume takes too long), events are dropped from the touchpad's buffer. The bottleneck is D3cold delay it waits when transitioning from D3cold to D0, hence remove the delay to make the resume faster. I've tested some systems with intel-lpss and haven't seen any regression. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=202683 Signed-off-by: Kai-Heng Feng Reviewed-by: Andy Shevchenko Signed-off-by: Lee Jones --- drivers/mfd/intel-lpss-pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/mfd/intel-lpss-pci.c b/drivers/mfd/intel-lpss-pci.c index ade6e1ce5a98..e3a04929aaa3 100644 --- a/drivers/mfd/intel-lpss-pci.c +++ b/drivers/mfd/intel-lpss-pci.c @@ -35,6 +35,8 @@ static int intel_lpss_pci_probe(struct pci_dev *pdev, info->mem = &pdev->resource[0]; info->irq = pdev->irq; + pdev->d3cold_delay = 0; + /* Probably it is enough to set this for iDMA capable devices only */ pci_set_master(pdev); pci_try_set_mwi(pdev); From ea1acf11ee7ac2d63d93e78c638ec8abc710a1a7 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Sun, 28 Jul 2019 18:58:58 -0500 Subject: [PATCH 079/721] mfd: omap-usb-host: Mark expected switch fall-throughs Mark switch cases where we are expecting to fall through. This patch fixes the following warnings: drivers/mfd/omap-usb-host.c: In function 'usbhs_runtime_resume': drivers/mfd/omap-usb-host.c:303:7: warning: this statement may fall through [-Wimplicit-fallthrough=] if (!IS_ERR(omap->hsic480m_clk[i])) { ^ drivers/mfd/omap-usb-host.c:313:3: note: here case OMAP_EHCI_PORT_MODE_TLL: ^~~~ drivers/mfd/omap-usb-host.c: In function 'usbhs_runtime_suspend': drivers/mfd/omap-usb-host.c:345:7: warning: this statement may fall through [-Wimplicit-fallthrough=] if (!IS_ERR(omap->hsic480m_clk[i])) ^ drivers/mfd/omap-usb-host.c:349:3: note: here case OMAP_EHCI_PORT_MODE_TLL: ^~~~ Reported-by: Stephen Rothwell Signed-off-by: Gustavo A. R. Silva Reviewed-by: Kees Cook Signed-off-by: Lee Jones --- drivers/mfd/omap-usb-host.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mfd/omap-usb-host.c b/drivers/mfd/omap-usb-host.c index 792b855a9104..4798d9f3f9d5 100644 --- a/drivers/mfd/omap-usb-host.c +++ b/drivers/mfd/omap-usb-host.c @@ -308,7 +308,7 @@ static int usbhs_runtime_resume(struct device *dev) i, r); } } - /* Fall through as HSIC mode needs utmi_clk */ + /* Fall through - as HSIC mode needs utmi_clk */ case OMAP_EHCI_PORT_MODE_TLL: if (!IS_ERR(omap->utmi_clk[i])) { @@ -344,7 +344,7 @@ static int usbhs_runtime_suspend(struct device *dev) if (!IS_ERR(omap->hsic480m_clk[i])) clk_disable_unprepare(omap->hsic480m_clk[i]); - /* Fall through as utmi_clks were used in HSIC mode */ + /* Fall through - as utmi_clks were used in HSIC mode */ case OMAP_EHCI_PORT_MODE_TLL: if (!IS_ERR(omap->utmi_clk[i])) From ff71266aa490a9f8ed761d47945f300ba19e7c93 Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Thu, 25 Jul 2019 18:02:14 -0400 Subject: [PATCH 080/721] mfd: Drop obsolete JZ4740 driver It has been replaced with the ingenic-iio driver for the ADC. Signed-off-by: Paul Cercueil Tested-by: Artur Rojek Signed-off-by: Lee Jones --- drivers/mfd/Kconfig | 9 -- drivers/mfd/Makefile | 1 - drivers/mfd/jz4740-adc.c | 324 --------------------------------------- 3 files changed, 334 deletions(-) delete mode 100644 drivers/mfd/jz4740-adc.c diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index f129f9678940..4a07afe50b35 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -649,15 +649,6 @@ config MFD_JANZ_CMODIO host many different types of MODULbus daughterboards, including CAN and GPIO controllers. -config MFD_JZ4740_ADC - bool "Janz JZ4740 ADC core" - select MFD_CORE - select GENERIC_IRQ_CHIP - depends on MACH_JZ4740 - help - Say yes here if you want support for the ADC unit in the JZ4740 SoC. - This driver is necessary for jz4740-battery and jz4740-hwmon driver. - config MFD_KEMPLD tristate "Kontron module PLD device" select MFD_CORE diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index f026ada68f6a..446d5df7cacb 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -191,7 +191,6 @@ obj-$(CONFIG_LPC_SCH) += lpc_sch.o obj-$(CONFIG_LPC_ICH) += lpc_ich.o obj-$(CONFIG_MFD_RDC321X) += rdc321x-southbridge.o obj-$(CONFIG_MFD_JANZ_CMODIO) += janz-cmodio.o -obj-$(CONFIG_MFD_JZ4740_ADC) += jz4740-adc.o obj-$(CONFIG_MFD_TPS6586X) += tps6586x.o obj-$(CONFIG_MFD_VX855) += vx855.o obj-$(CONFIG_MFD_WL1273_CORE) += wl1273-core.o diff --git a/drivers/mfd/jz4740-adc.c b/drivers/mfd/jz4740-adc.c deleted file mode 100644 index 082f16917519..000000000000 --- a/drivers/mfd/jz4740-adc.c +++ /dev/null @@ -1,324 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright (C) 2009-2010, Lars-Peter Clausen - * JZ4740 SoC ADC driver - * - * This driver synchronizes access to the JZ4740 ADC core between the - * JZ4740 battery and hwmon drivers. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - - -#define JZ_REG_ADC_ENABLE 0x00 -#define JZ_REG_ADC_CFG 0x04 -#define JZ_REG_ADC_CTRL 0x08 -#define JZ_REG_ADC_STATUS 0x0c - -#define JZ_REG_ADC_TOUCHSCREEN_BASE 0x10 -#define JZ_REG_ADC_BATTERY_BASE 0x1c -#define JZ_REG_ADC_HWMON_BASE 0x20 - -#define JZ_ADC_ENABLE_TOUCH BIT(2) -#define JZ_ADC_ENABLE_BATTERY BIT(1) -#define JZ_ADC_ENABLE_ADCIN BIT(0) - -enum { - JZ_ADC_IRQ_ADCIN = 0, - JZ_ADC_IRQ_BATTERY, - JZ_ADC_IRQ_TOUCH, - JZ_ADC_IRQ_PENUP, - JZ_ADC_IRQ_PENDOWN, -}; - -struct jz4740_adc { - struct resource *mem; - void __iomem *base; - - int irq; - struct irq_chip_generic *gc; - - struct clk *clk; - atomic_t clk_ref; - - spinlock_t lock; -}; - -static void jz4740_adc_irq_demux(struct irq_desc *desc) -{ - struct irq_chip_generic *gc = irq_desc_get_handler_data(desc); - uint8_t status; - unsigned int i; - - status = readb(gc->reg_base + JZ_REG_ADC_STATUS); - - for (i = 0; i < 5; ++i) { - if (status & BIT(i)) - generic_handle_irq(gc->irq_base + i); - } -} - - -/* Refcounting for the ADC clock is done in here instead of in the clock - * framework, because it is the only clock which is shared between multiple - * devices and thus is the only clock which needs refcounting */ -static inline void jz4740_adc_clk_enable(struct jz4740_adc *adc) -{ - if (atomic_inc_return(&adc->clk_ref) == 1) - clk_prepare_enable(adc->clk); -} - -static inline void jz4740_adc_clk_disable(struct jz4740_adc *adc) -{ - if (atomic_dec_return(&adc->clk_ref) == 0) - clk_disable_unprepare(adc->clk); -} - -static inline void jz4740_adc_set_enabled(struct jz4740_adc *adc, int engine, - bool enabled) -{ - unsigned long flags; - uint8_t val; - - spin_lock_irqsave(&adc->lock, flags); - - val = readb(adc->base + JZ_REG_ADC_ENABLE); - if (enabled) - val |= BIT(engine); - else - val &= ~BIT(engine); - writeb(val, adc->base + JZ_REG_ADC_ENABLE); - - spin_unlock_irqrestore(&adc->lock, flags); -} - -static int jz4740_adc_cell_enable(struct platform_device *pdev) -{ - struct jz4740_adc *adc = dev_get_drvdata(pdev->dev.parent); - - jz4740_adc_clk_enable(adc); - jz4740_adc_set_enabled(adc, pdev->id, true); - - return 0; -} - -static int jz4740_adc_cell_disable(struct platform_device *pdev) -{ - struct jz4740_adc *adc = dev_get_drvdata(pdev->dev.parent); - - jz4740_adc_set_enabled(adc, pdev->id, false); - jz4740_adc_clk_disable(adc); - - return 0; -} - -int jz4740_adc_set_config(struct device *dev, uint32_t mask, uint32_t val) -{ - struct jz4740_adc *adc = dev_get_drvdata(dev); - unsigned long flags; - uint32_t cfg; - - if (!adc) - return -ENODEV; - - spin_lock_irqsave(&adc->lock, flags); - - cfg = readl(adc->base + JZ_REG_ADC_CFG); - - cfg &= ~mask; - cfg |= val; - - writel(cfg, adc->base + JZ_REG_ADC_CFG); - - spin_unlock_irqrestore(&adc->lock, flags); - - return 0; -} -EXPORT_SYMBOL_GPL(jz4740_adc_set_config); - -static struct resource jz4740_hwmon_resources[] = { - { - .start = JZ_ADC_IRQ_ADCIN, - .flags = IORESOURCE_IRQ, - }, - { - .start = JZ_REG_ADC_HWMON_BASE, - .end = JZ_REG_ADC_HWMON_BASE + 3, - .flags = IORESOURCE_MEM, - }, -}; - -static struct resource jz4740_battery_resources[] = { - { - .start = JZ_ADC_IRQ_BATTERY, - .flags = IORESOURCE_IRQ, - }, - { - .start = JZ_REG_ADC_BATTERY_BASE, - .end = JZ_REG_ADC_BATTERY_BASE + 3, - .flags = IORESOURCE_MEM, - }, -}; - -static const struct mfd_cell jz4740_adc_cells[] = { - { - .id = 0, - .name = "jz4740-hwmon", - .num_resources = ARRAY_SIZE(jz4740_hwmon_resources), - .resources = jz4740_hwmon_resources, - - .enable = jz4740_adc_cell_enable, - .disable = jz4740_adc_cell_disable, - }, - { - .id = 1, - .name = "jz4740-battery", - .num_resources = ARRAY_SIZE(jz4740_battery_resources), - .resources = jz4740_battery_resources, - - .enable = jz4740_adc_cell_enable, - .disable = jz4740_adc_cell_disable, - }, -}; - -static int jz4740_adc_probe(struct platform_device *pdev) -{ - struct irq_chip_generic *gc; - struct irq_chip_type *ct; - struct jz4740_adc *adc; - struct resource *mem_base; - int ret; - int irq_base; - - adc = devm_kzalloc(&pdev->dev, sizeof(*adc), GFP_KERNEL); - if (!adc) - return -ENOMEM; - - adc->irq = platform_get_irq(pdev, 0); - if (adc->irq < 0) { - ret = adc->irq; - dev_err(&pdev->dev, "Failed to get platform irq: %d\n", ret); - return ret; - } - - irq_base = platform_get_irq(pdev, 1); - if (irq_base < 0) { - dev_err(&pdev->dev, "Failed to get irq base: %d\n", irq_base); - return irq_base; - } - - mem_base = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!mem_base) { - dev_err(&pdev->dev, "Failed to get platform mmio resource\n"); - return -ENOENT; - } - - /* Only request the shared registers for the MFD driver */ - adc->mem = request_mem_region(mem_base->start, JZ_REG_ADC_STATUS, - pdev->name); - if (!adc->mem) { - dev_err(&pdev->dev, "Failed to request mmio memory region\n"); - return -EBUSY; - } - - adc->base = ioremap_nocache(adc->mem->start, resource_size(adc->mem)); - if (!adc->base) { - ret = -EBUSY; - dev_err(&pdev->dev, "Failed to ioremap mmio memory\n"); - goto err_release_mem_region; - } - - adc->clk = clk_get(&pdev->dev, "adc"); - if (IS_ERR(adc->clk)) { - ret = PTR_ERR(adc->clk); - dev_err(&pdev->dev, "Failed to get clock: %d\n", ret); - goto err_iounmap; - } - - spin_lock_init(&adc->lock); - atomic_set(&adc->clk_ref, 0); - - platform_set_drvdata(pdev, adc); - - gc = irq_alloc_generic_chip("INTC", 1, irq_base, adc->base, - handle_level_irq); - - ct = gc->chip_types; - ct->regs.mask = JZ_REG_ADC_CTRL; - ct->regs.ack = JZ_REG_ADC_STATUS; - ct->chip.irq_mask = irq_gc_mask_set_bit; - ct->chip.irq_unmask = irq_gc_mask_clr_bit; - ct->chip.irq_ack = irq_gc_ack_set_bit; - - irq_setup_generic_chip(gc, IRQ_MSK(5), IRQ_GC_INIT_MASK_CACHE, 0, - IRQ_NOPROBE | IRQ_LEVEL); - - adc->gc = gc; - - irq_set_chained_handler_and_data(adc->irq, jz4740_adc_irq_demux, gc); - - writeb(0x00, adc->base + JZ_REG_ADC_ENABLE); - writeb(0xff, adc->base + JZ_REG_ADC_CTRL); - - ret = mfd_add_devices(&pdev->dev, 0, jz4740_adc_cells, - ARRAY_SIZE(jz4740_adc_cells), mem_base, - irq_base, NULL); - if (ret < 0) - goto err_clk_put; - - return 0; - -err_clk_put: - clk_put(adc->clk); -err_iounmap: - iounmap(adc->base); -err_release_mem_region: - release_mem_region(adc->mem->start, resource_size(adc->mem)); - return ret; -} - -static int jz4740_adc_remove(struct platform_device *pdev) -{ - struct jz4740_adc *adc = platform_get_drvdata(pdev); - - mfd_remove_devices(&pdev->dev); - - irq_remove_generic_chip(adc->gc, IRQ_MSK(5), IRQ_NOPROBE | IRQ_LEVEL, 0); - kfree(adc->gc); - irq_set_chained_handler_and_data(adc->irq, NULL, NULL); - - iounmap(adc->base); - release_mem_region(adc->mem->start, resource_size(adc->mem)); - - clk_put(adc->clk); - - return 0; -} - -static struct platform_driver jz4740_adc_driver = { - .probe = jz4740_adc_probe, - .remove = jz4740_adc_remove, - .driver = { - .name = "jz4740-adc", - }, -}; - -module_platform_driver(jz4740_adc_driver); - -MODULE_DESCRIPTION("JZ4740 SoC ADC driver"); -MODULE_AUTHOR("Lars-Peter Clausen "); -MODULE_LICENSE("GPL"); -MODULE_ALIAS("platform:jz4740-adc"); From ec65b56046d27a21a5ae02eb7fcb321e1942a541 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 1 Aug 2019 16:28:41 +0300 Subject: [PATCH 081/721] mfd: intel-lpss: Add Intel Tiger Lake PCI IDs Intel Tiger Lake has the same LPSS than Intel Broxton. Add the new IDs to the list of supported devices. Signed-off-by: Andy Shevchenko Signed-off-by: Lee Jones --- drivers/mfd/intel-lpss-pci.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/drivers/mfd/intel-lpss-pci.c b/drivers/mfd/intel-lpss-pci.c index e3a04929aaa3..9355db29d2f9 100644 --- a/drivers/mfd/intel-lpss-pci.c +++ b/drivers/mfd/intel-lpss-pci.c @@ -258,6 +258,29 @@ static const struct pci_device_id intel_lpss_pci_ids[] = { { PCI_VDEVICE(INTEL, 0x9dea), (kernel_ulong_t)&cnl_i2c_info }, { PCI_VDEVICE(INTEL, 0x9deb), (kernel_ulong_t)&cnl_i2c_info }, { PCI_VDEVICE(INTEL, 0x9dfb), (kernel_ulong_t)&spt_info }, + /* TGL-LP */ + { PCI_VDEVICE(INTEL, 0xa0a8), (kernel_ulong_t)&bxt_uart_info }, + { PCI_VDEVICE(INTEL, 0xa0a9), (kernel_ulong_t)&bxt_uart_info }, + { PCI_VDEVICE(INTEL, 0xa0aa), (kernel_ulong_t)&spt_info }, + { PCI_VDEVICE(INTEL, 0xa0ab), (kernel_ulong_t)&spt_info }, + { PCI_VDEVICE(INTEL, 0xa0c5), (kernel_ulong_t)&spt_i2c_info }, + { PCI_VDEVICE(INTEL, 0xa0c6), (kernel_ulong_t)&spt_i2c_info }, + { PCI_VDEVICE(INTEL, 0xa0c7), (kernel_ulong_t)&bxt_uart_info }, + { PCI_VDEVICE(INTEL, 0xa0d8), (kernel_ulong_t)&spt_i2c_info }, + { PCI_VDEVICE(INTEL, 0xa0d9), (kernel_ulong_t)&spt_i2c_info }, + { PCI_VDEVICE(INTEL, 0xa0da), (kernel_ulong_t)&bxt_uart_info }, + { PCI_VDEVICE(INTEL, 0xa0db), (kernel_ulong_t)&bxt_uart_info }, + { PCI_VDEVICE(INTEL, 0xa0dc), (kernel_ulong_t)&bxt_uart_info }, + { PCI_VDEVICE(INTEL, 0xa0dd), (kernel_ulong_t)&bxt_uart_info }, + { PCI_VDEVICE(INTEL, 0xa0de), (kernel_ulong_t)&spt_info }, + { PCI_VDEVICE(INTEL, 0xa0df), (kernel_ulong_t)&spt_info }, + { PCI_VDEVICE(INTEL, 0xa0e8), (kernel_ulong_t)&spt_i2c_info }, + { PCI_VDEVICE(INTEL, 0xa0e9), (kernel_ulong_t)&spt_i2c_info }, + { PCI_VDEVICE(INTEL, 0xa0ea), (kernel_ulong_t)&spt_i2c_info }, + { PCI_VDEVICE(INTEL, 0xa0eb), (kernel_ulong_t)&spt_i2c_info }, + { PCI_VDEVICE(INTEL, 0xa0fb), (kernel_ulong_t)&spt_info }, + { PCI_VDEVICE(INTEL, 0xa0fd), (kernel_ulong_t)&spt_info }, + { PCI_VDEVICE(INTEL, 0xa0fe), (kernel_ulong_t)&spt_info }, /* SPT-H */ { PCI_VDEVICE(INTEL, 0xa127), (kernel_ulong_t)&spt_uart_info }, { PCI_VDEVICE(INTEL, 0xa128), (kernel_ulong_t)&spt_uart_info }, From b620c17672b9c162bbc1e480eaf43a825345cb2a Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Sun, 28 Jul 2019 18:56:14 -0500 Subject: [PATCH 082/721] mfd: db8500-prcmu: Mark expected switch fall-throughs Mark switch cases where we are expecting to fall through. This patch fixes the following warnings: drivers/mfd/db8500-prcmu.c: In function 'dsiclk_rate': drivers/mfd/db8500-prcmu.c:1592:7: warning: this statement may fall through [-Wimplicit-fallthrough=] div *= 2; ~~~~^~~~ drivers/mfd/db8500-prcmu.c:1593:2: note: here case PRCM_DSI_PLLOUT_SEL_PHI_2: ^~~~ drivers/mfd/db8500-prcmu.c:1594:7: warning: this statement may fall through [-Wimplicit-fallthrough=] div *= 2; ~~~~^~~~ drivers/mfd/db8500-prcmu.c:1595:2: note: here case PRCM_DSI_PLLOUT_SEL_PHI: ^~~~ Reported-by: Stephen Rothwell Signed-off-by: Gustavo A. R. Silva Reviewed-by: Linus Walleij Reviewed-by: Kees Cook Signed-off-by: Lee Jones --- drivers/mfd/db8500-prcmu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/mfd/db8500-prcmu.c b/drivers/mfd/db8500-prcmu.c index 3f21e26b8d36..90e0f21bc49c 100644 --- a/drivers/mfd/db8500-prcmu.c +++ b/drivers/mfd/db8500-prcmu.c @@ -1590,8 +1590,10 @@ static unsigned long dsiclk_rate(u8 n) switch (divsel) { case PRCM_DSI_PLLOUT_SEL_PHI_4: div *= 2; + /* Fall through */ case PRCM_DSI_PLLOUT_SEL_PHI_2: div *= 2; + /* Fall through */ case PRCM_DSI_PLLOUT_SEL_PHI: return pll_rate(PRCM_PLLDSI_FREQ, clock_rate(PRCMU_HDMICLK), PLL_RAW) / div; From 802d9bd4fac70be2ea61fa83660a87a57d06bab0 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Tue, 30 Jul 2019 11:15:27 -0700 Subject: [PATCH 083/721] mfd: Remove dev_err() usage after platform_get_irq() We don't need dev_err() messages when platform_get_irq() fails now that platform_get_irq() prints an error message itself when something goes wrong. Let's remove these prints with a simple semantic patch. // @@ expression ret; struct platform_device *E; @@ ret = ( platform_get_irq(E, ...) | platform_get_irq_byname(E, ...) ); if ( \( ret < 0 \| ret <= 0 \) ) { ( -if (ret != -EPROBE_DEFER) -{ ... -dev_err(...); -... } | ... -dev_err(...); ) ... } // While we're here, remove braces on if statements that only have one statement (manually). Signed-off-by: Stephen Boyd Signed-off-by: Lee Jones --- drivers/mfd/ab8500-debugfs.c | 8 ++------ drivers/mfd/db8500-prcmu.c | 4 +--- drivers/mfd/fsl-imx25-tsadc.c | 4 +--- drivers/mfd/intel_soc_pmic_bxtwc.c | 4 +--- drivers/mfd/qcom_rpm.c | 12 +++--------- drivers/mfd/sm501.c | 4 +--- 6 files changed, 9 insertions(+), 27 deletions(-) diff --git a/drivers/mfd/ab8500-debugfs.c b/drivers/mfd/ab8500-debugfs.c index 567a34b073dd..f4e26b6e5362 100644 --- a/drivers/mfd/ab8500-debugfs.c +++ b/drivers/mfd/ab8500-debugfs.c @@ -2680,16 +2680,12 @@ static int ab8500_debug_probe(struct platform_device *plf) irq_ab8500 = res->start; irq_first = platform_get_irq_byname(plf, "IRQ_FIRST"); - if (irq_first < 0) { - dev_err(&plf->dev, "First irq not found, err %d\n", irq_first); + if (irq_first < 0) return irq_first; - } irq_last = platform_get_irq_byname(plf, "IRQ_LAST"); - if (irq_last < 0) { - dev_err(&plf->dev, "Last irq not found, err %d\n", irq_last); + if (irq_last < 0) return irq_last; - } ab8500_dir = debugfs_create_dir(AB8500_NAME_STRING, NULL); diff --git a/drivers/mfd/db8500-prcmu.c b/drivers/mfd/db8500-prcmu.c index 90e0f21bc49c..0f459c246634 100644 --- a/drivers/mfd/db8500-prcmu.c +++ b/drivers/mfd/db8500-prcmu.c @@ -3130,10 +3130,8 @@ static int db8500_prcmu_probe(struct platform_device *pdev) writel(ALL_MBOX_BITS, PRCM_ARM_IT1_CLR); irq = platform_get_irq(pdev, 0); - if (irq <= 0) { - dev_err(&pdev->dev, "no prcmu irq provided\n"); + if (irq <= 0) return irq; - } err = request_threaded_irq(irq, prcmu_irq_handler, prcmu_irq_thread_fn, IRQF_NO_SUSPEND, "prcmu", NULL); diff --git a/drivers/mfd/fsl-imx25-tsadc.c b/drivers/mfd/fsl-imx25-tsadc.c index 20791cab7263..a016b39fe9b0 100644 --- a/drivers/mfd/fsl-imx25-tsadc.c +++ b/drivers/mfd/fsl-imx25-tsadc.c @@ -69,10 +69,8 @@ static int mx25_tsadc_setup_irq(struct platform_device *pdev, int irq; irq = platform_get_irq(pdev, 0); - if (irq <= 0) { - dev_err(dev, "Failed to get irq\n"); + if (irq <= 0) return irq; - } tsadc->domain = irq_domain_add_simple(np, 2, 0, &mx25_tsadc_domain_ops, tsadc); diff --git a/drivers/mfd/intel_soc_pmic_bxtwc.c b/drivers/mfd/intel_soc_pmic_bxtwc.c index 6310c3bdb991..739cfb5b69fe 100644 --- a/drivers/mfd/intel_soc_pmic_bxtwc.c +++ b/drivers/mfd/intel_soc_pmic_bxtwc.c @@ -450,10 +450,8 @@ static int bxtwc_probe(struct platform_device *pdev) return -ENOMEM; ret = platform_get_irq(pdev, 0); - if (ret < 0) { - dev_err(&pdev->dev, "Invalid IRQ\n"); + if (ret < 0) return ret; - } pmic->irq = ret; dev_set_drvdata(&pdev->dev, pmic); diff --git a/drivers/mfd/qcom_rpm.c b/drivers/mfd/qcom_rpm.c index 4d7e9008628c..71bc34b74bc9 100644 --- a/drivers/mfd/qcom_rpm.c +++ b/drivers/mfd/qcom_rpm.c @@ -561,22 +561,16 @@ static int qcom_rpm_probe(struct platform_device *pdev) clk_prepare_enable(rpm->ramclk); /* Accepts NULL */ irq_ack = platform_get_irq_byname(pdev, "ack"); - if (irq_ack < 0) { - dev_err(&pdev->dev, "required ack interrupt missing\n"); + if (irq_ack < 0) return irq_ack; - } irq_err = platform_get_irq_byname(pdev, "err"); - if (irq_err < 0) { - dev_err(&pdev->dev, "required err interrupt missing\n"); + if (irq_err < 0) return irq_err; - } irq_wakeup = platform_get_irq_byname(pdev, "wakeup"); - if (irq_wakeup < 0) { - dev_err(&pdev->dev, "required wakeup interrupt missing\n"); + if (irq_wakeup < 0) return irq_wakeup; - } match = of_match_device(qcom_rpm_of_match, &pdev->dev); if (!match) diff --git a/drivers/mfd/sm501.c b/drivers/mfd/sm501.c index 9b9b06d36cb1..d5e34a5eb250 100644 --- a/drivers/mfd/sm501.c +++ b/drivers/mfd/sm501.c @@ -1394,10 +1394,8 @@ static int sm501_plat_probe(struct platform_device *dev) sm->platdata = dev_get_platdata(&dev->dev); ret = platform_get_irq(dev, 0); - if (ret < 0) { - dev_err(&dev->dev, "failed to get irq resource\n"); + if (ret < 0) goto err_res; - } sm->irq = ret; sm->io_res = platform_get_resource(dev, IORESOURCE_MEM, 1); From 9e38e690ace3e7a22a81fc02652fc101efb340cf Mon Sep 17 00:00:00 2001 From: Nishka Dasgupta Date: Wed, 24 Jul 2019 13:54:12 +0530 Subject: [PATCH 084/721] PCI: tegra: Fix OF node reference leak Each iteration of for_each_child_of_node() executes of_node_put() on the previous node, but in some return paths in the middle of the loop of_node_put() is missing thus causing a reference leak. Hence stash these mid-loop return values in a variable 'err' and add a new label err_node_put which executes of_node_put() on the previous node and returns 'err' on failure. Change mid-loop return statements to point to jump to this label to fix the reference leak. Issue found with Coccinelle. Signed-off-by: Nishka Dasgupta [lorenzo.pieralisi@arm.com: rewrote commit log] Signed-off-by: Lorenzo Pieralisi --- drivers/pci/controller/pci-tegra.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/drivers/pci/controller/pci-tegra.c b/drivers/pci/controller/pci-tegra.c index 9a917b2456f6..673a1725ef38 100644 --- a/drivers/pci/controller/pci-tegra.c +++ b/drivers/pci/controller/pci-tegra.c @@ -2237,14 +2237,15 @@ static int tegra_pcie_parse_dt(struct tegra_pcie *pcie) err = of_pci_get_devfn(port); if (err < 0) { dev_err(dev, "failed to parse address: %d\n", err); - return err; + goto err_node_put; } index = PCI_SLOT(err); if (index < 1 || index > soc->num_ports) { dev_err(dev, "invalid port number: %d\n", index); - return -EINVAL; + err = -EINVAL; + goto err_node_put; } index--; @@ -2253,12 +2254,13 @@ static int tegra_pcie_parse_dt(struct tegra_pcie *pcie) if (err < 0) { dev_err(dev, "failed to parse # of lanes: %d\n", err); - return err; + goto err_node_put; } if (value > 16) { dev_err(dev, "invalid # of lanes: %u\n", value); - return -EINVAL; + err = -EINVAL; + goto err_node_put; } lanes |= value << (index << 3); @@ -2272,13 +2274,15 @@ static int tegra_pcie_parse_dt(struct tegra_pcie *pcie) lane += value; rp = devm_kzalloc(dev, sizeof(*rp), GFP_KERNEL); - if (!rp) - return -ENOMEM; + if (!rp) { + err = -ENOMEM; + goto err_node_put; + } err = of_address_to_resource(port, 0, &rp->regs); if (err < 0) { dev_err(dev, "failed to parse address: %d\n", err); - return err; + goto err_node_put; } INIT_LIST_HEAD(&rp->list); @@ -2330,6 +2334,10 @@ static int tegra_pcie_parse_dt(struct tegra_pcie *pcie) return err; return 0; + +err_node_put: + of_node_put(port); + return err; } /* From 708cb5cc3fde63a33afa0aaf7a26fd7ea9015dd3 Mon Sep 17 00:00:00 2001 From: Hsin-Hsiung Wang Date: Mon, 5 Aug 2019 13:21:49 +0800 Subject: [PATCH 085/721] mfd: mt6397: Rename macros to something more readable Signed-off-by: Hsin-Hsiung Wang Signed-off-by: Lee Jones --- drivers/mfd/mt6397-core.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/mfd/mt6397-core.c b/drivers/mfd/mt6397-core.c index 337bcccdb914..c0708622f41b 100644 --- a/drivers/mfd/mt6397-core.c +++ b/drivers/mfd/mt6397-core.c @@ -10,17 +10,17 @@ #include #include #include -#include #include -#include +#include #include +#include #define MT6397_RTC_BASE 0xe000 #define MT6397_RTC_SIZE 0x3e -#define MT6323_CID_CODE 0x23 -#define MT6391_CID_CODE 0x91 -#define MT6397_CID_CODE 0x97 +#define MT6323_CHIP_ID 0x23 +#define MT6391_CHIP_ID 0x91 +#define MT6397_CHIP_ID 0x97 static const struct resource mt6397_rtc_resources[] = { { @@ -290,7 +290,7 @@ static int mt6397_probe(struct platform_device *pdev) return pmic->irq; switch (id & 0xff) { - case MT6323_CID_CODE: + case MT6323_CHIP_ID: pmic->int_con[0] = MT6323_INT_CON0; pmic->int_con[1] = MT6323_INT_CON1; pmic->int_status[0] = MT6323_INT_STATUS0; @@ -304,8 +304,8 @@ static int mt6397_probe(struct platform_device *pdev) 0, pmic->irq_domain); break; - case MT6397_CID_CODE: - case MT6391_CID_CODE: + case MT6391_CHIP_ID: + case MT6397_CHIP_ID: pmic->int_con[0] = MT6397_INT_CON0; pmic->int_con[1] = MT6397_INT_CON1; pmic->int_status[0] = MT6397_INT_STATUS0; From a4872e80ce7d2a1844328176dbf279d0a2b89bdb Mon Sep 17 00:00:00 2001 From: Hsin-Hsiung Wang Date: Mon, 5 Aug 2019 13:21:50 +0800 Subject: [PATCH 086/721] mfd: mt6397: Extract IRQ related code from core driver In order to support different types of irq design, we decide to add separate irq drivers for different design and keep mt6397 mfd core simple and reusable to all generations of PMICs so far. Signed-off-by: Hsin-Hsiung Wang Signed-off-by: Lee Jones --- drivers/mfd/Makefile | 3 +- drivers/mfd/mt6397-core.c | 146 -------------------------- drivers/mfd/mt6397-irq.c | 181 ++++++++++++++++++++++++++++++++ include/linux/mfd/mt6397/core.h | 9 ++ 4 files changed, 192 insertions(+), 147 deletions(-) create mode 100644 drivers/mfd/mt6397-irq.c diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index 446d5df7cacb..7b6a6aa4fe42 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -240,7 +240,8 @@ obj-$(CONFIG_INTEL_SOC_PMIC) += intel-soc-pmic.o obj-$(CONFIG_INTEL_SOC_PMIC_BXTWC) += intel_soc_pmic_bxtwc.o obj-$(CONFIG_INTEL_SOC_PMIC_CHTWC) += intel_soc_pmic_chtwc.o obj-$(CONFIG_INTEL_SOC_PMIC_CHTDC_TI) += intel_soc_pmic_chtdc_ti.o -obj-$(CONFIG_MFD_MT6397) += mt6397-core.o +mt6397-objs := mt6397-core.o mt6397-irq.o +obj-$(CONFIG_MFD_MT6397) += mt6397.o obj-$(CONFIG_MFD_ALTERA_A10SR) += altera-a10sr.o obj-$(CONFIG_MFD_ALTERA_SYSMGR) += altera-sysmgr.o diff --git a/drivers/mfd/mt6397-core.c b/drivers/mfd/mt6397-core.c index c0708622f41b..93c888153b77 100644 --- a/drivers/mfd/mt6397-core.c +++ b/drivers/mfd/mt6397-core.c @@ -18,10 +18,6 @@ #define MT6397_RTC_BASE 0xe000 #define MT6397_RTC_SIZE 0x3e -#define MT6323_CHIP_ID 0x23 -#define MT6391_CHIP_ID 0x91 -#define MT6397_CHIP_ID 0x97 - static const struct resource mt6397_rtc_resources[] = { { .start = MT6397_RTC_BASE, @@ -86,148 +82,6 @@ static const struct mfd_cell mt6397_devs[] = { } }; -static void mt6397_irq_lock(struct irq_data *data) -{ - struct mt6397_chip *mt6397 = irq_data_get_irq_chip_data(data); - - mutex_lock(&mt6397->irqlock); -} - -static void mt6397_irq_sync_unlock(struct irq_data *data) -{ - struct mt6397_chip *mt6397 = irq_data_get_irq_chip_data(data); - - regmap_write(mt6397->regmap, mt6397->int_con[0], - mt6397->irq_masks_cur[0]); - regmap_write(mt6397->regmap, mt6397->int_con[1], - mt6397->irq_masks_cur[1]); - - mutex_unlock(&mt6397->irqlock); -} - -static void mt6397_irq_disable(struct irq_data *data) -{ - struct mt6397_chip *mt6397 = irq_data_get_irq_chip_data(data); - int shift = data->hwirq & 0xf; - int reg = data->hwirq >> 4; - - mt6397->irq_masks_cur[reg] &= ~BIT(shift); -} - -static void mt6397_irq_enable(struct irq_data *data) -{ - struct mt6397_chip *mt6397 = irq_data_get_irq_chip_data(data); - int shift = data->hwirq & 0xf; - int reg = data->hwirq >> 4; - - mt6397->irq_masks_cur[reg] |= BIT(shift); -} - -#ifdef CONFIG_PM_SLEEP -static int mt6397_irq_set_wake(struct irq_data *irq_data, unsigned int on) -{ - struct mt6397_chip *mt6397 = irq_data_get_irq_chip_data(irq_data); - int shift = irq_data->hwirq & 0xf; - int reg = irq_data->hwirq >> 4; - - if (on) - mt6397->wake_mask[reg] |= BIT(shift); - else - mt6397->wake_mask[reg] &= ~BIT(shift); - - return 0; -} -#else -#define mt6397_irq_set_wake NULL -#endif - -static struct irq_chip mt6397_irq_chip = { - .name = "mt6397-irq", - .irq_bus_lock = mt6397_irq_lock, - .irq_bus_sync_unlock = mt6397_irq_sync_unlock, - .irq_enable = mt6397_irq_enable, - .irq_disable = mt6397_irq_disable, - .irq_set_wake = mt6397_irq_set_wake, -}; - -static void mt6397_irq_handle_reg(struct mt6397_chip *mt6397, int reg, - int irqbase) -{ - unsigned int status; - int i, irq, ret; - - ret = regmap_read(mt6397->regmap, reg, &status); - if (ret) { - dev_err(mt6397->dev, "Failed to read irq status: %d\n", ret); - return; - } - - for (i = 0; i < 16; i++) { - if (status & BIT(i)) { - irq = irq_find_mapping(mt6397->irq_domain, irqbase + i); - if (irq) - handle_nested_irq(irq); - } - } - - regmap_write(mt6397->regmap, reg, status); -} - -static irqreturn_t mt6397_irq_thread(int irq, void *data) -{ - struct mt6397_chip *mt6397 = data; - - mt6397_irq_handle_reg(mt6397, mt6397->int_status[0], 0); - mt6397_irq_handle_reg(mt6397, mt6397->int_status[1], 16); - - return IRQ_HANDLED; -} - -static int mt6397_irq_domain_map(struct irq_domain *d, unsigned int irq, - irq_hw_number_t hw) -{ - struct mt6397_chip *mt6397 = d->host_data; - - irq_set_chip_data(irq, mt6397); - irq_set_chip_and_handler(irq, &mt6397_irq_chip, handle_level_irq); - irq_set_nested_thread(irq, 1); - irq_set_noprobe(irq); - - return 0; -} - -static const struct irq_domain_ops mt6397_irq_domain_ops = { - .map = mt6397_irq_domain_map, -}; - -static int mt6397_irq_init(struct mt6397_chip *mt6397) -{ - int ret; - - mutex_init(&mt6397->irqlock); - - /* Mask all interrupt sources */ - regmap_write(mt6397->regmap, mt6397->int_con[0], 0x0); - regmap_write(mt6397->regmap, mt6397->int_con[1], 0x0); - - mt6397->irq_domain = irq_domain_add_linear(mt6397->dev->of_node, - MT6397_IRQ_NR, &mt6397_irq_domain_ops, mt6397); - if (!mt6397->irq_domain) { - dev_err(mt6397->dev, "could not create irq domain\n"); - return -ENOMEM; - } - - ret = devm_request_threaded_irq(mt6397->dev, mt6397->irq, NULL, - mt6397_irq_thread, IRQF_ONESHOT, "mt6397-pmic", mt6397); - if (ret) { - dev_err(mt6397->dev, "failed to register irq=%d; err: %d\n", - mt6397->irq, ret); - return ret; - } - - return 0; -} - #ifdef CONFIG_PM_SLEEP static int mt6397_irq_suspend(struct device *dev) { diff --git a/drivers/mfd/mt6397-irq.c b/drivers/mfd/mt6397-irq.c new file mode 100644 index 000000000000..b2d3ce1f3115 --- /dev/null +++ b/drivers/mfd/mt6397-irq.c @@ -0,0 +1,181 @@ +// SPDX-License-Identifier: GPL-2.0 +// +// Copyright (c) 2019 MediaTek Inc. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void mt6397_irq_lock(struct irq_data *data) +{ + struct mt6397_chip *mt6397 = irq_data_get_irq_chip_data(data); + + mutex_lock(&mt6397->irqlock); +} + +static void mt6397_irq_sync_unlock(struct irq_data *data) +{ + struct mt6397_chip *mt6397 = irq_data_get_irq_chip_data(data); + + regmap_write(mt6397->regmap, mt6397->int_con[0], + mt6397->irq_masks_cur[0]); + regmap_write(mt6397->regmap, mt6397->int_con[1], + mt6397->irq_masks_cur[1]); + + mutex_unlock(&mt6397->irqlock); +} + +static void mt6397_irq_disable(struct irq_data *data) +{ + struct mt6397_chip *mt6397 = irq_data_get_irq_chip_data(data); + int shift = data->hwirq & 0xf; + int reg = data->hwirq >> 4; + + mt6397->irq_masks_cur[reg] &= ~BIT(shift); +} + +static void mt6397_irq_enable(struct irq_data *data) +{ + struct mt6397_chip *mt6397 = irq_data_get_irq_chip_data(data); + int shift = data->hwirq & 0xf; + int reg = data->hwirq >> 4; + + mt6397->irq_masks_cur[reg] |= BIT(shift); +} + +#ifdef CONFIG_PM_SLEEP +static int mt6397_irq_set_wake(struct irq_data *irq_data, unsigned int on) +{ + struct mt6397_chip *mt6397 = irq_data_get_irq_chip_data(irq_data); + int shift = irq_data->hwirq & 0xf; + int reg = irq_data->hwirq >> 4; + + if (on) + mt6397->wake_mask[reg] |= BIT(shift); + else + mt6397->wake_mask[reg] &= ~BIT(shift); + + return 0; +} +#else +#define mt6397_irq_set_wake NULL +#endif + +static struct irq_chip mt6397_irq_chip = { + .name = "mt6397-irq", + .irq_bus_lock = mt6397_irq_lock, + .irq_bus_sync_unlock = mt6397_irq_sync_unlock, + .irq_enable = mt6397_irq_enable, + .irq_disable = mt6397_irq_disable, + .irq_set_wake = mt6397_irq_set_wake, +}; + +static void mt6397_irq_handle_reg(struct mt6397_chip *mt6397, int reg, + int irqbase) +{ + unsigned int status; + int i, irq, ret; + + ret = regmap_read(mt6397->regmap, reg, &status); + if (ret) { + dev_err(mt6397->dev, "Failed to read irq status: %d\n", ret); + return; + } + + for (i = 0; i < 16; i++) { + if (status & BIT(i)) { + irq = irq_find_mapping(mt6397->irq_domain, irqbase + i); + if (irq) + handle_nested_irq(irq); + } + } + + regmap_write(mt6397->regmap, reg, status); +} + +static irqreturn_t mt6397_irq_thread(int irq, void *data) +{ + struct mt6397_chip *mt6397 = data; + + mt6397_irq_handle_reg(mt6397, mt6397->int_status[0], 0); + mt6397_irq_handle_reg(mt6397, mt6397->int_status[1], 16); + + return IRQ_HANDLED; +} + +static int mt6397_irq_domain_map(struct irq_domain *d, unsigned int irq, + irq_hw_number_t hw) +{ + struct mt6397_chip *mt6397 = d->host_data; + + irq_set_chip_data(irq, mt6397); + irq_set_chip_and_handler(irq, &mt6397_irq_chip, handle_level_irq); + irq_set_nested_thread(irq, 1); + irq_set_noprobe(irq); + + return 0; +} + +static const struct irq_domain_ops mt6397_irq_domain_ops = { + .map = mt6397_irq_domain_map, +}; + +int mt6397_irq_init(struct mt6397_chip *chip) +{ + int ret; + + mutex_init(&chip->irqlock); + + switch (chip->chip_id) { + case MT6323_CHIP_ID: + chip->int_con[0] = MT6323_INT_CON0; + chip->int_con[1] = MT6323_INT_CON1; + chip->int_status[0] = MT6323_INT_STATUS0; + chip->int_status[1] = MT6323_INT_STATUS1; + break; + + case MT6391_CHIP_ID: + case MT6397_CHIP_ID: + chip->int_con[0] = MT6397_INT_CON0; + chip->int_con[1] = MT6397_INT_CON1; + chip->int_status[0] = MT6397_INT_STATUS0; + chip->int_status[1] = MT6397_INT_STATUS1; + break; + + default: + dev_err(chip->dev, "unsupported chip: 0x%x\n", chip->chip_id); + return -ENODEV; + } + + /* Mask all interrupt sources */ + regmap_write(chip->regmap, chip->int_con[0], 0x0); + regmap_write(chip->regmap, chip->int_con[1], 0x0); + + chip->irq_domain = irq_domain_add_linear(chip->dev->of_node, + MT6397_IRQ_NR, + &mt6397_irq_domain_ops, + chip); + if (!chip->irq_domain) { + dev_err(chip->dev, "could not create irq domain\n"); + return -ENOMEM; + } + + ret = devm_request_threaded_irq(chip->dev, chip->irq, NULL, + mt6397_irq_thread, IRQF_ONESHOT, + "mt6397-pmic", chip); + if (ret) { + dev_err(chip->dev, "failed to register irq=%d; err: %d\n", + chip->irq, ret); + return ret; + } + + return 0; +} diff --git a/include/linux/mfd/mt6397/core.h b/include/linux/mfd/mt6397/core.h index 25a95e72179b..9320c2a1e3e6 100644 --- a/include/linux/mfd/mt6397/core.h +++ b/include/linux/mfd/mt6397/core.h @@ -7,6 +7,12 @@ #ifndef __MFD_MT6397_CORE_H__ #define __MFD_MT6397_CORE_H__ +enum chip_id { + MT6323_CHIP_ID = 0x23, + MT6391_CHIP_ID = 0x91, + MT6397_CHIP_ID = 0x97, +}; + enum mt6397_irq_numbers { MT6397_IRQ_SPKL_AB = 0, MT6397_IRQ_SPKR_AB, @@ -54,6 +60,9 @@ struct mt6397_chip { u16 irq_masks_cache[2]; u16 int_con[2]; u16 int_status[2]; + u16 chip_id; }; +int mt6397_irq_init(struct mt6397_chip *chip); + #endif /* __MFD_MT6397_CORE_H__ */ From 533ca1feed98b0bf024779a14760694c7cb4d431 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Fri, 2 Aug 2019 22:50:20 +0000 Subject: [PATCH 087/721] PCI: hv: Avoid use of hv_pci_dev->pci_slot after freeing it The slot must be removed before the pci_dev is removed, otherwise a panic can happen due to use-after-free. Fixes: 15becc2b56c6 ("PCI: hv: Add hv_pci_remove_slots() when we unload the driver") Signed-off-by: Dexuan Cui Signed-off-by: Lorenzo Pieralisi Cc: stable@vger.kernel.org --- drivers/pci/controller/pci-hyperv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index 40b625458afa..2b53976cd9f9 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -2701,8 +2701,8 @@ static int hv_pci_remove(struct hv_device *hdev) /* Remove the bus from PCI's point of view. */ pci_lock_rescan_remove(); pci_stop_root_bus(hbus->pci_bus); - pci_remove_root_bus(hbus->pci_bus); hv_pci_remove_slots(hbus); + pci_remove_root_bus(hbus->pci_bus); pci_unlock_rescan_remove(); hbus->state = hv_pcibus_removed; } From ac5656d8a4cdd93cd2c74355ed12e5617817e0e7 Mon Sep 17 00:00:00 2001 From: Aaron Goidel Date: Mon, 12 Aug 2019 11:20:00 -0400 Subject: [PATCH 088/721] fanotify, inotify, dnotify, security: add security hook for fs notifications As of now, setting watches on filesystem objects has, at most, applied a check for read access to the inode, and in the case of fanotify, requires CAP_SYS_ADMIN. No specific security hook or permission check has been provided to control the setting of watches. Using any of inotify, dnotify, or fanotify, it is possible to observe, not only write-like operations, but even read access to a file. Modeling the watch as being merely a read from the file is insufficient for the needs of SELinux. This is due to the fact that read access should not necessarily imply access to information about when another process reads from a file. Furthermore, fanotify watches grant more power to an application in the form of permission events. While notification events are solely, unidirectional (i.e. they only pass information to the receiving application), permission events are blocking. Permission events make a request to the receiving application which will then reply with a decision as to whether or not that action may be completed. This causes the issue of the watching application having the ability to exercise control over the triggering process. Without drawing a distinction within the permission check, the ability to read would imply the greater ability to control an application. Additionally, mount and superblock watches apply to all files within the same mount or superblock. Read access to one file should not necessarily imply the ability to watch all files accessed within a given mount or superblock. In order to solve these issues, a new LSM hook is implemented and has been placed within the system calls for marking filesystem objects with inotify, fanotify, and dnotify watches. These calls to the hook are placed at the point at which the target path has been resolved and are provided with the path struct, the mask of requested notification events, and the type of object on which the mark is being set (inode, superblock, or mount). The mask and obj_type have already been translated into common FS_* values shared by the entirety of the fs notification infrastructure. The path struct is passed rather than just the inode so that the mount is available, particularly for mount watches. This also allows for use of the hook by pathname-based security modules. However, since the hook is intended for use even by inode based security modules, it is not placed under the CONFIG_SECURITY_PATH conditional. Otherwise, the inode-based security modules would need to enable all of the path hooks, even though they do not use any of them. This only provides a hook at the point of setting a watch, and presumes that permission to set a particular watch implies the ability to receive all notification about that object which match the mask. This is all that is required for SELinux. If other security modules require additional hooks or infrastructure to control delivery of notification, these can be added by them. It does not make sense for us to propose hooks for which we have no implementation. The understanding that all notifications received by the requesting application are all strictly of a type for which the application has been granted permission shows that this implementation is sufficient in its coverage. Security modules wishing to provide complete control over fanotify must also implement a security_file_open hook that validates that the access requested by the watching application is authorized. Fanotify has the issue that it returns a file descriptor with the file mode specified during fanotify_init() to the watching process on event. This is already covered by the LSM security_file_open hook if the security module implements checking of the requested file mode there. Otherwise, a watching process can obtain escalated access to a file for which it has not been authorized. The selinux_path_notify hook implementation works by adding five new file permissions: watch, watch_mount, watch_sb, watch_reads, and watch_with_perm (descriptions about which will follow), and one new filesystem permission: watch (which is applied to superblock checks). The hook then decides which subset of these permissions must be held by the requesting application based on the contents of the provided mask and the obj_type. The selinux_file_open hook already checks the requested file mode and therefore ensures that a watching process cannot escalate its access through fanotify. The watch, watch_mount, and watch_sb permissions are the baseline permissions for setting a watch on an object and each are a requirement for any watch to be set on a file, mount, or superblock respectively. It should be noted that having either of the other two permissions (watch_reads and watch_with_perm) does not imply the watch, watch_mount, or watch_sb permission. Superblock watches further require the filesystem watch permission to the superblock. As there is no labeled object in view for mounts, there is no specific check for mount watches beyond watch_mount to the inode. Such a check could be added in the future, if a suitable labeled object existed representing the mount. The watch_reads permission is required to receive notifications from read-exclusive events on filesystem objects. These events include accessing a file for the purpose of reading and closing a file which has been opened read-only. This distinction has been drawn in order to provide a direct indication in the policy for this otherwise not obvious capability. Read access to a file should not necessarily imply the ability to observe read events on a file. Finally, watch_with_perm only applies to fanotify masks since it is the only way to set a mask which allows for the blocking, permission event. This permission is needed for any watch which is of this type. Though fanotify requires CAP_SYS_ADMIN, this is insufficient as it gives implicit trust to root, which we do not do, and does not support least privilege. Signed-off-by: Aaron Goidel Acked-by: Casey Schaufler Acked-by: Jan Kara Signed-off-by: Paul Moore --- fs/notify/dnotify/dnotify.c | 15 +++++++-- fs/notify/fanotify/fanotify_user.c | 19 ++++++++++-- fs/notify/inotify/inotify_user.c | 14 +++++++-- include/linux/lsm_hooks.h | 9 +++++- include/linux/security.h | 10 ++++-- security/security.c | 6 ++++ security/selinux/hooks.c | 47 +++++++++++++++++++++++++++++ security/selinux/include/classmap.h | 5 +-- 8 files changed, 113 insertions(+), 12 deletions(-) diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 250369d6901d..87a7f61bc91c 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -288,6 +289,17 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) goto out_err; } + /* + * convert the userspace DN_* "arg" to the internal FS_* + * defined in fsnotify + */ + mask = convert_arg(arg); + + error = security_path_notify(&filp->f_path, mask, + FSNOTIFY_OBJ_TYPE_INODE); + if (error) + goto out_err; + /* expect most fcntl to add new rather than augment old */ dn = kmem_cache_alloc(dnotify_struct_cache, GFP_KERNEL); if (!dn) { @@ -302,9 +314,6 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) goto out_err; } - /* convert the userspace DN_* "arg" to the internal FS_* defines in fsnotify */ - mask = convert_arg(arg); - /* set up the new_fsn_mark and new_dn_mark */ new_fsn_mark = &new_dn_mark->fsn_mark; fsnotify_init_mark(new_fsn_mark, dnotify_group); diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index a90bb19dcfa2..8b4e2ad6d208 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -528,7 +528,8 @@ static const struct file_operations fanotify_fops = { }; static int fanotify_find_path(int dfd, const char __user *filename, - struct path *path, unsigned int flags) + struct path *path, unsigned int flags, __u64 mask, + unsigned int obj_type) { int ret; @@ -567,8 +568,15 @@ static int fanotify_find_path(int dfd, const char __user *filename, /* you can only watch an inode if you have read permissions on it */ ret = inode_permission(path->dentry->d_inode, MAY_READ); + if (ret) { + path_put(path); + goto out; + } + + ret = security_path_notify(path, mask, obj_type); if (ret) path_put(path); + out: return ret; } @@ -931,6 +939,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, __kernel_fsid_t __fsid, *fsid = NULL; u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS; unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; + unsigned int obj_type; int ret; pr_debug("%s: fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n", @@ -945,8 +954,13 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, switch (mark_type) { case FAN_MARK_INODE: + obj_type = FSNOTIFY_OBJ_TYPE_INODE; + break; case FAN_MARK_MOUNT: + obj_type = FSNOTIFY_OBJ_TYPE_VFSMOUNT; + break; case FAN_MARK_FILESYSTEM: + obj_type = FSNOTIFY_OBJ_TYPE_SB; break; default: return -EINVAL; @@ -1014,7 +1028,8 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, goto fput_and_out; } - ret = fanotify_find_path(dfd, pathname, &path, flags); + ret = fanotify_find_path(dfd, pathname, &path, flags, + (mask & ALL_FSNOTIFY_EVENTS), obj_type); if (ret) goto fput_and_out; diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 7b53598c8804..408e9917ed42 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -39,6 +39,7 @@ #include #include #include +#include #include "inotify.h" #include "../fdinfo.h" @@ -342,7 +343,8 @@ static const struct file_operations inotify_fops = { /* * find_inode - resolve a user-given path to a specific inode */ -static int inotify_find_inode(const char __user *dirname, struct path *path, unsigned flags) +static int inotify_find_inode(const char __user *dirname, struct path *path, + unsigned int flags, __u64 mask) { int error; @@ -351,8 +353,15 @@ static int inotify_find_inode(const char __user *dirname, struct path *path, uns return error; /* you can only watch an inode if you have read permissions on it */ error = inode_permission(path->dentry->d_inode, MAY_READ); + if (error) { + path_put(path); + return error; + } + error = security_path_notify(path, mask, + FSNOTIFY_OBJ_TYPE_INODE); if (error) path_put(path); + return error; } @@ -744,7 +753,8 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, if (mask & IN_ONLYDIR) flags |= LOOKUP_DIRECTORY; - ret = inotify_find_inode(pathname, &path, flags); + ret = inotify_find_inode(pathname, &path, flags, + (mask & IN_ALL_EVENTS)); if (ret) goto fput_and_out; diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 47f58cfb6a19..ead98af9c602 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -339,6 +339,9 @@ * Check for permission to change root directory. * @path contains the path structure. * Return 0 if permission is granted. + * @path_notify: + * Check permissions before setting a watch on events as defined by @mask, + * on an object at @path, whose type is defined by @obj_type. * @inode_readlink: * Check the permission to read the symbolic link. * @dentry contains the dentry structure for the file link. @@ -1535,7 +1538,9 @@ union security_list_options { int (*path_chown)(const struct path *path, kuid_t uid, kgid_t gid); int (*path_chroot)(const struct path *path); #endif - + /* Needed for inode based security check */ + int (*path_notify)(const struct path *path, u64 mask, + unsigned int obj_type); int (*inode_alloc_security)(struct inode *inode); void (*inode_free_security)(struct inode *inode); int (*inode_init_security)(struct inode *inode, struct inode *dir, @@ -1860,6 +1865,8 @@ struct security_hook_heads { struct hlist_head path_chown; struct hlist_head path_chroot; #endif + /* Needed for inode based modules as well */ + struct hlist_head path_notify; struct hlist_head inode_alloc_security; struct hlist_head inode_free_security; struct hlist_head inode_init_security; diff --git a/include/linux/security.h b/include/linux/security.h index 659071c2e57c..7d9c1da1f659 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -259,7 +259,8 @@ int security_dentry_create_files_as(struct dentry *dentry, int mode, struct qstr *name, const struct cred *old, struct cred *new); - +int security_path_notify(const struct path *path, u64 mask, + unsigned int obj_type); int security_inode_alloc(struct inode *inode); void security_inode_free(struct inode *inode); int security_inode_init_security(struct inode *inode, struct inode *dir, @@ -387,7 +388,6 @@ int security_ismaclabel(const char *name); int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen); int security_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid); void security_release_secctx(char *secdata, u32 seclen); - void security_inode_invalidate_secctx(struct inode *inode); int security_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen); int security_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen); @@ -621,6 +621,12 @@ static inline int security_move_mount(const struct path *from_path, return 0; } +static inline int security_path_notify(const struct path *path, u64 mask, + unsigned int obj_type) +{ + return 0; +} + static inline int security_inode_alloc(struct inode *inode) { return 0; diff --git a/security/security.c b/security/security.c index 613a5c00e602..30687e1366b7 100644 --- a/security/security.c +++ b/security/security.c @@ -871,6 +871,12 @@ int security_move_mount(const struct path *from_path, const struct path *to_path return call_int_hook(move_mount, 0, from_path, to_path); } +int security_path_notify(const struct path *path, u64 mask, + unsigned int obj_type) +{ + return call_int_hook(path_notify, 0, path, mask, obj_type); +} + int security_inode_alloc(struct inode *inode) { int rc = lsm_inode_alloc(inode); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index f77b314d0575..d55571c585ff 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -92,6 +92,8 @@ #include #include /* for hashlen_string() */ #include +#include +#include #include "avc.h" #include "objsec.h" @@ -3261,6 +3263,50 @@ static int selinux_inode_removexattr(struct dentry *dentry, const char *name) return -EACCES; } +static int selinux_path_notify(const struct path *path, u64 mask, + unsigned int obj_type) +{ + int ret; + u32 perm; + + struct common_audit_data ad; + + ad.type = LSM_AUDIT_DATA_PATH; + ad.u.path = *path; + + /* + * Set permission needed based on the type of mark being set. + * Performs an additional check for sb watches. + */ + switch (obj_type) { + case FSNOTIFY_OBJ_TYPE_VFSMOUNT: + perm = FILE__WATCH_MOUNT; + break; + case FSNOTIFY_OBJ_TYPE_SB: + perm = FILE__WATCH_SB; + ret = superblock_has_perm(current_cred(), path->dentry->d_sb, + FILESYSTEM__WATCH, &ad); + if (ret) + return ret; + break; + case FSNOTIFY_OBJ_TYPE_INODE: + perm = FILE__WATCH; + break; + default: + return -EINVAL; + } + + /* blocking watches require the file:watch_with_perm permission */ + if (mask & (ALL_FSNOTIFY_PERM_EVENTS)) + perm |= FILE__WATCH_WITH_PERM; + + /* watches on read-like events need the file:watch_reads permission */ + if (mask & (FS_ACCESS | FS_ACCESS_PERM | FS_CLOSE_NOWRITE)) + perm |= FILE__WATCH_READS; + + return path_has_perm(current_cred(), path, perm); +} + /* * Copy the inode security context value to the user. * @@ -6798,6 +6844,7 @@ static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(inode_getsecid, selinux_inode_getsecid), LSM_HOOK_INIT(inode_copy_up, selinux_inode_copy_up), LSM_HOOK_INIT(inode_copy_up_xattr, selinux_inode_copy_up_xattr), + LSM_HOOK_INIT(path_notify, selinux_path_notify), LSM_HOOK_INIT(kernfs_init_security, selinux_kernfs_init_security), diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h index 201f7e588a29..32e9b03be3dd 100644 --- a/security/selinux/include/classmap.h +++ b/security/selinux/include/classmap.h @@ -7,7 +7,8 @@ #define COMMON_FILE_PERMS COMMON_FILE_SOCK_PERMS, "unlink", "link", \ "rename", "execute", "quotaon", "mounton", "audit_access", \ - "open", "execmod" + "open", "execmod", "watch", "watch_mount", "watch_sb", \ + "watch_with_perm", "watch_reads" #define COMMON_SOCK_PERMS COMMON_FILE_SOCK_PERMS, "bind", "connect", \ "listen", "accept", "getopt", "setopt", "shutdown", "recvfrom", \ @@ -60,7 +61,7 @@ struct security_class_mapping secclass_map[] = { { "filesystem", { "mount", "remount", "unmount", "getattr", "relabelfrom", "relabelto", "associate", "quotamod", - "quotaget", NULL } }, + "quotaget", "watch", NULL } }, { "file", { COMMON_FILE_PERMS, "execute_no_trans", "entrypoint", NULL } }, From 448d5a55759a2e60208b75ceed4bd88cd4f5a963 Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Tue, 13 Aug 2019 17:06:15 +0530 Subject: [PATCH 089/721] PCI: Add #defines for some of PCIe spec r4.0 features Add #defines only for the Data Link Feature and Physical Layer 16.0 GT/s features as defined in PCIe spec r4.0, sec 7.7.4 for Data Link Feature and sec 7.7.5 for Physical Layer 16.0 GT/s. Signed-off-by: Vidya Sagar Signed-off-by: Lorenzo Pieralisi Reviewed-by: Thierry Reding Acked-by: Bjorn Helgaas --- include/uapi/linux/pci_regs.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index f28e562d7ca8..d28d0319d932 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -713,7 +713,9 @@ #define PCI_EXT_CAP_ID_DPC 0x1D /* Downstream Port Containment */ #define PCI_EXT_CAP_ID_L1SS 0x1E /* L1 PM Substates */ #define PCI_EXT_CAP_ID_PTM 0x1F /* Precision Time Measurement */ -#define PCI_EXT_CAP_ID_MAX PCI_EXT_CAP_ID_PTM +#define PCI_EXT_CAP_ID_DLF 0x25 /* Data Link Feature */ +#define PCI_EXT_CAP_ID_PL_16GT 0x26 /* Physical Layer 16.0 GT/s */ +#define PCI_EXT_CAP_ID_MAX PCI_EXT_CAP_ID_PL_16GT #define PCI_EXT_CAP_DSN_SIZEOF 12 #define PCI_EXT_CAP_MCAST_ENDPOINT_SIZEOF 40 @@ -1053,4 +1055,14 @@ #define PCI_L1SS_CTL1_LTR_L12_TH_SCALE 0xe0000000 /* LTR_L1.2_THRESHOLD_Scale */ #define PCI_L1SS_CTL2 0x0c /* Control 2 Register */ +/* Data Link Feature */ +#define PCI_DLF_CAP 0x04 /* Capabilities Register */ +#define PCI_DLF_EXCHANGE_ENABLE 0x80000000 /* Data Link Feature Exchange Enable */ + +/* Physical Layer 16.0 GT/s */ +#define PCI_PL_16GT_LE_CTRL 0x20 /* Lane Equalization Control Register */ +#define PCI_PL_16GT_LE_CTRL_DSP_TX_PRESET_MASK 0x0000000F +#define PCI_PL_16GT_LE_CTRL_USP_TX_PRESET_MASK 0x000000F0 +#define PCI_PL_16GT_LE_CTRL_USP_TX_PRESET_SHIFT 4 + #endif /* LINUX_PCI_REGS_H */ From 8c7e96d3fe75bb41fadf23e7a494f37d1cd9906c Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Tue, 13 Aug 2019 17:06:16 +0530 Subject: [PATCH 090/721] PCI: Disable MSI for Tegra root ports Tegra PCIe rootports don't generate MSI interrupts for PME and AER events. Since PCIe spec (Ref: r4.0 sec 7.7.1.2 and 7.7.2.2) doesn't support using a mix of INTx and MSI/MSI-X, MSI needs to be disabled to avoid root ports service drivers registering their respective ISRs with MSI interrupt and to let only INTx be used for all events. Signed-off-by: Vidya Sagar Signed-off-by: Lorenzo Pieralisi Reviewed-by: Thierry Reding --- drivers/pci/quirks.c | 53 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 208aacf39329..168782c5d23b 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -2592,6 +2592,59 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NVENET_15, nvenet_msi_disable); +/* + * PCIe spec r4.0 sec 7.7.1.2 and sec 7.7.2.2 say that if MSI/MSI-X is enabled, + * then the device can't use INTx interrupts. Tegra's PCIe root ports don't + * generate MSI interrupts for PME and AER events instead only INTx interrupts + * are generated. Though Tegra's PCIe root ports can generate MSI interrupts + * for other events, since PCIe specificiation doesn't support using a mix of + * INTx and MSI/MSI-X, it is required to disable MSI interrupts to avoid port + * service drivers registering their respective ISRs for MSIs. + */ +static void pci_quirk_nvidia_tegra_disable_rp_msi(struct pci_dev *dev) +{ + dev->no_msi = 1; +} +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x1ad0, + PCI_CLASS_BRIDGE_PCI, 8, + pci_quirk_nvidia_tegra_disable_rp_msi); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x1ad1, + PCI_CLASS_BRIDGE_PCI, 8, + pci_quirk_nvidia_tegra_disable_rp_msi); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x1ad2, + PCI_CLASS_BRIDGE_PCI, 8, + pci_quirk_nvidia_tegra_disable_rp_msi); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x0bf0, + PCI_CLASS_BRIDGE_PCI, 8, + pci_quirk_nvidia_tegra_disable_rp_msi); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x0bf1, + PCI_CLASS_BRIDGE_PCI, 8, + pci_quirk_nvidia_tegra_disable_rp_msi); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x0e1c, + PCI_CLASS_BRIDGE_PCI, 8, + pci_quirk_nvidia_tegra_disable_rp_msi); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x0e1d, + PCI_CLASS_BRIDGE_PCI, 8, + pci_quirk_nvidia_tegra_disable_rp_msi); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x0e12, + PCI_CLASS_BRIDGE_PCI, 8, + pci_quirk_nvidia_tegra_disable_rp_msi); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x0e13, + PCI_CLASS_BRIDGE_PCI, 8, + pci_quirk_nvidia_tegra_disable_rp_msi); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x0fae, + PCI_CLASS_BRIDGE_PCI, 8, + pci_quirk_nvidia_tegra_disable_rp_msi); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x0faf, + PCI_CLASS_BRIDGE_PCI, 8, + pci_quirk_nvidia_tegra_disable_rp_msi); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x10e5, + PCI_CLASS_BRIDGE_PCI, 8, + pci_quirk_nvidia_tegra_disable_rp_msi); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x10e6, + PCI_CLASS_BRIDGE_PCI, 8, + pci_quirk_nvidia_tegra_disable_rp_msi); + /* * Some versions of the MCP55 bridge from Nvidia have a legacy IRQ routing * config register. This register controls the routing of legacy From 3924bc2fd1b607faa5cb0bc0655e9853656de31a Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Tue, 13 Aug 2019 17:06:17 +0530 Subject: [PATCH 091/721] PCI: dwc: Group DBI registers writes requiring unlocking Some of DesignWare core's DBI registers (a.k.a configuration space registers) are write-protected with a lock without enabling which they are read-only by default. These write-protected registers are implementation specific. Tegra194's BAR-0 register which is at offset 0x10 in the configuration space is an example. Current implementation in dw_pcie_setup_rc() API attempts to unlock those write-protected registers whenever they are updated and lock them back again for writing. Group all write-protected registers writes so that locking and unlocking is performed once to avoid bloating the code with multiple unlock/lock sequences for all those write-protected registers. Signed-off-by: Vidya Sagar Signed-off-by: Lorenzo Pieralisi Reviewed-by: Thierry Reding Acked-by: Jingoo Han --- drivers/pci/controller/dwc/pcie-designware-host.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-designware-host.c b/drivers/pci/controller/dwc/pcie-designware-host.c index f93252d0da5b..d3156446ff27 100644 --- a/drivers/pci/controller/dwc/pcie-designware-host.c +++ b/drivers/pci/controller/dwc/pcie-designware-host.c @@ -628,6 +628,12 @@ void dw_pcie_setup_rc(struct pcie_port *pp) u32 val, ctrl, num_ctrls; struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + /* + * Enable DBI read-only registers for writing/updating configuration. + * Write permission gets disabled towards the end of this function. + */ + dw_pcie_dbi_ro_wr_en(pci); + dw_pcie_setup(pci); if (!pp->ops->msi_host_init) { @@ -650,12 +656,10 @@ void dw_pcie_setup_rc(struct pcie_port *pp) dw_pcie_writel_dbi(pci, PCI_BASE_ADDRESS_1, 0x00000000); /* Setup interrupt pins */ - dw_pcie_dbi_ro_wr_en(pci); val = dw_pcie_readl_dbi(pci, PCI_INTERRUPT_LINE); val &= 0xffff00ff; val |= 0x00000100; dw_pcie_writel_dbi(pci, PCI_INTERRUPT_LINE, val); - dw_pcie_dbi_ro_wr_dis(pci); /* Setup bus numbers */ val = dw_pcie_readl_dbi(pci, PCI_PRIMARY_BUS); @@ -687,15 +691,13 @@ void dw_pcie_setup_rc(struct pcie_port *pp) dw_pcie_wr_own_conf(pp, PCI_BASE_ADDRESS_0, 4, 0); - /* Enable write permission for the DBI read-only register */ - dw_pcie_dbi_ro_wr_en(pci); /* Program correct class for RC */ dw_pcie_wr_own_conf(pp, PCI_CLASS_DEVICE, 2, PCI_CLASS_BRIDGE_PCI); - /* Better disable write permission right after the update */ - dw_pcie_dbi_ro_wr_dis(pci); dw_pcie_rd_own_conf(pp, PCIE_LINK_WIDTH_SPEED_CONTROL, 4, &val); val |= PORT_LOGIC_SPEED_CHANGE; dw_pcie_wr_own_conf(pp, PCIE_LINK_WIDTH_SPEED_CONTROL, 4, val); + + dw_pcie_dbi_ro_wr_dis(pci); } EXPORT_SYMBOL_GPL(dw_pcie_setup_rc); From 7a6854f6874ff416afa6552706a0011418f07888 Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Tue, 13 Aug 2019 17:06:18 +0530 Subject: [PATCH 092/721] PCI: dwc: Move config space capability search API Move PCIe config space capability search API to common DesignWare file as this can be used by both host and EP mode drivers. Signed-off-by: Vidya Sagar Signed-off-by: Lorenzo Pieralisi Reviewed-by: Thierry Reding Acked-by: Gustavo Pimentel --- .../pci/controller/dwc/pcie-designware-ep.c | 37 +----------------- drivers/pci/controller/dwc/pcie-designware.c | 39 +++++++++++++++++++ drivers/pci/controller/dwc/pcie-designware.h | 2 + 3 files changed, 43 insertions(+), 35 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c b/drivers/pci/controller/dwc/pcie-designware-ep.c index 2bf5a35c0570..65f479250087 100644 --- a/drivers/pci/controller/dwc/pcie-designware-ep.c +++ b/drivers/pci/controller/dwc/pcie-designware-ep.c @@ -40,39 +40,6 @@ void dw_pcie_ep_reset_bar(struct dw_pcie *pci, enum pci_barno bar) __dw_pcie_ep_reset_bar(pci, bar, 0); } -static u8 __dw_pcie_ep_find_next_cap(struct dw_pcie *pci, u8 cap_ptr, - u8 cap) -{ - u8 cap_id, next_cap_ptr; - u16 reg; - - if (!cap_ptr) - return 0; - - reg = dw_pcie_readw_dbi(pci, cap_ptr); - cap_id = (reg & 0x00ff); - - if (cap_id > PCI_CAP_ID_MAX) - return 0; - - if (cap_id == cap) - return cap_ptr; - - next_cap_ptr = (reg & 0xff00) >> 8; - return __dw_pcie_ep_find_next_cap(pci, next_cap_ptr, cap); -} - -static u8 dw_pcie_ep_find_capability(struct dw_pcie *pci, u8 cap) -{ - u8 next_cap_ptr; - u16 reg; - - reg = dw_pcie_readw_dbi(pci, PCI_CAPABILITY_LIST); - next_cap_ptr = (reg & 0x00ff); - - return __dw_pcie_ep_find_next_cap(pci, next_cap_ptr, cap); -} - static int dw_pcie_ep_write_header(struct pci_epc *epc, u8 func_no, struct pci_epf_header *hdr) { @@ -612,9 +579,9 @@ int dw_pcie_ep_init(struct dw_pcie_ep *ep) dev_err(dev, "Failed to reserve memory for MSI/MSI-X\n"); return -ENOMEM; } - ep->msi_cap = dw_pcie_ep_find_capability(pci, PCI_CAP_ID_MSI); + ep->msi_cap = dw_pcie_find_capability(pci, PCI_CAP_ID_MSI); - ep->msix_cap = dw_pcie_ep_find_capability(pci, PCI_CAP_ID_MSIX); + ep->msix_cap = dw_pcie_find_capability(pci, PCI_CAP_ID_MSIX); offset = dw_pcie_ep_find_ext_capability(pci, PCI_EXT_CAP_ID_REBAR); if (offset) { diff --git a/drivers/pci/controller/dwc/pcie-designware.c b/drivers/pci/controller/dwc/pcie-designware.c index 7d25102c304c..7818b4febb08 100644 --- a/drivers/pci/controller/dwc/pcie-designware.c +++ b/drivers/pci/controller/dwc/pcie-designware.c @@ -14,6 +14,45 @@ #include "pcie-designware.h" +/* + * These interfaces resemble the pci_find_*capability() interfaces, but these + * are for configuring host controllers, which are bridges *to* PCI devices but + * are not PCI devices themselves. + */ +static u8 __dw_pcie_find_next_cap(struct dw_pcie *pci, u8 cap_ptr, + u8 cap) +{ + u8 cap_id, next_cap_ptr; + u16 reg; + + if (!cap_ptr) + return 0; + + reg = dw_pcie_readw_dbi(pci, cap_ptr); + cap_id = (reg & 0x00ff); + + if (cap_id > PCI_CAP_ID_MAX) + return 0; + + if (cap_id == cap) + return cap_ptr; + + next_cap_ptr = (reg & 0xff00) >> 8; + return __dw_pcie_find_next_cap(pci, next_cap_ptr, cap); +} + +u8 dw_pcie_find_capability(struct dw_pcie *pci, u8 cap) +{ + u8 next_cap_ptr; + u16 reg; + + reg = dw_pcie_readw_dbi(pci, PCI_CAPABILITY_LIST); + next_cap_ptr = (reg & 0x00ff); + + return __dw_pcie_find_next_cap(pci, next_cap_ptr, cap); +} +EXPORT_SYMBOL_GPL(dw_pcie_find_capability); + int dw_pcie_read(void __iomem *addr, int size, u32 *val) { if (!IS_ALIGNED((uintptr_t)addr, size)) { diff --git a/drivers/pci/controller/dwc/pcie-designware.h b/drivers/pci/controller/dwc/pcie-designware.h index ffed084a0b4f..d8c66a6827dc 100644 --- a/drivers/pci/controller/dwc/pcie-designware.h +++ b/drivers/pci/controller/dwc/pcie-designware.h @@ -251,6 +251,8 @@ struct dw_pcie { #define to_dw_pcie_from_ep(endpoint) \ container_of((endpoint), struct dw_pcie, ep) +u8 dw_pcie_find_capability(struct dw_pcie *pci, u8 cap); + int dw_pcie_read(void __iomem *addr, int size, u32 *val); int dw_pcie_write(void __iomem *addr, int size, u32 val); From 5b0841fa653f6c156500ecfe7e996837884363c4 Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Tue, 13 Aug 2019 17:06:19 +0530 Subject: [PATCH 093/721] PCI: dwc: Add extended configuration space capability search API Add extended configuration space capability search API using struct dw_pcie* pointer. Signed-off-by: Vidya Sagar Signed-off-by: Lorenzo Pieralisi Acked-by: Gustavo Pimentel Acked-by: Thierry Reding --- drivers/pci/controller/dwc/pcie-designware.c | 41 ++++++++++++++++++++ drivers/pci/controller/dwc/pcie-designware.h | 1 + 2 files changed, 42 insertions(+) diff --git a/drivers/pci/controller/dwc/pcie-designware.c b/drivers/pci/controller/dwc/pcie-designware.c index 7818b4febb08..181449e342f1 100644 --- a/drivers/pci/controller/dwc/pcie-designware.c +++ b/drivers/pci/controller/dwc/pcie-designware.c @@ -53,6 +53,47 @@ u8 dw_pcie_find_capability(struct dw_pcie *pci, u8 cap) } EXPORT_SYMBOL_GPL(dw_pcie_find_capability); +static u16 dw_pcie_find_next_ext_capability(struct dw_pcie *pci, u16 start, + u8 cap) +{ + u32 header; + int ttl; + int pos = PCI_CFG_SPACE_SIZE; + + /* minimum 8 bytes per capability */ + ttl = (PCI_CFG_SPACE_EXP_SIZE - PCI_CFG_SPACE_SIZE) / 8; + + if (start) + pos = start; + + header = dw_pcie_readl_dbi(pci, pos); + /* + * If we have no capabilities, this is indicated by cap ID, + * cap version and next pointer all being 0. + */ + if (header == 0) + return 0; + + while (ttl-- > 0) { + if (PCI_EXT_CAP_ID(header) == cap && pos != start) + return pos; + + pos = PCI_EXT_CAP_NEXT(header); + if (pos < PCI_CFG_SPACE_SIZE) + break; + + header = dw_pcie_readl_dbi(pci, pos); + } + + return 0; +} + +u16 dw_pcie_find_ext_capability(struct dw_pcie *pci, u8 cap) +{ + return dw_pcie_find_next_ext_capability(pci, 0, cap); +} +EXPORT_SYMBOL_GPL(dw_pcie_find_ext_capability); + int dw_pcie_read(void __iomem *addr, int size, u32 *val) { if (!IS_ALIGNED((uintptr_t)addr, size)) { diff --git a/drivers/pci/controller/dwc/pcie-designware.h b/drivers/pci/controller/dwc/pcie-designware.h index d8c66a6827dc..11c223471416 100644 --- a/drivers/pci/controller/dwc/pcie-designware.h +++ b/drivers/pci/controller/dwc/pcie-designware.h @@ -252,6 +252,7 @@ struct dw_pcie { container_of((endpoint), struct dw_pcie, ep) u8 dw_pcie_find_capability(struct dw_pcie *pci, u8 cap); +u16 dw_pcie_find_ext_capability(struct dw_pcie *pci, u8 cap); int dw_pcie_read(void __iomem *addr, int size, u32 *val); int dw_pcie_write(void __iomem *addr, int size, u32 val); From feee519ae55c8859a50eca0da83b1ef4ea45f65d Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Tue, 13 Aug 2019 17:06:20 +0530 Subject: [PATCH 094/721] PCI: dwc: Export dw_pcie_wait_for_link() API Export the dw_pcie_wait_for_link() function to be able to build drivers using it as loadable modules. Signed-off-by: Vidya Sagar Signed-off-by: Lorenzo Pieralisi --- drivers/pci/controller/dwc/pcie-designware.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pci/controller/dwc/pcie-designware.c b/drivers/pci/controller/dwc/pcie-designware.c index 181449e342f1..1d87e823de21 100644 --- a/drivers/pci/controller/dwc/pcie-designware.c +++ b/drivers/pci/controller/dwc/pcie-designware.c @@ -460,6 +460,7 @@ int dw_pcie_wait_for_link(struct dw_pcie *pci) return -ETIMEDOUT; } +EXPORT_SYMBOL_GPL(dw_pcie_wait_for_link); int dw_pcie_link_up(struct dw_pcie *pci) { From f4e84a5d9db7dfea96832157bc461177e87a47dd Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Tue, 13 Aug 2019 17:06:21 +0530 Subject: [PATCH 095/721] dt-bindings: PCI: designware: Add binding for CDM register check Add support to enable CDM (Configuration Dependent Module) registers check for any data corruption. CDM registers include standard PCIe configuration space registers, Port Logic registers and iATU and DMA registers. Refer Section S.4 of Synopsys DesignWare Cores PCI Express Controller Databook Version 4.90a. Signed-off-by: Vidya Sagar Signed-off-by: Lorenzo Pieralisi Reviewed-by: Thierry Reding Reviewed-by: Rob Herring --- Documentation/devicetree/bindings/pci/designware-pcie.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Documentation/devicetree/bindings/pci/designware-pcie.txt b/Documentation/devicetree/bindings/pci/designware-pcie.txt index 5561a1c060d0..3fba04da6a59 100644 --- a/Documentation/devicetree/bindings/pci/designware-pcie.txt +++ b/Documentation/devicetree/bindings/pci/designware-pcie.txt @@ -34,6 +34,11 @@ Optional properties: - clock-names: Must include the following entries: - "pcie" - "pcie_bus" +- snps,enable-cdm-check: This is a boolean property and if present enables + automatic checking of CDM (Configuration Dependent Module) registers + for data corruption. CDM registers include standard PCIe configuration + space registers, Port Logic registers, DMA and iATU (internal Address + Translation Unit) registers. RC mode: - num-viewport: number of view ports configured in hardware. If a platform does not specify it, the driver assumes 2. From 07f123def73e07e398537be7a5b43a244cb0cbf3 Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Tue, 13 Aug 2019 17:06:22 +0530 Subject: [PATCH 096/721] PCI: dwc: Add support to enable CDM register check Add support to enable CDM (Configuration Dependent Module) register check for any data corruption based on the DT property 'snps,enable-cdm-check'. Signed-off-by: Vidya Sagar Signed-off-by: Lorenzo Pieralisi Reviewed-by: Thierry Reding Acked-by: Gustavo Pimentel --- drivers/pci/controller/dwc/pcie-designware.c | 7 +++++++ drivers/pci/controller/dwc/pcie-designware.h | 9 +++++++++ 2 files changed, 16 insertions(+) diff --git a/drivers/pci/controller/dwc/pcie-designware.c b/drivers/pci/controller/dwc/pcie-designware.c index 1d87e823de21..59eaeeb21dbe 100644 --- a/drivers/pci/controller/dwc/pcie-designware.c +++ b/drivers/pci/controller/dwc/pcie-designware.c @@ -547,4 +547,11 @@ void dw_pcie_setup(struct dw_pcie *pci) break; } dw_pcie_writel_dbi(pci, PCIE_LINK_WIDTH_SPEED_CONTROL, val); + + if (of_property_read_bool(np, "snps,enable-cdm-check")) { + val = dw_pcie_readl_dbi(pci, PCIE_PL_CHK_REG_CONTROL_STATUS); + val |= PCIE_PL_CHK_REG_CHK_REG_CONTINUOUS | + PCIE_PL_CHK_REG_CHK_REG_START; + dw_pcie_writel_dbi(pci, PCIE_PL_CHK_REG_CONTROL_STATUS, val); + } } diff --git a/drivers/pci/controller/dwc/pcie-designware.h b/drivers/pci/controller/dwc/pcie-designware.h index 11c223471416..5a18e94e52c8 100644 --- a/drivers/pci/controller/dwc/pcie-designware.h +++ b/drivers/pci/controller/dwc/pcie-designware.h @@ -86,6 +86,15 @@ #define PCIE_MISC_CONTROL_1_OFF 0x8BC #define PCIE_DBI_RO_WR_EN BIT(0) +#define PCIE_PL_CHK_REG_CONTROL_STATUS 0xB20 +#define PCIE_PL_CHK_REG_CHK_REG_START BIT(0) +#define PCIE_PL_CHK_REG_CHK_REG_CONTINUOUS BIT(1) +#define PCIE_PL_CHK_REG_CHK_REG_COMPARISON_ERROR BIT(16) +#define PCIE_PL_CHK_REG_CHK_REG_LOGIC_ERROR BIT(17) +#define PCIE_PL_CHK_REG_CHK_REG_COMPLETE BIT(18) + +#define PCIE_PL_CHK_REG_ERR_ADDR 0xB28 + /* * iATU Unroll-specific register definitions * From 4.80 core version the address translation will be made by unroll From 7b31a72e7195aebd620d504f28b97ad8398e95ec Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Tue, 13 Aug 2019 17:06:23 +0530 Subject: [PATCH 097/721] dt-bindings: Add PCIe supports-clkreq property Some host controllers need to know the existence of clkreq signal routing to downstream devices to be able to advertise low power features like ASPM L1 substates. Without clkreq signal routing being present, enabling ASPM L1 substates might lead to downstream devices being disconnected from the bus. Hence a new device tree property 'supports-clkreq' is added to make such host controllers aware of clkreq signal routing to downstream devices. Signed-off-by: Vidya Sagar Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring Reviewed-by: Thierry Reding --- Documentation/devicetree/bindings/pci/pci.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Documentation/devicetree/bindings/pci/pci.txt b/Documentation/devicetree/bindings/pci/pci.txt index 2a5d91024059..29bcbd88f457 100644 --- a/Documentation/devicetree/bindings/pci/pci.txt +++ b/Documentation/devicetree/bindings/pci/pci.txt @@ -27,6 +27,11 @@ driver implementation may support the following properties: - reset-gpios: If present this property specifies PERST# GPIO. Host drivers can parse the GPIO and apply fundamental reset to endpoints. +- supports-clkreq: + If present this property specifies that CLKREQ signal routing exists from + root port to downstream device and host bridge drivers can do programming + which depends on CLKREQ signal existence. For example, programming root port + not to advertise ASPM L1 Sub-States support if there is no CLKREQ signal. PCI-PCI Bridge properties ------------------------- From 42c253ecaa53c0a2fee431018dd18d2fa797580f Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Tue, 13 Aug 2019 17:06:24 +0530 Subject: [PATCH 098/721] dt-bindings: PCI: tegra: Add device tree support for Tegra194 Add support for Tegra194 PCIe controllers. These controllers are based on Synopsys DesignWare core IP. Signed-off-by: Vidya Sagar Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring Acked-by: Thierry Reding --- .../bindings/pci/nvidia,tegra194-pcie.txt | 155 ++++++++++++++++++ 1 file changed, 155 insertions(+) create mode 100644 Documentation/devicetree/bindings/pci/nvidia,tegra194-pcie.txt diff --git a/Documentation/devicetree/bindings/pci/nvidia,tegra194-pcie.txt b/Documentation/devicetree/bindings/pci/nvidia,tegra194-pcie.txt new file mode 100644 index 000000000000..674e5adb2895 --- /dev/null +++ b/Documentation/devicetree/bindings/pci/nvidia,tegra194-pcie.txt @@ -0,0 +1,155 @@ +NVIDIA Tegra PCIe controller (Synopsys DesignWare Core based) + +This PCIe host controller is based on the Synopsis Designware PCIe IP +and thus inherits all the common properties defined in designware-pcie.txt. + +Required properties: +- compatible: For Tegra19x, must contain "nvidia,tegra194-pcie". +- device_type: Must be "pci" +- power-domains: A phandle to the node that controls power to the respective + PCIe controller and a specifier name for the PCIe controller. Following are + the specifiers for the different PCIe controllers + TEGRA194_POWER_DOMAIN_PCIEX8B: C0 + TEGRA194_POWER_DOMAIN_PCIEX1A: C1 + TEGRA194_POWER_DOMAIN_PCIEX1A: C2 + TEGRA194_POWER_DOMAIN_PCIEX1A: C3 + TEGRA194_POWER_DOMAIN_PCIEX4A: C4 + TEGRA194_POWER_DOMAIN_PCIEX8A: C5 + these specifiers are defined in + "include/dt-bindings/power/tegra194-powergate.h" file. +- reg: A list of physical base address and length pairs for each set of + controller registers. Must contain an entry for each entry in the reg-names + property. +- reg-names: Must include the following entries: + "appl": Controller's application logic registers + "config": As per the definition in designware-pcie.txt + "atu_dma": iATU and DMA registers. This is where the iATU (internal Address + Translation Unit) registers of the PCIe core are made available + for SW access. + "dbi": The aperture where root port's own configuration registers are + available +- interrupts: A list of interrupt outputs of the controller. Must contain an + entry for each entry in the interrupt-names property. +- interrupt-names: Must include the following entries: + "intr": The Tegra interrupt that is asserted for controller interrupts + "msi": The Tegra interrupt that is asserted when an MSI is received +- bus-range: Range of bus numbers associated with this controller +- #address-cells: Address representation for root ports (must be 3) + - cell 0 specifies the bus and device numbers of the root port: + [23:16]: bus number + [15:11]: device number + - cell 1 denotes the upper 32 address bits and should be 0 + - cell 2 contains the lower 32 address bits and is used to translate to the + CPU address space +- #size-cells: Size representation for root ports (must be 2) +- ranges: Describes the translation of addresses for root ports and standard + PCI regions. The entries must be 7 cells each, where the first three cells + correspond to the address as described for the #address-cells property + above, the fourth and fifth cells are for the physical CPU address to + translate to and the sixth and seventh cells are as described for the + #size-cells property above. + - Entries setup the mapping for the standard I/O, memory and + prefetchable PCI regions. The first cell determines the type of region + that is setup: + - 0x81000000: I/O memory region + - 0x82000000: non-prefetchable memory region + - 0xc2000000: prefetchable memory region + Please refer to the standard PCI bus binding document for a more detailed + explanation. +- #interrupt-cells: Size representation for interrupts (must be 1) +- interrupt-map-mask and interrupt-map: Standard PCI IRQ mapping properties + Please refer to the standard PCI bus binding document for a more detailed + explanation. +- clocks: Must contain an entry for each entry in clock-names. + See ../clocks/clock-bindings.txt for details. +- clock-names: Must include the following entries: + - core +- resets: Must contain an entry for each entry in reset-names. + See ../reset/reset.txt for details. +- reset-names: Must include the following entries: + - apb + - core +- phys: Must contain a phandle to P2U PHY for each entry in phy-names. +- phy-names: Must include an entry for each active lane. + "p2u-N": where N ranges from 0 to one less than the total number of lanes +- nvidia,bpmp: Must contain a pair of phandle to BPMP controller node followed + by controller-id. Following are the controller ids for each controller. + 0: C0 + 1: C1 + 2: C2 + 3: C3 + 4: C4 + 5: C5 +- vddio-pex-ctl-supply: Regulator supply for PCIe side band signals + +Optional properties: +- supports-clkreq: Refer to Documentation/devicetree/bindings/pci/pci.txt +- nvidia,update-fc-fixup: This is a boolean property and needs to be present to + improve performance when a platform is designed in such a way that it + satisfies at least one of the following conditions thereby enabling root + port to exchange optimum number of FC (Flow Control) credits with + downstream devices + 1. If C0/C4/C5 run at x1/x2 link widths (irrespective of speed and MPS) + 2. If C0/C1/C2/C3/C4/C5 operate at their respective max link widths and + a) speed is Gen-2 and MPS is 256B + b) speed is >= Gen-3 with any MPS +- nvidia,aspm-cmrt-us: Common Mode Restore Time for proper operation of ASPM + to be specified in microseconds +- nvidia,aspm-pwr-on-t-us: Power On time for proper operation of ASPM to be + specified in microseconds +- nvidia,aspm-l0s-entrance-latency-us: ASPM L0s entrance latency to be + specified in microseconds + +Examples: +========= + +Tegra194: +-------- + + pcie@14180000 { + compatible = "nvidia,tegra194-pcie", "snps,dw-pcie"; + power-domains = <&bpmp TEGRA194_POWER_DOMAIN_PCIEX8B>; + reg = <0x00 0x14180000 0x0 0x00020000 /* appl registers (128K) */ + 0x00 0x38000000 0x0 0x00040000 /* configuration space (256K) */ + 0x00 0x38040000 0x0 0x00040000>; /* iATU_DMA reg space (256K) */ + reg-names = "appl", "config", "atu_dma"; + + #address-cells = <3>; + #size-cells = <2>; + device_type = "pci"; + num-lanes = <8>; + linux,pci-domain = <0>; + + clocks = <&bpmp TEGRA194_CLK_PEX0_CORE_0>; + clock-names = "core"; + + resets = <&bpmp TEGRA194_RESET_PEX0_CORE_0_APB>, + <&bpmp TEGRA194_RESET_PEX0_CORE_0>; + reset-names = "apb", "core"; + + interrupts = , /* controller interrupt */ + ; /* MSI interrupt */ + interrupt-names = "intr", "msi"; + + #interrupt-cells = <1>; + interrupt-map-mask = <0 0 0 0>; + interrupt-map = <0 0 0 0 &gic GIC_SPI 72 IRQ_TYPE_LEVEL_HIGH>; + + nvidia,bpmp = <&bpmp 0>; + + supports-clkreq; + nvidia,aspm-cmrt-us = <60>; + nvidia,aspm-pwr-on-t-us = <20>; + nvidia,aspm-l0s-entrance-latency-us = <3>; + + bus-range = <0x0 0xff>; + ranges = <0x81000000 0x0 0x38100000 0x0 0x38100000 0x0 0x00100000 /* downstream I/O (1MB) */ + 0x82000000 0x0 0x38200000 0x0 0x38200000 0x0 0x01E00000 /* non-prefetchable memory (30MB) */ + 0xc2000000 0x18 0x00000000 0x18 0x00000000 0x4 0x00000000>; /* prefetchable memory (16GB) */ + + vddio-pex-ctl-supply = <&vdd_1v8ao>; + + phys = <&p2u_hsio_2>, <&p2u_hsio_3>, <&p2u_hsio_4>, + <&p2u_hsio_5>; + phy-names = "p2u-0", "p2u-1", "p2u-2", "p2u-3"; + }; From 813c6fbcb7756e96f96c5931d8830970ff50e3b0 Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Tue, 13 Aug 2019 17:06:25 +0530 Subject: [PATCH 099/721] dt-bindings: PHY: P2U: Add Tegra194 P2U block Add support for Tegra194 P2U (PIPE to UPHY) module block which is a glue module instantiated once for each PCIe lane between Synopsys DesignWare core based PCIe IP and Universal PHY block. Signed-off-by: Vidya Sagar Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring Acked-by: Thierry Reding Acked-by: Kishon Vijay Abraham I --- .../bindings/phy/phy-tegra194-p2u.txt | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 Documentation/devicetree/bindings/phy/phy-tegra194-p2u.txt diff --git a/Documentation/devicetree/bindings/phy/phy-tegra194-p2u.txt b/Documentation/devicetree/bindings/phy/phy-tegra194-p2u.txt new file mode 100644 index 000000000000..d23ff90baad5 --- /dev/null +++ b/Documentation/devicetree/bindings/phy/phy-tegra194-p2u.txt @@ -0,0 +1,28 @@ +NVIDIA Tegra194 P2U binding + +Tegra194 has two PHY bricks namely HSIO (High Speed IO) and NVHS (NVIDIA High +Speed) each interfacing with 12 and 8 P2U instances respectively. +A P2U instance is a glue logic between Synopsys DesignWare Core PCIe IP's PIPE +interface and PHY of HSIO/NVHS bricks. Each P2U instance represents one PCIe +lane. + +Required properties: +- compatible: For Tegra19x, must contain "nvidia,tegra194-p2u". +- reg: Should be the physical address space and length of respective each P2U + instance. +- reg-names: Must include the entry "ctl". + +Required properties for PHY port node: +- #phy-cells: Defined by generic PHY bindings. Must be 0. + +Refer to phy/phy-bindings.txt for the generic PHY binding properties. + +Example: + +p2u_hsio_0: phy@3e10000 { + compatible = "nvidia,tegra194-p2u"; + reg = <0x03e10000 0x10000>; + reg-names = "ctl"; + + #phy-cells = <0>; +}; From 5dae15b21d36a2e6d7e92c6af39c33dea5c39cc3 Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Tue, 13 Aug 2019 17:06:26 +0530 Subject: [PATCH 100/721] phy: tegra: Add PCIe PIPE2UPHY support Synopsys DesignWare core based PCIe controllers in Tegra 194 SoC interface with Universal PHY (UPHY) module through a PIPE2UPHY (P2U) module. For each PCIe lane of a controller, there is a P2U unit instantiated at hardware level. This driver provides support for the programming required for each P2U that is going to be used for a PCIe controller. Signed-off-by: Vidya Sagar Signed-off-by: Lorenzo Pieralisi Acked-by: Kishon Vijay Abraham I Acked-by: Thierry Reding --- drivers/phy/tegra/Kconfig | 7 ++ drivers/phy/tegra/Makefile | 1 + drivers/phy/tegra/phy-tegra194-p2u.c | 120 +++++++++++++++++++++++++++ 3 files changed, 128 insertions(+) create mode 100644 drivers/phy/tegra/phy-tegra194-p2u.c diff --git a/drivers/phy/tegra/Kconfig b/drivers/phy/tegra/Kconfig index e516967d695b..f9817c3ae85f 100644 --- a/drivers/phy/tegra/Kconfig +++ b/drivers/phy/tegra/Kconfig @@ -7,3 +7,10 @@ config PHY_TEGRA_XUSB To compile this driver as a module, choose M here: the module will be called phy-tegra-xusb. + +config PHY_TEGRA194_P2U + tristate "NVIDIA Tegra194 PIPE2UPHY PHY driver" + depends on ARCH_TEGRA_194_SOC || COMPILE_TEST + select GENERIC_PHY + help + Enable this to support the P2U (PIPE to UPHY) that is part of Tegra 19x SOCs. diff --git a/drivers/phy/tegra/Makefile b/drivers/phy/tegra/Makefile index 64ccaeacb631..320dd389f34d 100644 --- a/drivers/phy/tegra/Makefile +++ b/drivers/phy/tegra/Makefile @@ -6,3 +6,4 @@ phy-tegra-xusb-$(CONFIG_ARCH_TEGRA_124_SOC) += xusb-tegra124.o phy-tegra-xusb-$(CONFIG_ARCH_TEGRA_132_SOC) += xusb-tegra124.o phy-tegra-xusb-$(CONFIG_ARCH_TEGRA_210_SOC) += xusb-tegra210.o phy-tegra-xusb-$(CONFIG_ARCH_TEGRA_186_SOC) += xusb-tegra186.o +obj-$(CONFIG_PHY_TEGRA194_P2U) += phy-tegra194-p2u.o diff --git a/drivers/phy/tegra/phy-tegra194-p2u.c b/drivers/phy/tegra/phy-tegra194-p2u.c new file mode 100644 index 000000000000..7042bed9feaa --- /dev/null +++ b/drivers/phy/tegra/phy-tegra194-p2u.c @@ -0,0 +1,120 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * P2U (PIPE to UPHY) driver for Tegra T194 SoC + * + * Copyright (C) 2019 NVIDIA Corporation. + * + * Author: Vidya Sagar + */ + +#include +#include +#include +#include +#include +#include + +#define P2U_PERIODIC_EQ_CTRL_GEN3 0xc0 +#define P2U_PERIODIC_EQ_CTRL_GEN3_PERIODIC_EQ_EN BIT(0) +#define P2U_PERIODIC_EQ_CTRL_GEN3_INIT_PRESET_EQ_TRAIN_EN BIT(1) +#define P2U_PERIODIC_EQ_CTRL_GEN4 0xc4 +#define P2U_PERIODIC_EQ_CTRL_GEN4_INIT_PRESET_EQ_TRAIN_EN BIT(1) + +#define P2U_RX_DEBOUNCE_TIME 0xa4 +#define P2U_RX_DEBOUNCE_TIME_DEBOUNCE_TIMER_MASK 0xffff +#define P2U_RX_DEBOUNCE_TIME_DEBOUNCE_TIMER_VAL 160 + +struct tegra_p2u { + void __iomem *base; +}; + +static inline void p2u_writel(struct tegra_p2u *phy, const u32 value, + const u32 reg) +{ + writel_relaxed(value, phy->base + reg); +} + +static inline u32 p2u_readl(struct tegra_p2u *phy, const u32 reg) +{ + return readl_relaxed(phy->base + reg); +} + +static int tegra_p2u_power_on(struct phy *x) +{ + struct tegra_p2u *phy = phy_get_drvdata(x); + u32 val; + + val = p2u_readl(phy, P2U_PERIODIC_EQ_CTRL_GEN3); + val &= ~P2U_PERIODIC_EQ_CTRL_GEN3_PERIODIC_EQ_EN; + val |= P2U_PERIODIC_EQ_CTRL_GEN3_INIT_PRESET_EQ_TRAIN_EN; + p2u_writel(phy, val, P2U_PERIODIC_EQ_CTRL_GEN3); + + val = p2u_readl(phy, P2U_PERIODIC_EQ_CTRL_GEN4); + val |= P2U_PERIODIC_EQ_CTRL_GEN4_INIT_PRESET_EQ_TRAIN_EN; + p2u_writel(phy, val, P2U_PERIODIC_EQ_CTRL_GEN4); + + val = p2u_readl(phy, P2U_RX_DEBOUNCE_TIME); + val &= ~P2U_RX_DEBOUNCE_TIME_DEBOUNCE_TIMER_MASK; + val |= P2U_RX_DEBOUNCE_TIME_DEBOUNCE_TIMER_VAL; + p2u_writel(phy, val, P2U_RX_DEBOUNCE_TIME); + + return 0; +} + +static const struct phy_ops ops = { + .power_on = tegra_p2u_power_on, + .owner = THIS_MODULE, +}; + +static int tegra_p2u_probe(struct platform_device *pdev) +{ + struct phy_provider *phy_provider; + struct device *dev = &pdev->dev; + struct phy *generic_phy; + struct tegra_p2u *phy; + struct resource *res; + + phy = devm_kzalloc(dev, sizeof(*phy), GFP_KERNEL); + if (!phy) + return -ENOMEM; + + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "ctl"); + phy->base = devm_ioremap_resource(dev, res); + if (IS_ERR(phy->base)) + return PTR_ERR(phy->base); + + platform_set_drvdata(pdev, phy); + + generic_phy = devm_phy_create(dev, NULL, &ops); + if (IS_ERR(generic_phy)) + return PTR_ERR(generic_phy); + + phy_set_drvdata(generic_phy, phy); + + phy_provider = devm_of_phy_provider_register(dev, of_phy_simple_xlate); + if (IS_ERR(phy_provider)) + return PTR_ERR(phy_provider); + + return 0; +} + +static const struct of_device_id tegra_p2u_id_table[] = { + { + .compatible = "nvidia,tegra194-p2u", + }, + {} +}; +MODULE_DEVICE_TABLE(of, tegra_p2u_id_table); + +static struct platform_driver tegra_p2u_driver = { + .probe = tegra_p2u_probe, + .driver = { + .name = "tegra194-p2u", + .of_match_table = tegra_p2u_id_table, + }, +}; +module_platform_driver(tegra_p2u_driver); + +MODULE_AUTHOR("Vidya Sagar "); +MODULE_DESCRIPTION("NVIDIA Tegra194 PIPE2UPHY PHY driver"); +MODULE_LICENSE("GPL v2"); From 40667d1bd822f7a9d961bac190cd36505de0de4e Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Tue, 13 Aug 2019 08:58:41 +0300 Subject: [PATCH 101/721] MAINTAINERS: altera-sysmgr: Fix typo in a filepath Fix typo (s/sysgmr/sysmgr/) in the header filepath. Fixes: f36e789a1f8d ("mfd: altera-sysmgr: Add SOCFPGA System Manager") Signed-off-by: Denis Efremov Signed-off-by: Lee Jones --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 783569e3c4b4..99ac7a9921f7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -723,7 +723,7 @@ ALTERA SYSTEM MANAGER DRIVER M: Thor Thayer S: Maintained F: drivers/mfd/altera-sysmgr.c -F: include/linux/mfd/altera-sysgmr.h +F: include/linux/mfd/altera-sysmgr.h ALTERA SYSTEM RESOURCE DRIVER FOR ARRIA10 DEVKIT M: Thor Thayer From 5cd690a308e86bcb10b8a0d6d42f153e82f695f6 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 14 Aug 2019 09:24:03 +0200 Subject: [PATCH 102/721] mfd: asic3: Include the right header This is a GPIO driver, use the appropriate header rather than the legacy header. Signed-off-by: Linus Walleij Signed-off-by: Lee Jones --- drivers/mfd/asic3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mfd/asic3.c b/drivers/mfd/asic3.c index 83b18c998d6f..a6bd2134cea2 100644 --- a/drivers/mfd/asic3.c +++ b/drivers/mfd/asic3.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include #include From fd5d16531a39322c3d7433d9f8a36203c9aaeddc Mon Sep 17 00:00:00 2001 From: Xiaowei Bao Date: Wed, 14 Aug 2019 10:03:29 +0800 Subject: [PATCH 103/721] PCI: layerscape: Add the bar_fixed_64bit property to the endpoint driver The layerscape PCIe controller have 4 BARs. BAR0 and BAR1 are 32bit, BAR2 and BAR4 are 64bit and that's a fixed hardware configuration. Set the bar_fixed_64bit variable accordingly. Signed-off-by: Xiaowei Bao [lorenzo.pieralisi@arm.com: commit log] Signed-off-by: Lorenzo Pieralisi Acked-by: Kishon Vijay Abraham I --- drivers/pci/controller/dwc/pci-layerscape-ep.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pci/controller/dwc/pci-layerscape-ep.c b/drivers/pci/controller/dwc/pci-layerscape-ep.c index be61d96cc95e..ca9aa4501e7e 100644 --- a/drivers/pci/controller/dwc/pci-layerscape-ep.c +++ b/drivers/pci/controller/dwc/pci-layerscape-ep.c @@ -44,6 +44,7 @@ static const struct pci_epc_features ls_pcie_epc_features = { .linkup_notifier = false, .msi_capable = true, .msix_capable = false, + .bar_fixed_64bit = (1 << BAR_2) | (1 << BAR_4), }; static const struct pci_epc_features* From b5b24617987f522abb5c42851ae505557f2c84f8 Mon Sep 17 00:00:00 2001 From: Xiaowei Bao Date: Wed, 14 Aug 2019 10:03:30 +0800 Subject: [PATCH 104/721] PCI: layerscape: Add CONFIG_PCI_LAYERSCAPE_EP to build EP/RC separately Add CONFIG_PCI_LAYERSCAPE_EP so that endpoint and host controller drivers can be built separately. Signed-off-by: Xiaowei Bao Signed-off-by: Lorenzo Pieralisi --- drivers/pci/controller/dwc/Kconfig | 20 ++++++++++++++++++-- drivers/pci/controller/dwc/Makefile | 3 ++- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/drivers/pci/controller/dwc/Kconfig b/drivers/pci/controller/dwc/Kconfig index 6ea778ae4877..869c645dd135 100644 --- a/drivers/pci/controller/dwc/Kconfig +++ b/drivers/pci/controller/dwc/Kconfig @@ -131,13 +131,29 @@ config PCI_KEYSTONE_EP DesignWare core functions to implement the driver. config PCI_LAYERSCAPE - bool "Freescale Layerscape PCIe controller" + bool "Freescale Layerscape PCIe controller - Host mode" depends on OF && (ARM || ARCH_LAYERSCAPE || COMPILE_TEST) depends on PCI_MSI_IRQ_DOMAIN select MFD_SYSCON select PCIE_DW_HOST help - Say Y here if you want PCIe controller support on Layerscape SoCs. + Say Y here if you want to enable PCIe controller support on Layerscape + SoCs to work in Host mode. + This controller can work either as EP or RC. The RCW[HOST_AGT_PEX] + determines which PCIe controller works in EP mode and which PCIe + controller works in RC mode. + +config PCI_LAYERSCAPE_EP + bool "Freescale Layerscape PCIe controller - Endpoint mode" + depends on OF && (ARM || ARCH_LAYERSCAPE || COMPILE_TEST) + depends on PCI_ENDPOINT + select PCIE_DW_EP + help + Say Y here if you want to enable PCIe controller support on Layerscape + SoCs to work in Endpoint mode. + This controller can work either as EP or RC. The RCW[HOST_AGT_PEX] + determines which PCIe controller works in EP mode and which PCIe + controller works in RC mode. config PCI_HISI depends on OF && (ARM64 || COMPILE_TEST) diff --git a/drivers/pci/controller/dwc/Makefile b/drivers/pci/controller/dwc/Makefile index b085dfd4fab7..824fde7ae750 100644 --- a/drivers/pci/controller/dwc/Makefile +++ b/drivers/pci/controller/dwc/Makefile @@ -8,7 +8,8 @@ obj-$(CONFIG_PCI_EXYNOS) += pci-exynos.o obj-$(CONFIG_PCI_IMX6) += pci-imx6.o obj-$(CONFIG_PCIE_SPEAR13XX) += pcie-spear13xx.o obj-$(CONFIG_PCI_KEYSTONE) += pci-keystone.o -obj-$(CONFIG_PCI_LAYERSCAPE) += pci-layerscape.o pci-layerscape-ep.o +obj-$(CONFIG_PCI_LAYERSCAPE) += pci-layerscape.o +obj-$(CONFIG_PCI_LAYERSCAPE_EP) += pci-layerscape-ep.o obj-$(CONFIG_PCIE_QCOM) += pcie-qcom.o obj-$(CONFIG_PCIE_ARMADA_8K) += pcie-armada8k.o obj-$(CONFIG_PCIE_ARTPEC6) += pcie-artpec6.o From af80559b4d9cd481c63505edbe425ff6bad4ab8d Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Fri, 9 Aug 2019 17:40:47 +0200 Subject: [PATCH 105/721] i2c: replace i2c_new_secondary_device with an ERR_PTR variant In the general move to have i2c_new_*_device functions which return ERR_PTR instead of NULL, this patch converts i2c_new_secondary_device(). There are only few users, so this patch converts the I2C core and all users in one go. The function gets renamed to i2c_new_ancillary_device() so out-of-tree users will get a build failure to understand they need to adapt their error checking code. Signed-off-by: Wolfram Sang Reviewed-by: Kieran Bingham # adv748x Reviewed-by: Laurent Pinchart # adv7511 + adv7604 Reviewed-by: Hans Verkuil # adv7604 Signed-off-by: Wolfram Sang --- drivers/gpu/drm/bridge/adv7511/adv7511_drv.c | 18 ++++++++-------- drivers/i2c/i2c-core-base.c | 10 ++++----- drivers/media/i2c/adv748x/adv748x-core.c | 6 +++--- drivers/media/i2c/adv7604.c | 22 +++++++++++--------- include/linux/i2c.h | 2 +- 5 files changed, 30 insertions(+), 28 deletions(-) diff --git a/drivers/gpu/drm/bridge/adv7511/adv7511_drv.c b/drivers/gpu/drm/bridge/adv7511/adv7511_drv.c index f6d2681f6927..9e13e466e72c 100644 --- a/drivers/gpu/drm/bridge/adv7511/adv7511_drv.c +++ b/drivers/gpu/drm/bridge/adv7511/adv7511_drv.c @@ -981,10 +981,10 @@ static int adv7511_init_cec_regmap(struct adv7511 *adv) { int ret; - adv->i2c_cec = i2c_new_secondary_device(adv->i2c_main, "cec", + adv->i2c_cec = i2c_new_ancillary_device(adv->i2c_main, "cec", ADV7511_CEC_I2C_ADDR_DEFAULT); - if (!adv->i2c_cec) - return -EINVAL; + if (IS_ERR(adv->i2c_cec)) + return PTR_ERR(adv->i2c_cec); i2c_set_clientdata(adv->i2c_cec, adv); adv->regmap_cec = devm_regmap_init_i2c(adv->i2c_cec, @@ -1165,20 +1165,20 @@ static int adv7511_probe(struct i2c_client *i2c, const struct i2c_device_id *id) adv7511_packet_disable(adv7511, 0xffff); - adv7511->i2c_edid = i2c_new_secondary_device(i2c, "edid", + adv7511->i2c_edid = i2c_new_ancillary_device(i2c, "edid", ADV7511_EDID_I2C_ADDR_DEFAULT); - if (!adv7511->i2c_edid) { - ret = -EINVAL; + if (IS_ERR(adv7511->i2c_edid)) { + ret = PTR_ERR(adv7511->i2c_edid); goto uninit_regulators; } regmap_write(adv7511->regmap, ADV7511_REG_EDID_I2C_ADDR, adv7511->i2c_edid->addr << 1); - adv7511->i2c_packet = i2c_new_secondary_device(i2c, "packet", + adv7511->i2c_packet = i2c_new_ancillary_device(i2c, "packet", ADV7511_PACKET_I2C_ADDR_DEFAULT); - if (!adv7511->i2c_packet) { - ret = -EINVAL; + if (IS_ERR(adv7511->i2c_packet)) { + ret = PTR_ERR(adv7511->i2c_packet); goto err_i2c_unregister_edid; } diff --git a/drivers/i2c/i2c-core-base.c b/drivers/i2c/i2c-core-base.c index b4d84bd475da..ca10525e6d94 100644 --- a/drivers/i2c/i2c-core-base.c +++ b/drivers/i2c/i2c-core-base.c @@ -964,7 +964,7 @@ struct i2c_client *devm_i2c_new_dummy_device(struct device *dev, EXPORT_SYMBOL_GPL(devm_i2c_new_dummy_device); /** - * i2c_new_secondary_device - Helper to get the instantiated secondary address + * i2c_new_ancillary_device - Helper to get the instantiated secondary address * and create the associated device * @client: Handle to the primary client * @name: Handle to specify which secondary address to get @@ -983,9 +983,9 @@ EXPORT_SYMBOL_GPL(devm_i2c_new_dummy_device); * cell whose "reg-names" value matches the slave name. * * This returns the new i2c client, which should be saved for later use with - * i2c_unregister_device(); or NULL to indicate an error. + * i2c_unregister_device(); or an ERR_PTR to describe the error. */ -struct i2c_client *i2c_new_secondary_device(struct i2c_client *client, +struct i2c_client *i2c_new_ancillary_device(struct i2c_client *client, const char *name, u16 default_addr) { @@ -1000,9 +1000,9 @@ struct i2c_client *i2c_new_secondary_device(struct i2c_client *client, } dev_dbg(&client->adapter->dev, "Address for %s : 0x%x\n", name, addr); - return i2c_new_dummy(client->adapter, addr); + return i2c_new_dummy_device(client->adapter, addr); } -EXPORT_SYMBOL_GPL(i2c_new_secondary_device); +EXPORT_SYMBOL_GPL(i2c_new_ancillary_device); /* ------------------------------------------------------------------------- */ diff --git a/drivers/media/i2c/adv748x/adv748x-core.c b/drivers/media/i2c/adv748x/adv748x-core.c index f57cd77a32fa..2567de2b0037 100644 --- a/drivers/media/i2c/adv748x/adv748x-core.c +++ b/drivers/media/i2c/adv748x/adv748x-core.c @@ -183,14 +183,14 @@ static int adv748x_initialise_clients(struct adv748x_state *state) int ret; for (i = ADV748X_PAGE_DPLL; i < ADV748X_PAGE_MAX; ++i) { - state->i2c_clients[i] = i2c_new_secondary_device( + state->i2c_clients[i] = i2c_new_ancillary_device( state->client, adv748x_default_addresses[i].name, adv748x_default_addresses[i].default_addr); - if (state->i2c_clients[i] == NULL) { + if (IS_ERR(state->i2c_clients[i])) { adv_err(state, "failed to create i2c client %u\n", i); - return -ENOMEM; + return PTR_ERR(state->i2c_clients[i]); } ret = adv748x_configure_regmap(state, i); diff --git a/drivers/media/i2c/adv7604.c b/drivers/media/i2c/adv7604.c index 28a84bf9f8a9..2dedd6ebb236 100644 --- a/drivers/media/i2c/adv7604.c +++ b/drivers/media/i2c/adv7604.c @@ -2862,10 +2862,8 @@ static void adv76xx_unregister_clients(struct adv76xx_state *state) { unsigned int i; - for (i = 1; i < ARRAY_SIZE(state->i2c_clients); ++i) { - if (state->i2c_clients[i]) - i2c_unregister_device(state->i2c_clients[i]); - } + for (i = 1; i < ARRAY_SIZE(state->i2c_clients); ++i) + i2c_unregister_device(state->i2c_clients[i]); } static struct i2c_client *adv76xx_dummy_client(struct v4l2_subdev *sd, @@ -2878,14 +2876,14 @@ static struct i2c_client *adv76xx_dummy_client(struct v4l2_subdev *sd, struct i2c_client *new_client; if (pdata && pdata->i2c_addresses[page]) - new_client = i2c_new_dummy(client->adapter, + new_client = i2c_new_dummy_device(client->adapter, pdata->i2c_addresses[page]); else - new_client = i2c_new_secondary_device(client, + new_client = i2c_new_ancillary_device(client, adv76xx_default_addresses[page].name, adv76xx_default_addresses[page].default_addr); - if (new_client) + if (!IS_ERR(new_client)) io_write(sd, io_reg, new_client->addr << 1); return new_client; @@ -3516,15 +3514,19 @@ static int adv76xx_probe(struct i2c_client *client, } for (i = 1; i < ADV76XX_PAGE_MAX; ++i) { + struct i2c_client *dummy_client; + if (!(BIT(i) & state->info->page_mask)) continue; - state->i2c_clients[i] = adv76xx_dummy_client(sd, i); - if (!state->i2c_clients[i]) { - err = -EINVAL; + dummy_client = adv76xx_dummy_client(sd, i); + if (IS_ERR(dummy_client)) { + err = PTR_ERR(dummy_client); v4l2_err(sd, "failed to create i2c client %u\n", i); goto err_i2c; } + + state->i2c_clients[i] = dummy_client; } INIT_DELAYED_WORK(&state->delayed_work_enable_hotplug, diff --git a/include/linux/i2c.h b/include/linux/i2c.h index fa5552c2307b..ebbe024dd9e0 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -473,7 +473,7 @@ extern struct i2c_client * devm_i2c_new_dummy_device(struct device *dev, struct i2c_adapter *adap, u16 address); extern struct i2c_client * -i2c_new_secondary_device(struct i2c_client *client, +i2c_new_ancillary_device(struct i2c_client *client, const char *name, u16 default_addr); From 4e4521f76ff96f83561555f76c50d25314320d10 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Fri, 9 Aug 2019 14:30:02 -0700 Subject: [PATCH 106/721] dt-bindings: i2c: rcar: Rename bindings documentation file Rename the bindings documentation file for R-Car I2C controller from i2c-rcar.txt to renesas,i2c.txt. This is part of an ongoing effort to name bindings documentation files for Renesas IP blocks consistently, in line with the compat strings they document. Signed-off-by: Simon Horman Reviewed-by: Geert Uytterhoeven Signed-off-by: Wolfram Sang --- .../devicetree/bindings/i2c/{i2c-rcar.txt => renesas,i2c.txt} | 0 MAINTAINERS | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename Documentation/devicetree/bindings/i2c/{i2c-rcar.txt => renesas,i2c.txt} (100%) diff --git a/Documentation/devicetree/bindings/i2c/i2c-rcar.txt b/Documentation/devicetree/bindings/i2c/renesas,i2c.txt similarity index 100% rename from Documentation/devicetree/bindings/i2c/i2c-rcar.txt rename to Documentation/devicetree/bindings/i2c/renesas,i2c.txt diff --git a/MAINTAINERS b/MAINTAINERS index b49db5ad0b64..a14ce44c33b3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13650,7 +13650,7 @@ F: drivers/iio/adc/rcar-gyroadc.c RENESAS R-CAR I2C DRIVERS M: Wolfram Sang S: Supported -F: Documentation/devicetree/bindings/i2c/i2c-rcar.txt +F: Documentation/devicetree/bindings/i2c/renesas,i2c.txt F: Documentation/devicetree/bindings/i2c/renesas,iic.txt F: drivers/i2c/busses/i2c-rcar.c F: drivers/i2c/busses/i2c-sh_mobile.c From 747bee3574040246be18e2e9db709a7cb058b35b Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Fri, 9 Aug 2019 14:30:04 -0700 Subject: [PATCH 107/721] dt-bindings: i2c: i2c-emev2: Rename bindings documentation file Rename the bindings documentation file for Renesas EMEV2 IIC controller from i2c-emev2.txt to renesas,iic-emev2.txt. This is part of an ongoing effort to name bindings documentation files for Renesas IP blocks consistently, in line with the compat strings they document. Signed-off-by: Simon Horman Reviewed-by: Geert Uytterhoeven Signed-off-by: Wolfram Sang --- .../bindings/i2c/{i2c-emev2.txt => renesas,iic-emev2.txt} | 0 MAINTAINERS | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename Documentation/devicetree/bindings/i2c/{i2c-emev2.txt => renesas,iic-emev2.txt} (100%) diff --git a/Documentation/devicetree/bindings/i2c/i2c-emev2.txt b/Documentation/devicetree/bindings/i2c/renesas,iic-emev2.txt similarity index 100% rename from Documentation/devicetree/bindings/i2c/i2c-emev2.txt rename to Documentation/devicetree/bindings/i2c/renesas,iic-emev2.txt diff --git a/MAINTAINERS b/MAINTAINERS index a14ce44c33b3..f89f27a2d09a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13628,7 +13628,7 @@ F: drivers/clk/renesas/ RENESAS EMEV2 I2C DRIVER M: Wolfram Sang S: Supported -F: Documentation/devicetree/bindings/i2c/i2c-emev2.txt +F: Documentation/devicetree/bindings/i2c/renesas,iic-emev2.txt F: drivers/i2c/busses/i2c-emev2.c RENESAS ETHERNET DRIVERS From 232219b9a464c2479c98aa589acb1bd3383ae9d6 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Tue, 13 Aug 2019 12:03:01 +0200 Subject: [PATCH 108/721] i2c-cht-wc: Fix lockdep warning When the kernel is build with lockdep support and the i2c-cht-wc driver is used, the following warning is shown: [ 66.674334] ====================================================== [ 66.674337] WARNING: possible circular locking dependency detected [ 66.674340] 5.3.0-rc4+ #83 Not tainted [ 66.674342] ------------------------------------------------------ [ 66.674345] systemd-udevd/1232 is trying to acquire lock: [ 66.674349] 00000000a74dab07 (intel_soc_pmic_chtwc:167:(&cht_wc_regmap_cfg)->lock){+.+.}, at: regmap_write+0x31/0x70 [ 66.674360] but task is already holding lock: [ 66.674362] 00000000d44a85b7 (i2c_register_adapter){+.+.}, at: i2c_smbus_xfer+0x49/0xf0 [ 66.674370] which lock already depends on the new lock. [ 66.674371] the existing dependency chain (in reverse order) is: [ 66.674374] -> #1 (i2c_register_adapter){+.+.}: [ 66.674381] rt_mutex_lock_nested+0x46/0x60 [ 66.674384] i2c_smbus_xfer+0x49/0xf0 [ 66.674387] i2c_smbus_read_byte_data+0x45/0x70 [ 66.674391] cht_wc_byte_reg_read+0x35/0x50 [ 66.674394] _regmap_read+0x63/0x1a0 [ 66.674396] _regmap_update_bits+0xa8/0xe0 [ 66.674399] regmap_update_bits_base+0x63/0xa0 [ 66.674403] regmap_irq_update_bits.isra.0+0x3b/0x50 [ 66.674406] regmap_add_irq_chip+0x592/0x7a0 [ 66.674409] devm_regmap_add_irq_chip+0x89/0xed [ 66.674412] cht_wc_probe+0x102/0x158 [ 66.674415] i2c_device_probe+0x95/0x250 [ 66.674419] really_probe+0xf3/0x380 [ 66.674422] driver_probe_device+0x59/0xd0 [ 66.674425] device_driver_attach+0x53/0x60 [ 66.674428] __driver_attach+0x92/0x150 [ 66.674431] bus_for_each_dev+0x7d/0xc0 [ 66.674434] bus_add_driver+0x14d/0x1f0 [ 66.674437] driver_register+0x6d/0xb0 [ 66.674440] i2c_register_driver+0x45/0x80 [ 66.674445] do_one_initcall+0x60/0x2f4 [ 66.674450] kernel_init_freeable+0x20d/0x2b4 [ 66.674453] kernel_init+0xa/0x10c [ 66.674457] ret_from_fork+0x3a/0x50 [ 66.674459] -> #0 (intel_soc_pmic_chtwc:167:(&cht_wc_regmap_cfg)->lock){+.+.}: [ 66.674465] __lock_acquire+0xe07/0x1930 [ 66.674468] lock_acquire+0x9d/0x1a0 [ 66.674472] __mutex_lock+0xa8/0x9a0 [ 66.674474] regmap_write+0x31/0x70 [ 66.674480] cht_wc_i2c_adap_smbus_xfer+0x72/0x240 [i2c_cht_wc] [ 66.674483] __i2c_smbus_xfer+0x1a3/0x640 [ 66.674486] i2c_smbus_xfer+0x67/0xf0 [ 66.674489] i2c_smbus_read_byte_data+0x45/0x70 [ 66.674494] bq24190_probe+0x26b/0x410 [bq24190_charger] [ 66.674497] i2c_device_probe+0x189/0x250 [ 66.674500] really_probe+0xf3/0x380 [ 66.674503] driver_probe_device+0x59/0xd0 [ 66.674506] device_driver_attach+0x53/0x60 [ 66.674509] __driver_attach+0x92/0x150 [ 66.674512] bus_for_each_dev+0x7d/0xc0 [ 66.674515] bus_add_driver+0x14d/0x1f0 [ 66.674518] driver_register+0x6d/0xb0 [ 66.674521] i2c_register_driver+0x45/0x80 [ 66.674524] do_one_initcall+0x60/0x2f4 [ 66.674528] do_init_module+0x5c/0x230 [ 66.674531] load_module+0x2707/0x2a20 [ 66.674534] __do_sys_init_module+0x188/0x1b0 [ 66.674537] do_syscall_64+0x5c/0xb0 [ 66.674541] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 66.674543] other info that might help us debug this: [ 66.674545] Possible unsafe locking scenario: [ 66.674547] CPU0 CPU1 [ 66.674548] ---- ---- [ 66.674550] lock(i2c_register_adapter); [ 66.674553] lock(intel_soc_pmic_chtwc:167:(&cht_wc_regmap_cfg)->lock); [ 66.674556] lock(i2c_register_adapter); [ 66.674559] lock(intel_soc_pmic_chtwc:167:(&cht_wc_regmap_cfg)->lock); [ 66.674561] *** DEADLOCK *** The problem is that the CHT Whiskey Cove PMIC's builtin i2c-adapter is itself a part of an i2c-client (the PMIC). This means that transfers done through it take adapter->bus_lock twice, once for the parent i2c-adapter and once for its own bus_lock. Lockdep does not like this nested locking. To make lockdep happy in the case of busses with muxes, the i2c-core's i2c_adapter_lock_bus function calls: rt_mutex_lock_nested(&adapter->bus_lock, i2c_adapter_depth(adapter)); But i2c_adapter_depth only works when the direct parent of the adapter is another adapter, as it is only meant for muxes. In this case there is an i2c-client and MFD instantiated platform_device in the parent->child chain between the 2 devices. This commit overrides the default i2c_lock_operations, passing a hardcoded depth of 1 to rt_mutex_lock_nested, making lockdep happy. Note that if there were to be a mux attached to the i2c-wc-cht adapter, this would break things again since the i2c-mux code expects the root-adapter to have a locking depth of 0. But the i2c-wc-cht adapter always has only 1 client directly attached in the form of the charger IC paired with the CHT Whiskey Cove PMIC. Signed-off-by: Hans de Goede Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-cht-wc.c | 46 +++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/drivers/i2c/busses/i2c-cht-wc.c b/drivers/i2c/busses/i2c-cht-wc.c index 66af44bfa67d..f6546de66fbc 100644 --- a/drivers/i2c/busses/i2c-cht-wc.c +++ b/drivers/i2c/busses/i2c-cht-wc.c @@ -178,6 +178,51 @@ static const struct i2c_algorithm cht_wc_i2c_adap_algo = { .smbus_xfer = cht_wc_i2c_adap_smbus_xfer, }; +/* + * We are an i2c-adapter which itself is part of an i2c-client. This means that + * transfers done through us take adapter->bus_lock twice, once for our parent + * i2c-adapter and once to take our own bus_lock. Lockdep does not like this + * nested locking, to make lockdep happy in the case of busses with muxes, the + * i2c-core's i2c_adapter_lock_bus function calls: + * rt_mutex_lock_nested(&adapter->bus_lock, i2c_adapter_depth(adapter)); + * + * But i2c_adapter_depth only works when the direct parent of the adapter is + * another adapter, as it is only meant for muxes. In our case there is an + * i2c-client and MFD instantiated platform_device in the parent->child chain + * between the 2 devices. + * + * So we override the default i2c_lock_operations and pass a hardcoded + * depth of 1 to rt_mutex_lock_nested, to make lockdep happy. + * + * Note that if there were to be a mux attached to our adapter, this would + * break things again since the i2c-mux code expects the root-adapter to have + * a locking depth of 0. But we always have only 1 client directly attached + * in the form of the Charger IC paired with the CHT Whiskey Cove PMIC. + */ +static void cht_wc_i2c_adap_lock_bus(struct i2c_adapter *adapter, + unsigned int flags) +{ + rt_mutex_lock_nested(&adapter->bus_lock, 1); +} + +static int cht_wc_i2c_adap_trylock_bus(struct i2c_adapter *adapter, + unsigned int flags) +{ + return rt_mutex_trylock(&adapter->bus_lock); +} + +static void cht_wc_i2c_adap_unlock_bus(struct i2c_adapter *adapter, + unsigned int flags) +{ + rt_mutex_unlock(&adapter->bus_lock); +} + +static const struct i2c_lock_operations cht_wc_i2c_adap_lock_ops = { + .lock_bus = cht_wc_i2c_adap_lock_bus, + .trylock_bus = cht_wc_i2c_adap_trylock_bus, + .unlock_bus = cht_wc_i2c_adap_unlock_bus, +}; + /**** irqchip for the client connected to the extchgr i2c adapter ****/ static void cht_wc_i2c_irq_lock(struct irq_data *data) { @@ -286,6 +331,7 @@ static int cht_wc_i2c_adap_i2c_probe(struct platform_device *pdev) adap->adapter.owner = THIS_MODULE; adap->adapter.class = I2C_CLASS_HWMON; adap->adapter.algo = &cht_wc_i2c_adap_algo; + adap->adapter.lock_ops = &cht_wc_i2c_adap_lock_ops; strlcpy(adap->adapter.name, "PMIC I2C Adapter", sizeof(adap->adapter.name)); adap->adapter.dev.parent = &pdev->dev; From f58ba5e3f6863ea4486952698898848a6db726c2 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 12 Jul 2019 08:53:19 -0700 Subject: [PATCH 109/721] PCI: pci-hyperv: Fix build errors on non-SYSFS config Fix build errors when building almost-allmodconfig but with SYSFS not set (not enabled). Fixes these build errors: ERROR: "pci_destroy_slot" [drivers/pci/controller/pci-hyperv.ko] undefined! ERROR: "pci_create_slot" [drivers/pci/controller/pci-hyperv.ko] undefined! drivers/pci/slot.o is only built when SYSFS is enabled, so pci-hyperv.o has an implicit dependency on SYSFS. Make that explicit. Also, depending on X86 && X86_64 is not needed, so just change that to depend on X86_64. Fixes: a15f2c08c708 ("PCI: hv: support reporting serial number as slot information") Signed-off-by: Randy Dunlap Signed-off-by: Lorenzo Pieralisi Reviewed-by: Haiyang Zhang Cc: Matthew Wilcox Cc: Jake Oshins Cc: "K. Y. Srinivasan" Cc: Haiyang Zhang Cc: Stephen Hemminger Cc: Stephen Hemminger Cc: Sasha Levin Cc: Bjorn Helgaas Cc: linux-pci@vger.kernel.org Cc: linux-hyperv@vger.kernel.org Cc: Dexuan Cui --- drivers/pci/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig index 2ab92409210a..297bf928d652 100644 --- a/drivers/pci/Kconfig +++ b/drivers/pci/Kconfig @@ -181,7 +181,7 @@ config PCI_LABEL config PCI_HYPERV tristate "Hyper-V PCI Frontend" - depends on X86 && HYPERV && PCI_MSI && PCI_MSI_IRQ_DOMAIN && X86_64 + depends on X86_64 && HYPERV && PCI_MSI && PCI_MSI_IRQ_DOMAIN && SYSFS help The PCI device frontend driver allows the kernel to import arbitrary PCI devices from a PCI backend to support PCI driver domains. From 075af61c19cdc32b94fd697c610c9a07fa21866e Mon Sep 17 00:00:00 2001 From: Stefan Agner Date: Fri, 26 Jul 2019 16:40:07 +0200 Subject: [PATCH 110/721] PCI: imx6: Limit DBI register length Define the length of the DBI registers and limit config space to its length. This makes sure that the kernel does not access registers beyond that point, avoiding the following abort on a i.MX 6Quad: # cat /sys/devices/soc0/soc/1ffc000.pcie/pci0000\:00/0000\:00\:00.0/config [ 100.021433] Unhandled fault: imprecise external abort (0x1406) at 0xb6ea7000 ... [ 100.056423] PC is at dw_pcie_read+0x50/0x84 [ 100.060790] LR is at dw_pcie_rd_own_conf+0x44/0x48 ... Signed-off-by: Stefan Agner Signed-off-by: Lorenzo Pieralisi Reviewed-by: Lucas Stach --- drivers/pci/controller/dwc/pci-imx6.c | 33 +++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/drivers/pci/controller/dwc/pci-imx6.c b/drivers/pci/controller/dwc/pci-imx6.c index 9b5cb5b70389..8b8efa3063f5 100644 --- a/drivers/pci/controller/dwc/pci-imx6.c +++ b/drivers/pci/controller/dwc/pci-imx6.c @@ -57,6 +57,7 @@ enum imx6_pcie_variants { struct imx6_pcie_drvdata { enum imx6_pcie_variants variant; u32 flags; + int dbi_length; }; struct imx6_pcie { @@ -1212,6 +1213,7 @@ static const struct imx6_pcie_drvdata drvdata[] = { .variant = IMX6Q, .flags = IMX6_PCIE_FLAG_IMX6_PHY | IMX6_PCIE_FLAG_IMX6_SPEED_CHANGE, + .dbi_length = 0x200, }, [IMX6SX] = { .variant = IMX6SX, @@ -1254,6 +1256,37 @@ static struct platform_driver imx6_pcie_driver = { .shutdown = imx6_pcie_shutdown, }; +static void imx6_pcie_quirk(struct pci_dev *dev) +{ + struct pci_bus *bus = dev->bus; + struct pcie_port *pp = bus->sysdata; + + /* Bus parent is the PCI bridge, its parent is this platform driver */ + if (!bus->dev.parent || !bus->dev.parent->parent) + return; + + /* Make sure we only quirk devices associated with this driver */ + if (bus->dev.parent->parent->driver != &imx6_pcie_driver.driver) + return; + + if (bus->number == pp->root_bus_nr) { + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + struct imx6_pcie *imx6_pcie = to_imx6_pcie(pci); + + /* + * Limit config length to avoid the kernel reading beyond + * the register set and causing an abort on i.MX 6Quad + */ + if (imx6_pcie->drvdata->dbi_length) { + dev->cfg_size = imx6_pcie->drvdata->dbi_length; + dev_info(&dev->dev, "Limiting cfg_size to %d\n", + dev->cfg_size); + } + } +} +DECLARE_PCI_FIXUP_CLASS_HEADER(PCI_VENDOR_ID_SYNOPSYS, 0xabcd, + PCI_CLASS_BRIDGE_PCI, 8, imx6_pcie_quirk); + static int __init imx6_pcie_init(void) { #ifdef CONFIG_ARM From a6e6fe6549f609f5c00c91e988da9d25f8ae8604 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Mon, 12 Aug 2019 11:30:35 -0600 Subject: [PATCH 111/721] PCI/P2PDMA: Introduce private pagemap structure Move the PCI bus offset from the generic dev_pagemap structure to a specific pci_p2pdma_pagemap structure. This structure will grow in subsequent patches. Link: https://lore.kernel.org/r/20190730163545.4915-2-logang@deltatee.com Link: https://lore.kernel.org/r/20190812173048.9186-2-logang@deltatee.com Signed-off-by: Logan Gunthorpe Signed-off-by: Bjorn Helgaas Reviewed-by: Christoph Hellwig --- drivers/pci/p2pdma.c | 26 ++++++++++++++++++++------ include/linux/memremap.h | 1 - 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index 234476226529..03e9c887bdfb 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -25,6 +25,16 @@ struct pci_p2pdma { bool p2pmem_published; }; +struct pci_p2pdma_pagemap { + struct dev_pagemap pgmap; + u64 bus_offset; +}; + +static struct pci_p2pdma_pagemap *to_p2p_pgmap(struct dev_pagemap *pgmap) +{ + return container_of(pgmap, struct pci_p2pdma_pagemap, pgmap); +} + static ssize_t size_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -135,6 +145,7 @@ static int pci_p2pdma_setup(struct pci_dev *pdev) int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, u64 offset) { + struct pci_p2pdma_pagemap *p2p_pgmap; struct dev_pagemap *pgmap; void *addr; int error; @@ -157,14 +168,17 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, return error; } - pgmap = devm_kzalloc(&pdev->dev, sizeof(*pgmap), GFP_KERNEL); - if (!pgmap) + p2p_pgmap = devm_kzalloc(&pdev->dev, sizeof(*p2p_pgmap), GFP_KERNEL); + if (!p2p_pgmap) return -ENOMEM; + + pgmap = &p2p_pgmap->pgmap; pgmap->res.start = pci_resource_start(pdev, bar) + offset; pgmap->res.end = pgmap->res.start + size - 1; pgmap->res.flags = pci_resource_flags(pdev, bar); pgmap->type = MEMORY_DEVICE_PCI_P2PDMA; - pgmap->pci_p2pdma_bus_offset = pci_bus_address(pdev, bar) - + + p2p_pgmap->bus_offset = pci_bus_address(pdev, bar) - pci_resource_start(pdev, bar); addr = devm_memremap_pages(&pdev->dev, pgmap); @@ -720,7 +734,7 @@ EXPORT_SYMBOL_GPL(pci_p2pmem_publish); int pci_p2pdma_map_sg(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir) { - struct dev_pagemap *pgmap; + struct pci_p2pdma_pagemap *p2p_pgmap; struct scatterlist *s; phys_addr_t paddr; int i; @@ -736,10 +750,10 @@ int pci_p2pdma_map_sg(struct device *dev, struct scatterlist *sg, int nents, return 0; for_each_sg(sg, s, nents, i) { - pgmap = sg_page(s)->pgmap; + p2p_pgmap = to_p2p_pgmap(sg_page(s)->pgmap); paddr = sg_phys(s); - s->dma_address = paddr - pgmap->pci_p2pdma_bus_offset; + s->dma_address = paddr - p2p_pgmap->bus_offset; sg_dma_len(s) = s->length; } diff --git a/include/linux/memremap.h b/include/linux/memremap.h index f8a5b2a19945..b459518ce475 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -112,7 +112,6 @@ struct dev_pagemap { struct device *dev; enum memory_type type; unsigned int flags; - u64 pci_p2pdma_bus_offset; const struct dev_pagemap_ops *ops; }; From 0afea3814358c97e2964710ba16fbb0b54ceda0e Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Mon, 12 Aug 2019 11:30:36 -0600 Subject: [PATCH 112/721] PCI/P2PDMA: Add provider's pci_dev to pci_p2pdma_pagemap struct The provider will be needed to figure out how to map a device. Link: https://lore.kernel.org/r/20190730163545.4915-3-logang@deltatee.com Link: https://lore.kernel.org/r/20190812173048.9186-3-logang@deltatee.com Signed-off-by: Logan Gunthorpe Signed-off-by: Bjorn Helgaas Reviewed-by: Christoph Hellwig --- drivers/pci/p2pdma.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index 03e9c887bdfb..93fbda14f4a9 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -27,6 +27,7 @@ struct pci_p2pdma { struct pci_p2pdma_pagemap { struct dev_pagemap pgmap; + struct pci_dev *provider; u64 bus_offset; }; @@ -178,6 +179,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, pgmap->res.flags = pci_resource_flags(pdev, bar); pgmap->type = MEMORY_DEVICE_PCI_P2PDMA; + p2p_pgmap->provider = pdev; p2p_pgmap->bus_offset = pci_bus_address(pdev, bar) - pci_resource_start(pdev, bar); From 72583084e3fe333c27bc37527efe455d27eaf0b1 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Mon, 12 Aug 2019 11:30:37 -0600 Subject: [PATCH 113/721] PCI/P2PDMA: Add constants for map type results to upstream_bridge_distance() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add constant flags to indicate how two devices will be mapped or if they are unsupported. upstream_bridge_distance() will now return the mapping type and the distance in a passed-by-reference argument. This helps annotate the code better, but the main reason is so we can use the information to store the required mapping method in an xarray. Link: https://lore.kernel.org/r/20190730163545.4915-4-logang@deltatee.com Link: https://lore.kernel.org/r/20190812173048.9186-4-logang@deltatee.com Signed-off-by: Logan Gunthorpe Signed-off-by: Bjorn Helgaas Reviewed-by: Christian König Reviewed-by: Christoph Hellwig --- drivers/pci/p2pdma.c | 95 +++++++++++++++++++++++++++----------------- 1 file changed, 58 insertions(+), 37 deletions(-) diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index 93fbda14f4a9..8f0688201aec 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -20,6 +20,13 @@ #include #include +enum pci_p2pdma_map_type { + PCI_P2PDMA_MAP_UNKNOWN = 0, + PCI_P2PDMA_MAP_NOT_SUPPORTED, + PCI_P2PDMA_MAP_BUS_ADDR, + PCI_P2PDMA_MAP_THRU_HOST_BRIDGE, +}; + struct pci_p2pdma { struct gen_pool *pool; bool p2pmem_published; @@ -313,34 +320,33 @@ static bool root_complex_whitelist(struct pci_dev *dev) * port of the switch, to the common upstream port, back up to the second * downstream port and then to Device B. * - * Any two devices that don't have a common upstream bridge will return -1. - * In this way devices on separate PCIe root ports will be rejected, which - * is what we want for peer-to-peer seeing each PCIe root port defines a - * separate hierarchy domain and there's no way to determine whether the root - * complex supports forwarding between them. + * Any two devices that cannot communicate using p2pdma will return + * PCI_P2PDMA_MAP_NOT_SUPPORTED. * - * In the case where two devices are connected to different PCIe switches, - * this function will still return a positive distance as long as both - * switches eventually have a common upstream bridge. Note this covers - * the case of using multiple PCIe switches to achieve a desired level of - * fan-out from a root port. The exact distance will be a function of the - * number of switches between Device A and Device B. + * Any two devices that have a data path that goes through the host bridge + * will consult a whitelist. If the host bridges are on the whitelist, + * this function will return PCI_P2PDMA_MAP_THRU_HOST_BRIDGE. * - * If a bridge which has any ACS redirection bits set is in the path - * then this functions will return -2. This is so we reject any - * cases where the TLPs are forwarded up into the root complex. - * In this case, a list of all infringing bridge addresses will be - * populated in acs_list (assuming it's non-null) for printk purposes. + * If either bridge is not on the whitelist this function returns + * PCI_P2PDMA_MAP_NOT_SUPPORTED. + * + * If a bridge which has any ACS redirection bits set is in the path, + * acs_redirects will be set to true. In this case, a list of all infringing + * bridge addresses will be populated in acs_list (assuming it's non-null) + * for printk purposes. */ -static int upstream_bridge_distance(struct pci_dev *provider, - struct pci_dev *client, - struct seq_buf *acs_list) +static enum pci_p2pdma_map_type +upstream_bridge_distance(struct pci_dev *provider, struct pci_dev *client, + int *dist, bool *acs_redirects, struct seq_buf *acs_list) { struct pci_dev *a = provider, *b = client, *bb; int dist_a = 0; int dist_b = 0; int acs_cnt = 0; + if (acs_redirects) + *acs_redirects = false; + /* * Note, we don't need to take references to devices returned by * pci_upstream_bridge() seeing we hold a reference to a child @@ -369,15 +375,18 @@ static int upstream_bridge_distance(struct pci_dev *provider, dist_a++; } + if (dist) + *dist = dist_a + dist_b; + /* * Allow the connection if both devices are on a whitelisted root * complex, but add an arbitrary large value to the distance. */ if (root_complex_whitelist(provider) && root_complex_whitelist(client)) - return 0x1000 + dist_a + dist_b; + return PCI_P2PDMA_MAP_THRU_HOST_BRIDGE; - return -1; + return PCI_P2PDMA_MAP_NOT_SUPPORTED; check_b_path_acs: bb = b; @@ -394,33 +403,44 @@ static int upstream_bridge_distance(struct pci_dev *provider, bb = pci_upstream_bridge(bb); } - if (acs_cnt) - return -2; + if (dist) + *dist = dist_a + dist_b; - return dist_a + dist_b; + if (acs_cnt) { + if (acs_redirects) + *acs_redirects = true; + + return PCI_P2PDMA_MAP_NOT_SUPPORTED; + } + + return PCI_P2PDMA_MAP_BUS_ADDR; } -static int upstream_bridge_distance_warn(struct pci_dev *provider, - struct pci_dev *client) +static enum pci_p2pdma_map_type +upstream_bridge_distance_warn(struct pci_dev *provider, struct pci_dev *client, + int *dist) { struct seq_buf acs_list; + bool acs_redirects; int ret; seq_buf_init(&acs_list, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE); if (!acs_list.buffer) return -ENOMEM; - ret = upstream_bridge_distance(provider, client, &acs_list); - if (ret == -2) { - pci_warn(client, "cannot be used for peer-to-peer DMA as ACS redirect is set between the client and provider (%s)\n", + ret = upstream_bridge_distance(provider, client, dist, &acs_redirects, + &acs_list); + if (acs_redirects) { + pci_warn(client, "ACS redirect is set between the client and provider (%s)\n", pci_name(provider)); /* Drop final semicolon */ acs_list.buffer[acs_list.len-1] = 0; pci_warn(client, "to disable ACS redirect for this path, add the kernel parameter: pci=disable_acs_redir=%s\n", acs_list.buffer); + } - } else if (ret < 0) { - pci_warn(client, "cannot be used for peer-to-peer DMA as the client and provider (%s) do not share an upstream bridge\n", + if (ret == PCI_P2PDMA_MAP_NOT_SUPPORTED) { + pci_warn(client, "cannot be used for peer-to-peer DMA as the client and provider (%s) do not share an upstream bridge or whitelisted host bridge\n", pci_name(provider)); } @@ -452,7 +472,8 @@ int pci_p2pdma_distance_many(struct pci_dev *provider, struct device **clients, { bool not_supported = false; struct pci_dev *pci_client; - int distance = 0; + int total_dist = 0; + int distance; int i, ret; if (num_clients == 0) @@ -477,26 +498,26 @@ int pci_p2pdma_distance_many(struct pci_dev *provider, struct device **clients, if (verbose) ret = upstream_bridge_distance_warn(provider, - pci_client); + pci_client, &distance); else ret = upstream_bridge_distance(provider, pci_client, - NULL); + &distance, NULL, NULL); pci_dev_put(pci_client); - if (ret < 0) + if (ret == PCI_P2PDMA_MAP_NOT_SUPPORTED) not_supported = true; if (not_supported && !verbose) break; - distance += ret; + total_dist += distance; } if (not_supported) return -1; - return distance; + return total_dist; } EXPORT_SYMBOL_GPL(pci_p2pdma_distance_many); From c6bfaeb573a6caae4b96885b8bad63ee57495c77 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Mon, 12 Aug 2019 11:30:38 -0600 Subject: [PATCH 114/721] PCI/P2PDMA: Factor out __upstream_bridge_distance() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a prep patch to create a second level helper. There are no functional changes. The root complex whitelist code will be moved into this function in a subsequent patch. Link: https://lore.kernel.org/r/20190730163545.4915-5-logang@deltatee.com Link: https://lore.kernel.org/r/20190812173048.9186-5-logang@deltatee.com Signed-off-by: Logan Gunthorpe Signed-off-by: Bjorn Helgaas Reviewed-by: Christian König Reviewed-by: Christoph Hellwig --- drivers/pci/p2pdma.c | 88 ++++++++++++++++++++++++-------------------- 1 file changed, 48 insertions(+), 40 deletions(-) diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index 8f0688201aec..600ba6a7aa11 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -296,47 +296,8 @@ static bool root_complex_whitelist(struct pci_dev *dev) return false; } -/* - * Find the distance through the nearest common upstream bridge between - * two PCI devices. - * - * If the two devices are the same device then 0 will be returned. - * - * If there are two virtual functions of the same device behind the same - * bridge port then 2 will be returned (one step down to the PCIe switch, - * then one step back to the same device). - * - * In the case where two devices are connected to the same PCIe switch, the - * value 4 will be returned. This corresponds to the following PCI tree: - * - * -+ Root Port - * \+ Switch Upstream Port - * +-+ Switch Downstream Port - * + \- Device A - * \-+ Switch Downstream Port - * \- Device B - * - * The distance is 4 because we traverse from Device A through the downstream - * port of the switch, to the common upstream port, back up to the second - * downstream port and then to Device B. - * - * Any two devices that cannot communicate using p2pdma will return - * PCI_P2PDMA_MAP_NOT_SUPPORTED. - * - * Any two devices that have a data path that goes through the host bridge - * will consult a whitelist. If the host bridges are on the whitelist, - * this function will return PCI_P2PDMA_MAP_THRU_HOST_BRIDGE. - * - * If either bridge is not on the whitelist this function returns - * PCI_P2PDMA_MAP_NOT_SUPPORTED. - * - * If a bridge which has any ACS redirection bits set is in the path, - * acs_redirects will be set to true. In this case, a list of all infringing - * bridge addresses will be populated in acs_list (assuming it's non-null) - * for printk purposes. - */ static enum pci_p2pdma_map_type -upstream_bridge_distance(struct pci_dev *provider, struct pci_dev *client, +__upstream_bridge_distance(struct pci_dev *provider, struct pci_dev *client, int *dist, bool *acs_redirects, struct seq_buf *acs_list) { struct pci_dev *a = provider, *b = client, *bb; @@ -416,6 +377,53 @@ upstream_bridge_distance(struct pci_dev *provider, struct pci_dev *client, return PCI_P2PDMA_MAP_BUS_ADDR; } +/* + * Find the distance through the nearest common upstream bridge between + * two PCI devices. + * + * If the two devices are the same device then 0 will be returned. + * + * If there are two virtual functions of the same device behind the same + * bridge port then 2 will be returned (one step down to the PCIe switch, + * then one step back to the same device). + * + * In the case where two devices are connected to the same PCIe switch, the + * value 4 will be returned. This corresponds to the following PCI tree: + * + * -+ Root Port + * \+ Switch Upstream Port + * +-+ Switch Downstream Port + * + \- Device A + * \-+ Switch Downstream Port + * \- Device B + * + * The distance is 4 because we traverse from Device A through the downstream + * port of the switch, to the common upstream port, back up to the second + * downstream port and then to Device B. + * + * Any two devices that cannot communicate using p2pdma will return + * PCI_P2PDMA_MAP_NOT_SUPPORTED. + * + * Any two devices that have a data path that goes through the host bridge + * will consult a whitelist. If the host bridges are on the whitelist, + * this function will return PCI_P2PDMA_MAP_THRU_HOST_BRIDGE. + * + * If either bridge is not on the whitelist this function returns + * PCI_P2PDMA_MAP_NOT_SUPPORTED. + * + * If a bridge which has any ACS redirection bits set is in the path, + * acs_redirects will be set to true. In this case, a list of all infringing + * bridge addresses will be populated in acs_list (assuming it's non-null) + * for printk purposes. + */ +static enum pci_p2pdma_map_type +upstream_bridge_distance(struct pci_dev *provider, struct pci_dev *client, + int *dist, bool *acs_redirects, struct seq_buf *acs_list) +{ + return __upstream_bridge_distance(provider, client, dist, + acs_redirects, acs_list); +} + static enum pci_p2pdma_map_type upstream_bridge_distance_warn(struct pci_dev *provider, struct pci_dev *client, int *dist) From e2cbfbf78968db3e3a71cb989af17f4f6448bf49 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Mon, 12 Aug 2019 11:30:39 -0600 Subject: [PATCH 115/721] PCI/P2PDMA: Apply host bridge whitelist for ACS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a P2PDMA transfer is rejected due to ACS being set, we can also check the whitelist and allow the transactions. Do this by pushing the whitelist check into the upstream_bridge_distance() function. Link: https://lore.kernel.org/r/20190730163545.4915-6-logang@deltatee.com Link: https://lore.kernel.org/r/20190812173048.9186-6-logang@deltatee.com Signed-off-by: Logan Gunthorpe Signed-off-by: Bjorn Helgaas Reviewed-by: Christian König Reviewed-by: Christoph Hellwig --- drivers/pci/p2pdma.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index 600ba6a7aa11..f7f7e5862bab 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -339,15 +339,7 @@ __upstream_bridge_distance(struct pci_dev *provider, struct pci_dev *client, if (dist) *dist = dist_a + dist_b; - /* - * Allow the connection if both devices are on a whitelisted root - * complex, but add an arbitrary large value to the distance. - */ - if (root_complex_whitelist(provider) && - root_complex_whitelist(client)) - return PCI_P2PDMA_MAP_THRU_HOST_BRIDGE; - - return PCI_P2PDMA_MAP_NOT_SUPPORTED; + return PCI_P2PDMA_MAP_THRU_HOST_BRIDGE; check_b_path_acs: bb = b; @@ -371,7 +363,7 @@ __upstream_bridge_distance(struct pci_dev *provider, struct pci_dev *client, if (acs_redirects) *acs_redirects = true; - return PCI_P2PDMA_MAP_NOT_SUPPORTED; + return PCI_P2PDMA_MAP_THRU_HOST_BRIDGE; } return PCI_P2PDMA_MAP_BUS_ADDR; @@ -420,8 +412,18 @@ static enum pci_p2pdma_map_type upstream_bridge_distance(struct pci_dev *provider, struct pci_dev *client, int *dist, bool *acs_redirects, struct seq_buf *acs_list) { - return __upstream_bridge_distance(provider, client, dist, - acs_redirects, acs_list); + enum pci_p2pdma_map_type map_type; + + map_type = __upstream_bridge_distance(provider, client, dist, + acs_redirects, acs_list); + + if (map_type == PCI_P2PDMA_MAP_THRU_HOST_BRIDGE) { + if (!root_complex_whitelist(provider) || + !root_complex_whitelist(client)) + return PCI_P2PDMA_MAP_NOT_SUPPORTED; + } + + return map_type; } static enum pci_p2pdma_map_type From 2c84d818aee976390c055a417045a85c23d39662 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Mon, 12 Aug 2019 11:30:40 -0600 Subject: [PATCH 116/721] PCI/P2PDMA: Factor out host_bridge_whitelist() Push both PCI devices into the whitelist checking function seeing some hardware will require us ensuring they are on the same host bridge. At the same time we rename root_complex_whitelist() to host_bridge_whitelist() to match the terminology used in the code. Link: https://lore.kernel.org/r/20190730163545.4915-7-logang@deltatee.com Link: https://lore.kernel.org/r/20190812173048.9186-7-logang@deltatee.com Signed-off-by: Logan Gunthorpe Signed-off-by: Bjorn Helgaas Reviewed-by: Christoph Hellwig --- drivers/pci/p2pdma.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index f7f7e5862bab..4b9f0903b340 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -269,19 +269,11 @@ static void seq_buf_print_bus_devfn(struct seq_buf *buf, struct pci_dev *pdev) seq_buf_printf(buf, "%s;", pci_name(pdev)); } -/* - * If we can't find a common upstream bridge take a look at the root - * complex and compare it to a whitelist of known good hardware. - */ -static bool root_complex_whitelist(struct pci_dev *dev) +static bool __host_bridge_whitelist(struct pci_host_bridge *host) { - struct pci_host_bridge *host = pci_find_host_bridge(dev->bus); struct pci_dev *root = pci_get_slot(host->bus, PCI_DEVFN(0, 0)); unsigned short vendor, device; - if (iommu_present(dev->dev.bus)) - return false; - if (!root) return false; @@ -296,6 +288,24 @@ static bool root_complex_whitelist(struct pci_dev *dev) return false; } +/* + * If we can't find a common upstream bridge take a look at the root + * complex and compare it to a whitelist of known good hardware. + */ +static bool host_bridge_whitelist(struct pci_dev *a, struct pci_dev *b) +{ + struct pci_host_bridge *host_a = pci_find_host_bridge(a->bus); + struct pci_host_bridge *host_b = pci_find_host_bridge(b->bus); + + if (iommu_present(a->dev.bus) || iommu_present(b->dev.bus)) + return false; + + if (__host_bridge_whitelist(host_a) && __host_bridge_whitelist(host_b)) + return true; + + return false; +} + static enum pci_p2pdma_map_type __upstream_bridge_distance(struct pci_dev *provider, struct pci_dev *client, int *dist, bool *acs_redirects, struct seq_buf *acs_list) @@ -418,8 +428,7 @@ upstream_bridge_distance(struct pci_dev *provider, struct pci_dev *client, acs_redirects, acs_list); if (map_type == PCI_P2PDMA_MAP_THRU_HOST_BRIDGE) { - if (!root_complex_whitelist(provider) || - !root_complex_whitelist(client)) + if (!host_bridge_whitelist(provider, client)) return PCI_P2PDMA_MAP_NOT_SUPPORTED; } From 494d63b0d5d0a481f6ac23bc61e94647ab9c4390 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Mon, 12 Aug 2019 11:30:41 -0600 Subject: [PATCH 117/721] PCI/P2PDMA: Whitelist some Intel host bridges Intel devices do not have good support for P2P requests that span different host bridges as the transactions will cross the QPI/UPI bus and this does not perform well. Therefore, enable support for these devices only if the host bridges match. Add Intel devices that have been tested and are known to work. There are likely many others out there that will need to be tested and added. Link: https://lore.kernel.org/r/20190730163545.4915-8-logang@deltatee.com Link: https://lore.kernel.org/r/20190812173048.9186-8-logang@deltatee.com Signed-off-by: Logan Gunthorpe Signed-off-by: Bjorn Helgaas Reviewed-by: Christoph Hellwig --- drivers/pci/p2pdma.c | 38 ++++++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index 4b9f0903b340..2c4a8e92ed64 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -269,9 +269,30 @@ static void seq_buf_print_bus_devfn(struct seq_buf *buf, struct pci_dev *pdev) seq_buf_printf(buf, "%s;", pci_name(pdev)); } -static bool __host_bridge_whitelist(struct pci_host_bridge *host) +static const struct pci_p2pdma_whitelist_entry { + unsigned short vendor; + unsigned short device; + enum { + REQ_SAME_HOST_BRIDGE = 1 << 0, + } flags; +} pci_p2pdma_whitelist[] = { + /* AMD ZEN */ + {PCI_VENDOR_ID_AMD, 0x1450, 0}, + + /* Intel Xeon E5/Core i7 */ + {PCI_VENDOR_ID_INTEL, 0x3c00, REQ_SAME_HOST_BRIDGE}, + {PCI_VENDOR_ID_INTEL, 0x3c01, REQ_SAME_HOST_BRIDGE}, + /* Intel Xeon E7 v3/Xeon E5 v3/Core i7 */ + {PCI_VENDOR_ID_INTEL, 0x2f00, REQ_SAME_HOST_BRIDGE}, + {PCI_VENDOR_ID_INTEL, 0x2f01, REQ_SAME_HOST_BRIDGE}, + {} +}; + +static bool __host_bridge_whitelist(struct pci_host_bridge *host, + bool same_host_bridge) { struct pci_dev *root = pci_get_slot(host->bus, PCI_DEVFN(0, 0)); + const struct pci_p2pdma_whitelist_entry *entry; unsigned short vendor, device; if (!root) @@ -281,9 +302,14 @@ static bool __host_bridge_whitelist(struct pci_host_bridge *host) device = root->device; pci_dev_put(root); - /* AMD ZEN host bridges can do peer to peer */ - if (vendor == PCI_VENDOR_ID_AMD && device == 0x1450) + for (entry = pci_p2pdma_whitelist; entry->vendor; entry++) { + if (vendor != entry->vendor || device != entry->device) + continue; + if (entry->flags & REQ_SAME_HOST_BRIDGE && !same_host_bridge) + return false; + return true; + } return false; } @@ -300,7 +326,11 @@ static bool host_bridge_whitelist(struct pci_dev *a, struct pci_dev *b) if (iommu_present(a->dev.bus) || iommu_present(b->dev.bus)) return false; - if (__host_bridge_whitelist(host_a) && __host_bridge_whitelist(host_b)) + if (host_a == host_b) + return __host_bridge_whitelist(host_a, true); + + if (__host_bridge_whitelist(host_a, false) && + __host_bridge_whitelist(host_b, false)) return true; return false; From 2b9f4bb2a4fb77da4862f9ddf5209de2bcdaa0c0 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Mon, 12 Aug 2019 11:30:42 -0600 Subject: [PATCH 118/721] PCI/P2PDMA: Add attrs argument to pci_p2pdma_map_sg() This is to match the dma_map_sg() API which this function will have to call in an future patch. Add a pci_p2pdma_map_sg_attrs() function and helper to call it with no attributes just like the dma_map_sg() function. Link: https://lore.kernel.org/r/20190730163545.4915-9-logang@deltatee.com Link: https://lore.kernel.org/r/20190812173048.9186-9-logang@deltatee.com Signed-off-by: Logan Gunthorpe Signed-off-by: Bjorn Helgaas Reviewed-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 4 ++-- drivers/pci/p2pdma.c | 7 ++++--- include/linux/pci-p2pdma.h | 15 +++++++++++---- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index bb970ca82517..7747712054cd 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -832,8 +832,8 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, goto out; if (is_pci_p2pdma_page(sg_page(iod->sg))) - nr_mapped = pci_p2pdma_map_sg(dev->dev, iod->sg, iod->nents, - rq_dma_dir(req)); + nr_mapped = pci_p2pdma_map_sg_attrs(dev->dev, iod->sg, + iod->nents, rq_dma_dir(req), DMA_ATTR_NO_WARN); else nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents, rq_dma_dir(req), DMA_ATTR_NO_WARN); diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index 2c4a8e92ed64..94fbacbcbbd0 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -798,13 +798,14 @@ EXPORT_SYMBOL_GPL(pci_p2pmem_publish); * @sg: scatter list to map * @nents: elements in the scatterlist * @dir: DMA direction + * @attrs: DMA attributes passed to dma_map_sg() (if called) * * Scatterlists mapped with this function should not be unmapped in any way. * * Returns the number of SG entries mapped or 0 on error. */ -int pci_p2pdma_map_sg(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction dir) +int pci_p2pdma_map_sg_attrs(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, unsigned long attrs) { struct pci_p2pdma_pagemap *p2p_pgmap; struct scatterlist *s; @@ -831,7 +832,7 @@ int pci_p2pdma_map_sg(struct device *dev, struct scatterlist *sg, int nents, return nents; } -EXPORT_SYMBOL_GPL(pci_p2pdma_map_sg); +EXPORT_SYMBOL_GPL(pci_p2pdma_map_sg_attrs); /** * pci_p2pdma_enable_store - parse a configfs/sysfs attribute store diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h index bca9bc3e5be7..7fd51954f93a 100644 --- a/include/linux/pci-p2pdma.h +++ b/include/linux/pci-p2pdma.h @@ -30,8 +30,8 @@ struct scatterlist *pci_p2pmem_alloc_sgl(struct pci_dev *pdev, unsigned int *nents, u32 length); void pci_p2pmem_free_sgl(struct pci_dev *pdev, struct scatterlist *sgl); void pci_p2pmem_publish(struct pci_dev *pdev, bool publish); -int pci_p2pdma_map_sg(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction dir); +int pci_p2pdma_map_sg_attrs(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, unsigned long attrs); int pci_p2pdma_enable_store(const char *page, struct pci_dev **p2p_dev, bool *use_p2pdma); ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev, @@ -81,8 +81,9 @@ static inline void pci_p2pmem_free_sgl(struct pci_dev *pdev, static inline void pci_p2pmem_publish(struct pci_dev *pdev, bool publish) { } -static inline int pci_p2pdma_map_sg(struct device *dev, - struct scatterlist *sg, int nents, enum dma_data_direction dir) +static inline int pci_p2pdma_map_sg_attrs(struct device *dev, + struct scatterlist *sg, int nents, enum dma_data_direction dir, + unsigned long attrs) { return 0; } @@ -111,4 +112,10 @@ static inline struct pci_dev *pci_p2pmem_find(struct device *client) return pci_p2pmem_find_many(&client, 1); } +static inline int pci_p2pdma_map_sg(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir) +{ + return pci_p2pdma_map_sg_attrs(dev, sg, nents, dir, 0); +} + #endif /* _LINUX_PCI_P2P_H */ From 7f73eac3a7137eabfb0c005c7ba55eb7994b9673 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Mon, 12 Aug 2019 11:30:43 -0600 Subject: [PATCH 119/721] PCI/P2PDMA: Introduce pci_p2pdma_unmap_sg() Add pci_p2pdma_unmap_sg() to the two places that call pci_p2pdma_map_sg(). This is a prep patch to introduce correct mappings for p2pdma transactions that go through the root complex. Link: https://lore.kernel.org/r/20190730163545.4915-10-logang@deltatee.com Link: https://lore.kernel.org/r/20190812173048.9186-10-logang@deltatee.com Signed-off-by: Logan Gunthorpe Signed-off-by: Bjorn Helgaas Reviewed-by: Christoph Hellwig --- drivers/infiniband/core/rw.c | 6 ++++-- drivers/nvme/host/pci.c | 6 ++++-- drivers/pci/p2pdma.c | 18 +++++++++++++++++- include/linux/pci-p2pdma.h | 13 +++++++++++++ 4 files changed, 38 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c index dce06108c8c3..5337393d4dfe 100644 --- a/drivers/infiniband/core/rw.c +++ b/drivers/infiniband/core/rw.c @@ -583,8 +583,10 @@ void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num, break; } - /* P2PDMA contexts do not need to be unmapped */ - if (!is_pci_p2pdma_page(sg_page(sg))) + if (is_pci_p2pdma_page(sg_page(sg))) + pci_p2pdma_unmap_sg(qp->pd->device->dma_device, sg, + sg_cnt, dir); + else ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir); } EXPORT_SYMBOL(rdma_rw_ctx_destroy); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 7747712054cd..2348b15f6bd0 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -547,8 +547,10 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) WARN_ON_ONCE(!iod->nents); - /* P2PDMA requests do not need to be unmapped */ - if (!is_pci_p2pdma_page(sg_page(iod->sg))) + if (is_pci_p2pdma_page(sg_page(iod->sg))) + pci_p2pdma_unmap_sg(dev->dev, iod->sg, iod->nents, + rq_dma_dir(req)); + else dma_unmap_sg(dev->dev, iod->sg, iod->nents, rq_dma_dir(req)); diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index 94fbacbcbbd0..1eec7a5ec27e 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -800,7 +800,8 @@ EXPORT_SYMBOL_GPL(pci_p2pmem_publish); * @dir: DMA direction * @attrs: DMA attributes passed to dma_map_sg() (if called) * - * Scatterlists mapped with this function should not be unmapped in any way. + * Scatterlists mapped with this function should be unmapped using + * pci_p2pdma_unmap_sg_attrs(). * * Returns the number of SG entries mapped or 0 on error. */ @@ -834,6 +835,21 @@ int pci_p2pdma_map_sg_attrs(struct device *dev, struct scatterlist *sg, } EXPORT_SYMBOL_GPL(pci_p2pdma_map_sg_attrs); +/** + * pci_p2pdma_unmap_sg - unmap a PCI peer-to-peer scatterlist that was + * mapped with pci_p2pdma_map_sg() + * @dev: device doing the DMA request + * @sg: scatter list to map + * @nents: number of elements returned by pci_p2pdma_map_sg() + * @dir: DMA direction + * @attrs: DMA attributes passed to dma_unmap_sg() (if called) + */ +void pci_p2pdma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, unsigned long attrs) +{ +} +EXPORT_SYMBOL_GPL(pci_p2pdma_unmap_sg_attrs); + /** * pci_p2pdma_enable_store - parse a configfs/sysfs attribute store * to enable p2pdma diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h index 7fd51954f93a..8318a97c9c61 100644 --- a/include/linux/pci-p2pdma.h +++ b/include/linux/pci-p2pdma.h @@ -32,6 +32,8 @@ void pci_p2pmem_free_sgl(struct pci_dev *pdev, struct scatterlist *sgl); void pci_p2pmem_publish(struct pci_dev *pdev, bool publish); int pci_p2pdma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs); +void pci_p2pdma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, unsigned long attrs); int pci_p2pdma_enable_store(const char *page, struct pci_dev **p2p_dev, bool *use_p2pdma); ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev, @@ -87,6 +89,11 @@ static inline int pci_p2pdma_map_sg_attrs(struct device *dev, { return 0; } +static inline void pci_p2pdma_unmap_sg_attrs(struct device *dev, + struct scatterlist *sg, int nents, enum dma_data_direction dir, + unsigned long attrs) +{ +} static inline int pci_p2pdma_enable_store(const char *page, struct pci_dev **p2p_dev, bool *use_p2pdma) { @@ -118,4 +125,10 @@ static inline int pci_p2pdma_map_sg(struct device *dev, struct scatterlist *sg, return pci_p2pdma_map_sg_attrs(dev, sg, nents, dir, 0); } +static inline void pci_p2pdma_unmap_sg(struct device *dev, + struct scatterlist *sg, int nents, enum dma_data_direction dir) +{ + pci_p2pdma_unmap_sg_attrs(dev, sg, nents, dir, 0); +} + #endif /* _LINUX_PCI_P2P_H */ From 142c242e6f12196f8064beb09198838611e029db Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Mon, 12 Aug 2019 11:30:44 -0600 Subject: [PATCH 120/721] PCI/P2PDMA: Factor out __pci_p2pdma_map_sg() Factor out the bus-only mapping into its own static function. No functional changes. The original pci_p2pdma_map_sg_attrs() will be used to decide whether this is an appropriate way to map. Link: https://lore.kernel.org/r/20190730163545.4915-11-logang@deltatee.com Link: https://lore.kernel.org/r/20190812173048.9186-11-logang@deltatee.com Signed-off-by: Logan Gunthorpe Signed-off-by: Bjorn Helgaas Reviewed-by: Christoph Hellwig --- drivers/pci/p2pdma.c | 53 +++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index 1eec7a5ec27e..771b45605853 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -792,6 +792,33 @@ void pci_p2pmem_publish(struct pci_dev *pdev, bool publish) } EXPORT_SYMBOL_GPL(pci_p2pmem_publish); +static int __pci_p2pdma_map_sg(struct pci_p2pdma_pagemap *p2p_pgmap, + struct device *dev, struct scatterlist *sg, int nents) +{ + struct scatterlist *s; + phys_addr_t paddr; + int i; + + /* + * p2pdma mappings are not compatible with devices that use + * dma_virt_ops. If the upper layers do the right thing + * this should never happen because it will be prevented + * by the check in pci_p2pdma_distance_many() + */ + if (WARN_ON_ONCE(IS_ENABLED(CONFIG_DMA_VIRT_OPS) && + dev->dma_ops == &dma_virt_ops)) + return 0; + + for_each_sg(sg, s, nents, i) { + paddr = sg_phys(s); + + s->dma_address = paddr - p2p_pgmap->bus_offset; + sg_dma_len(s) = s->length; + } + + return nents; +} + /** * pci_p2pdma_map_sg - map a PCI peer-to-peer scatterlist for DMA * @dev: device doing the DMA request @@ -808,30 +835,10 @@ EXPORT_SYMBOL_GPL(pci_p2pmem_publish); int pci_p2pdma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs) { - struct pci_p2pdma_pagemap *p2p_pgmap; - struct scatterlist *s; - phys_addr_t paddr; - int i; + struct pci_p2pdma_pagemap *p2p_pgmap = + to_p2p_pgmap(sg_page(sg)->pgmap); - /* - * p2pdma mappings are not compatible with devices that use - * dma_virt_ops. If the upper layers do the right thing - * this should never happen because it will be prevented - * by the check in pci_p2pdma_distance_many() - */ - if (WARN_ON_ONCE(IS_ENABLED(CONFIG_DMA_VIRT_OPS) && - dev->dma_ops == &dma_virt_ops)) - return 0; - - for_each_sg(sg, s, nents, i) { - p2p_pgmap = to_p2p_pgmap(sg_page(s)->pgmap); - paddr = sg_phys(s); - - s->dma_address = paddr - p2p_pgmap->bus_offset; - sg_dma_len(s) = s->length; - } - - return nents; + return __pci_p2pdma_map_sg(p2p_pgmap, dev, sg, nents); } EXPORT_SYMBOL_GPL(pci_p2pdma_map_sg_attrs); From 110203bee05f33637ccb44f046f14edaea94bb4c Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Mon, 12 Aug 2019 11:30:45 -0600 Subject: [PATCH 121/721] PCI/P2PDMA: Store mapping method in an xarray When upstream_bridge_distance() is called, store the method required to map the DMA transfers in an xarray so it can be looked up efficiently on the hot path in pci_p2pdma_map_sg(). Link: https://lore.kernel.org/r/20190730163545.4915-12-logang@deltatee.com Link: https://lore.kernel.org/r/20190812173048.9186-12-logang@deltatee.com Signed-off-by: Logan Gunthorpe Signed-off-by: Bjorn Helgaas Reviewed-by: Christoph Hellwig --- drivers/pci/p2pdma.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index 771b45605853..db8224ff6e80 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -19,6 +19,7 @@ #include #include #include +#include enum pci_p2pdma_map_type { PCI_P2PDMA_MAP_UNKNOWN = 0, @@ -30,6 +31,7 @@ enum pci_p2pdma_map_type { struct pci_p2pdma { struct gen_pool *pool; bool p2pmem_published; + struct xarray map_types; }; struct pci_p2pdma_pagemap { @@ -105,6 +107,7 @@ static void pci_p2pdma_release(void *data) gen_pool_destroy(p2pdma->pool); sysfs_remove_group(&pdev->dev.kobj, &p2pmem_group); + xa_destroy(&p2pdma->map_types); } static int pci_p2pdma_setup(struct pci_dev *pdev) @@ -116,6 +119,8 @@ static int pci_p2pdma_setup(struct pci_dev *pdev) if (!p2p) return -ENOMEM; + xa_init(&p2p->map_types); + p2p->pool = gen_pool_create(PAGE_SHIFT, dev_to_node(&pdev->dev)); if (!p2p->pool) goto out; @@ -409,6 +414,12 @@ __upstream_bridge_distance(struct pci_dev *provider, struct pci_dev *client, return PCI_P2PDMA_MAP_BUS_ADDR; } +static unsigned long map_types_idx(struct pci_dev *client) +{ + return (pci_domain_nr(client->bus) << 16) | + (client->bus->number << 8) | client->devfn; +} + /* * Find the distance through the nearest common upstream bridge between * two PCI devices. @@ -459,9 +470,13 @@ upstream_bridge_distance(struct pci_dev *provider, struct pci_dev *client, if (map_type == PCI_P2PDMA_MAP_THRU_HOST_BRIDGE) { if (!host_bridge_whitelist(provider, client)) - return PCI_P2PDMA_MAP_NOT_SUPPORTED; + map_type = PCI_P2PDMA_MAP_NOT_SUPPORTED; } + if (provider->p2pdma) + xa_store(&provider->p2pdma->map_types, map_types_idx(client), + xa_mk_value(map_type), GFP_KERNEL); + return map_type; } From 5d52e1abcd47957f5f4cae26659768655e3ac9f5 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Mon, 12 Aug 2019 11:30:46 -0600 Subject: [PATCH 122/721] PCI/P2PDMA: dma_map() requests that traverse the host bridge Any requests that traverse the host bridge will need to be mapped into the IOMMU, so call dma_map_sg() inside pci_p2pdma_map_sg() when appropriate. Similarly, call dma_unmap_sg() inside pci_p2pdma_unmap_sg(). Link: https://lore.kernel.org/r/20190730163545.4915-13-logang@deltatee.com Link: https://lore.kernel.org/r/20190812173048.9186-13-logang@deltatee.com Signed-off-by: Logan Gunthorpe Signed-off-by: Bjorn Helgaas Reviewed-by: Christoph Hellwig --- drivers/pci/p2pdma.c | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index db8224ff6e80..bca1ffc7075e 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -807,6 +807,16 @@ void pci_p2pmem_publish(struct pci_dev *pdev, bool publish) } EXPORT_SYMBOL_GPL(pci_p2pmem_publish); +static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct pci_dev *provider, + struct pci_dev *client) +{ + if (!provider->p2pdma) + return PCI_P2PDMA_MAP_NOT_SUPPORTED; + + return xa_to_value(xa_load(&provider->p2pdma->map_types, + map_types_idx(client))); +} + static int __pci_p2pdma_map_sg(struct pci_p2pdma_pagemap *p2p_pgmap, struct device *dev, struct scatterlist *sg, int nents) { @@ -852,8 +862,22 @@ int pci_p2pdma_map_sg_attrs(struct device *dev, struct scatterlist *sg, { struct pci_p2pdma_pagemap *p2p_pgmap = to_p2p_pgmap(sg_page(sg)->pgmap); + struct pci_dev *client; - return __pci_p2pdma_map_sg(p2p_pgmap, dev, sg, nents); + if (WARN_ON_ONCE(!dev_is_pci(dev))) + return 0; + + client = to_pci_dev(dev); + + switch (pci_p2pdma_map_type(p2p_pgmap->provider, client)) { + case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: + return dma_map_sg_attrs(dev, sg, nents, dir, attrs); + case PCI_P2PDMA_MAP_BUS_ADDR: + return __pci_p2pdma_map_sg(p2p_pgmap, dev, sg, nents); + default: + WARN_ON_ONCE(1); + return 0; + } } EXPORT_SYMBOL_GPL(pci_p2pdma_map_sg_attrs); @@ -869,6 +893,20 @@ EXPORT_SYMBOL_GPL(pci_p2pdma_map_sg_attrs); void pci_p2pdma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs) { + struct pci_p2pdma_pagemap *p2p_pgmap = + to_p2p_pgmap(sg_page(sg)->pgmap); + enum pci_p2pdma_map_type map_type; + struct pci_dev *client; + + if (WARN_ON_ONCE(!dev_is_pci(dev))) + return; + + client = to_pci_dev(dev); + + map_type = pci_p2pdma_map_type(p2p_pgmap->provider, client); + + if (map_type == PCI_P2PDMA_MAP_THRU_HOST_BRIDGE) + dma_unmap_sg_attrs(dev, sg, nents, dir, attrs); } EXPORT_SYMBOL_GPL(pci_p2pdma_unmap_sg_attrs); From 5daf40b039249ef98d49610d14293dca3464b91f Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Mon, 12 Aug 2019 11:30:47 -0600 Subject: [PATCH 123/721] PCI/P2PDMA: Allow IOMMU for host bridge whitelist Now that we map the requests correctly we can remove the iommu_present() restriction. Link: https://lore.kernel.org/r/20190730163545.4915-14-logang@deltatee.com Link: https://lore.kernel.org/r/20190812173048.9186-14-logang@deltatee.com Signed-off-by: Logan Gunthorpe Signed-off-by: Bjorn Helgaas Reviewed-by: Christoph Hellwig --- drivers/pci/p2pdma.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index bca1ffc7075e..d8c824097d26 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -18,7 +18,6 @@ #include #include #include -#include #include enum pci_p2pdma_map_type { @@ -328,9 +327,6 @@ static bool host_bridge_whitelist(struct pci_dev *a, struct pci_dev *b) struct pci_host_bridge *host_a = pci_find_host_bridge(a->bus); struct pci_host_bridge *host_b = pci_find_host_bridge(b->bus); - if (iommu_present(a->dev.bus) || iommu_present(b->dev.bus)) - return false; - if (host_a == host_b) return __host_bridge_whitelist(host_a, true); From 27235b15e4197b3a1014cf0ca0b08dbbf61a692b Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Mon, 12 Aug 2019 11:30:48 -0600 Subject: [PATCH 124/721] PCI/P2PDMA: Update pci_p2pdma_distance_many() documentation The comment describing pci_p2pdma_distance_many() still referred to the devices being behind the same root port. This no longer applies so reword the documentation. Link: https://lore.kernel.org/r/20190730163545.4915-15-logang@deltatee.com Link: https://lore.kernel.org/r/20190812173048.9186-15-logang@deltatee.com Signed-off-by: Logan Gunthorpe Signed-off-by: Bjorn Helgaas Reviewed-by: Christoph Hellwig --- drivers/pci/p2pdma.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index d8c824097d26..0608aae72ccc 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -517,15 +517,14 @@ upstream_bridge_distance_warn(struct pci_dev *provider, struct pci_dev *client, * @num_clients: number of clients in the array * @verbose: if true, print warnings for devices when we return -1 * - * Returns -1 if any of the clients are not compatible (behind the same - * root port as the provider), otherwise returns a positive number where - * a lower number is the preferable choice. (If there's one client - * that's the same as the provider it will return 0, which is best choice). + * Returns -1 if any of the clients are not compatible, otherwise returns a + * positive number where a lower number is the preferable choice. (If there's + * one client that's the same as the provider it will return 0, which is best + * choice). * - * For now, "compatible" means the provider and the clients are all behind - * the same PCI root port. This cuts out cases that may work but is safest - * for the user. Future work can expand this to white-list root complexes that - * can safely forward between each ports. + * "compatible" means the provider and the clients are either all behind + * the same PCI root port or the host bridges connected to each of the devices + * are listed in the 'pci_p2pdma_whitelist'. */ int pci_p2pdma_distance_many(struct pci_dev *provider, struct device **clients, int num_clients, bool verbose) From 4ff96fb52c6964ad42e0a878be8f86a2e8052ddd Mon Sep 17 00:00:00 2001 From: Miroslav Benes Date: Fri, 19 Jul 2019 14:28:39 +0200 Subject: [PATCH 125/721] livepatch: Nullify obj->mod in klp_module_coming()'s error path klp_module_coming() is called for every module appearing in the system. It sets obj->mod to a patched module for klp_object obj. Unfortunately it leaves it set even if an error happens later in the function and the patched module is not allowed to be loaded. klp_is_object_loaded() uses obj->mod variable and could currently give a wrong return value. The bug is probably harmless as of now. Signed-off-by: Miroslav Benes Reviewed-by: Petr Mladek Acked-by: Josh Poimboeuf Signed-off-by: Petr Mladek --- kernel/livepatch/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index c4ce08f43bd6..ab4a4606d19b 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -1175,6 +1175,7 @@ int klp_module_coming(struct module *mod) pr_warn("patch '%s' failed for module '%s', refusing to load module '%s'\n", patch->mod->name, obj->mod->name, obj->mod->name); mod->klp_alive = false; + obj->mod = NULL; klp_cleanup_module_patches_limited(mod, patch); mutex_unlock(&klp_mutex); From c1ca58f6982bb815c27a4a75f0f430f87b624f66 Mon Sep 17 00:00:00 2001 From: Zhang Lixu Date: Thu, 8 Aug 2019 18:21:11 +0800 Subject: [PATCH 126/721] HID: intel-ish-hid: ipc: set NO_D3 flag only when needed Currently, the NO_D3 flag is set in ish_probe(), and cleared in ish_remove(). So even if the system goes into S3, ISH is still in D0i3 state. It makes more sense that put ISH into D3 as system goes into S3 and put ISH into D0i3 as system goes into suspend-to-idle. I remove the NO_D3 setting in ish_probe(), so that ISH can enter D3 state when system enters S3. Only set N0_D3 flag when system enters the suspend-to-idle or platform specified, and clear it when system resume. When the ISH enters D3, the FW will check the DMA bit status. If the DMA bit is set, the FW will reset automatically. So the DMA bit need be clear before putting ISH into D3 state. Signed-off-by: Zhang Lixu Acked-by: Srinivas Pandruvada Signed-off-by: Jiri Kosina --- drivers/hid/intel-ish-hid/ipc/hw-ish.h | 1 + drivers/hid/intel-ish-hid/ipc/ipc.c | 2 +- drivers/hid/intel-ish-hid/ipc/pci-ish.c | 21 +++++++++++++++++++-- 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/drivers/hid/intel-ish-hid/ipc/hw-ish.h b/drivers/hid/intel-ish-hid/ipc/hw-ish.h index 1065692f90e2..ddd8e8e87cfa 100644 --- a/drivers/hid/intel-ish-hid/ipc/hw-ish.h +++ b/drivers/hid/intel-ish-hid/ipc/hw-ish.h @@ -77,5 +77,6 @@ irqreturn_t ish_irq_handler(int irq, void *dev_id); struct ishtp_device *ish_dev_init(struct pci_dev *pdev); int ish_hw_start(struct ishtp_device *dev); void ish_device_disable(struct ishtp_device *dev); +int ish_disable_dma(struct ishtp_device *dev); #endif /* _ISHTP_HW_ISH_H_ */ diff --git a/drivers/hid/intel-ish-hid/ipc/ipc.c b/drivers/hid/intel-ish-hid/ipc/ipc.c index 18fe8af89aad..8f8dfdf64833 100644 --- a/drivers/hid/intel-ish-hid/ipc/ipc.c +++ b/drivers/hid/intel-ish-hid/ipc/ipc.c @@ -672,7 +672,7 @@ irqreturn_t ish_irq_handler(int irq, void *dev_id) * * Return: 0 for success else error code. */ -static int ish_disable_dma(struct ishtp_device *dev) +int ish_disable_dma(struct ishtp_device *dev) { unsigned int dma_delay; diff --git a/drivers/hid/intel-ish-hid/ipc/pci-ish.c b/drivers/hid/intel-ish-hid/ipc/pci-ish.c index aa80b4d3b740..89b14d2edd0b 100644 --- a/drivers/hid/intel-ish-hid/ipc/pci-ish.c +++ b/drivers/hid/intel-ish-hid/ipc/pci-ish.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #define CREATE_TRACE_POINTS @@ -97,6 +98,11 @@ static const struct pci_device_id ish_invalid_pci_ids[] = { {} }; +static inline bool ish_should_enter_d0i3(struct pci_dev *pdev) +{ + return !pm_suspend_via_firmware() || pdev->device == CHV_DEVICE_ID; +} + /** * ish_probe() - PCI driver probe callback * @pdev: pci device @@ -147,7 +153,6 @@ static int ish_probe(struct pci_dev *pdev, const struct pci_device_id *ent) /* mapping IO device memory */ hw->mem_addr = pcim_iomap_table(pdev)[0]; ishtp->pdev = pdev; - pdev->dev_flags |= PCI_DEV_FLAGS_NO_D3; /* request and enable interrupt */ ret = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES); @@ -184,7 +189,6 @@ static void ish_remove(struct pci_dev *pdev) struct ishtp_device *ishtp_dev = pci_get_drvdata(pdev); ishtp_bus_remove_all_clients(ishtp_dev, false); - pdev->dev_flags &= ~PCI_DEV_FLAGS_NO_D3; ish_device_disable(ishtp_dev); } @@ -209,6 +213,8 @@ static void __maybe_unused ish_resume_handler(struct work_struct *work) uint32_t fwsts; int ret; + pdev->dev_flags &= ~PCI_DEV_FLAGS_NO_D3; + /* Get ISH FW status */ fwsts = IPC_GET_ISH_FWSTS(dev->ops->get_fw_status(dev)); @@ -267,6 +273,17 @@ static int __maybe_unused ish_suspend(struct device *device) !dev->suspend_flag, msecs_to_jiffies(25)); + if (ish_should_enter_d0i3(pdev)) { + /* Set the NO_D3 flag, the ISH would enter D0i3 */ + pdev->dev_flags |= PCI_DEV_FLAGS_NO_D3; + } else { + /* + * Clear the DMA bit before putting ISH into D3, + * or ISH FW would reset automatically. + */ + ish_disable_dma(dev); + } + return 0; } From 2db8edaa88c1208f7346c65041120081e2434d2a Mon Sep 17 00:00:00 2001 From: Zhang Lixu Date: Thu, 8 Aug 2019 18:21:12 +0800 Subject: [PATCH 127/721] HID: intel-ish-hid: ipc: make ish suspend paths clear For suspend-to-idle, send suspend message and set N0_D3 flag to put the ISH into D0i3 state. For suspend-to-mem, disable the DMA bit before ISH entering D3, and NO_D3 flag is cleared by default, then the ISH would enter D3. Signed-off-by: Zhang Lixu Acked-by: Srinivas Pandruvada Signed-off-by: Jiri Kosina --- drivers/hid/intel-ish-hid/ipc/pci-ish.c | 52 +++++++++++++++---------- 1 file changed, 31 insertions(+), 21 deletions(-) diff --git a/drivers/hid/intel-ish-hid/ipc/pci-ish.c b/drivers/hid/intel-ish-hid/ipc/pci-ish.c index 89b14d2edd0b..35081f2cf781 100644 --- a/drivers/hid/intel-ish-hid/ipc/pci-ish.c +++ b/drivers/hid/intel-ish-hid/ipc/pci-ish.c @@ -223,6 +223,8 @@ static void __maybe_unused ish_resume_handler(struct work_struct *work) * it means ISH isn't powered off, in this case, send a resume message. */ if (fwsts >= FWSTS_SENSOR_APP_LOADED) { + disable_irq_wake(pdev->irq); + ishtp_send_resume(dev); /* Waiting to get resume response */ @@ -255,27 +257,36 @@ static int __maybe_unused ish_suspend(struct device *device) struct pci_dev *pdev = to_pci_dev(device); struct ishtp_device *dev = pci_get_drvdata(pdev); - enable_irq_wake(pdev->irq); - /* - * If previous suspend hasn't been asnwered then ISH is likely dead, - * don't attempt nested notification - */ - if (dev->suspend_flag) - return 0; - - dev->resume_flag = 0; - dev->suspend_flag = 1; - ishtp_send_suspend(dev); - - /* 25 ms should be enough for live ISH to flush all IPC buf */ - if (dev->suspend_flag) - wait_event_interruptible_timeout(dev->suspend_wait, - !dev->suspend_flag, - msecs_to_jiffies(25)); - if (ish_should_enter_d0i3(pdev)) { - /* Set the NO_D3 flag, the ISH would enter D0i3 */ - pdev->dev_flags |= PCI_DEV_FLAGS_NO_D3; + /* + * If previous suspend hasn't been asnwered then ISH is likely + * dead, don't attempt nested notification + */ + if (dev->suspend_flag) + return 0; + + dev->resume_flag = 0; + dev->suspend_flag = 1; + ishtp_send_suspend(dev); + + /* 25 ms should be enough for live ISH to flush all IPC buf */ + if (dev->suspend_flag) + wait_event_interruptible_timeout(dev->suspend_wait, + !dev->suspend_flag, + msecs_to_jiffies(25)); + + if (dev->suspend_flag) { + /* + * It looks like FW halt, clear the DMA bit, and put + * ISH into D3, and FW would reset on resume. + */ + ish_disable_dma(dev); + } else { + /* Set the NO_D3 flag, the ISH would enter D0i3 */ + pdev->dev_flags |= PCI_DEV_FLAGS_NO_D3; + + enable_irq_wake(pdev->irq); + } } else { /* * Clear the DMA bit before putting ISH into D3, @@ -304,7 +315,6 @@ static int __maybe_unused ish_resume(struct device *device) ish_resume_device = device; dev->resume_flag = 1; - disable_irq_wake(pdev->irq); schedule_work(&resume_work); return 0; From fc19a57dd4834a939b5eef574c61cd4f81af6cf8 Mon Sep 17 00:00:00 2001 From: Zhang Lixu Date: Thu, 8 Aug 2019 18:21:13 +0800 Subject: [PATCH 128/721] HID: intel-ish-hid: ipc: check the NO_D3 flag to distinguish resume paths The NO_D3 flag would be set if the ISH enter D0i3 in ish_suspend(), The resume paths can be distinguished by checking the NO_D3 flag. It's more reasonable than checking the FW status. Signed-off-by: Zhang Lixu Acked-by: Srinivas Pandruvada Signed-off-by: Jiri Kosina --- drivers/hid/intel-ish-hid/ipc/pci-ish.c | 34 +++++++++++-------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/drivers/hid/intel-ish-hid/ipc/pci-ish.c b/drivers/hid/intel-ish-hid/ipc/pci-ish.c index 35081f2cf781..f269852304e5 100644 --- a/drivers/hid/intel-ish-hid/ipc/pci-ish.c +++ b/drivers/hid/intel-ish-hid/ipc/pci-ish.c @@ -210,19 +210,11 @@ static void __maybe_unused ish_resume_handler(struct work_struct *work) { struct pci_dev *pdev = to_pci_dev(ish_resume_device); struct ishtp_device *dev = pci_get_drvdata(pdev); - uint32_t fwsts; int ret; - pdev->dev_flags &= ~PCI_DEV_FLAGS_NO_D3; - - /* Get ISH FW status */ - fwsts = IPC_GET_ISH_FWSTS(dev->ops->get_fw_status(dev)); - - /* - * If currently, in ISH FW, sensor app is loaded or beyond that, - * it means ISH isn't powered off, in this case, send a resume message. - */ - if (fwsts >= FWSTS_SENSOR_APP_LOADED) { + /* Check the NO_D3 flag to distinguish the resume paths */ + if (pdev->dev_flags & PCI_DEV_FLAGS_NO_D3) { + pdev->dev_flags &= ~PCI_DEV_FLAGS_NO_D3; disable_irq_wake(pdev->irq); ishtp_send_resume(dev); @@ -232,16 +224,20 @@ static void __maybe_unused ish_resume_handler(struct work_struct *work) ret = wait_event_interruptible_timeout(dev->resume_wait, !dev->resume_flag, msecs_to_jiffies(WAIT_FOR_RESUME_ACK_MS)); - } - /* - * If in ISH FW, sensor app isn't loaded yet, or no resume response. - * That means this platform is not S0ix compatible, or something is - * wrong with ISH FW. So on resume, full reboot of ISH processor will - * happen, so need to go through init sequence again. - */ - if (dev->resume_flag) + /* + * If the flag is not cleared, something is wrong with ISH FW. + * So on resume, need to go through init sequence again. + */ + if (dev->resume_flag) + ish_init(dev); + } else { + /* + * Resume from the D3, full reboot of ISH processor will happen, + * so need to go through init sequence again. + */ ish_init(dev); + } } /** From e2751463eaa6f9fec8fea80abbdc62dbc487b3c5 Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Fri, 26 Jul 2019 15:48:53 +0800 Subject: [PATCH 129/721] fs: nfs: Fix possible null-pointer dereferences in encode_attrs() In encode_attrs(), there is an if statement on line 1145 to check whether label is NULL: if (label && (attrmask[2] & FATTR4_WORD2_SECURITY_LABEL)) When label is NULL, it is used on lines 1178-1181: *p++ = cpu_to_be32(label->lfs); *p++ = cpu_to_be32(label->pi); *p++ = cpu_to_be32(label->len); p = xdr_encode_opaque_fixed(p, label->label, label->len); To fix these bugs, label is checked before being used. These bugs are found by a static analysis tool STCheck written by us. Signed-off-by: Jia-Ju Bai Signed-off-by: Anna Schumaker --- fs/nfs/nfs4xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 46a8d636d151..ab07db0f07cd 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1174,7 +1174,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, } else *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); } - if (bmval[2] & FATTR4_WORD2_SECURITY_LABEL) { + if (label && (bmval[2] & FATTR4_WORD2_SECURITY_LABEL)) { *p++ = cpu_to_be32(label->lfs); *p++ = cpu_to_be32(label->pi); *p++ = cpu_to_be32(label->len); From 691b45ddbd182a4ce0bc91953c70c845cf0935f1 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Aug 2019 18:36:19 -0400 Subject: [PATCH 130/721] SUNRPC: Remove rpc_wake_up_queued_task_on_wq() Clean up: commit c544577daddb ("SUNRPC: Clean up transport write space handling") appears to have removed the last caller of rpc_wake_up_queued_task_on_wq(). Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/linux/sunrpc/sched.h | 3 --- net/sunrpc/sched.c | 27 ++++----------------------- 2 files changed, 4 insertions(+), 26 deletions(-) diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index baa3ecdb882f..d1283bddd218 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -243,9 +243,6 @@ void rpc_sleep_on_priority_timeout(struct rpc_wait_queue *queue, void rpc_sleep_on_priority(struct rpc_wait_queue *, struct rpc_task *, int priority); -void rpc_wake_up_queued_task_on_wq(struct workqueue_struct *wq, - struct rpc_wait_queue *queue, - struct rpc_task *task); void rpc_wake_up_queued_task(struct rpc_wait_queue *, struct rpc_task *); void rpc_wake_up_queued_task_set_status(struct rpc_wait_queue *, diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 1f275aba786f..f25c4b9ba185 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -541,33 +541,14 @@ rpc_wake_up_task_on_wq_queue_action_locked(struct workqueue_struct *wq, return NULL; } -static void -rpc_wake_up_task_on_wq_queue_locked(struct workqueue_struct *wq, - struct rpc_wait_queue *queue, struct rpc_task *task) -{ - rpc_wake_up_task_on_wq_queue_action_locked(wq, queue, task, NULL, NULL); -} - /* * Wake up a queued task while the queue lock is being held */ -static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task) +static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, + struct rpc_task *task) { - rpc_wake_up_task_on_wq_queue_locked(rpciod_workqueue, queue, task); -} - -/* - * Wake up a task on a specific queue - */ -void rpc_wake_up_queued_task_on_wq(struct workqueue_struct *wq, - struct rpc_wait_queue *queue, - struct rpc_task *task) -{ - if (!RPC_IS_QUEUED(task)) - return; - spin_lock(&queue->lock); - rpc_wake_up_task_on_wq_queue_locked(wq, queue, task); - spin_unlock(&queue->lock); + rpc_wake_up_task_on_wq_queue_action_locked(rpciod_workqueue, queue, + task, NULL, NULL); } /* From 95bd8304b34656d30bc0c908384de007b8fbcb55 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Aug 2019 18:37:05 -0400 Subject: [PATCH 131/721] SUNRPC: Inline xdr_commit_encode Micro-optimization: For xdr_commit_encode call sites in net/sunrpc/xdr.c, eliminate the extra calling sequence. On my client, this change saves about a microsecond for every 30 calls to xdr_reserve_space(). Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 48c93b9e525e..7ba0ede6b417 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -560,7 +560,7 @@ EXPORT_SYMBOL_GPL(xdr_init_encode); * required at the end of encoding, or any other time when the xdr_buf * data might be read. */ -void xdr_commit_encode(struct xdr_stream *xdr) +inline void xdr_commit_encode(struct xdr_stream *xdr) { int shift = xdr->scratch.iov_len; void *page; From 2fb2a4d529fe3b1db4ced06d3b6568d46a991269 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Aug 2019 18:37:52 -0400 Subject: [PATCH 132/721] xprtrdma: Refresh the documenting comment in frwr_ops.c Things have changed since this comment was written. In particular, the reworking of connection closing, on-demand creation of MRs, and the removal of fr_state all mean that deferring MR recovery to frwr_map is no longer needed. The description is obsolete. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/frwr_ops.c | 64 +++++++++------------------------- 1 file changed, 17 insertions(+), 47 deletions(-) diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 0b6dad7580a1..a30f2ae49578 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -7,67 +7,37 @@ /* Lightweight memory registration using Fast Registration Work * Requests (FRWR). * - * FRWR features ordered asynchronous registration and deregistration - * of arbitrarily sized memory regions. This is the fastest and safest + * FRWR features ordered asynchronous registration and invalidation + * of arbitrarily-sized memory regions. This is the fastest and safest * but most complex memory registration mode. */ /* Normal operation * - * A Memory Region is prepared for RDMA READ or WRITE using a FAST_REG + * A Memory Region is prepared for RDMA Read or Write using a FAST_REG * Work Request (frwr_map). When the RDMA operation is finished, this * Memory Region is invalidated using a LOCAL_INV Work Request - * (frwr_unmap_sync). + * (frwr_unmap_async and frwr_unmap_sync). * - * Typically these Work Requests are not signaled, and neither are RDMA - * SEND Work Requests (with the exception of signaling occasionally to - * prevent provider work queue overflows). This greatly reduces HCA + * Typically FAST_REG Work Requests are not signaled, and neither are + * RDMA Send Work Requests (with the exception of signaling occasionally + * to prevent provider work queue overflows). This greatly reduces HCA * interrupt workload. - * - * As an optimization, frwr_unmap marks MRs INVALID before the - * LOCAL_INV WR is posted. If posting succeeds, the MR is placed on - * rb_mrs immediately so that no work (like managing a linked list - * under a spinlock) is needed in the completion upcall. - * - * But this means that frwr_map() can occasionally encounter an MR - * that is INVALID but the LOCAL_INV WR has not completed. Work Queue - * ordering prevents a subsequent FAST_REG WR from executing against - * that MR while it is still being invalidated. */ /* Transport recovery * - * ->op_map and the transport connect worker cannot run at the same - * time, but ->op_unmap can fire while the transport connect worker - * is running. Thus MR recovery is handled in ->op_map, to guarantee - * that recovered MRs are owned by a sending RPC, and not one where - * ->op_unmap could fire at the same time transport reconnect is - * being done. + * frwr_map and frwr_unmap_* cannot run at the same time the transport + * connect worker is running. The connect worker holds the transport + * send lock, just as ->send_request does. This prevents frwr_map and + * the connect worker from running concurrently. When a connection is + * closed, the Receive completion queue is drained before the allowing + * the connect worker to get control. This prevents frwr_unmap and the + * connect worker from running concurrently. * - * When the underlying transport disconnects, MRs are left in one of - * four states: - * - * INVALID: The MR was not in use before the QP entered ERROR state. - * - * VALID: The MR was registered before the QP entered ERROR state. - * - * FLUSHED_FR: The MR was being registered when the QP entered ERROR - * state, and the pending WR was flushed. - * - * FLUSHED_LI: The MR was being invalidated when the QP entered ERROR - * state, and the pending WR was flushed. - * - * When frwr_map encounters FLUSHED and VALID MRs, they are recovered - * with ib_dereg_mr and then are re-initialized. Because MR recovery - * allocates fresh resources, it is deferred to a workqueue, and the - * recovered MRs are placed back on the rb_mrs list when recovery is - * complete. frwr_map allocates another MR for the current RPC while - * the broken MR is reset. - * - * To ensure that frwr_map doesn't encounter an MR that is marked - * INVALID but that is about to be flushed due to a previous transport - * disconnect, the transport connect worker attempts to drain all - * pending send queue WRs before the transport is reconnected. + * When the underlying transport disconnects, MRs that are in flight + * are flushed and are likely unusable. Thus all flushed MRs are + * destroyed. New MRs are created on demand. */ #include From af08a7754a5da7ae704624cbe4ef83e46246c92d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Aug 2019 18:38:38 -0400 Subject: [PATCH 133/721] xprtrdma: Update obsolete comment Comment was made obsolete by commit 8cec3dba76a4 ("xprtrdma: rpcrdma_regbuf alignment"). Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/xprt_rdma.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 92ce09fcea74..3b2f2041e889 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -117,9 +117,6 @@ struct rpcrdma_ep { #endif /* Registered buffer -- registered kmalloc'd memory for RDMA SEND/RECV - * - * The below structure appears at the front of a large region of kmalloc'd - * memory, which always starts on a good alignment boundary. */ struct rpcrdma_regbuf { From 073b50bccbbf99a3b79a1913604c656d0e1a56c9 Mon Sep 17 00:00:00 2001 From: Jason Gerecke Date: Fri, 16 Aug 2019 11:54:26 -0700 Subject: [PATCH 134/721] HID: wacom: Fix several minor compiler warnings Addresses a few issues that were noticed when compiling with non-default warnings enabled. The trimmed-down warnings in the order they are fixed below are: * declaration of 'size' shadows a parameter * '%s' directive output may be truncated writing up to 5 bytes into a region of size between 1 and 64 * pointer targets in initialization of 'char *' from 'unsigned char *' differ in signedness * left shift of negative value Signed-off-by: Jason Gerecke Reviewed-by: Aaron Armstrong Skomra Signed-off-by: Jiri Kosina --- drivers/hid/wacom_sys.c | 7 ++++--- drivers/hid/wacom_wac.c | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/hid/wacom_sys.c b/drivers/hid/wacom_sys.c index 53bddb50aeba..602219a8710d 100644 --- a/drivers/hid/wacom_sys.c +++ b/drivers/hid/wacom_sys.c @@ -88,7 +88,7 @@ static void wacom_wac_queue_flush(struct hid_device *hdev, } static int wacom_wac_pen_serial_enforce(struct hid_device *hdev, - struct hid_report *report, u8 *raw_data, int size) + struct hid_report *report, u8 *raw_data, int report_size) { struct wacom *wacom = hid_get_drvdata(hdev); struct wacom_wac *wacom_wac = &wacom->wacom_wac; @@ -149,7 +149,8 @@ static int wacom_wac_pen_serial_enforce(struct hid_device *hdev, if (flush) wacom_wac_queue_flush(hdev, &wacom_wac->pen_fifo); else if (insert) - wacom_wac_queue_insert(hdev, &wacom_wac->pen_fifo, raw_data, size); + wacom_wac_queue_insert(hdev, &wacom_wac->pen_fifo, + raw_data, report_size); return insert && !flush; } @@ -2176,7 +2177,7 @@ static void wacom_update_name(struct wacom *wacom, const char *suffix) { struct wacom_wac *wacom_wac = &wacom->wacom_wac; struct wacom_features *features = &wacom_wac->features; - char name[WACOM_NAME_MAX]; + char name[WACOM_NAME_MAX - 20]; /* Leave some room for suffixes */ /* Generic devices name unspecified */ if ((features->type == HID_GENERIC) && !strcmp("Wacom HID", features->name)) { diff --git a/drivers/hid/wacom_wac.c b/drivers/hid/wacom_wac.c index 7a8ddc999a8e..125b4ed7afc6 100644 --- a/drivers/hid/wacom_wac.c +++ b/drivers/hid/wacom_wac.c @@ -251,7 +251,7 @@ static int wacom_dtu_irq(struct wacom_wac *wacom) static int wacom_dtus_irq(struct wacom_wac *wacom) { - char *data = wacom->data; + unsigned char *data = wacom->data; struct input_dev *input = wacom->pen_input; unsigned short prox, pressure = 0; @@ -572,7 +572,7 @@ static int wacom_intuos_pad(struct wacom_wac *wacom) strip2 = ((data[3] & 0x1f) << 8) | data[4]; } - prox = (buttons & ~(~0 << nbuttons)) | (keys & ~(~0 << nkeys)) | + prox = (buttons & ~(~0U << nbuttons)) | (keys & ~(~0U << nkeys)) | (ring1 & 0x80) | (ring2 & 0x80) | strip1 | strip2; wacom_report_numbered_buttons(input, nbuttons, buttons); From 670e90924bfe9cf80536eab66a31704e07b5fd00 Mon Sep 17 00:00:00 2001 From: Aaron Armstrong Skomra Date: Fri, 16 Aug 2019 12:02:50 -0700 Subject: [PATCH 135/721] HID: wacom: support named keys on older devices Some Wacom devices have keys with predefined meanings. However, when support was originally added for these devices, the codes for these keys were not available yet. These keys were thus reported with the numbered KEY_PROG* range. Some missing key codes were added with commit 4eb220cb35a9 ("HID: wacom: generic: add 3 tablet touch keys") and we are now able to report the proper key codes. We continue to report the original KEY_PROG codes so as not to break any unknown applications that may depend on them. Also, to support the touch key, track its state in the pad report. Signed-off-by: Aaron Armstrong Skomra Link: https://gitlab.freedesktop.org/libinput/libinput/merge_requests/155 Link: https://github.com/linuxwacom/xf86-input-wacom/pull/46 Reviewed-by: Ping Cheng Reviewed-by: Jason Gerecke Signed-off-by: Jiri Kosina --- drivers/hid/wacom_wac.c | 68 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/drivers/hid/wacom_wac.c b/drivers/hid/wacom_wac.c index 125b4ed7afc6..6b739c1bda99 100644 --- a/drivers/hid/wacom_wac.c +++ b/drivers/hid/wacom_wac.c @@ -483,6 +483,8 @@ static int wacom_intuos_pad(struct wacom_wac *wacom) int ring1 = 0, ring2 = 0; int strip1 = 0, strip2 = 0; bool prox = false; + bool wrench = false, keyboard = false, mute_touch = false, menu = false, + info = false; /* pad packets. Works as a second tool and is always in prox */ if (!(data[0] == WACOM_REPORT_INTUOSPAD || data[0] == WACOM_REPORT_INTUOS5PAD || @@ -512,10 +514,32 @@ static int wacom_intuos_pad(struct wacom_wac *wacom) keys = ((data[3] & 0x1C) ? 1<<2 : 0) | ((data[4] & 0xE0) ? 1<<1 : 0) | ((data[4] & 0x07) ? 1<<0 : 0); + keyboard = !!(data[4] & 0xE0); + info = !!(data[3] & 0x1C); + + if (features->oPid) { + mute_touch = !!(data[4] & 0x07); + if (mute_touch) + wacom->shared->is_touch_on = + !wacom->shared->is_touch_on; + } else { + wrench = !!(data[4] & 0x07); + } } else if (features->type == WACOM_27QHD) { nkeys = 3; keys = data[2] & 0x07; + wrench = !!(data[2] & 0x01); + keyboard = !!(data[2] & 0x02); + + if (features->oPid) { + mute_touch = !!(data[2] & 0x04); + if (mute_touch) + wacom->shared->is_touch_on = + !wacom->shared->is_touch_on; + } else { + menu = !!(data[2] & 0x04); + } input_report_abs(input, ABS_X, be16_to_cpup((__be16 *)&data[4])); input_report_abs(input, ABS_Y, be16_to_cpup((__be16 *)&data[6])); input_report_abs(input, ABS_Z, be16_to_cpup((__be16 *)&data[8])); @@ -561,6 +585,9 @@ static int wacom_intuos_pad(struct wacom_wac *wacom) if (features->type == WACOM_22HD) { nkeys = 3; keys = data[9] & 0x07; + + info = !!(data[9] & 0x01); + wrench = !!(data[9] & 0x02); } } else { buttons = ((data[6] & 0x10) << 5) | @@ -580,6 +607,18 @@ static int wacom_intuos_pad(struct wacom_wac *wacom) for (i = 0; i < nkeys; i++) input_report_key(input, KEY_PROG1 + i, keys & (1 << i)); + input_report_key(input, KEY_BUTTONCONFIG, wrench); + input_report_key(input, KEY_ONSCREEN_KEYBOARD, keyboard); + input_report_key(input, KEY_CONTROLPANEL, menu); + input_report_key(input, KEY_INFO, info); + + if (wacom->shared && wacom->shared->touch_input) { + input_report_switch(wacom->shared->touch_input, + SW_MUTE_DEVICE, + !wacom->shared->is_touch_on); + input_sync(wacom->shared->touch_input); + } + input_report_abs(input, ABS_RX, strip1); input_report_abs(input, ABS_RY, strip2); @@ -1480,6 +1519,12 @@ static int wacom_24hdt_irq(struct wacom_wac *wacom) int byte_per_packet = WACOM_BYTES_PER_24HDT_PACKET; int y_offset = 2; + if (wacom->shared->has_mute_touch_switch && + !wacom->shared->is_touch_on) { + if (!wacom->shared->touch_down) + return 0; + } + if (wacom->features.type == WACOM_27QHDT) { current_num_contacts = data[63]; num_contacts_left = 10; @@ -3812,6 +3857,14 @@ int wacom_setup_touch_input_capabilities(struct input_dev *input_dev, /* fall through */ case WACOM_27QHDT: + if (wacom_wac->shared->touch->product == 0x32C || + wacom_wac->shared->touch->product == 0xF6) { + input_dev->evbit[0] |= BIT_MASK(EV_SW); + __set_bit(SW_MUTE_DEVICE, input_dev->swbit); + wacom_wac->shared->has_mute_touch_switch = true; + } + /* fall through */ + case MTSCREEN: case MTTPC: case MTTPC_B: @@ -4047,6 +4100,12 @@ int wacom_setup_pad_input_capabilities(struct input_dev *input_dev, __set_bit(KEY_PROG2, input_dev->keybit); __set_bit(KEY_PROG3, input_dev->keybit); + __set_bit(KEY_ONSCREEN_KEYBOARD, input_dev->keybit); + __set_bit(KEY_INFO, input_dev->keybit); + + if (!features->oPid) + __set_bit(KEY_BUTTONCONFIG, input_dev->keybit); + input_set_abs_params(input_dev, ABS_WHEEL, 0, 71, 0, 0); input_set_abs_params(input_dev, ABS_THROTTLE, 0, 71, 0, 0); break; @@ -4055,6 +4114,12 @@ int wacom_setup_pad_input_capabilities(struct input_dev *input_dev, __set_bit(KEY_PROG1, input_dev->keybit); __set_bit(KEY_PROG2, input_dev->keybit); __set_bit(KEY_PROG3, input_dev->keybit); + + __set_bit(KEY_ONSCREEN_KEYBOARD, input_dev->keybit); + __set_bit(KEY_BUTTONCONFIG, input_dev->keybit); + + if (!features->oPid) + __set_bit(KEY_CONTROLPANEL, input_dev->keybit); input_set_abs_params(input_dev, ABS_X, -2048, 2048, 0, 0); input_abs_set_res(input_dev, ABS_X, 1024); /* points/g */ input_set_abs_params(input_dev, ABS_Y, -2048, 2048, 0, 0); @@ -4068,6 +4133,9 @@ int wacom_setup_pad_input_capabilities(struct input_dev *input_dev, __set_bit(KEY_PROG1, input_dev->keybit); __set_bit(KEY_PROG2, input_dev->keybit); __set_bit(KEY_PROG3, input_dev->keybit); + + __set_bit(KEY_BUTTONCONFIG, input_dev->keybit); + __set_bit(KEY_INFO, input_dev->keybit); /* fall through */ case WACOM_21UX2: From 36bdd9056b6a83d573ffdde282a5a91ce734c536 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Aug 2019 18:39:25 -0400 Subject: [PATCH 136/721] xprtrdma: Fix calculation of ri_max_segs again Commit 302d3deb206 ("xprtrdma: Prevent inline overflow") added this calculation back in 2016, but got it wrong. I tested only the lower bound, which is why there is a max_t there. The upper bound should be rounded up too. Now, when using DIV_ROUND_UP, that takes care of the lower bound as well. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/frwr_ops.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index a30f2ae49578..3a10bfff2125 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -260,8 +260,8 @@ int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep) ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS; ep->rep_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */ - ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS / - ia->ri_max_frwr_depth); + ia->ri_max_segs = + DIV_ROUND_UP(RPCRDMA_MAX_DATA_SEGS, ia->ri_max_frwr_depth); /* Reply chunks require segments for head and tail buffers */ ia->ri_max_segs += 2; if (ia->ri_max_segs > RPCRDMA_MAX_HDR_SEGS) From f3c66a2f56683a20ced4e700a8167d6b357641da Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Aug 2019 18:40:11 -0400 Subject: [PATCH 137/721] xprtrdma: Boost maximum transport header size Although I haven't seen any performance results that justify it, I've received several complaints that NFS/RDMA no longer supports a maximum rsize and wsize of 1MB. These days it is somewhat smaller. To simplify the logic that determines whether a chunk list is necessary, the implementation uses a fixed maximum size of the transport header. Currently that maximum size is 256 bytes, one quarter of the default inline threshold size for RPC/RDMA v1. Since commit a78868497c2e ("xprtrdma: Reduce max_frwr_depth"), the size of chunks is also smaller to take advantage of inline page lists in device internal MR data structures. The combination of these two design choices has reduced the maximum NFS rsize and wsize that can be used for most RNIC/HCAs. Increasing the maximum transport header size and the maximum number of RDMA segments it can contain increases the negotiated maximum rsize/wsize on common RNIC/HCAs. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 9 ++++++++- net/sunrpc/xprtrdma/xprt_rdma.h | 23 ++++++++++------------- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 805b1f35e1ca..e639ea0faf19 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include @@ -1000,12 +1001,18 @@ struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, size_t size, struct rpcrdma_buffer *buffer = &r_xprt->rx_buf; struct rpcrdma_regbuf *rb; struct rpcrdma_req *req; + size_t maxhdrsize; req = kzalloc(sizeof(*req), flags); if (req == NULL) goto out1; - rb = rpcrdma_regbuf_alloc(RPCRDMA_HDRBUF_SIZE, DMA_TO_DEVICE, flags); + /* Compute maximum header buffer size in bytes */ + maxhdrsize = rpcrdma_fixed_maxsz + 3 + + r_xprt->rx_ia.ri_max_segs * rpcrdma_readchunk_maxsz; + maxhdrsize *= sizeof(__be32); + rb = rpcrdma_regbuf_alloc(__roundup_pow_of_two(maxhdrsize), + DMA_TO_DEVICE, flags); if (!rb) goto out2; req->rl_rdmabuf = rb; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 3b2f2041e889..eaf6b907a76e 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -155,25 +155,22 @@ static inline void *rdmab_data(const struct rpcrdma_regbuf *rb) /* To ensure a transport can always make forward progress, * the number of RDMA segments allowed in header chunk lists - * is capped at 8. This prevents less-capable devices and - * memory registrations from overrunning the Send buffer - * while building chunk lists. + * is capped at 16. This prevents less-capable devices from + * overrunning the Send buffer while building chunk lists. * * Elements of the Read list take up more room than the - * Write list or Reply chunk. 8 read segments means the Read - * list (or Write list or Reply chunk) cannot consume more - * than + * Write list or Reply chunk. 16 read segments means the + * chunk lists cannot consume more than * - * ((8 + 2) * read segment size) + 1 XDR words, or 244 bytes. + * ((16 + 2) * read segment size) + 1 XDR words, * - * And the fixed part of the header is another 24 bytes. - * - * The smallest inline threshold is 1024 bytes, ensuring that - * at least 750 bytes are available for RPC messages. + * or about 400 bytes. The fixed part of the header is + * another 24 bytes. Thus when the inline threshold is + * 1024 bytes, at least 600 bytes are available for RPC + * message bodies. */ enum { - RPCRDMA_MAX_HDR_SEGS = 8, - RPCRDMA_HDRBUF_SIZE = 256, + RPCRDMA_MAX_HDR_SEGS = 16, }; /* From aeaed4848234c97fb720b0e51a0c56dc8de0eeed Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Aug 2019 18:40:58 -0400 Subject: [PATCH 138/721] xprtrdma: Boost client's max slot table size to match Linux server I've heard rumors of an NFS/RDMA server implementation that has a default credit limit of 1024. The client's default setting remains at 128. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xprtrdma.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h index 86fc38ff0355..16c239e0d6dd 100644 --- a/include/linux/sunrpc/xprtrdma.h +++ b/include/linux/sunrpc/xprtrdma.h @@ -49,9 +49,9 @@ * fully-chunked NFS message (read chunks are the largest). Note only * a single chunk type per message is supported currently. */ -#define RPCRDMA_MIN_SLOT_TABLE (2U) +#define RPCRDMA_MIN_SLOT_TABLE (4U) #define RPCRDMA_DEF_SLOT_TABLE (128U) -#define RPCRDMA_MAX_SLOT_TABLE (256U) +#define RPCRDMA_MAX_SLOT_TABLE (16384U) #define RPCRDMA_MIN_INLINE (1024) /* min inline thresh */ #define RPCRDMA_DEF_INLINE (4096) /* default inline thresh */ From 9b543419652917f048310d0863c47c107c26fb0c Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 19 Aug 2019 15:41:00 +0300 Subject: [PATCH 139/721] Tools: hv: move to tools buildsystem There is a nice buildsystem dedicated for userspace tools in Linux kernel tree. Switch Hyper-V daemons to be built by it. Cc: Vitaly Kuznetsov Signed-off-by: Andy Shevchenko Tested-by: Vitaly Kuznetsov Signed-off-by: Sasha Levin --- tools/hv/Build | 3 +++ tools/hv/Makefile | 51 +++++++++++++++++++++++++++++++++++++---------- 2 files changed, 44 insertions(+), 10 deletions(-) create mode 100644 tools/hv/Build diff --git a/tools/hv/Build b/tools/hv/Build new file mode 100644 index 000000000000..6cf51fa4b306 --- /dev/null +++ b/tools/hv/Build @@ -0,0 +1,3 @@ +hv_kvp_daemon-y += hv_kvp_daemon.o +hv_vss_daemon-y += hv_vss_daemon.o +hv_fcopy_daemon-y += hv_fcopy_daemon.o diff --git a/tools/hv/Makefile b/tools/hv/Makefile index 5db5e62cebda..b57143d9459c 100644 --- a/tools/hv/Makefile +++ b/tools/hv/Makefile @@ -1,28 +1,55 @@ # SPDX-License-Identifier: GPL-2.0 # Makefile for Hyper-V tools - -WARNINGS = -Wall -Wextra -CFLAGS = $(WARNINGS) -g $(shell getconf LFS_CFLAGS) - -CFLAGS += -D__EXPORTED_HEADERS__ -I../../include/uapi -I../../include +include ../scripts/Makefile.include sbindir ?= /usr/sbin libexecdir ?= /usr/libexec sharedstatedir ?= /var/lib -ALL_PROGRAMS := hv_kvp_daemon hv_vss_daemon hv_fcopy_daemon +ifeq ($(srctree),) +srctree := $(patsubst %/,%,$(dir $(CURDIR))) +srctree := $(patsubst %/,%,$(dir $(srctree))) +endif + +# Do not use make's built-in rules +# (this improves performance and avoids hard-to-debug behaviour); +MAKEFLAGS += -r + +override CFLAGS += -O2 -Wall -g -D_GNU_SOURCE -I$(OUTPUT)include + +ALL_TARGETS := hv_kvp_daemon hv_vss_daemon hv_fcopy_daemon +ALL_PROGRAMS := $(patsubst %,$(OUTPUT)%,$(ALL_TARGETS)) ALL_SCRIPTS := hv_get_dhcp_info.sh hv_get_dns_info.sh hv_set_ifconfig.sh all: $(ALL_PROGRAMS) -%: %.c - $(CC) $(CFLAGS) -o $@ $^ +export srctree OUTPUT CC LD CFLAGS +include $(srctree)/tools/build/Makefile.include + +HV_KVP_DAEMON_IN := $(OUTPUT)hv_kvp_daemon-in.o +$(HV_KVP_DAEMON_IN): FORCE + $(Q)$(MAKE) $(build)=hv_kvp_daemon +$(OUTPUT)hv_kvp_daemon: $(HV_KVP_DAEMON_IN) + $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) $< -o $@ + +HV_VSS_DAEMON_IN := $(OUTPUT)hv_vss_daemon-in.o +$(HV_VSS_DAEMON_IN): FORCE + $(Q)$(MAKE) $(build)=hv_vss_daemon +$(OUTPUT)hv_vss_daemon: $(HV_VSS_DAEMON_IN) + $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) $< -o $@ + +HV_FCOPY_DAEMON_IN := $(OUTPUT)hv_fcopy_daemon-in.o +$(HV_FCOPY_DAEMON_IN): FORCE + $(Q)$(MAKE) $(build)=hv_fcopy_daemon +$(OUTPUT)hv_fcopy_daemon: $(HV_FCOPY_DAEMON_IN) + $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) $< -o $@ clean: - $(RM) hv_kvp_daemon hv_vss_daemon hv_fcopy_daemon + rm -f $(ALL_PROGRAMS) + find $(if $(OUTPUT),$(OUTPUT),.) -name '*.o' -delete -o -name '\.*.d' -delete -install: all +install: $(ALL_PROGRAMS) install -d -m 755 $(DESTDIR)$(sbindir); \ install -d -m 755 $(DESTDIR)$(libexecdir)/hypervkvpd; \ install -d -m 755 $(DESTDIR)$(sharedstatedir); \ @@ -33,3 +60,7 @@ install: all for script in $(ALL_SCRIPTS); do \ install $$script -m 755 $(DESTDIR)$(libexecdir)/hypervkvpd/$${script%.sh}; \ done + +FORCE: + +.PHONY: all install clean FORCE prepare From 2dfdcd88cf0ea66eec0478de82283ef20eb6f421 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Aug 2019 18:41:44 -0400 Subject: [PATCH 140/721] xprtrdma: Rename CQE field in Receive trace points Make the field name the same for all trace points that handle pointers to struct rpcrdma_rep. That makes it easy to grep for matching rep points in trace output. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 21 +++++++++++---------- net/sunrpc/xprtrdma/verbs.c | 2 +- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index f6a4eaa85a3e..6e6055eb67e7 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -623,21 +623,21 @@ TRACE_EVENT(xprtrdma_post_send, TRACE_EVENT(xprtrdma_post_recv, TP_PROTO( - const struct ib_cqe *cqe + const struct rpcrdma_rep *rep ), - TP_ARGS(cqe), + TP_ARGS(rep), TP_STRUCT__entry( - __field(const void *, cqe) + __field(const void *, rep) ), TP_fast_assign( - __entry->cqe = cqe; + __entry->rep = rep; ), - TP_printk("cqe=%p", - __entry->cqe + TP_printk("rep=%p", + __entry->rep ) ); @@ -715,14 +715,15 @@ TRACE_EVENT(xprtrdma_wc_receive, TP_ARGS(wc), TP_STRUCT__entry( - __field(const void *, cqe) + __field(const void *, rep) __field(u32, byte_len) __field(unsigned int, status) __field(u32, vendor_err) ), TP_fast_assign( - __entry->cqe = wc->wr_cqe; + __entry->rep = container_of(wc->wr_cqe, struct rpcrdma_rep, + rr_cqe); __entry->status = wc->status; if (wc->status) { __entry->byte_len = 0; @@ -733,8 +734,8 @@ TRACE_EVENT(xprtrdma_wc_receive, } ), - TP_printk("cqe=%p %u bytes: %s (%u/0x%x)", - __entry->cqe, __entry->byte_len, + TP_printk("rep=%p %u bytes: %s (%u/0x%x)", + __entry->rep, __entry->byte_len, rdma_show_wc_status(__entry->status), __entry->status, __entry->vendor_err ) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index e639ea0faf19..3c275a7a4e4c 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1531,7 +1531,7 @@ rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf)) goto release_wrs; - trace_xprtrdma_post_recv(rep->rr_recv_wr.wr_cqe); + trace_xprtrdma_post_recv(rep); ++count; } From eed48a9c161588544999ee8d451392921d846ee8 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Aug 2019 18:42:31 -0400 Subject: [PATCH 141/721] xprtrdma: Rename rpcrdma_buffer::rb_all Clean up: There are other "all" list heads. For code clarity distinguish this one as for use only for MRs by renaming it. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 26 ++++++++------------------ net/sunrpc/xprtrdma/xprt_rdma.h | 2 +- 2 files changed, 9 insertions(+), 19 deletions(-) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 3c275a7a4e4c..e004873cc4f0 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -944,8 +944,6 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_ia *ia = &r_xprt->rx_ia; unsigned int count; - LIST_HEAD(free); - LIST_HEAD(all); for (count = 0; count < ia->ri_max_segs; count++) { struct rpcrdma_mr *mr; @@ -963,15 +961,13 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) mr->mr_xprt = r_xprt; - list_add(&mr->mr_list, &free); - list_add(&mr->mr_all, &all); + spin_lock(&buf->rb_mrlock); + list_add(&mr->mr_list, &buf->rb_mrs); + list_add(&mr->mr_all, &buf->rb_all_mrs); + spin_unlock(&buf->rb_mrlock); } - spin_lock(&buf->rb_mrlock); - list_splice(&free, &buf->rb_mrs); - list_splice(&all, &buf->rb_all); r_xprt->rx_stats.mrs_allocated += count; - spin_unlock(&buf->rb_mrlock); trace_xprtrdma_createmrs(r_xprt, count); } @@ -1089,7 +1085,7 @@ int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) spin_lock_init(&buf->rb_mrlock); spin_lock_init(&buf->rb_lock); INIT_LIST_HEAD(&buf->rb_mrs); - INIT_LIST_HEAD(&buf->rb_all); + INIT_LIST_HEAD(&buf->rb_all_mrs); INIT_DELAYED_WORK(&buf->rb_refresh_worker, rpcrdma_mr_refresh_worker); @@ -1156,24 +1152,18 @@ rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf) count = 0; spin_lock(&buf->rb_mrlock); - while (!list_empty(&buf->rb_all)) { - mr = list_entry(buf->rb_all.next, struct rpcrdma_mr, mr_all); + while ((mr = list_first_entry_or_null(&buf->rb_all_mrs, + struct rpcrdma_mr, + mr_all)) != NULL) { list_del(&mr->mr_all); - spin_unlock(&buf->rb_mrlock); - /* Ensure MW is not on any rl_registered list */ - if (!list_empty(&mr->mr_list)) - list_del(&mr->mr_list); - frwr_release_mr(mr); count++; spin_lock(&buf->rb_mrlock); } spin_unlock(&buf->rb_mrlock); r_xprt->rx_stats.mrs_allocated = 0; - - dprintk("RPC: %s: released %u MRs\n", __func__, count); } /** diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index eaf6b907a76e..5aaa53b8ae12 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -360,7 +360,7 @@ rpcrdma_mr_pop(struct list_head *list) struct rpcrdma_buffer { spinlock_t rb_mrlock; /* protect rb_mrs list */ struct list_head rb_mrs; - struct list_head rb_all; + struct list_head rb_all_mrs; unsigned long rb_sc_head; unsigned long rb_sc_tail; From 395790566eec37706dedeb94779045adc3a7581e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Aug 2019 18:43:17 -0400 Subject: [PATCH 142/721] xprtrdma: Toggle XPRT_CONGESTED in xprtrdma's slot methods Commit 48be539dd44a ("xprtrdma: Introduce ->alloc_slot call-out for xprtrdma") added a separate alloc_slot and free_slot to the RPC/RDMA transport. Later, commit 75891f502f5f ("SUNRPC: Support for congestion control when queuing is enabled") modified the generic alloc/free_slot methods, but neglected the methods in xprtrdma. Found via code review. Fixes: 75891f502f5f ("SUNRPC: Support for congestion control ... ") Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/transport.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 2ec349ed4770..f4763e8a6761 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -571,6 +571,7 @@ xprt_rdma_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task) return; out_sleep: + set_bit(XPRT_CONGESTED, &xprt->state); rpc_sleep_on(&xprt->backlog, task, NULL); task->tk_status = -EAGAIN; } @@ -589,7 +590,8 @@ xprt_rdma_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *rqst) memset(rqst, 0, sizeof(*rqst)); rpcrdma_buffer_put(&r_xprt->rx_buf, rpcr_to_rdmar(rqst)); - rpc_wake_up_next(&xprt->backlog); + if (unlikely(!rpc_wake_up_next(&xprt->backlog))) + clear_bit(XPRT_CONGESTED, &xprt->state); } static bool rpcrdma_check_regbuf(struct rpcrdma_xprt *r_xprt, From 8bdfa145f58220796abddc93b5038efa6e7e90fc Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Tue, 13 Aug 2019 14:45:11 -0600 Subject: [PATCH 143/721] PCI: sysfs: Define device attributes with DEVICE_ATTR*() Device attributes should be defined using DEVICE_ATTR*(_name, _mode, _show, _store). Convert them all from __ATTR*() to DEVICE_ATTR*(), e.g., - struct device_attribute dev_attr_##_name = __ATTR(_name, _mode, _show, _store) + static DEVICE_ATTR(foo, S_IWUSR | S_IRUGO, show_foo, store_foo) Link: https://lore.kernel.org/r/20190813204513.4790-2-skunberg.kelsey@gmail.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas Reviewed-by: Greg Kroah-Hartman Reviewed-by: Donald Dutile --- drivers/pci/pci-sysfs.c | 59 +++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 32 deletions(-) diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 965c72104150..8af7944fdccb 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -464,9 +464,7 @@ static ssize_t dev_rescan_store(struct device *dev, } return count; } -static struct device_attribute dev_rescan_attr = __ATTR(rescan, - (S_IWUSR|S_IWGRP), - NULL, dev_rescan_store); +static DEVICE_ATTR(rescan, (S_IWUSR | S_IWGRP), NULL, dev_rescan_store); static ssize_t remove_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) @@ -480,9 +478,8 @@ static ssize_t remove_store(struct device *dev, struct device_attribute *attr, pci_stop_and_remove_bus_device_locked(to_pci_dev(dev)); return count; } -static struct device_attribute dev_remove_attr = __ATTR_IGNORE_LOCKDEP(remove, - (S_IWUSR|S_IWGRP), - NULL, remove_store); +static DEVICE_ATTR_IGNORE_LOCKDEP(remove, (S_IWUSR | S_IWGRP), NULL, + remove_store); static ssize_t dev_bus_rescan_store(struct device *dev, struct device_attribute *attr, @@ -504,7 +501,7 @@ static ssize_t dev_bus_rescan_store(struct device *dev, } return count; } -static DEVICE_ATTR(rescan, (S_IWUSR|S_IWGRP), NULL, dev_bus_rescan_store); +static DEVICE_ATTR(bus_rescan, (S_IWUSR | S_IWGRP), NULL, dev_bus_rescan_store); #if defined(CONFIG_PM) && defined(CONFIG_ACPI) static ssize_t d3cold_allowed_store(struct device *dev, @@ -687,16 +684,14 @@ static ssize_t sriov_drivers_autoprobe_store(struct device *dev, return count; } -static struct device_attribute sriov_totalvfs_attr = __ATTR_RO(sriov_totalvfs); -static struct device_attribute sriov_numvfs_attr = - __ATTR(sriov_numvfs, (S_IRUGO|S_IWUSR|S_IWGRP), - sriov_numvfs_show, sriov_numvfs_store); -static struct device_attribute sriov_offset_attr = __ATTR_RO(sriov_offset); -static struct device_attribute sriov_stride_attr = __ATTR_RO(sriov_stride); -static struct device_attribute sriov_vf_device_attr = __ATTR_RO(sriov_vf_device); -static struct device_attribute sriov_drivers_autoprobe_attr = - __ATTR(sriov_drivers_autoprobe, (S_IRUGO|S_IWUSR|S_IWGRP), - sriov_drivers_autoprobe_show, sriov_drivers_autoprobe_store); +static DEVICE_ATTR_RO(sriov_totalvfs); +static DEVICE_ATTR(sriov_numvfs, (S_IRUGO | S_IWUSR | S_IWGRP), + sriov_numvfs_show, sriov_numvfs_store); +static DEVICE_ATTR_RO(sriov_offset); +static DEVICE_ATTR_RO(sriov_stride); +static DEVICE_ATTR_RO(sriov_vf_device); +static DEVICE_ATTR(sriov_drivers_autoprobe, (S_IRUGO | S_IWUSR | S_IWGRP), + sriov_drivers_autoprobe_show, sriov_drivers_autoprobe_store); #endif /* CONFIG_PCI_IOV */ static ssize_t driver_override_store(struct device *dev, @@ -792,7 +787,7 @@ static struct attribute *pcie_dev_attrs[] = { }; static struct attribute *pcibus_attrs[] = { - &dev_attr_rescan.attr, + &dev_attr_bus_rescan.attr, &dev_attr_cpuaffinity.attr, &dev_attr_cpulistaffinity.attr, NULL, @@ -820,7 +815,7 @@ static ssize_t boot_vga_show(struct device *dev, struct device_attribute *attr, !!(pdev->resource[PCI_ROM_RESOURCE].flags & IORESOURCE_ROM_SHADOW)); } -static struct device_attribute vga_attr = __ATTR_RO(boot_vga); +static DEVICE_ATTR_RO(boot_vga); static ssize_t pci_read_config(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, @@ -1458,7 +1453,7 @@ static ssize_t reset_store(struct device *dev, struct device_attribute *attr, return count; } -static struct device_attribute reset_attr = __ATTR(reset, 0200, NULL, reset_store); +static DEVICE_ATTR(reset, 0200, NULL, reset_store); static int pci_create_capabilities_sysfs(struct pci_dev *dev) { @@ -1468,7 +1463,7 @@ static int pci_create_capabilities_sysfs(struct pci_dev *dev) pcie_aspm_create_sysfs_dev_files(dev); if (dev->reset_fn) { - retval = device_create_file(&dev->dev, &reset_attr); + retval = device_create_file(&dev->dev, &dev_attr_reset); if (retval) goto error; } @@ -1553,7 +1548,7 @@ static void pci_remove_capabilities_sysfs(struct pci_dev *dev) pcie_vpd_remove_sysfs_dev_files(dev); pcie_aspm_remove_sysfs_dev_files(dev); if (dev->reset_fn) { - device_remove_file(&dev->dev, &reset_attr); + device_remove_file(&dev->dev, &dev_attr_reset); dev->reset_fn = 0; } } @@ -1606,7 +1601,7 @@ static int __init pci_sysfs_init(void) late_initcall(pci_sysfs_init); static struct attribute *pci_dev_dev_attrs[] = { - &vga_attr.attr, + &dev_attr_boot_vga.attr, NULL, }; @@ -1616,7 +1611,7 @@ static umode_t pci_dev_attrs_are_visible(struct kobject *kobj, struct device *dev = kobj_to_dev(kobj); struct pci_dev *pdev = to_pci_dev(dev); - if (a == &vga_attr.attr) + if (a == &dev_attr_boot_vga.attr) if ((pdev->class >> 8) != PCI_CLASS_DISPLAY_VGA) return 0; @@ -1624,8 +1619,8 @@ static umode_t pci_dev_attrs_are_visible(struct kobject *kobj, } static struct attribute *pci_dev_hp_attrs[] = { - &dev_remove_attr.attr, - &dev_rescan_attr.attr, + &dev_attr_remove.attr, + &dev_attr_rescan.attr, NULL, }; @@ -1699,12 +1694,12 @@ static const struct attribute_group pci_dev_hp_attr_group = { #ifdef CONFIG_PCI_IOV static struct attribute *sriov_dev_attrs[] = { - &sriov_totalvfs_attr.attr, - &sriov_numvfs_attr.attr, - &sriov_offset_attr.attr, - &sriov_stride_attr.attr, - &sriov_vf_device_attr.attr, - &sriov_drivers_autoprobe_attr.attr, + &dev_attr_sriov_totalvfs.attr, + &dev_attr_sriov_numvfs.attr, + &dev_attr_sriov_offset.attr, + &dev_attr_sriov_stride.attr, + &dev_attr_sriov_vf_device.attr, + &dev_attr_sriov_drivers_autoprobe.attr, NULL, }; From 4e2b79436e4f22b225ee445832705bb7c4675807 Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Thu, 15 Aug 2019 09:33:52 -0600 Subject: [PATCH 144/721] PCI: sysfs: Change DEVICE_ATTR() to DEVICE_ATTR_WO() DEVICE_ATTR() should only be used when files have unusual permissions. Change DEVICE_ATTR() with '0220' write-only permissions to DEVICE_ATTR_WO(), e.g., - static DEVICE_ATTR(_name, (S_IWUSR | S_IWGRP), NULL, _store); + static DEVICE_ATTR_WO(_name); Since _store is no longer passed, make the _name passed by DEVICE_ATTR_WO() and the related _name##_store() name match with each other, e.g., DEVICE_ATTR_WO(bus_rescan) must be able to call bus_rescan_store() Link: https://lore.kernel.org/r/20190815153352.86143-4-skunberg.kelsey@gmail.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas Reviewed-by: Greg Kroah-Hartman Reviewed-by: Donald Dutile --- drivers/pci/pci-sysfs.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 8af7944fdccb..4fe988d54807 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -464,7 +464,7 @@ static ssize_t dev_rescan_store(struct device *dev, } return count; } -static DEVICE_ATTR(rescan, (S_IWUSR | S_IWGRP), NULL, dev_rescan_store); +static DEVICE_ATTR_WO(dev_rescan); static ssize_t remove_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) @@ -481,9 +481,9 @@ static ssize_t remove_store(struct device *dev, struct device_attribute *attr, static DEVICE_ATTR_IGNORE_LOCKDEP(remove, (S_IWUSR | S_IWGRP), NULL, remove_store); -static ssize_t dev_bus_rescan_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) +static ssize_t bus_rescan_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) { unsigned long val; struct pci_bus *bus = to_pci_bus(dev); @@ -501,7 +501,7 @@ static ssize_t dev_bus_rescan_store(struct device *dev, } return count; } -static DEVICE_ATTR(bus_rescan, (S_IWUSR | S_IWGRP), NULL, dev_bus_rescan_store); +static DEVICE_ATTR_WO(bus_rescan); #if defined(CONFIG_PM) && defined(CONFIG_ACPI) static ssize_t d3cold_allowed_store(struct device *dev, @@ -1620,7 +1620,7 @@ static umode_t pci_dev_attrs_are_visible(struct kobject *kobj, static struct attribute *pci_dev_hp_attrs[] = { &dev_attr_remove.attr, - &dev_attr_rescan.attr, + &dev_attr_dev_rescan.attr, NULL, }; From e2154044dd4168bc25c70170dfa6179b57f63914 Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Tue, 13 Aug 2019 14:45:12 -0600 Subject: [PATCH 145/721] PCI: sysfs: Change permissions from symbolic to octal We prefer octal permissions over symbolic permissions such as "(S_IWUSR | S_IWGRP)". Change all symbolic permissions to octal permissions, e.g., - (S_IWUSR | S_IWGRP) + 0220 Link: https://lore.kernel.org/r/20190813204513.4790-3-skunberg.kelsey@gmail.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas Reviewed-by: Greg Kroah-Hartman Reviewed-by: Donald Dutile --- drivers/pci/pci-sysfs.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 4fe988d54807..5bb301efec98 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -478,7 +478,7 @@ static ssize_t remove_store(struct device *dev, struct device_attribute *attr, pci_stop_and_remove_bus_device_locked(to_pci_dev(dev)); return count; } -static DEVICE_ATTR_IGNORE_LOCKDEP(remove, (S_IWUSR | S_IWGRP), NULL, +static DEVICE_ATTR_IGNORE_LOCKDEP(remove, 0220, NULL, remove_store); static ssize_t bus_rescan_store(struct device *dev, @@ -685,13 +685,12 @@ static ssize_t sriov_drivers_autoprobe_store(struct device *dev, } static DEVICE_ATTR_RO(sriov_totalvfs); -static DEVICE_ATTR(sriov_numvfs, (S_IRUGO | S_IWUSR | S_IWGRP), - sriov_numvfs_show, sriov_numvfs_store); +static DEVICE_ATTR(sriov_numvfs, 0664, sriov_numvfs_show, sriov_numvfs_store); static DEVICE_ATTR_RO(sriov_offset); static DEVICE_ATTR_RO(sriov_stride); static DEVICE_ATTR_RO(sriov_vf_device); -static DEVICE_ATTR(sriov_drivers_autoprobe, (S_IRUGO | S_IWUSR | S_IWGRP), - sriov_drivers_autoprobe_show, sriov_drivers_autoprobe_store); +static DEVICE_ATTR(sriov_drivers_autoprobe, 0664, sriov_drivers_autoprobe_show, + sriov_drivers_autoprobe_store); #endif /* CONFIG_PCI_IOV */ static ssize_t driver_override_store(struct device *dev, @@ -1080,7 +1079,7 @@ void pci_create_legacy_files(struct pci_bus *b) sysfs_bin_attr_init(b->legacy_io); b->legacy_io->attr.name = "legacy_io"; b->legacy_io->size = 0xffff; - b->legacy_io->attr.mode = S_IRUSR | S_IWUSR; + b->legacy_io->attr.mode = 0600; b->legacy_io->read = pci_read_legacy_io; b->legacy_io->write = pci_write_legacy_io; b->legacy_io->mmap = pci_mmap_legacy_io; @@ -1094,7 +1093,7 @@ void pci_create_legacy_files(struct pci_bus *b) sysfs_bin_attr_init(b->legacy_mem); b->legacy_mem->attr.name = "legacy_mem"; b->legacy_mem->size = 1024*1024; - b->legacy_mem->attr.mode = S_IRUSR | S_IWUSR; + b->legacy_mem->attr.mode = 0600; b->legacy_mem->mmap = pci_mmap_legacy_mem; pci_adjust_legacy_attr(b, pci_mmap_mem); error = device_create_bin_file(&b->dev, b->legacy_mem); @@ -1301,7 +1300,7 @@ static int pci_create_attr(struct pci_dev *pdev, int num, int write_combine) } } res_attr->attr.name = res_attr_name; - res_attr->attr.mode = S_IRUSR | S_IWUSR; + res_attr->attr.mode = 0600; res_attr->size = pci_resource_len(pdev, num); res_attr->private = (void *)(unsigned long)num; retval = sysfs_create_bin_file(&pdev->dev.kobj, res_attr); @@ -1414,7 +1413,7 @@ static ssize_t pci_read_rom(struct file *filp, struct kobject *kobj, static const struct bin_attribute pci_config_attr = { .attr = { .name = "config", - .mode = S_IRUGO | S_IWUSR, + .mode = 0644, }, .size = PCI_CFG_SPACE_SIZE, .read = pci_read_config, @@ -1424,7 +1423,7 @@ static const struct bin_attribute pci_config_attr = { static const struct bin_attribute pcie_config_attr = { .attr = { .name = "config", - .mode = S_IRUGO | S_IWUSR, + .mode = 0644, }, .size = PCI_CFG_SPACE_EXP_SIZE, .read = pci_read_config, @@ -1506,7 +1505,7 @@ int __must_check pci_create_sysfs_dev_files(struct pci_dev *pdev) sysfs_bin_attr_init(attr); attr->size = rom_size; attr->attr.name = "rom"; - attr->attr.mode = S_IRUSR | S_IWUSR; + attr->attr.mode = 0600; attr->read = pci_read_rom; attr->write = pci_write_rom; retval = sysfs_create_bin_file(&pdev->dev.kobj, attr); From aaee0c1ffd6399d291b030b49d622b81dd5071c5 Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Tue, 13 Aug 2019 14:45:13 -0600 Subject: [PATCH 146/721] PCI/IOV: Move sysfs SR-IOV functions to iov.c The sysfs SR-IOV functions are only needed when the kernel is built with SR-IOV support. Rather than put them in pci-sysfs.c under #ifdef CONFIG_PCI_IOV, move them to iov.c, which is only compiled when CONFIG_PCI_IOV=y. Link: https://lore.kernel.org/r/20190813204513.4790-4-skunberg.kelsey@gmail.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas Reviewed-by: Greg Kroah-Hartman Reviewed-by: Donald Dutile Reviewed-by: Kuppuswamy Sathyanarayanan --- drivers/pci/iov.c | 168 ++++++++++++++++++++++++++++++++++++++ drivers/pci/pci-sysfs.c | 173 ---------------------------------------- drivers/pci/pci.h | 2 +- 3 files changed, 169 insertions(+), 174 deletions(-) diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c index 525fd3f272b3..053054362247 100644 --- a/drivers/pci/iov.c +++ b/drivers/pci/iov.c @@ -240,6 +240,174 @@ void pci_iov_remove_virtfn(struct pci_dev *dev, int id) pci_dev_put(dev); } +static ssize_t sriov_totalvfs_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct pci_dev *pdev = to_pci_dev(dev); + + return sprintf(buf, "%u\n", pci_sriov_get_totalvfs(pdev)); +} + +static ssize_t sriov_numvfs_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct pci_dev *pdev = to_pci_dev(dev); + + return sprintf(buf, "%u\n", pdev->sriov->num_VFs); +} + +/* + * num_vfs > 0; number of VFs to enable + * num_vfs = 0; disable all VFs + * + * Note: SRIOV spec does not allow partial VF + * disable, so it's all or none. + */ +static ssize_t sriov_numvfs_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct pci_dev *pdev = to_pci_dev(dev); + int ret; + u16 num_vfs; + + ret = kstrtou16(buf, 0, &num_vfs); + if (ret < 0) + return ret; + + if (num_vfs > pci_sriov_get_totalvfs(pdev)) + return -ERANGE; + + device_lock(&pdev->dev); + + if (num_vfs == pdev->sriov->num_VFs) + goto exit; + + /* is PF driver loaded w/callback */ + if (!pdev->driver || !pdev->driver->sriov_configure) { + pci_info(pdev, "Driver does not support SRIOV configuration via sysfs\n"); + ret = -ENOENT; + goto exit; + } + + if (num_vfs == 0) { + /* disable VFs */ + ret = pdev->driver->sriov_configure(pdev, 0); + goto exit; + } + + /* enable VFs */ + if (pdev->sriov->num_VFs) { + pci_warn(pdev, "%d VFs already enabled. Disable before enabling %d VFs\n", + pdev->sriov->num_VFs, num_vfs); + ret = -EBUSY; + goto exit; + } + + ret = pdev->driver->sriov_configure(pdev, num_vfs); + if (ret < 0) + goto exit; + + if (ret != num_vfs) + pci_warn(pdev, "%d VFs requested; only %d enabled\n", + num_vfs, ret); + +exit: + device_unlock(&pdev->dev); + + if (ret < 0) + return ret; + + return count; +} + +static ssize_t sriov_offset_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct pci_dev *pdev = to_pci_dev(dev); + + return sprintf(buf, "%u\n", pdev->sriov->offset); +} + +static ssize_t sriov_stride_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct pci_dev *pdev = to_pci_dev(dev); + + return sprintf(buf, "%u\n", pdev->sriov->stride); +} + +static ssize_t sriov_vf_device_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct pci_dev *pdev = to_pci_dev(dev); + + return sprintf(buf, "%x\n", pdev->sriov->vf_device); +} + +static ssize_t sriov_drivers_autoprobe_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct pci_dev *pdev = to_pci_dev(dev); + + return sprintf(buf, "%u\n", pdev->sriov->drivers_autoprobe); +} + +static ssize_t sriov_drivers_autoprobe_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct pci_dev *pdev = to_pci_dev(dev); + bool drivers_autoprobe; + + if (kstrtobool(buf, &drivers_autoprobe) < 0) + return -EINVAL; + + pdev->sriov->drivers_autoprobe = drivers_autoprobe; + + return count; +} + +static DEVICE_ATTR_RO(sriov_totalvfs); +static DEVICE_ATTR(sriov_numvfs, 0664, sriov_numvfs_show, sriov_numvfs_store); +static DEVICE_ATTR_RO(sriov_offset); +static DEVICE_ATTR_RO(sriov_stride); +static DEVICE_ATTR_RO(sriov_vf_device); +static DEVICE_ATTR(sriov_drivers_autoprobe, 0664, sriov_drivers_autoprobe_show, + sriov_drivers_autoprobe_store); + +static struct attribute *sriov_dev_attrs[] = { + &dev_attr_sriov_totalvfs.attr, + &dev_attr_sriov_numvfs.attr, + &dev_attr_sriov_offset.attr, + &dev_attr_sriov_stride.attr, + &dev_attr_sriov_vf_device.attr, + &dev_attr_sriov_drivers_autoprobe.attr, + NULL, +}; + +static umode_t sriov_attrs_are_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct device *dev = kobj_to_dev(kobj); + + if (!dev_is_pf(dev)) + return 0; + + return a->mode; +} + +const struct attribute_group sriov_dev_attr_group = { + .attrs = sriov_dev_attrs, + .is_visible = sriov_attrs_are_visible, +}; + int __weak pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs) { return 0; diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 5bb301efec98..868e35109284 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -548,151 +548,6 @@ static ssize_t devspec_show(struct device *dev, static DEVICE_ATTR_RO(devspec); #endif -#ifdef CONFIG_PCI_IOV -static ssize_t sriov_totalvfs_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct pci_dev *pdev = to_pci_dev(dev); - - return sprintf(buf, "%u\n", pci_sriov_get_totalvfs(pdev)); -} - - -static ssize_t sriov_numvfs_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct pci_dev *pdev = to_pci_dev(dev); - - return sprintf(buf, "%u\n", pdev->sriov->num_VFs); -} - -/* - * num_vfs > 0; number of VFs to enable - * num_vfs = 0; disable all VFs - * - * Note: SRIOV spec doesn't allow partial VF - * disable, so it's all or none. - */ -static ssize_t sriov_numvfs_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct pci_dev *pdev = to_pci_dev(dev); - int ret; - u16 num_vfs; - - ret = kstrtou16(buf, 0, &num_vfs); - if (ret < 0) - return ret; - - if (num_vfs > pci_sriov_get_totalvfs(pdev)) - return -ERANGE; - - device_lock(&pdev->dev); - - if (num_vfs == pdev->sriov->num_VFs) - goto exit; - - /* is PF driver loaded w/callback */ - if (!pdev->driver || !pdev->driver->sriov_configure) { - pci_info(pdev, "Driver doesn't support SRIOV configuration via sysfs\n"); - ret = -ENOENT; - goto exit; - } - - if (num_vfs == 0) { - /* disable VFs */ - ret = pdev->driver->sriov_configure(pdev, 0); - goto exit; - } - - /* enable VFs */ - if (pdev->sriov->num_VFs) { - pci_warn(pdev, "%d VFs already enabled. Disable before enabling %d VFs\n", - pdev->sriov->num_VFs, num_vfs); - ret = -EBUSY; - goto exit; - } - - ret = pdev->driver->sriov_configure(pdev, num_vfs); - if (ret < 0) - goto exit; - - if (ret != num_vfs) - pci_warn(pdev, "%d VFs requested; only %d enabled\n", - num_vfs, ret); - -exit: - device_unlock(&pdev->dev); - - if (ret < 0) - return ret; - - return count; -} - -static ssize_t sriov_offset_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct pci_dev *pdev = to_pci_dev(dev); - - return sprintf(buf, "%u\n", pdev->sriov->offset); -} - -static ssize_t sriov_stride_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct pci_dev *pdev = to_pci_dev(dev); - - return sprintf(buf, "%u\n", pdev->sriov->stride); -} - -static ssize_t sriov_vf_device_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct pci_dev *pdev = to_pci_dev(dev); - - return sprintf(buf, "%x\n", pdev->sriov->vf_device); -} - -static ssize_t sriov_drivers_autoprobe_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct pci_dev *pdev = to_pci_dev(dev); - - return sprintf(buf, "%u\n", pdev->sriov->drivers_autoprobe); -} - -static ssize_t sriov_drivers_autoprobe_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct pci_dev *pdev = to_pci_dev(dev); - bool drivers_autoprobe; - - if (kstrtobool(buf, &drivers_autoprobe) < 0) - return -EINVAL; - - pdev->sriov->drivers_autoprobe = drivers_autoprobe; - - return count; -} - -static DEVICE_ATTR_RO(sriov_totalvfs); -static DEVICE_ATTR(sriov_numvfs, 0664, sriov_numvfs_show, sriov_numvfs_store); -static DEVICE_ATTR_RO(sriov_offset); -static DEVICE_ATTR_RO(sriov_stride); -static DEVICE_ATTR_RO(sriov_vf_device); -static DEVICE_ATTR(sriov_drivers_autoprobe, 0664, sriov_drivers_autoprobe_show, - sriov_drivers_autoprobe_store); -#endif /* CONFIG_PCI_IOV */ - static ssize_t driver_override_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) @@ -1691,34 +1546,6 @@ static const struct attribute_group pci_dev_hp_attr_group = { .is_visible = pci_dev_hp_attrs_are_visible, }; -#ifdef CONFIG_PCI_IOV -static struct attribute *sriov_dev_attrs[] = { - &dev_attr_sriov_totalvfs.attr, - &dev_attr_sriov_numvfs.attr, - &dev_attr_sriov_offset.attr, - &dev_attr_sriov_stride.attr, - &dev_attr_sriov_vf_device.attr, - &dev_attr_sriov_drivers_autoprobe.attr, - NULL, -}; - -static umode_t sriov_attrs_are_visible(struct kobject *kobj, - struct attribute *a, int n) -{ - struct device *dev = kobj_to_dev(kobj); - - if (!dev_is_pf(dev)) - return 0; - - return a->mode; -} - -static const struct attribute_group sriov_dev_attr_group = { - .attrs = sriov_dev_attrs, - .is_visible = sriov_attrs_are_visible, -}; -#endif /* CONFIG_PCI_IOV */ - static const struct attribute_group pci_dev_attr_group = { .attrs = pci_dev_dev_attrs, .is_visible = pci_dev_attrs_are_visible, diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 1be03a97cb92..061a935ac18e 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -433,7 +433,7 @@ void pci_iov_update_resource(struct pci_dev *dev, int resno); resource_size_t pci_sriov_resource_alignment(struct pci_dev *dev, int resno); void pci_restore_iov_state(struct pci_dev *dev); int pci_iov_bus_range(struct pci_bus *bus); - +extern const struct attribute_group sriov_dev_attr_group; #else static inline int pci_iov_init(struct pci_dev *dev) { From 265a38d4611360ae3d5bb612d586a3126507a954 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Aug 2019 18:44:04 -0400 Subject: [PATCH 147/721] xprtrdma: Simplify rpcrdma_mr_pop Clean up: rpcrdma_mr_pop call sites check if the list is empty first. Let's replace the list_empty with less costly logic. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/frwr_ops.c | 12 ++++-------- net/sunrpc/xprtrdma/rpc_rdma.c | 7 +------ net/sunrpc/xprtrdma/verbs.c | 6 ++---- net/sunrpc/xprtrdma/xprt_rdma.h | 7 ++++--- 4 files changed, 11 insertions(+), 21 deletions(-) diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 3a10bfff2125..d7e763fafa04 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -126,12 +126,10 @@ frwr_mr_recycle_worker(struct work_struct *work) */ void frwr_reset(struct rpcrdma_req *req) { - while (!list_empty(&req->rl_registered)) { - struct rpcrdma_mr *mr; + struct rpcrdma_mr *mr; - mr = rpcrdma_mr_pop(&req->rl_registered); + while ((mr = rpcrdma_mr_pop(&req->rl_registered))) rpcrdma_mr_unmap_and_put(mr); - } } /** @@ -532,8 +530,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) */ frwr = NULL; prev = &first; - while (!list_empty(&req->rl_registered)) { - mr = rpcrdma_mr_pop(&req->rl_registered); + while ((mr = rpcrdma_mr_pop(&req->rl_registered))) { trace_xprtrdma_mr_localinv(mr); r_xprt->rx_stats.local_inv_needed++; @@ -632,8 +629,7 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) */ frwr = NULL; prev = &first; - while (!list_empty(&req->rl_registered)) { - mr = rpcrdma_mr_pop(&req->rl_registered); + while ((mr = rpcrdma_mr_pop(&req->rl_registered))) { trace_xprtrdma_mr_localinv(mr); r_xprt->rx_stats.local_inv_needed++; diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 4345e6912392..0ac096a6348a 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -841,12 +841,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) * chunks. Very likely the connection has been replaced, * so these registrations are invalid and unusable. */ - while (unlikely(!list_empty(&req->rl_registered))) { - struct rpcrdma_mr *mr; - - mr = rpcrdma_mr_pop(&req->rl_registered); - rpcrdma_mr_recycle(mr); - } + frwr_reset(req); /* This implementation supports the following combinations * of chunk lists in one RPC-over-RDMA Call message: diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index e004873cc4f0..ee6fcf10425b 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1213,13 +1213,11 @@ struct rpcrdma_mr * rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - struct rpcrdma_mr *mr = NULL; + struct rpcrdma_mr *mr; spin_lock(&buf->rb_mrlock); - if (!list_empty(&buf->rb_mrs)) - mr = rpcrdma_mr_pop(&buf->rb_mrs); + mr = rpcrdma_mr_pop(&buf->rb_mrs); spin_unlock(&buf->rb_mrlock); - if (!mr) goto out_nomrs; return mr; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 5aaa53b8ae12..9663b8ddd733 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -338,7 +338,7 @@ rpcr_to_rdmar(const struct rpc_rqst *rqst) static inline void rpcrdma_mr_push(struct rpcrdma_mr *mr, struct list_head *list) { - list_add_tail(&mr->mr_list, list); + list_add(&mr->mr_list, list); } static inline struct rpcrdma_mr * @@ -346,8 +346,9 @@ rpcrdma_mr_pop(struct list_head *list) { struct rpcrdma_mr *mr; - mr = list_first_entry(list, struct rpcrdma_mr, mr_list); - list_del_init(&mr->mr_list); + mr = list_first_entry_or_null(list, struct rpcrdma_mr, mr_list); + if (mr) + list_del_init(&mr->mr_list); return mr; } From 1ca3f4c054a4e3765bdeb62c849d940b5bc8002d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Aug 2019 18:44:50 -0400 Subject: [PATCH 148/721] xprtrdma: Combine rpcrdma_mr_put and rpcrdma_mr_unmap_and_put Clean up. There is only one remaining rpcrdma_mr_put call site, and it can be directly replaced with unmap_and_put because mr->mr_dir is set to DMA_NONE just before the call. Now all the call sites do a DMA unmap, and we can just rename mr_unmap_and_put to mr_put, which nicely matches mr_get. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/frwr_ops.c | 6 +++--- net/sunrpc/xprtrdma/verbs.c | 32 ++++++++------------------------ net/sunrpc/xprtrdma/xprt_rdma.h | 1 - 3 files changed, 11 insertions(+), 28 deletions(-) diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index d7e763fafa04..97e1804139b8 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -129,7 +129,7 @@ void frwr_reset(struct rpcrdma_req *req) struct rpcrdma_mr *mr; while ((mr = rpcrdma_mr_pop(&req->rl_registered))) - rpcrdma_mr_unmap_and_put(mr); + rpcrdma_mr_put(mr); } /** @@ -453,7 +453,7 @@ void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs) if (mr->mr_handle == rep->rr_inv_rkey) { list_del_init(&mr->mr_list); trace_xprtrdma_mr_remoteinv(mr); - rpcrdma_mr_unmap_and_put(mr); + rpcrdma_mr_put(mr); break; /* only one invalidated MR per RPC */ } } @@ -463,7 +463,7 @@ static void __frwr_release_mr(struct ib_wc *wc, struct rpcrdma_mr *mr) if (wc->status != IB_WC_SUCCESS) rpcrdma_mr_recycle(mr); else - rpcrdma_mr_unmap_and_put(mr); + rpcrdma_mr_put(mr); } /** diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index ee6fcf10425b..5e0b774ed522 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1233,34 +1233,15 @@ rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt) return NULL; } -static void -__rpcrdma_mr_put(struct rpcrdma_buffer *buf, struct rpcrdma_mr *mr) -{ - spin_lock(&buf->rb_mrlock); - rpcrdma_mr_push(mr, &buf->rb_mrs); - spin_unlock(&buf->rb_mrlock); -} - /** - * rpcrdma_mr_put - Release an rpcrdma_mr object - * @mr: object to release + * rpcrdma_mr_put - DMA unmap an MR and release it + * @mr: MR to release * */ -void -rpcrdma_mr_put(struct rpcrdma_mr *mr) -{ - __rpcrdma_mr_put(&mr->mr_xprt->rx_buf, mr); -} - -/** - * rpcrdma_mr_unmap_and_put - DMA unmap an MR and release it - * @mr: object to release - * - */ -void -rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr) +void rpcrdma_mr_put(struct rpcrdma_mr *mr) { struct rpcrdma_xprt *r_xprt = mr->mr_xprt; + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; if (mr->mr_dir != DMA_NONE) { trace_xprtrdma_mr_unmap(mr); @@ -1268,7 +1249,10 @@ rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr) mr->mr_sg, mr->mr_nents, mr->mr_dir); mr->mr_dir = DMA_NONE; } - __rpcrdma_mr_put(&r_xprt->rx_buf, mr); + + spin_lock(&buf->rb_mrlock); + rpcrdma_mr_push(mr, &buf->rb_mrs); + spin_unlock(&buf->rb_mrlock); } /** diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 9663b8ddd733..3e0839c2cda2 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -485,7 +485,6 @@ struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_xprt *r_xprt); struct rpcrdma_mr *rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt); void rpcrdma_mr_put(struct rpcrdma_mr *mr); -void rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr); static inline void rpcrdma_mr_recycle(struct rpcrdma_mr *mr) From 3b39f52a02d4b3322744a0a32d59142e01afa435 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Aug 2019 18:45:37 -0400 Subject: [PATCH 149/721] xprtrdma: Move rpcrdma_mr_get out of frwr_map Refactor: Retrieve an MR and handle error recovery entirely in rpc_rdma.c, as this is not a device-specific function. Note that since commit 89f90fe1ad8b ("SUNRPC: Allow calls to xprt_transmit() to drain the entire transmit queue"), the xprt_transmit function handles the cond_resched. The transport no longer has to do this itself. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 29 ++++++++++++++++++++++++++++- net/sunrpc/xprtrdma/frwr_ops.c | 23 +++++------------------ net/sunrpc/xprtrdma/rpc_rdma.c | 30 ++++++++++++++++++++++++------ net/sunrpc/xprtrdma/verbs.c | 21 ++++----------------- net/sunrpc/xprtrdma/xprt_rdma.h | 4 ++-- 5 files changed, 63 insertions(+), 44 deletions(-) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 6e6055eb67e7..83c4dfd7feea 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -464,7 +464,34 @@ TRACE_EVENT(xprtrdma_createmrs, ) ); -DEFINE_RXPRT_EVENT(xprtrdma_nomrs); +TRACE_EVENT(xprtrdma_nomrs, + TP_PROTO( + const struct rpcrdma_req *req + ), + + TP_ARGS(req), + + TP_STRUCT__entry( + __field(const void *, req) + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(u32, xid) + ), + + TP_fast_assign( + const struct rpc_rqst *rqst = &req->rl_slot; + + __entry->req = req; + __entry->task_id = rqst->rq_task->tk_pid; + __entry->client_id = rqst->rq_task->tk_client->cl_clid; + __entry->xid = be32_to_cpu(rqst->rq_xid); + ), + + TP_printk("task:%u@%u xid=0x%08x req=%p", + __entry->task_id, __entry->client_id, __entry->xid, + __entry->req + ) +); DEFINE_RDCH_EVENT(read); DEFINE_WRCH_EVENT(write); diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 97e1804139b8..362056f4f48d 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -291,31 +291,25 @@ size_t frwr_maxpages(struct rpcrdma_xprt *r_xprt) * @nsegs: number of segments remaining * @writing: true when RDMA Write will be used * @xid: XID of RPC using the registered memory - * @out: initialized MR + * @mr: MR to fill in * * Prepare a REG_MR Work Request to register a memory region * for remote access via RDMA READ or RDMA WRITE. * * Returns the next segment or a negative errno pointer. - * On success, the prepared MR is planted in @out. + * On success, @mr is filled in. */ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, int nsegs, bool writing, __be32 xid, - struct rpcrdma_mr **out) + struct rpcrdma_mr *mr) { struct rpcrdma_ia *ia = &r_xprt->rx_ia; - bool holes_ok = ia->ri_mrtype == IB_MR_TYPE_SG_GAPS; - struct rpcrdma_mr *mr; - struct ib_mr *ibmr; struct ib_reg_wr *reg_wr; + struct ib_mr *ibmr; int i, n; u8 key; - mr = rpcrdma_mr_get(r_xprt); - if (!mr) - goto out_getmr_err; - if (nsegs > ia->ri_max_frwr_depth) nsegs = ia->ri_max_frwr_depth; for (i = 0; i < nsegs;) { @@ -330,7 +324,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, ++seg; ++i; - if (holes_ok) + if (ia->ri_mrtype == IB_MR_TYPE_SG_GAPS) continue; if ((i < nsegs && offset_in_page(seg->mr_offset)) || offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) @@ -365,22 +359,15 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, mr->mr_offset = ibmr->iova; trace_xprtrdma_mr_map(mr); - *out = mr; return seg; -out_getmr_err: - xprt_wait_for_buffer_space(&r_xprt->rx_xprt); - return ERR_PTR(-EAGAIN); - out_dmamap_err: mr->mr_dir = DMA_NONE; trace_xprtrdma_frwr_sgerr(mr, i); - rpcrdma_mr_put(mr); return ERR_PTR(-EIO); out_mapmr_err: trace_xprtrdma_frwr_maperr(mr, n); - rpcrdma_mr_recycle(mr); return ERR_PTR(-EIO); } diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 0ac096a6348a..34772cb19286 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -342,6 +342,27 @@ encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr, return 0; } +static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, + struct rpcrdma_mr_seg *seg, + int nsegs, bool writing, + struct rpcrdma_mr **mr) +{ + *mr = rpcrdma_mr_get(r_xprt); + if (!*mr) + goto out_getmr_err; + + rpcrdma_mr_push(*mr, &req->rl_registered); + return frwr_map(r_xprt, seg, nsegs, writing, req->rl_slot.rq_xid, *mr); + +out_getmr_err: + trace_xprtrdma_nomrs(req); + xprt_wait_for_buffer_space(&r_xprt->rx_xprt); + if (r_xprt->rx_ep.rep_connected != -ENODEV) + schedule_work(&r_xprt->rx_buf.rb_refresh_worker); + return ERR_PTR(-EAGAIN); +} + /* Register and XDR encode the Read list. Supports encoding a list of read * segments that belong to a single read chunk. * @@ -379,10 +400,9 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, return nsegs; do { - seg = frwr_map(r_xprt, seg, nsegs, false, rqst->rq_xid, &mr); + seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, false, &mr); if (IS_ERR(seg)) return PTR_ERR(seg); - rpcrdma_mr_push(mr, &req->rl_registered); if (encode_read_segment(xdr, mr, pos) < 0) return -EMSGSIZE; @@ -440,10 +460,9 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, nchunks = 0; do { - seg = frwr_map(r_xprt, seg, nsegs, true, rqst->rq_xid, &mr); + seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, true, &mr); if (IS_ERR(seg)) return PTR_ERR(seg); - rpcrdma_mr_push(mr, &req->rl_registered); if (encode_rdma_segment(xdr, mr) < 0) return -EMSGSIZE; @@ -501,10 +520,9 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, nchunks = 0; do { - seg = frwr_map(r_xprt, seg, nsegs, true, rqst->rq_xid, &mr); + seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, true, &mr); if (IS_ERR(seg)) return PTR_ERR(seg); - rpcrdma_mr_push(mr, &req->rl_registered); if (encode_rdma_segment(xdr, mr) < 0) return -EMSGSIZE; diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 5e0b774ed522..c9fa0f27b10a 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -408,7 +408,7 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia) struct rpcrdma_req *req; struct rpcrdma_rep *rep; - cancel_delayed_work_sync(&buf->rb_refresh_worker); + cancel_work_sync(&buf->rb_refresh_worker); /* This is similar to rpcrdma_ep_destroy, but: * - Don't cancel the connect worker. @@ -975,7 +975,7 @@ static void rpcrdma_mr_refresh_worker(struct work_struct *work) { struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer, - rb_refresh_worker.work); + rb_refresh_worker); struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt, rx_buf); @@ -1086,8 +1086,7 @@ int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) spin_lock_init(&buf->rb_lock); INIT_LIST_HEAD(&buf->rb_mrs); INIT_LIST_HEAD(&buf->rb_all_mrs); - INIT_DELAYED_WORK(&buf->rb_refresh_worker, - rpcrdma_mr_refresh_worker); + INIT_WORK(&buf->rb_refresh_worker, rpcrdma_mr_refresh_worker); rpcrdma_mrs_create(r_xprt); @@ -1177,7 +1176,7 @@ rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf) void rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) { - cancel_delayed_work_sync(&buf->rb_refresh_worker); + cancel_work_sync(&buf->rb_refresh_worker); rpcrdma_sendctxs_destroy(buf); @@ -1218,19 +1217,7 @@ rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt) spin_lock(&buf->rb_mrlock); mr = rpcrdma_mr_pop(&buf->rb_mrs); spin_unlock(&buf->rb_mrlock); - if (!mr) - goto out_nomrs; return mr; - -out_nomrs: - trace_xprtrdma_nomrs(r_xprt); - if (r_xprt->rx_ep.rep_connected != -ENODEV) - schedule_delayed_work(&buf->rb_refresh_worker, 0); - - /* Allow the reply handler and refresh worker to run */ - cond_resched(); - - return NULL; } /** diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 3e0839c2cda2..9573587ca602 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -379,7 +379,7 @@ struct rpcrdma_buffer { u32 rb_bc_srv_max_requests; u32 rb_bc_max_requests; - struct delayed_work rb_refresh_worker; + struct work_struct rb_refresh_worker; }; /* @@ -548,7 +548,7 @@ size_t frwr_maxpages(struct rpcrdma_xprt *r_xprt); struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, int nsegs, bool writing, __be32 xid, - struct rpcrdma_mr **mr); + struct rpcrdma_mr *mr); int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req); void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs); void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req); From 805a1f620ba38c5f6de8b9697f35dcb38d8112b5 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Aug 2019 18:46:24 -0400 Subject: [PATCH 150/721] xprtrdma: Ensure creating an MR does not trigger FS writeback Probably would be good to also pass GFP flags to ib_alloc_mr. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/frwr_ops.c | 7 ++++--- net/sunrpc/xprtrdma/verbs.c | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 362056f4f48d..1f2e3dda7401 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -147,11 +147,14 @@ int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) struct ib_mr *frmr; int rc; + /* NB: ib_alloc_mr and device drivers typically allocate + * memory with GFP_KERNEL. + */ frmr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth); if (IS_ERR(frmr)) goto out_mr_err; - sg = kcalloc(depth, sizeof(*sg), GFP_KERNEL); + sg = kcalloc(depth, sizeof(*sg), GFP_NOFS); if (!sg) goto out_list_err; @@ -171,8 +174,6 @@ int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) return rc; out_list_err: - dprintk("RPC: %s: sg allocation failure\n", - __func__); ib_dereg_mr(frmr); return -ENOMEM; } diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index c9fa0f27b10a..cb6df58488bb 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -949,7 +949,7 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) struct rpcrdma_mr *mr; int rc; - mr = kzalloc(sizeof(*mr), GFP_KERNEL); + mr = kzalloc(sizeof(*mr), GFP_NOFS); if (!mr) break; From be700103efd1050808db1cf00e52c3a2837bf802 Mon Sep 17 00:00:00 2001 From: Haiyang Zhang Date: Thu, 15 Aug 2019 17:01:37 +0000 Subject: [PATCH 151/721] PCI: hv: Detect and fix Hyper-V PCI domain number collision Currently in Azure cloud, for passthrough devices, the host sets the device instance ID's bytes 8 - 15 to a value derived from the host HWID, which is the same on all devices in a VM. So, the device instance ID's bytes 8 and 9 provided by the host are no longer unique. This affects all Azure hosts since July 2018, and can cause device passthrough to VMs to fail because the bytes 8 and 9 are used as PCI domain number. Collision of domain numbers will cause the second device with the same domain number fail to load. In the cases of collision, we will detect and find another number that is not in use. Suggested-by: Michael Kelley Signed-off-by: Haiyang Zhang Signed-off-by: Lorenzo Pieralisi Acked-by: Sasha Levin --- drivers/pci/controller/pci-hyperv.c | 92 +++++++++++++++++++++++++---- 1 file changed, 79 insertions(+), 13 deletions(-) diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index 2b53976cd9f9..4caa3388692a 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -2510,6 +2510,48 @@ static void put_hvpcibus(struct hv_pcibus_device *hbus) complete(&hbus->remove_event); } +#define HVPCI_DOM_MAP_SIZE (64 * 1024) +static DECLARE_BITMAP(hvpci_dom_map, HVPCI_DOM_MAP_SIZE); + +/* + * PCI domain number 0 is used by emulated devices on Gen1 VMs, so define 0 + * as invalid for passthrough PCI devices of this driver. + */ +#define HVPCI_DOM_INVALID 0 + +/** + * hv_get_dom_num() - Get a valid PCI domain number + * Check if the PCI domain number is in use, and return another number if + * it is in use. + * + * @dom: Requested domain number + * + * return: domain number on success, HVPCI_DOM_INVALID on failure + */ +static u16 hv_get_dom_num(u16 dom) +{ + unsigned int i; + + if (test_and_set_bit(dom, hvpci_dom_map) == 0) + return dom; + + for_each_clear_bit(i, hvpci_dom_map, HVPCI_DOM_MAP_SIZE) { + if (test_and_set_bit(i, hvpci_dom_map) == 0) + return i; + } + + return HVPCI_DOM_INVALID; +} + +/** + * hv_put_dom_num() - Mark the PCI domain number as free + * @dom: Domain number to be freed + */ +static void hv_put_dom_num(u16 dom) +{ + clear_bit(dom, hvpci_dom_map); +} + /** * hv_pci_probe() - New VMBus channel probe, for a root PCI bus * @hdev: VMBus's tracking struct for this root PCI bus @@ -2521,6 +2563,7 @@ static int hv_pci_probe(struct hv_device *hdev, const struct hv_vmbus_device_id *dev_id) { struct hv_pcibus_device *hbus; + u16 dom_req, dom; int ret; /* @@ -2535,19 +2578,34 @@ static int hv_pci_probe(struct hv_device *hdev, hbus->state = hv_pcibus_init; /* - * The PCI bus "domain" is what is called "segment" in ACPI and - * other specs. Pull it from the instance ID, to get something - * unique. Bytes 8 and 9 are what is used in Windows guests, so - * do the same thing for consistency. Note that, since this code - * only runs in a Hyper-V VM, Hyper-V can (and does) guarantee - * that (1) the only domain in use for something that looks like - * a physical PCI bus (which is actually emulated by the - * hypervisor) is domain 0 and (2) there will be no overlap - * between domains derived from these instance IDs in the same - * VM. + * The PCI bus "domain" is what is called "segment" in ACPI and other + * specs. Pull it from the instance ID, to get something usually + * unique. In rare cases of collision, we will find out another number + * not in use. + * + * Note that, since this code only runs in a Hyper-V VM, Hyper-V + * together with this guest driver can guarantee that (1) The only + * domain used by Gen1 VMs for something that looks like a physical + * PCI bus (which is actually emulated by the hypervisor) is domain 0. + * (2) There will be no overlap between domains (after fixing possible + * collisions) in the same VM. */ - hbus->sysdata.domain = hdev->dev_instance.b[9] | - hdev->dev_instance.b[8] << 8; + dom_req = hdev->dev_instance.b[8] << 8 | hdev->dev_instance.b[9]; + dom = hv_get_dom_num(dom_req); + + if (dom == HVPCI_DOM_INVALID) { + dev_err(&hdev->device, + "Unable to use dom# 0x%hx or other numbers", dom_req); + ret = -EINVAL; + goto free_bus; + } + + if (dom != dom_req) + dev_info(&hdev->device, + "PCI dom# 0x%hx has collision, using 0x%hx", + dom_req, dom); + + hbus->sysdata.domain = dom; hbus->hdev = hdev; refcount_set(&hbus->remove_lock, 1); @@ -2562,7 +2620,7 @@ static int hv_pci_probe(struct hv_device *hdev, hbus->sysdata.domain); if (!hbus->wq) { ret = -ENOMEM; - goto free_bus; + goto free_dom; } ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0, @@ -2639,6 +2697,8 @@ static int hv_pci_probe(struct hv_device *hdev, vmbus_close(hdev->channel); destroy_wq: destroy_workqueue(hbus->wq); +free_dom: + hv_put_dom_num(hbus->sysdata.domain); free_bus: free_page((unsigned long)hbus); return ret; @@ -2720,6 +2780,9 @@ static int hv_pci_remove(struct hv_device *hdev) put_hvpcibus(hbus); wait_for_completion(&hbus->remove_event); destroy_workqueue(hbus->wq); + + hv_put_dom_num(hbus->sysdata.domain); + free_page((unsigned long)hbus); return 0; } @@ -2747,6 +2810,9 @@ static void __exit exit_hv_pci_drv(void) static int __init init_hv_pci_drv(void) { + /* Set the invalid domain number's bit, so it will not be used */ + set_bit(HVPCI_DOM_INVALID, hvpci_dom_map); + return vmbus_driver_register(&hv_pci_drv); } From 5ae6393e6d41c7cc38c1ce679bc91c475412c624 Mon Sep 17 00:00:00 2001 From: Nishka Dasgupta Date: Mon, 19 Aug 2019 13:09:46 +0530 Subject: [PATCH 152/721] PCI: kirin: Make structure kirin_dw_pcie_ops constant Static variable kirin_dw_pcie_ops, of type dw_pcie_ops, is used only once, when it is assigned to the constant field ops of variable pci (having type dw_pcie) so kirin_dw_pcie_ops is never modified. Make it constant to protect it from unintended modification. Issue found with Coccinelle. Signed-off-by: Nishka Dasgupta Signed-off-by: Lorenzo Pieralisi Reviewed-by: Andrew Murray --- drivers/pci/controller/dwc/pcie-kirin.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/controller/dwc/pcie-kirin.c b/drivers/pci/controller/dwc/pcie-kirin.c index 8df1914226be..c19617a912bd 100644 --- a/drivers/pci/controller/dwc/pcie-kirin.c +++ b/drivers/pci/controller/dwc/pcie-kirin.c @@ -436,7 +436,7 @@ static int kirin_pcie_host_init(struct pcie_port *pp) return 0; } -static struct dw_pcie_ops kirin_dw_pcie_ops = { +static const struct dw_pcie_ops kirin_dw_pcie_ops = { .read_dbi = kirin_pcie_read_dbi, .write_dbi = kirin_pcie_write_dbi, .link_up = kirin_pcie_link_up, From 6dc6ec9e04c468d994bff6eb660f3146f94cbfd9 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Aug 2019 18:47:10 -0400 Subject: [PATCH 153/721] xprtrdma: Cache free MRs in each rpcrdma_req Instead of a globally-contended MR free list, cache MRs in each rpcrdma_req as they are released. This means acquiring and releasing an MR will be lock-free in the common case, even outside the transport send lock. The original idea of per-rpcrdma_req MR free lists was suggested by Shirley Ma several years ago. I just now figured out how to make that idea work with on-demand MR allocation. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 38 +++++++++++++++++++++++++++++++-- net/sunrpc/xprtrdma/frwr_ops.c | 9 +++++--- net/sunrpc/xprtrdma/rpc_rdma.c | 11 +++++++--- net/sunrpc/xprtrdma/verbs.c | 18 +++++++++++++--- net/sunrpc/xprtrdma/xprt_rdma.h | 7 +++--- 5 files changed, 69 insertions(+), 14 deletions(-) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 83c4dfd7feea..a13830616107 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -451,16 +451,50 @@ TRACE_EVENT(xprtrdma_createmrs, TP_STRUCT__entry( __field(const void *, r_xprt) + __string(addr, rpcrdma_addrstr(r_xprt)) + __string(port, rpcrdma_portstr(r_xprt)) __field(unsigned int, count) ), TP_fast_assign( __entry->r_xprt = r_xprt; __entry->count = count; + __assign_str(addr, rpcrdma_addrstr(r_xprt)); + __assign_str(port, rpcrdma_portstr(r_xprt)); ), - TP_printk("r_xprt=%p: created %u MRs", - __entry->r_xprt, __entry->count + TP_printk("peer=[%s]:%s r_xprt=%p: created %u MRs", + __get_str(addr), __get_str(port), __entry->r_xprt, + __entry->count + ) +); + +TRACE_EVENT(xprtrdma_mr_get, + TP_PROTO( + const struct rpcrdma_req *req + ), + + TP_ARGS(req), + + TP_STRUCT__entry( + __field(const void *, req) + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(u32, xid) + ), + + TP_fast_assign( + const struct rpc_rqst *rqst = &req->rl_slot; + + __entry->req = req; + __entry->task_id = rqst->rq_task->tk_pid; + __entry->client_id = rqst->rq_task->tk_client->cl_clid; + __entry->xid = be32_to_cpu(rqst->rq_xid); + ), + + TP_printk("task:%u@%u xid=0x%08x req=%p", + __entry->task_id, __entry->client_id, __entry->xid, + __entry->req ) ); diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 1f2e3dda7401..0e740bae2d80 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -488,8 +488,8 @@ static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) /* WARNING: Only wr_cqe and status are reliable at this point */ trace_xprtrdma_wc_li_wake(wc, frwr); - complete(&frwr->fr_linv_done); __frwr_release_mr(wc, mr); + complete(&frwr->fr_linv_done); } /** @@ -587,11 +587,15 @@ static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc) struct rpcrdma_frwr *frwr = container_of(cqe, struct rpcrdma_frwr, fr_cqe); struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr); + struct rpcrdma_rep *rep = mr->mr_req->rl_reply; /* WARNING: Only wr_cqe and status are reliable at this point */ trace_xprtrdma_wc_li_done(wc, frwr); - rpcrdma_complete_rqst(frwr->fr_req->rl_reply); __frwr_release_mr(wc, mr); + + /* Ensure @rep is generated before __frwr_release_mr */ + smp_rmb(); + rpcrdma_complete_rqst(rep); } /** @@ -624,7 +628,6 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) frwr = &mr->frwr; frwr->fr_cqe.done = frwr_wc_localinv; - frwr->fr_req = req; last = &frwr->fr_invwr; last->next = NULL; last->wr_cqe = &frwr->fr_cqe; diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 34772cb19286..ffeb4dfebd46 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -348,9 +348,14 @@ static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt, int nsegs, bool writing, struct rpcrdma_mr **mr) { - *mr = rpcrdma_mr_get(r_xprt); - if (!*mr) - goto out_getmr_err; + *mr = rpcrdma_mr_pop(&req->rl_free_mrs); + if (!*mr) { + *mr = rpcrdma_mr_get(r_xprt); + if (!*mr) + goto out_getmr_err; + trace_xprtrdma_mr_get(req); + (*mr)->mr_req = req; + } rpcrdma_mr_push(*mr, &req->rl_registered); return frwr_map(r_xprt, seg, nsegs, writing, req->rl_slot.rq_xid, *mr); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index cb6df58488bb..69753ec73c36 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -77,6 +77,7 @@ static void rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc); static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt); static void rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf); +static void rpcrdma_mr_free(struct rpcrdma_mr *mr); static struct rpcrdma_regbuf * rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction, gfp_t flags); @@ -1022,6 +1023,7 @@ struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, size_t size, if (!req->rl_recvbuf) goto out4; + INIT_LIST_HEAD(&req->rl_free_mrs); INIT_LIST_HEAD(&req->rl_registered); spin_lock(&buffer->rb_lock); list_add(&req->rl_all, &buffer->rb_allreqs); @@ -1130,11 +1132,13 @@ static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep) * This function assumes that the caller prevents concurrent device * unload and transport tear-down. */ -void -rpcrdma_req_destroy(struct rpcrdma_req *req) +void rpcrdma_req_destroy(struct rpcrdma_req *req) { list_del(&req->rl_all); + while (!list_empty(&req->rl_free_mrs)) + rpcrdma_mr_free(rpcrdma_mr_pop(&req->rl_free_mrs)); + rpcrdma_regbuf_free(req->rl_recvbuf); rpcrdma_regbuf_free(req->rl_sendbuf); rpcrdma_regbuf_free(req->rl_rdmabuf); @@ -1228,7 +1232,6 @@ rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt) void rpcrdma_mr_put(struct rpcrdma_mr *mr) { struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - struct rpcrdma_buffer *buf = &r_xprt->rx_buf; if (mr->mr_dir != DMA_NONE) { trace_xprtrdma_mr_unmap(mr); @@ -1237,6 +1240,15 @@ void rpcrdma_mr_put(struct rpcrdma_mr *mr) mr->mr_dir = DMA_NONE; } + rpcrdma_mr_push(mr, &mr->mr_req->rl_free_mrs); +} + +static void rpcrdma_mr_free(struct rpcrdma_mr *mr) +{ + struct rpcrdma_xprt *r_xprt = mr->mr_xprt; + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + + mr->mr_req = NULL; spin_lock(&buf->rb_mrlock); rpcrdma_mr_push(mr, &buf->rb_mrs); spin_unlock(&buf->rb_mrlock); diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 9573587ca602..c375b0e434ac 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -234,20 +234,20 @@ struct rpcrdma_sendctx { * An external memory region is any buffer or page that is registered * on the fly (ie, not pre-registered). */ -struct rpcrdma_req; struct rpcrdma_frwr { struct ib_mr *fr_mr; struct ib_cqe fr_cqe; struct completion fr_linv_done; - struct rpcrdma_req *fr_req; union { struct ib_reg_wr fr_regwr; struct ib_send_wr fr_invwr; }; }; +struct rpcrdma_req; struct rpcrdma_mr { struct list_head mr_list; + struct rpcrdma_req *mr_req; struct scatterlist *mr_sg; int mr_nents; enum dma_data_direction mr_dir; @@ -325,7 +325,8 @@ struct rpcrdma_req { struct list_head rl_all; struct kref rl_kref; - struct list_head rl_registered; /* registered segments */ + struct list_head rl_free_mrs; + struct list_head rl_registered; struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; }; From 4d6b8890ddb16120c5dd25edc8d215cfd8222b63 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Aug 2019 18:47:57 -0400 Subject: [PATCH 154/721] xprtrdma: Remove rpcrdma_buffer::rb_mrlock Clean up: Now that the free list is used sparingly, get rid of the separate spin lock protecting it. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/frwr_ops.c | 4 ++-- net/sunrpc/xprtrdma/verbs.c | 21 ++++++++++----------- net/sunrpc/xprtrdma/xprt_rdma.h | 9 ++++----- 3 files changed, 16 insertions(+), 18 deletions(-) diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 0e740bae2d80..368cdf3edfc9 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -106,10 +106,10 @@ frwr_mr_recycle_worker(struct work_struct *work) mr->mr_dir = DMA_NONE; } - spin_lock(&r_xprt->rx_buf.rb_mrlock); + spin_lock(&r_xprt->rx_buf.rb_lock); list_del(&mr->mr_all); r_xprt->rx_stats.mrs_recycled++; - spin_unlock(&r_xprt->rx_buf.rb_mrlock); + spin_unlock(&r_xprt->rx_buf.rb_lock); frwr_release_mr(mr); } diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 69753ec73c36..52444c4d1be2 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -962,10 +962,10 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) mr->mr_xprt = r_xprt; - spin_lock(&buf->rb_mrlock); + spin_lock(&buf->rb_lock); list_add(&mr->mr_list, &buf->rb_mrs); list_add(&mr->mr_all, &buf->rb_all_mrs); - spin_unlock(&buf->rb_mrlock); + spin_unlock(&buf->rb_lock); } r_xprt->rx_stats.mrs_allocated += count; @@ -1084,7 +1084,6 @@ int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) buf->rb_max_requests = r_xprt->rx_ep.rep_max_requests; buf->rb_bc_srv_max_requests = 0; - spin_lock_init(&buf->rb_mrlock); spin_lock_init(&buf->rb_lock); INIT_LIST_HEAD(&buf->rb_mrs); INIT_LIST_HEAD(&buf->rb_all_mrs); @@ -1154,18 +1153,18 @@ rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf) unsigned int count; count = 0; - spin_lock(&buf->rb_mrlock); + spin_lock(&buf->rb_lock); while ((mr = list_first_entry_or_null(&buf->rb_all_mrs, struct rpcrdma_mr, mr_all)) != NULL) { list_del(&mr->mr_all); - spin_unlock(&buf->rb_mrlock); + spin_unlock(&buf->rb_lock); frwr_release_mr(mr); count++; - spin_lock(&buf->rb_mrlock); + spin_lock(&buf->rb_lock); } - spin_unlock(&buf->rb_mrlock); + spin_unlock(&buf->rb_lock); r_xprt->rx_stats.mrs_allocated = 0; } @@ -1218,9 +1217,9 @@ rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt) struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_mr *mr; - spin_lock(&buf->rb_mrlock); + spin_lock(&buf->rb_lock); mr = rpcrdma_mr_pop(&buf->rb_mrs); - spin_unlock(&buf->rb_mrlock); + spin_unlock(&buf->rb_lock); return mr; } @@ -1249,9 +1248,9 @@ static void rpcrdma_mr_free(struct rpcrdma_mr *mr) struct rpcrdma_buffer *buf = &r_xprt->rx_buf; mr->mr_req = NULL; - spin_lock(&buf->rb_mrlock); + spin_lock(&buf->rb_lock); rpcrdma_mr_push(mr, &buf->rb_mrs); - spin_unlock(&buf->rb_mrlock); + spin_unlock(&buf->rb_lock); } /** diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index c375b0e434ac..200d075bbe31 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -360,19 +360,18 @@ rpcrdma_mr_pop(struct list_head *list) * One of these is associated with a transport instance */ struct rpcrdma_buffer { - spinlock_t rb_mrlock; /* protect rb_mrs list */ + spinlock_t rb_lock; + struct list_head rb_send_bufs; + struct list_head rb_recv_bufs; struct list_head rb_mrs; - struct list_head rb_all_mrs; unsigned long rb_sc_head; unsigned long rb_sc_tail; unsigned long rb_sc_last; struct rpcrdma_sendctx **rb_sc_ctxs; - spinlock_t rb_lock; /* protect buf lists */ - struct list_head rb_send_bufs; - struct list_head rb_recv_bufs; struct list_head rb_allreqs; + struct list_head rb_all_mrs; u32 rb_max_requests; u32 rb_credits; /* most recent credit grant */ From b0b227f071a06d0ce5ef803538899299933d2931 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Aug 2019 18:48:43 -0400 Subject: [PATCH 155/721] xprtrdma: Use an llist to manage free rpcrdma_reps rpcrdma_rep objects are removed from their free list by only a single thread: the Receive completion handler. Thus that free list can be converted to an llist, where a single-threaded consumer and a multi-threaded producer (rpcrdma_buffer_put) can both access the llist without the need for any serialization. This eliminates spin lock contention between the Receive completion handler and rpcrdma_buffer_get, and makes the rep consumer wait- free. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 106 +++++++++++++++----------------- net/sunrpc/xprtrdma/xprt_rdma.h | 6 +- 2 files changed, 53 insertions(+), 59 deletions(-) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 52444c4d1be2..db90083ed35b 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -75,6 +75,7 @@ * internal functions */ static void rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc); +static void rpcrdma_reps_destroy(struct rpcrdma_buffer *buf); static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt); static void rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf); static void rpcrdma_mr_free(struct rpcrdma_mr *mr); @@ -407,7 +408,6 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia) struct rpcrdma_ep *ep = &r_xprt->rx_ep; struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_req *req; - struct rpcrdma_rep *rep; cancel_work_sync(&buf->rb_refresh_worker); @@ -431,8 +431,7 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia) /* The ULP is responsible for ensuring all DMA * mappings and MRs are gone. */ - list_for_each_entry(rep, &buf->rb_recv_bufs, rr_list) - rpcrdma_regbuf_dma_unmap(rep->rr_rdmabuf); + rpcrdma_reps_destroy(buf); list_for_each_entry(req, &buf->rb_allreqs, rl_all) { rpcrdma_regbuf_dma_unmap(req->rl_rdmabuf); rpcrdma_regbuf_dma_unmap(req->rl_sendbuf); @@ -1071,6 +1070,40 @@ static struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt, return NULL; } +static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep) +{ + rpcrdma_regbuf_free(rep->rr_rdmabuf); + kfree(rep); +} + +static struct rpcrdma_rep *rpcrdma_rep_get_locked(struct rpcrdma_buffer *buf) +{ + struct llist_node *node; + + /* Calls to llist_del_first are required to be serialized */ + node = llist_del_first(&buf->rb_free_reps); + if (!node) + return NULL; + return llist_entry(node, struct rpcrdma_rep, rr_node); +} + +static void rpcrdma_rep_put(struct rpcrdma_buffer *buf, + struct rpcrdma_rep *rep) +{ + if (!rep->rr_temp) + llist_add(&rep->rr_node, &buf->rb_free_reps); + else + rpcrdma_rep_destroy(rep); +} + +static void rpcrdma_reps_destroy(struct rpcrdma_buffer *buf) +{ + struct rpcrdma_rep *rep; + + while ((rep = rpcrdma_rep_get_locked(buf)) != NULL) + rpcrdma_rep_destroy(rep); +} + /** * rpcrdma_buffer_create - Create initial set of req/rep objects * @r_xprt: transport instance to (re)initialize @@ -1106,7 +1139,7 @@ int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) } buf->rb_credits = 1; - INIT_LIST_HEAD(&buf->rb_recv_bufs); + init_llist_head(&buf->rb_free_reps); rc = rpcrdma_sendctxs_create(r_xprt); if (rc) @@ -1118,12 +1151,6 @@ int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) return rc; } -static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep) -{ - rpcrdma_regbuf_free(rep->rr_rdmabuf); - kfree(rep); -} - /** * rpcrdma_req_destroy - Destroy an rpcrdma_req object * @req: unused object to be destroyed @@ -1182,15 +1209,7 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) cancel_work_sync(&buf->rb_refresh_worker); rpcrdma_sendctxs_destroy(buf); - - while (!list_empty(&buf->rb_recv_bufs)) { - struct rpcrdma_rep *rep; - - rep = list_first_entry(&buf->rb_recv_bufs, - struct rpcrdma_rep, rr_list); - list_del(&rep->rr_list); - rpcrdma_rep_destroy(rep); - } + rpcrdma_reps_destroy(buf); while (!list_empty(&buf->rb_send_bufs)) { struct rpcrdma_req *req; @@ -1281,39 +1300,24 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) */ void rpcrdma_buffer_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req) { - struct rpcrdma_rep *rep = req->rl_reply; - + if (req->rl_reply) + rpcrdma_rep_put(buffers, req->rl_reply); req->rl_reply = NULL; spin_lock(&buffers->rb_lock); list_add(&req->rl_list, &buffers->rb_send_bufs); - if (rep) { - if (!rep->rr_temp) { - list_add(&rep->rr_list, &buffers->rb_recv_bufs); - rep = NULL; - } - } spin_unlock(&buffers->rb_lock); - if (rep) - rpcrdma_rep_destroy(rep); } -/* - * Put reply buffers back into pool when not attached to - * request. This happens in error conditions. +/** + * rpcrdma_recv_buffer_put - Release rpcrdma_rep back to free list + * @rep: rep to release + * + * Used after error conditions. */ -void -rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) +void rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) { - struct rpcrdma_buffer *buffers = &rep->rr_rxprt->rx_buf; - - if (!rep->rr_temp) { - spin_lock(&buffers->rb_lock); - list_add(&rep->rr_list, &buffers->rb_recv_bufs); - spin_unlock(&buffers->rb_lock); - } else { - rpcrdma_rep_destroy(rep); - } + rpcrdma_rep_put(&rep->rr_rxprt->rx_buf, rep); } /* Returns a pointer to a rpcrdma_regbuf object, or NULL. @@ -1469,22 +1473,10 @@ rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) /* fast path: all needed reps can be found on the free list */ wr = NULL; - spin_lock(&buf->rb_lock); while (needed) { - rep = list_first_entry_or_null(&buf->rb_recv_bufs, - struct rpcrdma_rep, rr_list); + rep = rpcrdma_rep_get_locked(buf); if (!rep) - break; - - list_del(&rep->rr_list); - rep->rr_recv_wr.next = wr; - wr = &rep->rr_recv_wr; - --needed; - } - spin_unlock(&buf->rb_lock); - - while (needed) { - rep = rpcrdma_rep_create(r_xprt, temp); + rep = rpcrdma_rep_create(r_xprt, temp); if (!rep) break; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 200d075bbe31..bd1befa83d24 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -47,6 +47,7 @@ #include /* atomic_t, etc */ #include /* struct kref */ #include /* struct work_struct */ +#include #include /* RDMA connection api */ #include /* RDMA verbs api */ @@ -200,7 +201,7 @@ struct rpcrdma_rep { struct rpc_rqst *rr_rqst; struct xdr_buf rr_hdrbuf; struct xdr_stream rr_stream; - struct list_head rr_list; + struct llist_node rr_node; struct ib_recv_wr rr_recv_wr; }; @@ -362,7 +363,6 @@ rpcrdma_mr_pop(struct list_head *list) struct rpcrdma_buffer { spinlock_t rb_lock; struct list_head rb_send_bufs; - struct list_head rb_recv_bufs; struct list_head rb_mrs; unsigned long rb_sc_head; @@ -373,6 +373,8 @@ struct rpcrdma_buffer { struct list_head rb_allreqs; struct list_head rb_all_mrs; + struct llist_head rb_free_reps; + u32 rb_max_requests; u32 rb_credits; /* most recent credit grant */ From 2a7f77c7be1b5adb69712e440e97e0c6fa7ebecb Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Aug 2019 18:49:30 -0400 Subject: [PATCH 156/721] xprtrdma: Clean up xprt_rdma_set_connect_timeout() Clean up: The function name should match the documenting comment. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/transport.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index f4763e8a6761..993b96ff6760 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -494,9 +494,9 @@ xprt_rdma_timer(struct rpc_xprt *xprt, struct rpc_task *task) * @reconnect_timeout: reconnect timeout after server disconnects * */ -static void xprt_rdma_tcp_set_connect_timeout(struct rpc_xprt *xprt, - unsigned long connect_timeout, - unsigned long reconnect_timeout) +static void xprt_rdma_set_connect_timeout(struct rpc_xprt *xprt, + unsigned long connect_timeout, + unsigned long reconnect_timeout) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); @@ -805,7 +805,7 @@ static const struct rpc_xprt_ops xprt_rdma_procs = { .send_request = xprt_rdma_send_request, .close = xprt_rdma_close, .destroy = xprt_rdma_destroy, - .set_connect_timeout = xprt_rdma_tcp_set_connect_timeout, + .set_connect_timeout = xprt_rdma_set_connect_timeout, .print_stats = xprt_rdma_print_stats, .enable_swap = xprt_rdma_enable_swap, .disable_swap = xprt_rdma_disable_swap, From df901c85cc28b538c62f6bc20b16a8bd05fcb756 Mon Sep 17 00:00:00 2001 From: Hou Zhiqiang Date: Sat, 13 Jul 2019 22:11:29 +0800 Subject: [PATCH 157/721] PCI: mobiveil: Fix the CPU base address setup in inbound window Current code erroneously sets-up the CPU base address through the parameter 'pci_addr', which is passed to initialize the CPU (AXI) base address of the inbound window where the controller maps the PCI address space into CPU physical address space; furthermore, it also truncates it by programming only the lower 32-bit value into the inbound CPU address register. Fix both issues by introducing a new parameter 'u64 cpu_addr' to initialize both lower 32-bit and upper 32-bit of the CPU physical base address mapping PCI inbound transactions into CPU (AXI) ones. Fixes: 9af6bcb11e12 ("PCI: mobiveil: Add Mobiveil PCIe Host Bridge IP driver") Signed-off-by: Hou Zhiqiang Signed-off-by: Lorenzo Pieralisi Reviewed-by: Minghuan Lian Reviewed-by: Subrahmanya Lingappa --- drivers/pci/controller/pcie-mobiveil.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/pci/controller/pcie-mobiveil.c b/drivers/pci/controller/pcie-mobiveil.c index 672e633601c7..a45a6447b01d 100644 --- a/drivers/pci/controller/pcie-mobiveil.c +++ b/drivers/pci/controller/pcie-mobiveil.c @@ -88,6 +88,7 @@ #define AMAP_CTRL_TYPE_MASK 3 #define PAB_EXT_PEX_AMAP_SIZEN(win) PAB_EXT_REG_ADDR(0xbef0, win) +#define PAB_EXT_PEX_AMAP_AXI_WIN(win) PAB_EXT_REG_ADDR(0xb4a0, win) #define PAB_PEX_AMAP_AXI_WIN(win) PAB_REG_ADDR(0x4ba4, win) #define PAB_PEX_AMAP_PEX_WIN_L(win) PAB_REG_ADDR(0x4ba8, win) #define PAB_PEX_AMAP_PEX_WIN_H(win) PAB_REG_ADDR(0x4bac, win) @@ -462,7 +463,7 @@ static int mobiveil_pcie_parse_dt(struct mobiveil_pcie *pcie) } static void program_ib_windows(struct mobiveil_pcie *pcie, int win_num, - u64 pci_addr, u32 type, u64 size) + u64 cpu_addr, u64 pci_addr, u32 type, u64 size) { u32 value; u64 size64 = ~(size - 1); @@ -482,7 +483,10 @@ static void program_ib_windows(struct mobiveil_pcie *pcie, int win_num, csr_writel(pcie, upper_32_bits(size64), PAB_EXT_PEX_AMAP_SIZEN(win_num)); - csr_writel(pcie, pci_addr, PAB_PEX_AMAP_AXI_WIN(win_num)); + csr_writel(pcie, lower_32_bits(cpu_addr), + PAB_PEX_AMAP_AXI_WIN(win_num)); + csr_writel(pcie, upper_32_bits(cpu_addr), + PAB_EXT_PEX_AMAP_AXI_WIN(win_num)); csr_writel(pcie, lower_32_bits(pci_addr), PAB_PEX_AMAP_PEX_WIN_L(win_num)); @@ -624,7 +628,7 @@ static int mobiveil_host_init(struct mobiveil_pcie *pcie) CFG_WINDOW_TYPE, resource_size(pcie->ob_io_res)); /* memory inbound translation window */ - program_ib_windows(pcie, WIN_NUM_0, 0, MEM_WINDOW_TYPE, IB_WIN_SIZE); + program_ib_windows(pcie, WIN_NUM_0, 0, 0, MEM_WINDOW_TYPE, IB_WIN_SIZE); /* Get the I/O and memory ranges from DT */ resource_list_for_each_entry(win, &pcie->resources) { From 17d47f93bc69cc629aa6a20c0ac7b20c9965c116 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Aug 2019 18:50:16 -0400 Subject: [PATCH 158/721] xprtrdma: Fix bc_max_slots return value For the moment the returned value just happens to be correct because the current backchannel server implementation does not vary the number of credits it offers. The spec does permit this value to change during the lifetime of a connection, however. The actual maximum is fixed for all RPC/RDMA transports, because each transport instance has to pre-allocate the resources for processing BC requests. That's the value that should be returned. Fixes: 7402a4fedc2b ("SUNRPC: Fix up backchannel slot table ... ") Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/backchannel.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 59e624b1d7a0..50e075fcdd8f 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -54,9 +54,7 @@ size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt) unsigned int xprt_rdma_bc_max_slots(struct rpc_xprt *xprt) { - struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - - return r_xprt->rx_buf.rb_bc_srv_max_requests; + return RPCRDMA_BACKWARD_WRS >> 1; } static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) From 1738de336ebc9f8d4bb1b3126c5417a5923a660a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Aug 2019 18:51:03 -0400 Subject: [PATCH 159/721] xprtrdma: Inline XDR chunk encoder functions Micro-optimization: Save the cost of three function calls during transport header encoding. These were "noinline" before to generate more meaningful call stacks during debugging, but this code is now pretty stable. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index ffeb4dfebd46..67e1684aee6d 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -382,9 +382,10 @@ static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt, * * Only a single @pos value is currently supported. */ -static noinline int -rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, - struct rpc_rqst *rqst, enum rpcrdma_chunktype rtype) +static int rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, + struct rpc_rqst *rqst, + enum rpcrdma_chunktype rtype) { struct xdr_stream *xdr = &req->rl_stream; struct rpcrdma_mr_seg *seg; @@ -436,9 +437,10 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, * * Only a single Write chunk is currently supported. */ -static noinline int -rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, - struct rpc_rqst *rqst, enum rpcrdma_chunktype wtype) +static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, + struct rpc_rqst *rqst, + enum rpcrdma_chunktype wtype) { struct xdr_stream *xdr = &req->rl_stream; struct rpcrdma_mr_seg *seg; @@ -498,9 +500,10 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, * Returns zero on success, or a negative errno if a failure occurred. * @xdr is advanced to the next position in the stream. */ -static noinline int -rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, - struct rpc_rqst *rqst, enum rpcrdma_chunktype wtype) +static int rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, + struct rpc_rqst *rqst, + enum rpcrdma_chunktype wtype) { struct xdr_stream *xdr = &req->rl_stream; struct rpcrdma_mr_seg *seg; From 435eba4ae0692e2f3d62988f8648efd65c935b6a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Aug 2019 18:51:49 -0400 Subject: [PATCH 160/721] xprtrdma: Optimize rpcrdma_post_recvs() Micro-optimization: In rpcrdma_post_recvs, since commit e340c2d6ef2a ("xprtrdma: Reduce the doorbell rate (Receive)"), the common case is to return without doing anything. Found with perf. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index db90083ed35b..ac2abf4578b9 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1465,7 +1465,7 @@ rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) count = 0; needed = buf->rb_credits + (buf->rb_bc_srv_max_requests << 1); - if (ep->rep_receive_count > needed) + if (likely(ep->rep_receive_count > needed)) goto out; needed -= ep->rep_receive_count; if (!temp) From 1e672e3644940d83bd94e7cb46bac6bb3627de02 Mon Sep 17 00:00:00 2001 From: Wenwen Wang Date: Tue, 20 Aug 2019 22:21:21 -0500 Subject: [PATCH 161/721] NFSv4: Fix a memory leak bug In nfs4_try_migration(), if nfs4_begin_drain_session() fails, the previously allocated 'page' and 'locations' are not deallocated, leading to memory leaks. To fix this issue, go to the 'out' label to free 'page' and 'locations' before returning the error. Signed-off-by: Wenwen Wang Signed-off-by: Anna Schumaker --- fs/nfs/nfs4state.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index cad4e064b328..e916aba7a799 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -2095,8 +2095,10 @@ static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred } status = nfs4_begin_drain_session(clp); - if (status != 0) - return status; + if (status != 0) { + result = status; + goto out; + } status = nfs4_replace_transport(server, locations); if (status != 0) { From 416dacb819f59180e4d86a5550052033ebb6d72c Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Wed, 21 Aug 2019 13:27:12 -0400 Subject: [PATCH 162/721] HID: hidraw: Fix invalid read in hidraw_ioctl The syzbot fuzzer has reported a pair of problems in the hidraw_ioctl() function: slab-out-of-bounds read and use-after-free read. An example of the first: BUG: KASAN: slab-out-of-bounds in strlen+0x79/0x90 lib/string.c:525 Read of size 1 at addr ffff8881c8035f38 by task syz-executor.4/2833 CPU: 1 PID: 2833 Comm: syz-executor.4 Not tainted 5.3.0-rc2+ #1 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0xca/0x13e lib/dump_stack.c:113 print_address_description+0x6a/0x32c mm/kasan/report.c:351 __kasan_report.cold+0x1a/0x33 mm/kasan/report.c:482 kasan_report+0xe/0x12 mm/kasan/common.c:612 strlen+0x79/0x90 lib/string.c:525 strlen include/linux/string.h:281 [inline] hidraw_ioctl+0x245/0xae0 drivers/hid/hidraw.c:446 vfs_ioctl fs/ioctl.c:46 [inline] file_ioctl fs/ioctl.c:509 [inline] do_vfs_ioctl+0xd2d/0x1330 fs/ioctl.c:696 ksys_ioctl+0x9b/0xc0 fs/ioctl.c:713 __do_sys_ioctl fs/ioctl.c:720 [inline] __se_sys_ioctl fs/ioctl.c:718 [inline] __x64_sys_ioctl+0x6f/0xb0 fs/ioctl.c:718 do_syscall_64+0xb7/0x580 arch/x86/entry/common.c:296 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x459829 Code: fd b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 cb b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007f7a68f6dc78 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 0000000000459829 RDX: 0000000000000000 RSI: 0000000080404805 RDI: 0000000000000004 RBP: 000000000075bf20 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00007f7a68f6e6d4 R13: 00000000004c21de R14: 00000000004d5620 R15: 00000000ffffffff The two problems have the same cause: hidraw_ioctl() fails to test whether the device has been removed. This patch adds the missing test. Reported-and-tested-by: syzbot+5a6c4ec678a0c6ee84ba@syzkaller.appspotmail.com Signed-off-by: Alan Stern CC: Signed-off-by: Jiri Kosina --- drivers/hid/hidraw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hid/hidraw.c b/drivers/hid/hidraw.c index 006bd6f4f653..62ef47a730b0 100644 --- a/drivers/hid/hidraw.c +++ b/drivers/hid/hidraw.c @@ -370,7 +370,7 @@ static long hidraw_ioctl(struct file *file, unsigned int cmd, mutex_lock(&minors_lock); dev = hidraw_table[minor]; - if (!dev) { + if (!dev || !dev->exist) { ret = -ENODEV; goto out; } From 5f9242775bb61f390f0885f23fc16397262c7538 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Tue, 20 Aug 2019 16:00:21 -0400 Subject: [PATCH 163/721] HID: logitech: Fix general protection fault caused by Logitech driver The syzbot fuzzer found a general protection fault in the HID subsystem: kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] SMP KASAN CPU: 0 PID: 3715 Comm: syz-executor.3 Not tainted 5.2.0-rc6+ #15 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:__pm_runtime_resume+0x49/0x180 drivers/base/power/runtime.c:1069 Code: ed 74 d5 fe 45 85 ed 0f 85 9a 00 00 00 e8 6f 73 d5 fe 48 8d bd c1 02 00 00 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <0f> b6 04 02 48 89 fa 83 e2 07 38 d0 7f 08 84 c0 0f 85 fe 00 00 00 RSP: 0018:ffff8881d99d78e0 EFLAGS: 00010202 RAX: dffffc0000000000 RBX: 0000000000000020 RCX: ffffc90003f3f000 RDX: 0000000416d8686d RSI: ffffffff82676841 RDI: 00000020b6c3436a RBP: 00000020b6c340a9 R08: ffff8881c6d64800 R09: fffffbfff0e84c25 R10: ffff8881d99d7940 R11: ffffffff87426127 R12: 0000000000000004 R13: 0000000000000000 R14: ffff8881d9b94000 R15: ffffffff897f9048 FS: 00007f047f542700(0000) GS:ffff8881db200000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000001b30f21000 CR3: 00000001ca032000 CR4: 00000000001406f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: pm_runtime_get_sync include/linux/pm_runtime.h:226 [inline] usb_autopm_get_interface+0x1b/0x50 drivers/usb/core/driver.c:1707 usbhid_power+0x7c/0xe0 drivers/hid/usbhid/hid-core.c:1234 hid_hw_power include/linux/hid.h:1038 [inline] hidraw_open+0x20d/0x740 drivers/hid/hidraw.c:282 chrdev_open+0x219/0x5c0 fs/char_dev.c:413 do_dentry_open+0x497/0x1040 fs/open.c:778 do_last fs/namei.c:3416 [inline] path_openat+0x1430/0x3ff0 fs/namei.c:3533 do_filp_open+0x1a1/0x280 fs/namei.c:3563 do_sys_open+0x3c0/0x580 fs/open.c:1070 do_syscall_64+0xb7/0x560 arch/x86/entry/common.c:301 entry_SYSCALL_64_after_hwframe+0x49/0xbe It turns out the fault was caused by a bug in the HID Logitech driver, which violates the requirement that every pathway calling hid_hw_start() must also call hid_hw_stop(). This patch fixes the bug by making sure the requirement is met. Reported-and-tested-by: syzbot+3cbe5cd105d2ad56a1df@syzkaller.appspotmail.com Signed-off-by: Alan Stern CC: Signed-off-by: Jiri Kosina --- drivers/hid/hid-lg.c | 10 ++++++---- drivers/hid/hid-lg4ff.c | 1 - 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/hid/hid-lg.c b/drivers/hid/hid-lg.c index 5008a3dc28f4..0dc7cdfc56f7 100644 --- a/drivers/hid/hid-lg.c +++ b/drivers/hid/hid-lg.c @@ -818,7 +818,7 @@ static int lg_probe(struct hid_device *hdev, const struct hid_device_id *id) if (!buf) { ret = -ENOMEM; - goto err_free; + goto err_stop; } ret = hid_hw_raw_request(hdev, buf[0], buf, sizeof(cbuf), @@ -850,9 +850,12 @@ static int lg_probe(struct hid_device *hdev, const struct hid_device_id *id) ret = lg4ff_init(hdev); if (ret) - goto err_free; + goto err_stop; return 0; + +err_stop: + hid_hw_stop(hdev); err_free: kfree(drv_data); return ret; @@ -863,8 +866,7 @@ static void lg_remove(struct hid_device *hdev) struct lg_drv_data *drv_data = hid_get_drvdata(hdev); if (drv_data->quirks & LG_FF4) lg4ff_deinit(hdev); - else - hid_hw_stop(hdev); + hid_hw_stop(hdev); kfree(drv_data); } diff --git a/drivers/hid/hid-lg4ff.c b/drivers/hid/hid-lg4ff.c index cefba038520c..03f0220062ca 100644 --- a/drivers/hid/hid-lg4ff.c +++ b/drivers/hid/hid-lg4ff.c @@ -1477,7 +1477,6 @@ int lg4ff_deinit(struct hid_device *hid) } } #endif - hid_hw_stop(hid); drv_data->device_props = NULL; kfree(entry); From 48c058543cbbb6b34114bbf8f0967e86dcd06b14 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Wed, 14 Aug 2019 15:27:00 -0400 Subject: [PATCH 164/721] NFS: Add an nfs4_call_sync_custom() function There are a few cases where we need to manually configure the rpc_task_setup structure to get the behavior we want. Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 1406858bae6c..e5b6499c0b8b 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1073,14 +1073,26 @@ static const struct rpc_call_ops nfs40_call_sync_ops = { .rpc_call_done = nfs40_call_sync_done, }; +static int nfs4_call_sync_custom(struct rpc_task_setup *task_setup) +{ + int ret; + struct rpc_task *task; + + task = rpc_run_task(task_setup); + if (IS_ERR(task)) + return PTR_ERR(task); + + ret = task->tk_status; + rpc_put_task(task); + return ret; +} + static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, struct nfs_server *server, struct rpc_message *msg, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res) { - int ret; - struct rpc_task *task; struct nfs_client *clp = server->nfs_client; struct nfs4_call_sync_data data = { .seq_server = server, @@ -1094,14 +1106,7 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, .callback_data = &data }; - task = rpc_run_task(&task_setup); - if (IS_ERR(task)) - ret = PTR_ERR(task); - else { - ret = task->tk_status; - rpc_put_task(task); - } - return ret; + return nfs4_call_sync_custom(&task_setup); } int nfs4_call_sync(struct rpc_clnt *clnt, From dae40965d51e563603222aa8d4fad7ab3cec0e2d Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Wed, 14 Aug 2019 15:28:28 -0400 Subject: [PATCH 165/721] NFS: Have nfs4_proc_setclientid() call nfs4_call_sync_custom() Rather than running the task manually Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index e5b6499c0b8b..234312240f33 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6023,7 +6023,6 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, .rpc_resp = res, .rpc_cred = cred, }; - struct rpc_task *task; struct rpc_task_setup task_setup_data = { .rpc_client = clp->cl_rpcclient, .rpc_message = &msg, @@ -6056,17 +6055,12 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, dprintk("NFS call setclientid auth=%s, '%s'\n", clp->cl_rpcclient->cl_auth->au_ops->au_name, clp->cl_owner_id); - task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) { - status = PTR_ERR(task); - goto out; - } - status = task->tk_status; + + status = nfs4_call_sync_custom(&task_setup_data); if (setclientid.sc_cred) { clp->cl_acceptor = rpcauth_stringify_acceptor(setclientid.sc_cred); put_rpccred(setclientid.sc_cred); } - rpc_put_task(task); out: trace_nfs4_setclientid(clp, status); dprintk("NFS reply setclientid: %d\n", status); From 50493364e784b6e4077e41ab10eba4f798d13d9a Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Wed, 14 Aug 2019 15:30:16 -0400 Subject: [PATCH 166/721] NFS: Have _nfs4_proc_secinfo() call nfs4_call_sync_custom() We do this to set the RPC_TASK_NO_ROUND_ROBIN flag in the task_setup structure Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 234312240f33..de2b3fd806ef 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7644,6 +7644,8 @@ int nfs4_proc_fsid_present(struct inode *inode, const struct cred *cred) static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct nfs4_secinfo_flavors *flavors, bool use_integrity) { int status; + struct rpc_clnt *clnt = NFS_SERVER(dir)->client; + struct nfs_client *clp = NFS_SERVER(dir)->nfs_client; struct nfs4_secinfo_arg args = { .dir_fh = NFS_FH(dir), .name = name, @@ -7656,26 +7658,37 @@ static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct .rpc_argp = &args, .rpc_resp = &res, }; - struct rpc_clnt *clnt = NFS_SERVER(dir)->client; + struct nfs4_call_sync_data data = { + .seq_server = NFS_SERVER(dir), + .seq_args = &args.seq_args, + .seq_res = &res.seq_res, + }; + struct rpc_task_setup task_setup = { + .rpc_client = clnt, + .rpc_message = &msg, + .callback_ops = clp->cl_mvops->call_sync_ops, + .callback_data = &data, + .flags = RPC_TASK_NO_ROUND_ROBIN, + }; const struct cred *cred = NULL; if (use_integrity) { - clnt = NFS_SERVER(dir)->nfs_client->cl_rpcclient; - cred = nfs4_get_clid_cred(NFS_SERVER(dir)->nfs_client); + clnt = clp->cl_rpcclient; + task_setup.rpc_client = clnt; + + cred = nfs4_get_clid_cred(clp); msg.rpc_cred = cred; } dprintk("NFS call secinfo %s\n", name->name); - nfs4_state_protect(NFS_SERVER(dir)->nfs_client, - NFS_SP4_MACH_CRED_SECINFO, &clnt, &msg); + nfs4_state_protect(clp, NFS_SP4_MACH_CRED_SECINFO, &clnt, &msg); + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 0); + status = nfs4_call_sync_custom(&task_setup); - status = nfs4_call_sync(clnt, NFS_SERVER(dir), &msg, &args.seq_args, - &res.seq_res, RPC_TASK_NO_ROUND_ROBIN); dprintk("NFS reply secinfo: %d\n", status); put_cred(cred); - return status; } From 4c952e3d1b0dc1835fda7f191b6f92e28b2fa435 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Wed, 14 Aug 2019 15:46:48 -0400 Subject: [PATCH 167/721] NFS: Have nfs41_proc_reclaim_complete() call nfs4_call_sync_custom() An async call followed by an rpc_wait_for_completion() is basically the same as a synchronous call, so we can use nfs4_call_sync_custom() to keep our custom callback ops and the RPC_TASK_NO_ROUND_ROBIN flag. Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index de2b3fd806ef..1b7863ec12d3 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -8857,7 +8857,6 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp, const struct cred *cred) { struct nfs4_reclaim_complete_data *calldata; - struct rpc_task *task; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RECLAIM_COMPLETE], .rpc_cred = cred, @@ -8866,7 +8865,7 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp, .rpc_client = clp->cl_rpcclient, .rpc_message = &msg, .callback_ops = &nfs4_reclaim_complete_call_ops, - .flags = RPC_TASK_ASYNC | RPC_TASK_NO_ROUND_ROBIN, + .flags = RPC_TASK_NO_ROUND_ROBIN, }; int status = -ENOMEM; @@ -8881,15 +8880,7 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp, msg.rpc_argp = &calldata->arg; msg.rpc_resp = &calldata->res; task_setup_data.callback_data = calldata; - task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) { - status = PTR_ERR(task); - goto out; - } - status = rpc_wait_for_completion_task(task); - if (status == 0) - status = task->tk_status; - rpc_put_task(task); + status = nfs4_call_sync_custom(&task_setup_data); out: dprintk("<-- %s status=%d\n", __func__, status); return status; From cc15e24a3af249455158bdb2ad02b9685eb4d6e9 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Wed, 14 Aug 2019 16:22:31 -0400 Subject: [PATCH 168/721] NFS: Have nfs41_proc_secinfo_no_name() call nfs4_call_sync_custom() We need to use the custom rpc_task_setup here to set the RPC_TASK_NO_ROUND_ROBIN flag on the RPC call. Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 1b7863ec12d3..df12af8f6b36 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -9365,18 +9365,32 @@ _nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, .rpc_resp = &res, }; struct rpc_clnt *clnt = server->client; + struct nfs4_call_sync_data data = { + .seq_server = server, + .seq_args = &args.seq_args, + .seq_res = &res.seq_res, + }; + struct rpc_task_setup task_setup = { + .rpc_client = server->client, + .rpc_message = &msg, + .callback_ops = server->nfs_client->cl_mvops->call_sync_ops, + .callback_data = &data, + .flags = RPC_TASK_NO_ROUND_ROBIN, + }; const struct cred *cred = NULL; int status; if (use_integrity) { clnt = server->nfs_client->cl_rpcclient; + task_setup.rpc_client = clnt; + cred = nfs4_get_clid_cred(server->nfs_client); msg.rpc_cred = cred; } dprintk("--> %s\n", __func__); - status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, - &res.seq_res, RPC_TASK_NO_ROUND_ROBIN); + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 0); + status = nfs4_call_sync_custom(&task_setup); dprintk("<-- %s status=%d\n", __func__, status); put_cred(cred); From f836b27ecad98770bc53600d1ebd6533c921a4e7 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Mon, 19 Aug 2019 10:18:28 -0400 Subject: [PATCH 169/721] NFS: Have nfs4_proc_get_lease_time() call nfs4_call_sync_custom() This removes some code duplication, since both functions were doing the same thing. Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index df12af8f6b36..00c7a92e3d6b 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -8356,7 +8356,6 @@ static const struct rpc_call_ops nfs4_get_lease_time_ops = { int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) { - struct rpc_task *task; struct nfs4_get_lease_time_args args; struct nfs4_get_lease_time_res res = { .lr_fsinfo = fsinfo, @@ -8378,17 +8377,9 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) .callback_data = &data, .flags = RPC_TASK_TIMEOUT, }; - int status; nfs4_init_sequence(&args.la_seq_args, &res.lr_seq_res, 0, 1); - task = rpc_run_task(&task_setup); - - if (IS_ERR(task)) - return PTR_ERR(task); - - status = task->tk_status; - rpc_put_task(task); - return status; + return nfs4_call_sync_custom(&task_setup); } #ifdef CONFIG_NFS_V4_1 From 362c571b92e2fa734c1f2c1e303d1dc27f12ec95 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Mon, 12 Aug 2019 18:27:40 +0200 Subject: [PATCH 170/721] HID: wacom: do not call hid_set_drvdata(hdev, NULL) This is a common pattern in the HID drivers to reset the drvdata. However, this is actually already handled by driver core, so there is no need to do it manually. Signed-off-by: Benjamin Tissoires Acked-by: Jason Gerecke --- drivers/hid/wacom_sys.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/drivers/hid/wacom_sys.c b/drivers/hid/wacom_sys.c index 602219a8710d..5ded94b7bf68 100644 --- a/drivers/hid/wacom_sys.c +++ b/drivers/hid/wacom_sys.c @@ -2719,14 +2719,12 @@ static int wacom_probe(struct hid_device *hdev, wacom_wac->features = *((struct wacom_features *)id->driver_data); features = &wacom_wac->features; - if (features->check_for_hid_type && features->hid_type != hdev->type) { - error = -ENODEV; - goto fail; - } + if (features->check_for_hid_type && features->hid_type != hdev->type) + return -ENODEV; error = kfifo_alloc(&wacom_wac->pen_fifo, WACOM_PKGLEN_MAX, GFP_KERNEL); if (error) - goto fail; + return error; wacom_wac->hid_data.inputmode = -1; wacom_wac->mode_report = -1; @@ -2744,12 +2742,12 @@ static int wacom_probe(struct hid_device *hdev, error = hid_parse(hdev); if (error) { hid_err(hdev, "parse failed\n"); - goto fail; + return error; } error = wacom_parse_and_register(wacom, false); if (error) - goto fail; + return error; if (hdev->bus == BUS_BLUETOOTH) { error = device_create_file(&hdev->dev, &dev_attr_speed); @@ -2760,10 +2758,6 @@ static int wacom_probe(struct hid_device *hdev, } return 0; - -fail: - hid_set_drvdata(hdev, NULL); - return error; } static void wacom_remove(struct hid_device *hdev) @@ -2792,8 +2786,6 @@ static void wacom_remove(struct hid_device *hdev) wacom_release_resources(wacom); kfifo_free(&wacom_wac->pen_fifo); - - hid_set_drvdata(hdev, NULL); } #ifdef CONFIG_PM From 87fcb6a69e54527ccaeb5878168cccdba9f6b6ae Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Mon, 12 Aug 2019 18:27:39 +0200 Subject: [PATCH 171/721] HID: do not call hid_set_drvdata(hdev, NULL) in drivers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a common pattern in the HID drivers to reset the drvdata. Some do it properly, some do it only in case of failure. But, this is actually already handled by driver core, so there is no need to do it manually. [for hid-sensor-hub.c] Acked-by: Srinivas Pandruvada [For hid-picolcd_core.c] Acked-by: Bruno Prémont Signed-off-by: Benjamin Tissoires --- drivers/hid/hid-cougar.c | 6 ++---- drivers/hid/hid-gfrm.c | 7 ------- drivers/hid/hid-lenovo.c | 2 -- drivers/hid/hid-picolcd_core.c | 7 +------ drivers/hid/hid-sensor-hub.c | 1 - 5 files changed, 3 insertions(+), 20 deletions(-) diff --git a/drivers/hid/hid-cougar.c b/drivers/hid/hid-cougar.c index e0bb7b34f3a4..4ff3bc1d25e2 100644 --- a/drivers/hid/hid-cougar.c +++ b/drivers/hid/hid-cougar.c @@ -207,7 +207,7 @@ static int cougar_probe(struct hid_device *hdev, error = hid_parse(hdev); if (error) { hid_err(hdev, "parse failed\n"); - goto fail; + return error; } if (hdev->collection->usage == COUGAR_VENDOR_USAGE) { @@ -219,7 +219,7 @@ static int cougar_probe(struct hid_device *hdev, error = hid_hw_start(hdev, connect_mask); if (error) { hid_err(hdev, "hw start failed\n"); - goto fail; + return error; } error = cougar_bind_shared_data(hdev, cougar); @@ -249,8 +249,6 @@ static int cougar_probe(struct hid_device *hdev, fail_stop_and_cleanup: hid_hw_stop(hdev); -fail: - hid_set_drvdata(hdev, NULL); return error; } diff --git a/drivers/hid/hid-gfrm.c b/drivers/hid/hid-gfrm.c index 86c317320bf2..699186ff2349 100644 --- a/drivers/hid/hid-gfrm.c +++ b/drivers/hid/hid-gfrm.c @@ -123,12 +123,6 @@ static int gfrm_probe(struct hid_device *hdev, const struct hid_device_id *id) return ret; } -static void gfrm_remove(struct hid_device *hdev) -{ - hid_hw_stop(hdev); - hid_set_drvdata(hdev, NULL); -} - static const struct hid_device_id gfrm_devices[] = { { HID_BLUETOOTH_DEVICE(0x58, 0x2000), .driver_data = GFRM100 }, @@ -142,7 +136,6 @@ static struct hid_driver gfrm_driver = { .name = "gfrm", .id_table = gfrm_devices, .probe = gfrm_probe, - .remove = gfrm_remove, .input_mapping = gfrm_input_mapping, .raw_event = gfrm_raw_event, .input_configured = gfrm_input_configured, diff --git a/drivers/hid/hid-lenovo.c b/drivers/hid/hid-lenovo.c index 364bc7f11d9d..96fa2a2c2cd3 100644 --- a/drivers/hid/hid-lenovo.c +++ b/drivers/hid/hid-lenovo.c @@ -866,8 +866,6 @@ static void lenovo_remove_tpkbd(struct hid_device *hdev) led_classdev_unregister(&data_pointer->led_micmute); led_classdev_unregister(&data_pointer->led_mute); - - hid_set_drvdata(hdev, NULL); } static void lenovo_remove_cptkbd(struct hid_device *hdev) diff --git a/drivers/hid/hid-picolcd_core.c b/drivers/hid/hid-picolcd_core.c index 5f7a39a5d4af..1b5c63241af0 100644 --- a/drivers/hid/hid-picolcd_core.c +++ b/drivers/hid/hid-picolcd_core.c @@ -534,8 +534,7 @@ static int picolcd_probe(struct hid_device *hdev, data = kzalloc(sizeof(struct picolcd_data), GFP_KERNEL); if (data == NULL) { hid_err(hdev, "can't allocate space for Minibox PicoLCD device data\n"); - error = -ENOMEM; - goto err_no_cleanup; + return -ENOMEM; } spin_lock_init(&data->lock); @@ -597,9 +596,6 @@ static int picolcd_probe(struct hid_device *hdev, hid_hw_stop(hdev); err_cleanup_data: kfree(data); -err_no_cleanup: - hid_set_drvdata(hdev, NULL); - return error; } @@ -635,7 +631,6 @@ static void picolcd_remove(struct hid_device *hdev) picolcd_exit_cir(data); picolcd_exit_keys(data); - hid_set_drvdata(hdev, NULL); mutex_destroy(&data->mutex); /* Finally, clean up the picolcd data itself */ kfree(data); diff --git a/drivers/hid/hid-sensor-hub.c b/drivers/hid/hid-sensor-hub.c index be92a6f79687..94c7398b5c27 100644 --- a/drivers/hid/hid-sensor-hub.c +++ b/drivers/hid/hid-sensor-hub.c @@ -742,7 +742,6 @@ static void sensor_hub_remove(struct hid_device *hdev) } spin_unlock_irqrestore(&data->lock, flags); mfd_remove_devices(&hdev->dev); - hid_set_drvdata(hdev, NULL); mutex_destroy(&data->mutex); } From c23e2043d5f7177c3f40da70663b2f3a1de82f5c Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Mon, 12 Aug 2019 18:23:25 +0200 Subject: [PATCH 172/721] HID: multitouch: do not filter mice nodes It was a good idea at the time to not create a mouse node for the multitouch touchscreens, but: - touchscreens following the Win 8 protocol should not have this disturbing mouse node anymore, or if they have, it should be used for something else (like a joystick attached to the screen) - touchpads have it, and they should not use it unless there is a bug, but when the laptop has a trackstick, the data are reported through this mouse node. So instead of whitelisting all of the devices that have a need for the mouse node, just export it. hid-input.c will append a suffix to it ('Mouse'), so users will eventually see if something goes wrong. Signed-off-by: Benjamin Tissoires --- drivers/hid/hid-multitouch.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/hid/hid-multitouch.c b/drivers/hid/hid-multitouch.c index b603c14d043b..0d190d93ca7c 100644 --- a/drivers/hid/hid-multitouch.c +++ b/drivers/hid/hid-multitouch.c @@ -263,7 +263,8 @@ static const struct mt_class mt_classes[] = { MT_QUIRK_HOVERING | MT_QUIRK_CONTACT_CNT_ACCURATE | MT_QUIRK_STICKY_FINGERS | - MT_QUIRK_WIN8_PTP_BUTTONS }, + MT_QUIRK_WIN8_PTP_BUTTONS, + .export_all_inputs = true }, { .name = MT_CLS_EXPORT_ALL_INPUTS, .quirks = MT_QUIRK_ALWAYS_VALID | MT_QUIRK_CONTACT_CNT_ACCURATE, From 69ecd44d68a7bf4caeda39825af720362db69233 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Mon, 12 Aug 2019 18:23:26 +0200 Subject: [PATCH 173/721] HID: multitouch: add support for the Smart Tech panel This panel is not very friendly to us: it exposes multiple multitouch collections, some of them being of logical application stylus. Usually, a device has only one report per application, and that is what I assumed in commit 8dfe14b3b47f ("HID: multitouch: ditch mt_report_id") To avoid breaking all working device, add a new class and a new quirk for that situation. Reported-and-tested-by: Matthias Fend Signed-off-by: Benjamin Tissoires --- drivers/hid/hid-multitouch.c | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/drivers/hid/hid-multitouch.c b/drivers/hid/hid-multitouch.c index 0d190d93ca7c..3cfeb1629f79 100644 --- a/drivers/hid/hid-multitouch.c +++ b/drivers/hid/hid-multitouch.c @@ -68,6 +68,7 @@ MODULE_LICENSE("GPL"); #define MT_QUIRK_STICKY_FINGERS BIT(16) #define MT_QUIRK_ASUS_CUSTOM_UP BIT(17) #define MT_QUIRK_WIN8_PTP_BUTTONS BIT(18) +#define MT_QUIRK_SEPARATE_APP_REPORT BIT(19) #define MT_INPUTMODE_TOUCHSCREEN 0x02 #define MT_INPUTMODE_TOUCHPAD 0x03 @@ -103,6 +104,7 @@ struct mt_usages { struct mt_application { struct list_head list; unsigned int application; + unsigned int report_id; struct list_head mt_usages; /* mt usages list */ __s32 quirks; @@ -203,6 +205,7 @@ static void mt_post_parse(struct mt_device *td, struct mt_application *app); #define MT_CLS_VTL 0x0110 #define MT_CLS_GOOGLE 0x0111 #define MT_CLS_RAZER_BLADE_STEALTH 0x0112 +#define MT_CLS_SMART_TECH 0x0113 #define MT_DEFAULT_MAXCONTACT 10 #define MT_MAX_MAXCONTACT 250 @@ -354,6 +357,12 @@ static const struct mt_class mt_classes[] = { MT_QUIRK_CONTACT_CNT_ACCURATE | MT_QUIRK_WIN8_PTP_BUTTONS, }, + { .name = MT_CLS_SMART_TECH, + .quirks = MT_QUIRK_ALWAYS_VALID | + MT_QUIRK_IGNORE_DUPLICATES | + MT_QUIRK_CONTACT_CNT_ACCURATE | + MT_QUIRK_SEPARATE_APP_REPORT, + }, { } }; @@ -510,8 +519,9 @@ static struct mt_usages *mt_allocate_usage(struct hid_device *hdev, } static struct mt_application *mt_allocate_application(struct mt_device *td, - unsigned int application) + struct hid_report *report) { + unsigned int application = report->application; struct mt_application *mt_application; mt_application = devm_kzalloc(&td->hdev->dev, sizeof(*mt_application), @@ -536,6 +546,7 @@ static struct mt_application *mt_allocate_application(struct mt_device *td, mt_application->scantime = DEFAULT_ZERO; mt_application->raw_cc = DEFAULT_ZERO; mt_application->quirks = td->mtclass.quirks; + mt_application->report_id = report->id; list_add_tail(&mt_application->list, &td->applications); @@ -543,19 +554,23 @@ static struct mt_application *mt_allocate_application(struct mt_device *td, } static struct mt_application *mt_find_application(struct mt_device *td, - unsigned int application) + struct hid_report *report) { + unsigned int application = report->application; struct mt_application *tmp, *mt_application = NULL; list_for_each_entry(tmp, &td->applications, list) { if (application == tmp->application) { - mt_application = tmp; - break; + if (!(td->mtclass.quirks & MT_QUIRK_SEPARATE_APP_REPORT) || + tmp->report_id == report->id) { + mt_application = tmp; + break; + } } } if (!mt_application) - mt_application = mt_allocate_application(td, application); + mt_application = mt_allocate_application(td, report); return mt_application; } @@ -572,7 +587,7 @@ static struct mt_report_data *mt_allocate_report_data(struct mt_device *td, return NULL; rdata->report = report; - rdata->application = mt_find_application(td, report->application); + rdata->application = mt_find_application(td, report); if (!rdata->application) { devm_kfree(&td->hdev->dev, rdata); @@ -1562,6 +1577,9 @@ static int mt_input_configured(struct hid_device *hdev, struct hid_input *hi) case HID_VD_ASUS_CUSTOM_MEDIA_KEYS: suffix = "Custom Media Keys"; break; + case HID_DG_PEN: + suffix = "Stylus"; + break; default: suffix = "UNKNOWN"; break; @@ -2023,6 +2041,10 @@ static const struct hid_device_id mt_devices[] = { HID_DEVICE(BUS_I2C, HID_GROUP_MULTITOUCH_WIN_8, USB_VENDOR_ID_SYNAPTICS, 0x8323) }, + /* Smart Tech panels */ + { .driver_data = MT_CLS_SMART_TECH, + MT_USB_DEVICE(0x0b8c, 0x0092)}, + /* Stantum panels */ { .driver_data = MT_CLS_CONFIDENCE, MT_USB_DEVICE(USB_VENDOR_ID_STANTUM_STM, From ee4ea764ea03253016034603cab352c0798ce173 Mon Sep 17 00:00:00 2001 From: Hou Zhiqiang Date: Tue, 20 Aug 2019 07:28:43 +0000 Subject: [PATCH 174/721] dt-bindings: PCI: designware: Remove the num-lanes from Required properties The num-lanes is not a mandatory property, e.g. on FSL Layerscape SoCs, the PCIe link training is completed automatically based on the selected SerDes protocol, it does not need the num-lanes to set-up the link width. Currently it is both a Required and Optional property, let's remove it from the Required properties. Signed-off-by: Hou Zhiqiang Signed-off-by: Lorenzo Pieralisi Reviewed-by: Andrew Murray --- Documentation/devicetree/bindings/pci/designware-pcie.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/Documentation/devicetree/bindings/pci/designware-pcie.txt b/Documentation/devicetree/bindings/pci/designware-pcie.txt index 5561a1c060d0..bd880df39a79 100644 --- a/Documentation/devicetree/bindings/pci/designware-pcie.txt +++ b/Documentation/devicetree/bindings/pci/designware-pcie.txt @@ -11,7 +11,6 @@ Required properties: the ATU address space. (The old way of getting the configuration address space from "ranges" is deprecated and should be avoided.) -- num-lanes: number of lanes to use RC mode: - #address-cells: set to <3> - #size-cells: set to <2> From 66de33f09fd97201847de7e1e2ec8a117242e1d6 Mon Sep 17 00:00:00 2001 From: Hou Zhiqiang Date: Tue, 20 Aug 2019 07:28:49 +0000 Subject: [PATCH 175/721] PCI: dwc: Return directly when num-lanes is not found The num-lanes is optional since it is not needed on some platforms that bring up the link in firmware. The link programming is based on the num-lanes properties (which is optional); if it is not present code must return instead of fiddling with the lanes value to print an error message. Signed-off-by: Hou Zhiqiang Signed-off-by: Lorenzo Pieralisi Reviewed-by: Andrew Murray --- drivers/pci/controller/dwc/pcie-designware.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-designware.c b/drivers/pci/controller/dwc/pcie-designware.c index 7d25102c304c..0a89bfd1636e 100644 --- a/drivers/pci/controller/dwc/pcie-designware.c +++ b/drivers/pci/controller/dwc/pcie-designware.c @@ -423,8 +423,10 @@ void dw_pcie_setup(struct dw_pcie *pci) ret = of_property_read_u32(np, "num-lanes", &lanes); - if (ret) - lanes = 0; + if (ret) { + dev_dbg(pci->dev, "property num-lanes isn't found\n"); + return; + } /* Set the number of lanes */ val = dw_pcie_readl_dbi(pci, PCIE_PORT_LINK_CONTROL); From 568adba9eb201875bc561745e0f19b831b7e2bbc Mon Sep 17 00:00:00 2001 From: Hou Zhiqiang Date: Tue, 20 Aug 2019 07:28:55 +0000 Subject: [PATCH 176/721] ARM: dts: ls1021a: Remove num-lanes property from PCIe nodes Remove the num-lanes property to avoid the driver setting the link width. On FSL Layerscape SoCs, the number of lanes assigned to PCIe controller is not fixed, it is determined by the selected SerDes protocol in the RCW (Reset Configuration Word). The PCIe link training is completed automatically through the selected SerDes protocol - the link width set-up is updated by hardware after power on reset, so the num-lanes property is not needed for Layerscape PCIe. The current num-lanes property was added erroneously, which actually indicates the maximum lanes the PCIe controller can support up to, instead of the lanes assigned to the PCIe controller. The link width set by SerDes protocol will be overridden by the num-lanes property, hence the subsequent re-training will fail when the assigned lanes do not match the value in the num-lanes property. Remove the property to fix the issue. Signed-off-by: Hou Zhiqiang Signed-off-by: Lorenzo Pieralisi Reviewed-by: Andrew Murray --- arch/arm/boot/dts/ls1021a.dtsi | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm/boot/dts/ls1021a.dtsi b/arch/arm/boot/dts/ls1021a.dtsi index 464df4290ffc..2f6977ada447 100644 --- a/arch/arm/boot/dts/ls1021a.dtsi +++ b/arch/arm/boot/dts/ls1021a.dtsi @@ -874,7 +874,6 @@ #address-cells = <3>; #size-cells = <2>; device_type = "pci"; - num-lanes = <4>; num-viewport = <6>; bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x40 0x00010000 0x0 0x00010000 /* downstream I/O */ @@ -899,7 +898,6 @@ #address-cells = <3>; #size-cells = <2>; device_type = "pci"; - num-lanes = <4>; num-viewport = <6>; bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x48 0x00010000 0x0 0x00010000 /* downstream I/O */ From 4035ff36a6e0a07a1f0b0609520e17bd69a74c8b Mon Sep 17 00:00:00 2001 From: Hou Zhiqiang Date: Tue, 20 Aug 2019 07:29:01 +0000 Subject: [PATCH 177/721] arm64: dts: fsl: Remove num-lanes property from PCIe nodes Remove the num-lanes property to avoid the driver setting the link width. On FSL Layerscape SoCs, the number of lanes assigned to PCIe controller is not fixed, it is determined by the selected SerDes protocol in the RCW (Reset Configuration Word). The PCIe link training is completed automatically through the selected SerDes protocol - the link width set-up is updated by hardware after power on reset, so the num-lanes property is not needed for Layerscape PCIe. The current num-lanes property was added erroneously, which actually indicates the maximum lanes the PCIe controller can support up to, instead of the lanes assigned to the PCIe controller. The link width set by SerDes protocol will be overridden by the num-lanes property, hence the subsequent re-training will fail when the assigned lanes do not match the value in the num-lanes property. Remove the property to fix the issue Signed-off-by: Hou Zhiqiang Signed-off-by: Lorenzo Pieralisi Reviewed-by: Andrew Murray --- arch/arm64/boot/dts/freescale/fsl-ls1012a.dtsi | 1 - arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi | 3 --- arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi | 6 ------ arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi | 3 --- arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi | 4 ---- 5 files changed, 17 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1012a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1012a.dtsi index ec6257a5b251..119c597ca867 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1012a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1012a.dtsi @@ -486,7 +486,6 @@ #address-cells = <3>; #size-cells = <2>; device_type = "pci"; - num-lanes = <4>; num-viewport = <2>; bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x40 0x00010000 0x0 0x00010000 /* downstream I/O */ diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi index 71d9ed9ff985..c084c7a4b6a6 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi @@ -677,7 +677,6 @@ #size-cells = <2>; device_type = "pci"; dma-coherent; - num-lanes = <4>; num-viewport = <6>; bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x40 0x00010000 0x0 0x00010000 /* downstream I/O */ @@ -704,7 +703,6 @@ #size-cells = <2>; device_type = "pci"; dma-coherent; - num-lanes = <2>; num-viewport = <6>; bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x48 0x00010000 0x0 0x00010000 /* downstream I/O */ @@ -731,7 +729,6 @@ #size-cells = <2>; device_type = "pci"; dma-coherent; - num-lanes = <2>; num-viewport = <6>; bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x50 0x00010000 0x0 0x00010000 /* downstream I/O */ diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi index b0ef08b090dd..d4c1da3d4bde 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi @@ -649,7 +649,6 @@ #size-cells = <2>; device_type = "pci"; dma-coherent; - num-lanes = <4>; num-viewport = <8>; bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x40 0x00010000 0x0 0x00010000 /* downstream I/O */ @@ -671,7 +670,6 @@ reg-names = "regs", "addr_space"; num-ib-windows = <6>; num-ob-windows = <8>; - num-lanes = <2>; status = "disabled"; }; @@ -687,7 +685,6 @@ #size-cells = <2>; device_type = "pci"; dma-coherent; - num-lanes = <2>; num-viewport = <8>; bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x48 0x00010000 0x0 0x00010000 /* downstream I/O */ @@ -709,7 +706,6 @@ reg-names = "regs", "addr_space"; num-ib-windows = <6>; num-ob-windows = <8>; - num-lanes = <2>; status = "disabled"; }; @@ -725,7 +721,6 @@ #size-cells = <2>; device_type = "pci"; dma-coherent; - num-lanes = <2>; num-viewport = <8>; bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x50 0x00010000 0x0 0x00010000 /* downstream I/O */ @@ -747,7 +742,6 @@ reg-names = "regs", "addr_space"; num-ib-windows = <6>; num-ob-windows = <8>; - num-lanes = <2>; status = "disabled"; }; diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi index dacd8cf03a7f..ce48a2323337 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi @@ -452,7 +452,6 @@ #size-cells = <2>; device_type = "pci"; dma-coherent; - num-lanes = <4>; num-viewport = <256>; bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x20 0x00010000 0x0 0x00010000 /* downstream I/O */ @@ -478,7 +477,6 @@ #size-cells = <2>; device_type = "pci"; dma-coherent; - num-lanes = <4>; num-viewport = <6>; bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x28 0x00010000 0x0 0x00010000 /* downstream I/O */ @@ -504,7 +502,6 @@ #size-cells = <2>; device_type = "pci"; dma-coherent; - num-lanes = <8>; num-viewport = <6>; bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x30 0x00010000 0x0 0x00010000 /* downstream I/O */ diff --git a/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi index 3ace91945b72..d4993a2b404f 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi @@ -634,7 +634,6 @@ #size-cells = <2>; device_type = "pci"; dma-coherent; - num-lanes = <4>; num-viewport = <6>; bus-range = <0x0 0xff>; msi-parent = <&its>; @@ -656,7 +655,6 @@ #size-cells = <2>; device_type = "pci"; dma-coherent; - num-lanes = <4>; num-viewport = <6>; bus-range = <0x0 0xff>; msi-parent = <&its>; @@ -678,7 +676,6 @@ #size-cells = <2>; device_type = "pci"; dma-coherent; - num-lanes = <8>; num-viewport = <256>; bus-range = <0x0 0xff>; msi-parent = <&its>; @@ -700,7 +697,6 @@ #size-cells = <2>; device_type = "pci"; dma-coherent; - num-lanes = <4>; num-viewport = <6>; bus-range = <0x0 0xff>; msi-parent = <&its>; From 992ff2cc9ec6e30ccc64c341b052e8ccb370db90 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Mon, 12 Aug 2019 18:08:04 +0200 Subject: [PATCH 178/721] HID: logitech-dj: add support of the G700(s) receiver Both the G700 and the G700s are sharing the same receiver. Include support for this receiver in hid-logitech-dj so that userspace can differentiate both. Signed-off-by: Benjamin Tissoires Reviewed-by: Hans de Goede --- drivers/hid/hid-logitech-dj.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/hid/hid-logitech-dj.c b/drivers/hid/hid-logitech-dj.c index c547cba05fbb..d6250b0cb9f8 100644 --- a/drivers/hid/hid-logitech-dj.c +++ b/drivers/hid/hid-logitech-dj.c @@ -959,6 +959,7 @@ static void logi_hidpp_recv_queue_notif(struct hid_device *hdev, break; case 0x07: device_type = "eQUAD step 4 Gaming"; + logi_hidpp_dev_conn_notif_equad(hdev, hidpp_report, &workitem); break; case 0x08: device_type = "eQUAD step 4 for gamepads"; @@ -1832,6 +1833,10 @@ static const struct hid_device_id logi_dj_receivers[] = { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_NANO_RECEIVER_2), .driver_data = recvr_type_hidpp}, + { /* Logitech G700(s) receiver (0xc531) */ + HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, + 0xc531), + .driver_data = recvr_type_gaming_hidpp}, { /* Logitech lightspeed receiver (0xc539) */ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_NANO_RECEIVER_LIGHTSPEED), From 5722f3386eb9847131573513190796533e8f4999 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Filipe=20La=C3=ADns?= Date: Tue, 30 Jul 2019 13:24:57 +0100 Subject: [PATCH 179/721] hid-logitech-dj: add the new Lightspeed receiver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patchs adds the new Lightspeed receiver. Currently it seems to only be used in the G305. Signed-off-by: Filipe Laíns [bentiss: rebased on top of master] Signed-off-by: Benjamin Tissoires --- drivers/hid/hid-ids.h | 3 ++- drivers/hid/hid-logitech-dj.c | 13 +++++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 0a00be19f7a0..214b698927c2 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -769,7 +769,8 @@ #define USB_DEVICE_ID_LOGITECH_NANO_RECEIVER 0xc52f #define USB_DEVICE_ID_LOGITECH_UNIFYING_RECEIVER_2 0xc532 #define USB_DEVICE_ID_LOGITECH_NANO_RECEIVER_2 0xc534 -#define USB_DEVICE_ID_LOGITECH_NANO_RECEIVER_LIGHTSPEED 0xc539 +#define USB_DEVICE_ID_LOGITECH_NANO_RECEIVER_LIGHTSPEED_1 0xc539 +#define USB_DEVICE_ID_LOGITECH_NANO_RECEIVER_LIGHTSPEED_1_1 0xc53f #define USB_DEVICE_ID_LOGITECH_NANO_RECEIVER_POWERPLAY 0xc53a #define USB_DEVICE_ID_SPACETRAVELLER 0xc623 #define USB_DEVICE_ID_SPACENAVIGATOR 0xc626 diff --git a/drivers/hid/hid-logitech-dj.c b/drivers/hid/hid-logitech-dj.c index d6250b0cb9f8..0c9fb870636f 100644 --- a/drivers/hid/hid-logitech-dj.c +++ b/drivers/hid/hid-logitech-dj.c @@ -969,7 +969,12 @@ static void logi_hidpp_recv_queue_notif(struct hid_device *hdev, logi_hidpp_dev_conn_notif_equad(hdev, hidpp_report, &workitem); break; case 0x0c: - device_type = "eQUAD Lightspeed"; + device_type = "eQUAD Lightspeed 1"; + logi_hidpp_dev_conn_notif_equad(hdev, hidpp_report, &workitem); + workitem.reports_supported |= STD_KEYBOARD; + break; + case 0x0d: + device_type = "eQUAD Lightspeed 1_1"; logi_hidpp_dev_conn_notif_equad(hdev, hidpp_report, &workitem); workitem.reports_supported |= STD_KEYBOARD; break; @@ -1839,7 +1844,11 @@ static const struct hid_device_id logi_dj_receivers[] = { .driver_data = recvr_type_gaming_hidpp}, { /* Logitech lightspeed receiver (0xc539) */ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, - USB_DEVICE_ID_LOGITECH_NANO_RECEIVER_LIGHTSPEED), + USB_DEVICE_ID_LOGITECH_NANO_RECEIVER_LIGHTSPEED_1), + .driver_data = recvr_type_gaming_hidpp}, + { /* Logitech lightspeed receiver (0xc53f) */ + HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, + USB_DEVICE_ID_LOGITECH_NANO_RECEIVER_LIGHTSPEED_1_1), .driver_data = recvr_type_gaming_hidpp}, { /* Logitech 27 MHz HID++ 1.0 receiver (0xc513) */ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_MX3000_RECEIVER), From 8ccff2843fb4e6d9d26e5ae9ffe9840b38b92638 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 25 Aug 2019 17:35:42 +0200 Subject: [PATCH 180/721] HID: logitech-dj: Fix crash when initial logi_dj_recv_query_paired_devices fails Before this commit dj_probe would exit with an error if the initial logi_dj_recv_query_paired_devices fails. The initial call may fail when the receiver is connected through a kvm and the focus is away. When the call fails this causes 2 problems: 1) dj_probe calls logi_dj_recv_query_paired_devices after calling hid_device_io_start() so a HID report may have been received in between and our delayedwork_callback may be running. It seems that the initial logi_dj_recv_query_paired_devices failure happening with some KVMs triggers this exact scenario, causing the work-queue to run on free-ed memory, leading to: BUG: unable to handle page fault for address: 0000000000001e88 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] SMP PTI CPU: 3 PID: 257 Comm: kworker/3:3 Tainted: G OE 5.3.0-rc5+ #100 Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./B150M Pro4S/D3, BIOS P7.10 12/06/2016 Workqueue: events 0xffffffffc02ba200 RIP: 0010:0xffffffffc02ba1bd Code: e8 e8 13 00 d8 48 89 c5 48 85 c0 74 4c 48 8b 7b 10 48 89 ea b9 07 00 00 00 41 b9 09 00 00 00 41 b8 01 00 00 00 be 10 00 00 00 <48> 8b 87 88 1e 00 00 48 8b 40 40 e8 b3 6b b4 d8 48 89 ef 41 89 c4 RSP: 0018:ffffb760c046bdb8 EFLAGS: 00010286 RAX: ffff935038ea4550 RBX: ffff935046778000 RCX: 0000000000000007 RDX: ffff935038ea4550 RSI: 0000000000000010 RDI: 0000000000000000 RBP: ffff935038ea4550 R08: 0000000000000001 R09: 0000000000000009 R10: 000000000000e011 R11: 0000000000000001 R12: ffff9350467780e8 R13: ffff935046778000 R14: 0000000000000000 R15: ffff935046778070 FS: 0000000000000000(0000) GS:ffff935054e00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000001e88 CR3: 000000075a612002 CR4: 00000000003606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: 0xffffffffc02ba2f7 ? process_one_work+0x1b1/0x560 process_one_work+0x234/0x560 worker_thread+0x50/0x3b0 kthread+0x10a/0x140 ? process_one_work+0x560/0x560 ? kthread_park+0x80/0x80 ret_from_fork+0x3a/0x50 Modules linked in: vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) bnep vfat fat btusb btrtl btbcm btintel bluetooth intel_rapl_msr ecdh_generic rfkill ecc snd_usb_audio snd_usbmidi_lib intel_rapl_common snd_rawmidi mc x86_pkg_temp_thermal intel_powerclamp coretemp iTCO_wdt iTCO_vendor_support mei_wdt mei_hdcp ppdev kvm_intel kvm irqbypass crct10dif_pclmul crc32_generic crc32_pclmul snd_hda_codec_hdmi snd_hda_codec_realtek snd_hda_codec_generic ledtrig_audio ghash_clmulni_intel intel_cstate snd_hda_intel snd_hda_codec intel_uncore snd_hda_core snd_hwdep intel_rapl_perf snd_seq snd_seq_device snd_pcm snd_timer intel_wmi_thunderbolt snd e1000e soundcore mxm_wmi i2c_i801 bfq mei_me mei intel_pch_thermal parport_pc parport acpi_pad binfmt_misc hid_lg_g15(E) hid_logitech_dj(E) i915 crc32c_intel i2c_algo_bit drm_kms_helper nvme nvme_core drm wmi video uas usb_storage i2c_dev CR2: 0000000000001e88 ---[ end trace 1d3f8afdcfcbd842 ]--- 2) Even if we were to fix 1. by making sure the work is stopped before failing probe, failing probe is the wrong thing to do, we have logi_dj_recv_queue_unknown_work to deal with the initial logi_dj_recv_query_paired_devices failure. Rather then error-ing out of the probe, causing the receiver to not work at all we should rely on this, so that the attached devices will get properly enumerated once the KVM focus is switched back. Cc: stable@vger.kernel.org Fixes: 74808f9115ce ("HID: logitech-dj: add support for non unifying receivers") Signed-off-by: Hans de Goede Signed-off-by: Benjamin Tissoires --- drivers/hid/hid-logitech-dj.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/hid/hid-logitech-dj.c b/drivers/hid/hid-logitech-dj.c index cc47f948c1d0..7badbaa18878 100644 --- a/drivers/hid/hid-logitech-dj.c +++ b/drivers/hid/hid-logitech-dj.c @@ -1734,14 +1734,14 @@ static int logi_dj_probe(struct hid_device *hdev, if (retval < 0) { hid_err(hdev, "%s: logi_dj_recv_query_paired_devices error:%d\n", __func__, retval); - goto logi_dj_recv_query_paired_devices_failed; + /* + * This can happen with a KVM, let the probe succeed, + * logi_dj_recv_queue_unknown_work will retry later. + */ } } - return retval; - -logi_dj_recv_query_paired_devices_failed: - hid_hw_close(hdev); + return 0; llopen_failed: switch_to_dj_mode_fail: From ee2f412ece32ab685921408ab1242d097557b57c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 26 Aug 2019 13:12:46 -0400 Subject: [PATCH 181/721] xprtrdma: Recycle MRs after disconnect The optimization done in "xprtrdma: Simplify rpcrdma_mr_pop" was a bit too optimistic. MRs left over after a reconnect still need to be recycled, not added back to the free list, since they could be in flight or actually fully registered. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/frwr_ops.c | 35 +++++++++++++++++++++++++-------- net/sunrpc/xprtrdma/rpc_rdma.c | 2 +- net/sunrpc/xprtrdma/xprt_rdma.h | 1 + 3 files changed, 29 insertions(+), 9 deletions(-) diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 368cdf3edfc9..30065a28628c 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -88,15 +88,8 @@ void frwr_release_mr(struct rpcrdma_mr *mr) kfree(mr); } -/* MRs are dynamically allocated, so simply clean up and release the MR. - * A replacement MR will subsequently be allocated on demand. - */ -static void -frwr_mr_recycle_worker(struct work_struct *work) +static void frwr_mr_recycle(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) { - struct rpcrdma_mr *mr = container_of(work, struct rpcrdma_mr, mr_recycle); - struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - trace_xprtrdma_mr_recycle(mr); if (mr->mr_dir != DMA_NONE) { @@ -114,6 +107,32 @@ frwr_mr_recycle_worker(struct work_struct *work) frwr_release_mr(mr); } +/* MRs are dynamically allocated, so simply clean up and release the MR. + * A replacement MR will subsequently be allocated on demand. + */ +static void +frwr_mr_recycle_worker(struct work_struct *work) +{ + struct rpcrdma_mr *mr = container_of(work, struct rpcrdma_mr, + mr_recycle); + + frwr_mr_recycle(mr->mr_xprt, mr); +} + +/* frwr_recycle - Discard MRs + * @req: request to reset + * + * Used after a reconnect. These MRs could be in flight, we can't + * tell. Safe thing to do is release them. + */ +void frwr_recycle(struct rpcrdma_req *req) +{ + struct rpcrdma_mr *mr; + + while ((mr = rpcrdma_mr_pop(&req->rl_registered))) + frwr_mr_recycle(mr->mr_xprt, mr); +} + /* frwr_reset - Place MRs back on the free list * @req: request to reset * diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 67e1684aee6d..19dd29a5c60d 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -867,7 +867,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) * chunks. Very likely the connection has been replaced, * so these registrations are invalid and unusable. */ - frwr_reset(req); + frwr_recycle(req); /* This implementation supports the following combinations * of chunk lists in one RPC-over-RDMA Call message: diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index bd1befa83d24..65e6b0eb862e 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -542,6 +542,7 @@ rpcrdma_data_dir(bool writing) /* Memory registration calls xprtrdma/frwr_ops.c */ bool frwr_is_supported(struct ib_device *device); +void frwr_recycle(struct rpcrdma_req *req); void frwr_reset(struct rpcrdma_req *req); int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep); int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr); From f9e1afe0fa729337309fa44921da998d2e6e6198 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 26 Aug 2019 13:12:51 -0400 Subject: [PATCH 182/721] xprtrdma: Clear xprt->reestablish_timeout on close Ensure that the re-establishment delay does not grow exponentially on each good reconnect. This probably should have been part of commit 675dd90ad093 ("xprtrdma: Modernize ops->connect"). Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 8 ++++++-- net/sunrpc/xprtrdma/transport.c | 3 +-- net/sunrpc/xprtrdma/verbs.c | 2 ++ 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 19dd29a5c60d..b86b5fd62d9f 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -1261,8 +1261,6 @@ void rpcrdma_complete_rqst(struct rpcrdma_rep *rep) struct rpc_rqst *rqst = rep->rr_rqst; int status; - xprt->reestablish_timeout = 0; - switch (rep->rr_proc) { case rdma_msg: status = rpcrdma_decode_msg(r_xprt, rep, rqst); @@ -1321,6 +1319,12 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) u32 credits; __be32 *p; + /* Any data means we had a useful conversation, so + * then we don't need to delay the next reconnect. + */ + if (xprt->reestablish_timeout) + xprt->reestablish_timeout = 0; + /* Fixed transport header fields */ xdr_init_decode(&rep->rr_stream, &rep->rr_hdrbuf, rep->rr_hdrbuf.head[0].iov_base, NULL); diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 993b96ff6760..160558b4135e 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -423,8 +423,6 @@ void xprt_rdma_close(struct rpc_xprt *xprt) if (ep->rep_connected == -ENODEV) return; - if (ep->rep_connected > 0) - xprt->reestablish_timeout = 0; rpcrdma_ep_disconnect(ep, ia); /* Prepare @xprt for the next connection by reinitializing @@ -434,6 +432,7 @@ void xprt_rdma_close(struct rpc_xprt *xprt) xprt->cwnd = RPC_CWNDSHIFT; out: + xprt->reestablish_timeout = 0; ++xprt->connect_cookie; xprt_disconnect_done(xprt); } diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index ac2abf4578b9..1dadc9ef504f 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -731,6 +731,8 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) if (rc) goto out; + if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO) + xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO; wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0); if (ep->rep_connected <= 0) { if (ep->rep_connected == -EAGAIN) From 98ef77d1aaa7a2f4e1b2a721faa084222021fda7 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 26 Aug 2019 13:12:57 -0400 Subject: [PATCH 183/721] xprtrdma: Send Queue size grows after a reconnect Eli Dorfman reports that after a series of idle disconnects, an RPC/RDMA transport becomes unusable (rdma_create_qp returns -ENOMEM). Problem was tracked down to increasing Send Queue size after each reconnect. The rdma_create_qp() API does not promise to leave its @qp_init_attr parameter unaltered. In fact, some drivers do modify one or more of its fields. Thus our calls to rdma_create_qp must use a fresh copy of ib_qp_init_attr each time. This fix is appropriate for kernels dating back to late 2007, though it will have to be adapted, as the connect code has changed over the years. Reported-by: Eli Dorfman Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 1dadc9ef504f..796945751e66 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -606,10 +606,10 @@ void rpcrdma_ep_destroy(struct rpcrdma_xprt *r_xprt) * Unlike a normal reconnection, a fresh PD and a new set * of MRs and buffers is needed. */ -static int -rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt, - struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) +static int rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt, + struct ib_qp_init_attr *qp_init_attr) { + struct rpcrdma_ia *ia = &r_xprt->rx_ia; int rc, err; trace_xprtrdma_reinsert(r_xprt); @@ -626,7 +626,7 @@ rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt, } rc = -ENETUNREACH; - err = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); + err = rdma_create_qp(ia->ri_id, ia->ri_pd, qp_init_attr); if (err) { pr_err("rpcrdma: rdma_create_qp returned %d\n", err); goto out3; @@ -643,16 +643,16 @@ rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt, return rc; } -static int -rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ep *ep, - struct rpcrdma_ia *ia) +static int rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, + struct ib_qp_init_attr *qp_init_attr) { + struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rdma_cm_id *id, *old; int err, rc; trace_xprtrdma_reconnect(r_xprt); - rpcrdma_ep_disconnect(ep, ia); + rpcrdma_ep_disconnect(&r_xprt->rx_ep, ia); rc = -EHOSTUNREACH; id = rpcrdma_create_id(r_xprt, ia); @@ -674,7 +674,7 @@ rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ep *ep, goto out_destroy; } - err = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr); + err = rdma_create_qp(id, ia->ri_pd, qp_init_attr); if (err) goto out_destroy; @@ -699,25 +699,27 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); struct rpc_xprt *xprt = &r_xprt->rx_xprt; + struct ib_qp_init_attr qp_init_attr; int rc; retry: + memcpy(&qp_init_attr, &ep->rep_attr, sizeof(qp_init_attr)); switch (ep->rep_connected) { case 0: dprintk("RPC: %s: connecting...\n", __func__); - rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); + rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &qp_init_attr); if (rc) { rc = -ENETUNREACH; goto out_noupdate; } break; case -ENODEV: - rc = rpcrdma_ep_recreate_xprt(r_xprt, ep, ia); + rc = rpcrdma_ep_recreate_xprt(r_xprt, &qp_init_attr); if (rc) goto out_noupdate; break; default: - rc = rpcrdma_ep_reconnect(r_xprt, ep, ia); + rc = rpcrdma_ep_reconnect(r_xprt, &qp_init_attr); if (rc) goto out; } From 116f21bb967fcef1fa360fe591a2947481788020 Mon Sep 17 00:00:00 2001 From: Ondrej Mosnacek Date: Wed, 14 Aug 2019 15:33:20 +0200 Subject: [PATCH 184/721] selinux: avoid atomic_t usage in sidtab As noted in Documentation/atomic_t.txt, if we don't need the RMW atomic operations, we should only use READ_ONCE()/WRITE_ONCE() + smp_rmb()/smp_wmb() where necessary (or the combined variants smp_load_acquire()/smp_store_release()). This patch converts the sidtab code to use regular u32 for the counter and reverse lookup cache and use the appropriate operations instead of atomic_get()/atomic_set(). Note that when reading/updating the reverse lookup cache we don't need memory barriers as it doesn't need to be consistent or accurate. We can now also replace some atomic ops with regular loads (when under spinlock) and stores (for conversion target fields that are always accessed under the master table's spinlock). We can now also bump SIDTAB_MAX to U32_MAX as we can use the full u32 range again. Suggested-by: Jann Horn Signed-off-by: Ondrej Mosnacek Reviewed-by: Jann Horn Signed-off-by: Paul Moore --- security/selinux/ss/sidtab.c | 48 ++++++++++++++++-------------------- security/selinux/ss/sidtab.h | 19 ++++++++++---- 2 files changed, 35 insertions(+), 32 deletions(-) diff --git a/security/selinux/ss/sidtab.c b/security/selinux/ss/sidtab.c index 1f0a6eaa2d6a..7d49994e8d5f 100644 --- a/security/selinux/ss/sidtab.c +++ b/security/selinux/ss/sidtab.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include "flask.h" #include "security.h" #include "sidtab.h" @@ -23,14 +23,14 @@ int sidtab_init(struct sidtab *s) memset(s->roots, 0, sizeof(s->roots)); + /* max count is SIDTAB_MAX so valid index is always < SIDTAB_MAX */ for (i = 0; i < SIDTAB_RCACHE_SIZE; i++) - atomic_set(&s->rcache[i], -1); + s->rcache[i] = SIDTAB_MAX; for (i = 0; i < SECINITSID_NUM; i++) s->isids[i].set = 0; - atomic_set(&s->count, 0); - + s->count = 0; s->convert = NULL; spin_lock_init(&s->lock); @@ -130,14 +130,12 @@ static struct context *sidtab_do_lookup(struct sidtab *s, u32 index, int alloc) static struct context *sidtab_lookup(struct sidtab *s, u32 index) { - u32 count = (u32)atomic_read(&s->count); + /* read entries only after reading count */ + u32 count = smp_load_acquire(&s->count); if (index >= count) return NULL; - /* read entries after reading count */ - smp_rmb(); - return sidtab_do_lookup(s, index, 0); } @@ -210,10 +208,10 @@ static int sidtab_find_context(union sidtab_entry_inner entry, static void sidtab_rcache_update(struct sidtab *s, u32 index, u32 pos) { while (pos > 0) { - atomic_set(&s->rcache[pos], atomic_read(&s->rcache[pos - 1])); + WRITE_ONCE(s->rcache[pos], READ_ONCE(s->rcache[pos - 1])); --pos; } - atomic_set(&s->rcache[0], (int)index); + WRITE_ONCE(s->rcache[0], index); } static void sidtab_rcache_push(struct sidtab *s, u32 index) @@ -227,14 +225,14 @@ static int sidtab_rcache_search(struct sidtab *s, struct context *context, u32 i; for (i = 0; i < SIDTAB_RCACHE_SIZE; i++) { - int v = atomic_read(&s->rcache[i]); + u32 v = READ_ONCE(s->rcache[i]); - if (v < 0) + if (v >= SIDTAB_MAX) continue; - if (context_cmp(sidtab_do_lookup(s, (u32)v, 0), context)) { - sidtab_rcache_update(s, (u32)v, i); - *index = (u32)v; + if (context_cmp(sidtab_do_lookup(s, v, 0), context)) { + sidtab_rcache_update(s, v, i); + *index = v; return 0; } } @@ -245,8 +243,7 @@ static int sidtab_reverse_lookup(struct sidtab *s, struct context *context, u32 *index) { unsigned long flags; - u32 count = (u32)atomic_read(&s->count); - u32 count_locked, level, pos; + u32 count, count_locked, level, pos; struct sidtab_convert_params *convert; struct context *dst, *dst_convert; int rc; @@ -255,11 +252,10 @@ static int sidtab_reverse_lookup(struct sidtab *s, struct context *context, if (rc == 0) return 0; + /* read entries only after reading count */ + count = smp_load_acquire(&s->count); level = sidtab_level_from_count(count); - /* read entries after reading count */ - smp_rmb(); - pos = 0; rc = sidtab_find_context(s->roots[level], &pos, count, level, context, index); @@ -272,7 +268,7 @@ static int sidtab_reverse_lookup(struct sidtab *s, struct context *context, spin_lock_irqsave(&s->lock, flags); convert = s->convert; - count_locked = (u32)atomic_read(&s->count); + count_locked = s->count; level = sidtab_level_from_count(count_locked); /* if count has changed before we acquired the lock, then catch up */ @@ -320,7 +316,7 @@ static int sidtab_reverse_lookup(struct sidtab *s, struct context *context, } /* at this point we know the insert won't fail */ - atomic_set(&convert->target->count, count + 1); + convert->target->count = count + 1; } if (context->len) @@ -331,9 +327,7 @@ static int sidtab_reverse_lookup(struct sidtab *s, struct context *context, *index = count; /* write entries before writing new count */ - smp_wmb(); - - atomic_set(&s->count, count + 1); + smp_store_release(&s->count, count + 1); rc = 0; out_unlock: @@ -423,7 +417,7 @@ int sidtab_convert(struct sidtab *s, struct sidtab_convert_params *params) return -EBUSY; } - count = (u32)atomic_read(&s->count); + count = s->count; level = sidtab_level_from_count(count); /* allocate last leaf in the new sidtab (to avoid race with @@ -436,7 +430,7 @@ int sidtab_convert(struct sidtab *s, struct sidtab_convert_params *params) } /* set count in case no new entries are added during conversion */ - atomic_set(¶ms->target->count, count); + params->target->count = count; /* enable live convert of new entries */ s->convert = params; diff --git a/security/selinux/ss/sidtab.h b/security/selinux/ss/sidtab.h index bbd5c0d1f3bd..1f4763141aa1 100644 --- a/security/selinux/ss/sidtab.h +++ b/security/selinux/ss/sidtab.h @@ -40,8 +40,8 @@ union sidtab_entry_inner { #define SIDTAB_LEAF_ENTRIES \ (SIDTAB_NODE_ALLOC_SIZE / sizeof(struct sidtab_entry_leaf)) -#define SIDTAB_MAX_BITS 31 /* limited to INT_MAX due to atomic_t range */ -#define SIDTAB_MAX (((u32)1 << SIDTAB_MAX_BITS) - 1) +#define SIDTAB_MAX_BITS 32 +#define SIDTAB_MAX U32_MAX /* ensure enough tree levels for SIDTAB_MAX entries */ #define SIDTAB_MAX_LEVEL \ DIV_ROUND_UP(SIDTAB_MAX_BITS - size_to_shift(SIDTAB_LEAF_ENTRIES), \ @@ -69,13 +69,22 @@ struct sidtab_convert_params { #define SIDTAB_RCACHE_SIZE 3 struct sidtab { + /* + * lock-free read access only for as many items as a prior read of + * 'count' + */ union sidtab_entry_inner roots[SIDTAB_MAX_LEVEL + 1]; - atomic_t count; + /* + * access atomically via {READ|WRITE}_ONCE(); only increment under + * spinlock + */ + u32 count; + /* access only under spinlock */ struct sidtab_convert_params *convert; spinlock_t lock; - /* reverse lookup cache */ - atomic_t rcache[SIDTAB_RCACHE_SIZE]; + /* reverse lookup cache - access atomically via {READ|WRITE}_ONCE() */ + u32 rcache[SIDTAB_RCACHE_SIZE]; /* index == SID - 1 (no entry for SECSID_NULL) */ struct sidtab_isid_entry isids[SECINITSID_NUM]; From 7ce2e76a0420801fb4b53b9e6850940e6b326433 Mon Sep 17 00:00:00 2001 From: Krzysztof Wilczynski Date: Tue, 27 Aug 2019 11:56:20 +0200 Subject: [PATCH 185/721] PCI: Move ASPM declarations to linux/pci.h Move ASPM definitions and function prototypes from include/linux/pci-aspm.h to include/linux/pci.h so users only need to include : PCIE_LINK_STATE_L0S PCIE_LINK_STATE_L1 PCIE_LINK_STATE_CLKPM pci_disable_link_state() pci_disable_link_state_locked() pcie_no_aspm() No functional changes intended. Link: https://lore.kernel.org/r/20190827095620.11213-1-kw@linux.com Signed-off-by: Krzysztof Wilczynski Signed-off-by: Bjorn Helgaas --- drivers/acpi/pci_root.c | 1 - drivers/char/xillybus/xillybus_pcie.c | 1 - drivers/net/ethernet/intel/e1000e/e1000.h | 1 - drivers/net/ethernet/jme.c | 1 - drivers/net/ethernet/realtek/r8169_main.c | 1 - drivers/net/wireless/ath/ath5k/pci.c | 1 - .../net/wireless/intel/iwlegacy/3945-mac.c | 1 - .../net/wireless/intel/iwlegacy/4965-mac.c | 1 - .../net/wireless/intel/iwlwifi/pcie/trans.c | 1 - drivers/pci/pci-acpi.c | 1 - drivers/pci/pcie/aspm.c | 1 - drivers/pci/quirks.c | 1 - drivers/scsi/aacraid/linit.c | 1 - drivers/scsi/hpsa.c | 1 - drivers/scsi/mpt3sas/mpt3sas_scsih.c | 1 - include/linux/pci-aspm.h | 36 ------------------- include/linux/pci.h | 18 ++++++++++ 17 files changed, 18 insertions(+), 51 deletions(-) delete mode 100644 include/linux/pci-aspm.h diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index 314a187ed572..d1e666ef3fcc 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/char/xillybus/xillybus_pcie.c b/drivers/char/xillybus/xillybus_pcie.c index 02c15952b103..18b0c392bc93 100644 --- a/drivers/char/xillybus/xillybus_pcie.c +++ b/drivers/char/xillybus/xillybus_pcie.c @@ -9,7 +9,6 @@ #include #include -#include #include #include "xillybus.h" diff --git a/drivers/net/ethernet/intel/e1000e/e1000.h b/drivers/net/ethernet/intel/e1000e/e1000.h index 34cd67951aec..6c51b1bad8c4 100644 --- a/drivers/net/ethernet/intel/e1000e/e1000.h +++ b/drivers/net/ethernet/intel/e1000e/e1000.h @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/jme.c b/drivers/net/ethernet/jme.c index 0b668357db4d..57e8aea98969 100644 --- a/drivers/net/ethernet/jme.c +++ b/drivers/net/ethernet/jme.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 0637c6752a78..a6d8e7f5fde7 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include diff --git a/drivers/net/wireless/ath/ath5k/pci.c b/drivers/net/wireless/ath/ath5k/pci.c index c6156cc38940..d5ee32ce9eb3 100644 --- a/drivers/net/wireless/ath/ath5k/pci.c +++ b/drivers/net/wireless/ath/ath5k/pci.c @@ -18,7 +18,6 @@ #include #include -#include #include #include #include "../ath.h" diff --git a/drivers/net/wireless/intel/iwlegacy/3945-mac.c b/drivers/net/wireless/intel/iwlegacy/3945-mac.c index b82da75a9ae3..4fbcc7fba3cc 100644 --- a/drivers/net/wireless/intel/iwlegacy/3945-mac.c +++ b/drivers/net/wireless/intel/iwlegacy/3945-mac.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/net/wireless/intel/iwlegacy/4965-mac.c b/drivers/net/wireless/intel/iwlegacy/4965-mac.c index fa2c02881939..ffb705b18fb1 100644 --- a/drivers/net/wireless/intel/iwlegacy/4965-mac.c +++ b/drivers/net/wireless/intel/iwlegacy/4965-mac.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c index f5df5b370d78..4c308e33ee21 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c @@ -62,7 +62,6 @@ * *****************************************************************************/ #include -#include #include #include #include diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index 45049f558860..dd520fe927db 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c index e44af7f4d37f..ad6259ec51a6 100644 --- a/drivers/pci/pcie/aspm.c +++ b/drivers/pci/pcie/aspm.c @@ -18,7 +18,6 @@ #include #include #include -#include #include "../pci.h" #ifdef MODULE_PARAM_PREFIX diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 208aacf39329..9ac1a7564c9e 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/scsi/aacraid/linit.c b/drivers/scsi/aacraid/linit.c index 644f7f5c61a2..4a858789e6c5 100644 --- a/drivers/scsi/aacraid/linit.c +++ b/drivers/scsi/aacraid/linit.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c index 43a6b5350775..148663373f7d 100644 --- a/drivers/scsi/hpsa.c +++ b/drivers/scsi/hpsa.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c index 717ba0845a2a..27fdbc165446 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c +++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c @@ -51,7 +51,6 @@ #include #include #include -#include #include #include #include diff --git a/include/linux/pci-aspm.h b/include/linux/pci-aspm.h deleted file mode 100644 index 67064145d76e..000000000000 --- a/include/linux/pci-aspm.h +++ /dev/null @@ -1,36 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * aspm.h - * - * PCI Express ASPM defines and function prototypes - * - * Copyright (C) 2007 Intel Corp. - * Zhang Yanmin (yanmin.zhang@intel.com) - * Shaohua Li (shaohua.li@intel.com) - * - * For more information, please consult the following manuals (look at - * http://www.pcisig.com/ for how to get them): - * - * PCI Express Specification - */ - -#ifndef LINUX_ASPM_H -#define LINUX_ASPM_H - -#include - -#define PCIE_LINK_STATE_L0S 1 -#define PCIE_LINK_STATE_L1 2 -#define PCIE_LINK_STATE_CLKPM 4 - -#ifdef CONFIG_PCIEASPM -int pci_disable_link_state(struct pci_dev *pdev, int state); -int pci_disable_link_state_locked(struct pci_dev *pdev, int state); -void pcie_no_aspm(void); -#else -static inline int pci_disable_link_state(struct pci_dev *pdev, int state) -{ return 0; } -static inline void pcie_no_aspm(void) { } -#endif - -#endif /* LINUX_ASPM_H */ diff --git a/include/linux/pci.h b/include/linux/pci.h index 9e700d9f9f28..b4e479921e07 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -6,12 +6,18 @@ * Copyright 1994, Drew Eckhardt * Copyright 1997--1999 Martin Mares * + * PCI Express ASPM defines and function prototypes + * Copyright (c) 2007 Intel Corp. + * Zhang Yanmin (yanmin.zhang@intel.com) + * Shaohua Li (shaohua.li@intel.com) + * * For more information, please consult the following manuals (look at * http://www.pcisig.com/ for how to get them): * * PCI BIOS Specification * PCI Local Bus Specification * PCI to PCI Bridge Specification + * PCI Express Specification * PCI System Design Guide */ #ifndef LINUX_PCI_H @@ -1565,9 +1571,21 @@ extern bool pcie_ports_native; #define pcie_ports_native false #endif +#define PCIE_LINK_STATE_L0S 1 +#define PCIE_LINK_STATE_L1 2 +#define PCIE_LINK_STATE_CLKPM 4 + #ifdef CONFIG_PCIEASPM +int pci_disable_link_state(struct pci_dev *pdev, int state); +int pci_disable_link_state_locked(struct pci_dev *pdev, int state); +void pcie_no_aspm(void); bool pcie_aspm_support_enabled(void); #else +static inline int pci_disable_link_state(struct pci_dev *pdev, int state) +{ return 0; } +static inline int pci_disable_link_state_locked(struct pci_dev *pdev, int state) +{ return 0; } +static inline void pcie_no_aspm(void) { } static inline bool pcie_aspm_support_enabled(void) { return false; } #endif From e2797ad31fb40f4ff59ebc4314d6f000d713bad9 Mon Sep 17 00:00:00 2001 From: Krzysztof Wilczynski Date: Tue, 27 Aug 2019 11:49:49 +0200 Subject: [PATCH 186/721] PCI/ACPI: Rename _HPX structs from hpp_* to hpx_* The names of the hpp_type0, hpp_type1 and hpp_type2 structs suggest that they're related to _HPP, when in fact they're related to _HPX. The struct hpp_type0 denotes an _HPX Type 0 setting record that supersedes the _HPP setting record, and it has been used interchangeably for _HPP as per the ACPI specification (see version 6.3, section 6.2.9.1) which states that it should be applied to PCI, PCI-X and PCI Express devices, with settings being ignored if they are not applicable. Rename them to hpx_type0, hpx_type1 and hpx_type2 to reflect their relation to _HPX rather than _HPP. Link: https://lore.kernel.org/r/20190827094951.10613-2-kw@linux.com Signed-off-by: Krzysztof Wilczynski Signed-off-by: Bjorn Helgaas --- drivers/pci/pci-acpi.c | 28 +++++++-------- drivers/pci/probe.c | 72 ++++++++++++++++++------------------- include/linux/pci_hotplug.h | 30 ++++++++-------- 3 files changed, 64 insertions(+), 66 deletions(-) diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index 45049f558860..02addc47edae 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -119,7 +119,7 @@ phys_addr_t acpi_pci_root_get_mcfg_addr(acpi_handle handle) } static acpi_status decode_type0_hpx_record(union acpi_object *record, - struct hpp_type0 *hpx0) + struct hpx_type0 *hpx0) { int i; union acpi_object *fields = record->package.elements; @@ -147,7 +147,7 @@ static acpi_status decode_type0_hpx_record(union acpi_object *record, } static acpi_status decode_type1_hpx_record(union acpi_object *record, - struct hpp_type1 *hpx1) + struct hpx_type1 *hpx1) { int i; union acpi_object *fields = record->package.elements; @@ -174,7 +174,7 @@ static acpi_status decode_type1_hpx_record(union acpi_object *record, } static acpi_status decode_type2_hpx_record(union acpi_object *record, - struct hpp_type2 *hpx2) + struct hpx_type2 *hpx2) { int i; union acpi_object *fields = record->package.elements; @@ -277,9 +277,9 @@ static acpi_status acpi_run_hpx(struct pci_dev *dev, acpi_handle handle, acpi_status status; struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL}; union acpi_object *package, *record, *fields; - struct hpp_type0 hpx0; - struct hpp_type1 hpx1; - struct hpp_type2 hpx2; + struct hpx_type0 hpx0; + struct hpx_type1 hpx1; + struct hpx_type2 hpx2; u32 type; int i; @@ -353,10 +353,10 @@ static acpi_status acpi_run_hpp(struct pci_dev *dev, acpi_handle handle, acpi_status status; struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; union acpi_object *package, *fields; - struct hpp_type0 hpp0; + struct hpx_type0 hpx0; int i; - memset(&hpp0, 0, sizeof(hpp0)); + memset(&hpx0, 0, sizeof(hpx0)); status = acpi_evaluate_object(handle, "_HPP", NULL, &buffer); if (ACPI_FAILURE(status)) @@ -377,13 +377,13 @@ static acpi_status acpi_run_hpp(struct pci_dev *dev, acpi_handle handle, } } - hpp0.revision = 1; - hpp0.cache_line_size = fields[0].integer.value; - hpp0.latency_timer = fields[1].integer.value; - hpp0.enable_serr = fields[2].integer.value; - hpp0.enable_perr = fields[3].integer.value; + hpx0.revision = 1; + hpx0.cache_line_size = fields[0].integer.value; + hpx0.latency_timer = fields[1].integer.value; + hpx0.enable_serr = fields[2].integer.value; + hpx0.enable_perr = fields[3].integer.value; - hp_ops->program_type0(dev, &hpp0); + hp_ops->program_type0(dev, &hpx0); exit: kfree(buffer.pointer); diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index a3c7338fad86..120c70b5003b 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1920,7 +1920,7 @@ static void pci_configure_mps(struct pci_dev *dev) p_mps, mps, mpss); } -static struct hpp_type0 pci_default_type0 = { +static struct hpx_type0 pci_default_type0 = { .revision = 1, .cache_line_size = 8, .latency_timer = 0x40, @@ -1928,44 +1928,44 @@ static struct hpp_type0 pci_default_type0 = { .enable_perr = 0, }; -static void program_hpp_type0(struct pci_dev *dev, struct hpp_type0 *hpp) +static void program_hpx_type0(struct pci_dev *dev, struct hpx_type0 *hpx) { u16 pci_cmd, pci_bctl; - if (!hpp) - hpp = &pci_default_type0; + if (!hpx) + hpx = &pci_default_type0; - if (hpp->revision > 1) { + if (hpx->revision > 1) { pci_warn(dev, "PCI settings rev %d not supported; using defaults\n", - hpp->revision); - hpp = &pci_default_type0; + hpx->revision); + hpx = &pci_default_type0; } - pci_write_config_byte(dev, PCI_CACHE_LINE_SIZE, hpp->cache_line_size); - pci_write_config_byte(dev, PCI_LATENCY_TIMER, hpp->latency_timer); + pci_write_config_byte(dev, PCI_CACHE_LINE_SIZE, hpx->cache_line_size); + pci_write_config_byte(dev, PCI_LATENCY_TIMER, hpx->latency_timer); pci_read_config_word(dev, PCI_COMMAND, &pci_cmd); - if (hpp->enable_serr) + if (hpx->enable_serr) pci_cmd |= PCI_COMMAND_SERR; - if (hpp->enable_perr) + if (hpx->enable_perr) pci_cmd |= PCI_COMMAND_PARITY; pci_write_config_word(dev, PCI_COMMAND, pci_cmd); /* Program bridge control value */ if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI) { pci_write_config_byte(dev, PCI_SEC_LATENCY_TIMER, - hpp->latency_timer); + hpx->latency_timer); pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &pci_bctl); - if (hpp->enable_perr) + if (hpx->enable_perr) pci_bctl |= PCI_BRIDGE_CTL_PARITY; pci_write_config_word(dev, PCI_BRIDGE_CONTROL, pci_bctl); } } -static void program_hpp_type1(struct pci_dev *dev, struct hpp_type1 *hpp) +static void program_hpx_type1(struct pci_dev *dev, struct hpx_type1 *hpx) { int pos; - if (!hpp) + if (!hpx) return; pos = pci_find_capability(dev, PCI_CAP_ID_PCIX); @@ -1990,20 +1990,20 @@ static bool pcie_root_rcb_set(struct pci_dev *dev) return false; } -static void program_hpp_type2(struct pci_dev *dev, struct hpp_type2 *hpp) +static void program_hpx_type2(struct pci_dev *dev, struct hpx_type2 *hpx) { int pos; u32 reg32; - if (!hpp) + if (!hpx) return; if (!pci_is_pcie(dev)) return; - if (hpp->revision > 1) { + if (hpx->revision > 1) { pci_warn(dev, "PCIe settings rev %d not supported\n", - hpp->revision); + hpx->revision); return; } @@ -2012,14 +2012,14 @@ static void program_hpp_type2(struct pci_dev *dev, struct hpp_type2 *hpp) * those to make sure they're consistent with the rest of the * platform. */ - hpp->pci_exp_devctl_and |= PCI_EXP_DEVCTL_PAYLOAD | + hpx->pci_exp_devctl_and |= PCI_EXP_DEVCTL_PAYLOAD | PCI_EXP_DEVCTL_READRQ; - hpp->pci_exp_devctl_or &= ~(PCI_EXP_DEVCTL_PAYLOAD | + hpx->pci_exp_devctl_or &= ~(PCI_EXP_DEVCTL_PAYLOAD | PCI_EXP_DEVCTL_READRQ); /* Initialize Device Control Register */ pcie_capability_clear_and_set_word(dev, PCI_EXP_DEVCTL, - ~hpp->pci_exp_devctl_and, hpp->pci_exp_devctl_or); + ~hpx->pci_exp_devctl_and, hpx->pci_exp_devctl_or); /* Initialize Link Control Register */ if (pcie_cap_has_lnkctl(dev)) { @@ -2028,13 +2028,13 @@ static void program_hpp_type2(struct pci_dev *dev, struct hpp_type2 *hpp) * If the Root Port supports Read Completion Boundary of * 128, set RCB to 128. Otherwise, clear it. */ - hpp->pci_exp_lnkctl_and |= PCI_EXP_LNKCTL_RCB; - hpp->pci_exp_lnkctl_or &= ~PCI_EXP_LNKCTL_RCB; + hpx->pci_exp_lnkctl_and |= PCI_EXP_LNKCTL_RCB; + hpx->pci_exp_lnkctl_or &= ~PCI_EXP_LNKCTL_RCB; if (pcie_root_rcb_set(dev)) - hpp->pci_exp_lnkctl_or |= PCI_EXP_LNKCTL_RCB; + hpx->pci_exp_lnkctl_or |= PCI_EXP_LNKCTL_RCB; pcie_capability_clear_and_set_word(dev, PCI_EXP_LNKCTL, - ~hpp->pci_exp_lnkctl_and, hpp->pci_exp_lnkctl_or); + ~hpx->pci_exp_lnkctl_and, hpx->pci_exp_lnkctl_or); } /* Find Advanced Error Reporting Enhanced Capability */ @@ -2044,22 +2044,22 @@ static void program_hpp_type2(struct pci_dev *dev, struct hpp_type2 *hpp) /* Initialize Uncorrectable Error Mask Register */ pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_MASK, ®32); - reg32 = (reg32 & hpp->unc_err_mask_and) | hpp->unc_err_mask_or; + reg32 = (reg32 & hpx->unc_err_mask_and) | hpx->unc_err_mask_or; pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_MASK, reg32); /* Initialize Uncorrectable Error Severity Register */ pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, ®32); - reg32 = (reg32 & hpp->unc_err_sever_and) | hpp->unc_err_sever_or; + reg32 = (reg32 & hpx->unc_err_sever_and) | hpx->unc_err_sever_or; pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, reg32); /* Initialize Correctable Error Mask Register */ pci_read_config_dword(dev, pos + PCI_ERR_COR_MASK, ®32); - reg32 = (reg32 & hpp->cor_err_mask_and) | hpp->cor_err_mask_or; + reg32 = (reg32 & hpx->cor_err_mask_and) | hpx->cor_err_mask_or; pci_write_config_dword(dev, pos + PCI_ERR_COR_MASK, reg32); /* Initialize Advanced Error Capabilities and Control Register */ pci_read_config_dword(dev, pos + PCI_ERR_CAP, ®32); - reg32 = (reg32 & hpp->adv_err_cap_and) | hpp->adv_err_cap_or; + reg32 = (reg32 & hpx->adv_err_cap_and) | hpx->adv_err_cap_or; /* Don't enable ECRC generation or checking if unsupported */ if (!(reg32 & PCI_ERR_CAP_ECRC_GENC)) @@ -2178,15 +2178,15 @@ static void program_hpx_type3_register(struct pci_dev *dev, pos, orig_value, write_reg); } -static void program_hpx_type3(struct pci_dev *dev, struct hpx_type3 *hpx3) +static void program_hpx_type3(struct pci_dev *dev, struct hpx_type3 *hpx) { - if (!hpx3) + if (!hpx) return; if (!pci_is_pcie(dev)) return; - program_hpx_type3_register(dev, hpx3); + program_hpx_type3_register(dev, hpx); } int pci_configure_extended_tags(struct pci_dev *dev, void *ign) @@ -2370,9 +2370,9 @@ static void pci_configure_serr(struct pci_dev *dev) static void pci_configure_device(struct pci_dev *dev) { static const struct hotplug_program_ops hp_ops = { - .program_type0 = program_hpp_type0, - .program_type1 = program_hpp_type1, - .program_type2 = program_hpp_type2, + .program_type0 = program_hpx_type0, + .program_type1 = program_hpx_type1, + .program_type2 = program_hpx_type2, .program_type3 = program_hpx_type3, }; diff --git a/include/linux/pci_hotplug.h b/include/linux/pci_hotplug.h index f694eb2ca978..247a49ee27b3 100644 --- a/include/linux/pci_hotplug.h +++ b/include/linux/pci_hotplug.h @@ -86,25 +86,25 @@ void pci_hp_deregister(struct hotplug_slot *slot); #define pci_hp_initialize(slot, bus, nr, name) \ __pci_hp_initialize(slot, bus, nr, name, THIS_MODULE, KBUILD_MODNAME) -/* PCI Setting Record (Type 0) */ -struct hpp_type0 { - u32 revision; - u8 cache_line_size; - u8 latency_timer; +/* _HPX PCI Setting Record (Type 0); same as _HPP */ +struct hpx_type0 { + u32 revision; /* Not present in _HPP */ + u8 cache_line_size; /* Not applicable to PCIe */ + u8 latency_timer; /* Not applicable to PCIe */ u8 enable_serr; u8 enable_perr; }; -/* PCI-X Setting Record (Type 1) */ -struct hpp_type1 { +/* _HPX PCI-X Setting Record (Type 1) */ +struct hpx_type1 { u32 revision; u8 max_mem_read; u8 avg_max_split; u16 tot_max_split; }; -/* PCI Express Setting Record (Type 2) */ -struct hpp_type2 { +/* _HPX PCI Express Setting Record (Type 2) */ +struct hpx_type2 { u32 revision; u32 unc_err_mask_and; u32 unc_err_mask_or; @@ -124,9 +124,7 @@ struct hpp_type2 { u32 sec_unc_err_mask_or; }; -/* - * _HPX PCI Express Setting Record (Type 3) - */ +/* _HPX PCI Express Setting Record (Type 3) */ struct hpx_type3 { u16 device_type; u16 function_type; @@ -145,10 +143,10 @@ struct hpx_type3 { }; struct hotplug_program_ops { - void (*program_type0)(struct pci_dev *dev, struct hpp_type0 *hpp); - void (*program_type1)(struct pci_dev *dev, struct hpp_type1 *hpp); - void (*program_type2)(struct pci_dev *dev, struct hpp_type2 *hpp); - void (*program_type3)(struct pci_dev *dev, struct hpx_type3 *hpp); + void (*program_type0)(struct pci_dev *dev, struct hpx_type0 *hpx); + void (*program_type1)(struct pci_dev *dev, struct hpx_type1 *hpx); + void (*program_type2)(struct pci_dev *dev, struct hpx_type2 *hpx); + void (*program_type3)(struct pci_dev *dev, struct hpx_type3 *hpx); }; enum hpx_type3_dev_type { From 8c3aac6e1b6146ce771b1cabd78e593136d3e5f2 Mon Sep 17 00:00:00 2001 From: Krzysztof Wilczynski Date: Tue, 27 Aug 2019 11:49:50 +0200 Subject: [PATCH 187/721] PCI/ACPI: Move _HPP & _HPX functions to pci-acpi.c Move program_hpx_type0(), program_hpx_type1(), etc., and enums hpx_type3_dev_type, hpx_type3_fn_type and hpx_type3_cfg_loc to drivers/pci/pci-acpi.c as these functions and enums are ACPI-specific. Move structs hpx_type0, hpx_type1, hpx_type2 and hpx_type3 to drivers/pci/pci.h as these are shared between drivers/pci/pci-acpi.c and drivers/pci/probe.c. Link: https://lore.kernel.org/r/20190827094951.10613-3-kw@linux.com Signed-off-by: Krzysztof Wilczynski Signed-off-by: Bjorn Helgaas --- drivers/pci/pci-acpi.c | 296 ++++++++++++++++++++++++++++++++++++ drivers/pci/pci.h | 79 ++++++++++ drivers/pci/probe.c | 269 -------------------------------- include/linux/pci_hotplug.h | 98 ------------ 4 files changed, 375 insertions(+), 367 deletions(-) diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index 02addc47edae..0bfabb3f9931 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -118,6 +118,47 @@ phys_addr_t acpi_pci_root_get_mcfg_addr(acpi_handle handle) return (phys_addr_t)mcfg_addr; } +static struct hpx_type0 pci_default_type0 = { + .revision = 1, + .cache_line_size = 8, + .latency_timer = 0x40, + .enable_serr = 0, + .enable_perr = 0, +}; + +void program_hpx_type0(struct pci_dev *dev, struct hpx_type0 *hpx) +{ + u16 pci_cmd, pci_bctl; + + if (!hpx) + hpx = &pci_default_type0; + + if (hpx->revision > 1) { + pci_warn(dev, "PCI settings rev %d not supported; using defaults\n", + hpx->revision); + hpx = &pci_default_type0; + } + + pci_write_config_byte(dev, PCI_CACHE_LINE_SIZE, hpx->cache_line_size); + pci_write_config_byte(dev, PCI_LATENCY_TIMER, hpx->latency_timer); + pci_read_config_word(dev, PCI_COMMAND, &pci_cmd); + if (hpx->enable_serr) + pci_cmd |= PCI_COMMAND_SERR; + if (hpx->enable_perr) + pci_cmd |= PCI_COMMAND_PARITY; + pci_write_config_word(dev, PCI_COMMAND, pci_cmd); + + /* Program bridge control value */ + if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI) { + pci_write_config_byte(dev, PCI_SEC_LATENCY_TIMER, + hpx->latency_timer); + pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &pci_bctl); + if (hpx->enable_perr) + pci_bctl |= PCI_BRIDGE_CTL_PARITY; + pci_write_config_word(dev, PCI_BRIDGE_CONTROL, pci_bctl); + } +} + static acpi_status decode_type0_hpx_record(union acpi_object *record, struct hpx_type0 *hpx0) { @@ -146,6 +187,20 @@ static acpi_status decode_type0_hpx_record(union acpi_object *record, return AE_OK; } +void program_hpx_type1(struct pci_dev *dev, struct hpx_type1 *hpx) +{ + int pos; + + if (!hpx) + return; + + pos = pci_find_capability(dev, PCI_CAP_ID_PCIX); + if (!pos) + return; + + pci_warn(dev, "PCI-X settings not supported\n"); +} + static acpi_status decode_type1_hpx_record(union acpi_object *record, struct hpx_type1 *hpx1) { @@ -173,6 +228,107 @@ static acpi_status decode_type1_hpx_record(union acpi_object *record, return AE_OK; } +static bool pcie_root_rcb_set(struct pci_dev *dev) +{ + struct pci_dev *rp = pcie_find_root_port(dev); + u16 lnkctl; + + if (!rp) + return false; + + pcie_capability_read_word(rp, PCI_EXP_LNKCTL, &lnkctl); + if (lnkctl & PCI_EXP_LNKCTL_RCB) + return true; + + return false; +} + +void program_hpx_type2(struct pci_dev *dev, struct hpx_type2 *hpx) +{ + int pos; + u32 reg32; + + if (!hpx) + return; + + if (!pci_is_pcie(dev)) + return; + + if (hpx->revision > 1) { + pci_warn(dev, "PCIe settings rev %d not supported\n", + hpx->revision); + return; + } + + /* + * Don't allow _HPX to change MPS or MRRS settings. We manage + * those to make sure they're consistent with the rest of the + * platform. + */ + hpx->pci_exp_devctl_and |= PCI_EXP_DEVCTL_PAYLOAD | + PCI_EXP_DEVCTL_READRQ; + hpx->pci_exp_devctl_or &= ~(PCI_EXP_DEVCTL_PAYLOAD | + PCI_EXP_DEVCTL_READRQ); + + /* Initialize Device Control Register */ + pcie_capability_clear_and_set_word(dev, PCI_EXP_DEVCTL, + ~hpx->pci_exp_devctl_and, hpx->pci_exp_devctl_or); + + /* Initialize Link Control Register */ + if (pcie_cap_has_lnkctl(dev)) { + + /* + * If the Root Port supports Read Completion Boundary of + * 128, set RCB to 128. Otherwise, clear it. + */ + hpx->pci_exp_lnkctl_and |= PCI_EXP_LNKCTL_RCB; + hpx->pci_exp_lnkctl_or &= ~PCI_EXP_LNKCTL_RCB; + if (pcie_root_rcb_set(dev)) + hpx->pci_exp_lnkctl_or |= PCI_EXP_LNKCTL_RCB; + + pcie_capability_clear_and_set_word(dev, PCI_EXP_LNKCTL, + ~hpx->pci_exp_lnkctl_and, hpx->pci_exp_lnkctl_or); + } + + /* Find Advanced Error Reporting Enhanced Capability */ + pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR); + if (!pos) + return; + + /* Initialize Uncorrectable Error Mask Register */ + pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_MASK, ®32); + reg32 = (reg32 & hpx->unc_err_mask_and) | hpx->unc_err_mask_or; + pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_MASK, reg32); + + /* Initialize Uncorrectable Error Severity Register */ + pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, ®32); + reg32 = (reg32 & hpx->unc_err_sever_and) | hpx->unc_err_sever_or; + pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, reg32); + + /* Initialize Correctable Error Mask Register */ + pci_read_config_dword(dev, pos + PCI_ERR_COR_MASK, ®32); + reg32 = (reg32 & hpx->cor_err_mask_and) | hpx->cor_err_mask_or; + pci_write_config_dword(dev, pos + PCI_ERR_COR_MASK, reg32); + + /* Initialize Advanced Error Capabilities and Control Register */ + pci_read_config_dword(dev, pos + PCI_ERR_CAP, ®32); + reg32 = (reg32 & hpx->adv_err_cap_and) | hpx->adv_err_cap_or; + + /* Don't enable ECRC generation or checking if unsupported */ + if (!(reg32 & PCI_ERR_CAP_ECRC_GENC)) + reg32 &= ~PCI_ERR_CAP_ECRC_GENE; + if (!(reg32 & PCI_ERR_CAP_ECRC_CHKC)) + reg32 &= ~PCI_ERR_CAP_ECRC_CHKE; + pci_write_config_dword(dev, pos + PCI_ERR_CAP, reg32); + + /* + * FIXME: The following two registers are not supported yet. + * + * o Secondary Uncorrectable Error Severity Register + * o Secondary Uncorrectable Error Mask Register + */ +} + static acpi_status decode_type2_hpx_record(union acpi_object *record, struct hpx_type2 *hpx2) { @@ -213,6 +369,146 @@ static acpi_status decode_type2_hpx_record(union acpi_object *record, return AE_OK; } +enum hpx_type3_dev_type { + HPX_TYPE_ENDPOINT = BIT(0), + HPX_TYPE_LEG_END = BIT(1), + HPX_TYPE_RC_END = BIT(2), + HPX_TYPE_RC_EC = BIT(3), + HPX_TYPE_ROOT_PORT = BIT(4), + HPX_TYPE_UPSTREAM = BIT(5), + HPX_TYPE_DOWNSTREAM = BIT(6), + HPX_TYPE_PCI_BRIDGE = BIT(7), + HPX_TYPE_PCIE_BRIDGE = BIT(8), +}; + +static u16 hpx3_device_type(struct pci_dev *dev) +{ + u16 pcie_type = pci_pcie_type(dev); + const int pcie_to_hpx3_type[] = { + [PCI_EXP_TYPE_ENDPOINT] = HPX_TYPE_ENDPOINT, + [PCI_EXP_TYPE_LEG_END] = HPX_TYPE_LEG_END, + [PCI_EXP_TYPE_RC_END] = HPX_TYPE_RC_END, + [PCI_EXP_TYPE_RC_EC] = HPX_TYPE_RC_EC, + [PCI_EXP_TYPE_ROOT_PORT] = HPX_TYPE_ROOT_PORT, + [PCI_EXP_TYPE_UPSTREAM] = HPX_TYPE_UPSTREAM, + [PCI_EXP_TYPE_DOWNSTREAM] = HPX_TYPE_DOWNSTREAM, + [PCI_EXP_TYPE_PCI_BRIDGE] = HPX_TYPE_PCI_BRIDGE, + [PCI_EXP_TYPE_PCIE_BRIDGE] = HPX_TYPE_PCIE_BRIDGE, + }; + + if (pcie_type >= ARRAY_SIZE(pcie_to_hpx3_type)) + return 0; + + return pcie_to_hpx3_type[pcie_type]; +} + +enum hpx_type3_fn_type { + HPX_FN_NORMAL = BIT(0), + HPX_FN_SRIOV_PHYS = BIT(1), + HPX_FN_SRIOV_VIRT = BIT(2), +}; + +static u8 hpx3_function_type(struct pci_dev *dev) +{ + if (dev->is_virtfn) + return HPX_FN_SRIOV_VIRT; + else if (pci_find_ext_capability(dev, PCI_EXT_CAP_ID_SRIOV) > 0) + return HPX_FN_SRIOV_PHYS; + else + return HPX_FN_NORMAL; +} + +static bool hpx3_cap_ver_matches(u8 pcie_cap_id, u8 hpx3_cap_id) +{ + u8 cap_ver = hpx3_cap_id & 0xf; + + if ((hpx3_cap_id & BIT(4)) && cap_ver >= pcie_cap_id) + return true; + else if (cap_ver == pcie_cap_id) + return true; + + return false; +} + +enum hpx_type3_cfg_loc { + HPX_CFG_PCICFG = 0, + HPX_CFG_PCIE_CAP = 1, + HPX_CFG_PCIE_CAP_EXT = 2, + HPX_CFG_VEND_CAP = 3, + HPX_CFG_DVSEC = 4, + HPX_CFG_MAX, +}; + +static void program_hpx_type3_register(struct pci_dev *dev, + const struct hpx_type3 *reg) +{ + u32 match_reg, write_reg, header, orig_value; + u16 pos; + + if (!(hpx3_device_type(dev) & reg->device_type)) + return; + + if (!(hpx3_function_type(dev) & reg->function_type)) + return; + + switch (reg->config_space_location) { + case HPX_CFG_PCICFG: + pos = 0; + break; + case HPX_CFG_PCIE_CAP: + pos = pci_find_capability(dev, reg->pci_exp_cap_id); + if (pos == 0) + return; + + break; + case HPX_CFG_PCIE_CAP_EXT: + pos = pci_find_ext_capability(dev, reg->pci_exp_cap_id); + if (pos == 0) + return; + + pci_read_config_dword(dev, pos, &header); + if (!hpx3_cap_ver_matches(PCI_EXT_CAP_VER(header), + reg->pci_exp_cap_ver)) + return; + + break; + case HPX_CFG_VEND_CAP: /* Fall through */ + case HPX_CFG_DVSEC: /* Fall through */ + default: + pci_warn(dev, "Encountered _HPX type 3 with unsupported config space location"); + return; + } + + pci_read_config_dword(dev, pos + reg->match_offset, &match_reg); + + if ((match_reg & reg->match_mask_and) != reg->match_value) + return; + + pci_read_config_dword(dev, pos + reg->reg_offset, &write_reg); + orig_value = write_reg; + write_reg &= reg->reg_mask_and; + write_reg |= reg->reg_mask_or; + + if (orig_value == write_reg) + return; + + pci_write_config_dword(dev, pos + reg->reg_offset, write_reg); + + pci_dbg(dev, "Applied _HPX3 at [0x%x]: 0x%08x -> 0x%08x", + pos, orig_value, write_reg); +} + +void program_hpx_type3(struct pci_dev *dev, struct hpx_type3 *hpx) +{ + if (!hpx) + return; + + if (!pci_is_pcie(dev)) + return; + + program_hpx_type3_register(dev, hpx); +} + static void parse_hpx3_register(struct hpx_type3 *hpx3_reg, union acpi_object *reg_fields) { diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 1be03a97cb92..dad43c64b350 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -608,4 +608,83 @@ static inline void pci_aer_clear_fatal_status(struct pci_dev *dev) { } static inline void pci_aer_clear_device_status(struct pci_dev *dev) { } #endif +/* _HPX PCI Setting Record (Type 0); same as _HPP */ +struct hpx_type0 { + u32 revision; /* Not present in _HPP */ + u8 cache_line_size; /* Not applicable to PCIe */ + u8 latency_timer; /* Not applicable to PCIe */ + u8 enable_serr; + u8 enable_perr; +}; + +/* _HPX PCI-X Setting Record (Type 1) */ +struct hpx_type1 { + u32 revision; + u8 max_mem_read; + u8 avg_max_split; + u16 tot_max_split; +}; + +/* _HPX PCI Express Setting Record (Type 2) */ +struct hpx_type2 { + u32 revision; + u32 unc_err_mask_and; + u32 unc_err_mask_or; + u32 unc_err_sever_and; + u32 unc_err_sever_or; + u32 cor_err_mask_and; + u32 cor_err_mask_or; + u32 adv_err_cap_and; + u32 adv_err_cap_or; + u16 pci_exp_devctl_and; + u16 pci_exp_devctl_or; + u16 pci_exp_lnkctl_and; + u16 pci_exp_lnkctl_or; + u32 sec_unc_err_sever_and; + u32 sec_unc_err_sever_or; + u32 sec_unc_err_mask_and; + u32 sec_unc_err_mask_or; +}; + +/* _HPX PCI Express Setting Record (Type 3) */ +struct hpx_type3 { + u16 device_type; + u16 function_type; + u16 config_space_location; + u16 pci_exp_cap_id; + u16 pci_exp_cap_ver; + u16 pci_exp_vendor_id; + u16 dvsec_id; + u16 dvsec_rev; + u16 match_offset; + u32 match_mask_and; + u32 match_value; + u16 reg_offset; + u32 reg_mask_and; + u32 reg_mask_or; +}; + +void program_hpx_type0(struct pci_dev *dev, struct hpx_type0 *hpx); +void program_hpx_type1(struct pci_dev *dev, struct hpx_type1 *hpx); +void program_hpx_type2(struct pci_dev *dev, struct hpx_type2 *hpx); +void program_hpx_type3(struct pci_dev *dev, struct hpx_type3 *hpx); + +struct hotplug_program_ops { + void (*program_type0)(struct pci_dev *dev, struct hpx_type0 *hpx); + void (*program_type1)(struct pci_dev *dev, struct hpx_type1 *hpx); + void (*program_type2)(struct pci_dev *dev, struct hpx_type2 *hpx); + void (*program_type3)(struct pci_dev *dev, struct hpx_type3 *hpx); +}; + +#ifdef CONFIG_ACPI +int pci_acpi_program_hp_params(struct pci_dev *dev, + const struct hotplug_program_ops *hp_ops); +#else +static inline int pci_acpi_program_hp_params(struct pci_dev *dev, + const struct hotplug_program_ops *hp_ops) +{ + return -ENODEV; +} +#endif + #endif /* DRIVERS_PCI_H */ diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 120c70b5003b..5847b557dc9a 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1920,275 +1920,6 @@ static void pci_configure_mps(struct pci_dev *dev) p_mps, mps, mpss); } -static struct hpx_type0 pci_default_type0 = { - .revision = 1, - .cache_line_size = 8, - .latency_timer = 0x40, - .enable_serr = 0, - .enable_perr = 0, -}; - -static void program_hpx_type0(struct pci_dev *dev, struct hpx_type0 *hpx) -{ - u16 pci_cmd, pci_bctl; - - if (!hpx) - hpx = &pci_default_type0; - - if (hpx->revision > 1) { - pci_warn(dev, "PCI settings rev %d not supported; using defaults\n", - hpx->revision); - hpx = &pci_default_type0; - } - - pci_write_config_byte(dev, PCI_CACHE_LINE_SIZE, hpx->cache_line_size); - pci_write_config_byte(dev, PCI_LATENCY_TIMER, hpx->latency_timer); - pci_read_config_word(dev, PCI_COMMAND, &pci_cmd); - if (hpx->enable_serr) - pci_cmd |= PCI_COMMAND_SERR; - if (hpx->enable_perr) - pci_cmd |= PCI_COMMAND_PARITY; - pci_write_config_word(dev, PCI_COMMAND, pci_cmd); - - /* Program bridge control value */ - if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI) { - pci_write_config_byte(dev, PCI_SEC_LATENCY_TIMER, - hpx->latency_timer); - pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &pci_bctl); - if (hpx->enable_perr) - pci_bctl |= PCI_BRIDGE_CTL_PARITY; - pci_write_config_word(dev, PCI_BRIDGE_CONTROL, pci_bctl); - } -} - -static void program_hpx_type1(struct pci_dev *dev, struct hpx_type1 *hpx) -{ - int pos; - - if (!hpx) - return; - - pos = pci_find_capability(dev, PCI_CAP_ID_PCIX); - if (!pos) - return; - - pci_warn(dev, "PCI-X settings not supported\n"); -} - -static bool pcie_root_rcb_set(struct pci_dev *dev) -{ - struct pci_dev *rp = pcie_find_root_port(dev); - u16 lnkctl; - - if (!rp) - return false; - - pcie_capability_read_word(rp, PCI_EXP_LNKCTL, &lnkctl); - if (lnkctl & PCI_EXP_LNKCTL_RCB) - return true; - - return false; -} - -static void program_hpx_type2(struct pci_dev *dev, struct hpx_type2 *hpx) -{ - int pos; - u32 reg32; - - if (!hpx) - return; - - if (!pci_is_pcie(dev)) - return; - - if (hpx->revision > 1) { - pci_warn(dev, "PCIe settings rev %d not supported\n", - hpx->revision); - return; - } - - /* - * Don't allow _HPX to change MPS or MRRS settings. We manage - * those to make sure they're consistent with the rest of the - * platform. - */ - hpx->pci_exp_devctl_and |= PCI_EXP_DEVCTL_PAYLOAD | - PCI_EXP_DEVCTL_READRQ; - hpx->pci_exp_devctl_or &= ~(PCI_EXP_DEVCTL_PAYLOAD | - PCI_EXP_DEVCTL_READRQ); - - /* Initialize Device Control Register */ - pcie_capability_clear_and_set_word(dev, PCI_EXP_DEVCTL, - ~hpx->pci_exp_devctl_and, hpx->pci_exp_devctl_or); - - /* Initialize Link Control Register */ - if (pcie_cap_has_lnkctl(dev)) { - - /* - * If the Root Port supports Read Completion Boundary of - * 128, set RCB to 128. Otherwise, clear it. - */ - hpx->pci_exp_lnkctl_and |= PCI_EXP_LNKCTL_RCB; - hpx->pci_exp_lnkctl_or &= ~PCI_EXP_LNKCTL_RCB; - if (pcie_root_rcb_set(dev)) - hpx->pci_exp_lnkctl_or |= PCI_EXP_LNKCTL_RCB; - - pcie_capability_clear_and_set_word(dev, PCI_EXP_LNKCTL, - ~hpx->pci_exp_lnkctl_and, hpx->pci_exp_lnkctl_or); - } - - /* Find Advanced Error Reporting Enhanced Capability */ - pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR); - if (!pos) - return; - - /* Initialize Uncorrectable Error Mask Register */ - pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_MASK, ®32); - reg32 = (reg32 & hpx->unc_err_mask_and) | hpx->unc_err_mask_or; - pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_MASK, reg32); - - /* Initialize Uncorrectable Error Severity Register */ - pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, ®32); - reg32 = (reg32 & hpx->unc_err_sever_and) | hpx->unc_err_sever_or; - pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, reg32); - - /* Initialize Correctable Error Mask Register */ - pci_read_config_dword(dev, pos + PCI_ERR_COR_MASK, ®32); - reg32 = (reg32 & hpx->cor_err_mask_and) | hpx->cor_err_mask_or; - pci_write_config_dword(dev, pos + PCI_ERR_COR_MASK, reg32); - - /* Initialize Advanced Error Capabilities and Control Register */ - pci_read_config_dword(dev, pos + PCI_ERR_CAP, ®32); - reg32 = (reg32 & hpx->adv_err_cap_and) | hpx->adv_err_cap_or; - - /* Don't enable ECRC generation or checking if unsupported */ - if (!(reg32 & PCI_ERR_CAP_ECRC_GENC)) - reg32 &= ~PCI_ERR_CAP_ECRC_GENE; - if (!(reg32 & PCI_ERR_CAP_ECRC_CHKC)) - reg32 &= ~PCI_ERR_CAP_ECRC_CHKE; - pci_write_config_dword(dev, pos + PCI_ERR_CAP, reg32); - - /* - * FIXME: The following two registers are not supported yet. - * - * o Secondary Uncorrectable Error Severity Register - * o Secondary Uncorrectable Error Mask Register - */ -} - -static u16 hpx3_device_type(struct pci_dev *dev) -{ - u16 pcie_type = pci_pcie_type(dev); - const int pcie_to_hpx3_type[] = { - [PCI_EXP_TYPE_ENDPOINT] = HPX_TYPE_ENDPOINT, - [PCI_EXP_TYPE_LEG_END] = HPX_TYPE_LEG_END, - [PCI_EXP_TYPE_RC_END] = HPX_TYPE_RC_END, - [PCI_EXP_TYPE_RC_EC] = HPX_TYPE_RC_EC, - [PCI_EXP_TYPE_ROOT_PORT] = HPX_TYPE_ROOT_PORT, - [PCI_EXP_TYPE_UPSTREAM] = HPX_TYPE_UPSTREAM, - [PCI_EXP_TYPE_DOWNSTREAM] = HPX_TYPE_DOWNSTREAM, - [PCI_EXP_TYPE_PCI_BRIDGE] = HPX_TYPE_PCI_BRIDGE, - [PCI_EXP_TYPE_PCIE_BRIDGE] = HPX_TYPE_PCIE_BRIDGE, - }; - - if (pcie_type >= ARRAY_SIZE(pcie_to_hpx3_type)) - return 0; - - return pcie_to_hpx3_type[pcie_type]; -} - -static u8 hpx3_function_type(struct pci_dev *dev) -{ - if (dev->is_virtfn) - return HPX_FN_SRIOV_VIRT; - else if (pci_find_ext_capability(dev, PCI_EXT_CAP_ID_SRIOV) > 0) - return HPX_FN_SRIOV_PHYS; - else - return HPX_FN_NORMAL; -} - -static bool hpx3_cap_ver_matches(u8 pcie_cap_id, u8 hpx3_cap_id) -{ - u8 cap_ver = hpx3_cap_id & 0xf; - - if ((hpx3_cap_id & BIT(4)) && cap_ver >= pcie_cap_id) - return true; - else if (cap_ver == pcie_cap_id) - return true; - - return false; -} - -static void program_hpx_type3_register(struct pci_dev *dev, - const struct hpx_type3 *reg) -{ - u32 match_reg, write_reg, header, orig_value; - u16 pos; - - if (!(hpx3_device_type(dev) & reg->device_type)) - return; - - if (!(hpx3_function_type(dev) & reg->function_type)) - return; - - switch (reg->config_space_location) { - case HPX_CFG_PCICFG: - pos = 0; - break; - case HPX_CFG_PCIE_CAP: - pos = pci_find_capability(dev, reg->pci_exp_cap_id); - if (pos == 0) - return; - - break; - case HPX_CFG_PCIE_CAP_EXT: - pos = pci_find_ext_capability(dev, reg->pci_exp_cap_id); - if (pos == 0) - return; - - pci_read_config_dword(dev, pos, &header); - if (!hpx3_cap_ver_matches(PCI_EXT_CAP_VER(header), - reg->pci_exp_cap_ver)) - return; - - break; - case HPX_CFG_VEND_CAP: /* Fall through */ - case HPX_CFG_DVSEC: /* Fall through */ - default: - pci_warn(dev, "Encountered _HPX type 3 with unsupported config space location"); - return; - } - - pci_read_config_dword(dev, pos + reg->match_offset, &match_reg); - - if ((match_reg & reg->match_mask_and) != reg->match_value) - return; - - pci_read_config_dword(dev, pos + reg->reg_offset, &write_reg); - orig_value = write_reg; - write_reg &= reg->reg_mask_and; - write_reg |= reg->reg_mask_or; - - if (orig_value == write_reg) - return; - - pci_write_config_dword(dev, pos + reg->reg_offset, write_reg); - - pci_dbg(dev, "Applied _HPX3 at [0x%x]: 0x%08x -> 0x%08x", - pos, orig_value, write_reg); -} - -static void program_hpx_type3(struct pci_dev *dev, struct hpx_type3 *hpx) -{ - if (!hpx) - return; - - if (!pci_is_pcie(dev)) - return; - - program_hpx_type3_register(dev, hpx); -} - int pci_configure_extended_tags(struct pci_dev *dev, void *ign) { struct pci_host_bridge *host; diff --git a/include/linux/pci_hotplug.h b/include/linux/pci_hotplug.h index 247a49ee27b3..b482e42d7153 100644 --- a/include/linux/pci_hotplug.h +++ b/include/linux/pci_hotplug.h @@ -86,112 +86,14 @@ void pci_hp_deregister(struct hotplug_slot *slot); #define pci_hp_initialize(slot, bus, nr, name) \ __pci_hp_initialize(slot, bus, nr, name, THIS_MODULE, KBUILD_MODNAME) -/* _HPX PCI Setting Record (Type 0); same as _HPP */ -struct hpx_type0 { - u32 revision; /* Not present in _HPP */ - u8 cache_line_size; /* Not applicable to PCIe */ - u8 latency_timer; /* Not applicable to PCIe */ - u8 enable_serr; - u8 enable_perr; -}; - -/* _HPX PCI-X Setting Record (Type 1) */ -struct hpx_type1 { - u32 revision; - u8 max_mem_read; - u8 avg_max_split; - u16 tot_max_split; -}; - -/* _HPX PCI Express Setting Record (Type 2) */ -struct hpx_type2 { - u32 revision; - u32 unc_err_mask_and; - u32 unc_err_mask_or; - u32 unc_err_sever_and; - u32 unc_err_sever_or; - u32 cor_err_mask_and; - u32 cor_err_mask_or; - u32 adv_err_cap_and; - u32 adv_err_cap_or; - u16 pci_exp_devctl_and; - u16 pci_exp_devctl_or; - u16 pci_exp_lnkctl_and; - u16 pci_exp_lnkctl_or; - u32 sec_unc_err_sever_and; - u32 sec_unc_err_sever_or; - u32 sec_unc_err_mask_and; - u32 sec_unc_err_mask_or; -}; - -/* _HPX PCI Express Setting Record (Type 3) */ -struct hpx_type3 { - u16 device_type; - u16 function_type; - u16 config_space_location; - u16 pci_exp_cap_id; - u16 pci_exp_cap_ver; - u16 pci_exp_vendor_id; - u16 dvsec_id; - u16 dvsec_rev; - u16 match_offset; - u32 match_mask_and; - u32 match_value; - u16 reg_offset; - u32 reg_mask_and; - u32 reg_mask_or; -}; - -struct hotplug_program_ops { - void (*program_type0)(struct pci_dev *dev, struct hpx_type0 *hpx); - void (*program_type1)(struct pci_dev *dev, struct hpx_type1 *hpx); - void (*program_type2)(struct pci_dev *dev, struct hpx_type2 *hpx); - void (*program_type3)(struct pci_dev *dev, struct hpx_type3 *hpx); -}; - -enum hpx_type3_dev_type { - HPX_TYPE_ENDPOINT = BIT(0), - HPX_TYPE_LEG_END = BIT(1), - HPX_TYPE_RC_END = BIT(2), - HPX_TYPE_RC_EC = BIT(3), - HPX_TYPE_ROOT_PORT = BIT(4), - HPX_TYPE_UPSTREAM = BIT(5), - HPX_TYPE_DOWNSTREAM = BIT(6), - HPX_TYPE_PCI_BRIDGE = BIT(7), - HPX_TYPE_PCIE_BRIDGE = BIT(8), -}; - -enum hpx_type3_fn_type { - HPX_FN_NORMAL = BIT(0), - HPX_FN_SRIOV_PHYS = BIT(1), - HPX_FN_SRIOV_VIRT = BIT(2), -}; - -enum hpx_type3_cfg_loc { - HPX_CFG_PCICFG = 0, - HPX_CFG_PCIE_CAP = 1, - HPX_CFG_PCIE_CAP_EXT = 2, - HPX_CFG_VEND_CAP = 3, - HPX_CFG_DVSEC = 4, - HPX_CFG_MAX, -}; - #ifdef CONFIG_ACPI #include -int pci_acpi_program_hp_params(struct pci_dev *dev, - const struct hotplug_program_ops *hp_ops); bool pciehp_is_native(struct pci_dev *bridge); int acpi_get_hp_hw_control_from_firmware(struct pci_dev *bridge); bool shpchp_is_native(struct pci_dev *bridge); int acpi_pci_check_ejectable(struct pci_bus *pbus, acpi_handle handle); int acpi_pci_detect_ejectable(acpi_handle handle); #else -static inline int pci_acpi_program_hp_params(struct pci_dev *dev, - const struct hotplug_program_ops *hp_ops) -{ - return -ENODEV; -} - static inline int acpi_get_hp_hw_control_from_firmware(struct pci_dev *bridge) { return 0; From 4a2dbeddd3d54484d2e9c12965e4f9bfa1a0ee36 Mon Sep 17 00:00:00 2001 From: Krzysztof Wilczynski Date: Tue, 27 Aug 2019 11:49:51 +0200 Subject: [PATCH 188/721] PCI/ACPI: Remove unnecessary struct hotplug_program_ops Move the ACPI-specific structs hpx_type0, hpx_type1, hpx_type2 and hpx_type3 to drivers/pci/pci-acpi.c as they are not used anywhere else. Then remove the struct hotplug_program_ops that has been shared between drivers/pci/probe.c and drivers/pci/pci-acpi.c from drivers/pci/pci.h as it is no longer needed. The struct hotplug_program_ops was added by 87fcf12e846a ("PCI/ACPI: Remove the need for 'struct hotplug_params'") and replaced previously used struct hotplug_params enabling the support for the _HPX Type 3 Setting Record that was added by f873c51a155a ("PCI/ACPI: Implement _HPX Type 3 Setting Record"). The new struct allowed for the static functions such program_hpx_type0(), program_hpx_type1(), etc., from the drivers/pci/probe.c to be called from the function pci_acpi_program_hp_params() in the drivers/pci/pci-acpi.c. Previously a programming of _HPX Type 0 was as follows: drivers/pci/probe.c: program_hpx_type0() ... pci_configure_device() hp_ops = { .program_type0 = program_hpx_type0, ... } pci_acpi_program_hp_params(&hp_ops) drivers/pci/pci-acpi.c: pci_acpi_program_hp_params(&hp_ops) acpi_run_hpx(hp_ops) decode_type0_hpx_record() hp_ops->program_type0 # program_hpx_type0() called via hp_ops After the ACPI-specific functions, structs, enums, etc., have been moved to drivers/pci/pci-acpi.c there is no need for the hotplug_program_ops as all of the _HPX Type 0, 1, 2 and 3 are directly accessible. Link: https://lore.kernel.org/r/20190827094951.10613-4-kw@linux.com Signed-off-by: Krzysztof Wilczynski Signed-off-by: Bjorn Helgaas --- drivers/pci/pci-acpi.c | 95 ++++++++++++++++++++++++++++++++---------- drivers/pci/pci.h | 74 +------------------------------- drivers/pci/probe.c | 9 +--- 3 files changed, 76 insertions(+), 102 deletions(-) diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index 0bfabb3f9931..6e65d1ff93da 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -118,6 +118,15 @@ phys_addr_t acpi_pci_root_get_mcfg_addr(acpi_handle handle) return (phys_addr_t)mcfg_addr; } +/* _HPX PCI Setting Record (Type 0); same as _HPP */ +struct hpx_type0 { + u32 revision; /* Not present in _HPP */ + u8 cache_line_size; /* Not applicable to PCIe */ + u8 latency_timer; /* Not applicable to PCIe */ + u8 enable_serr; + u8 enable_perr; +}; + static struct hpx_type0 pci_default_type0 = { .revision = 1, .cache_line_size = 8, @@ -126,7 +135,7 @@ static struct hpx_type0 pci_default_type0 = { .enable_perr = 0, }; -void program_hpx_type0(struct pci_dev *dev, struct hpx_type0 *hpx) +static void program_hpx_type0(struct pci_dev *dev, struct hpx_type0 *hpx) { u16 pci_cmd, pci_bctl; @@ -187,7 +196,15 @@ static acpi_status decode_type0_hpx_record(union acpi_object *record, return AE_OK; } -void program_hpx_type1(struct pci_dev *dev, struct hpx_type1 *hpx) +/* _HPX PCI-X Setting Record (Type 1) */ +struct hpx_type1 { + u32 revision; + u8 max_mem_read; + u8 avg_max_split; + u16 tot_max_split; +}; + +static void program_hpx_type1(struct pci_dev *dev, struct hpx_type1 *hpx) { int pos; @@ -243,7 +260,28 @@ static bool pcie_root_rcb_set(struct pci_dev *dev) return false; } -void program_hpx_type2(struct pci_dev *dev, struct hpx_type2 *hpx) +/* _HPX PCI Express Setting Record (Type 2) */ +struct hpx_type2 { + u32 revision; + u32 unc_err_mask_and; + u32 unc_err_mask_or; + u32 unc_err_sever_and; + u32 unc_err_sever_or; + u32 cor_err_mask_and; + u32 cor_err_mask_or; + u32 adv_err_cap_and; + u32 adv_err_cap_or; + u16 pci_exp_devctl_and; + u16 pci_exp_devctl_or; + u16 pci_exp_lnkctl_and; + u16 pci_exp_lnkctl_or; + u32 sec_unc_err_sever_and; + u32 sec_unc_err_sever_or; + u32 sec_unc_err_mask_and; + u32 sec_unc_err_mask_or; +}; + +static void program_hpx_type2(struct pci_dev *dev, struct hpx_type2 *hpx) { int pos; u32 reg32; @@ -369,6 +407,24 @@ static acpi_status decode_type2_hpx_record(union acpi_object *record, return AE_OK; } +/* _HPX PCI Express Setting Record (Type 3) */ +struct hpx_type3 { + u16 device_type; + u16 function_type; + u16 config_space_location; + u16 pci_exp_cap_id; + u16 pci_exp_cap_ver; + u16 pci_exp_vendor_id; + u16 dvsec_id; + u16 dvsec_rev; + u16 match_offset; + u32 match_mask_and; + u32 match_value; + u16 reg_offset; + u32 reg_mask_and; + u32 reg_mask_or; +}; + enum hpx_type3_dev_type { HPX_TYPE_ENDPOINT = BIT(0), HPX_TYPE_LEG_END = BIT(1), @@ -498,7 +554,7 @@ static void program_hpx_type3_register(struct pci_dev *dev, pos, orig_value, write_reg); } -void program_hpx_type3(struct pci_dev *dev, struct hpx_type3 *hpx) +static void program_hpx_type3(struct pci_dev *dev, struct hpx_type3 *hpx) { if (!hpx) return; @@ -529,8 +585,7 @@ static void parse_hpx3_register(struct hpx_type3 *hpx3_reg, } static acpi_status program_type3_hpx_record(struct pci_dev *dev, - union acpi_object *record, - const struct hotplug_program_ops *hp_ops) + union acpi_object *record) { union acpi_object *fields = record->package.elements; u32 desc_count, expected_length, revision; @@ -554,7 +609,7 @@ static acpi_status program_type3_hpx_record(struct pci_dev *dev, for (i = 0; i < desc_count; i++) { reg_fields = fields + 3 + i * 14; parse_hpx3_register(&hpx3, reg_fields); - hp_ops->program_type3(dev, &hpx3); + program_hpx_type3(dev, &hpx3); } break; @@ -567,8 +622,7 @@ static acpi_status program_type3_hpx_record(struct pci_dev *dev, return AE_OK; } -static acpi_status acpi_run_hpx(struct pci_dev *dev, acpi_handle handle, - const struct hotplug_program_ops *hp_ops) +static acpi_status acpi_run_hpx(struct pci_dev *dev, acpi_handle handle) { acpi_status status; struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL}; @@ -610,24 +664,24 @@ static acpi_status acpi_run_hpx(struct pci_dev *dev, acpi_handle handle, status = decode_type0_hpx_record(record, &hpx0); if (ACPI_FAILURE(status)) goto exit; - hp_ops->program_type0(dev, &hpx0); + program_hpx_type0(dev, &hpx0); break; case 1: memset(&hpx1, 0, sizeof(hpx1)); status = decode_type1_hpx_record(record, &hpx1); if (ACPI_FAILURE(status)) goto exit; - hp_ops->program_type1(dev, &hpx1); + program_hpx_type1(dev, &hpx1); break; case 2: memset(&hpx2, 0, sizeof(hpx2)); status = decode_type2_hpx_record(record, &hpx2); if (ACPI_FAILURE(status)) goto exit; - hp_ops->program_type2(dev, &hpx2); + program_hpx_type2(dev, &hpx2); break; case 3: - status = program_type3_hpx_record(dev, record, hp_ops); + status = program_type3_hpx_record(dev, record); if (ACPI_FAILURE(status)) goto exit; break; @@ -643,8 +697,7 @@ static acpi_status acpi_run_hpx(struct pci_dev *dev, acpi_handle handle, return status; } -static acpi_status acpi_run_hpp(struct pci_dev *dev, acpi_handle handle, - const struct hotplug_program_ops *hp_ops) +static acpi_status acpi_run_hpp(struct pci_dev *dev, acpi_handle handle) { acpi_status status; struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; @@ -679,20 +732,18 @@ static acpi_status acpi_run_hpp(struct pci_dev *dev, acpi_handle handle, hpx0.enable_serr = fields[2].integer.value; hpx0.enable_perr = fields[3].integer.value; - hp_ops->program_type0(dev, &hpx0); + program_hpx_type0(dev, &hpx0); exit: kfree(buffer.pointer); return status; } -/* pci_get_hp_params +/* pci_acpi_program_hp_params * * @dev - the pci_dev for which we want parameters - * @hpp - allocated by the caller */ -int pci_acpi_program_hp_params(struct pci_dev *dev, - const struct hotplug_program_ops *hp_ops) +int pci_acpi_program_hp_params(struct pci_dev *dev) { acpi_status status; acpi_handle handle, phandle; @@ -715,10 +766,10 @@ int pci_acpi_program_hp_params(struct pci_dev *dev, * this pci dev. */ while (handle) { - status = acpi_run_hpx(dev, handle, hp_ops); + status = acpi_run_hpx(dev, handle); if (ACPI_SUCCESS(status)) return 0; - status = acpi_run_hpp(dev, handle, hp_ops); + status = acpi_run_hpp(dev, handle); if (ACPI_SUCCESS(status)) return 0; if (acpi_is_root_bridge(handle)) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index dad43c64b350..97180274c60b 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -608,80 +608,10 @@ static inline void pci_aer_clear_fatal_status(struct pci_dev *dev) { } static inline void pci_aer_clear_device_status(struct pci_dev *dev) { } #endif -/* _HPX PCI Setting Record (Type 0); same as _HPP */ -struct hpx_type0 { - u32 revision; /* Not present in _HPP */ - u8 cache_line_size; /* Not applicable to PCIe */ - u8 latency_timer; /* Not applicable to PCIe */ - u8 enable_serr; - u8 enable_perr; -}; - -/* _HPX PCI-X Setting Record (Type 1) */ -struct hpx_type1 { - u32 revision; - u8 max_mem_read; - u8 avg_max_split; - u16 tot_max_split; -}; - -/* _HPX PCI Express Setting Record (Type 2) */ -struct hpx_type2 { - u32 revision; - u32 unc_err_mask_and; - u32 unc_err_mask_or; - u32 unc_err_sever_and; - u32 unc_err_sever_or; - u32 cor_err_mask_and; - u32 cor_err_mask_or; - u32 adv_err_cap_and; - u32 adv_err_cap_or; - u16 pci_exp_devctl_and; - u16 pci_exp_devctl_or; - u16 pci_exp_lnkctl_and; - u16 pci_exp_lnkctl_or; - u32 sec_unc_err_sever_and; - u32 sec_unc_err_sever_or; - u32 sec_unc_err_mask_and; - u32 sec_unc_err_mask_or; -}; - -/* _HPX PCI Express Setting Record (Type 3) */ -struct hpx_type3 { - u16 device_type; - u16 function_type; - u16 config_space_location; - u16 pci_exp_cap_id; - u16 pci_exp_cap_ver; - u16 pci_exp_vendor_id; - u16 dvsec_id; - u16 dvsec_rev; - u16 match_offset; - u32 match_mask_and; - u32 match_value; - u16 reg_offset; - u32 reg_mask_and; - u32 reg_mask_or; -}; - -void program_hpx_type0(struct pci_dev *dev, struct hpx_type0 *hpx); -void program_hpx_type1(struct pci_dev *dev, struct hpx_type1 *hpx); -void program_hpx_type2(struct pci_dev *dev, struct hpx_type2 *hpx); -void program_hpx_type3(struct pci_dev *dev, struct hpx_type3 *hpx); - -struct hotplug_program_ops { - void (*program_type0)(struct pci_dev *dev, struct hpx_type0 *hpx); - void (*program_type1)(struct pci_dev *dev, struct hpx_type1 *hpx); - void (*program_type2)(struct pci_dev *dev, struct hpx_type2 *hpx); - void (*program_type3)(struct pci_dev *dev, struct hpx_type3 *hpx); -}; - #ifdef CONFIG_ACPI -int pci_acpi_program_hp_params(struct pci_dev *dev, - const struct hotplug_program_ops *hp_ops); +int pci_acpi_program_hp_params(struct pci_dev *dev); #else -static inline int pci_acpi_program_hp_params(struct pci_dev *dev, - const struct hotplug_program_ops *hp_ops) +static inline int pci_acpi_program_hp_params(struct pci_dev *dev) { return -ENODEV; } diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 5847b557dc9a..cdf34a3c531a 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -2100,13 +2100,6 @@ static void pci_configure_serr(struct pci_dev *dev) static void pci_configure_device(struct pci_dev *dev) { - static const struct hotplug_program_ops hp_ops = { - .program_type0 = program_hpx_type0, - .program_type1 = program_hpx_type1, - .program_type2 = program_hpx_type2, - .program_type3 = program_hpx_type3, - }; - pci_configure_mps(dev); pci_configure_extended_tags(dev, NULL); pci_configure_relaxed_ordering(dev); @@ -2114,7 +2107,7 @@ static void pci_configure_device(struct pci_dev *dev) pci_configure_eetlp_prefix(dev); pci_configure_serr(dev); - pci_acpi_program_hp_params(dev, &hp_ops); + pci_acpi_program_hp_params(dev); } static void pci_release_capabilities(struct pci_dev *dev) From ca85ee7457dc60771ec0173bfed58671177a74ab Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Tue, 27 Aug 2019 19:04:14 +0200 Subject: [PATCH 189/721] dt-bindings: i2c: bcm2835: Add brcm,bcm2711 compatible Add a new compatible for the BCM2711, which hasn't the clock stretch bug. Signed-off-by: Stefan Wahren Reviewed-by: Eric Anholt Reviewed-by: Rob Herring Signed-off-by: Wolfram Sang --- Documentation/devicetree/bindings/i2c/brcm,bcm2835-i2c.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/i2c/brcm,bcm2835-i2c.txt b/Documentation/devicetree/bindings/i2c/brcm,bcm2835-i2c.txt index e9de3756752b..c9a6587fe4bb 100644 --- a/Documentation/devicetree/bindings/i2c/brcm,bcm2835-i2c.txt +++ b/Documentation/devicetree/bindings/i2c/brcm,bcm2835-i2c.txt @@ -1,7 +1,9 @@ Broadcom BCM2835 I2C controller Required properties: -- compatible : Should be "brcm,bcm2835-i2c". +- compatible : Should be one of: + "brcm,bcm2711-i2c" + "brcm,bcm2835-i2c" - reg: Should contain register location and length. - interrupts: Should contain interrupt. - clocks : The clock feeding the I2C controller. From 67de10fbaa12dc7d46412c5ab80f784ea8b4d7f8 Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Tue, 27 Aug 2019 19:04:15 +0200 Subject: [PATCH 190/721] i2c: bcm2835: Avoid clk stretch quirk for BCM2711 The I2C block on the BCM2711 isn't affected by the clk stretching bug. So there is no need to apply the corresponding quirk. Signed-off-by: Stefan Wahren Reviewed-by: Eric Anholt Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-bcm2835.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/i2c/busses/i2c-bcm2835.c b/drivers/i2c/busses/i2c-bcm2835.c index 67752f7b0371..ab5502f11109 100644 --- a/drivers/i2c/busses/i2c-bcm2835.c +++ b/drivers/i2c/busses/i2c-bcm2835.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -389,7 +390,7 @@ static const struct i2c_algorithm bcm2835_i2c_algo = { }; /* - * This HW was reported to have problems with clock stretching: + * The BCM2835 was reported to have problems with clock stretching: * http://www.advamation.com/knowhow/raspberrypi/rpi-i2c-bug.html * https://www.raspberrypi.org/forums/viewtopic.php?p=146272 */ @@ -475,7 +476,7 @@ static int bcm2835_i2c_probe(struct platform_device *pdev) adap->algo = &bcm2835_i2c_algo; adap->dev.parent = &pdev->dev; adap->dev.of_node = pdev->dev.of_node; - adap->quirks = &bcm2835_i2c_quirks; + adap->quirks = of_device_get_match_data(&pdev->dev); bcm2835_i2c_writel(i2c_dev, BCM2835_I2C_C, 0); @@ -501,7 +502,8 @@ static int bcm2835_i2c_remove(struct platform_device *pdev) } static const struct of_device_id bcm2835_i2c_of_match[] = { - { .compatible = "brcm,bcm2835-i2c" }, + { .compatible = "brcm,bcm2711-i2c" }, + { .compatible = "brcm,bcm2835-i2c", .data = &bcm2835_i2c_quirks }, {}, }; MODULE_DEVICE_TABLE(of, bcm2835_i2c_of_match); From 250212b59a8e33f1b47f15ad9afa3db19ba0fc8d Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Tue, 27 Aug 2019 19:04:16 +0200 Subject: [PATCH 191/721] i2c: bcm2835: Add full name of devicetree node to adapter name Inspired by Lori Hikichi's patch for iproc, this adds the full name of the devicetree node to the adapter name. With the introduction of BCM2711 it's very difficult to distinguish between the multiple instances. Signed-off-by: Stefan Wahren Acked-by: Scott Branden Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-bcm2835.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-bcm2835.c b/drivers/i2c/busses/i2c-bcm2835.c index ab5502f11109..e01b2b57e724 100644 --- a/drivers/i2c/busses/i2c-bcm2835.c +++ b/drivers/i2c/busses/i2c-bcm2835.c @@ -472,7 +472,8 @@ static int bcm2835_i2c_probe(struct platform_device *pdev) i2c_set_adapdata(adap, i2c_dev); adap->owner = THIS_MODULE; adap->class = I2C_CLASS_DEPRECATED; - strlcpy(adap->name, "bcm2835 I2C adapter", sizeof(adap->name)); + snprintf(adap->name, sizeof(adap->name), "bcm2835 (%s)", + of_node_full_name(pdev->dev.of_node)); adap->algo = &bcm2835_i2c_algo; adap->dev.parent = &pdev->dev; adap->dev.of_node = pdev->dev.of_node; From 4768e90ecaec6b503ff64229bda5d91186d2edd3 Mon Sep 17 00:00:00 2001 From: Max Staudt Date: Tue, 20 Aug 2019 11:27:39 +0200 Subject: [PATCH 192/721] i2c: Add i2c-icy for I2C on m68k/Amiga This is the i2c-icy driver for the ICY board for Amiga computers. It connects a PCF8584 I2C controller to the Zorro bus, providing I2C connectivity. The original documentation can be found on Aminet: https://aminet.net/package/docs/hard/icy IRQ support is currently not implemented, as i2c-algo-pcf is built for the ISA bus and a straight implementation of the same stack locks up a Zorro machine. Signed-off-by: Max Staudt Reviewed-by: Geert Uytterhoeven [wsa: added a missing newline reported by checkpatch] Signed-off-by: Wolfram Sang --- MAINTAINERS | 6 ++ drivers/i2c/busses/Kconfig | 11 +++ drivers/i2c/busses/Makefile | 1 + drivers/i2c/busses/i2c-icy.c | 173 +++++++++++++++++++++++++++++++++++ 4 files changed, 191 insertions(+) create mode 100644 drivers/i2c/busses/i2c-icy.c diff --git a/MAINTAINERS b/MAINTAINERS index f89f27a2d09a..97d35f4e2068 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7750,6 +7750,12 @@ S: Maintained F: drivers/mfd/lpc_ich.c F: drivers/gpio/gpio-ich.c +ICY I2C DRIVER +M: Max Staudt +L: linux-i2c@vger.kernel.org +S: Maintained +F: drivers/i2c/busses/i2c-icy.c + IDE SUBSYSTEM M: "David S. Miller" L: linux-ide@vger.kernel.org diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index 69f19312a574..8b97ba4a2c0c 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -1309,6 +1309,17 @@ config I2C_ELEKTOR This support is also available as a module. If so, the module will be called i2c-elektor. +config I2C_ICY + tristate "ICY Zorro card" + depends on ZORRO + select I2C_ALGOPCF + help + This supports the PCF8584 Zorro bus I2C adapter, known as ICY. + Say Y if you own such an adapter. + + This support is also available as a module. If so, the module + will be called i2c-icy. + config I2C_MLXCPLD tristate "Mellanox I2C driver" depends on X86_64 diff --git a/drivers/i2c/busses/Makefile b/drivers/i2c/busses/Makefile index 80c23895eaaf..3ab8aebc39c9 100644 --- a/drivers/i2c/busses/Makefile +++ b/drivers/i2c/busses/Makefile @@ -140,6 +140,7 @@ obj-$(CONFIG_I2C_BCM_KONA) += i2c-bcm-kona.o obj-$(CONFIG_I2C_BRCMSTB) += i2c-brcmstb.o obj-$(CONFIG_I2C_CROS_EC_TUNNEL) += i2c-cros-ec-tunnel.o obj-$(CONFIG_I2C_ELEKTOR) += i2c-elektor.o +obj-$(CONFIG_I2C_ICY) += i2c-icy.o obj-$(CONFIG_I2C_MLXCPLD) += i2c-mlxcpld.o obj-$(CONFIG_I2C_OPAL) += i2c-opal.o obj-$(CONFIG_I2C_PCA_ISA) += i2c-pca-isa.o diff --git a/drivers/i2c/busses/i2c-icy.c b/drivers/i2c/busses/i2c-icy.c new file mode 100644 index 000000000000..0d61936dfc0b --- /dev/null +++ b/drivers/i2c/busses/i2c-icy.c @@ -0,0 +1,173 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * I2C driver for stand-alone PCF8584 style adapters on Zorro cards + * + * Original ICY documentation can be found on Aminet: + * https://aminet.net/package/docs/hard/icy + * + * There has been a modern community re-print of this design in 2019: + * https://www.a1k.org/forum/index.php?threads/70106/ + * + * The card is basically a Philips PCF8584 connected straight to the + * beginning of the AutoConfig'd address space (register S1 on base+2), + * with /INT on /INT2 on the Zorro bus. + * + * Copyright (c) 2019 Max Staudt + * + * This started as a fork of i2c-elektor.c and has evolved since. + * Thanks go to its authors for providing a base to grow on. + * + * + * IRQ support is currently not implemented. + * + * As it turns out, i2c-algo-pcf is really written with i2c-elektor's + * edge-triggered ISA interrupts in mind, while the Amiga's Zorro bus has + * level-triggered interrupts. This means that once an interrupt occurs, we + * have to tell the PCF8584 to shut up immediately, or it will keep the + * interrupt line busy and cause an IRQ storm. + + * However, because of the PCF8584's host-side protocol, there is no good + * way to just quieten it without side effects. Rather, we have to perform + * the next read/write operation straight away, which will reset the /INT + * pin. This entails re-designing the core of i2c-algo-pcf in the future. + * For now, we never request an IRQ from the PCF8584, and poll it instead. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include "../algos/i2c-algo-pcf.h" + +struct icy_i2c { + struct i2c_adapter adapter; + + void __iomem *reg_s0; + void __iomem *reg_s1; +}; + +/* + * Functions called by i2c-algo-pcf + */ +static void icy_pcf_setpcf(void *data, int ctl, int val) +{ + struct icy_i2c *i2c = (struct icy_i2c *)data; + + u8 __iomem *address = ctl ? i2c->reg_s1 : i2c->reg_s0; + + z_writeb(val, address); +} + +static int icy_pcf_getpcf(void *data, int ctl) +{ + struct icy_i2c *i2c = (struct icy_i2c *)data; + + u8 __iomem *address = ctl ? i2c->reg_s1 : i2c->reg_s0; + + return z_readb(address); +} + +static int icy_pcf_getown(void *data) +{ + return 0x55; +} + +static int icy_pcf_getclock(void *data) +{ + return 0x1c; +} + +static void icy_pcf_waitforpin(void *data) +{ + usleep_range(50, 150); +} + +/* + * Main i2c-icy part + */ +static int icy_probe(struct zorro_dev *z, + const struct zorro_device_id *ent) +{ + struct icy_i2c *i2c; + struct i2c_algo_pcf_data *algo_data; + + i2c = devm_kzalloc(&z->dev, sizeof(*i2c), GFP_KERNEL); + if (!i2c) + return -ENOMEM; + + algo_data = devm_kzalloc(&z->dev, sizeof(*algo_data), GFP_KERNEL); + if (!algo_data) + return -ENOMEM; + + dev_set_drvdata(&z->dev, i2c); + i2c->adapter.dev.parent = &z->dev; + i2c->adapter.owner = THIS_MODULE; + /* i2c->adapter.algo assigned by i2c_pcf_add_bus() */ + i2c->adapter.algo_data = algo_data; + strlcpy(i2c->adapter.name, "ICY I2C Zorro adapter", + sizeof(i2c->adapter.name)); + + if (!devm_request_mem_region(&z->dev, + z->resource.start, + 4, i2c->adapter.name)) + return -ENXIO; + + /* Driver private data */ + i2c->reg_s0 = ZTWO_VADDR(z->resource.start); + i2c->reg_s1 = ZTWO_VADDR(z->resource.start + 2); + + algo_data->data = i2c; + algo_data->setpcf = icy_pcf_setpcf; + algo_data->getpcf = icy_pcf_getpcf; + algo_data->getown = icy_pcf_getown; + algo_data->getclock = icy_pcf_getclock; + algo_data->waitforpin = icy_pcf_waitforpin; + + if (i2c_pcf_add_bus(&i2c->adapter)) { + dev_err(&z->dev, "i2c_pcf_add_bus() failed\n"); + return -ENXIO; + } + + dev_info(&z->dev, "ICY I2C controller at %pa, IRQ not implemented\n", + &z->resource.start); + + return 0; +} + +static void icy_remove(struct zorro_dev *z) +{ + struct icy_i2c *i2c = dev_get_drvdata(&z->dev); + + i2c_del_adapter(&i2c->adapter); +} + +static const struct zorro_device_id icy_zorro_tbl[] = { + { ZORRO_ID(VMC, 15, 0), }, + { 0 } +}; + +MODULE_DEVICE_TABLE(zorro, icy_zorro_tbl); + +static struct zorro_driver icy_driver = { + .name = "i2c-icy", + .id_table = icy_zorro_tbl, + .probe = icy_probe, + .remove = icy_remove, +}; + +module_driver(icy_driver, + zorro_register_driver, + zorro_unregister_driver); + +MODULE_AUTHOR("Max Staudt "); +MODULE_DESCRIPTION("I2C bus via PCF8584 on ICY Zorro card"); +MODULE_LICENSE("GPL v2"); From 724041ae15ed9639b72bb54dabfac3279c6b4e55 Mon Sep 17 00:00:00 2001 From: Max Staudt Date: Mon, 19 Aug 2019 14:16:18 +0200 Subject: [PATCH 193/721] i2c: icy: Add LTC2990 present on 2019 board revision Since the 2019 a1k.org community re-print of these PCBs sports an LTC2990 hwmon chip as an example use case, let this driver autoprobe for that as well. If it is present, modprobing ltc2990 is sufficient. The property_entry enables the three additional inputs available on this particular board: in1 will be the voltage of the 5V rail, divided by 2. in2 will be the voltage of the 12V rail, divided by 4. temp3 will be measured using a PCB loop next the chip. Signed-off-by: Max Staudt Reviewed-by: Geert Uytterhoeven Signed-off-by: Wolfram Sang --- drivers/i2c/busses/Kconfig | 3 ++ drivers/i2c/busses/i2c-icy.c | 57 ++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index 8b97ba4a2c0c..ee7f415a625a 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -1320,6 +1320,9 @@ config I2C_ICY This support is also available as a module. If so, the module will be called i2c-icy. + If you have a 2019 edition board with an LTC2990 sensor at address + 0x4c, loading the module 'ltc2990' is sufficient to enable it. + config I2C_MLXCPLD tristate "Mellanox I2C driver" depends on X86_64 diff --git a/drivers/i2c/busses/i2c-icy.c b/drivers/i2c/busses/i2c-icy.c index 0d61936dfc0b..8382eb64b424 100644 --- a/drivers/i2c/busses/i2c-icy.c +++ b/drivers/i2c/busses/i2c-icy.c @@ -53,6 +53,8 @@ struct icy_i2c { void __iomem *reg_s0; void __iomem *reg_s1; + struct fwnode_handle *ltc2990_fwnode; + struct i2c_client *ltc2990_client; }; /* @@ -94,11 +96,34 @@ static void icy_pcf_waitforpin(void *data) /* * Main i2c-icy part */ +static unsigned short const icy_ltc2990_addresses[] = { + 0x4c, 0x4d, 0x4e, 0x4f, I2C_CLIENT_END +}; + +/* + * Additional sensors exposed once this property is applied: + * + * in1 will be the voltage of the 5V rail, divided by 2. + * in2 will be the voltage of the 12V rail, divided by 4. + * temp3 will be measured using a PCB loop next the chip. + */ +static const u32 icy_ltc2990_meas_mode[] = {0, 3}; + +static const struct property_entry icy_ltc2990_props[] = { + PROPERTY_ENTRY_U32_ARRAY("lltc,meas-mode", icy_ltc2990_meas_mode), + { } +}; + static int icy_probe(struct zorro_dev *z, const struct zorro_device_id *ent) { struct icy_i2c *i2c; struct i2c_algo_pcf_data *algo_data; + struct fwnode_handle *new_fwnode; + struct i2c_board_info ltc2990_info = { + .type = "ltc2990", + .addr = 0x4c, + }; i2c = devm_kzalloc(&z->dev, sizeof(*i2c), GFP_KERNEL); if (!i2c) @@ -140,6 +165,35 @@ static int icy_probe(struct zorro_dev *z, dev_info(&z->dev, "ICY I2C controller at %pa, IRQ not implemented\n", &z->resource.start); + /* + * The 2019 a1k.org PCBs have an LTC2990 at 0x4c, so start + * it automatically once ltc2990 is modprobed. + * + * in0 is the voltage of the internal 5V power supply. + * temp1 is the temperature inside the chip. + * + * See property_entry above for in1, in2, temp3. + */ + new_fwnode = fwnode_create_software_node(icy_ltc2990_props, NULL); + if (IS_ERR(new_fwnode)) { + dev_info(&z->dev, "Failed to create fwnode for LTC2990, error: %ld\n", + PTR_ERR(new_fwnode)); + } else { + /* + * Store the fwnode so we can destroy it on .remove(). + * Only store it on success, as fwnode_remove_software_node() + * is NULL safe, but not PTR_ERR safe. + */ + i2c->ltc2990_fwnode = new_fwnode; + ltc2990_info.fwnode = new_fwnode; + + i2c->ltc2990_client = + i2c_new_probed_device(&i2c->adapter, + <c2990_info, + icy_ltc2990_addresses, + NULL); + } + return 0; } @@ -147,6 +201,9 @@ static void icy_remove(struct zorro_dev *z) { struct icy_i2c *i2c = dev_get_drvdata(&z->dev); + i2c_unregister_device(i2c->ltc2990_client); + fwnode_remove_software_node(i2c->ltc2990_fwnode); + i2c_del_adapter(&i2c->adapter); } From f0b576801d83cecab29888010017333babd61ede Mon Sep 17 00:00:00 2001 From: "Adamski, Krzysztof (Nokia - PL/Wroclaw)" Date: Mon, 19 Aug 2019 09:07:07 +0000 Subject: [PATCH 194/721] i2c: axxia: support slave mode This device contains both master and slave controllers which can be enabled simultaneously. Both controllers share the same SDA/SCL lines and interrupt source but has separate control and status registers. Controllers also works in loopback mode - slave device can communicate with its own master controller internally. The controller can handle up to two addresses, both of which may be 10 bit. Most of the logic (sending (N)ACK, handling repeated start or switching between write/read) is handled automatically which makes working with this controller quite easy. For simplicity, this patch adds basic support, limiting to only one slave address. Support for the 2nd device may be added in the future. Note that synchronize_irq() is used to ensure any running slave interrupt is finished to make sure slave i2c_client structure can be safely used by i2c_slave_event. Signed-off-by: Krzysztof Adamski Reviewed-by: Alexander Sverdlin Signed-off-by: Wolfram Sang --- drivers/i2c/busses/Kconfig | 1 + drivers/i2c/busses/i2c-axxia.c | 152 +++++++++++++++++++++++++++++++-- 2 files changed, 145 insertions(+), 8 deletions(-) diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index ee7f415a625a..d3023e5379ea 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -429,6 +429,7 @@ config I2C_AXXIA tristate "Axxia I2C controller" depends on ARCH_AXXIA || COMPILE_TEST default ARCH_AXXIA + select I2C_SLAVE help Say yes if you want to support the I2C bus on Axxia platforms. diff --git a/drivers/i2c/busses/i2c-axxia.c b/drivers/i2c/busses/i2c-axxia.c index ff3142b15cab..0214daa913ff 100644 --- a/drivers/i2c/busses/i2c-axxia.c +++ b/drivers/i2c/busses/i2c-axxia.c @@ -77,6 +77,40 @@ MST_STATUS_IP) #define MST_TX_BYTES_XFRD 0x50 #define MST_RX_BYTES_XFRD 0x54 +#define SLV_ADDR_DEC_CTL 0x58 +#define SLV_ADDR_DEC_GCE BIT(0) /* ACK to General Call Address from own master (loopback) */ +#define SLV_ADDR_DEC_OGCE BIT(1) /* ACK to General Call Address from external masters */ +#define SLV_ADDR_DEC_SA1E BIT(2) /* ACK to addr_1 enabled */ +#define SLV_ADDR_DEC_SA1M BIT(3) /* 10-bit addressing for addr_1 enabled */ +#define SLV_ADDR_DEC_SA2E BIT(4) /* ACK to addr_2 enabled */ +#define SLV_ADDR_DEC_SA2M BIT(5) /* 10-bit addressing for addr_2 enabled */ +#define SLV_ADDR_1 0x5c +#define SLV_ADDR_2 0x60 +#define SLV_RX_CTL 0x64 +#define SLV_RX_ACSA1 BIT(0) /* Generate ACK for writes to addr_1 */ +#define SLV_RX_ACSA2 BIT(1) /* Generate ACK for writes to addr_2 */ +#define SLV_RX_ACGCA BIT(2) /* ACK data phase transfers to General Call Address */ +#define SLV_DATA 0x68 +#define SLV_RX_FIFO 0x6c +#define SLV_FIFO_DV1 BIT(0) /* Data Valid for addr_1 */ +#define SLV_FIFO_DV2 BIT(1) /* Data Valid for addr_2 */ +#define SLV_FIFO_AS BIT(2) /* (N)ACK Sent */ +#define SLV_FIFO_TNAK BIT(3) /* Timeout NACK */ +#define SLV_FIFO_STRC BIT(4) /* First byte after start condition received */ +#define SLV_FIFO_RSC BIT(5) /* Repeated Start Condition */ +#define SLV_FIFO_STPC BIT(6) /* Stop Condition */ +#define SLV_FIFO_DV (SLV_FIFO_DV1 | SLV_FIFO_DV2) +#define SLV_INT_ENABLE 0x70 +#define SLV_INT_STATUS 0x74 +#define SLV_STATUS_RFH BIT(0) /* FIFO service */ +#define SLV_STATUS_WTC BIT(1) /* Write transfer complete */ +#define SLV_STATUS_SRS1 BIT(2) /* Slave read from addr 1 */ +#define SLV_STATUS_SRRS1 BIT(3) /* Repeated start from addr 1 */ +#define SLV_STATUS_SRND1 BIT(4) /* Read request not following start condition */ +#define SLV_STATUS_SRC1 BIT(5) /* Read canceled */ +#define SLV_STATUS_SRAT1 BIT(6) /* Slave Read timed out */ +#define SLV_STATUS_SRDRE1 BIT(7) /* Data written after timed out */ +#define SLV_READ_DUMMY 0x78 #define SCL_HIGH_PERIOD 0x80 #define SCL_LOW_PERIOD 0x84 #define SPIKE_FLTR_LEN 0x88 @@ -111,6 +145,8 @@ struct axxia_i2c_dev { struct clk *i2c_clk; u32 bus_clk_rate; bool last; + struct i2c_client *slave; + int irq; }; static void i2c_int_disable(struct axxia_i2c_dev *idev, u32 mask) @@ -276,13 +312,65 @@ static int axxia_i2c_fill_tx_fifo(struct axxia_i2c_dev *idev) return ret; } +static void axxia_i2c_slv_fifo_event(struct axxia_i2c_dev *idev) +{ + u32 fifo_status = readl(idev->base + SLV_RX_FIFO); + u8 val; + + dev_dbg(idev->dev, "slave irq fifo_status=0x%x\n", fifo_status); + + if (fifo_status & SLV_FIFO_DV1) { + if (fifo_status & SLV_FIFO_STRC) + i2c_slave_event(idev->slave, + I2C_SLAVE_WRITE_REQUESTED, &val); + + val = readl(idev->base + SLV_DATA); + i2c_slave_event(idev->slave, I2C_SLAVE_WRITE_RECEIVED, &val); + } + if (fifo_status & SLV_FIFO_STPC) { + readl(idev->base + SLV_DATA); /* dummy read */ + i2c_slave_event(idev->slave, I2C_SLAVE_STOP, &val); + } + if (fifo_status & SLV_FIFO_RSC) + readl(idev->base + SLV_DATA); /* dummy read */ +} + +static irqreturn_t axxia_i2c_slv_isr(struct axxia_i2c_dev *idev) +{ + u32 status = readl(idev->base + SLV_INT_STATUS); + u8 val; + + dev_dbg(idev->dev, "slave irq status=0x%x\n", status); + + if (status & SLV_STATUS_RFH) + axxia_i2c_slv_fifo_event(idev); + if (status & SLV_STATUS_SRS1) { + i2c_slave_event(idev->slave, I2C_SLAVE_READ_REQUESTED, &val); + writel(val, idev->base + SLV_DATA); + } + if (status & SLV_STATUS_SRND1) { + i2c_slave_event(idev->slave, I2C_SLAVE_READ_PROCESSED, &val); + writel(val, idev->base + SLV_DATA); + } + if (status & SLV_STATUS_SRC1) + i2c_slave_event(idev->slave, I2C_SLAVE_STOP, &val); + + writel(INT_SLV, idev->base + INTERRUPT_STATUS); + return IRQ_HANDLED; +} + static irqreturn_t axxia_i2c_isr(int irq, void *_dev) { struct axxia_i2c_dev *idev = _dev; + irqreturn_t ret = IRQ_NONE; u32 status; - if (!(readl(idev->base + INTERRUPT_STATUS) & INT_MST)) - return IRQ_NONE; + status = readl(idev->base + INTERRUPT_STATUS); + + if (status & INT_SLV) + ret = axxia_i2c_slv_isr(idev); + if (!(status & INT_MST)) + return ret; /* Read interrupt status bits */ status = readl(idev->base + MST_INT_STATUS); @@ -583,9 +671,58 @@ static u32 axxia_i2c_func(struct i2c_adapter *adap) return caps; } +static int axxia_i2c_reg_slave(struct i2c_client *slave) +{ + struct axxia_i2c_dev *idev = i2c_get_adapdata(slave->adapter); + u32 slv_int_mask = SLV_STATUS_RFH; + u32 dec_ctl; + + if (idev->slave) + return -EBUSY; + + idev->slave = slave; + + /* Enable slave mode as well */ + writel(GLOBAL_MST_EN | GLOBAL_SLV_EN, idev->base + GLOBAL_CONTROL); + writel(INT_MST | INT_SLV, idev->base + INTERRUPT_ENABLE); + + /* Set slave address */ + dec_ctl = SLV_ADDR_DEC_SA1E; + if (slave->flags & I2C_CLIENT_TEN) + dec_ctl |= SLV_ADDR_DEC_SA1M; + + writel(SLV_RX_ACSA1, idev->base + SLV_RX_CTL); + writel(dec_ctl, idev->base + SLV_ADDR_DEC_CTL); + writel(slave->addr, idev->base + SLV_ADDR_1); + + /* Enable interrupts */ + slv_int_mask |= SLV_STATUS_SRS1 | SLV_STATUS_SRRS1 | SLV_STATUS_SRND1; + slv_int_mask |= SLV_STATUS_SRC1; + writel(slv_int_mask, idev->base + SLV_INT_ENABLE); + + return 0; +} + +static int axxia_i2c_unreg_slave(struct i2c_client *slave) +{ + struct axxia_i2c_dev *idev = i2c_get_adapdata(slave->adapter); + + /* Disable slave mode */ + writel(GLOBAL_MST_EN, idev->base + GLOBAL_CONTROL); + writel(INT_MST, idev->base + INTERRUPT_ENABLE); + + synchronize_irq(idev->irq); + + idev->slave = NULL; + + return 0; +} + static const struct i2c_algorithm axxia_i2c_algo = { .master_xfer = axxia_i2c_xfer, .functionality = axxia_i2c_func, + .reg_slave = axxia_i2c_reg_slave, + .unreg_slave = axxia_i2c_unreg_slave, }; static const struct i2c_adapter_quirks axxia_i2c_quirks = { @@ -599,7 +736,6 @@ static int axxia_i2c_probe(struct platform_device *pdev) struct axxia_i2c_dev *idev = NULL; struct resource *res; void __iomem *base; - int irq; int ret = 0; idev = devm_kzalloc(&pdev->dev, sizeof(*idev), GFP_KERNEL); @@ -611,10 +747,10 @@ static int axxia_i2c_probe(struct platform_device *pdev) if (IS_ERR(base)) return PTR_ERR(base); - irq = platform_get_irq(pdev, 0); - if (irq < 0) { + idev->irq = platform_get_irq(pdev, 0); + if (idev->irq < 0) { dev_err(&pdev->dev, "missing interrupt resource\n"); - return irq; + return idev->irq; } idev->i2c_clk = devm_clk_get(&pdev->dev, "i2c"); @@ -643,10 +779,10 @@ static int axxia_i2c_probe(struct platform_device *pdev) goto error_disable_clk; } - ret = devm_request_irq(&pdev->dev, irq, axxia_i2c_isr, 0, + ret = devm_request_irq(&pdev->dev, idev->irq, axxia_i2c_isr, 0, pdev->name, idev); if (ret) { - dev_err(&pdev->dev, "failed to claim IRQ%d\n", irq); + dev_err(&pdev->dev, "failed to claim IRQ%d\n", idev->irq); goto error_disable_clk; } From 21aa3983d619055cc3ecd0f0ceea4981dc8ee751 Mon Sep 17 00:00:00 2001 From: Felipe Balbi Date: Thu, 15 Aug 2019 17:29:43 +0300 Subject: [PATCH 195/721] i2c: designware-pci: Switch over to MSI interrupts Some devices support MSI interrupts. Let's at least try to use them in platforms that provide MSI capability. Signed-off-by: Felipe Balbi Signed-off-by: Jarkko Nikula Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-designware-pcidrv.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/drivers/i2c/busses/i2c-designware-pcidrv.c b/drivers/i2c/busses/i2c-designware-pcidrv.c index 7d2e6959679c..249ee3ee2a09 100644 --- a/drivers/i2c/busses/i2c-designware-pcidrv.c +++ b/drivers/i2c/busses/i2c-designware-pcidrv.c @@ -225,6 +225,8 @@ static int i2c_dw_pci_probe(struct pci_dev *pdev, return r; } + pci_set_master(pdev); + r = pcim_iomap_regions(pdev, 1 << 0, pci_name(pdev)); if (r) { dev_err(&pdev->dev, "I/O memory remapping failed\n"); @@ -235,18 +237,24 @@ static int i2c_dw_pci_probe(struct pci_dev *pdev, if (!dev) return -ENOMEM; + r = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES); + if (r < 0) + return r; + dev->clk = NULL; dev->controller = controller; dev->get_clk_rate_khz = i2c_dw_get_clk_rate_khz; dev->base = pcim_iomap_table(pdev)[0]; dev->dev = &pdev->dev; - dev->irq = pdev->irq; + dev->irq = pci_irq_vector(pdev, 0); dev->flags |= controller->flags; if (controller->setup) { r = controller->setup(pdev, controller); - if (r) + if (r) { + pci_free_irq_vectors(pdev); return r; + } } dev->functionality = controller->functionality | @@ -274,8 +282,10 @@ static int i2c_dw_pci_probe(struct pci_dev *pdev, adap->nr = controller->bus_num; r = i2c_dw_probe(dev); - if (r) + if (r) { + pci_free_irq_vectors(pdev); return r; + } pm_runtime_set_autosuspend_delay(&pdev->dev, 1000); pm_runtime_use_autosuspend(&pdev->dev); @@ -294,6 +304,7 @@ static void i2c_dw_pci_remove(struct pci_dev *pdev) pm_runtime_get_noresume(&pdev->dev); i2c_del_adapter(&dev->adapter); + pci_free_irq_vectors(pdev); } /* work with hotplug and coldplug */ From 70fb95e213147a88417eaae5b93ba6e59be087c0 Mon Sep 17 00:00:00 2001 From: Jarkko Nikula Date: Thu, 15 Aug 2019 17:29:44 +0300 Subject: [PATCH 196/721] i2c: designware-pci: Add support for Elkhart Lake PSE I2C Add support for Intel(R) Programmable Services Engine (Intel(R) PSE) I2C controller in Intel Elkhart Lake when interface is assigned to the host processor. Signed-off-by: Jarkko Nikula Reviewed-by: Andy Shevchenko Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-designware-pcidrv.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/i2c/busses/i2c-designware-pcidrv.c b/drivers/i2c/busses/i2c-designware-pcidrv.c index 249ee3ee2a09..050adda7c1bd 100644 --- a/drivers/i2c/busses/i2c-designware-pcidrv.c +++ b/drivers/i2c/busses/i2c-designware-pcidrv.c @@ -33,6 +33,7 @@ enum dw_pci_ctl_id_t { baytrail, cherrytrail, haswell, + elkhartlake, }; struct dw_scl_sda_cfg { @@ -168,6 +169,14 @@ static struct dw_pci_controller dw_pci_controllers[] = { .flags = MODEL_CHERRYTRAIL, .scl_sda_cfg = &byt_config, }, + [elkhartlake] = { + .bus_num = -1, + .bus_cfg = INTEL_MID_STD_CFG | DW_IC_CON_SPEED_FAST, + .tx_fifo_depth = 32, + .rx_fifo_depth = 32, + .functionality = I2C_FUNC_10BIT_ADDR, + .clk_khz = 100000, + }, }; #ifdef CONFIG_PM @@ -340,6 +349,15 @@ static const struct pci_device_id i2_designware_pci_ids[] = { { PCI_VDEVICE(INTEL, 0x22C5), cherrytrail }, { PCI_VDEVICE(INTEL, 0x22C6), cherrytrail }, { PCI_VDEVICE(INTEL, 0x22C7), cherrytrail }, + /* Elkhart Lake (PSE I2C) */ + { PCI_VDEVICE(INTEL, 0x4bb9), elkhartlake }, + { PCI_VDEVICE(INTEL, 0x4bba), elkhartlake }, + { PCI_VDEVICE(INTEL, 0x4bbb), elkhartlake }, + { PCI_VDEVICE(INTEL, 0x4bbc), elkhartlake }, + { PCI_VDEVICE(INTEL, 0x4bbd), elkhartlake }, + { PCI_VDEVICE(INTEL, 0x4bbe), elkhartlake }, + { PCI_VDEVICE(INTEL, 0x4bbf), elkhartlake }, + { PCI_VDEVICE(INTEL, 0x4bc0), elkhartlake }, { 0,} }; MODULE_DEVICE_TABLE(pci, i2_designware_pci_ids); From f9bf7a899412b5f8c96b084343bde4168dc12efa Mon Sep 17 00:00:00 2001 From: Nishka Dasgupta Date: Mon, 19 Aug 2019 13:16:01 +0530 Subject: [PATCH 197/721] i2c: taos-evm: Make structure tsl2550_info constant Static structure tsl2550_info, of type i2c_board_info, is referenced only twice: the first time in arguments to dev_info() (which does not modify it) and the second time as the last argument to function i2c_new_device() (where the corresponding parameter is declared as const). As tsl2550_info is therefore never modified, make it const to protect it from unintended modifications. Issue found with Coccinelle. Signed-off-by: Nishka Dasgupta Reviewed-by: Jean Delvare Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-taos-evm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-taos-evm.c b/drivers/i2c/busses/i2c-taos-evm.c index c82e78f57386..056df6b2538a 100644 --- a/drivers/i2c/busses/i2c-taos-evm.c +++ b/drivers/i2c/busses/i2c-taos-evm.c @@ -39,7 +39,7 @@ struct taos_data { }; /* TAOS TSL2550 EVM */ -static struct i2c_board_info tsl2550_info = { +static const struct i2c_board_info tsl2550_info = { I2C_BOARD_INFO("tsl2550", 0x39), }; From 71dc297ca9ab6311e7d39fc53811fd871d51854f Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 19 Aug 2019 13:24:23 +0300 Subject: [PATCH 198/721] i2c: designware: assert reset when error happen at ->probe() The commit c62ebb3d5f0d ("i2c: designware: Add support for an interface clock") introduced an optional clock while missed correct error handling. assert reset line back if error happen at ->probe(). Fixes: c62ebb3d5f0d ("i2c: designware: Add support for an interface clock") Signed-off-by: Andy Shevchenko Acked-by: Jarkko Nikula Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-designware-platdrv.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/i2c/busses/i2c-designware-platdrv.c b/drivers/i2c/busses/i2c-designware-platdrv.c index ddfb81872906..4624ef8fbae8 100644 --- a/drivers/i2c/busses/i2c-designware-platdrv.c +++ b/drivers/i2c/busses/i2c-designware-platdrv.c @@ -346,8 +346,10 @@ static int dw_i2c_plat_probe(struct platform_device *pdev) /* Optional interface clock */ dev->pclk = devm_clk_get_optional(&pdev->dev, "pclk"); - if (IS_ERR(dev->pclk)) - return PTR_ERR(dev->pclk); + if (IS_ERR(dev->pclk)) { + ret = PTR_ERR(dev->pclk); + goto exit_reset; + } dev->clk = devm_clk_get(&pdev->dev, NULL); if (!i2c_dw_prepare_clk(dev, true)) { From a6af48ec0712a0c98d8abe6b47c655b26026fceb Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 19 Aug 2019 13:31:30 +0300 Subject: [PATCH 199/721] i2c: designware: Fix optional reset error handling The commit bb475230b8e5 ("reset: make optional functions really optional") brought a missed part of the support for an optional reset handlers. Since that we don't need to have special error handling in the driver. Signed-off-by: Andy Shevchenko Acked-by: Jarkko Nikula Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-designware-platdrv.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/drivers/i2c/busses/i2c-designware-platdrv.c b/drivers/i2c/busses/i2c-designware-platdrv.c index 4624ef8fbae8..16dd338877d0 100644 --- a/drivers/i2c/busses/i2c-designware-platdrv.c +++ b/drivers/i2c/busses/i2c-designware-platdrv.c @@ -279,12 +279,10 @@ static int dw_i2c_plat_probe(struct platform_device *pdev) platform_set_drvdata(pdev, dev); dev->rst = devm_reset_control_get_optional_exclusive(&pdev->dev, NULL); - if (IS_ERR(dev->rst)) { - if (PTR_ERR(dev->rst) == -EPROBE_DEFER) - return -EPROBE_DEFER; - } else { - reset_control_deassert(dev->rst); - } + if (IS_ERR(dev->rst)) + return PTR_ERR(dev->rst); + + reset_control_deassert(dev->rst); t = &dev->timings; if (pdata) @@ -402,8 +400,7 @@ static int dw_i2c_plat_probe(struct platform_device *pdev) exit_probe: dw_i2c_plat_pm_cleanup(dev); exit_reset: - if (!IS_ERR_OR_NULL(dev->rst)) - reset_control_assert(dev->rst); + reset_control_assert(dev->rst); return ret; } @@ -421,8 +418,7 @@ static int dw_i2c_plat_remove(struct platform_device *pdev) pm_runtime_put_sync(&pdev->dev); dw_i2c_plat_pm_cleanup(dev); - if (!IS_ERR_OR_NULL(dev->rst)) - reset_control_assert(dev->rst); + reset_control_assert(dev->rst); return 0; } From ba919403566dba52bf074851896ba0ca7f72c1e2 Mon Sep 17 00:00:00 2001 From: Federico Vaga Date: Thu, 22 Aug 2019 15:21:32 +0200 Subject: [PATCH 200/721] i2c: ocores: use request_any_context_irq() to register IRQ handler The i2c-ocores device is an HDL component that get instantiated in FPGA. The software stack used to drive an FPGA can be very different, and the i2c-ocore ip-core must work in different context. With respect to this patch the IRQ controller behind this device, and its driver, can have different implementations (nested threads). For this reason, it is safer to use `request_any_context_irq()` to avoid errors at probe time. Signed-off-by: Federico Vaga Reviewed-by: Andrew Lunn Reviewed-by: Peter Korsgaard Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-ocores.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/i2c/busses/i2c-ocores.c b/drivers/i2c/busses/i2c-ocores.c index 4117f1abc7c6..ca8b3ecfa93d 100644 --- a/drivers/i2c/busses/i2c-ocores.c +++ b/drivers/i2c/busses/i2c-ocores.c @@ -703,8 +703,9 @@ static int ocores_i2c_probe(struct platform_device *pdev) } if (ocores_algorithm.master_xfer != ocores_xfer_polling) { - ret = devm_request_irq(&pdev->dev, irq, ocores_isr, 0, - pdev->name, i2c); + ret = devm_request_any_context_irq(&pdev->dev, irq, + ocores_isr, 0, + pdev->name, i2c); if (ret) { dev_err(&pdev->dev, "Cannot claim IRQ\n"); goto err_clk; From 528d53a1592b0e27c423f7cafc1df85f77fc1163 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Fri, 2 Aug 2019 14:54:38 +0200 Subject: [PATCH 201/721] i2c: piix4: Fix probing of reserved ports on AMD Family 16h Model 30h Prevent bus timeouts and resets on Family 16h Model 30h by not probing reserved Ports 3 and 4. According to the AMD BIOS and Kernel Developer's Guides (BKDG), Port 3 and Port 4 are reserved on the following devices: - Family 15h Model 60h-6Fh - Family 15h Model 70h-7Fh - Family 16h Model 30h-3Fh Based on earlier work by Andrew Cooks. Reported-by: Andrew Cooks Signed-off-by: Jean Delvare Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-piix4.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/drivers/i2c/busses/i2c-piix4.c b/drivers/i2c/busses/i2c-piix4.c index c46c4bddc7ca..d9f7e771f6ad 100644 --- a/drivers/i2c/busses/i2c-piix4.c +++ b/drivers/i2c/busses/i2c-piix4.c @@ -72,7 +72,8 @@ #define PIIX4_BLOCK_DATA 0x14 /* Multi-port constants */ -#define PIIX4_MAX_ADAPTERS 4 +#define PIIX4_MAX_ADAPTERS 4 +#define HUDSON2_MAIN_PORTS 2 /* HUDSON2, KERNCZ reserves ports 3, 4 */ /* SB800 constants */ #define SB800_PIIX4_SMB_IDX 0xcd6 @@ -808,6 +809,7 @@ MODULE_DEVICE_TABLE (pci, piix4_ids); static struct i2c_adapter *piix4_main_adapters[PIIX4_MAX_ADAPTERS]; static struct i2c_adapter *piix4_aux_adapter; +static int piix4_adapter_count; static int piix4_add_adapter(struct pci_dev *dev, unsigned short smba, bool sb800_main, u8 port, bool notify_imc, @@ -867,7 +869,15 @@ static int piix4_add_adapters_sb800(struct pci_dev *dev, unsigned short smba, int port; int retval; - for (port = 0; port < PIIX4_MAX_ADAPTERS; port++) { + if (dev->device == PCI_DEVICE_ID_AMD_KERNCZ_SMBUS || + (dev->device == PCI_DEVICE_ID_AMD_HUDSON2_SMBUS && + dev->revision >= 0x1F)) { + piix4_adapter_count = HUDSON2_MAIN_PORTS; + } else { + piix4_adapter_count = PIIX4_MAX_ADAPTERS; + } + + for (port = 0; port < piix4_adapter_count; port++) { retval = piix4_add_adapter(dev, smba, true, port, notify_imc, piix4_main_port_names_sb800[port], &piix4_main_adapters[port]); @@ -989,7 +999,7 @@ static void piix4_adap_remove(struct i2c_adapter *adap) static void piix4_remove(struct pci_dev *dev) { - int port = PIIX4_MAX_ADAPTERS; + int port = piix4_adapter_count; while (--port >= 0) { if (piix4_main_adapters[port]) { From 0183eb8bb59d45f26ec4fc73aaa416067fe6c0be Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Fri, 2 Aug 2019 14:55:26 +0200 Subject: [PATCH 202/721] i2c: piix4: Add ACPI support Enable the i2c-piix4 SMBus controller driver to enumerate I2C slave devices using ACPI. It builds on the related I2C mux device work in commit 8eb5c87a92c0 ("i2c: add ACPI support for I2C mux ports") In the i2c-piix4 driver the adapters are enumerated as: Main SMBus adapter Port 0, Port 2, ..., aux port (i.e., ASF adapter) However, in the AMD BKDG documentation[1], the implied order of ports is: Main SMBus adapter Port 0, ASF adapter, Port 2, Port 3, ... This ordering difference is unfortunate. We assume that ACPI developers will use the AMD documentation ordering, so we have to pass an extra parameter to piix4_add_adapter(). [1] 52740 BIOS and Kernel Developer's Guide (BKDG) for AMD Family 16h Models 30h-3Fh Processors Based on earlier work by Andrew Cooks. Reported-by: Andrew Cooks Signed-off-by: Jean Delvare Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-piix4.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/i2c/busses/i2c-piix4.c b/drivers/i2c/busses/i2c-piix4.c index d9f7e771f6ad..890b8d029b28 100644 --- a/drivers/i2c/busses/i2c-piix4.c +++ b/drivers/i2c/busses/i2c-piix4.c @@ -813,7 +813,8 @@ static int piix4_adapter_count; static int piix4_add_adapter(struct pci_dev *dev, unsigned short smba, bool sb800_main, u8 port, bool notify_imc, - const char *name, struct i2c_adapter **padap) + u8 hw_port_nr, const char *name, + struct i2c_adapter **padap) { struct i2c_adapter *adap; struct i2c_piix4_adapdata *adapdata; @@ -845,6 +846,12 @@ static int piix4_add_adapter(struct pci_dev *dev, unsigned short smba, /* set up the sysfs linkage to our parent device */ adap->dev.parent = &dev->dev; + if (has_acpi_companion(&dev->dev)) { + acpi_preset_companion(&adap->dev, + ACPI_COMPANION(&dev->dev), + hw_port_nr); + } + snprintf(adap->name, sizeof(adap->name), "SMBus PIIX4 adapter%s at %04x", name, smba); @@ -878,7 +885,10 @@ static int piix4_add_adapters_sb800(struct pci_dev *dev, unsigned short smba, } for (port = 0; port < piix4_adapter_count; port++) { + u8 hw_port_nr = port == 0 ? 0 : port + 1; + retval = piix4_add_adapter(dev, smba, true, port, notify_imc, + hw_port_nr, piix4_main_port_names_sb800[port], &piix4_main_adapters[port]); if (retval < 0) @@ -949,8 +959,8 @@ static int piix4_probe(struct pci_dev *dev, const struct pci_device_id *id) return retval; /* Try to register main SMBus adapter, give up if we can't */ - retval = piix4_add_adapter(dev, retval, false, 0, false, "", - &piix4_main_adapters[0]); + retval = piix4_add_adapter(dev, retval, false, 0, false, 0, + "", &piix4_main_adapters[0]); if (retval < 0) return retval; } @@ -976,7 +986,7 @@ static int piix4_probe(struct pci_dev *dev, const struct pci_device_id *id) if (retval > 0) { /* Try to add the aux adapter if it exists, * piix4_add_adapter will clean up if this fails */ - piix4_add_adapter(dev, retval, false, 0, false, + piix4_add_adapter(dev, retval, false, 0, false, 1, is_sb800 ? piix4_aux_port_name_sb800 : "", &piix4_aux_adapter); } From d3b9f659fac6e5f74f857364c5ed08b94e7f94ae Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Aug 2019 16:03:47 +0200 Subject: [PATCH 203/721] microblaze/nommu: use the generic uncached segment support Stop providing our own arch alloc/free hooks for nommu platforms and just expose the segment offset and use the generic dma-direct allocator. Signed-off-by: Christoph Hellwig Signed-off-by: Michal Simek --- arch/microblaze/Kconfig | 2 + arch/microblaze/mm/consistent.c | 93 +++++++++++++++------------------ 2 files changed, 43 insertions(+), 52 deletions(-) diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index d411de05b628..a0d749c309f3 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -5,9 +5,11 @@ config MICROBLAZE select ARCH_NO_SWAP select ARCH_HAS_BINFMT_FLAT if !MMU select ARCH_HAS_DMA_COHERENT_TO_PFN if MMU + select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_SYNC_DMA_FOR_CPU select ARCH_HAS_SYNC_DMA_FOR_DEVICE + select ARCH_HAS_UNCACHED_SEGMENT if !MMU select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_NO_COHERENT_DMA_MMAP if !MMU select ARCH_WANT_IPC_PARSE_VERSION diff --git a/arch/microblaze/mm/consistent.c b/arch/microblaze/mm/consistent.c index bc7042209c57..1a859e8b58c2 100644 --- a/arch/microblaze/mm/consistent.c +++ b/arch/microblaze/mm/consistent.c @@ -42,21 +42,48 @@ #include #include -#ifndef CONFIG_MMU -/* I have to use dcache values because I can't relate on ram size */ -# define UNCACHED_SHADOW_MASK (cpuinfo.dcache_high - cpuinfo.dcache_base + 1) -#endif +void arch_dma_prep_coherent(struct page *page, size_t size) +{ + phys_addr_t paddr = page_to_phys(page); + flush_dcache_range(paddr, paddr + size); +} + +#ifndef CONFIG_MMU /* - * Consistent memory allocators. Used for DMA devices that want to - * share uncached memory with the processor core. - * My crufty no-MMU approach is simple. In the HW platform we can optionally - * mirror the DDR up above the processor cacheable region. So, memory accessed - * in this mirror region will not be cached. It's alloced from the same - * pool as normal memory, but the handle we return is shifted up into the - * uncached region. This will no doubt cause big problems if memory allocated - * here is not also freed properly. -- JW + * Consistent memory allocators. Used for DMA devices that want to share + * uncached memory with the processor core. My crufty no-MMU approach is + * simple. In the HW platform we can optionally mirror the DDR up above the + * processor cacheable region. So, memory accessed in this mirror region will + * not be cached. It's alloced from the same pool as normal memory, but the + * handle we return is shifted up into the uncached region. This will no doubt + * cause big problems if memory allocated here is not also freed properly. -- JW + * + * I have to use dcache values because I can't relate on ram size: */ +#ifdef CONFIG_XILINX_UNCACHED_SHADOW +#define UNCACHED_SHADOW_MASK (cpuinfo.dcache_high - cpuinfo.dcache_base + 1) +#else +#define UNCACHED_SHADOW_MASK 0 +#endif /* CONFIG_XILINX_UNCACHED_SHADOW */ + +void *uncached_kernel_address(void *ptr) +{ + unsigned long addr = (unsigned long)ptr; + + addr |= UNCACHED_SHADOW_MASK; + if (addr > cpuinfo.dcache_base && addr < cpuinfo.dcache_high) + pr_warn("ERROR: Your cache coherent area is CACHED!!!\n"); + return (void *)addr; +} + +void *cached_kernel_address(void *ptr) +{ + unsigned long addr = (unsigned long)ptr; + + return (void *)(addr & ~UNCACHED_SHADOW_MASK); +} +#else /* CONFIG_MMU */ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) { @@ -64,12 +91,9 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, void *ret; unsigned int i, err = 0; struct page *page, *end; - -#ifdef CONFIG_MMU phys_addr_t pa; struct vm_struct *area; unsigned long va; -#endif if (in_interrupt()) BUG(); @@ -86,26 +110,8 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, * we need to ensure that there are no cachelines in use, * or worse dirty in this area. */ - flush_dcache_range(virt_to_phys((void *)vaddr), - virt_to_phys((void *)vaddr) + size); + arch_dma_prep_coherent(virt_to_page((unsigned long)vaddr), size); -#ifndef CONFIG_MMU - ret = (void *)vaddr; - /* - * Here's the magic! Note if the uncached shadow is not implemented, - * it's up to the calling code to also test that condition and make - * other arranegments, such as manually flushing the cache and so on. - */ -# ifdef CONFIG_XILINX_UNCACHED_SHADOW - ret = (void *)((unsigned) ret | UNCACHED_SHADOW_MASK); -# endif - if ((unsigned int)ret > cpuinfo.dcache_base && - (unsigned int)ret < cpuinfo.dcache_high) - pr_warn("ERROR: Your cache coherent area is CACHED!!!\n"); - - /* dma_handle is same as physical (shadowed) address */ - *dma_handle = (dma_addr_t)ret; -#else /* Allocate some common virtual space to map the new pages. */ area = get_vm_area(size, VM_ALLOC); if (!area) { @@ -117,7 +123,6 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, /* This gives us the real physical address of the first page. */ *dma_handle = pa = __virt_to_phys(vaddr); -#endif /* * free wasted pages. We skip the first page since we know @@ -131,10 +136,8 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, split_page(page, order); for (i = 0; i < size && err == 0; i += PAGE_SIZE) { -#ifdef CONFIG_MMU /* MS: This is the whole magic - use cache inhibit pages */ err = map_page(va + i, pa + i, _PAGE_KERNEL | _PAGE_NO_CACHE); -#endif SetPageReserved(page); page++; @@ -154,7 +157,6 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, return ret; } -#ifdef CONFIG_MMU static pte_t *consistent_virt_to_pte(void *vaddr) { unsigned long addr = (unsigned long)vaddr; @@ -172,7 +174,6 @@ long arch_dma_coherent_to_pfn(struct device *dev, void *vaddr, return pte_pfn(*ptep); } -#endif /* * free page(s) as defined by the above mapping. @@ -187,18 +188,6 @@ void arch_dma_free(struct device *dev, size_t size, void *vaddr, size = PAGE_ALIGN(size); -#ifndef CONFIG_MMU - /* Clear SHADOW_MASK bit in address, and free as per usual */ -# ifdef CONFIG_XILINX_UNCACHED_SHADOW - vaddr = (void *)((unsigned)vaddr & ~UNCACHED_SHADOW_MASK); -# endif - page = virt_to_page(vaddr); - - do { - __free_reserved_page(page); - page++; - } while (size -= PAGE_SIZE); -#else do { pte_t *ptep = consistent_virt_to_pte(vaddr); unsigned long pfn; @@ -216,5 +205,5 @@ void arch_dma_free(struct device *dev, size_t size, void *vaddr, /* flush tlb */ flush_tlb_all(); -#endif } +#endif /* CONFIG_MMU */ From 04e3543e228fcbb000de922ec5d366f398bdd9ae Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Aug 2019 16:03:48 +0200 Subject: [PATCH 204/721] microblaze: use the generic dma coherent remap allocator This switches to using common code for the DMA allocations, including potential use of the CMA allocator if configured. Switching to the generic code enables DMA allocations from atomic context, which is required by the DMA API documentation, and also adds various other minor features drivers start relying upon. It also makes sure we have on tested code base for all architectures that require uncached pte bits for coherent DMA allocations. Signed-off-by: Christoph Hellwig Signed-off-by: Michal Simek --- arch/microblaze/Kconfig | 1 + arch/microblaze/mm/consistent.c | 152 +------------------------------- 2 files changed, 5 insertions(+), 148 deletions(-) diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index a0d749c309f3..e477896fbae6 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -17,6 +17,7 @@ config MICROBLAZE select TIMER_OF select CLONE_BACKWARDS3 select COMMON_CLK + select DMA_DIRECT_REMAP if MMU select GENERIC_ATOMIC64 select GENERIC_CLOCKEVENTS select GENERIC_CPU_DEVICES diff --git a/arch/microblaze/mm/consistent.c b/arch/microblaze/mm/consistent.c index 1a859e8b58c2..0e0f733eb846 100644 --- a/arch/microblaze/mm/consistent.c +++ b/arch/microblaze/mm/consistent.c @@ -4,43 +4,16 @@ * Copyright (C) 2010 Michal Simek * Copyright (C) 2010 PetaLogix * Copyright (C) 2005 John Williams - * - * Based on PowerPC version derived from arch/arm/mm/consistent.c - * Copyright (C) 2001 Dan Malek (dmalek@jlc.net) - * Copyright (C) 2000 Russell King */ -#include -#include -#include #include -#include #include #include -#include -#include #include -#include -#include -#include #include -#include -#include -#include -#include -#include -#include #include - -#include -#include -#include -#include -#include -#include -#include #include -#include +#include void arch_dma_prep_coherent(struct page *page, size_t size) { @@ -84,126 +57,9 @@ void *cached_kernel_address(void *ptr) return (void *)(addr & ~UNCACHED_SHADOW_MASK); } #else /* CONFIG_MMU */ -void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t gfp, unsigned long attrs) +static int __init atomic_pool_init(void) { - unsigned long order, vaddr; - void *ret; - unsigned int i, err = 0; - struct page *page, *end; - phys_addr_t pa; - struct vm_struct *area; - unsigned long va; - - if (in_interrupt()) - BUG(); - - /* Only allocate page size areas. */ - size = PAGE_ALIGN(size); - order = get_order(size); - - vaddr = __get_free_pages(gfp | __GFP_ZERO, order); - if (!vaddr) - return NULL; - - /* - * we need to ensure that there are no cachelines in use, - * or worse dirty in this area. - */ - arch_dma_prep_coherent(virt_to_page((unsigned long)vaddr), size); - - /* Allocate some common virtual space to map the new pages. */ - area = get_vm_area(size, VM_ALLOC); - if (!area) { - free_pages(vaddr, order); - return NULL; - } - va = (unsigned long) area->addr; - ret = (void *)va; - - /* This gives us the real physical address of the first page. */ - *dma_handle = pa = __virt_to_phys(vaddr); - - /* - * free wasted pages. We skip the first page since we know - * that it will have count = 1 and won't require freeing. - * We also mark the pages in use as reserved so that - * remap_page_range works. - */ - page = virt_to_page(vaddr); - end = page + (1 << order); - - split_page(page, order); - - for (i = 0; i < size && err == 0; i += PAGE_SIZE) { - /* MS: This is the whole magic - use cache inhibit pages */ - err = map_page(va + i, pa + i, _PAGE_KERNEL | _PAGE_NO_CACHE); - - SetPageReserved(page); - page++; - } - - /* Free the otherwise unused pages. */ - while (page < end) { - __free_page(page); - page++; - } - - if (err) { - free_pages(vaddr, order); - return NULL; - } - - return ret; -} - -static pte_t *consistent_virt_to_pte(void *vaddr) -{ - unsigned long addr = (unsigned long)vaddr; - - return pte_offset_kernel(pmd_offset(pgd_offset_k(addr), addr), addr); -} - -long arch_dma_coherent_to_pfn(struct device *dev, void *vaddr, - dma_addr_t dma_addr) -{ - pte_t *ptep = consistent_virt_to_pte(vaddr); - - if (pte_none(*ptep) || !pte_present(*ptep)) - return 0; - - return pte_pfn(*ptep); -} - -/* - * free page(s) as defined by the above mapping. - */ -void arch_dma_free(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_addr, unsigned long attrs) -{ - struct page *page; - - if (in_interrupt()) - BUG(); - - size = PAGE_ALIGN(size); - - do { - pte_t *ptep = consistent_virt_to_pte(vaddr); - unsigned long pfn; - - if (!pte_none(*ptep) && pte_present(*ptep)) { - pfn = pte_pfn(*ptep); - pte_clear(&init_mm, (unsigned int)vaddr, ptep); - if (pfn_valid(pfn)) { - page = pfn_to_page(pfn); - __free_reserved_page(page); - } - } - vaddr += PAGE_SIZE; - } while (size -= PAGE_SIZE); - - /* flush tlb */ - flush_tlb_all(); + return dma_atomic_pool_init(GFP_KERNEL, pgprot_noncached(PAGE_KERNEL)); } +postcore_initcall(atomic_pool_init); #endif /* CONFIG_MMU */ From 539005ffc6260fcb3bb4171138be5f66a41185a9 Mon Sep 17 00:00:00 2001 From: Lori Hikichi Date: Thu, 8 Aug 2019 09:07:53 +0530 Subject: [PATCH 205/721] i2c: iproc: Add full name of devicetree node to adapter name Add the full name of the devicetree node to the adapter name. Without this change, all adapters have the same name making it difficult to distinguish between multiple instances. The most obvious way to see this is to use the utility i2c_detect. e.g. "i2c-detect -l" Before i2c-1 i2c Broadcom iProc I2C adapter I2C adapter i2c-0 i2c Broadcom iProc I2C adapter I2C adapter After i2c-1 i2c Broadcom iProc (i2c@e0000) I2C adapter i2c-0 i2c Broadcom iProc (i2c@b0000) I2C adapter Now it is easy to figure out which adapter maps to a which DT node. Signed-off-by: Lori Hikichi Signed-off-by: Rayagonda Kokatanur Reviewed-by: Ray Jui Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-bcm-iproc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-bcm-iproc.c b/drivers/i2c/busses/i2c-bcm-iproc.c index 2c7f145a036e..ee5578173fb1 100644 --- a/drivers/i2c/busses/i2c-bcm-iproc.c +++ b/drivers/i2c/busses/i2c-bcm-iproc.c @@ -917,7 +917,9 @@ static int bcm_iproc_i2c_probe(struct platform_device *pdev) adap = &iproc_i2c->adapter; i2c_set_adapdata(adap, iproc_i2c); - strlcpy(adap->name, "Broadcom iProc I2C adapter", sizeof(adap->name)); + snprintf(adap->name, sizeof(adap->name), + "Broadcom iProc (%s)", + of_node_full_name(iproc_i2c->device->of_node)); adap->algo = &bcm_iproc_algo; adap->quirks = &bcm_iproc_i2c_quirks; adap->dev.parent = &pdev->dev; From 67a53081e655d41a77f510377364600e5e9bf89c Mon Sep 17 00:00:00 2001 From: Nishka Dasgupta Date: Thu, 15 Aug 2019 11:25:50 +0530 Subject: [PATCH 206/721] i2c: iproc: Make bcm_iproc_i2c_quirks constant Static structure bcm_iproc_i2c_quirks, of type i2c_adapter_quirks, is only used when being assigned to constant field quirks of a variable having type i2c_adapter. Hence make bcm_iproc_i2c_quirks constant as well to prevent it from unintended modification. Issue found with Coccinelle. Signed-off-by: Nishka Dasgupta Reviewed-by: Ray Jui Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-bcm-iproc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-bcm-iproc.c b/drivers/i2c/busses/i2c-bcm-iproc.c index ee5578173fb1..24269a5e3af6 100644 --- a/drivers/i2c/busses/i2c-bcm-iproc.c +++ b/drivers/i2c/busses/i2c-bcm-iproc.c @@ -803,7 +803,7 @@ static struct i2c_algorithm bcm_iproc_algo = { .unreg_slave = bcm_iproc_i2c_unreg_slave, }; -static struct i2c_adapter_quirks bcm_iproc_i2c_quirks = { +static const struct i2c_adapter_quirks bcm_iproc_i2c_quirks = { .max_read_len = M_RX_MAX_READ_LEN, }; From d1bbf38aaf882240cccca414e5a48db42e105da7 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 30 Jul 2019 08:04:00 -0500 Subject: [PATCH 207/721] PCI: Fix typos and whitespace errors Fix typos in drivers/pci. Comment and whitespace changes only. Link: https://lore.kernel.org/r/20190819115306.27338-1-kw@linux.com Signed-off-by: Krzysztof Wilczynski Signed-off-by: Bjorn Helgaas Reviewed-by: Rob Herring Acked-by: Thomas Petazzoni # armada8k --- Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.txt | 2 +- Documentation/devicetree/bindings/pci/pci-armada8k.txt | 2 +- drivers/pci/Kconfig | 2 +- drivers/pci/vc.c | 1 - include/linux/pci.h | 4 ++-- 5 files changed, 5 insertions(+), 6 deletions(-) diff --git a/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.txt b/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.txt index a7f5f5afa0e6..de4b2baf91e8 100644 --- a/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.txt +++ b/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.txt @@ -50,7 +50,7 @@ Additional required properties for imx7d-pcie and imx8mq-pcie: - power-domains: Must be set to a phandle pointing to PCIE_PHY power domain - resets: Must contain phandles to PCIe-related reset lines exposed by SRC IP block -- reset-names: Must contain the following entires: +- reset-names: Must contain the following entries: - "pciephy" - "apps" - "turnoff" diff --git a/Documentation/devicetree/bindings/pci/pci-armada8k.txt b/Documentation/devicetree/bindings/pci/pci-armada8k.txt index 9e3fc15e1af8..1aaa09254001 100644 --- a/Documentation/devicetree/bindings/pci/pci-armada8k.txt +++ b/Documentation/devicetree/bindings/pci/pci-armada8k.txt @@ -11,7 +11,7 @@ Required properties: - reg-names: - "ctrl" for the control register region - "config" for the config space region -- interrupts: Interrupt specifier for the PCIe controler +- interrupts: Interrupt specifier for the PCIe controller - clocks: reference to the PCIe controller clocks - clock-names: mandatory if there is a second clock, in this case the name must be "core" for the first clock and "reg" for the second diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig index 2ab92409210a..46f4912a370d 100644 --- a/drivers/pci/Kconfig +++ b/drivers/pci/Kconfig @@ -170,7 +170,7 @@ config PCI_P2PDMA Many PCIe root complexes do not support P2P transactions and it's hard to tell which support it at all, so at this time, - P2P DMA transations must be between devices behind the same root + P2P DMA transactions must be between devices behind the same root port. If unsure, say N. diff --git a/drivers/pci/vc.c b/drivers/pci/vc.c index 5acd9c02683a..b39e854b80ec 100644 --- a/drivers/pci/vc.c +++ b/drivers/pci/vc.c @@ -409,7 +409,6 @@ void pci_restore_vc_state(struct pci_dev *dev) * For each type of VC capability, VC/VC9/MFVC, find the capability, size * it, and allocate a buffer for save/restore. */ - void pci_allocate_vc_save_buffers(struct pci_dev *dev) { int i; diff --git a/include/linux/pci.h b/include/linux/pci.h index 9e700d9f9f28..c133579c01fa 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -969,7 +969,7 @@ resource_size_t pcibios_align_resource(void *, const struct resource *, resource_size_t, resource_size_t); -/* Weak but can be overriden by arch */ +/* Weak but can be overridden by arch */ void pci_fixup_cardbus(struct pci_bus *); /* Generic PCI functions used internally */ @@ -1801,7 +1801,7 @@ static inline int pci_ats_page_aligned(struct pci_dev *dev) { return 0; } #include -/* These two functions provide almost identical functionality. Depennding +/* These two functions provide almost identical functionality. Depending * on the architecture, one will be implemented as a wrapper around the * other (in drivers/pci/mmap.c). * From b071c1fd7a8ad041759d9a9e87d1b4333417e36c Mon Sep 17 00:00:00 2001 From: Lubomir Rintel Date: Wed, 7 Aug 2019 15:20:49 +0200 Subject: [PATCH 208/721] PCI: OF: Correct of_irq_parse_pci() documentation 530210c7814e ("of/irq: Replace of_irq with of_phandle_args") changed the of_irq_parse_pci() parameter type but didn't change the corresponding documentation. Update the function doc to match. Link: https://lore.kernel.org/r/20190807132049.10304-1-lkundrak@v3.sk Signed-off-by: Lubomir Rintel Signed-off-by: Bjorn Helgaas --- drivers/pci/of.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/of.c b/drivers/pci/of.c index bc7b27a28795..36891e7deee3 100644 --- a/drivers/pci/of.c +++ b/drivers/pci/of.c @@ -353,7 +353,7 @@ EXPORT_SYMBOL_GPL(devm_of_pci_get_host_bridge_resources); /** * of_irq_parse_pci - Resolve the interrupt for a PCI device * @pdev: the device whose interrupt is to be resolved - * @out_irq: structure of_irq filled by this function + * @out_irq: structure of_phandle_args filled by this function * * This function resolves the PCI interrupt for a given PCI device. If a * device-node exists for a given pci_dev, it will use normal OF tree From 55507aea58824578610eb0cb5c250a0c997987c9 Mon Sep 17 00:00:00 2001 From: Krzysztof Wilczynski Date: Mon, 26 Aug 2019 00:10:39 +0200 Subject: [PATCH 209/721] PCI: Remove unnecessary returns Remove unnecessary "return" statements at the end of void functions. No functional change intended. Link: https://lore.kernel.org/r/20190825221039.6977-1-kw@linux.com Link: https://lore.kernel.org/r/20190826095143.21353-1-kw@linux.com Signed-off-by: Krzysztof Wilczynski Signed-off-by: Bjorn Helgaas --- drivers/pci/controller/pcie-mediatek.c | 2 -- drivers/pci/hotplug/cpci_hotplug_core.c | 1 - drivers/pci/hotplug/cpqphp_core.c | 1 - drivers/pci/hotplug/cpqphp_ctrl.c | 4 ---- drivers/pci/hotplug/cpqphp_nvram.h | 5 +---- drivers/pci/hotplug/rpadlpar_core.c | 1 - drivers/pci/hotplug/rpaphp_core.c | 1 - 7 files changed, 1 insertion(+), 14 deletions(-) diff --git a/drivers/pci/controller/pcie-mediatek.c b/drivers/pci/controller/pcie-mediatek.c index 80601e1b939e..b92ce6ef718e 100644 --- a/drivers/pci/controller/pcie-mediatek.c +++ b/drivers/pci/controller/pcie-mediatek.c @@ -630,8 +630,6 @@ static void mtk_pcie_intr_handler(struct irq_desc *desc) } chained_irq_exit(irqchip, desc); - - return; } static int mtk_pcie_setup_irq(struct mtk_pcie_port *port, diff --git a/drivers/pci/hotplug/cpci_hotplug_core.c b/drivers/pci/hotplug/cpci_hotplug_core.c index 603eadf3d965..d0559d2faf50 100644 --- a/drivers/pci/hotplug/cpci_hotplug_core.c +++ b/drivers/pci/hotplug/cpci_hotplug_core.c @@ -563,7 +563,6 @@ cleanup_slots(void) } cleanup_null: up_write(&list_rwsem); - return; } int diff --git a/drivers/pci/hotplug/cpqphp_core.c b/drivers/pci/hotplug/cpqphp_core.c index 16bbb183695a..b8aacb41a83c 100644 --- a/drivers/pci/hotplug/cpqphp_core.c +++ b/drivers/pci/hotplug/cpqphp_core.c @@ -173,7 +173,6 @@ static void pci_print_IRQ_route(void) dbg("%d %d %d %d\n", tbus, tdevice >> 3, tdevice & 0x7, tslot); } - return; } diff --git a/drivers/pci/hotplug/cpqphp_ctrl.c b/drivers/pci/hotplug/cpqphp_ctrl.c index b7f4e1f099d9..68de958a9be8 100644 --- a/drivers/pci/hotplug/cpqphp_ctrl.c +++ b/drivers/pci/hotplug/cpqphp_ctrl.c @@ -1872,8 +1872,6 @@ static void interrupt_event_handler(struct controller *ctrl) } } /* End of FOR loop */ } - - return; } @@ -1943,8 +1941,6 @@ void cpqhp_pushbutton_thread(struct timer_list *t) p_slot->state = STATIC_STATE; } - - return; } diff --git a/drivers/pci/hotplug/cpqphp_nvram.h b/drivers/pci/hotplug/cpqphp_nvram.h index 918ff8dbfe62..70e879b6a23f 100644 --- a/drivers/pci/hotplug/cpqphp_nvram.h +++ b/drivers/pci/hotplug/cpqphp_nvram.h @@ -16,10 +16,7 @@ #ifndef CONFIG_HOTPLUG_PCI_COMPAQ_NVRAM -static inline void compaq_nvram_init(void __iomem *rom_start) -{ - return; -} +static inline void compaq_nvram_init(void __iomem *rom_start) { } static inline int compaq_nvram_load(void __iomem *rom_start, struct controller *ctrl) { diff --git a/drivers/pci/hotplug/rpadlpar_core.c b/drivers/pci/hotplug/rpadlpar_core.c index 182f9e3443ee..977946e4e613 100644 --- a/drivers/pci/hotplug/rpadlpar_core.c +++ b/drivers/pci/hotplug/rpadlpar_core.c @@ -473,7 +473,6 @@ int __init rpadlpar_io_init(void) void rpadlpar_io_exit(void) { dlpar_sysfs_exit(); - return; } module_init(rpadlpar_io_init); diff --git a/drivers/pci/hotplug/rpaphp_core.c b/drivers/pci/hotplug/rpaphp_core.c index bcd5d357ca23..d34a3baf9b3a 100644 --- a/drivers/pci/hotplug/rpaphp_core.c +++ b/drivers/pci/hotplug/rpaphp_core.c @@ -412,7 +412,6 @@ static void __exit cleanup_slots(void) pci_hp_deregister(&slot->hotplug_slot); dealloc_slot_struct(slot); } - return; } static int __init rpaphp_init(void) From 1fb027d7596464d3fad3ed59f70f43807ef926c6 Mon Sep 17 00:00:00 2001 From: Kirill Smelkov Date: Mon, 8 Jul 2019 17:03:31 +0000 Subject: [PATCH 210/721] fuse: require /dev/fuse reads to have enough buffer capacity (take 2) [ This retries commit d4b13963f217 ("fuse: require /dev/fuse reads to have enough buffer capacity"), which was reverted. In this version we require only `sizeof(fuse_in_header) + sizeof(fuse_write_in)` instead of 4K for FUSE request header room, because, contrary to libfuse and kernel client behaviour, GlusterFS actually provides only so much room for request header. ] A FUSE filesystem server queues /dev/fuse sys_read calls to get filesystem requests to handle. It does not know in advance what would be that request as it can be anything that client issues - LOOKUP, READ, WRITE, ... Many requests are short and retrieve data from the filesystem. However WRITE and NOTIFY_REPLY write data into filesystem. Before getting into operation phase, FUSE filesystem server and kernel client negotiate what should be the maximum write size the client will ever issue. After negotiation the contract in between server/client is that the filesystem server then should queue /dev/fuse sys_read calls with enough buffer capacity to receive any client request - WRITE in particular, while FUSE client should not, in particular, send WRITE requests with > negotiated max_write payload. FUSE client in kernel and libfuse historically reserve 4K for request header. However an existing filesystem server - GlusterFS - was found which reserves only 80 bytes for header room (= `sizeof(fuse_in_header) + sizeof(fuse_write_in)`). Since `sizeof(fuse_in_header) + sizeof(fuse_write_in)` == `sizeof(fuse_in_header) + sizeof(fuse_read_in)` == `sizeof(fuse_in_header) + sizeof(fuse_notify_retrieve_in)` is the absolute minimum any sane filesystem should be using for header room, the contract is that filesystem server should queue sys_reads with `sizeof(fuse_in_header) + sizeof(fuse_write_in)` + max_write buffer. If the filesystem server does not follow this contract, what can happen is that fuse_dev_do_read will see that request size is > buffer size, and then it will return EIO to client who issued the request but won't indicate in any way that there is a problem to filesystem server. This can be hard to diagnose because for some requests, e.g. for NOTIFY_REPLY which mimics WRITE, there is no client thread that is waiting for request completion and that EIO goes nowhere, while on filesystem server side things look like the kernel is not replying back after successful NOTIFY_RETRIEVE request made by the server. We can make the problem easy to diagnose if we indicate via error return to filesystem server when it is violating the contract. This should not practically cause problems because if a filesystem server is using shorter buffer, writes to it were already very likely to cause EIO, and if the filesystem is read-only it should be too following FUSE_MIN_READ_BUFFER minimum buffer size. Please see [1] for context where the problem of stuck filesystem was hit for real (because kernel client was incorrectly sending more than max_write data with NOTIFY_REPLY; see also previous patch), how the situation was traced and for more involving patch that did not make it into the tree. [1] https://marc.info/?l=linux-fsdevel&m=155057023600853&w=2 Signed-off-by: Kirill Smelkov Tested-by: Sander Eikelenboom Cc: Han-Wen Nienhuys Cc: Jakob Unterwurzacher Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index ea8237513dfa..fdb85895737b 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1317,6 +1317,24 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, unsigned reqsize; unsigned int hash; + /* + * Require sane minimum read buffer - that has capacity for fixed part + * of any request header + negotiated max_write room for data. + * + * Historically libfuse reserves 4K for fixed header room, but e.g. + * GlusterFS reserves only 80 bytes + * + * = `sizeof(fuse_in_header) + sizeof(fuse_write_in)` + * + * which is the absolute minimum any sane filesystem should be using + * for header room. + */ + if (nbytes < max_t(size_t, FUSE_MIN_READ_BUFFER, + sizeof(struct fuse_in_header) + + sizeof(struct fuse_write_in) + + fc->max_write)) + return -EINVAL; + restart: spin_lock(&fiq->waitq.lock); err = -EAGAIN; From 17b2cbe294922ec2b78e29b93ed602b56cd75a4e Mon Sep 17 00:00:00 2001 From: Maxim Patlasov Date: Mon, 22 Jul 2019 10:17:17 +0300 Subject: [PATCH 211/721] fuse: cleanup fuse_wait_on_page_writeback fuse_wait_on_page_writeback() always returns zero and nobody cares. Let's make it void. Signed-off-by: Maxim Patlasov Signed-off-by: Vasily Averin Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 5ae2828beb00..e076c2cf65b0 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -383,12 +383,11 @@ static inline bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) * Since fuse doesn't rely on the VM writeback tracking, this has to * use some other means. */ -static int fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index) +static void fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index) { struct fuse_inode *fi = get_fuse_inode(inode); wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index)); - return 0; } /* From 56d250ef9650edce600c96e2f918b9b9bafda85e Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 29 Aug 2019 11:01:18 +0200 Subject: [PATCH 212/721] cuse: fix broken release The inode parameter in cuse_release() is likely *not* a fuse inode. It's a small wonder it didn't blow up until now. Signed-off-by: Miklos Szeredi --- fs/fuse/cuse.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index bab7a0db81dd..d011a1ad1d4f 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -142,11 +142,10 @@ static int cuse_open(struct inode *inode, struct file *file) static int cuse_release(struct inode *inode, struct file *file) { - struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fc; - fuse_sync_release(fi, ff, file->f_flags); + fuse_sync_release(NULL, ff, file->f_flags); fuse_conn_put(fc); return 0; From b65dc4f6b339ff57321fd95f2f7b6197a3c24ba4 Mon Sep 17 00:00:00 2001 From: Fuqian Huang Date: Tue, 13 Aug 2019 18:31:33 +0800 Subject: [PATCH 213/721] mfd: ezx-pcap: Replace mutex_lock with spin_lock As mutex_lock might sleep. Function pcap_adc_irq is an interrupt handler. The use of mutex_lock in pcap_adc_irq may cause sleep in IRQ context. Replace mutex_lock with spin_lock to avoid this. Signed-off-by: Fuqian Huang Signed-off-by: Lee Jones --- drivers/mfd/ezx-pcap.c | 53 ++++++++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/drivers/mfd/ezx-pcap.c b/drivers/mfd/ezx-pcap.c index f505e3e1274b..70fa18b04ad2 100644 --- a/drivers/mfd/ezx-pcap.c +++ b/drivers/mfd/ezx-pcap.c @@ -35,7 +35,7 @@ struct pcap_chip { /* IO */ u32 buf; - struct mutex io_mutex; + spinlock_t io_lock; /* IRQ */ unsigned int irq_base; @@ -48,7 +48,7 @@ struct pcap_chip { struct pcap_adc_request *adc_queue[PCAP_ADC_MAXQ]; u8 adc_head; u8 adc_tail; - struct mutex adc_mutex; + spinlock_t adc_lock; }; /* IO */ @@ -76,14 +76,15 @@ static int ezx_pcap_putget(struct pcap_chip *pcap, u32 *data) int ezx_pcap_write(struct pcap_chip *pcap, u8 reg_num, u32 value) { + unsigned long flags; int ret; - mutex_lock(&pcap->io_mutex); + spin_lock_irqsave(&pcap->io_lock, flags); value &= PCAP_REGISTER_VALUE_MASK; value |= PCAP_REGISTER_WRITE_OP_BIT | (reg_num << PCAP_REGISTER_ADDRESS_SHIFT); ret = ezx_pcap_putget(pcap, &value); - mutex_unlock(&pcap->io_mutex); + spin_unlock_irqrestore(&pcap->io_lock, flags); return ret; } @@ -91,14 +92,15 @@ EXPORT_SYMBOL_GPL(ezx_pcap_write); int ezx_pcap_read(struct pcap_chip *pcap, u8 reg_num, u32 *value) { + unsigned long flags; int ret; - mutex_lock(&pcap->io_mutex); + spin_lock_irqsave(&pcap->io_lock, flags); *value = PCAP_REGISTER_READ_OP_BIT | (reg_num << PCAP_REGISTER_ADDRESS_SHIFT); ret = ezx_pcap_putget(pcap, value); - mutex_unlock(&pcap->io_mutex); + spin_unlock_irqrestore(&pcap->io_lock, flags); return ret; } @@ -106,11 +108,12 @@ EXPORT_SYMBOL_GPL(ezx_pcap_read); int ezx_pcap_set_bits(struct pcap_chip *pcap, u8 reg_num, u32 mask, u32 val) { + unsigned long flags; int ret; u32 tmp = PCAP_REGISTER_READ_OP_BIT | (reg_num << PCAP_REGISTER_ADDRESS_SHIFT); - mutex_lock(&pcap->io_mutex); + spin_lock_irqsave(&pcap->io_lock, flags); ret = ezx_pcap_putget(pcap, &tmp); if (ret) goto out_unlock; @@ -121,7 +124,7 @@ int ezx_pcap_set_bits(struct pcap_chip *pcap, u8 reg_num, u32 mask, u32 val) ret = ezx_pcap_putget(pcap, &tmp); out_unlock: - mutex_unlock(&pcap->io_mutex); + spin_unlock_irqrestore(&pcap->io_lock, flags); return ret; } @@ -212,14 +215,15 @@ static void pcap_irq_handler(struct irq_desc *desc) /* ADC */ void pcap_set_ts_bits(struct pcap_chip *pcap, u32 bits) { + unsigned long flags; u32 tmp; - mutex_lock(&pcap->adc_mutex); + spin_lock_irqsave(&pcap->adc_lock, flags); ezx_pcap_read(pcap, PCAP_REG_ADC, &tmp); tmp &= ~(PCAP_ADC_TS_M_MASK | PCAP_ADC_TS_REF_LOWPWR); tmp |= bits & (PCAP_ADC_TS_M_MASK | PCAP_ADC_TS_REF_LOWPWR); ezx_pcap_write(pcap, PCAP_REG_ADC, tmp); - mutex_unlock(&pcap->adc_mutex); + spin_unlock_irqrestore(&pcap->adc_lock, flags); } EXPORT_SYMBOL_GPL(pcap_set_ts_bits); @@ -234,15 +238,16 @@ static void pcap_disable_adc(struct pcap_chip *pcap) static void pcap_adc_trigger(struct pcap_chip *pcap) { + unsigned long flags; u32 tmp; u8 head; - mutex_lock(&pcap->adc_mutex); + spin_lock_irqsave(&pcap->adc_lock, flags); head = pcap->adc_head; if (!pcap->adc_queue[head]) { /* queue is empty, save power */ pcap_disable_adc(pcap); - mutex_unlock(&pcap->adc_mutex); + spin_unlock_irqrestore(&pcap->adc_lock, flags); return; } /* start conversion on requested bank, save TS_M bits */ @@ -254,7 +259,7 @@ static void pcap_adc_trigger(struct pcap_chip *pcap) tmp |= PCAP_ADC_AD_SEL1; ezx_pcap_write(pcap, PCAP_REG_ADC, tmp); - mutex_unlock(&pcap->adc_mutex); + spin_unlock_irqrestore(&pcap->adc_lock, flags); ezx_pcap_write(pcap, PCAP_REG_ADR, PCAP_ADR_ASC); } @@ -265,11 +270,11 @@ static irqreturn_t pcap_adc_irq(int irq, void *_pcap) u16 res[2]; u32 tmp; - mutex_lock(&pcap->adc_mutex); + spin_lock(&pcap->adc_lock); req = pcap->adc_queue[pcap->adc_head]; if (WARN(!req, "adc irq without pending request\n")) { - mutex_unlock(&pcap->adc_mutex); + spin_unlock(&pcap->adc_lock); return IRQ_HANDLED; } @@ -285,7 +290,7 @@ static irqreturn_t pcap_adc_irq(int irq, void *_pcap) pcap->adc_queue[pcap->adc_head] = NULL; pcap->adc_head = (pcap->adc_head + 1) & (PCAP_ADC_MAXQ - 1); - mutex_unlock(&pcap->adc_mutex); + spin_unlock(&pcap->adc_lock); /* pass the results and release memory */ req->callback(req->data, res); @@ -301,6 +306,7 @@ int pcap_adc_async(struct pcap_chip *pcap, u8 bank, u32 flags, u8 ch[], void *callback, void *data) { struct pcap_adc_request *req; + unsigned long irq_flags; /* This will be freed after we have a result */ req = kmalloc(sizeof(struct pcap_adc_request), GFP_KERNEL); @@ -314,15 +320,15 @@ int pcap_adc_async(struct pcap_chip *pcap, u8 bank, u32 flags, u8 ch[], req->callback = callback; req->data = data; - mutex_lock(&pcap->adc_mutex); + spin_lock_irqsave(&pcap->adc_lock, irq_flags); if (pcap->adc_queue[pcap->adc_tail]) { - mutex_unlock(&pcap->adc_mutex); + spin_unlock_irqrestore(&pcap->adc_lock, irq_flags); kfree(req); return -EBUSY; } pcap->adc_queue[pcap->adc_tail] = req; pcap->adc_tail = (pcap->adc_tail + 1) & (PCAP_ADC_MAXQ - 1); - mutex_unlock(&pcap->adc_mutex); + spin_unlock_irqrestore(&pcap->adc_lock, irq_flags); /* start conversion */ pcap_adc_trigger(pcap); @@ -389,16 +395,17 @@ static int pcap_add_subdev(struct pcap_chip *pcap, static int ezx_pcap_remove(struct spi_device *spi) { struct pcap_chip *pcap = spi_get_drvdata(spi); + unsigned long flags; int i; /* remove all registered subdevs */ device_for_each_child(&spi->dev, NULL, pcap_remove_subdev); /* cleanup ADC */ - mutex_lock(&pcap->adc_mutex); + spin_lock_irqsave(&pcap->adc_lock, flags); for (i = 0; i < PCAP_ADC_MAXQ; i++) kfree(pcap->adc_queue[i]); - mutex_unlock(&pcap->adc_mutex); + spin_unlock_irqrestore(&pcap->adc_lock, flags); /* cleanup irqchip */ for (i = pcap->irq_base; i < (pcap->irq_base + PCAP_NIRQS); i++) @@ -426,8 +433,8 @@ static int ezx_pcap_probe(struct spi_device *spi) goto ret; } - mutex_init(&pcap->io_mutex); - mutex_init(&pcap->adc_mutex); + spin_lock_init(&pcap->io_lock); + spin_lock_init(&pcap->adc_lock); INIT_WORK(&pcap->isr_work, pcap_isr_work); INIT_WORK(&pcap->msr_work, pcap_msr_work); spi_set_drvdata(spi, pcap); From b9a801dfa59163dc2db8147a98af406eb79e51de Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 1 Aug 2019 22:03:35 +0300 Subject: [PATCH 214/721] mfd: Add support for Merrifield Basin Cove PMIC Add an MFD driver for Intel Merrifield Basin Cove PMIC. Firmware on the platforms which are using Basin Cove PMIC is "smarter" than on the rest supported by vanilla kernel. It handles first level of interrupt itself, while others do it on OS level. The driver is done in the same way as the rest of Intel PMIC MFD drivers in the kernel to support the initial design. The design allows to use one driver among few PMICs without knowing implementation details of the each hardware version or generation. Signed-off-by: Andy Shevchenko Signed-off-by: Lee Jones --- drivers/mfd/Kconfig | 11 ++ drivers/mfd/Makefile | 1 + drivers/mfd/intel_soc_pmic_mrfld.c | 157 +++++++++++++++++++++++ include/linux/mfd/intel_soc_pmic_mrfld.h | 81 ++++++++++++ 4 files changed, 250 insertions(+) create mode 100644 drivers/mfd/intel_soc_pmic_mrfld.c create mode 100644 include/linux/mfd/intel_soc_pmic_mrfld.h diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 4a07afe50b35..a6854d41d1ca 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -597,6 +597,17 @@ config INTEL_SOC_PMIC_CHTDC_TI Select this option for supporting Dollar Cove (TI version) PMIC device that is found on some Intel Cherry Trail systems. +config INTEL_SOC_PMIC_MRFLD + tristate "Support for Intel Merrifield Basin Cove PMIC" + depends on GPIOLIB + depends on ACPI + depends on INTEL_SCU_IPC + select MFD_CORE + select REGMAP_IRQ + help + Select this option for supporting Basin Cove PMIC device + that is found on Intel Merrifield systems. + config MFD_INTEL_LPSS tristate select COMMON_CLK diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index 7b6a6aa4fe42..3ae6fa6552d3 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -242,6 +242,7 @@ obj-$(CONFIG_INTEL_SOC_PMIC_CHTWC) += intel_soc_pmic_chtwc.o obj-$(CONFIG_INTEL_SOC_PMIC_CHTDC_TI) += intel_soc_pmic_chtdc_ti.o mt6397-objs := mt6397-core.o mt6397-irq.o obj-$(CONFIG_MFD_MT6397) += mt6397.o +obj-$(CONFIG_INTEL_SOC_PMIC_MRFLD) += intel_soc_pmic_mrfld.o obj-$(CONFIG_MFD_ALTERA_A10SR) += altera-a10sr.o obj-$(CONFIG_MFD_ALTERA_SYSMGR) += altera-sysmgr.o diff --git a/drivers/mfd/intel_soc_pmic_mrfld.c b/drivers/mfd/intel_soc_pmic_mrfld.c new file mode 100644 index 000000000000..26a1551c5faf --- /dev/null +++ b/drivers/mfd/intel_soc_pmic_mrfld.c @@ -0,0 +1,157 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Device access for Basin Cove PMIC + * + * Copyright (c) 2019, Intel Corporation. + * Author: Andy Shevchenko + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + * Level 2 IRQs + * + * Firmware on the systems with Basin Cove PMIC services Level 1 IRQs + * without an assistance. Thus, each of the Level 1 IRQ is represented + * as a separate RTE in IOAPIC. + */ +static struct resource irq_level2_resources[] = { + DEFINE_RES_IRQ(0), /* power button */ + DEFINE_RES_IRQ(0), /* TMU */ + DEFINE_RES_IRQ(0), /* thermal */ + DEFINE_RES_IRQ(0), /* BCU */ + DEFINE_RES_IRQ(0), /* ADC */ + DEFINE_RES_IRQ(0), /* charger */ + DEFINE_RES_IRQ(0), /* GPIO */ +}; + +static const struct mfd_cell bcove_dev[] = { + { + .name = "mrfld_bcove_pwrbtn", + .num_resources = 1, + .resources = &irq_level2_resources[0], + }, { + .name = "mrfld_bcove_tmu", + .num_resources = 1, + .resources = &irq_level2_resources[1], + }, { + .name = "mrfld_bcove_thermal", + .num_resources = 1, + .resources = &irq_level2_resources[2], + }, { + .name = "mrfld_bcove_bcu", + .num_resources = 1, + .resources = &irq_level2_resources[3], + }, { + .name = "mrfld_bcove_adc", + .num_resources = 1, + .resources = &irq_level2_resources[4], + }, { + .name = "mrfld_bcove_charger", + .num_resources = 1, + .resources = &irq_level2_resources[5], + }, { + .name = "mrfld_bcove_pwrsrc", + .num_resources = 1, + .resources = &irq_level2_resources[5], + }, { + .name = "mrfld_bcove_gpio", + .num_resources = 1, + .resources = &irq_level2_resources[6], + }, + { .name = "mrfld_bcove_region", }, +}; + +static int bcove_ipc_byte_reg_read(void *context, unsigned int reg, + unsigned int *val) +{ + u8 ipc_out; + int ret; + + ret = intel_scu_ipc_ioread8(reg, &ipc_out); + if (ret) + return ret; + + *val = ipc_out; + return 0; +} + +static int bcove_ipc_byte_reg_write(void *context, unsigned int reg, + unsigned int val) +{ + u8 ipc_in = val; + int ret; + + ret = intel_scu_ipc_iowrite8(reg, ipc_in); + if (ret) + return ret; + + return 0; +} + +static const struct regmap_config bcove_regmap_config = { + .reg_bits = 16, + .val_bits = 8, + .max_register = 0xff, + .reg_write = bcove_ipc_byte_reg_write, + .reg_read = bcove_ipc_byte_reg_read, +}; + +static int bcove_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct intel_soc_pmic *pmic; + unsigned int i; + int ret; + + pmic = devm_kzalloc(dev, sizeof(*pmic), GFP_KERNEL); + if (!pmic) + return -ENOMEM; + + platform_set_drvdata(pdev, pmic); + pmic->dev = &pdev->dev; + + pmic->regmap = devm_regmap_init(dev, NULL, pmic, &bcove_regmap_config); + if (IS_ERR(pmic->regmap)) + return PTR_ERR(pmic->regmap); + + for (i = 0; i < ARRAY_SIZE(irq_level2_resources); i++) { + ret = platform_get_irq(pdev, i); + if (ret < 0) + return ret; + + irq_level2_resources[i].start = ret; + irq_level2_resources[i].end = ret; + } + + return devm_mfd_add_devices(dev, PLATFORM_DEVID_NONE, + bcove_dev, ARRAY_SIZE(bcove_dev), + NULL, 0, NULL); +} + +static const struct acpi_device_id bcove_acpi_ids[] = { + { "INTC100E" }, + {} +}; +MODULE_DEVICE_TABLE(acpi, bcove_acpi_ids); + +static struct platform_driver bcove_driver = { + .driver = { + .name = "intel_soc_pmic_mrfld", + .acpi_match_table = bcove_acpi_ids, + }, + .probe = bcove_probe, +}; +module_platform_driver(bcove_driver); + +MODULE_DESCRIPTION("IPC driver for Intel SoC Basin Cove PMIC"); +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/mfd/intel_soc_pmic_mrfld.h b/include/linux/mfd/intel_soc_pmic_mrfld.h new file mode 100644 index 000000000000..4daecd682275 --- /dev/null +++ b/include/linux/mfd/intel_soc_pmic_mrfld.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Header file for Intel Merrifield Basin Cove PMIC + * + * Copyright (C) 2019 Intel Corporation. All rights reserved. + */ + +#ifndef __INTEL_SOC_PMIC_MRFLD_H__ +#define __INTEL_SOC_PMIC_MRFLD_H__ + +#include + +#define BCOVE_ID 0x00 + +#define BCOVE_ID_MINREV0 GENMASK(2, 0) +#define BCOVE_ID_MAJREV0 GENMASK(5, 3) +#define BCOVE_ID_VENDID0 GENMASK(7, 6) + +#define BCOVE_MINOR(x) (unsigned int)(((x) & BCOVE_ID_MINREV0) >> 0) +#define BCOVE_MAJOR(x) (unsigned int)(((x) & BCOVE_ID_MAJREV0) >> 3) +#define BCOVE_VENDOR(x) (unsigned int)(((x) & BCOVE_ID_VENDID0) >> 6) + +#define BCOVE_IRQLVL1 0x01 + +#define BCOVE_PBIRQ 0x02 +#define BCOVE_TMUIRQ 0x03 +#define BCOVE_THRMIRQ 0x04 +#define BCOVE_BCUIRQ 0x05 +#define BCOVE_ADCIRQ 0x06 +#define BCOVE_CHGRIRQ0 0x07 +#define BCOVE_CHGRIRQ1 0x08 +#define BCOVE_GPIOIRQ 0x09 +#define BCOVE_CRITIRQ 0x0B + +#define BCOVE_MIRQLVL1 0x0C + +#define BCOVE_MPBIRQ 0x0D +#define BCOVE_MTMUIRQ 0x0E +#define BCOVE_MTHRMIRQ 0x0F +#define BCOVE_MBCUIRQ 0x10 +#define BCOVE_MADCIRQ 0x11 +#define BCOVE_MCHGRIRQ0 0x12 +#define BCOVE_MCHGRIRQ1 0x13 +#define BCOVE_MGPIOIRQ 0x14 +#define BCOVE_MCRITIRQ 0x16 + +#define BCOVE_SCHGRIRQ0 0x4E +#define BCOVE_SCHGRIRQ1 0x4F + +/* Level 1 IRQs */ +#define BCOVE_LVL1_PWRBTN BIT(0) /* power button */ +#define BCOVE_LVL1_TMU BIT(1) /* time management unit */ +#define BCOVE_LVL1_THRM BIT(2) /* thermal */ +#define BCOVE_LVL1_BCU BIT(3) /* burst control unit */ +#define BCOVE_LVL1_ADC BIT(4) /* ADC */ +#define BCOVE_LVL1_CHGR BIT(5) /* charger */ +#define BCOVE_LVL1_GPIO BIT(6) /* GPIO */ +#define BCOVE_LVL1_CRIT BIT(7) /* critical event */ + +/* Level 2 IRQs: power button */ +#define BCOVE_PBIRQ_PBTN BIT(0) +#define BCOVE_PBIRQ_UBTN BIT(1) + +/* Level 2 IRQs: ADC */ +#define BCOVE_ADCIRQ_BATTEMP BIT(2) +#define BCOVE_ADCIRQ_SYSTEMP BIT(3) +#define BCOVE_ADCIRQ_BATTID BIT(4) +#define BCOVE_ADCIRQ_VIBATT BIT(5) +#define BCOVE_ADCIRQ_CCTICK BIT(7) + +/* Level 2 IRQs: charger */ +#define BCOVE_CHGRIRQ_BAT0ALRT BIT(4) +#define BCOVE_CHGRIRQ_BAT1ALRT BIT(5) +#define BCOVE_CHGRIRQ_BATCRIT BIT(6) + +#define BCOVE_CHGRIRQ_VBUSDET BIT(0) +#define BCOVE_CHGRIRQ_DCDET BIT(1) +#define BCOVE_CHGRIRQ_BATTDET BIT(2) +#define BCOVE_CHGRIRQ_USBIDDET BIT(3) + +#endif /* __INTEL_SOC_PMIC_MRFLD_H__ */ From cbd1c5c4d443c4a83c9c7eecc0561b9aeafddf50 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 16 Aug 2019 20:33:42 +0300 Subject: [PATCH 215/721] mfd: intel-lpss: Consistently use GENMASK() Since we already are using BIT() macro, use GENMASK() as well for sake of consistency. Signed-off-by: Andy Shevchenko Signed-off-by: Lee Jones --- drivers/mfd/intel-lpss.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/mfd/intel-lpss.c b/drivers/mfd/intel-lpss.c index 277f48f1cc1c..3e16a1765142 100644 --- a/drivers/mfd/intel-lpss.c +++ b/drivers/mfd/intel-lpss.c @@ -47,10 +47,10 @@ #define LPSS_PRIV_IDLELTR 0x14 #define LPSS_PRIV_LTR_REQ BIT(15) -#define LPSS_PRIV_LTR_SCALE_MASK 0xc00 -#define LPSS_PRIV_LTR_SCALE_1US 0x800 -#define LPSS_PRIV_LTR_SCALE_32US 0xc00 -#define LPSS_PRIV_LTR_VALUE_MASK 0x3ff +#define LPSS_PRIV_LTR_SCALE_MASK GENMASK(11, 10) +#define LPSS_PRIV_LTR_SCALE_1US (2 << 10) +#define LPSS_PRIV_LTR_SCALE_32US (3 << 10) +#define LPSS_PRIV_LTR_VALUE_MASK GENMASK(9, 0) #define LPSS_PRIV_SSP_REG 0x20 #define LPSS_PRIV_SSP_REG_DIS_DMA_FIN BIT(0) @@ -59,8 +59,8 @@ #define LPSS_PRIV_CAPS 0xfc #define LPSS_PRIV_CAPS_NO_IDMA BIT(8) +#define LPSS_PRIV_CAPS_TYPE_MASK GENMASK(7, 4) #define LPSS_PRIV_CAPS_TYPE_SHIFT 4 -#define LPSS_PRIV_CAPS_TYPE_MASK (0xf << LPSS_PRIV_CAPS_TYPE_SHIFT) /* This matches the type field in CAPS register */ enum intel_lpss_dev_type { From c5b90cb26e83ad89cb18e5ec97a992f02d8f750d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 16 Aug 2019 20:56:02 +0300 Subject: [PATCH 216/721] mfd: intel-lpss: Add Intel Skylake ACPI IDs Some of the laptops, like ASUS U306UA, may expose LPSS devices via ACPI. Add their IDs to the list. Signed-off-by: Andy Shevchenko Signed-off-by: Lee Jones --- drivers/mfd/intel-lpss-acpi.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/drivers/mfd/intel-lpss-acpi.c b/drivers/mfd/intel-lpss-acpi.c index 61ffb8b393e4..c8fe334b5fe8 100644 --- a/drivers/mfd/intel-lpss-acpi.c +++ b/drivers/mfd/intel-lpss-acpi.c @@ -18,6 +18,10 @@ #include "intel-lpss.h" +static const struct intel_lpss_platform_info spt_info = { + .clk_rate = 120000000, +}; + static struct property_entry spt_i2c_properties[] = { PROPERTY_ENTRY_U32("i2c-sda-hold-time-ns", 230), { }, @@ -28,6 +32,19 @@ static const struct intel_lpss_platform_info spt_i2c_info = { .properties = spt_i2c_properties, }; +static struct property_entry uart_properties[] = { + PROPERTY_ENTRY_U32("reg-io-width", 4), + PROPERTY_ENTRY_U32("reg-shift", 2), + PROPERTY_ENTRY_BOOL("snps,uart-16550-compatible"), + { }, +}; + +static const struct intel_lpss_platform_info spt_uart_info = { + .clk_rate = 120000000, + .clk_con_id = "baudclk", + .properties = uart_properties, +}; + static const struct intel_lpss_platform_info bxt_info = { .clk_rate = 100000000, }; @@ -58,8 +75,17 @@ static const struct intel_lpss_platform_info apl_i2c_info = { static const struct acpi_device_id intel_lpss_acpi_ids[] = { /* SPT */ + { "INT3440", (kernel_ulong_t)&spt_info }, + { "INT3441", (kernel_ulong_t)&spt_info }, + { "INT3442", (kernel_ulong_t)&spt_i2c_info }, + { "INT3443", (kernel_ulong_t)&spt_i2c_info }, + { "INT3444", (kernel_ulong_t)&spt_i2c_info }, + { "INT3445", (kernel_ulong_t)&spt_i2c_info }, { "INT3446", (kernel_ulong_t)&spt_i2c_info }, { "INT3447", (kernel_ulong_t)&spt_i2c_info }, + { "INT3448", (kernel_ulong_t)&spt_uart_info }, + { "INT3449", (kernel_ulong_t)&spt_uart_info }, + { "INT344A", (kernel_ulong_t)&spt_uart_info }, /* BXT */ { "80860AAC", (kernel_ulong_t)&bxt_i2c_info }, { "80860ABC", (kernel_ulong_t)&bxt_info }, From f68c0a873ef28637a201ec37e1bafdf040813454 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 20 Aug 2019 12:37:15 +0200 Subject: [PATCH 217/721] mfd: sm501: Include the GPIO driver header This driver creates a gpio chip so it needs to include the appropriate header explicitly rather than implicitly. Signed-off-by: Linus Walleij Signed-off-by: Lee Jones --- drivers/mfd/sm501.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mfd/sm501.c b/drivers/mfd/sm501.c index d5e34a5eb250..154270f8d8d7 100644 --- a/drivers/mfd/sm501.c +++ b/drivers/mfd/sm501.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include From 1094422253db01ac3a12bde010c75e5135021d2d Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 20 Aug 2019 17:34:43 +0200 Subject: [PATCH 218/721] mfd: htc-i2cpld: Drop check because i2c_unregister_device() is NULL safe No need to check the argument of i2c_unregister_device() because the function itself does it. Signed-off-by: Wolfram Sang Signed-off-by: Lee Jones --- drivers/mfd/htc-i2cpld.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/mfd/htc-i2cpld.c b/drivers/mfd/htc-i2cpld.c index 370519af5d0b..8ad6768bd7a2 100644 --- a/drivers/mfd/htc-i2cpld.c +++ b/drivers/mfd/htc-i2cpld.c @@ -385,8 +385,7 @@ static void htcpld_unregister_chip_i2c( htcpld = platform_get_drvdata(pdev); chip = &htcpld->chip[chip_index]; - if (chip->client) - i2c_unregister_device(chip->client); + i2c_unregister_device(chip->client); } static int htcpld_register_chip_gpio( From 569fac74627cc332a2097a7a4bfdc654b8e7f273 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 21 Aug 2019 11:37:12 +0300 Subject: [PATCH 219/721] mfd: intel-lpss: Use MODULE_SOFTDEP() instead of implicit request There is no need to handle optional module request in the driver when user space tools has that feature for ages. Replace custom code by MODULE_SOFTDEP() macro to let user space know that we would like to have the DMA driver loaded first, if any. Signed-off-by: Andy Shevchenko Signed-off-by: Lee Jones --- drivers/mfd/intel-lpss.c | 29 ++++++++--------------------- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/drivers/mfd/intel-lpss.c b/drivers/mfd/intel-lpss.c index 3e16a1765142..bfe4ff337581 100644 --- a/drivers/mfd/intel-lpss.c +++ b/drivers/mfd/intel-lpss.c @@ -128,17 +128,6 @@ static const struct mfd_cell intel_lpss_spi_cell = { static DEFINE_IDA(intel_lpss_devid_ida); static struct dentry *intel_lpss_debugfs; -static int intel_lpss_request_dma_module(const char *name) -{ - static bool intel_lpss_dma_requested; - - if (intel_lpss_dma_requested) - return 0; - - intel_lpss_dma_requested = true; - return request_module("%s", name); -} - static void intel_lpss_cache_ltr(struct intel_lpss *lpss) { lpss->active_ltr = readl(lpss->priv + LPSS_PRIV_ACTIVELTR); @@ -429,16 +418,6 @@ int intel_lpss_probe(struct device *dev, dev_warn(dev, "Failed to create debugfs entries\n"); if (intel_lpss_has_idma(lpss)) { - /* - * Ensure the DMA driver is loaded before the host - * controller device appears, so that the host controller - * driver can request its DMA channels as early as - * possible. - * - * If the DMA module is not there that's OK as well. - */ - intel_lpss_request_dma_module(LPSS_IDMA64_DRIVER_NAME); - ret = mfd_add_devices(dev, lpss->devid, &intel_lpss_idma64_cell, 1, info->mem, info->irq, NULL); if (ret) @@ -554,3 +533,11 @@ MODULE_AUTHOR("Heikki Krogerus "); MODULE_AUTHOR("Jarkko Nikula "); MODULE_DESCRIPTION("Intel LPSS core driver"); MODULE_LICENSE("GPL v2"); +/* + * Ensure the DMA driver is loaded before the host controller device appears, + * so that the host controller driver can request its DMA channels as early + * as possible. + * + * If the DMA module is not there that's OK as well. + */ +MODULE_SOFTDEP("pre: platform:" LPSS_IDMA64_DRIVER_NAME); From fea3ac55e112ee52feba03c48c182ab6d0ad8e92 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 29 Aug 2019 13:25:01 +0200 Subject: [PATCH 220/721] mfd: db8500-prcmu: Support the higher DB8520 ARMSS The DB8520 used in a lot of Samsung phones has a slightly higher maximum ARMSS frequency than the DB8500. In order to not confuse the OPP framework and cpufreq, make sure the PRCMU driver returns the correct frequency. Cc: Stephan Gerhold Signed-off-by: Linus Walleij Signed-off-by: Lee Jones --- drivers/mfd/db8500-prcmu.c | 40 +++++++++++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/drivers/mfd/db8500-prcmu.c b/drivers/mfd/db8500-prcmu.c index 0f459c246634..0e019cc5da42 100644 --- a/drivers/mfd/db8500-prcmu.c +++ b/drivers/mfd/db8500-prcmu.c @@ -1695,21 +1695,41 @@ static long round_clock_rate(u8 clock, unsigned long rate) return rounded_rate; } -static const unsigned long armss_freqs[] = { +static const unsigned long db8500_armss_freqs[] = { 200000000, 400000000, 800000000, 998400000 }; +/* The DB8520 has slightly higher ARMSS max frequency */ +static const unsigned long db8520_armss_freqs[] = { + 200000000, + 400000000, + 800000000, + 1152000000 +}; + + + static long round_armss_rate(unsigned long rate) { unsigned long freq = 0; + const unsigned long *freqs; + int nfreqs; int i; + if (fw_info.version.project == PRCMU_FW_PROJECT_U8520) { + freqs = db8520_armss_freqs; + nfreqs = ARRAY_SIZE(db8520_armss_freqs); + } else { + freqs = db8500_armss_freqs; + nfreqs = ARRAY_SIZE(db8500_armss_freqs); + } + /* Find the corresponding arm opp from the cpufreq table. */ - for (i = 0; i < ARRAY_SIZE(armss_freqs); i++) { - freq = armss_freqs[i]; + for (i = 0; i < nfreqs; i++) { + freq = freqs[i]; if (rate <= freq) break; } @@ -1854,11 +1874,21 @@ static int set_armss_rate(unsigned long rate) { unsigned long freq; u8 opps[] = { ARM_EXTCLK, ARM_50_OPP, ARM_100_OPP, ARM_MAX_OPP }; + const unsigned long *freqs; + int nfreqs; int i; + if (fw_info.version.project == PRCMU_FW_PROJECT_U8520) { + freqs = db8520_armss_freqs; + nfreqs = ARRAY_SIZE(db8520_armss_freqs); + } else { + freqs = db8500_armss_freqs; + nfreqs = ARRAY_SIZE(db8500_armss_freqs); + } + /* Find the corresponding arm opp from the cpufreq table. */ - for (i = 0; i < ARRAY_SIZE(armss_freqs); i++) { - freq = armss_freqs[i]; + for (i = 0; i < nfreqs; i++) { + freq = freqs[i]; if (rate == freq) break; } From b788d111e6760602a097cd774e09d0f427de707c Mon Sep 17 00:00:00 2001 From: Frank Wunderlich Date: Sun, 18 Aug 2019 15:55:59 +0200 Subject: [PATCH 221/721] dt-bindings: mfd: mediatek: mt6397: Change to relative paths Paths in dt-bindings should be relative as suggested by Lee Jones. Suggested-By: Lee Jones Signed-off-by: Frank Wunderlich Reviewed-by: Rob Herring Signed-off-by: Lee Jones --- Documentation/devicetree/bindings/mfd/mt6397.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Documentation/devicetree/bindings/mfd/mt6397.txt b/Documentation/devicetree/bindings/mfd/mt6397.txt index 0ebd08af777d..ab3163a60929 100644 --- a/Documentation/devicetree/bindings/mfd/mt6397.txt +++ b/Documentation/devicetree/bindings/mfd/mt6397.txt @@ -12,7 +12,7 @@ MT6397/MT6323 is a multifunction device with the following sub modules: It is interfaced to host controller using SPI interface by a proprietary hardware called PMIC wrapper or pwrap. MT6397/MT6323 MFD is a child device of pwrap. See the following for pwarp node definitions: -Documentation/devicetree/bindings/soc/mediatek/pwrap.txt +../soc/mediatek/pwrap.txt This document describes the binding for MFD device and its sub module. @@ -27,9 +27,9 @@ Optional subnodes: - regulators Required properties: - compatible: "mediatek,mt6397-regulator" - see Documentation/devicetree/bindings/regulator/mt6397-regulator.txt + see ../regulator/mt6397-regulator.txt - compatible: "mediatek,mt6323-regulator" - see Documentation/devicetree/bindings/regulator/mt6323-regulator.txt + see ../regulator/mt6323-regulator.txt - codec Required properties: - compatible: "mediatek,mt6397-codec" @@ -39,12 +39,12 @@ Optional subnodes: - led Required properties: - compatible: "mediatek,mt6323-led" - see Documentation/devicetree/bindings/leds/leds-mt6323.txt + see ../leds/leds-mt6323.txt - keys Required properties: - compatible: "mediatek,mt6397-keys" or "mediatek,mt6323-keys" - see Documentation/devicetree/bindings/input/mtk-pmic-keys.txt + see ../input/mtk-pmic-keys.txt Example: pwrap: pwrap@1000f000 { From 7051919d17512f51e21d91b6564882b0476f2a2c Mon Sep 17 00:00:00 2001 From: Josef Friedl Date: Sun, 18 Aug 2019 15:56:00 +0200 Subject: [PATCH 222/721] dt-bindings: mfd: mediatek: Update RTC to include MT6323 Add MT6323 to RTC bindings. Signed-off-by: Josef Friedl Signed-off-by: Frank Wunderlich Reviewed-by: Rob Herring Signed-off-by: Lee Jones --- Documentation/devicetree/bindings/mfd/mt6397.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/mfd/mt6397.txt b/Documentation/devicetree/bindings/mfd/mt6397.txt index ab3163a60929..5fccf987865b 100644 --- a/Documentation/devicetree/bindings/mfd/mt6397.txt +++ b/Documentation/devicetree/bindings/mfd/mt6397.txt @@ -22,8 +22,10 @@ compatible: "mediatek,mt6397" or "mediatek,mt6323" Optional subnodes: - rtc - Required properties: + Required properties: Should be one of follows + - compatible: "mediatek,mt6323-rtc" - compatible: "mediatek,mt6397-rtc" + For details, see ../rtc/rtc-mt6397.txt - regulators Required properties: - compatible: "mediatek,mt6397-regulator" From 8ab1267ff73dc372d63dffc7736058d456b8c75b Mon Sep 17 00:00:00 2001 From: Josef Friedl Date: Sun, 18 Aug 2019 15:56:01 +0200 Subject: [PATCH 223/721] dt-bindings: mfd: mediatek: Add MT6323 Power Controller - Add Power Controller section to existing binding document. - Add MT6323 PWRC bindings document with example. Suggested-by: Frank Wunderlich Signed-off-by: Josef Friedl Signed-off-by: Frank Wunderlich Signed-off-by: Lee Jones --- .../devicetree/bindings/mfd/mt6397.txt | 6 ++++++ .../bindings/power/reset/mt6323-poweroff.txt | 20 +++++++++++++++++++ 2 files changed, 26 insertions(+) create mode 100644 Documentation/devicetree/bindings/power/reset/mt6323-poweroff.txt diff --git a/Documentation/devicetree/bindings/mfd/mt6397.txt b/Documentation/devicetree/bindings/mfd/mt6397.txt index 5fccf987865b..a9b105ac00a8 100644 --- a/Documentation/devicetree/bindings/mfd/mt6397.txt +++ b/Documentation/devicetree/bindings/mfd/mt6397.txt @@ -8,6 +8,7 @@ MT6397/MT6323 is a multifunction device with the following sub modules: - Clock - LED - Keys +- Power controller It is interfaced to host controller using SPI interface by a proprietary hardware called PMIC wrapper or pwrap. MT6397/MT6323 MFD is a child device of pwrap. @@ -48,6 +49,11 @@ Optional subnodes: - compatible: "mediatek,mt6397-keys" or "mediatek,mt6323-keys" see ../input/mtk-pmic-keys.txt +- power-controller + Required properties: + - compatible: "mediatek,mt6323-pwrc" + For details, see ../power/reset/mt6323-poweroff.txt + Example: pwrap: pwrap@1000f000 { compatible = "mediatek,mt8135-pwrap"; diff --git a/Documentation/devicetree/bindings/power/reset/mt6323-poweroff.txt b/Documentation/devicetree/bindings/power/reset/mt6323-poweroff.txt new file mode 100644 index 000000000000..933f0c48e887 --- /dev/null +++ b/Documentation/devicetree/bindings/power/reset/mt6323-poweroff.txt @@ -0,0 +1,20 @@ +Device Tree Bindings for Power Controller on MediaTek PMIC + +The power controller which could be found on PMIC is responsible for externally +powering off or on the remote MediaTek SoC through the circuit BBPU. + +Required properties: +- compatible: Should be one of follows + "mediatek,mt6323-pwrc": for MT6323 PMIC + +Example: + + pmic { + compatible = "mediatek,mt6323"; + + ... + + power-controller { + compatible = "mediatek,mt6323-pwrc"; + }; + } From 09c1dec470029f812e86aa92908abfd9132c2fab Mon Sep 17 00:00:00 2001 From: Josef Friedl Date: Sun, 18 Aug 2019 15:56:04 +0200 Subject: [PATCH 224/721] mfd: mt6397: Add mutex include Add missing mutex.h. Signed-off-by: Josef Friedl Signed-off-by: Frank Wunderlich Signed-off-by: Lee Jones --- include/linux/mfd/mt6397/core.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/mfd/mt6397/core.h b/include/linux/mfd/mt6397/core.h index 9320c2a1e3e6..fc88d315bdde 100644 --- a/include/linux/mfd/mt6397/core.h +++ b/include/linux/mfd/mt6397/core.h @@ -7,6 +7,8 @@ #ifndef __MFD_MT6397_CORE_H__ #define __MFD_MT6397_CORE_H__ +#include + enum chip_id { MT6323_CHIP_ID = 0x23, MT6391_CHIP_ID = 0x91, From 7c3f7cd5a0c96ab40baaaf08968249fcb96c4057 Mon Sep 17 00:00:00 2001 From: Josef Friedl Date: Sun, 18 Aug 2019 15:56:06 +0200 Subject: [PATCH 225/721] mfd: mt6323: Replace boilerplate resource code with DEFINE_RES_* macros Simplifies and reduces LoC. Signed-off-by: Josef Friedl Signed-off-by: Frank Wunderlich Signed-off-by: Lee Jones --- drivers/mfd/mt6397-core.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/drivers/mfd/mt6397-core.c b/drivers/mfd/mt6397-core.c index 93c888153b77..7c81a20865f0 100644 --- a/drivers/mfd/mt6397-core.c +++ b/drivers/mfd/mt6397-core.c @@ -5,6 +5,7 @@ */ #include +#include #include #include #include @@ -19,16 +20,8 @@ #define MT6397_RTC_SIZE 0x3e static const struct resource mt6397_rtc_resources[] = { - { - .start = MT6397_RTC_BASE, - .end = MT6397_RTC_BASE + MT6397_RTC_SIZE, - .flags = IORESOURCE_MEM, - }, - { - .start = MT6397_IRQ_RTC, - .end = MT6397_IRQ_RTC, - .flags = IORESOURCE_IRQ, - }, + DEFINE_RES_MEM(MT6397_RTC_BASE, MT6397_RTC_SIZE), + DEFINE_RES_IRQ(MT6397_IRQ_RTC), }; static const struct resource mt6323_keys_resources[] = { From 8391c6cb2414d9a75bbe247a838d28cb0cee77ee Mon Sep 17 00:00:00 2001 From: Josef Friedl Date: Sun, 18 Aug 2019 15:56:08 +0200 Subject: [PATCH 226/721] mfd: mt6323: Add MT6323 RTC and PWRC Add entry for RTC and Power Controller to MT6323. Signed-off-by: Josef Friedl Signed-off-by: Frank Wunderlich Signed-off-by: Lee Jones --- drivers/mfd/mt6397-core.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/drivers/mfd/mt6397-core.c b/drivers/mfd/mt6397-core.c index 7c81a20865f0..310dae26ddff 100644 --- a/drivers/mfd/mt6397-core.c +++ b/drivers/mfd/mt6397-core.c @@ -16,9 +16,20 @@ #include #include +#define MT6323_RTC_BASE 0x8000 +#define MT6323_RTC_SIZE 0x40 + #define MT6397_RTC_BASE 0xe000 #define MT6397_RTC_SIZE 0x3e +#define MT6323_PWRC_BASE 0x8000 +#define MT6323_PWRC_SIZE 0x40 + +static const struct resource mt6323_rtc_resources[] = { + DEFINE_RES_MEM(MT6323_RTC_BASE, MT6323_RTC_SIZE), + DEFINE_RES_IRQ(MT6323_IRQ_STATUS_RTC), +}; + static const struct resource mt6397_rtc_resources[] = { DEFINE_RES_MEM(MT6397_RTC_BASE, MT6397_RTC_SIZE), DEFINE_RES_IRQ(MT6397_IRQ_RTC), @@ -34,8 +45,17 @@ static const struct resource mt6397_keys_resources[] = { DEFINE_RES_IRQ(MT6397_IRQ_HOMEKEY), }; +static const struct resource mt6323_pwrc_resources[] = { + DEFINE_RES_MEM(MT6323_PWRC_BASE, MT6323_PWRC_SIZE), +}; + static const struct mfd_cell mt6323_devs[] = { { + .name = "mt6323-rtc", + .num_resources = ARRAY_SIZE(mt6323_rtc_resources), + .resources = mt6323_rtc_resources, + .of_compatible = "mediatek,mt6323-rtc", + }, { .name = "mt6323-regulator", .of_compatible = "mediatek,mt6323-regulator" }, { @@ -46,6 +66,11 @@ static const struct mfd_cell mt6323_devs[] = { .num_resources = ARRAY_SIZE(mt6323_keys_resources), .resources = mt6323_keys_resources, .of_compatible = "mediatek,mt6323-keys" + }, { + .name = "mt6323-pwrc", + .num_resources = ARRAY_SIZE(mt6323_pwrc_resources), + .resources = mt6323_pwrc_resources, + .of_compatible = "mediatek,mt6323-pwrc" }, }; From c709bf455d60662377a47686180c0ac5c65f4d07 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 30 Aug 2019 18:12:36 +0200 Subject: [PATCH 227/721] microblaze: remove ioremap_fullcache No callers of this function. Signed-off-by: Christoph Hellwig Signed-off-by: Michal Simek --- arch/microblaze/include/asm/io.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/microblaze/include/asm/io.h b/arch/microblaze/include/asm/io.h index c7968139486f..86c95b2a1ce1 100644 --- a/arch/microblaze/include/asm/io.h +++ b/arch/microblaze/include/asm/io.h @@ -40,7 +40,6 @@ extern void iounmap(volatile void __iomem *addr); extern void __iomem *ioremap(phys_addr_t address, unsigned long size); #define ioremap_nocache(addr, size) ioremap((addr), (size)) -#define ioremap_fullcache(addr, size) ioremap((addr), (size)) #define ioremap_wc(addr, size) ioremap((addr), (size)) #define ioremap_wt(addr, size) ioremap((addr), (size)) From bcd69da98e36afccb4cb5fbdbd2b7bb78c07184d Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 13 Aug 2019 13:58:53 +0200 Subject: [PATCH 228/721] video: backlight: Drop default m for {LCD,BACKLIGHT_CLASS_DEVICE} When running "make oldconfig" on a .config where CONFIG_BACKLIGHT_LCD_SUPPORT is not set, two new config options ("Lowlevel LCD controls" and "Lowlevel Backlight controls") appear, both defaulting to "m". Drop the "default m", as options should default to disabled, and because several driver config options already select LCD_CLASS_DEVICE or BACKLIGHT_CLASS_DEVICE when needed. Fixes: 8c5dc8d9f19c7992 ("video: backlight: Remove useless BACKLIGHT_LCD_SUPPORT kernel symbol") Signed-off-by: Geert Uytterhoeven Reviewed-by: Daniel Thompson Signed-off-by: Lee Jones --- drivers/video/backlight/Kconfig | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/video/backlight/Kconfig b/drivers/video/backlight/Kconfig index 8b081d61773e..40676be2e46a 100644 --- a/drivers/video/backlight/Kconfig +++ b/drivers/video/backlight/Kconfig @@ -10,7 +10,6 @@ menu "Backlight & LCD device support" # config LCD_CLASS_DEVICE tristate "Lowlevel LCD controls" - default m help This framework adds support for low-level control of LCD. Some framebuffer devices connect to platform-specific LCD modules @@ -143,7 +142,6 @@ endif # LCD_CLASS_DEVICE # config BACKLIGHT_CLASS_DEVICE tristate "Lowlevel Backlight controls" - default m help This framework adds support for low-level control of the LCD backlight. This includes support for brightness and power. From 28a1d72a221ecdf8f63205d40cb654c81a2b2da7 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 20 Aug 2019 17:34:39 +0200 Subject: [PATCH 229/721] video: backlight: tosa_lcd: drop check because i2c_unregister_device() is NULL safe No need to check the argument of i2c_unregister_device() because the function itself does it. Signed-off-by: Wolfram Sang Reviewed-by: Daniel Thompson Signed-off-by: Lee Jones --- drivers/video/backlight/tosa_lcd.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/video/backlight/tosa_lcd.c b/drivers/video/backlight/tosa_lcd.c index 65cb7578776f..29af8e27b6e5 100644 --- a/drivers/video/backlight/tosa_lcd.c +++ b/drivers/video/backlight/tosa_lcd.c @@ -222,8 +222,7 @@ static int tosa_lcd_remove(struct spi_device *spi) { struct tosa_lcd_data *data = spi_get_drvdata(spi); - if (data->i2c) - i2c_unregister_device(data->i2c); + i2c_unregister_device(data->i2c); tosa_lcd_tg_off(data); From ec665b756e6f79c60078b00dbdabea3aa8a4b787 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Wed, 31 Jul 2019 11:40:18 +0300 Subject: [PATCH 230/721] backlight: gpio-backlight: Correct initial power state handling The default-on property - or the def_value via legacy pdata) should be handled as: if it is 1, the backlight must be enabled (kept enabled) if it is 0, the backlight must be disabled (kept disabled) This only works for the case when default-on is set. If it is not set then the brightness of the backlight is set to 0. Now if the backlight is enabled by external driver (graphics) the backlight will stay disabled since the brightness is configured as 0. The backlight will not turn on. In order to minimize screen flickering during device boot: The initial brightness should be set to 1. If booted in non DT mode or no phandle link to the backlight node: follow the def_value/default-on to select UNBLANK or POWERDOWN If in DT boot we have phandle link then leave the GPIO in a state which the bootloader left it and let the user of the backlight to configure it further. Signed-off-by: Peter Ujfalusi Reviewed-by: Daniel Thompson Signed-off-by: Lee Jones --- drivers/video/backlight/gpio_backlight.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/drivers/video/backlight/gpio_backlight.c b/drivers/video/backlight/gpio_backlight.c index e84f3087e29f..18e053e4716c 100644 --- a/drivers/video/backlight/gpio_backlight.c +++ b/drivers/video/backlight/gpio_backlight.c @@ -59,13 +59,11 @@ static int gpio_backlight_probe_dt(struct platform_device *pdev, struct gpio_backlight *gbl) { struct device *dev = &pdev->dev; - enum gpiod_flags flags; int ret; gbl->def_value = device_property_read_bool(dev, "default-on"); - flags = gbl->def_value ? GPIOD_OUT_HIGH : GPIOD_OUT_LOW; - gbl->gpiod = devm_gpiod_get(dev, NULL, flags); + gbl->gpiod = devm_gpiod_get(dev, NULL, GPIOD_ASIS); if (IS_ERR(gbl->gpiod)) { ret = PTR_ERR(gbl->gpiod); @@ -79,6 +77,22 @@ static int gpio_backlight_probe_dt(struct platform_device *pdev, return 0; } +static int gpio_backlight_initial_power_state(struct gpio_backlight *gbl) +{ + struct device_node *node = gbl->dev->of_node; + + /* Not booted with device tree or no phandle link to the node */ + if (!node || !node->phandle) + return gbl->def_value ? FB_BLANK_UNBLANK : FB_BLANK_POWERDOWN; + + /* if the enable GPIO is disabled, do not enable the backlight */ + if (gpiod_get_value_cansleep(gbl->gpiod) == 0) + return FB_BLANK_POWERDOWN; + + return FB_BLANK_UNBLANK; +} + + static int gpio_backlight_probe(struct platform_device *pdev) { struct gpio_backlight_platform_data *pdata = @@ -136,7 +150,9 @@ static int gpio_backlight_probe(struct platform_device *pdev) return PTR_ERR(bl); } - bl->props.brightness = gbl->def_value; + bl->props.power = gpio_backlight_initial_power_state(gbl); + bl->props.brightness = 1; + backlight_update_status(bl); platform_set_drvdata(pdev, bl); From 6451e123dec34e295cb0affcd48f671275bdb671 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Tue, 9 Jul 2019 12:00:04 -0700 Subject: [PATCH 231/721] MAINTAINERS: Add entry for stable backlight sysfs ABI documentation Add an entry for the stable backlight sysfs ABI to the MAINTAINERS file. Signed-off-by: Matthias Kaehlcke Acked-by: Daniel Thompson Signed-off-by: Lee Jones --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 783569e3c4b4..5d7bb3024e13 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2906,6 +2906,7 @@ F: drivers/video/backlight/ F: include/linux/backlight.h F: include/linux/pwm_backlight.h F: Documentation/devicetree/bindings/leds/backlight +F: Documentation/ABI/stable/sysfs-class-backlight BATMAN ADVANCED M: Marek Lindner From d55c028f8b25bdaaba9ae08026052b5b44d031b0 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Tue, 9 Jul 2019 12:00:05 -0700 Subject: [PATCH 232/721] backlight: Expose brightness curve type through sysfs Backlight brightness curves can have different shapes. The two main types are linear and non-linear curves. The human eye doesn't perceive linearly increasing/decreasing brightness as linear (see also 88ba95bedb79 "backlight: pwm_bl: Compute brightness of LED linearly to human eye"), hence many backlights use non-linear (often logarithmic) brightness curves. The type of curve currently is opaque to userspace, so userspace often uses more or less reliable heuristics (like the number of brightness levels) to decide whether to treat a backlight device as linear or non-linear. Export the type of the brightness curve via the new sysfs attribute 'scale'. The value of the attribute can be 'linear', 'non-linear' or 'unknown'. For devices that don't provide information about the scale of their brightness curve the value of the 'scale' attribute is 'unknown'. Signed-off-by: Matthias Kaehlcke Reviewed-by: Daniel Thompson Signed-off-by: Lee Jones --- .../ABI/testing/sysfs-class-backlight | 26 +++++++++++++++++++ MAINTAINERS | 1 + drivers/video/backlight/backlight.c | 19 ++++++++++++++ include/linux/backlight.h | 8 ++++++ 4 files changed, 54 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-class-backlight diff --git a/Documentation/ABI/testing/sysfs-class-backlight b/Documentation/ABI/testing/sysfs-class-backlight new file mode 100644 index 000000000000..3ab175a3f5cb --- /dev/null +++ b/Documentation/ABI/testing/sysfs-class-backlight @@ -0,0 +1,26 @@ +What: /sys/class/backlight//scale +Date: July 2019 +KernelVersion: 5.4 +Contact: Daniel Thompson +Description: + Description of the scale of the brightness curve. + + The human eye senses brightness approximately logarithmically, + hence linear changes in brightness are perceived as being + non-linear. To achieve a linear perception of brightness changes + controls like sliders need to apply a logarithmic mapping for + backlights with a linear brightness curve. + + Possible values of the attribute are: + + unknown + The scale of the brightness curve is unknown. + + linear + The brightness changes linearly with each step. Brightness + controls should apply a logarithmic mapping for a linear + perception. + + non-linear + The brightness changes non-linearly with each step. Brightness + controls should use a linear mapping for a linear perception. diff --git a/MAINTAINERS b/MAINTAINERS index 5d7bb3024e13..fe75bb96a317 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2907,6 +2907,7 @@ F: include/linux/backlight.h F: include/linux/pwm_backlight.h F: Documentation/devicetree/bindings/leds/backlight F: Documentation/ABI/stable/sysfs-class-backlight +F: Documentation/ABI/testing/sysfs-class-backlight BATMAN ADVANCED M: Marek Lindner diff --git a/drivers/video/backlight/backlight.c b/drivers/video/backlight/backlight.c index 5dc07106a59e..cac3e35d7630 100644 --- a/drivers/video/backlight/backlight.c +++ b/drivers/video/backlight/backlight.c @@ -32,6 +32,12 @@ static const char *const backlight_types[] = { [BACKLIGHT_FIRMWARE] = "firmware", }; +static const char *const backlight_scale_types[] = { + [BACKLIGHT_SCALE_UNKNOWN] = "unknown", + [BACKLIGHT_SCALE_LINEAR] = "linear", + [BACKLIGHT_SCALE_NON_LINEAR] = "non-linear", +}; + #if defined(CONFIG_FB) || (defined(CONFIG_FB_MODULE) && \ defined(CONFIG_BACKLIGHT_CLASS_DEVICE_MODULE)) /* This callback gets called when something important happens inside a @@ -246,6 +252,18 @@ static ssize_t actual_brightness_show(struct device *dev, } static DEVICE_ATTR_RO(actual_brightness); +static ssize_t scale_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct backlight_device *bd = to_backlight_device(dev); + + if (WARN_ON(bd->props.scale > BACKLIGHT_SCALE_NON_LINEAR)) + return sprintf(buf, "unknown\n"); + + return sprintf(buf, "%s\n", backlight_scale_types[bd->props.scale]); +} +static DEVICE_ATTR_RO(scale); + static struct class *backlight_class; #ifdef CONFIG_PM_SLEEP @@ -292,6 +310,7 @@ static struct attribute *bl_device_attrs[] = { &dev_attr_brightness.attr, &dev_attr_actual_brightness.attr, &dev_attr_max_brightness.attr, + &dev_attr_scale.attr, &dev_attr_type.attr, NULL, }; diff --git a/include/linux/backlight.h b/include/linux/backlight.h index 0b5897446dca..c7d6b2e8c3b5 100644 --- a/include/linux/backlight.h +++ b/include/linux/backlight.h @@ -46,6 +46,12 @@ enum backlight_notification { BACKLIGHT_UNREGISTERED, }; +enum backlight_scale { + BACKLIGHT_SCALE_UNKNOWN = 0, + BACKLIGHT_SCALE_LINEAR, + BACKLIGHT_SCALE_NON_LINEAR, +}; + struct backlight_device; struct fb_info; @@ -80,6 +86,8 @@ struct backlight_properties { enum backlight_type type; /* Flags used to signal drivers of state changes */ unsigned int state; + /* Type of the brightness scale (linear, non-linear, ...) */ + enum backlight_scale scale; #define BL_CORE_SUSPENDED (1 << 0) /* backlight is suspended */ #define BL_CORE_FBBLANK (1 << 1) /* backlight is under an fb blank event */ From 511a204638d7d750f859c332635d09f38273b4f0 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Tue, 9 Jul 2019 12:00:06 -0700 Subject: [PATCH 233/721] backlight: pwm_bl: Set scale type for CIE 1931 curves For backlight curves calculated with the CIE 1931 algorithm set the brightness scale type to non-linear. This makes the scale type available to userspace via the 'scale' sysfs attribute. Signed-off-by: Matthias Kaehlcke Tested-by: Enric Balletbo i Serra Acked-by: Daniel Thompson Signed-off-by: Lee Jones --- drivers/video/backlight/pwm_bl.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/video/backlight/pwm_bl.c b/drivers/video/backlight/pwm_bl.c index 2201b8c78641..d6d2b6407d7e 100644 --- a/drivers/video/backlight/pwm_bl.c +++ b/drivers/video/backlight/pwm_bl.c @@ -536,6 +536,8 @@ static int pwm_backlight_probe(struct platform_device *pdev) goto err_alloc; } + memset(&props, 0, sizeof(struct backlight_properties)); + if (data->levels) { /* * For the DT case, only when brightness levels is defined @@ -574,6 +576,8 @@ static int pwm_backlight_probe(struct platform_device *pdev) pb->levels = data->levels; } + + props.scale = BACKLIGHT_SCALE_NON_LINEAR; } else { /* * That only happens for the non-DT case, where platform data @@ -584,7 +588,6 @@ static int pwm_backlight_probe(struct platform_device *pdev) pb->lth_brightness = data->lth_brightness * (state.period / pb->scale); - memset(&props, 0, sizeof(struct backlight_properties)); props.type = BACKLIGHT_RAW; props.max_brightness = data->max_brightness; bl = backlight_device_register(dev_name(&pdev->dev), &pdev->dev, pb, From c0b64faf0fe6ca2574a00faed1ae833130db4e08 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Tue, 9 Jul 2019 12:00:07 -0700 Subject: [PATCH 234/721] backlight: pwm_bl: Set scale type for brightness curves specified in the DT Check if a brightness curve specified in the device tree is linear or not and set the corresponding property accordingly. This makes the scale type available to userspace via the 'scale' sysfs attribute. To determine if a curve is linear it is compared to a interpolated linear curve between min and max brightness. The curve is considered linear if no value deviates more than +/-5% of ${brightness_range} from their interpolated value. Signed-off-by: Matthias Kaehlcke Acked-by: Daniel Thompson Signed-off-by: Lee Jones --- drivers/video/backlight/pwm_bl.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/drivers/video/backlight/pwm_bl.c b/drivers/video/backlight/pwm_bl.c index d6d2b6407d7e..746eebc411df 100644 --- a/drivers/video/backlight/pwm_bl.c +++ b/drivers/video/backlight/pwm_bl.c @@ -387,6 +387,31 @@ int pwm_backlight_brightness_default(struct device *dev, } #endif +static bool pwm_backlight_is_linear(struct platform_pwm_backlight_data *data) +{ + unsigned int nlevels = data->max_brightness + 1; + unsigned int min_val = data->levels[0]; + unsigned int max_val = data->levels[nlevels - 1]; + /* + * Multiplying by 128 means that even in pathological cases such + * as (max_val - min_val) == nlevels the error at max_val is less + * than 1%. + */ + unsigned int slope = (128 * (max_val - min_val)) / nlevels; + unsigned int margin = (max_val - min_val) / 20; /* 5% */ + int i; + + for (i = 1; i < nlevels; i++) { + unsigned int linear_value = min_val + ((i * slope) / 128); + unsigned int delta = abs(linear_value - data->levels[i]); + + if (delta > margin) + return false; + } + + return true; +} + static int pwm_backlight_initial_power_state(const struct pwm_bl_data *pb) { struct device_node *node = pb->dev->of_node; @@ -550,6 +575,11 @@ static int pwm_backlight_probe(struct platform_device *pdev) pb->levels = data->levels; } + + if (pwm_backlight_is_linear(data)) + props.scale = BACKLIGHT_SCALE_LINEAR; + else + props.scale = BACKLIGHT_SCALE_NON_LINEAR; } else if (!data->max_brightness) { /* * If no brightness levels are provided and max_brightness is From 345f0254e5b2f4090e4a00ebc996e07e9bdcd070 Mon Sep 17 00:00:00 2001 From: Maya Nakamura Date: Fri, 12 Jul 2019 08:27:38 +0000 Subject: [PATCH 235/721] HID: hv: Remove dependencies on PAGE_SIZE for ring buffer Define the ring buffer size as a constant expression because it should not depend on the guest page size. Signed-off-by: Maya Nakamura Reviewed-by: Michael Kelley Acked-by: Jiri Kosina Signed-off-by: Sasha Levin --- drivers/hid/hid-hyperv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/hid/hid-hyperv.c b/drivers/hid/hid-hyperv.c index 7795831d37c2..cc5b09b87ab0 100644 --- a/drivers/hid/hid-hyperv.c +++ b/drivers/hid/hid-hyperv.c @@ -104,8 +104,8 @@ struct synthhid_input_report { #pragma pack(pop) -#define INPUTVSC_SEND_RING_BUFFER_SIZE (10*PAGE_SIZE) -#define INPUTVSC_RECV_RING_BUFFER_SIZE (10*PAGE_SIZE) +#define INPUTVSC_SEND_RING_BUFFER_SIZE (40 * 1024) +#define INPUTVSC_RECV_RING_BUFFER_SIZE (40 * 1024) enum pipe_prot_msg_type { From 2acf40f0454d41b8d51c95d317283c20c931164d Mon Sep 17 00:00:00 2001 From: Sebastian Parschauer Date: Mon, 2 Sep 2019 12:39:30 +0200 Subject: [PATCH 236/721] HID: Add quirk for HP X500 PIXART OEM mouse The PixArt OEM mice are known for disconnecting every minute in runlevel 1 or 3 if they are not always polled. So add quirk ALWAYS_POLL for this one as well. Ville Viinikka (viinikv) reported and tested the quirk. Link: https://github.com/sriemer/fix-linux-mouse issue 15 Signed-off-by: Sebastian Parschauer CC: stable@vger.kernel.org # v4.16+ Signed-off-by: Benjamin Tissoires --- drivers/hid/hid-ids.h | 1 + drivers/hid/hid-quirks.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 0a00be19f7a0..e4d51ce20a6a 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -568,6 +568,7 @@ #define USB_PRODUCT_ID_HP_LOGITECH_OEM_USB_OPTICAL_MOUSE_0B4A 0x0b4a #define USB_PRODUCT_ID_HP_PIXART_OEM_USB_OPTICAL_MOUSE 0x134a #define USB_PRODUCT_ID_HP_PIXART_OEM_USB_OPTICAL_MOUSE_094A 0x094a +#define USB_PRODUCT_ID_HP_PIXART_OEM_USB_OPTICAL_MOUSE_0941 0x0941 #define USB_PRODUCT_ID_HP_PIXART_OEM_USB_OPTICAL_MOUSE_0641 0x0641 #define USB_VENDOR_ID_HUION 0x256c diff --git a/drivers/hid/hid-quirks.c b/drivers/hid/hid-quirks.c index 166f41f3173b..c50bcd967d99 100644 --- a/drivers/hid/hid-quirks.c +++ b/drivers/hid/hid-quirks.c @@ -92,6 +92,7 @@ static const struct hid_device_id hid_quirks[] = { { HID_USB_DEVICE(USB_VENDOR_ID_HP, USB_PRODUCT_ID_HP_LOGITECH_OEM_USB_OPTICAL_MOUSE_0B4A), HID_QUIRK_ALWAYS_POLL }, { HID_USB_DEVICE(USB_VENDOR_ID_HP, USB_PRODUCT_ID_HP_PIXART_OEM_USB_OPTICAL_MOUSE), HID_QUIRK_ALWAYS_POLL }, { HID_USB_DEVICE(USB_VENDOR_ID_HP, USB_PRODUCT_ID_HP_PIXART_OEM_USB_OPTICAL_MOUSE_094A), HID_QUIRK_ALWAYS_POLL }, + { HID_USB_DEVICE(USB_VENDOR_ID_HP, USB_PRODUCT_ID_HP_PIXART_OEM_USB_OPTICAL_MOUSE_0941), HID_QUIRK_ALWAYS_POLL }, { HID_USB_DEVICE(USB_VENDOR_ID_HP, USB_PRODUCT_ID_HP_PIXART_OEM_USB_OPTICAL_MOUSE_0641), HID_QUIRK_ALWAYS_POLL }, { HID_USB_DEVICE(USB_VENDOR_ID_IDEACOM, USB_DEVICE_ID_IDEACOM_IDC6680), HID_QUIRK_MULTI_INPUT }, { HID_USB_DEVICE(USB_VENDOR_ID_INNOMEDIA, USB_DEVICE_ID_INNEX_GENESIS_ATARI), HID_QUIRK_MULTI_INPUT }, From f7c4f737ced2b9e0ba412ed7be9683f80c247bd2 Mon Sep 17 00:00:00 2001 From: Bastien Nocera Date: Tue, 2 Jul 2019 10:39:31 +0200 Subject: [PATCH 237/721] HID: sb0540: add support for Creative SB0540 IR receivers Add a new hid driver for the Creative SB0540 IR receiver. This receiver is usually coupled with an RM-1500 or an RM-1800 remote control. The scrollwheels on the RM-1800 remote are not bound, as they are labelled for specific audio controls that don't usually exist on most systems. They can be remapped using standard Linux keyboard remapping tools. Signed-off-by: Bastien Nocera Signed-off-by: Benjamin Tissoires --- MAINTAINERS | 6 + drivers/hid/Kconfig | 9 + drivers/hid/Makefile | 1 + drivers/hid/hid-creative-sb0540.c | 268 ++++++++++++++++++++++++++++++ drivers/hid/hid-ids.h | 1 + 5 files changed, 285 insertions(+) create mode 100644 drivers/hid/hid-creative-sb0540.c diff --git a/MAINTAINERS b/MAINTAINERS index 08176d64eed5..2c59928d5b79 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4308,6 +4308,12 @@ S: Maintained F: Documentation/filesystems/cramfs.txt F: fs/cramfs/ +CREATIVE SB0540 +M: Bastien Nocera +L: linux-input@vger.kernel.org +S: Maintained +F: drivers/hid/hid-creative-sb0540.c + CRYPTO API M: Herbert Xu M: "David S. Miller" diff --git a/drivers/hid/Kconfig b/drivers/hid/Kconfig index 3872e03d9a59..a70999f9c639 100644 --- a/drivers/hid/Kconfig +++ b/drivers/hid/Kconfig @@ -273,6 +273,15 @@ config HID_CP2112 and gpiochip to expose these functions of the CP2112. The customizable USB descriptor fields are exposed as sysfs attributes. +config HID_CREATIVE_SB0540 + tristate "Creative SB0540 infrared receiver" + depends on USB_HID + help + Support for Creative infrared SB0540-compatible remote controls, such + as the RM-1500 and RM-1800 remotes. + + Say Y here if you want support for Creative SB0540 infrared receiver. + config HID_CYPRESS tristate "Cypress mouse and barcode readers" depends on HID diff --git a/drivers/hid/Makefile b/drivers/hid/Makefile index cc5d827c9164..0c03308cfb08 100644 --- a/drivers/hid/Makefile +++ b/drivers/hid/Makefile @@ -27,6 +27,7 @@ obj-$(CONFIG_HID_ALPS) += hid-alps.o obj-$(CONFIG_HID_ACRUX) += hid-axff.o obj-$(CONFIG_HID_APPLE) += hid-apple.o obj-$(CONFIG_HID_APPLEIR) += hid-appleir.o +obj-$(CONFIG_HID_CREATIVE_SB0540) += hid-creative-sb0540.o obj-$(CONFIG_HID_ASUS) += hid-asus.o obj-$(CONFIG_HID_AUREAL) += hid-aureal.o obj-$(CONFIG_HID_BELKIN) += hid-belkin.o diff --git a/drivers/hid/hid-creative-sb0540.c b/drivers/hid/hid-creative-sb0540.c new file mode 100644 index 000000000000..b4c8e7a5d3e0 --- /dev/null +++ b/drivers/hid/hid-creative-sb0540.c @@ -0,0 +1,268 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * HID driver for the Creative SB0540 receiver + * + * Copyright (C) 2019 Red Hat Inc. All Rights Reserved + * + */ + +#include +#include +#include +#include "hid-ids.h" + +MODULE_AUTHOR("Bastien Nocera "); +MODULE_DESCRIPTION("HID Creative SB0540 receiver"); +MODULE_LICENSE("GPL"); + +static const unsigned short creative_sb0540_key_table[] = { + KEY_POWER, + KEY_RESERVED, /* text: 24bit */ + KEY_RESERVED, /* 24bit wheel up */ + KEY_RESERVED, /* 24bit wheel down */ + KEY_RESERVED, /* text: CMSS */ + KEY_RESERVED, /* CMSS wheel Up */ + KEY_RESERVED, /* CMSS wheel Down */ + KEY_RESERVED, /* text: EAX */ + KEY_RESERVED, /* EAX wheel up */ + KEY_RESERVED, /* EAX wheel down */ + KEY_RESERVED, /* text: 3D Midi */ + KEY_RESERVED, /* 3D Midi wheel up */ + KEY_RESERVED, /* 3D Midi wheel down */ + KEY_MUTE, + KEY_VOLUMEUP, + KEY_VOLUMEDOWN, + KEY_UP, + KEY_LEFT, + KEY_RIGHT, + KEY_REWIND, + KEY_OK, + KEY_FASTFORWARD, + KEY_DOWN, + KEY_AGAIN, /* text: Return, symbol: Jump to */ + KEY_PLAY, /* text: Start */ + KEY_ESC, /* text: Cancel */ + KEY_RECORD, + KEY_OPTION, + KEY_MENU, /* text: Display */ + KEY_PREVIOUS, + KEY_PLAYPAUSE, + KEY_NEXT, + KEY_SLOW, + KEY_STOP, + KEY_NUMERIC_1, + KEY_NUMERIC_2, + KEY_NUMERIC_3, + KEY_NUMERIC_4, + KEY_NUMERIC_5, + KEY_NUMERIC_6, + KEY_NUMERIC_7, + KEY_NUMERIC_8, + KEY_NUMERIC_9, + KEY_NUMERIC_0 +}; + +/* + * Codes and keys from lirc's + * remotes/creative/lircd.conf.alsa_usb + * order and size must match creative_sb0540_key_table[] above + */ +static const unsigned short creative_sb0540_codes[] = { + 0x619E, + 0x916E, + 0x926D, + 0x936C, + 0x718E, + 0x946B, + 0x956A, + 0x8C73, + 0x9669, + 0x9768, + 0x9867, + 0x9966, + 0x9A65, + 0x6E91, + 0x629D, + 0x639C, + 0x7B84, + 0x6B94, + 0x728D, + 0x8778, + 0x817E, + 0x758A, + 0x8D72, + 0x8E71, + 0x8877, + 0x7C83, + 0x738C, + 0x827D, + 0x7689, + 0x7F80, + 0x7986, + 0x7A85, + 0x7D82, + 0x857A, + 0x8B74, + 0x8F70, + 0x906F, + 0x8A75, + 0x847B, + 0x7887, + 0x8976, + 0x837C, + 0x7788, + 0x807F +}; + +struct creative_sb0540 { + struct input_dev *input_dev; + struct hid_device *hid; + unsigned short keymap[ARRAY_SIZE(creative_sb0540_key_table)]; +}; + +static inline u64 reverse(u64 data, int bits) +{ + int i; + u64 c; + + c = 0; + for (i = 0; i < bits; i++) { + c |= (u64) (((data & (((u64) 1) << i)) ? 1 : 0)) + << (bits - 1 - i); + } + return (c); +} + +static int get_key(struct creative_sb0540 *creative_sb0540, u64 keycode) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(creative_sb0540_codes); i++) { + if (creative_sb0540_codes[i] == keycode) + return creative_sb0540->keymap[i]; + } + + return 0; + +} + +static int creative_sb0540_raw_event(struct hid_device *hid, + struct hid_report *report, u8 *data, int len) +{ + struct creative_sb0540 *creative_sb0540 = hid_get_drvdata(hid); + u64 code, main_code; + int key; + + if (len != 6) + return 0; + + /* From daemons/hw_hiddev.c sb0540_rec() in lirc */ + code = reverse(data[5], 8); + main_code = (code << 8) + ((~code) & 0xff); + + /* + * Flip to get values in the same format as + * remotes/creative/lircd.conf.alsa_usb in lirc + */ + main_code = ((main_code & 0xff) << 8) + + ((main_code & 0xff00) >> 8); + + key = get_key(creative_sb0540, main_code); + if (key == 0 || key == KEY_RESERVED) { + hid_err(hid, "Could not get a key for main_code %llX\n", + main_code); + return 0; + } + + input_report_key(creative_sb0540->input_dev, key, 1); + input_report_key(creative_sb0540->input_dev, key, 0); + input_sync(creative_sb0540->input_dev); + + /* let hidraw and hiddev handle the report */ + return 0; +} + +static int creative_sb0540_input_configured(struct hid_device *hid, + struct hid_input *hidinput) +{ + struct input_dev *input_dev = hidinput->input; + struct creative_sb0540 *creative_sb0540 = hid_get_drvdata(hid); + int i; + + creative_sb0540->input_dev = input_dev; + + input_dev->keycode = creative_sb0540->keymap; + input_dev->keycodesize = sizeof(unsigned short); + input_dev->keycodemax = ARRAY_SIZE(creative_sb0540->keymap); + + input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_REP); + + memcpy(creative_sb0540->keymap, creative_sb0540_key_table, + sizeof(creative_sb0540->keymap)); + for (i = 0; i < ARRAY_SIZE(creative_sb0540_key_table); i++) + set_bit(creative_sb0540->keymap[i], input_dev->keybit); + clear_bit(KEY_RESERVED, input_dev->keybit); + + return 0; +} + +static int creative_sb0540_input_mapping(struct hid_device *hid, + struct hid_input *hi, struct hid_field *field, + struct hid_usage *usage, unsigned long **bit, int *max) +{ + /* + * We are remapping the keys ourselves, so ignore the hid-input + * keymap processing. + */ + return -1; +} + +static int creative_sb0540_probe(struct hid_device *hid, + const struct hid_device_id *id) +{ + int ret; + struct creative_sb0540 *creative_sb0540; + + creative_sb0540 = devm_kzalloc(&hid->dev, + sizeof(struct creative_sb0540), GFP_KERNEL); + + if (!creative_sb0540) + return -ENOMEM; + + creative_sb0540->hid = hid; + + /* force input as some remotes bypass the input registration */ + hid->quirks |= HID_QUIRK_HIDINPUT_FORCE; + + hid_set_drvdata(hid, creative_sb0540); + + ret = hid_parse(hid); + if (ret) { + hid_err(hid, "parse failed\n"); + return ret; + } + + ret = hid_hw_start(hid, HID_CONNECT_DEFAULT); + if (ret) { + hid_err(hid, "hw start failed\n"); + return ret; + } + + return ret; +} + +static const struct hid_device_id creative_sb0540_devices[] = { + { HID_USB_DEVICE(USB_VENDOR_ID_CREATIVELABS, USB_DEVICE_ID_CREATIVE_SB0540) }, + { } +}; +MODULE_DEVICE_TABLE(hid, creative_sb0540_devices); + +static struct hid_driver creative_sb0540_driver = { + .name = "creative-sb0540", + .id_table = creative_sb0540_devices, + .raw_event = creative_sb0540_raw_event, + .input_configured = creative_sb0540_input_configured, + .probe = creative_sb0540_probe, + .input_mapping = creative_sb0540_input_mapping, +}; +module_hid_driver(creative_sb0540_driver); diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 0a00be19f7a0..b2d97333ff15 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -314,6 +314,7 @@ #define USB_VENDOR_ID_CREATIVELABS 0x041e #define USB_DEVICE_ID_CREATIVE_SB_OMNI_SURROUND_51 0x322c #define USB_DEVICE_ID_PRODIKEYS_PCMIDI 0x2801 +#define USB_DEVICE_ID_CREATIVE_SB0540 0x3100 #define USB_VENDOR_ID_CVTOUCH 0x1ff7 #define USB_DEVICE_ID_CVTOUCH_SCREEN 0x0013 From da23b6faa8bf0f1c50a0700440e9ff3f52b3bd9a Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Sat, 31 Aug 2019 17:24:01 +0300 Subject: [PATCH 238/721] watchdog: iTCO: Add support for Cannon Lake PCH iTCO In Intel Cannon Lake PCH the NO_REBOOT bit was moved from the private register space to be part of the TCO1_CNT register. For this reason introduce another version (6) that uses this register to set and clear NO_REBOOT bit. Signed-off-by: Mika Westerberg Acked-by: Guenter Roeck Reviewed-by: Jean Delvare Signed-off-by: Wolfram Sang --- drivers/watchdog/iTCO_wdt.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/drivers/watchdog/iTCO_wdt.c b/drivers/watchdog/iTCO_wdt.c index c559f706ae7e..156360e37714 100644 --- a/drivers/watchdog/iTCO_wdt.c +++ b/drivers/watchdog/iTCO_wdt.c @@ -48,6 +48,7 @@ /* Includes */ #include /* For ACPI support */ +#include /* For BIT() */ #include /* For module specific items */ #include /* For new moduleparam's */ #include /* For standard types (like size_t) */ @@ -215,6 +216,23 @@ static int update_no_reboot_bit_mem(void *priv, bool set) return 0; } +static int update_no_reboot_bit_cnt(void *priv, bool set) +{ + struct iTCO_wdt_private *p = priv; + u16 val, newval; + + val = inw(TCO1_CNT(p)); + if (set) + val |= BIT(0); + else + val &= ~BIT(0); + outw(val, TCO1_CNT(p)); + newval = inw(TCO1_CNT(p)); + + /* make sure the update is successful */ + return val != newval ? -EIO : 0; +} + static void iTCO_wdt_no_reboot_bit_setup(struct iTCO_wdt_private *p, struct itco_wdt_platform_data *pdata) { @@ -224,7 +242,9 @@ static void iTCO_wdt_no_reboot_bit_setup(struct iTCO_wdt_private *p, return; } - if (p->iTCO_version >= 2) + if (p->iTCO_version >= 6) + p->update_no_reboot_bit = update_no_reboot_bit_cnt; + else if (p->iTCO_version >= 2) p->update_no_reboot_bit = update_no_reboot_bit_mem; else if (p->iTCO_version == 1) p->update_no_reboot_bit = update_no_reboot_bit_pci; @@ -452,7 +472,8 @@ static int iTCO_wdt_probe(struct platform_device *pdev) * Get the Memory-Mapped GCS or PMC register, we need it for the * NO_REBOOT flag (TCO v2 and v3). */ - if (p->iTCO_version >= 2 && !pdata->update_no_reboot_bit) { + if (p->iTCO_version >= 2 && p->iTCO_version < 6 && + !pdata->update_no_reboot_bit) { p->gcs_pmc_res = platform_get_resource(pdev, IORESOURCE_MEM, ICH_RES_MEM_GCS_PMC); @@ -502,6 +523,7 @@ static int iTCO_wdt_probe(struct platform_device *pdev) /* Clear out the (probably old) status */ switch (p->iTCO_version) { + case 6: case 5: case 4: outw(0x0008, TCO1_STS(p)); /* Clear the Time Out Status bit */ From b84398d6d7f900805662b1619223fd644d862d7c Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Sat, 31 Aug 2019 17:24:02 +0300 Subject: [PATCH 239/721] i2c: i801: Use iTCO version 6 in Cannon Lake PCH and beyond Intel Cannon Lake PCH moved the NO_REBOOT bit to reside as part of the TCO registers instead so update the i2c-i801 driver so that for Cannon Lake and beyond register platform device for iTCO using version 6. The affected PCHs are Cannon Lake, Cedar Fork, Comet Lake, Elkhart Lake and Ice Lake. Signed-off-by: Mika Westerberg Reviewed-by: Jean Delvare Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-i801.c | 144 +++++++++++++++++++++------------- 1 file changed, 89 insertions(+), 55 deletions(-) diff --git a/drivers/i2c/busses/i2c-i801.c b/drivers/i2c/busses/i2c-i801.c index a6469978e735..38afd16c857a 100644 --- a/drivers/i2c/busses/i2c-i801.c +++ b/drivers/i2c/busses/i2c-i801.c @@ -292,7 +292,8 @@ struct i801_priv { #define FEATURE_HOST_NOTIFY BIT(5) /* Not really a feature, but it's convenient to handle it as such */ #define FEATURE_IDF BIT(15) -#define FEATURE_TCO BIT(16) +#define FEATURE_TCO_SPT BIT(16) +#define FEATURE_TCO_CNL BIT(17) static const char *i801_feature_names[] = { "SMBus PEC", @@ -1491,57 +1492,23 @@ static inline unsigned int i801_get_adapter_class(struct i801_priv *priv) } #endif -static const struct itco_wdt_platform_data tco_platform_data = { +static const struct itco_wdt_platform_data spt_tco_platform_data = { .name = "Intel PCH", .version = 4, }; static DEFINE_SPINLOCK(p2sb_spinlock); -static void i801_add_tco(struct i801_priv *priv) +static struct platform_device * +i801_add_tco_spt(struct i801_priv *priv, struct pci_dev *pci_dev, + struct resource *tco_res) { - struct pci_dev *pci_dev = priv->pci_dev; - struct resource tco_res[3], *res; - struct platform_device *pdev; + struct resource *res; unsigned int devfn; - u32 tco_base, tco_ctl; - u32 base_addr, ctrl_val; u64 base64_addr; + u32 base_addr; u8 hidden; - if (!(priv->features & FEATURE_TCO)) - return; - - pci_read_config_dword(pci_dev, TCOBASE, &tco_base); - pci_read_config_dword(pci_dev, TCOCTL, &tco_ctl); - if (!(tco_ctl & TCOCTL_EN)) - return; - - memset(tco_res, 0, sizeof(tco_res)); - - res = &tco_res[ICH_RES_IO_TCO]; - res->start = tco_base & ~1; - res->end = res->start + 32 - 1; - res->flags = IORESOURCE_IO; - - /* - * Power Management registers. - */ - devfn = PCI_DEVFN(PCI_SLOT(pci_dev->devfn), 2); - pci_bus_read_config_dword(pci_dev->bus, devfn, ACPIBASE, &base_addr); - - res = &tco_res[ICH_RES_IO_SMI]; - res->start = (base_addr & ~1) + ACPIBASE_SMI_OFF; - res->end = res->start + 3; - res->flags = IORESOURCE_IO; - - /* - * Enable the ACPI I/O space. - */ - pci_bus_read_config_dword(pci_dev->bus, devfn, ACPICTRL, &ctrl_val); - ctrl_val |= ACPICTRL_EN; - pci_bus_write_config_dword(pci_dev->bus, devfn, ACPICTRL, ctrl_val); - /* * We must access the NO_REBOOT bit over the Primary to Sideband * bridge (P2SB). The BIOS prevents the P2SB device from being @@ -1577,15 +1544,76 @@ static void i801_add_tco(struct i801_priv *priv) res->end = res->start + 3; res->flags = IORESOURCE_MEM; - pdev = platform_device_register_resndata(&pci_dev->dev, "iTCO_wdt", -1, - tco_res, 3, &tco_platform_data, - sizeof(tco_platform_data)); - if (IS_ERR(pdev)) { - dev_warn(&pci_dev->dev, "failed to create iTCO device\n"); - return; - } + return platform_device_register_resndata(&pci_dev->dev, "iTCO_wdt", -1, + tco_res, 3, &spt_tco_platform_data, + sizeof(spt_tco_platform_data)); +} - priv->tco_pdev = pdev; +static const struct itco_wdt_platform_data cnl_tco_platform_data = { + .name = "Intel PCH", + .version = 6, +}; + +static struct platform_device * +i801_add_tco_cnl(struct i801_priv *priv, struct pci_dev *pci_dev, + struct resource *tco_res) +{ + return platform_device_register_resndata(&pci_dev->dev, "iTCO_wdt", -1, + tco_res, 2, &cnl_tco_platform_data, + sizeof(cnl_tco_platform_data)); +} + +static void i801_add_tco(struct i801_priv *priv) +{ + u32 base_addr, tco_base, tco_ctl, ctrl_val; + struct pci_dev *pci_dev = priv->pci_dev; + struct resource tco_res[3], *res; + unsigned int devfn; + + /* If we have ACPI based watchdog use that instead */ + if (acpi_has_watchdog()) + return; + + if (!(priv->features & (FEATURE_TCO_SPT | FEATURE_TCO_CNL))) + return; + + pci_read_config_dword(pci_dev, TCOBASE, &tco_base); + pci_read_config_dword(pci_dev, TCOCTL, &tco_ctl); + if (!(tco_ctl & TCOCTL_EN)) + return; + + memset(tco_res, 0, sizeof(tco_res)); + + res = &tco_res[ICH_RES_IO_TCO]; + res->start = tco_base & ~1; + res->end = res->start + 32 - 1; + res->flags = IORESOURCE_IO; + + /* + * Power Management registers. + */ + devfn = PCI_DEVFN(PCI_SLOT(pci_dev->devfn), 2); + pci_bus_read_config_dword(pci_dev->bus, devfn, ACPIBASE, &base_addr); + + res = &tco_res[ICH_RES_IO_SMI]; + res->start = (base_addr & ~1) + ACPIBASE_SMI_OFF; + res->end = res->start + 3; + res->flags = IORESOURCE_IO; + + /* + * Enable the ACPI I/O space. + */ + pci_bus_read_config_dword(pci_dev->bus, devfn, ACPICTRL, &ctrl_val); + ctrl_val |= ACPICTRL_EN; + pci_bus_write_config_dword(pci_dev->bus, devfn, ACPICTRL, ctrl_val); + + if (priv->features & FEATURE_TCO_CNL) + priv->tco_pdev = i801_add_tco_cnl(priv, pci_dev, tco_res); + else + priv->tco_pdev = i801_add_tco_spt(priv, pci_dev, tco_res); + + if (IS_ERR(priv->tco_pdev)) + dev_warn(&pci_dev->dev, "failed to create iTCO device\n"); } #ifdef CONFIG_ACPI @@ -1695,13 +1723,21 @@ static int i801_probe(struct pci_dev *dev, const struct pci_device_id *id) switch (dev->device) { case PCI_DEVICE_ID_INTEL_SUNRISEPOINT_H_SMBUS: case PCI_DEVICE_ID_INTEL_SUNRISEPOINT_LP_SMBUS: - case PCI_DEVICE_ID_INTEL_CANNONLAKE_H_SMBUS: - case PCI_DEVICE_ID_INTEL_CANNONLAKE_LP_SMBUS: case PCI_DEVICE_ID_INTEL_LEWISBURG_SMBUS: case PCI_DEVICE_ID_INTEL_LEWISBURG_SSKU_SMBUS: - case PCI_DEVICE_ID_INTEL_CDF_SMBUS: case PCI_DEVICE_ID_INTEL_DNV_SMBUS: case PCI_DEVICE_ID_INTEL_KABYLAKE_PCH_H_SMBUS: + priv->features |= FEATURE_I2C_BLOCK_READ; + priv->features |= FEATURE_IRQ; + priv->features |= FEATURE_SMBUS_PEC; + priv->features |= FEATURE_BLOCK_BUFFER; + priv->features |= FEATURE_TCO_SPT; + priv->features |= FEATURE_HOST_NOTIFY; + break; + + case PCI_DEVICE_ID_INTEL_CANNONLAKE_H_SMBUS: + case PCI_DEVICE_ID_INTEL_CANNONLAKE_LP_SMBUS: + case PCI_DEVICE_ID_INTEL_CDF_SMBUS: case PCI_DEVICE_ID_INTEL_ICELAKE_LP_SMBUS: case PCI_DEVICE_ID_INTEL_COMETLAKE_SMBUS: case PCI_DEVICE_ID_INTEL_ELKHART_LAKE_SMBUS: @@ -1711,9 +1747,7 @@ static int i801_probe(struct pci_dev *dev, const struct pci_device_id *id) priv->features |= FEATURE_IRQ; priv->features |= FEATURE_SMBUS_PEC; priv->features |= FEATURE_BLOCK_BUFFER; - /* If we have ACPI based watchdog use that instead */ - if (!acpi_has_watchdog()) - priv->features |= FEATURE_TCO; + priv->features |= FEATURE_TCO_CNL; priv->features |= FEATURE_HOST_NOTIFY; break; From f8c274e4a70e7cdc43db56e2391c485c580b5a43 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 13 Aug 2019 13:55:55 +0200 Subject: [PATCH 240/721] i2c: hix5hd2: Remove IRQF_ONESHOT The drivers sets IRQF_ONESHOT and passes only a primary handler. The IRQ is masked while the primary is handler is invoked independently of IRQF_ONESHOT. With IRQF_ONESHOT the core code will not force-thread the interrupt and this is probably not intended. I *assume* that the original author copied the IRQ registration from another driver which passed a primary and secondary handler and removed the secondary handler but keeping the ONESHOT flag. Remove IRQF_ONESHOT. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-hix5hd2.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/i2c/busses/i2c-hix5hd2.c b/drivers/i2c/busses/i2c-hix5hd2.c index 4df1434b3597..8497c7a95dd4 100644 --- a/drivers/i2c/busses/i2c-hix5hd2.c +++ b/drivers/i2c/busses/i2c-hix5hd2.c @@ -445,8 +445,7 @@ static int hix5hd2_i2c_probe(struct platform_device *pdev) hix5hd2_i2c_init(priv); ret = devm_request_irq(&pdev->dev, irq, hix5hd2_i2c_irq, - IRQF_NO_SUSPEND | IRQF_ONESHOT, - dev_name(&pdev->dev), priv); + IRQF_NO_SUSPEND, dev_name(&pdev->dev), priv); if (ret != 0) { dev_err(&pdev->dev, "cannot request HS-I2C IRQ %d\n", irq); goto err_clk; From 7077ad2ee316551b4ba9602838d09b67683e20b6 Mon Sep 17 00:00:00 2001 From: Nishka Dasgupta Date: Mon, 19 Aug 2019 13:28:54 +0530 Subject: [PATCH 241/721] i2c: synquacer: Make synquacer_i2c_ops constant Static structure synquacer_i2c_ops, of type i2c_adapter, is only used when it is copied into a field of another structure. It is not itself modified. Hence make it const to protect it from unintended modification. Issue found with Coccinelle. Signed-off-by: Nishka Dasgupta Acked-by: Ard Biesheuvel Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-synquacer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-synquacer.c b/drivers/i2c/busses/i2c-synquacer.c index f724c8e6b360..39762f0611b1 100644 --- a/drivers/i2c/busses/i2c-synquacer.c +++ b/drivers/i2c/busses/i2c-synquacer.c @@ -526,7 +526,7 @@ static const struct i2c_algorithm synquacer_i2c_algo = { .functionality = synquacer_i2c_functionality, }; -static struct i2c_adapter synquacer_i2c_ops = { +static const struct i2c_adapter synquacer_i2c_ops = { .owner = THIS_MODULE, .name = "synquacer_i2c-adapter", .algo = &synquacer_i2c_algo, From 0a321b97368aa0567f238207eeb19cc8cbb6a7e2 Mon Sep 17 00:00:00 2001 From: Nishka Dasgupta Date: Sat, 6 Jul 2019 18:49:11 +0530 Subject: [PATCH 242/721] i2c: fsi: Add of_put_node() before break Each iteration of for_each_available_childe_of_node puts the previous node, but in the case of a break from the middle of the loop, there is no put, thus causing a memory leak. Add an of_node_put before the break. Issue found with Coccinelle. Signed-off-by: Nishka Dasgupta Reviewed-by: Eddie James Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-fsi.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-fsi.c b/drivers/i2c/busses/i2c-fsi.c index da5eb3960def..e0c256922d4f 100644 --- a/drivers/i2c/busses/i2c-fsi.c +++ b/drivers/i2c/busses/i2c-fsi.c @@ -707,8 +707,10 @@ static int fsi_i2c_probe(struct device *dev) continue; port = kzalloc(sizeof(*port), GFP_KERNEL); - if (!port) + if (!port) { + of_node_put(np); break; + } port->master = i2c; port->port = port_no; From aec256d0ecd561036f188dbc8fa7924c47a9edfd Mon Sep 17 00:00:00 2001 From: Joao Moreno Date: Tue, 3 Sep 2019 16:46:32 +0200 Subject: [PATCH 243/721] HID: apple: Fix stuck function keys when using FN This fixes an issue in which key down events for function keys would be repeatedly emitted even after the user has raised the physical key. For example, the driver fails to emit the F5 key up event when going through the following steps: - fnmode=1: hold FN, hold F5, release FN, release F5 - fnmode=2: hold F5, hold FN, release F5, release FN The repeated F5 key down events can be easily verified using xev. Signed-off-by: Joao Moreno Co-developed-by: Benjamin Tissoires Signed-off-by: Benjamin Tissoires --- drivers/hid/hid-apple.c | 45 ++++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/drivers/hid/hid-apple.c b/drivers/hid/hid-apple.c index 81df62f48c4c..6ac8becc2372 100644 --- a/drivers/hid/hid-apple.c +++ b/drivers/hid/hid-apple.c @@ -54,7 +54,6 @@ MODULE_PARM_DESC(swap_opt_cmd, "Swap the Option (\"Alt\") and Command (\"Flag\") struct apple_sc { unsigned long quirks; unsigned int fn_on; - DECLARE_BITMAP(pressed_fn, KEY_CNT); DECLARE_BITMAP(pressed_numlock, KEY_CNT); }; @@ -181,6 +180,8 @@ static int hidinput_apple_event(struct hid_device *hid, struct input_dev *input, { struct apple_sc *asc = hid_get_drvdata(hid); const struct apple_key_translation *trans, *table; + bool do_translate; + u16 code = 0; if (usage->code == KEY_FN) { asc->fn_on = !!value; @@ -189,8 +190,6 @@ static int hidinput_apple_event(struct hid_device *hid, struct input_dev *input, } if (fnmode) { - int do_translate; - if (hid->product >= USB_DEVICE_ID_APPLE_WELLSPRING4_ANSI && hid->product <= USB_DEVICE_ID_APPLE_WELLSPRING4A_JIS) table = macbookair_fn_keys; @@ -202,25 +201,33 @@ static int hidinput_apple_event(struct hid_device *hid, struct input_dev *input, trans = apple_find_translation (table, usage->code); if (trans) { - if (test_bit(usage->code, asc->pressed_fn)) - do_translate = 1; - else if (trans->flags & APPLE_FLAG_FKEY) - do_translate = (fnmode == 2 && asc->fn_on) || - (fnmode == 1 && !asc->fn_on); - else - do_translate = asc->fn_on; + if (test_bit(trans->from, input->key)) + code = trans->from; + else if (test_bit(trans->to, input->key)) + code = trans->to; - if (do_translate) { - if (value) - set_bit(usage->code, asc->pressed_fn); - else - clear_bit(usage->code, asc->pressed_fn); + if (!code) { + if (trans->flags & APPLE_FLAG_FKEY) { + switch (fnmode) { + case 1: + do_translate = !asc->fn_on; + break; + case 2: + do_translate = asc->fn_on; + break; + default: + /* should never happen */ + do_translate = false; + } + } else { + do_translate = asc->fn_on; + } - input_event(input, usage->type, trans->to, - value); - - return 1; + code = do_translate ? trans->to : trans->from; } + + input_event(input, usage->type, code, value); + return 1; } if (asc->quirks & APPLE_NUMLOCK_EMULATION && From b20bef4b4be8a0a031049cc552c52e9d4e70ef00 Mon Sep 17 00:00:00 2001 From: HungNien Chen Date: Fri, 30 Aug 2019 15:58:30 +0800 Subject: [PATCH 244/721] HID: i2c-hid: modify quirks for weida's devices This 'SET_PWR_WAKEUP_DEV' quirk only works for weida's devices with pid 0xC300 & 0xC301. Some weida's devices with other pids also need this quirk now. Use 'HID_ANY_ID' instead of 0xC300 to make all of weida's devices can be fixed on the power on issue. This modification should be safe since devices without power on issue will send the power on command only once. Signed-off-by: HungNien Chen Signed-off-by: Jiri Kosina --- drivers/hid/i2c-hid/i2c-hid-core.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/hid/i2c-hid/i2c-hid-core.c b/drivers/hid/i2c-hid/i2c-hid-core.c index 90164fed08d3..2a7c6e33bb1c 100644 --- a/drivers/hid/i2c-hid/i2c-hid-core.c +++ b/drivers/hid/i2c-hid/i2c-hid-core.c @@ -169,9 +169,7 @@ static const struct i2c_hid_quirks { __u16 idProduct; __u32 quirks; } i2c_hid_quirks[] = { - { USB_VENDOR_ID_WEIDA, USB_DEVICE_ID_WEIDA_8752, - I2C_HID_QUIRK_SET_PWR_WAKEUP_DEV }, - { USB_VENDOR_ID_WEIDA, USB_DEVICE_ID_WEIDA_8755, + { USB_VENDOR_ID_WEIDA, HID_ANY_ID, I2C_HID_QUIRK_SET_PWR_WAKEUP_DEV }, { I2C_VENDOR_ID_HANTICK, I2C_PRODUCT_ID_HANTICK_5288, I2C_HID_QUIRK_NO_IRQ_AFTER_RESET | From 0e3ff0ac5f71bdb6be2a698de0ed0c7e6e738269 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Thu, 29 Aug 2019 12:53:14 +0200 Subject: [PATCH 245/721] PCI: rockchip: Propagate errors for optional regulators regulator_get_optional() can fail for a number of reasons besides probe deferral. It can for example return -ENOMEM if it runs out of memory as it tries to allocate data structures. Propagating only -EPROBE_DEFER is problematic because it results in these legitimately fatal errors being treated as "regulator not specified in DT". What we really want is to ignore the optional regulators only if they have not been specified in DT. regulator_get_optional() returns -ENODEV in this case, so that's the special case that we need to handle. So we propagate all errors, except -ENODEV, so that real failures will still cause the driver to fail probe. Tested-by: Heiko Stuebner Signed-off-by: Thierry Reding Signed-off-by: Lorenzo Pieralisi Reviewed-by: Andrew Murray Reviewed-by: Heiko Stuebner Acked-by: Shawn Lin Cc: Shawn Lin Cc: Heiko Stuebner Cc: linux-rockchip@lists.infradead.org --- drivers/pci/controller/pcie-rockchip-host.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/pci/controller/pcie-rockchip-host.c b/drivers/pci/controller/pcie-rockchip-host.c index 8d20f1793a61..ef8e677ce9d1 100644 --- a/drivers/pci/controller/pcie-rockchip-host.c +++ b/drivers/pci/controller/pcie-rockchip-host.c @@ -608,29 +608,29 @@ static int rockchip_pcie_parse_host_dt(struct rockchip_pcie *rockchip) rockchip->vpcie12v = devm_regulator_get_optional(dev, "vpcie12v"); if (IS_ERR(rockchip->vpcie12v)) { - if (PTR_ERR(rockchip->vpcie12v) == -EPROBE_DEFER) - return -EPROBE_DEFER; + if (PTR_ERR(rockchip->vpcie12v) != -ENODEV) + return PTR_ERR(rockchip->vpcie12v); dev_info(dev, "no vpcie12v regulator found\n"); } rockchip->vpcie3v3 = devm_regulator_get_optional(dev, "vpcie3v3"); if (IS_ERR(rockchip->vpcie3v3)) { - if (PTR_ERR(rockchip->vpcie3v3) == -EPROBE_DEFER) - return -EPROBE_DEFER; + if (PTR_ERR(rockchip->vpcie3v3) != -ENODEV) + return PTR_ERR(rockchip->vpcie3v3); dev_info(dev, "no vpcie3v3 regulator found\n"); } rockchip->vpcie1v8 = devm_regulator_get_optional(dev, "vpcie1v8"); if (IS_ERR(rockchip->vpcie1v8)) { - if (PTR_ERR(rockchip->vpcie1v8) == -EPROBE_DEFER) - return -EPROBE_DEFER; + if (PTR_ERR(rockchip->vpcie1v8) != -ENODEV) + return PTR_ERR(rockchip->vpcie1v8); dev_info(dev, "no vpcie1v8 regulator found\n"); } rockchip->vpcie0v9 = devm_regulator_get_optional(dev, "vpcie0v9"); if (IS_ERR(rockchip->vpcie0v9)) { - if (PTR_ERR(rockchip->vpcie0v9) == -EPROBE_DEFER) - return -EPROBE_DEFER; + if (PTR_ERR(rockchip->vpcie0v9) != -ENODEV) + return PTR_ERR(rockchip->vpcie0v9); dev_info(dev, "no vpcie0v9 regulator found\n"); } From ddd6960087d4b45759434146d681a94bbb1c54ad Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Thu, 29 Aug 2019 12:53:15 +0200 Subject: [PATCH 246/721] PCI: exynos: Propagate errors for optional PHYs devm_of_phy_get() can fail for a number of reasons besides probe deferral. It can for example return -ENOMEM if it runs out of memory as it tries to allocate devres structures. Propagating only -EPROBE_DEFER is problematic because it results in these legitimately fatal errors being treated as "PHY not specified in DT". What we really want is to ignore the optional PHYs only if they have not been specified in DT. devm_of_phy_get() returns -ENODEV in this case, so that's the special case that we need to handle. So we propagate all errors, except -ENODEV, so that real failures will still cause the driver to fail probe. Signed-off-by: Thierry Reding Signed-off-by: Lorenzo Pieralisi Reviewed-by: Andrew Murray Cc: Jingoo Han Cc: Kukjin Kim Cc: Krzysztof Kozlowski --- drivers/pci/controller/dwc/pci-exynos.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/controller/dwc/pci-exynos.c b/drivers/pci/controller/dwc/pci-exynos.c index cee5f2f590e2..14a6ba4067fb 100644 --- a/drivers/pci/controller/dwc/pci-exynos.c +++ b/drivers/pci/controller/dwc/pci-exynos.c @@ -465,7 +465,7 @@ static int __init exynos_pcie_probe(struct platform_device *pdev) ep->phy = devm_of_phy_get(dev, np, NULL); if (IS_ERR(ep->phy)) { - if (PTR_ERR(ep->phy) == -EPROBE_DEFER) + if (PTR_ERR(ep->phy) != -ENODEV) return PTR_ERR(ep->phy); ep->phy = NULL; From 2170a09fb4b0f66e06e5bcdcbc98c9ccbf353650 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Thu, 29 Aug 2019 12:53:16 +0200 Subject: [PATCH 247/721] PCI: imx6: Propagate errors for optional regulators regulator_get_optional() can fail for a number of reasons besides probe deferral. It can for example return -ENOMEM if it runs out of memory as it tries to allocate data structures. Propagating only -EPROBE_DEFER is problematic because it results in these legitimately fatal errors being treated as "regulator not specified in DT". What we really want is to ignore the optional regulators only if they have not been specified in DT. regulator_get_optional() returns -ENODEV in this case, so that's the special case that we need to handle. So we propagate all errors, except -ENODEV, so that real failures will still cause the driver to fail probe. Signed-off-by: Thierry Reding Signed-off-by: Lorenzo Pieralisi Reviewed-by: Andrew Murray Cc: Richard Zhu Cc: Lucas Stach Cc: Shawn Guo Cc: Sascha Hauer Cc: Fabio Estevam Cc: kernel@pengutronix.de Cc: linux-imx@nxp.com --- drivers/pci/controller/dwc/pci-imx6.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pci/controller/dwc/pci-imx6.c b/drivers/pci/controller/dwc/pci-imx6.c index 9b5cb5b70389..aabf22eaa6b9 100644 --- a/drivers/pci/controller/dwc/pci-imx6.c +++ b/drivers/pci/controller/dwc/pci-imx6.c @@ -1173,8 +1173,8 @@ static int imx6_pcie_probe(struct platform_device *pdev) imx6_pcie->vpcie = devm_regulator_get_optional(&pdev->dev, "vpcie"); if (IS_ERR(imx6_pcie->vpcie)) { - if (PTR_ERR(imx6_pcie->vpcie) == -EPROBE_DEFER) - return -EPROBE_DEFER; + if (PTR_ERR(imx6_pcie->vpcie) != -ENODEV) + return PTR_ERR(imx6_pcie->vpcie); imx6_pcie->vpcie = NULL; } From e7a877b2fa79c57c3eac7d28803279a356844907 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Thu, 29 Aug 2019 12:53:17 +0200 Subject: [PATCH 248/721] PCI: armada8x: Propagate errors for optional PHYs devm_of_phy_get_by_index() can fail for a number of reasons besides probe deferral. It can for example return -ENOMEM if it runs out of memory as it tries to allocate devres structures. Propagating only -EPROBE_DEFER is problematic because it results in these legitimately fatal errors being treated as "PHY not specified in DT". What we really want is to ignore the optional PHYs only if they have not been specified in DT. devm_of_phy_get_by_index() returns -ENODEV in this case, so that's the special case that we need to handle. So we propagate all errors, except -ENODEV, so that real failures will still cause the driver to fail probe. Signed-off-by: Thierry Reding Signed-off-by: Lorenzo Pieralisi Reviewed-by: Andrew Murray Cc: Thomas Petazzoni --- drivers/pci/controller/dwc/pcie-armada8k.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-armada8k.c b/drivers/pci/controller/dwc/pcie-armada8k.c index 3d55dc78d999..49596547e8c2 100644 --- a/drivers/pci/controller/dwc/pcie-armada8k.c +++ b/drivers/pci/controller/dwc/pcie-armada8k.c @@ -118,11 +118,10 @@ static int armada8k_pcie_setup_phys(struct armada8k_pcie *pcie) for (i = 0; i < ARMADA8K_PCIE_MAX_LANES; i++) { pcie->phy[i] = devm_of_phy_get_by_index(dev, node, i); - if (IS_ERR(pcie->phy[i]) && - (PTR_ERR(pcie->phy[i]) == -EPROBE_DEFER)) - return PTR_ERR(pcie->phy[i]); - if (IS_ERR(pcie->phy[i])) { + if (PTR_ERR(pcie->phy[i]) != -ENODEV) + return PTR_ERR(pcie->phy[i]); + pcie->phy[i] = NULL; continue; } From 8f9e1641ba445437095411d9fda2324121110d5d Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Thu, 29 Aug 2019 12:53:18 +0200 Subject: [PATCH 249/721] PCI: histb: Propagate errors for optional regulators regulator_get_optional() can fail for a number of reasons besides probe deferral. It can for example return -ENOMEM if it runs out of memory as it tries to allocate data structures. Propagating only -EPROBE_DEFER is problematic because it results in these legitimately fatal errors being treated as "regulator not specified in DT". What we really want is to ignore the optional regulators only if they have not been specified in DT. regulator_get_optional() returns -ENODEV in this case, so that's the special case that we need to handle. So we propagate all errors, except -ENODEV, so that real failures will still cause the driver to fail probe. Signed-off-by: Thierry Reding Signed-off-by: Lorenzo Pieralisi Reviewed-by: Andrew Murray Cc: Shawn Guo --- drivers/pci/controller/dwc/pcie-histb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-histb.c b/drivers/pci/controller/dwc/pcie-histb.c index 954bc2b74bbc..811b5c6d62ea 100644 --- a/drivers/pci/controller/dwc/pcie-histb.c +++ b/drivers/pci/controller/dwc/pcie-histb.c @@ -340,8 +340,8 @@ static int histb_pcie_probe(struct platform_device *pdev) hipcie->vpcie = devm_regulator_get_optional(dev, "vpcie"); if (IS_ERR(hipcie->vpcie)) { - if (PTR_ERR(hipcie->vpcie) == -EPROBE_DEFER) - return -EPROBE_DEFER; + if (PTR_ERR(hipcie->vpcie) != -ENODEV) + return PTR_ERR(hipcie->vpcie); hipcie->vpcie = NULL; } From c1e33666c94fda365a321c471f2a82996fdf3d83 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Thu, 29 Aug 2019 12:53:19 +0200 Subject: [PATCH 250/721] PCI: iproc: Propagate errors for optional PHYs devm_phy_get() can fail for a number of reasons besides probe deferral. It can for example return -ENOMEM if it runs out of memory as it tries to allocate devres structures. Propagating only -EPROBE_DEFER is problematic because it results in these legitimately fatal errors being treated as "PHY not specified in DT". What we really want is to ignore the optional PHYs only if they have not been specified in DT. devm_phy_optional_get() is a function that exactly does what's required here, so use that instead. Signed-off-by: Thierry Reding Signed-off-by: Lorenzo Pieralisi Reviewed-by: Andrew Murray Cc: Ray Jui Cc: Scott Branden Cc: bcm-kernel-feedback-list@broadcom.com --- drivers/pci/controller/pcie-iproc-platform.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/pci/controller/pcie-iproc-platform.c b/drivers/pci/controller/pcie-iproc-platform.c index 5a3550b6bb29..9ee6200a66f4 100644 --- a/drivers/pci/controller/pcie-iproc-platform.c +++ b/drivers/pci/controller/pcie-iproc-platform.c @@ -93,12 +93,9 @@ static int iproc_pcie_pltfm_probe(struct platform_device *pdev) pcie->need_ib_cfg = of_property_read_bool(np, "dma-ranges"); /* PHY use is optional */ - pcie->phy = devm_phy_get(dev, "pcie-phy"); - if (IS_ERR(pcie->phy)) { - if (PTR_ERR(pcie->phy) == -EPROBE_DEFER) - return -EPROBE_DEFER; - pcie->phy = NULL; - } + pcie->phy = devm_phy_optional_get(dev, "pcie-phy"); + if (IS_ERR(pcie->phy)) + return PTR_ERR(pcie->phy); ret = devm_of_pci_get_host_bridge_resources(dev, 0, 0xff, &resources, &iobase); From 3675f052b43ba51b99b85b073c7070e083f3e6fb Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Thu, 4 Jul 2019 20:44:44 +0200 Subject: [PATCH 251/721] Smack: Don't ignore other bprm->unsafe flags if LSM_UNSAFE_PTRACE is set There is a logic bug in the current smack_bprm_set_creds(): If LSM_UNSAFE_PTRACE is set, but the ptrace state is deemed to be acceptable (e.g. because the ptracer detached in the meantime), the other ->unsafe flags aren't checked. As far as I can tell, this means that something like the following could work (but I haven't tested it): - task A: create task B with fork() - task B: set NO_NEW_PRIVS - task B: install a seccomp filter that makes open() return 0 under some conditions - task B: replace fd 0 with a malicious library - task A: attach to task B with PTRACE_ATTACH - task B: execve() a file with an SMACK64EXEC extended attribute - task A: while task B is still in the middle of execve(), exit (which destroys the ptrace relationship) Make sure that if any flags other than LSM_UNSAFE_PTRACE are set in bprm->unsafe, we reject the execve(). Cc: stable@vger.kernel.org Fixes: 5663884caab1 ("Smack: unify all ptrace accesses in the smack") Signed-off-by: Jann Horn Signed-off-by: Casey Schaufler --- security/smack/smack_lsm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 4c5e5a438f8b..d8b59480a01c 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -937,7 +937,8 @@ static int smack_bprm_set_creds(struct linux_binprm *bprm) if (rc != 0) return rc; - } else if (bprm->unsafe) + } + if (bprm->unsafe & ~LSM_UNSAFE_PTRACE) return -EPERM; bsp->smk_task = isp->smk_task; From a1a07f22346144d1e2108f9faa2a41fe67579e85 Mon Sep 17 00:00:00 2001 From: luanshi Date: Fri, 5 Jul 2019 10:35:20 +0800 Subject: [PATCH 252/721] smack: fix some kernel-doc notations Fix/add kernel-doc notation and fix typos in security/smack/. Signed-off-by: Liguang Zhang Signed-off-by: Casey Schaufler --- security/smack/smack_lsm.c | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index d8b59480a01c..336e855e7ab2 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -307,7 +307,7 @@ static struct smack_known *smk_fetch(const char *name, struct inode *ip, /** * init_inode_smack - initialize an inode security blob - * @isp: the blob to initialize + * @inode: inode to extract the info from * @skp: a pointer to the Smack label entry to use in the blob * */ @@ -509,7 +509,7 @@ static int smack_ptrace_traceme(struct task_struct *ptp) /** * smack_syslog - Smack approval on syslog - * @type: message type + * @typefrom_file: unused * * Returns 0 on success, error code otherwise. */ @@ -765,7 +765,7 @@ static int smack_sb_eat_lsm_opts(char *options, void **mnt_opts) /** * smack_set_mnt_opts - set Smack specific mount options * @sb: the file system superblock - * @opts: Smack mount options + * @mnt_opts: Smack mount options * @kern_flags: mount option from kernel space or user space * @set_kern_flags: where to store converted mount opts * @@ -959,7 +959,7 @@ static int smack_bprm_set_creds(struct linux_binprm *bprm) * smack_inode_alloc_security - allocate an inode blob * @inode: the inode in need of a blob * - * Returns 0 if it gets a blob, -ENOMEM otherwise + * Returns 0 */ static int smack_inode_alloc_security(struct inode *inode) { @@ -1165,7 +1165,7 @@ static int smack_inode_rename(struct inode *old_inode, * * This is the important Smack hook. * - * Returns 0 if access is permitted, -EACCES otherwise + * Returns 0 if access is permitted, an error code otherwise */ static int smack_inode_permission(struct inode *inode, int mask) { @@ -1223,8 +1223,7 @@ static int smack_inode_setattr(struct dentry *dentry, struct iattr *iattr) /** * smack_inode_getattr - Smack check for getting attributes - * @mnt: vfsmount of the object - * @dentry: the object + * @path: path to extract the info from * * Returns 0 if access is permitted, an error code otherwise */ @@ -1871,14 +1870,13 @@ static int smack_file_receive(struct file *file) /** * smack_file_open - Smack dentry open processing * @file: the object - * @cred: task credential * * Set the security blob in the file structure. * Allow the open only if the task has read access. There are * many read operations (e.g. fstat) that you can do with an * fd even if you have the file open write-only. * - * Returns 0 + * Returns 0 if current has access, error code otherwise */ static int smack_file_open(struct file *file) { @@ -1901,7 +1899,7 @@ static int smack_file_open(struct file *file) /** * smack_cred_alloc_blank - "allocate" blank task-level security credentials - * @new: the new credentials + * @cred: the new credentials * @gfp: the atomicity of any memory allocations * * Prepare a blank set of credentials for modification. This must allocate all @@ -1984,7 +1982,7 @@ static void smack_cred_transfer(struct cred *new, const struct cred *old) /** * smack_cred_getsecid - get the secid corresponding to a creds structure - * @c: the object creds + * @cred: the object creds * @secid: where to put the result * * Sets the secid to contain a u32 version of the smack label. @@ -2141,8 +2139,6 @@ static int smack_task_getioprio(struct task_struct *p) /** * smack_task_setscheduler - Smack check on setting scheduler * @p: the task object - * @policy: unused - * @lp: unused * * Return 0 if read access is permitted */ @@ -2612,8 +2608,9 @@ static void smk_ipv6_port_label(struct socket *sock, struct sockaddr *address) /** * smk_ipv6_port_check - check Smack port access - * @sock: socket + * @sk: socket * @address: address + * @act: the action being taken * * Create or update the port list entry */ @@ -2783,7 +2780,7 @@ static int smack_socket_post_create(struct socket *sock, int family, * * Cross reference the peer labels for SO_PEERSEC * - * Returns 0 on success, and error code otherwise + * Returns 0 */ static int smack_socket_socketpair(struct socket *socka, struct socket *sockb) @@ -3015,13 +3012,13 @@ static int smack_shm_shmctl(struct kern_ipc_perm *isp, int cmd) * * Returns 0 if current has the requested access, error code otherwise */ -static int smack_shm_shmat(struct kern_ipc_perm *ipc, char __user *shmaddr, +static int smack_shm_shmat(struct kern_ipc_perm *isp, char __user *shmaddr, int shmflg) { int may; may = smack_flags_to_may(shmflg); - return smk_curacc_shm(ipc, may); + return smk_curacc_shm(isp, may); } /** @@ -4763,7 +4760,7 @@ static __init void init_smack_known_list(void) /** * smack_init - initialize the smack system * - * Returns 0 + * Returns 0 on success, -ENOMEM is there's no memory */ static __init int smack_init(void) { From 3f4287e7d98a2954f20bf96c567fdffcd2b63eb9 Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Tue, 23 Jul 2019 18:00:15 +0800 Subject: [PATCH 253/721] security: smack: Fix possible null-pointer dereferences in smack_socket_sock_rcv_skb() In smack_socket_sock_rcv_skb(), there is an if statement on line 3920 to check whether skb is NULL: if (skb && skb->secmark != 0) This check indicates skb can be NULL in some cases. But on lines 3931 and 3932, skb is used: ad.a.u.net->netif = skb->skb_iif; ipv6_skb_to_auditdata(skb, &ad.a, NULL); Thus, possible null-pointer dereferences may occur when skb is NULL. To fix these possible bugs, an if statement is added to check skb. These bugs are found by a static analysis tool STCheck written by us. Signed-off-by: Jia-Ju Bai Signed-off-by: Casey Schaufler --- security/smack/smack_lsm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 336e855e7ab2..516f7ba96f1e 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -3923,6 +3923,8 @@ static int smack_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) skp = smack_ipv6host_label(&sadd); if (skp == NULL) skp = smack_net_ambient; + if (skb == NULL) + break; #ifdef CONFIG_AUDIT smk_ad_init_net(&ad, __func__, LSM_AUDIT_DATA_NET, &net); ad.a.u.net->family = family; From e5bfad3d7acc5702f32aafeb388362994f4d7bd0 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 21 Aug 2019 22:54:41 -0700 Subject: [PATCH 254/721] smack: use GFP_NOFS while holding inode_smack::smk_lock inode_smack::smk_lock is taken during smack_d_instantiate(), which is called during a filesystem transaction when creating a file on ext4. Therefore to avoid a deadlock, all code that takes this lock must use GFP_NOFS, to prevent memory reclaim from waiting for the filesystem transaction to complete. Reported-by: syzbot+0eefc1e06a77d327a056@syzkaller.appspotmail.com Cc: stable@vger.kernel.org Signed-off-by: Eric Biggers Signed-off-by: Casey Schaufler --- security/smack/smack_access.c | 6 +++--- security/smack/smack_lsm.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/security/smack/smack_access.c b/security/smack/smack_access.c index f1c93a7be9ec..38ac3da4e791 100644 --- a/security/smack/smack_access.c +++ b/security/smack/smack_access.c @@ -465,7 +465,7 @@ char *smk_parse_smack(const char *string, int len) if (i == 0 || i >= SMK_LONGLABEL) return ERR_PTR(-EINVAL); - smack = kzalloc(i + 1, GFP_KERNEL); + smack = kzalloc(i + 1, GFP_NOFS); if (smack == NULL) return ERR_PTR(-ENOMEM); @@ -500,7 +500,7 @@ int smk_netlbl_mls(int level, char *catset, struct netlbl_lsm_secattr *sap, if ((m & *cp) == 0) continue; rc = netlbl_catmap_setbit(&sap->attr.mls.cat, - cat, GFP_KERNEL); + cat, GFP_NOFS); if (rc < 0) { netlbl_catmap_free(sap->attr.mls.cat); return rc; @@ -536,7 +536,7 @@ struct smack_known *smk_import_entry(const char *string, int len) if (skp != NULL) goto freeout; - skp = kzalloc(sizeof(*skp), GFP_KERNEL); + skp = kzalloc(sizeof(*skp), GFP_NOFS); if (skp == NULL) { skp = ERR_PTR(-ENOMEM); goto freeout; diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 516f7ba96f1e..abeb09c30633 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -288,7 +288,7 @@ static struct smack_known *smk_fetch(const char *name, struct inode *ip, if (!(ip->i_opflags & IOP_XATTR)) return ERR_PTR(-EOPNOTSUPP); - buffer = kzalloc(SMK_LONGLABEL, GFP_KERNEL); + buffer = kzalloc(SMK_LONGLABEL, GFP_NOFS); if (buffer == NULL) return ERR_PTR(-ENOMEM); From 82d51481544146db2e9c9fb90c8fed92a5d0f93b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Ard=C3=B6?= Date: Wed, 4 Sep 2019 11:29:53 +0200 Subject: [PATCH 255/721] i2c-eeprom_slave: Add support for more eeprom models MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a 32 and a 64 kbit memory. These needs 16 bit address so added support for that as well. Signed-off-by: Björn Ardö Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-slave-eeprom.c | 36 +++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/drivers/i2c/i2c-slave-eeprom.c b/drivers/i2c/i2c-slave-eeprom.c index be65d3842878..773afaabfb61 100644 --- a/drivers/i2c/i2c-slave-eeprom.c +++ b/drivers/i2c/i2c-slave-eeprom.c @@ -11,6 +11,7 @@ * pointer, yet implementation is deferred until the need actually arises. */ +#include #include #include #include @@ -21,12 +22,18 @@ struct eeprom_data { struct bin_attribute bin; - bool first_write; spinlock_t buffer_lock; - u8 buffer_idx; + u16 buffer_idx; + u16 address_mask; + u8 num_address_bytes; + u8 idx_write_cnt; u8 buffer[]; }; +#define I2C_SLAVE_BYTELEN GENMASK(15, 0) +#define I2C_SLAVE_FLAG_ADDR16 BIT(16) +#define I2C_SLAVE_DEVICE_MAGIC(_len, _flags) ((_flags) | (_len)) + static int i2c_slave_eeprom_slave_cb(struct i2c_client *client, enum i2c_slave_event event, u8 *val) { @@ -34,12 +41,14 @@ static int i2c_slave_eeprom_slave_cb(struct i2c_client *client, switch (event) { case I2C_SLAVE_WRITE_RECEIVED: - if (eeprom->first_write) { - eeprom->buffer_idx = *val; - eeprom->first_write = false; + if (eeprom->idx_write_cnt < eeprom->num_address_bytes) { + if (eeprom->idx_write_cnt == 0) + eeprom->buffer_idx = 0; + eeprom->buffer_idx = *val | (eeprom->buffer_idx << 8); + eeprom->idx_write_cnt++; } else { spin_lock(&eeprom->buffer_lock); - eeprom->buffer[eeprom->buffer_idx++] = *val; + eeprom->buffer[eeprom->buffer_idx++ & eeprom->address_mask] = *val; spin_unlock(&eeprom->buffer_lock); } break; @@ -50,7 +59,7 @@ static int i2c_slave_eeprom_slave_cb(struct i2c_client *client, /* fallthrough */ case I2C_SLAVE_READ_REQUESTED: spin_lock(&eeprom->buffer_lock); - *val = eeprom->buffer[eeprom->buffer_idx]; + *val = eeprom->buffer[eeprom->buffer_idx & eeprom->address_mask]; spin_unlock(&eeprom->buffer_lock); /* * Do not increment buffer_idx here, because we don't know if @@ -61,7 +70,7 @@ static int i2c_slave_eeprom_slave_cb(struct i2c_client *client, case I2C_SLAVE_STOP: case I2C_SLAVE_WRITE_REQUESTED: - eeprom->first_write = true; + eeprom->idx_write_cnt = 0; break; default: @@ -105,13 +114,16 @@ static int i2c_slave_eeprom_probe(struct i2c_client *client, const struct i2c_de { struct eeprom_data *eeprom; int ret; - unsigned size = id->driver_data; + unsigned int size = FIELD_GET(I2C_SLAVE_BYTELEN, id->driver_data); + unsigned int flag_addr16 = FIELD_GET(I2C_SLAVE_FLAG_ADDR16, id->driver_data); eeprom = devm_kzalloc(&client->dev, sizeof(struct eeprom_data) + size, GFP_KERNEL); if (!eeprom) return -ENOMEM; - eeprom->first_write = true; + eeprom->idx_write_cnt = 0; + eeprom->num_address_bytes = flag_addr16 ? 2 : 1; + eeprom->address_mask = size - 1; spin_lock_init(&eeprom->buffer_lock); i2c_set_clientdata(client, eeprom); @@ -146,7 +158,9 @@ static int i2c_slave_eeprom_remove(struct i2c_client *client) } static const struct i2c_device_id i2c_slave_eeprom_id[] = { - { "slave-24c02", 2048 / 8 }, + { "slave-24c02", I2C_SLAVE_DEVICE_MAGIC(2048 / 8, 0) }, + { "slave-24c32", I2C_SLAVE_DEVICE_MAGIC(32768 / 8, I2C_SLAVE_FLAG_ADDR16) }, + { "slave-24c64", I2C_SLAVE_DEVICE_MAGIC(65536 / 8, I2C_SLAVE_FLAG_ADDR16) }, { } }; MODULE_DEVICE_TABLE(i2c, i2c_slave_eeprom_id); From 539b7569c56523486db95214d62a108532c7960f Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 20 Aug 2019 17:34:40 +0200 Subject: [PATCH 256/721] i2c: cht-wc: drop check because i2c_unregister_device() is NULL safe No need to check the argument of i2c_unregister_device() because the function itself does it. Signed-off-by: Wolfram Sang Acked-by: Hans de Goede Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-cht-wc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/i2c/busses/i2c-cht-wc.c b/drivers/i2c/busses/i2c-cht-wc.c index f6546de66fbc..b8fde61bb5d8 100644 --- a/drivers/i2c/busses/i2c-cht-wc.c +++ b/drivers/i2c/busses/i2c-cht-wc.c @@ -409,8 +409,7 @@ static int cht_wc_i2c_adap_i2c_remove(struct platform_device *pdev) { struct cht_wc_i2c_adap *adap = platform_get_drvdata(pdev); - if (adap->client) - i2c_unregister_device(adap->client); + i2c_unregister_device(adap->client); i2c_del_adapter(&adap->adapter); irq_domain_remove(adap->irq_domain); From 2252c3172cc5ecfab5aef1057f7c57b39e485f21 Mon Sep 17 00:00:00 2001 From: Nishka Dasgupta Date: Thu, 15 Aug 2019 11:28:57 +0530 Subject: [PATCH 257/721] i2c: stm32f7: Make structure stm32f7_i2c_algo constant Static structure stm32f7_i2c_algo, of type i2c_algorithm, is used only when it is assigned to constant field algo of a variable having type i2c_adapter. As stm32f7_i2c_algo is therefore never modified, make it const as well to protect it from unintended modification. Issue found with Coccinelle. Signed-off-by: Nishka Dasgupta Acked-by: Pierre-Yves MORDRET Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-stm32f7.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-stm32f7.c b/drivers/i2c/busses/i2c-stm32f7.c index 266d1c269b83..d36cf08461f7 100644 --- a/drivers/i2c/busses/i2c-stm32f7.c +++ b/drivers/i2c/busses/i2c-stm32f7.c @@ -1809,7 +1809,7 @@ static u32 stm32f7_i2c_func(struct i2c_adapter *adap) I2C_FUNC_SMBUS_I2C_BLOCK; } -static struct i2c_algorithm stm32f7_i2c_algo = { +static const struct i2c_algorithm stm32f7_i2c_algo = { .master_xfer = stm32f7_i2c_xfer, .smbus_xfer = stm32f7_i2c_smbus_xfer, .functionality = stm32f7_i2c_func, From 41d529d6227c443a5827cb8b8f040402dedcf3d2 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 13 Aug 2019 13:55:54 +0200 Subject: [PATCH 258/721] i2c: exynos5: Remove IRQF_ONESHOT The drivers sets IRQF_ONESHOT and passes only a primary handler. The IRQ is masked while the primary is handler is invoked independently of IRQF_ONESHOT. With IRQF_ONESHOT the core code will not force-thread the interrupt and this is probably not intended. I *assume* that the original author copied the IRQ registration from another driver which passed a primary and secondary handler and removed the secondary handler but keeping the ONESHOT flag. Remove IRQF_ONESHOT. Reported-by: Benjamin Rouxel Tested-by: Benjamin Rouxel Signed-off-by: Sebastian Andrzej Siewior Tested-by: Krzysztof Kozlowski Acked-by: Krzysztof Kozlowski Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-exynos5.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/i2c/busses/i2c-exynos5.c b/drivers/i2c/busses/i2c-exynos5.c index e4e7932f7800..e7514c16b756 100644 --- a/drivers/i2c/busses/i2c-exynos5.c +++ b/drivers/i2c/busses/i2c-exynos5.c @@ -791,9 +791,7 @@ static int exynos5_i2c_probe(struct platform_device *pdev) } ret = devm_request_irq(&pdev->dev, i2c->irq, exynos5_i2c_irq, - IRQF_NO_SUSPEND | IRQF_ONESHOT, - dev_name(&pdev->dev), i2c); - + IRQF_NO_SUSPEND, dev_name(&pdev->dev), i2c); if (ret != 0) { dev_err(&pdev->dev, "cannot request HS-I2C IRQ %d\n", i2c->irq); goto err_clk; From 169ce0c081cd85f78388bb6c1638c1ad7b81bde7 Mon Sep 17 00:00:00 2001 From: Stephen Smalley Date: Wed, 4 Sep 2019 10:32:48 -0400 Subject: [PATCH 259/721] selinux: fix residual uses of current_security() for the SELinux blob We need to use selinux_cred() to fetch the SELinux cred blob instead of directly using current->security or current_security(). There were a couple of lingering uses of current_security() in the SELinux code that were apparently missed during the earlier conversions. IIUC, this would only manifest as a bug if multiple security modules including SELinux are enabled and SELinux is not first in the lsm order. After this change, there appear to be no other users of current_security() in-tree; perhaps we should remove it altogether. Fixes: bbd3662a8348 ("Infrastructure management of the cred security blob") Signed-off-by: Stephen Smalley Acked-by: Casey Schaufler Reviewed-by: James Morris Signed-off-by: Paul Moore --- security/selinux/hooks.c | 2 +- security/selinux/include/objsec.h | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index d55571c585ff..f1b763eceef9 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -3435,7 +3435,7 @@ static int selinux_inode_copy_up_xattr(const char *name) static int selinux_kernfs_init_security(struct kernfs_node *kn_dir, struct kernfs_node *kn) { - const struct task_security_struct *tsec = current_security(); + const struct task_security_struct *tsec = selinux_cred(current_cred()); u32 parent_sid, newsid, clen; int rc; char *context; diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h index 231262d8eac9..d2e00c7595dd 100644 --- a/security/selinux/include/objsec.h +++ b/security/selinux/include/objsec.h @@ -40,16 +40,6 @@ struct task_security_struct { u32 sockcreate_sid; /* fscreate SID */ }; -/* - * get the subjective security ID of the current task - */ -static inline u32 current_sid(void) -{ - const struct task_security_struct *tsec = current_security(); - - return tsec->sid; -} - enum label_initialized { LABEL_INVALID, /* invalid or not initialized */ LABEL_INITIALIZED, /* initialized */ @@ -188,4 +178,14 @@ static inline struct ipc_security_struct *selinux_ipc( return ipc->security + selinux_blob_sizes.lbs_ipc; } +/* + * get the subjective security ID of the current task + */ +static inline u32 current_sid(void) +{ + const struct task_security_struct *tsec = selinux_cred(current_cred()); + + return tsec->sid; +} + #endif /* _SELINUX_OBJSEC_H_ */ From 15322a0d90b6fd62ae8f22e5b87f735c3fdfeff7 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Wed, 4 Sep 2019 18:53:39 -0400 Subject: [PATCH 260/721] lsm: remove current_security() There are no remaining callers and it really is unsafe in the brave new world of LSM stacking. Acked-by: James Morris Signed-off-by: Paul Moore --- include/linux/cred.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/cred.h b/include/linux/cred.h index efb6edf32de7..98b0a23ddd23 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -384,7 +384,6 @@ static inline void put_cred(const struct cred *_cred) #define current_fsgid() (current_cred_xxx(fsgid)) #define current_cap() (current_cred_xxx(cap_effective)) #define current_user() (current_cred_xxx(user)) -#define current_security() (current_cred_xxx(security)) extern struct user_namespace init_user_ns; #ifdef CONFIG_USER_NS From 2bcdacb70327013ca2066bfcf2af1009eff01f1d Mon Sep 17 00:00:00 2001 From: Roderick Colenbrander Date: Wed, 4 Sep 2019 14:22:11 -0700 Subject: [PATCH 261/721] HID: sony: Fix memory corruption issue on cleanup. The sony driver is not properly cleaning up from potential failures in sony_input_configured. Currently it calls hid_hw_stop, while hid_connect is still running. This is not a good idea, instead hid_hw_stop should be moved to sony_probe. Similar changes were recently made to Logitech drivers, which were also doing improper cleanup. Signed-off-by: Roderick Colenbrander CC: stable@vger.kernel.org Signed-off-by: Jiri Kosina --- drivers/hid/hid-sony.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hid/hid-sony.c b/drivers/hid/hid-sony.c index 49dd2d905c7f..73c0f7a95e2d 100644 --- a/drivers/hid/hid-sony.c +++ b/drivers/hid/hid-sony.c @@ -2811,7 +2811,6 @@ static int sony_input_configured(struct hid_device *hdev, sony_cancel_work_sync(sc); sony_remove_dev_list(sc); sony_release_device_id(sc); - hid_hw_stop(hdev); return ret; } @@ -2876,6 +2875,7 @@ static int sony_probe(struct hid_device *hdev, const struct hid_device_id *id) */ if (!(hdev->claimed & HID_CLAIMED_INPUT)) { hid_err(hdev, "failed to claim input\n"); + hid_hw_stop(hdev); return -ENODEV; } From bbbe3ac8f943b68f288111ad0e43a0de576c4fa6 Mon Sep 17 00:00:00 2001 From: Ping Cheng Date: Wed, 4 Sep 2019 13:43:23 -0700 Subject: [PATCH 262/721] HID: wacom: add new MobileStudio Pro 13 support wacom_wac_pad_event is the only routine we need to update. Signed-off-by: Ping Cheng Signed-off-by: Jiri Kosina --- drivers/hid/wacom_wac.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/hid/wacom_wac.c b/drivers/hid/wacom_wac.c index 6b739c1bda99..85a3082837f6 100644 --- a/drivers/hid/wacom_wac.c +++ b/drivers/hid/wacom_wac.c @@ -2093,14 +2093,14 @@ static void wacom_wac_pad_event(struct hid_device *hdev, struct hid_field *field (hdev->product == 0x34d || hdev->product == 0x34e || /* MobileStudio Pro */ hdev->product == 0x357 || hdev->product == 0x358 || /* Intuos Pro 2 */ hdev->product == 0x392 || /* Intuos Pro 2 */ - hdev->product == 0x399)) { /* MobileStudio Pro */ + hdev->product == 0x398 || hdev->product == 0x399)) { /* MobileStudio Pro */ value = (field->logical_maximum - value); if (hdev->product == 0x357 || hdev->product == 0x358 || hdev->product == 0x392) value = wacom_offset_rotation(input, usage, value, 3, 16); else if (hdev->product == 0x34d || hdev->product == 0x34e || - hdev->product == 0x399) + hdev->product == 0x398 || hdev->product == 0x399) value = wacom_offset_rotation(input, usage, value, 1, 2); } else { From 98375b86c79137416e9fd354177b85e768c16e56 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Wed, 4 Sep 2019 11:54:20 -0400 Subject: [PATCH 263/721] HID: prodikeys: Fix general protection fault during probe The syzbot fuzzer provoked a general protection fault in the hid-prodikeys driver: kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] SMP KASAN CPU: 0 PID: 12 Comm: kworker/0:1 Not tainted 5.3.0-rc5+ #28 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: usb_hub_wq hub_event RIP: 0010:pcmidi_submit_output_report drivers/hid/hid-prodikeys.c:300 [inline] RIP: 0010:pcmidi_set_operational drivers/hid/hid-prodikeys.c:558 [inline] RIP: 0010:pcmidi_snd_initialise drivers/hid/hid-prodikeys.c:686 [inline] RIP: 0010:pk_probe+0xb51/0xfd0 drivers/hid/hid-prodikeys.c:836 Code: 0f 85 50 04 00 00 48 8b 04 24 4c 89 7d 10 48 8b 58 08 e8 b2 53 e4 fc 48 8b 54 24 20 48 b8 00 00 00 00 00 fc ff df 48 c1 ea 03 <80> 3c 02 00 0f 85 13 04 00 00 48 ba 00 00 00 00 00 fc ff df 49 8b The problem is caused by the fact that pcmidi_get_output_report() will return an error if the HID device doesn't provide the right sort of output report, but pcmidi_set_operational() doesn't bother to check the return code and assumes the function call always succeeds. This patch adds the missing check and aborts the probe operation if necessary. Reported-and-tested-by: syzbot+1088533649dafa1c9004@syzkaller.appspotmail.com Signed-off-by: Alan Stern CC: Signed-off-by: Jiri Kosina --- drivers/hid/hid-prodikeys.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/hid/hid-prodikeys.c b/drivers/hid/hid-prodikeys.c index 21544ebff855..5a3b3d974d84 100644 --- a/drivers/hid/hid-prodikeys.c +++ b/drivers/hid/hid-prodikeys.c @@ -551,10 +551,14 @@ static void pcmidi_setup_extra_keys( static int pcmidi_set_operational(struct pcmidi_snd *pm) { + int rc; + if (pm->ifnum != 1) return 0; /* only set up ONCE for interace 1 */ - pcmidi_get_output_report(pm); + rc = pcmidi_get_output_report(pm); + if (rc < 0) + return rc; pcmidi_submit_output_report(pm, 0xc1); return 0; } @@ -683,7 +687,11 @@ static int pcmidi_snd_initialise(struct pcmidi_snd *pm) spin_lock_init(&pm->rawmidi_in_lock); init_sustain_timers(pm); - pcmidi_set_operational(pm); + err = pcmidi_set_operational(pm); + if (err < 0) { + pk_error("failed to find output report\n"); + goto fail_register; + } /* register it */ err = snd_card_register(card); From 244c06c3b6d86513066426c9707ee460cd845349 Mon Sep 17 00:00:00 2001 From: Kelsey Skunberg Date: Thu, 5 Sep 2019 00:32:26 -0600 Subject: [PATCH 264/721] PCI/IOV: Remove group write permission from sriov_numvfs, sriov_drivers_autoprobe Previously the sriov_numvfs and sriov_drivers_autoprobe sysfs files had 0664 permissions, which allowed group write. libvirt runs as root when dealing with PCI, and it chowns files needed by qemu, so group write permission should not be needed. Change these permissions from 0664 to 0644, which is what DEVICE_ATTR_RW() does by default. [bhelgaas: commit log] Link: https://lore.kernel.org/r/20190905063226.43269-1-skunberg.kelsey@gmail.com Link: https://lore.kernel.org/r/850cf536-0b72-d78c-efaf-855dcb391087@redhat.com Signed-off-by: Kelsey Skunberg Signed-off-by: Bjorn Helgaas Reviewed-by: Greg Kroah-Hartman Acked-by: Donald Dutile --- drivers/pci/iov.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c index 053054362247..617dccebccf5 100644 --- a/drivers/pci/iov.c +++ b/drivers/pci/iov.c @@ -375,12 +375,11 @@ static ssize_t sriov_drivers_autoprobe_store(struct device *dev, } static DEVICE_ATTR_RO(sriov_totalvfs); -static DEVICE_ATTR(sriov_numvfs, 0664, sriov_numvfs_show, sriov_numvfs_store); +static DEVICE_ATTR_RW(sriov_numvfs); static DEVICE_ATTR_RO(sriov_offset); static DEVICE_ATTR_RO(sriov_stride); static DEVICE_ATTR_RO(sriov_vf_device); -static DEVICE_ATTR(sriov_drivers_autoprobe, 0664, sriov_drivers_autoprobe_show, - sriov_drivers_autoprobe_store); +static DEVICE_ATTR_RW(sriov_drivers_autoprobe); static struct attribute *sriov_dev_attrs[] = { &dev_attr_sriov_totalvfs.attr, From 7f1c62c443a453deb6eb3515e3c05650ffe0dcf0 Mon Sep 17 00:00:00 2001 From: Krzysztof Wilczynski Date: Mon, 26 Aug 2019 00:46:16 +0200 Subject: [PATCH 265/721] PCI: Add pci_info_ratelimited() to ratelimit PCI separately Do not use printk_ratelimit() in drivers/pci/pci.c as it shares the rate limiting state with all other callers to the printk_ratelimit(). Add pci_info_ratelimited() (similar to pci_notice_ratelimited() added in the commit a88a7b3eb076 ("vfio: Use dev_printk() when possible")) and use it instead of printk_ratelimit() + pci_info(). Link: https://lore.kernel.org/r/20190825224616.8021-1-kw@linux.com Signed-off-by: Krzysztof Wilczynski Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.c | 4 ++-- include/linux/pci.h | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 29ed5ec1ac27..08dfb16bd084 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -890,8 +890,8 @@ static int pci_raw_set_power_state(struct pci_dev *dev, pci_power_t state) pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr); dev->current_state = (pmcsr & PCI_PM_CTRL_STATE_MASK); - if (dev->current_state != state && printk_ratelimit()) - pci_info(dev, "Refused to change power state, currently in D%d\n", + if (dev->current_state != state) + pci_info_ratelimited(dev, "Refused to change power state, currently in D%d\n", dev->current_state); /* diff --git a/include/linux/pci.h b/include/linux/pci.h index 9e700d9f9f28..f5bab312995d 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2433,4 +2433,7 @@ void pci_uevent_ers(struct pci_dev *pdev, enum pci_ers_result err_type); #define pci_notice_ratelimited(pdev, fmt, arg...) \ dev_notice_ratelimited(&(pdev)->dev, fmt, ##arg) +#define pci_info_ratelimited(pdev, fmt, arg...) \ + dev_info_ratelimited(&(pdev)->dev, fmt, ##arg) + #endif /* LINUX_PCI_H */ From 8050f3f6645ae0f7e4c1304593f6f7eb2ee7d85c Mon Sep 17 00:00:00 2001 From: Krzysztof Wilczynski Date: Mon, 26 Aug 2019 17:14:36 +0200 Subject: [PATCH 266/721] PCI: Use static const struct, not const static struct MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the static keyword to the front of declarations of pci_regs_behavior[] and pcie_cap_regs_behavior[], which resolves compiler warnings when building with "W=1": drivers/pci/pci-bridge-emul.c:41:1: warning: ‘static’ is not at beginning of declaration [-Wold-style-declaration] const static struct pci_bridge_reg_behavior pci_regs_behavior[] = { ^ drivers/pci/pci-bridge-emul.c:176:1: warning: ‘static’ is not at beginning of declaration [-Wold-style-declaration] const static struct pci_bridge_reg_behavior pcie_cap_regs_behavior[] = { ^ Link: https://lore.kernel.org/r/20190826151436.4672-1-kw@linux.com Link: https://lore.kernel.org/r/20190828131733.5817-1-kw@linux.com Signed-off-by: Krzysztof Wilczynski Signed-off-by: Bjorn Helgaas Acked-by: Thomas Petazzoni --- drivers/pci/pci-bridge-emul.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pci/pci-bridge-emul.c b/drivers/pci/pci-bridge-emul.c index 06083b86d4f4..5fd90105510d 100644 --- a/drivers/pci/pci-bridge-emul.c +++ b/drivers/pci/pci-bridge-emul.c @@ -38,7 +38,7 @@ struct pci_bridge_reg_behavior { u32 rsvd; }; -const static struct pci_bridge_reg_behavior pci_regs_behavior[] = { +static const struct pci_bridge_reg_behavior pci_regs_behavior[] = { [PCI_VENDOR_ID / 4] = { .ro = ~0 }, [PCI_COMMAND / 4] = { .rw = (PCI_COMMAND_IO | PCI_COMMAND_MEMORY | @@ -173,7 +173,7 @@ const static struct pci_bridge_reg_behavior pci_regs_behavior[] = { }, }; -const static struct pci_bridge_reg_behavior pcie_cap_regs_behavior[] = { +static const struct pci_bridge_reg_behavior pcie_cap_regs_behavior[] = { [PCI_CAP_LIST_ID / 4] = { /* * Capability ID, Next Capability Pointer and From 70aaf61a9b8b86eb08da96344efd1c0f0925ee6e Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 22 Aug 2019 10:10:11 -0600 Subject: [PATCH 267/721] PCI: Clean up resource_alignment parameter to not require static buffer Clean up the 'resource_alignment' parameter code to use kstrdup() in the initcall routine instead of a static buffer that wastes memory regardless of whether the feature is used. This allows us to drop 'COMMAND_LINE_SIZE' bytes (typically 256-4096 depending on architecture) of static data. This is similar to what has been done for the 'disable_acs_redir' parameter. Link: https://lore.kernel.org/r/20190822161013.5481-2-logang@deltatee.com Signed-off-by: Logan Gunthorpe Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.c | 41 +++++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 08dfb16bd084..fbfb64ba447d 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -5932,8 +5932,7 @@ resource_size_t __weak pcibios_default_alignment(void) return 0; } -#define RESOURCE_ALIGNMENT_PARAM_SIZE COMMAND_LINE_SIZE -static char resource_alignment_param[RESOURCE_ALIGNMENT_PARAM_SIZE] = {0}; +static char *resource_alignment_param; static DEFINE_SPINLOCK(resource_alignment_lock); /** @@ -5954,7 +5953,7 @@ static resource_size_t pci_specified_resource_alignment(struct pci_dev *dev, spin_lock(&resource_alignment_lock); p = resource_alignment_param; - if (!*p && !align) + if (!p || !*p) goto out; if (pci_has_flag(PCI_PROBE_ONLY)) { align = 0; @@ -6120,21 +6119,25 @@ void pci_reassigndev_resource_alignment(struct pci_dev *dev) static ssize_t pci_set_resource_alignment_param(const char *buf, size_t count) { - if (count > RESOURCE_ALIGNMENT_PARAM_SIZE - 1) - count = RESOURCE_ALIGNMENT_PARAM_SIZE - 1; spin_lock(&resource_alignment_lock); - strncpy(resource_alignment_param, buf, count); - resource_alignment_param[count] = '\0'; + + kfree(resource_alignment_param); + resource_alignment_param = kstrndup(buf, count, GFP_KERNEL); + spin_unlock(&resource_alignment_lock); - return count; + + return resource_alignment_param ? count : -ENOMEM; } static ssize_t pci_get_resource_alignment_param(char *buf, size_t size) { - size_t count; + size_t count = 0; + spin_lock(&resource_alignment_lock); - count = snprintf(buf, size, "%s", resource_alignment_param); + if (resource_alignment_param) + count = snprintf(buf, size, "%s", resource_alignment_param); spin_unlock(&resource_alignment_lock); + return count; } @@ -6275,8 +6278,7 @@ static int __init pci_setup(char *str) } else if (!strncmp(str, "cbmemsize=", 10)) { pci_cardbus_mem_size = memparse(str + 10, &str); } else if (!strncmp(str, "resource_alignment=", 19)) { - pci_set_resource_alignment_param(str + 19, - strlen(str + 19)); + resource_alignment_param = str + 19; } else if (!strncmp(str, "ecrc=", 5)) { pcie_ecrc_get_policy(str + 5); } else if (!strncmp(str, "hpiosize=", 9)) { @@ -6311,15 +6313,18 @@ static int __init pci_setup(char *str) early_param("pci", pci_setup); /* - * 'disable_acs_redir_param' is initialized in pci_setup(), above, to point - * to data in the __initdata section which will be freed after the init - * sequence is complete. We can't allocate memory in pci_setup() because some - * architectures do not have any memory allocation service available during - * an early_param() call. So we allocate memory and copy the variable here - * before the init section is freed. + * 'resource_alignment_param' and 'disable_acs_redir_param' are initialized + * in pci_setup(), above, to point to data in the __initdata section which + * will be freed after the init sequence is complete. We can't allocate memory + * in pci_setup() because some architectures do not have any memory allocation + * service available during an early_param() call. So we allocate memory and + * copy the variable here before the init section is freed. + * */ static int __init pci_realloc_setup_params(void) { + resource_alignment_param = kstrdup(resource_alignment_param, + GFP_KERNEL); disable_acs_redir_param = kstrdup(disable_acs_redir_param, GFP_KERNEL); return 0; From fe050f99072d6b5175c35427a6f72846790441ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Ard=C3=B6?= Date: Thu, 5 Sep 2019 16:50:26 +0200 Subject: [PATCH 268/721] i2c: slave-eeprom: Add comment about address handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The behaviour of the EEPROM in the case where we only send an 8bit address to a 16bit address EEPROM is not defined. Added comment about that the slave-eeprom might behave differently from how an actual device does (only one model measured). Reported-by: Wolfram Sang Signed-off-by: Björn Ardö Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-slave-eeprom.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/i2c/i2c-slave-eeprom.c b/drivers/i2c/i2c-slave-eeprom.c index 773afaabfb61..92ff9991bae8 100644 --- a/drivers/i2c/i2c-slave-eeprom.c +++ b/drivers/i2c/i2c-slave-eeprom.c @@ -11,6 +11,12 @@ * pointer, yet implementation is deferred until the need actually arises. */ +/* + * FIXME: What to do if only 8 bits of a 16 bit address are sent? + * The ST-M24C64 sends only 0xff then. Needs verification with other + * EEPROMs, though. We currently use the 8 bit as a valid address. + */ + #include #include #include From 22ac74a6194798ce131c55ff0dfbb9697650e626 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 5 Sep 2019 12:45:32 +0900 Subject: [PATCH 269/721] i2c: uniphier(-f): use devm_platform_ioremap_resource() Replace the chain of platform_get_resource() and devm_ioremap_resource() with devm_platform_ioremap_resource(). This allows to remove the local variable for (struct resource *), and have one function call less. Signed-off-by: Masahiro Yamada Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-uniphier-f.c | 4 +--- drivers/i2c/busses/i2c-uniphier.c | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/i2c/busses/i2c-uniphier-f.c b/drivers/i2c/busses/i2c-uniphier-f.c index 7acca2599f04..fc5354845ffa 100644 --- a/drivers/i2c/busses/i2c-uniphier-f.c +++ b/drivers/i2c/busses/i2c-uniphier-f.c @@ -538,7 +538,6 @@ static int uniphier_fi2c_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; struct uniphier_fi2c_priv *priv; - struct resource *regs; u32 bus_speed; unsigned long clk_rate; int irq, ret; @@ -547,8 +546,7 @@ static int uniphier_fi2c_probe(struct platform_device *pdev) if (!priv) return -ENOMEM; - regs = platform_get_resource(pdev, IORESOURCE_MEM, 0); - priv->membase = devm_ioremap_resource(dev, regs); + priv->membase = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(priv->membase)) return PTR_ERR(priv->membase); diff --git a/drivers/i2c/busses/i2c-uniphier.c b/drivers/i2c/busses/i2c-uniphier.c index 0173840c32af..a6d7a3709051 100644 --- a/drivers/i2c/busses/i2c-uniphier.c +++ b/drivers/i2c/busses/i2c-uniphier.c @@ -326,7 +326,6 @@ static int uniphier_i2c_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; struct uniphier_i2c_priv *priv; - struct resource *regs; u32 bus_speed; unsigned long clk_rate; int irq, ret; @@ -335,8 +334,7 @@ static int uniphier_i2c_probe(struct platform_device *pdev) if (!priv) return -ENOMEM; - regs = platform_get_resource(pdev, IORESOURCE_MEM, 0); - priv->membase = devm_ioremap_resource(dev, regs); + priv->membase = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(priv->membase)) return PTR_ERR(priv->membase); From 9ee7e72fbbb835ba92b2cbf4349997adf065eab0 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 5 Sep 2019 13:46:48 +0900 Subject: [PATCH 270/721] i2c: uniphier(-f): remove all dev_dbg() I have fixed various bugs, and these drivers are (I hope) pretty stable now. Remove all dev_dbg() for code clean-up. If I end up with debugging the drivers again, I will locally revert this commit. I no longer need the debug code in upstream. Signed-off-by: Masahiro Yamada Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-uniphier-f.c | 22 +--------------------- drivers/i2c/busses/i2c-uniphier.c | 18 +++--------------- 2 files changed, 4 insertions(+), 36 deletions(-) diff --git a/drivers/i2c/busses/i2c-uniphier-f.c b/drivers/i2c/busses/i2c-uniphier-f.c index fc5354845ffa..4241aac79e7e 100644 --- a/drivers/i2c/busses/i2c-uniphier-f.c +++ b/drivers/i2c/busses/i2c-uniphier-f.c @@ -108,7 +108,6 @@ static void uniphier_fi2c_fill_txfifo(struct uniphier_fi2c_priv *priv, if (fifo_space-- <= 0) break; - dev_dbg(&priv->adap.dev, "write data: %02x\n", *priv->buf); writel(*priv->buf++, priv->membase + UNIPHIER_FI2C_DTTX); priv->len--; } @@ -124,7 +123,6 @@ static void uniphier_fi2c_drain_rxfifo(struct uniphier_fi2c_priv *priv) break; *priv->buf++ = readl(priv->membase + UNIPHIER_FI2C_DTRX); - dev_dbg(&priv->adap.dev, "read data: %02x\n", priv->buf[-1]); priv->len--; } } @@ -142,8 +140,6 @@ static void uniphier_fi2c_clear_irqs(struct uniphier_fi2c_priv *priv, static void uniphier_fi2c_stop(struct uniphier_fi2c_priv *priv) { - dev_dbg(&priv->adap.dev, "stop condition\n"); - priv->enabled_irqs |= UNIPHIER_FI2C_INT_STOP; uniphier_fi2c_set_irqs(priv); writel(UNIPHIER_FI2C_CR_MST | UNIPHIER_FI2C_CR_STO, @@ -160,21 +156,15 @@ static irqreturn_t uniphier_fi2c_interrupt(int irq, void *dev_id) irq_status = readl(priv->membase + UNIPHIER_FI2C_INT); irq_status &= priv->enabled_irqs; - dev_dbg(&priv->adap.dev, - "interrupt: enabled_irqs=%04x, irq_status=%04x\n", - priv->enabled_irqs, irq_status); - if (irq_status & UNIPHIER_FI2C_INT_STOP) goto complete; if (unlikely(irq_status & UNIPHIER_FI2C_INT_AL)) { - dev_dbg(&priv->adap.dev, "arbitration lost\n"); priv->error = -EAGAIN; goto complete; } if (unlikely(irq_status & UNIPHIER_FI2C_INT_NA)) { - dev_dbg(&priv->adap.dev, "could not get ACK\n"); priv->error = -ENXIO; if (priv->flags & UNIPHIER_FI2C_RD) { /* @@ -215,18 +205,14 @@ static irqreturn_t uniphier_fi2c_interrupt(int irq, void *dev_id) if (unlikely(priv->flags & UNIPHIER_FI2C_MANUAL_NACK)) { if (priv->len <= UNIPHIER_FI2C_FIFO_SIZE && !(priv->flags & UNIPHIER_FI2C_BYTE_WISE)) { - dev_dbg(&priv->adap.dev, - "enable read byte count IRQ\n"); priv->enabled_irqs |= UNIPHIER_FI2C_INT_RB; uniphier_fi2c_set_irqs(priv); priv->flags |= UNIPHIER_FI2C_BYTE_WISE; } - if (priv->len <= 1) { - dev_dbg(&priv->adap.dev, "set NACK\n"); + if (priv->len <= 1) writel(UNIPHIER_FI2C_CR_MST | UNIPHIER_FI2C_CR_NACK, priv->membase + UNIPHIER_FI2C_CR); - } } goto handled; @@ -334,10 +320,6 @@ static int uniphier_fi2c_master_xfer_one(struct i2c_adapter *adap, bool is_read = msg->flags & I2C_M_RD; unsigned long time_left, flags; - dev_dbg(&adap->dev, "%s: addr=0x%02x, len=%d, repeat=%d, stop=%d\n", - is_read ? "receive" : "transmit", msg->addr, msg->len, - repeat, stop); - priv->len = msg->len; priv->buf = msg->buf; priv->enabled_irqs = UNIPHIER_FI2C_INT_FAULTS; @@ -359,7 +341,6 @@ static int uniphier_fi2c_master_xfer_one(struct i2c_adapter *adap, else uniphier_fi2c_tx_init(priv, msg->addr, repeat); - dev_dbg(&adap->dev, "start condition\n"); /* * For a repeated START condition, writing a slave address to the FIFO * kicks the controller. So, the UNIPHIER_FI2C_CR register should be @@ -383,7 +364,6 @@ static int uniphier_fi2c_master_xfer_one(struct i2c_adapter *adap, uniphier_fi2c_recover(priv); return -ETIMEDOUT; } - dev_dbg(&adap->dev, "complete\n"); if (unlikely(priv->flags & UNIPHIER_FI2C_DEFER_STOP_COMP)) { u32 status; diff --git a/drivers/i2c/busses/i2c-uniphier.c b/drivers/i2c/busses/i2c-uniphier.c index a6d7a3709051..0270090c0360 100644 --- a/drivers/i2c/busses/i2c-uniphier.c +++ b/drivers/i2c/busses/i2c-uniphier.c @@ -71,7 +71,6 @@ static int uniphier_i2c_xfer_byte(struct i2c_adapter *adap, u32 txdata, reinit_completion(&priv->comp); txdata |= UNIPHIER_I2C_DTRM_IRQEN; - dev_dbg(&adap->dev, "write data: 0x%04x\n", txdata); writel(txdata, priv->membase + UNIPHIER_I2C_DTRM); time_left = wait_for_completion_timeout(&priv->comp, adap->timeout); @@ -81,8 +80,6 @@ static int uniphier_i2c_xfer_byte(struct i2c_adapter *adap, u32 txdata, } rxdata = readl(priv->membase + UNIPHIER_I2C_DREC); - dev_dbg(&adap->dev, "read data: 0x%04x\n", rxdata); - if (rxdatap) *rxdatap = rxdata; @@ -98,14 +95,11 @@ static int uniphier_i2c_send_byte(struct i2c_adapter *adap, u32 txdata) if (ret) return ret; - if (unlikely(rxdata & UNIPHIER_I2C_DREC_LAB)) { - dev_dbg(&adap->dev, "arbitration lost\n"); + if (unlikely(rxdata & UNIPHIER_I2C_DREC_LAB)) return -EAGAIN; - } - if (unlikely(rxdata & UNIPHIER_I2C_DREC_LRB)) { - dev_dbg(&adap->dev, "could not get ACK\n"); + + if (unlikely(rxdata & UNIPHIER_I2C_DREC_LRB)) return -ENXIO; - } return 0; } @@ -115,7 +109,6 @@ static int uniphier_i2c_tx(struct i2c_adapter *adap, u16 addr, u16 len, { int ret; - dev_dbg(&adap->dev, "start condition\n"); ret = uniphier_i2c_send_byte(adap, addr << 1 | UNIPHIER_I2C_DTRM_STA | UNIPHIER_I2C_DTRM_NACK); @@ -137,7 +130,6 @@ static int uniphier_i2c_rx(struct i2c_adapter *adap, u16 addr, u16 len, { int ret; - dev_dbg(&adap->dev, "start condition\n"); ret = uniphier_i2c_send_byte(adap, addr << 1 | UNIPHIER_I2C_DTRM_STA | UNIPHIER_I2C_DTRM_NACK | @@ -161,7 +153,6 @@ static int uniphier_i2c_rx(struct i2c_adapter *adap, u16 addr, u16 len, static int uniphier_i2c_stop(struct i2c_adapter *adap) { - dev_dbg(&adap->dev, "stop condition\n"); return uniphier_i2c_send_byte(adap, UNIPHIER_I2C_DTRM_STO | UNIPHIER_I2C_DTRM_NACK); } @@ -173,9 +164,6 @@ static int uniphier_i2c_master_xfer_one(struct i2c_adapter *adap, bool recovery = false; int ret; - dev_dbg(&adap->dev, "%s: addr=0x%02x, len=%d, stop=%d\n", - is_read ? "receive" : "transmit", msg->addr, msg->len, stop); - if (is_read) ret = uniphier_i2c_rx(adap, msg->addr, msg->len, msg->buf); else From 688033f52d7192ea8ddf617b86af0a4120247b42 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Tue, 3 Sep 2019 14:10:18 +0300 Subject: [PATCH 271/721] PCI: pciehp: Add pciehp_set_indicators() to set both indicators Add pciehp_set_indicators() to set power and attention indicators with a single register write. This is a minor optimization because we frequently set both indicators and this can do it with a single command. It also reduces the number of interfaces related to the indicators and makes them more discoverable because callers use the PCI_EXP_SLTCTL_ATTN_IND_* and PCI_EXP_SLTCTL_PWR_IND_* definitions directly. [bhelgaas: extend commit log, s/PCI_EXP_SLTCTL_.*_IND_NONE/INDICATOR_NOOP/ so they don't look like things defined by the spec, add function doc, mask commands to make it obvious we only send valid commands (pcie_do_write_cmd() does mask it, but requires more effort to verify)] Link: https://lore.kernel.org/r/20190903111021.1559-2-efremov@linux.com Signed-off-by: Denis Efremov Signed-off-by: Bjorn Helgaas --- drivers/pci/hotplug/pciehp.h | 3 +++ drivers/pci/hotplug/pciehp_hpc.c | 36 ++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/drivers/pci/hotplug/pciehp.h b/drivers/pci/hotplug/pciehp.h index 8c51a04b8083..2ca0f35a6a23 100644 --- a/drivers/pci/hotplug/pciehp.h +++ b/drivers/pci/hotplug/pciehp.h @@ -167,6 +167,9 @@ int pciehp_power_on_slot(struct controller *ctrl); void pciehp_power_off_slot(struct controller *ctrl); void pciehp_get_power_status(struct controller *ctrl, u8 *status); +#define INDICATOR_NOOP -1 /* Leave indicator unchanged */ +void pciehp_set_indicators(struct controller *ctrl, int pwr, int attn); + void pciehp_set_attention_status(struct controller *ctrl, u8 status); void pciehp_get_latch_status(struct controller *ctrl, u8 *status); int pciehp_query_power_fault(struct controller *ctrl); diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c index bd990e3371e3..d7ebcd7111cb 100644 --- a/drivers/pci/hotplug/pciehp_hpc.c +++ b/drivers/pci/hotplug/pciehp_hpc.c @@ -443,6 +443,42 @@ void pciehp_set_attention_status(struct controller *ctrl, u8 value) pci_pcie_cap(ctrl->pcie->port) + PCI_EXP_SLTCTL, slot_cmd); } +/** + * pciehp_set_indicators() - set attention indicator, power indicator, or both + * @ctrl: PCIe hotplug controller + * @pwr: one of: + * PCI_EXP_SLTCTL_PWR_IND_ON + * PCI_EXP_SLTCTL_PWR_IND_BLINK + * PCI_EXP_SLTCTL_PWR_IND_OFF + * @attn: one of: + * PCI_EXP_SLTCTL_ATTN_IND_ON + * PCI_EXP_SLTCTL_ATTN_IND_BLINK + * PCI_EXP_SLTCTL_ATTN_IND_OFF + * + * Either @pwr or @attn can also be INDICATOR_NOOP to leave that indicator + * unchanged. + */ +void pciehp_set_indicators(struct controller *ctrl, int pwr, int attn) +{ + u16 cmd = 0, mask = 0; + + if (PWR_LED(ctrl) && pwr != INDICATOR_NOOP) { + cmd |= (pwr & PCI_EXP_SLTCTL_PIC); + mask |= PCI_EXP_SLTCTL_PIC; + } + + if (ATTN_LED(ctrl) && attn != INDICATOR_NOOP) { + cmd |= (attn & PCI_EXP_SLTCTL_AIC); + mask |= PCI_EXP_SLTCTL_AIC; + } + + if (cmd) { + pcie_write_cmd_nowait(ctrl, cmd, mask); + ctrl_dbg(ctrl, "%s: SLOTCTRL %x write cmd %x\n", __func__, + pci_pcie_cap(ctrl->pcie->port) + PCI_EXP_SLTCTL, cmd); + } +} + void pciehp_green_led_on(struct controller *ctrl) { if (!PWR_LED(ctrl)) From 94719ba090e2c111272eb97c053c6cc47a7b8856 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Tue, 3 Sep 2019 14:10:19 +0300 Subject: [PATCH 272/721] PCI: pciehp: Combine adjacent indicator updates Combine adjacent updates of power and attention indicators into a single pciehp_set_indicators() call. This sends one command to the hotplug controller instead of two. Link: https://lore.kernel.org/r/20190903111021.1559-3-efremov@linux.com Signed-off-by: Denis Efremov Signed-off-by: Bjorn Helgaas Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Lukas Wunner --- drivers/pci/hotplug/pciehp_ctrl.c | 26 +++++++++++++++----------- drivers/pci/hotplug/pciehp_hpc.c | 4 ++-- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/drivers/pci/hotplug/pciehp_ctrl.c b/drivers/pci/hotplug/pciehp_ctrl.c index 631ced0ab28a..b10c71493ffa 100644 --- a/drivers/pci/hotplug/pciehp_ctrl.c +++ b/drivers/pci/hotplug/pciehp_ctrl.c @@ -30,7 +30,10 @@ static void set_slot_off(struct controller *ctrl) { - /* turn off slot, turn on Amber LED, turn off Green LED if supported*/ + /* + * Turn off slot, turn on attention indicator, turn off power + * indicator + */ if (POWER_CTRL(ctrl)) { pciehp_power_off_slot(ctrl); @@ -42,8 +45,8 @@ static void set_slot_off(struct controller *ctrl) msleep(1000); } - pciehp_green_led_off(ctrl); - pciehp_set_attention_status(ctrl, 1); + pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_OFF, + PCI_EXP_SLTCTL_ATTN_IND_ON); } /** @@ -90,8 +93,8 @@ static int board_added(struct controller *ctrl) } } - pciehp_green_led_on(ctrl); - pciehp_set_attention_status(ctrl, 0); + pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_ON, + PCI_EXP_SLTCTL_ATTN_IND_OFF); return 0; err_exit: @@ -171,9 +174,9 @@ void pciehp_handle_button_press(struct controller *ctrl) ctrl_info(ctrl, "Slot(%s) Powering on due to button press\n", slot_name(ctrl)); } - /* blink green LED and turn off amber */ - pciehp_green_led_blink(ctrl); - pciehp_set_attention_status(ctrl, 0); + /* blink power indicator and turn off attention */ + pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_BLINK, + PCI_EXP_SLTCTL_ATTN_IND_OFF); schedule_delayed_work(&ctrl->button_work, 5 * HZ); break; case BLINKINGOFF_STATE: @@ -187,12 +190,13 @@ void pciehp_handle_button_press(struct controller *ctrl) cancel_delayed_work(&ctrl->button_work); if (ctrl->state == BLINKINGOFF_STATE) { ctrl->state = ON_STATE; - pciehp_green_led_on(ctrl); + pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_ON, + PCI_EXP_SLTCTL_ATTN_IND_OFF); } else { ctrl->state = OFF_STATE; - pciehp_green_led_off(ctrl); + pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_OFF, + PCI_EXP_SLTCTL_ATTN_IND_OFF); } - pciehp_set_attention_status(ctrl, 0); ctrl_info(ctrl, "Slot(%s): Action canceled due to button press\n", slot_name(ctrl)); break; diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c index d7ebcd7111cb..a900bfd22447 100644 --- a/drivers/pci/hotplug/pciehp_hpc.c +++ b/drivers/pci/hotplug/pciehp_hpc.c @@ -674,8 +674,8 @@ static irqreturn_t pciehp_ist(int irq, void *dev_id) if ((events & PCI_EXP_SLTSTA_PFD) && !ctrl->power_fault_detected) { ctrl->power_fault_detected = 1; ctrl_err(ctrl, "Slot(%s): Power fault\n", slot_name(ctrl)); - pciehp_set_attention_status(ctrl, 1); - pciehp_green_led_off(ctrl); + pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_OFF, + PCI_EXP_SLTCTL_ATTN_IND_ON); } /* From 106feb2fdced0c240ca8ecb3d5db91b2b64e9a48 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Tue, 3 Sep 2019 14:10:20 +0300 Subject: [PATCH 273/721] PCI: pciehp: Remove pciehp_set_attention_status() Remove pciehp_set_attention_status() and use pciehp_set_indicators() instead, since the code is mostly the same. Link: https://lore.kernel.org/r/20190903111021.1559-4-efremov@linux.com Signed-off-by: Denis Efremov Signed-off-by: Bjorn Helgaas Reviewed-by: Kuppuswamy Sathyanarayanan --- drivers/pci/hotplug/pciehp.h | 1 - drivers/pci/hotplug/pciehp_core.c | 9 +++++++-- drivers/pci/hotplug/pciehp_hpc.c | 25 ------------------------- include/uapi/linux/pci_regs.h | 1 + 4 files changed, 8 insertions(+), 28 deletions(-) diff --git a/drivers/pci/hotplug/pciehp.h b/drivers/pci/hotplug/pciehp.h index 2ca0f35a6a23..1f69f43a3f29 100644 --- a/drivers/pci/hotplug/pciehp.h +++ b/drivers/pci/hotplug/pciehp.h @@ -170,7 +170,6 @@ void pciehp_get_power_status(struct controller *ctrl, u8 *status); #define INDICATOR_NOOP -1 /* Leave indicator unchanged */ void pciehp_set_indicators(struct controller *ctrl, int pwr, int attn); -void pciehp_set_attention_status(struct controller *ctrl, u8 status); void pciehp_get_latch_status(struct controller *ctrl, u8 *status); int pciehp_query_power_fault(struct controller *ctrl); void pciehp_green_led_on(struct controller *ctrl); diff --git a/drivers/pci/hotplug/pciehp_core.c b/drivers/pci/hotplug/pciehp_core.c index 6ad0d86762cb..b3122c151b80 100644 --- a/drivers/pci/hotplug/pciehp_core.c +++ b/drivers/pci/hotplug/pciehp_core.c @@ -95,15 +95,20 @@ static void cleanup_slot(struct controller *ctrl) } /* - * set_attention_status - Turns the Amber LED for a slot on, off or blink + * set_attention_status - Turns the Attention Indicator on, off or blinking */ static int set_attention_status(struct hotplug_slot *hotplug_slot, u8 status) { struct controller *ctrl = to_ctrl(hotplug_slot); struct pci_dev *pdev = ctrl->pcie->port; + if (status) + status <<= PCI_EXP_SLTCTL_ATTN_IND_SHIFT; + else + status = PCI_EXP_SLTCTL_ATTN_IND_OFF; + pci_config_pm_runtime_get(pdev); - pciehp_set_attention_status(ctrl, status); + pciehp_set_indicators(ctrl, INDICATOR_NOOP, status); pci_config_pm_runtime_put(pdev); return 0; } diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c index a900bfd22447..6527345ccd8e 100644 --- a/drivers/pci/hotplug/pciehp_hpc.c +++ b/drivers/pci/hotplug/pciehp_hpc.c @@ -418,31 +418,6 @@ int pciehp_set_raw_indicator_status(struct hotplug_slot *hotplug_slot, return 0; } -void pciehp_set_attention_status(struct controller *ctrl, u8 value) -{ - u16 slot_cmd; - - if (!ATTN_LED(ctrl)) - return; - - switch (value) { - case 0: /* turn off */ - slot_cmd = PCI_EXP_SLTCTL_ATTN_IND_OFF; - break; - case 1: /* turn on */ - slot_cmd = PCI_EXP_SLTCTL_ATTN_IND_ON; - break; - case 2: /* turn blink */ - slot_cmd = PCI_EXP_SLTCTL_ATTN_IND_BLINK; - break; - default: - return; - } - pcie_write_cmd_nowait(ctrl, slot_cmd, PCI_EXP_SLTCTL_AIC); - ctrl_dbg(ctrl, "%s: SLOTCTRL %x write cmd %x\n", __func__, - pci_pcie_cap(ctrl->pcie->port) + PCI_EXP_SLTCTL, slot_cmd); -} - /** * pciehp_set_indicators() - set attention indicator, power indicator, or both * @ctrl: PCIe hotplug controller diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index f28e562d7ca8..de3e58afc564 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -591,6 +591,7 @@ #define PCI_EXP_SLTCTL_CCIE 0x0010 /* Command Completed Interrupt Enable */ #define PCI_EXP_SLTCTL_HPIE 0x0020 /* Hot-Plug Interrupt Enable */ #define PCI_EXP_SLTCTL_AIC 0x00c0 /* Attention Indicator Control */ +#define PCI_EXP_SLTCTL_ATTN_IND_SHIFT 6 /* Attention Indicator shift */ #define PCI_EXP_SLTCTL_ATTN_IND_ON 0x0040 /* Attention Indicator on */ #define PCI_EXP_SLTCTL_ATTN_IND_BLINK 0x0080 /* Attention Indicator blinking */ #define PCI_EXP_SLTCTL_ATTN_IND_OFF 0x00c0 /* Attention Indicator off */ From 9194094be418f4f13fbab3a6049ea18acb137178 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Tue, 3 Sep 2019 14:10:21 +0300 Subject: [PATCH 274/721] PCI: pciehp: Remove pciehp_green_led_{on,off,blink}() Remove pciehp_green_led_{on,off,blink}() and use pciehp_set_indicators() instead, since the code is mostly the same. [bhelgaas: drop set_power_indicator() wrapper to reduce the number of interfaces] Link: https://lore.kernel.org/r/20190903111021.1559-5-efremov@linux.com Signed-off-by: Denis Efremov Signed-off-by: Bjorn Helgaas Reviewed-by: Kuppuswamy Sathyanarayanan --- drivers/pci/hotplug/pciehp.h | 3 --- drivers/pci/hotplug/pciehp_ctrl.c | 11 ++++++---- drivers/pci/hotplug/pciehp_hpc.c | 36 ------------------------------- 3 files changed, 7 insertions(+), 43 deletions(-) diff --git a/drivers/pci/hotplug/pciehp.h b/drivers/pci/hotplug/pciehp.h index 1f69f43a3f29..01706448b04a 100644 --- a/drivers/pci/hotplug/pciehp.h +++ b/drivers/pci/hotplug/pciehp.h @@ -172,9 +172,6 @@ void pciehp_set_indicators(struct controller *ctrl, int pwr, int attn); void pciehp_get_latch_status(struct controller *ctrl, u8 *status); int pciehp_query_power_fault(struct controller *ctrl); -void pciehp_green_led_on(struct controller *ctrl); -void pciehp_green_led_off(struct controller *ctrl); -void pciehp_green_led_blink(struct controller *ctrl); bool pciehp_card_present(struct controller *ctrl); bool pciehp_card_present_or_link_active(struct controller *ctrl); int pciehp_check_link_status(struct controller *ctrl); diff --git a/drivers/pci/hotplug/pciehp_ctrl.c b/drivers/pci/hotplug/pciehp_ctrl.c index b10c71493ffa..71141abfba70 100644 --- a/drivers/pci/hotplug/pciehp_ctrl.c +++ b/drivers/pci/hotplug/pciehp_ctrl.c @@ -68,7 +68,8 @@ static int board_added(struct controller *ctrl) return retval; } - pciehp_green_led_blink(ctrl); + pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_BLINK, + INDICATOR_NOOP); /* Check link training status */ retval = pciehp_check_link_status(ctrl); @@ -126,8 +127,8 @@ static void remove_board(struct controller *ctrl, bool safe_removal) &ctrl->pending_events); } - /* turn off Green LED */ - pciehp_green_led_off(ctrl); + pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_OFF, + INDICATOR_NOOP); } static int pciehp_enable_slot(struct controller *ctrl); @@ -314,7 +315,9 @@ static int pciehp_enable_slot(struct controller *ctrl) pm_runtime_get_sync(&ctrl->pcie->port->dev); ret = __pciehp_enable_slot(ctrl); if (ret && ATTN_BUTTN(ctrl)) - pciehp_green_led_off(ctrl); /* may be blinking */ + /* may be blinking */ + pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_OFF, + INDICATOR_NOOP); pm_runtime_put(&ctrl->pcie->port->dev); mutex_lock(&ctrl->state_lock); diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c index 6527345ccd8e..1a522c1c4177 100644 --- a/drivers/pci/hotplug/pciehp_hpc.c +++ b/drivers/pci/hotplug/pciehp_hpc.c @@ -454,42 +454,6 @@ void pciehp_set_indicators(struct controller *ctrl, int pwr, int attn) } } -void pciehp_green_led_on(struct controller *ctrl) -{ - if (!PWR_LED(ctrl)) - return; - - pcie_write_cmd_nowait(ctrl, PCI_EXP_SLTCTL_PWR_IND_ON, - PCI_EXP_SLTCTL_PIC); - ctrl_dbg(ctrl, "%s: SLOTCTRL %x write cmd %x\n", __func__, - pci_pcie_cap(ctrl->pcie->port) + PCI_EXP_SLTCTL, - PCI_EXP_SLTCTL_PWR_IND_ON); -} - -void pciehp_green_led_off(struct controller *ctrl) -{ - if (!PWR_LED(ctrl)) - return; - - pcie_write_cmd_nowait(ctrl, PCI_EXP_SLTCTL_PWR_IND_OFF, - PCI_EXP_SLTCTL_PIC); - ctrl_dbg(ctrl, "%s: SLOTCTRL %x write cmd %x\n", __func__, - pci_pcie_cap(ctrl->pcie->port) + PCI_EXP_SLTCTL, - PCI_EXP_SLTCTL_PWR_IND_OFF); -} - -void pciehp_green_led_blink(struct controller *ctrl) -{ - if (!PWR_LED(ctrl)) - return; - - pcie_write_cmd_nowait(ctrl, PCI_EXP_SLTCTL_PWR_IND_BLINK, - PCI_EXP_SLTCTL_PIC); - ctrl_dbg(ctrl, "%s: SLOTCTRL %x write cmd %x\n", __func__, - pci_pcie_cap(ctrl->pcie->port) + PCI_EXP_SLTCTL, - PCI_EXP_SLTCTL_PWR_IND_BLINK); -} - int pciehp_power_on_slot(struct controller *ctrl) { struct pci_dev *pdev = ctrl_dev(ctrl); From 4a06c2c38235d4a0bc436131591e2927288992fe Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 5 Sep 2019 15:52:24 -0500 Subject: [PATCH 275/721] PCI: pciehp: Refer to "Indicators" instead of "LEDs" in comments The PCIe spec doesn't mention "green LEDs" or "amber LEDs". Replace those terms with "Power Indicator" and "Attention Indicator" so the comments match the spec language. Comment changes only. Signed-off-by: Bjorn Helgaas --- drivers/pci/hotplug/pciehp.h | 4 ++-- drivers/pci/hotplug/pciehp_ctrl.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/pci/hotplug/pciehp.h b/drivers/pci/hotplug/pciehp.h index 01706448b04a..654c972b8ea0 100644 --- a/drivers/pci/hotplug/pciehp.h +++ b/drivers/pci/hotplug/pciehp.h @@ -110,9 +110,9 @@ struct controller { * * @OFF_STATE: slot is powered off, no subordinate devices are enumerated * @BLINKINGON_STATE: slot will be powered on after the 5 second delay, - * green led is blinking + * Power Indicator is blinking * @BLINKINGOFF_STATE: slot will be powered off after the 5 second delay, - * green led is blinking + * Power Indicator is blinking * @POWERON_STATE: slot is currently powering on * @POWEROFF_STATE: slot is currently powering off * @ON_STATE: slot is powered on, subordinate devices have been enumerated diff --git a/drivers/pci/hotplug/pciehp_ctrl.c b/drivers/pci/hotplug/pciehp_ctrl.c index 71141abfba70..21af7b16d7a4 100644 --- a/drivers/pci/hotplug/pciehp_ctrl.c +++ b/drivers/pci/hotplug/pciehp_ctrl.c @@ -104,7 +104,7 @@ static int board_added(struct controller *ctrl) } /** - * remove_board - Turns off slot and LEDs + * remove_board - Turn off slot and Power Indicator * @ctrl: PCIe hotplug controller where board is being removed * @safe_removal: whether the board is safely removed (versus surprise removed) */ From 273b177cac4b649c3c6d448e85bbc64cebfe7a0a Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 22 Aug 2019 10:10:12 -0600 Subject: [PATCH 276/721] PCI: Move pci_[get|set]_resource_alignment_param() into their callers Both the functions pci_get_resource_alignment_param() and pci_set_resource_alignment_param() are now only called in one place: resource_alignment_show() and resource_alignment_store() respectively. There is no value in this extra set of functions so move both into their callers respectively. [bhelgaas: fold in "GFP_KERNEL while atomic" fix from Christoph Hellwig https://lore.kernel.org/r/20190902075006.GB754@infradead.org] Link: https://lore.kernel.org/r/20190822161013.5481-3-logang@deltatee.com Signed-off-by: Logan Gunthorpe Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.c | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index fbfb64ba447d..ff3abc577a25 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -6117,39 +6117,31 @@ void pci_reassigndev_resource_alignment(struct pci_dev *dev) } } -static ssize_t pci_set_resource_alignment_param(const char *buf, size_t count) -{ - spin_lock(&resource_alignment_lock); - - kfree(resource_alignment_param); - resource_alignment_param = kstrndup(buf, count, GFP_KERNEL); - - spin_unlock(&resource_alignment_lock); - - return resource_alignment_param ? count : -ENOMEM; -} - -static ssize_t pci_get_resource_alignment_param(char *buf, size_t size) +static ssize_t resource_alignment_show(struct bus_type *bus, char *buf) { size_t count = 0; spin_lock(&resource_alignment_lock); if (resource_alignment_param) - count = snprintf(buf, size, "%s", resource_alignment_param); + count = snprintf(buf, PAGE_SIZE, "%s", resource_alignment_param); spin_unlock(&resource_alignment_lock); return count; } -static ssize_t resource_alignment_show(struct bus_type *bus, char *buf) -{ - return pci_get_resource_alignment_param(buf, PAGE_SIZE); -} - static ssize_t resource_alignment_store(struct bus_type *bus, const char *buf, size_t count) { - return pci_set_resource_alignment_param(buf, count); + char *param = kstrndup(buf, count, GFP_KERNEL); + + if (!param) + return -ENOMEM; + + spin_lock(&resource_alignment_lock); + kfree(resource_alignment_param); + resource_alignment_param = param; + spin_unlock(&resource_alignment_lock); + return count; } static BUS_ATTR_RW(resource_alignment); From e499081da1a247a8a5a234211c7127fb0b91ca92 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 22 Aug 2019 10:10:13 -0600 Subject: [PATCH 277/721] PCI: Force trailing new line to resource_alignment_param in sysfs When 'pci=resource_alignment=' is specified on the command line, there is no trailing new line. Then, when it's read through the corresponding sysfs attribute, there will be no newline and a cat command will not show correctly in a shell. If the parameter is set through sysfs a new line will be stored and it will 'cat' correctly. To solve this, append a new line character in the show function if one does not already exist. Link: https://lore.kernel.org/r/20190822161013.5481-4-logang@deltatee.com Signed-off-by: Logan Gunthorpe Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index ff3abc577a25..5f70ff1031d2 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -6126,6 +6126,16 @@ static ssize_t resource_alignment_show(struct bus_type *bus, char *buf) count = snprintf(buf, PAGE_SIZE, "%s", resource_alignment_param); spin_unlock(&resource_alignment_lock); + /* + * When set by the command line, resource_alignment_param will not + * have a trailing line feed, which is ugly. So conditionally add + * it here. + */ + if (count >= 2 && buf[count - 2] != '\n' && count < PAGE_SIZE - 1) { + buf[count - 1] = '\n'; + buf[count++] = 0; + } + return count; } From 46b2c32df7a462d0e64b68c513e5c4c1b2a399a7 Mon Sep 17 00:00:00 2001 From: Abhinav Ratna Date: Tue, 20 Aug 2019 10:09:45 +0530 Subject: [PATCH 278/721] PCI: Add ACS quirk for iProc PAXB iProc PAXB Root Ports don't advertise an ACS capability, but they do not allow peer-to-peer transactions between Root Ports. Add an ACS quirk so each Root Port can be in a separate IOMMU group. [bhelgaas: commit log, comment, use common implementation style] Link: https://lore.kernel.org/r/1566275985-25670-1-git-send-email-srinath.mannam@broadcom.com Signed-off-by: Abhinav Ratna Signed-off-by: Srinath Mannam Signed-off-by: Bjorn Helgaas Acked-by: Scott Branden --- drivers/pci/quirks.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 208aacf39329..2edbce35e8c5 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -4466,6 +4466,19 @@ static int pci_quirk_mf_endpoint_acs(struct pci_dev *dev, u16 acs_flags) return acs_flags ? 0 : 1; } +static int pci_quirk_brcm_acs(struct pci_dev *dev, u16 acs_flags) +{ + /* + * iProc PAXB Root Ports don't advertise an ACS capability, but + * they do not allow peer-to-peer transactions between Root Ports. + * Allow each Root Port to be in a separate IOMMU group by masking + * SV/RR/CR/UF bits. + */ + acs_flags &= ~(PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF); + + return acs_flags ? 0 : 1; +} + static const struct pci_dev_acs_enabled { u16 vendor; u16 device; @@ -4559,6 +4572,7 @@ static const struct pci_dev_acs_enabled { { PCI_VENDOR_ID_AMPERE, 0xE00A, pci_quirk_xgene_acs }, { PCI_VENDOR_ID_AMPERE, 0xE00B, pci_quirk_xgene_acs }, { PCI_VENDOR_ID_AMPERE, 0xE00C, pci_quirk_xgene_acs }, + { PCI_VENDOR_ID_BROADCOM, 0xD714, pci_quirk_brcm_acs }, { 0 } }; From e6fa0dc86734f99b037b36b8682133efc2b6e16b Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Mon, 2 Sep 2019 14:09:58 +0530 Subject: [PATCH 279/721] swiotlb-xen: Convert to use macro Rather than using static int max_dma_bits, this can be coverted to use as macro. Signed-off-by: Souptick Joarder Reviewed-by: Juergen Gross Signed-off-by: Boris Ostrovsky --- drivers/xen/swiotlb-xen.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index ae1df496bf38..d1eced53bdda 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -38,6 +38,7 @@ #include #include +#define MAX_DMA_BITS 32 /* * Used to do a quick range check in swiotlb_tbl_unmap_single and * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this @@ -114,8 +115,6 @@ static int is_xen_swiotlb_buffer(dma_addr_t dma_addr) return 0; } -static int max_dma_bits = 32; - static int xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs) { @@ -135,7 +134,7 @@ xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs) p + (i << IO_TLB_SHIFT), get_order(slabs << IO_TLB_SHIFT), dma_bits, &dma_handle); - } while (rc && dma_bits++ < max_dma_bits); + } while (rc && dma_bits++ < MAX_DMA_BITS); if (rc) return rc; From dba61cda30469a6c4fed0f8d5bf2b6001ca80a51 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Thu, 5 Sep 2019 23:01:15 +0000 Subject: [PATCH 280/721] Drivers: hv: vmbus: Break out synic enable and disable operations Break out synic enable and disable operations into separate hv_synic_disable_regs() and hv_synic_enable_regs() functions for use by a later patch to support hibernation. There is no functional change except the unnecessary check "if (sctrl.enable != 1) return -EFAULT;" which is removed, because when we're in hv_synic_cleanup(), we're absolutely sure sctrl.enable must be 1. Signed-off-by: Dexuan Cui Reviewed-by: Michael Kelley Signed-off-by: Sasha Levin --- drivers/hv/hv.c | 66 ++++++++++++++++++++++----------------- drivers/hv/hyperv_vmbus.h | 2 ++ 2 files changed, 39 insertions(+), 29 deletions(-) diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c index 6188fb7dda42..fcc52797c169 100644 --- a/drivers/hv/hv.c +++ b/drivers/hv/hv.c @@ -154,7 +154,7 @@ void hv_synic_free(void) * retrieve the initialized message and event pages. Otherwise, we create and * initialize the message and event pages. */ -int hv_synic_init(unsigned int cpu) +void hv_synic_enable_regs(unsigned int cpu) { struct hv_per_cpu_context *hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu); @@ -196,6 +196,11 @@ int hv_synic_init(unsigned int cpu) sctrl.enable = 1; hv_set_synic_state(sctrl.as_uint64); +} + +int hv_synic_init(unsigned int cpu) +{ + hv_synic_enable_regs(cpu); hv_stimer_init(cpu); @@ -205,20 +210,45 @@ int hv_synic_init(unsigned int cpu) /* * hv_synic_cleanup - Cleanup routine for hv_synic_init(). */ -int hv_synic_cleanup(unsigned int cpu) +void hv_synic_disable_regs(unsigned int cpu) { union hv_synic_sint shared_sint; union hv_synic_simp simp; union hv_synic_siefp siefp; union hv_synic_scontrol sctrl; + + hv_get_synint_state(VMBUS_MESSAGE_SINT, shared_sint.as_uint64); + + shared_sint.masked = 1; + + /* Need to correctly cleanup in the case of SMP!!! */ + /* Disable the interrupt */ + hv_set_synint_state(VMBUS_MESSAGE_SINT, shared_sint.as_uint64); + + hv_get_simp(simp.as_uint64); + simp.simp_enabled = 0; + simp.base_simp_gpa = 0; + + hv_set_simp(simp.as_uint64); + + hv_get_siefp(siefp.as_uint64); + siefp.siefp_enabled = 0; + siefp.base_siefp_gpa = 0; + + hv_set_siefp(siefp.as_uint64); + + /* Disable the global synic bit */ + hv_get_synic_state(sctrl.as_uint64); + sctrl.enable = 0; + hv_set_synic_state(sctrl.as_uint64); +} + +int hv_synic_cleanup(unsigned int cpu) +{ struct vmbus_channel *channel, *sc; bool channel_found = false; unsigned long flags; - hv_get_synic_state(sctrl.as_uint64); - if (sctrl.enable != 1) - return -EFAULT; - /* * Search for channels which are bound to the CPU we're about to * cleanup. In case we find one and vmbus is still connected we need to @@ -249,29 +279,7 @@ int hv_synic_cleanup(unsigned int cpu) hv_stimer_cleanup(cpu); - hv_get_synint_state(VMBUS_MESSAGE_SINT, shared_sint.as_uint64); - - shared_sint.masked = 1; - - /* Need to correctly cleanup in the case of SMP!!! */ - /* Disable the interrupt */ - hv_set_synint_state(VMBUS_MESSAGE_SINT, shared_sint.as_uint64); - - hv_get_simp(simp.as_uint64); - simp.simp_enabled = 0; - simp.base_simp_gpa = 0; - - hv_set_simp(simp.as_uint64); - - hv_get_siefp(siefp.as_uint64); - siefp.siefp_enabled = 0; - siefp.base_siefp_gpa = 0; - - hv_set_siefp(siefp.as_uint64); - - /* Disable the global synic bit */ - sctrl.enable = 0; - hv_set_synic_state(sctrl.as_uint64); + hv_synic_disable_regs(cpu); return 0; } diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 362e70e9d145..26ea161a6876 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -171,8 +171,10 @@ extern int hv_synic_alloc(void); extern void hv_synic_free(void); +extern void hv_synic_enable_regs(unsigned int cpu); extern int hv_synic_init(unsigned int cpu); +extern void hv_synic_disable_regs(unsigned int cpu); extern int hv_synic_cleanup(unsigned int cpu); /* Interface */ From 63ecc6d22ce46643165c391a9c90ba67e22e1c0f Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Thu, 5 Sep 2019 23:01:16 +0000 Subject: [PATCH 281/721] Drivers: hv: vmbus: Suspend/resume the synic for hibernation This is needed when we resume the old kernel from the "current" kernel. Note: when hv_synic_suspend() and hv_synic_resume() run, all the non-boot CPUs have been offlined, and interrupts are disabled on CPU0. Signed-off-by: Dexuan Cui Reviewed-by: Michael Kelley Signed-off-by: Sasha Levin --- drivers/hv/vmbus_drv.c | 46 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index ebd35fc35290..2ef375ce58ac 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include "hyperv_vmbus.h" @@ -2086,6 +2087,47 @@ static void hv_crash_handler(struct pt_regs *regs) hyperv_cleanup(); }; +static int hv_synic_suspend(void) +{ + /* + * When we reach here, all the non-boot CPUs have been offlined, and + * the stimers on them have been unbound in hv_synic_cleanup() -> + * hv_stimer_cleanup() -> clockevents_unbind_device(). + * + * hv_synic_suspend() only runs on CPU0 with interrupts disabled. Here + * we do not unbind the stimer on CPU0 because: 1) it's unnecessary + * because the interrupts remain disabled between syscore_suspend() + * and syscore_resume(): see create_image() and resume_target_kernel(); + * 2) the stimer on CPU0 is automatically disabled later by + * syscore_suspend() -> timekeeping_suspend() -> tick_suspend() -> ... + * -> clockevents_shutdown() -> ... -> hv_ce_shutdown(); 3) a warning + * would be triggered if we call clockevents_unbind_device(), which + * may sleep, in an interrupts-disabled context. So, we intentionally + * don't call hv_stimer_cleanup(0) here. + */ + + hv_synic_disable_regs(0); + + return 0; +} + +static void hv_synic_resume(void) +{ + hv_synic_enable_regs(0); + + /* + * Note: we don't need to call hv_stimer_init(0), because the timer + * on CPU0 is not unbound in hv_synic_suspend(), and the timer is + * automatically re-enabled in timekeeping_resume(). + */ +} + +/* The callbacks run only on CPU0, with irqs_disabled. */ +static struct syscore_ops hv_synic_syscore_ops = { + .suspend = hv_synic_suspend, + .resume = hv_synic_resume, +}; + static int __init hv_acpi_init(void) { int ret, t; @@ -2116,6 +2158,8 @@ static int __init hv_acpi_init(void) hv_setup_kexec_handler(hv_kexec_handler); hv_setup_crash_handler(hv_crash_handler); + register_syscore_ops(&hv_synic_syscore_ops); + return 0; cleanup: @@ -2128,6 +2172,8 @@ static void __exit vmbus_exit(void) { int cpu; + unregister_syscore_ops(&hv_synic_syscore_ops); + hv_remove_kexec_handler(); hv_remove_crash_handler(); vmbus_connection.conn_state = DISCONNECTED; From ed56ef675ae6ef0e6f7d42b9c42dae61172f0960 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Thu, 5 Sep 2019 23:01:16 +0000 Subject: [PATCH 282/721] Drivers: hv: vmbus: Add a helper function is_sub_channel() The existing method of telling if a channel is sub-channel in vmbus_process_offer() is cumbersome. This new simple helper function is preferred in future. Signed-off-by: Dexuan Cui Reviewed-by: Michael Kelley Signed-off-by: Sasha Levin --- include/linux/hyperv.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 6256cc34c4a6..2d39248cff96 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -245,7 +245,10 @@ struct vmbus_channel_offer { } pipe; } u; /* - * The sub_channel_index is defined in win8. + * The sub_channel_index is defined in Win8: a value of zero means a + * primary channel and a value of non-zero means a sub-channel. + * + * Before Win8, the field is reserved, meaning it's always zero. */ u16 sub_channel_index; u16 reserved3; @@ -934,6 +937,11 @@ static inline bool is_hvsock_channel(const struct vmbus_channel *c) VMBUS_CHANNEL_TLNPI_PROVIDER_OFFER); } +static inline bool is_sub_channel(const struct vmbus_channel *c) +{ + return c->offermsg.offer.sub_channel_index != 0; +} + static inline void set_channel_affinity_state(struct vmbus_channel *c, enum hv_numa_policy policy) { From 271b2224d42f88870e6b060924ee374871c131fc Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Thu, 5 Sep 2019 23:01:17 +0000 Subject: [PATCH 283/721] Drivers: hv: vmbus: Implement suspend/resume for VSC drivers for hibernation The high-level VSC drivers will implement device-specific callbacks. Signed-off-by: Dexuan Cui Reviewed-by: Michael Kelley Signed-off-by: Sasha Levin --- drivers/hv/vmbus_drv.c | 46 ++++++++++++++++++++++++++++++++++++++++++ include/linux/hyperv.h | 3 +++ 2 files changed, 49 insertions(+) diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 2ef375ce58ac..a30c70adf9a0 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -911,6 +911,43 @@ static void vmbus_shutdown(struct device *child_device) drv->shutdown(dev); } +/* + * vmbus_suspend - Suspend a vmbus device + */ +static int vmbus_suspend(struct device *child_device) +{ + struct hv_driver *drv; + struct hv_device *dev = device_to_hv_device(child_device); + + /* The device may not be attached yet */ + if (!child_device->driver) + return 0; + + drv = drv_to_hv_drv(child_device->driver); + if (!drv->suspend) + return -EOPNOTSUPP; + + return drv->suspend(dev); +} + +/* + * vmbus_resume - Resume a vmbus device + */ +static int vmbus_resume(struct device *child_device) +{ + struct hv_driver *drv; + struct hv_device *dev = device_to_hv_device(child_device); + + /* The device may not be attached yet */ + if (!child_device->driver) + return 0; + + drv = drv_to_hv_drv(child_device->driver); + if (!drv->resume) + return -EOPNOTSUPP; + + return drv->resume(dev); +} /* * vmbus_device_release - Final callback release of the vmbus child device @@ -926,6 +963,14 @@ static void vmbus_device_release(struct device *device) kfree(hv_dev); } +/* + * Note: we must use SET_NOIRQ_SYSTEM_SLEEP_PM_OPS rather than + * SET_SYSTEM_SLEEP_PM_OPS: see the comment before vmbus_bus_pm. + */ +static const struct dev_pm_ops vmbus_pm = { + SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(vmbus_suspend, vmbus_resume) +}; + /* The one and only one */ static struct bus_type hv_bus = { .name = "vmbus", @@ -936,6 +981,7 @@ static struct bus_type hv_bus = { .uevent = vmbus_uevent, .dev_groups = vmbus_dev_groups, .drv_groups = vmbus_drv_groups, + .pm = &vmbus_pm, }; struct onmessage_work_context { diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 2d39248cff96..8a60e7766037 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1157,6 +1157,9 @@ struct hv_driver { int (*remove)(struct hv_device *); void (*shutdown)(struct hv_device *); + int (*suspend)(struct hv_device *); + int (*resume)(struct hv_device *); + }; /* Base device object */ From e3ede02add7e6df315afc6b4120520f7d0c5a258 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Thu, 5 Sep 2019 23:01:18 +0000 Subject: [PATCH 284/721] Drivers: hv: vmbus: Ignore the offers when resuming from hibernation When the VM resumes, the host re-sends the offers. We should not add the offers to the global vmbus_connection.chn_list again. This patch assumes the RELIDs of the channels don't change across hibernation. Actually this is not always true, especially in the case of NIC SR-IOV the VF vmbus device's RELID sometimes can change. A later patch will address this issue by mapping the new offers to the old channels and fixing up the old channels, if necessary. Signed-off-by: Dexuan Cui Reviewed-by: Michael Kelley Signed-off-by: Sasha Levin --- drivers/hv/channel_mgmt.c | 58 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index addcef50df7a..44b92fa1b7fa 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -847,6 +847,37 @@ void vmbus_initiate_unload(bool crash) vmbus_wait_for_unload(); } +/* + * find_primary_channel_by_offer - Get the channel object given the new offer. + * This is only used in the resume path of hibernation. + */ +static struct vmbus_channel * +find_primary_channel_by_offer(const struct vmbus_channel_offer_channel *offer) +{ + struct vmbus_channel *channel = NULL, *iter; + const guid_t *inst1, *inst2; + + /* Ignore sub-channel offers. */ + if (offer->offer.sub_channel_index != 0) + return NULL; + + mutex_lock(&vmbus_connection.channel_mutex); + + list_for_each_entry(iter, &vmbus_connection.chn_list, listentry) { + inst1 = &iter->offermsg.offer.if_instance; + inst2 = &offer->offer.if_instance; + + if (guid_equal(inst1, inst2)) { + channel = iter; + break; + } + } + + mutex_unlock(&vmbus_connection.channel_mutex); + + return channel; +} + /* * vmbus_onoffer - Handler for channel offers from vmbus in parent partition. * @@ -854,12 +885,37 @@ void vmbus_initiate_unload(bool crash) static void vmbus_onoffer(struct vmbus_channel_message_header *hdr) { struct vmbus_channel_offer_channel *offer; - struct vmbus_channel *newchannel; + struct vmbus_channel *oldchannel, *newchannel; + size_t offer_sz; offer = (struct vmbus_channel_offer_channel *)hdr; trace_vmbus_onoffer(offer); + oldchannel = find_primary_channel_by_offer(offer); + + if (oldchannel != NULL) { + atomic_dec(&vmbus_connection.offer_in_progress); + + /* + * We're resuming from hibernation: we expect the host to send + * exactly the same offers that we had before the hibernation. + */ + offer_sz = sizeof(*offer); + if (memcmp(offer, &oldchannel->offermsg, offer_sz) == 0) + return; + + pr_debug("Mismatched offer from the host (relid=%d)\n", + offer->child_relid); + + print_hex_dump_debug("Old vmbus offer: ", DUMP_PREFIX_OFFSET, + 16, 4, &oldchannel->offermsg, offer_sz, + false); + print_hex_dump_debug("New vmbus offer: ", DUMP_PREFIX_OFFSET, + 16, 4, offer, offer_sz, false); + return; + } + /* Allocate the channel object and save this offer. */ newchannel = alloc_channel(); if (!newchannel) { From f53335e3289f9ac3a6a8faf6c2f819eee508bd39 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Thu, 5 Sep 2019 23:01:19 +0000 Subject: [PATCH 285/721] Drivers: hv: vmbus: Suspend/resume the vmbus itself for hibernation Before Linux enters hibernation, it sends the CHANNELMSG_UNLOAD message to the host so all the offers are gone. After hibernation, Linux needs to re-negotiate with the host using the same vmbus protocol version (which was in use before hibernation), and ask the host to re-offer the vmbus devices. Signed-off-by: Dexuan Cui Reviewed-by: Michael Kelley Signed-off-by: Sasha Levin --- drivers/hv/connection.c | 3 +- drivers/hv/hyperv_vmbus.h | 2 ++ drivers/hv/vmbus_drv.c | 59 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 62 insertions(+), 2 deletions(-) diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c index 09829e15d4a0..806319cd5ccf 100644 --- a/drivers/hv/connection.c +++ b/drivers/hv/connection.c @@ -59,8 +59,7 @@ static __u32 vmbus_get_next_version(__u32 current_version) } } -static int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo, - __u32 version) +int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo, u32 version) { int ret = 0; unsigned int cur_cpu; diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 26ea161a6876..e657197a027a 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -274,6 +274,8 @@ struct vmbus_msginfo { extern struct vmbus_connection vmbus_connection; +int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo, u32 version); + static inline void vmbus_send_interrupt(u32 relid) { sync_set_bit(relid, vmbus_connection.send_int_page); diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index a30c70adf9a0..ce9974bf683f 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -2089,6 +2089,51 @@ static int vmbus_acpi_add(struct acpi_device *device) return ret_val; } +static int vmbus_bus_suspend(struct device *dev) +{ + vmbus_initiate_unload(false); + + vmbus_connection.conn_state = DISCONNECTED; + + return 0; +} + +static int vmbus_bus_resume(struct device *dev) +{ + struct vmbus_channel_msginfo *msginfo; + size_t msgsize; + int ret; + + /* + * We only use the 'vmbus_proto_version', which was in use before + * hibernation, to re-negotiate with the host. + */ + if (vmbus_proto_version == VERSION_INVAL || + vmbus_proto_version == 0) { + pr_err("Invalid proto version = 0x%x\n", vmbus_proto_version); + return -EINVAL; + } + + msgsize = sizeof(*msginfo) + + sizeof(struct vmbus_channel_initiate_contact); + + msginfo = kzalloc(msgsize, GFP_KERNEL); + + if (msginfo == NULL) + return -ENOMEM; + + ret = vmbus_negotiate_version(msginfo, vmbus_proto_version); + + kfree(msginfo); + + if (ret != 0) + return ret; + + vmbus_request_offers(); + + return 0; +} + static const struct acpi_device_id vmbus_acpi_device_ids[] = { {"VMBUS", 0}, {"VMBus", 0}, @@ -2096,6 +2141,19 @@ static const struct acpi_device_id vmbus_acpi_device_ids[] = { }; MODULE_DEVICE_TABLE(acpi, vmbus_acpi_device_ids); +/* + * Note: we must use SET_NOIRQ_SYSTEM_SLEEP_PM_OPS rather than + * SET_SYSTEM_SLEEP_PM_OPS, otherwise NIC SR-IOV can not work, because the + * "pci_dev_pm_ops" uses the "noirq" callbacks: in the resume path, the + * pci "noirq" restore callback runs before "non-noirq" callbacks (see + * resume_target_kernel() -> dpm_resume_start(), and hibernation_restore() -> + * dpm_resume_end()). This means vmbus_bus_resume() and the pci-hyperv's + * resume callback must also run via the "noirq" callbacks. + */ +static const struct dev_pm_ops vmbus_bus_pm = { + SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(vmbus_bus_suspend, vmbus_bus_resume) +}; + static struct acpi_driver vmbus_acpi_driver = { .name = "vmbus", .ids = vmbus_acpi_device_ids, @@ -2103,6 +2161,7 @@ static struct acpi_driver vmbus_acpi_driver = { .add = vmbus_acpi_add, .remove = vmbus_acpi_remove, }, + .drv.pm = &vmbus_bus_pm, }; static void hv_kexec_handler(void) From 1f48dcf180e5422b1a633b24680dd0f5c3f540f5 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Thu, 5 Sep 2019 23:01:20 +0000 Subject: [PATCH 286/721] Drivers: hv: vmbus: Clean up hv_sock channels by force upon suspend Fake RESCIND_CHANNEL messages to clean up hv_sock channels by force for hibernation. There is no better method to clean up the channels since some of the channels may still be referenced by the userspace apps when hibernation is triggered: in this case, with this patch, the "rescind" fields of the channels are set, and the apps will thoroughly destroy the channels after hibernation. Signed-off-by: Dexuan Cui Reviewed-by: Michael Kelley Signed-off-by: Sasha Levin --- drivers/hv/vmbus_drv.c | 55 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index ce9974bf683f..45b976ec6e79 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -24,6 +24,7 @@ #include #include +#include #include #include #include @@ -1069,6 +1070,41 @@ void vmbus_on_msg_dpc(unsigned long data) vmbus_signal_eom(msg, message_type); } +/* + * Fake RESCIND_CHANNEL messages to clean up hv_sock channels by force for + * hibernation, because hv_sock connections can not persist across hibernation. + */ +static void vmbus_force_channel_rescinded(struct vmbus_channel *channel) +{ + struct onmessage_work_context *ctx; + struct vmbus_channel_rescind_offer *rescind; + + WARN_ON(!is_hvsock_channel(channel)); + + /* + * sizeof(*ctx) is small and the allocation should really not fail, + * otherwise the state of the hv_sock connections ends up in limbo. + */ + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL | __GFP_NOFAIL); + + /* + * So far, these are not really used by Linux. Just set them to the + * reasonable values conforming to the definitions of the fields. + */ + ctx->msg.header.message_type = 1; + ctx->msg.header.payload_size = sizeof(*rescind); + + /* These values are actually used by Linux. */ + rescind = (struct vmbus_channel_rescind_offer *)ctx->msg.u.payload; + rescind->header.msgtype = CHANNELMSG_RESCIND_CHANNELOFFER; + rescind->child_relid = channel->offermsg.child_relid; + + INIT_WORK(&ctx->work, vmbus_onmessage_work); + + queue_work_on(vmbus_connection.connect_cpu, + vmbus_connection.work_queue, + &ctx->work); +} /* * Direct callback for channels using other deferred processing @@ -2091,6 +2127,25 @@ static int vmbus_acpi_add(struct acpi_device *device) static int vmbus_bus_suspend(struct device *dev) { + struct vmbus_channel *channel; + + while (atomic_read(&vmbus_connection.offer_in_progress) != 0) { + /* + * We wait here until the completion of any channel + * offers that are currently in progress. + */ + msleep(1); + } + + mutex_lock(&vmbus_connection.channel_mutex); + list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { + if (!is_hvsock_channel(channel)) + continue; + + vmbus_force_channel_rescinded(channel); + } + mutex_unlock(&vmbus_connection.channel_mutex); + vmbus_initiate_unload(false); vmbus_connection.conn_state = DISCONNECTED; From b307b38962eb0f22d1aa6dcf53cb7d3c2ed5eec7 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Thu, 5 Sep 2019 23:01:21 +0000 Subject: [PATCH 287/721] Drivers: hv: vmbus: Suspend after cleaning up hv_sock and sub channels Before suspend, Linux must make sure all the hv_sock channels have been properly cleaned up, because a hv_sock connection can not persist across hibernation, and the user-space app must be properly notified of the state change of the connection. Before suspend, Linux also must make sure all the sub-channels have been destroyed, i.e. the related channel structs of the sub-channels must be properly removed, otherwise they would cause a conflict when the sub-channels are recreated upon resume. Add a counter to track such channels, and vmbus_bus_suspend() should wait for the counter to drop to zero. Signed-off-by: Dexuan Cui Reviewed-by: Michael Kelley Signed-off-by: Sasha Levin --- drivers/hv/channel_mgmt.c | 26 +++++++++++++++++++++++ drivers/hv/connection.c | 3 +++ drivers/hv/hyperv_vmbus.h | 12 +++++++++++ drivers/hv/vmbus_drv.c | 44 ++++++++++++++++++++++++++++++++++++++- 4 files changed, 84 insertions(+), 1 deletion(-) diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 44b92fa1b7fa..5518d031f62a 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -545,6 +545,10 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) mutex_lock(&vmbus_connection.channel_mutex); + /* Remember the channels that should be cleaned up upon suspend. */ + if (is_hvsock_channel(newchannel) || is_sub_channel(newchannel)) + atomic_inc(&vmbus_connection.nr_chan_close_on_suspend); + /* * Now that we have acquired the channel_mutex, * we can release the potentially racing rescind thread. @@ -944,6 +948,16 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr) vmbus_process_offer(newchannel); } +static void check_ready_for_suspend_event(void) +{ + /* + * If all the sub-channels or hv_sock channels have been cleaned up, + * then it's safe to suspend. + */ + if (atomic_dec_and_test(&vmbus_connection.nr_chan_close_on_suspend)) + complete(&vmbus_connection.ready_for_suspend_event); +} + /* * vmbus_onoffer_rescind - Rescind offer handler. * @@ -954,6 +968,7 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr) struct vmbus_channel_rescind_offer *rescind; struct vmbus_channel *channel; struct device *dev; + bool clean_up_chan_for_suspend; rescind = (struct vmbus_channel_rescind_offer *)hdr; @@ -993,6 +1008,8 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr) return; } + clean_up_chan_for_suspend = is_hvsock_channel(channel) || + is_sub_channel(channel); /* * Before setting channel->rescind in vmbus_rescind_cleanup(), we * should make sure the channel callback is not running any more. @@ -1018,6 +1035,10 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr) if (channel->device_obj) { if (channel->chn_rescind_callback) { channel->chn_rescind_callback(channel); + + if (clean_up_chan_for_suspend) + check_ready_for_suspend_event(); + return; } /* @@ -1050,6 +1071,11 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr) } mutex_unlock(&vmbus_connection.channel_mutex); } + + /* The "channel" may have been freed. Do not access it any longer. */ + + if (clean_up_chan_for_suspend) + check_ready_for_suspend_event(); } void vmbus_hvsock_device_unregister(struct vmbus_channel *channel) diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c index 806319cd5ccf..99851ea682eb 100644 --- a/drivers/hv/connection.c +++ b/drivers/hv/connection.c @@ -26,6 +26,9 @@ struct vmbus_connection vmbus_connection = { .conn_state = DISCONNECTED, .next_gpadl_handle = ATOMIC_INIT(0xE1E10), + + .ready_for_suspend_event= COMPLETION_INITIALIZER( + vmbus_connection.ready_for_suspend_event), }; EXPORT_SYMBOL_GPL(vmbus_connection); diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index e657197a027a..974b747ca1fc 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -260,6 +260,18 @@ struct vmbus_connection { struct workqueue_struct *work_queue; struct workqueue_struct *handle_primary_chan_wq; struct workqueue_struct *handle_sub_chan_wq; + + /* + * The number of sub-channels and hv_sock channels that should be + * cleaned up upon suspend: sub-channels will be re-created upon + * resume, and hv_sock channels should not survive suspend. + */ + atomic_t nr_chan_close_on_suspend; + /* + * vmbus_bus_suspend() waits for "nr_chan_close_on_suspend" to + * drop to zero. + */ + struct completion ready_for_suspend_event; }; diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 45b976ec6e79..32ec951d334f 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -2127,7 +2127,8 @@ static int vmbus_acpi_add(struct acpi_device *device) static int vmbus_bus_suspend(struct device *dev) { - struct vmbus_channel *channel; + struct vmbus_channel *channel, *sc; + unsigned long flags; while (atomic_read(&vmbus_connection.offer_in_progress) != 0) { /* @@ -2146,6 +2147,44 @@ static int vmbus_bus_suspend(struct device *dev) } mutex_unlock(&vmbus_connection.channel_mutex); + /* + * Wait until all the sub-channels and hv_sock channels have been + * cleaned up. Sub-channels should be destroyed upon suspend, otherwise + * they would conflict with the new sub-channels that will be created + * in the resume path. hv_sock channels should also be destroyed, but + * a hv_sock channel of an established hv_sock connection can not be + * really destroyed since it may still be referenced by the userspace + * application, so we just force the hv_sock channel to be rescinded + * by vmbus_force_channel_rescinded(), and the userspace application + * will thoroughly destroy the channel after hibernation. + * + * Note: the counter nr_chan_close_on_suspend may never go above 0 if + * the VM has no sub-channel and hv_sock channel, e.g. a 1-vCPU VM. + */ + if (atomic_read(&vmbus_connection.nr_chan_close_on_suspend) > 0) + wait_for_completion(&vmbus_connection.ready_for_suspend_event); + + mutex_lock(&vmbus_connection.channel_mutex); + + list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { + if (is_hvsock_channel(channel)) { + if (!channel->rescind) { + pr_err("hv_sock channel not rescinded!\n"); + WARN_ON_ONCE(1); + } + continue; + } + + spin_lock_irqsave(&channel->lock, flags); + list_for_each_entry(sc, &channel->sc_list, sc_list) { + pr_err("Sub-channel not deleted!\n"); + WARN_ON_ONCE(1); + } + spin_unlock_irqrestore(&channel->lock, flags); + } + + mutex_unlock(&vmbus_connection.channel_mutex); + vmbus_initiate_unload(false); vmbus_connection.conn_state = DISCONNECTED; @@ -2186,6 +2225,9 @@ static int vmbus_bus_resume(struct device *dev) vmbus_request_offers(); + /* Reset the event for the next suspend. */ + reinit_completion(&vmbus_connection.ready_for_suspend_event); + return 0; } From d8bd2d442bb2688b428ac7164e5dc6d95d4fa65b Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Thu, 5 Sep 2019 23:01:22 +0000 Subject: [PATCH 288/721] Drivers: hv: vmbus: Resume after fixing up old primary channels When the host re-offers the primary channels upon resume, the host only guarantees the Instance GUID doesn't change, so vmbus_bus_suspend() should invalidate channel->offermsg.child_relid and figure out the number of primary channels that need to be fixed up upon resume. Upon resume, vmbus_onoffer() finds the old channel structs, and maps the new offers to the old channels, and fixes up the old structs, and finally the resume callbacks of the VSC drivers will re-open the channels. Signed-off-by: Dexuan Cui Reviewed-by: Michael Kelley Signed-off-by: Sasha Levin --- drivers/hv/channel_mgmt.c | 89 +++++++++++++++++++++++++++++---------- drivers/hv/connection.c | 2 + drivers/hv/hyperv_vmbus.h | 14 ++++++ drivers/hv/vmbus_drv.c | 17 ++++++++ include/linux/hyperv.h | 3 ++ 5 files changed, 103 insertions(+), 22 deletions(-) diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 5518d031f62a..8eb167540b4f 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -407,7 +407,15 @@ void hv_process_channel_removal(struct vmbus_channel *channel) cpumask_clear_cpu(channel->target_cpu, &primary_channel->alloced_cpus_in_node); - vmbus_release_relid(channel->offermsg.child_relid); + /* + * Upon suspend, an in-use hv_sock channel is marked as "rescinded" and + * the relid is invalidated; after hibernation, when the user-space app + * destroys the channel, the relid is INVALID_RELID, and in this case + * it's unnecessary and unsafe to release the old relid, since the same + * relid can refer to a completely different channel now. + */ + if (channel->offermsg.child_relid != INVALID_RELID) + vmbus_release_relid(channel->offermsg.child_relid); free_channel(channel); } @@ -851,6 +859,36 @@ void vmbus_initiate_unload(bool crash) vmbus_wait_for_unload(); } +static void check_ready_for_resume_event(void) +{ + /* + * If all the old primary channels have been fixed up, then it's safe + * to resume. + */ + if (atomic_dec_and_test(&vmbus_connection.nr_chan_fixup_on_resume)) + complete(&vmbus_connection.ready_for_resume_event); +} + +static void vmbus_setup_channel_state(struct vmbus_channel *channel, + struct vmbus_channel_offer_channel *offer) +{ + /* + * Setup state for signalling the host. + */ + channel->sig_event = VMBUS_EVENT_CONNECTION_ID; + + if (vmbus_proto_version != VERSION_WS2008) { + channel->is_dedicated_interrupt = + (offer->is_dedicated_interrupt != 0); + channel->sig_event = offer->connection_id; + } + + memcpy(&channel->offermsg, offer, + sizeof(struct vmbus_channel_offer_channel)); + channel->monitor_grp = (u8)offer->monitorid / 32; + channel->monitor_bit = (u8)offer->monitorid % 32; +} + /* * find_primary_channel_by_offer - Get the channel object given the new offer. * This is only used in the resume path of hibernation. @@ -902,14 +940,29 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr) atomic_dec(&vmbus_connection.offer_in_progress); /* - * We're resuming from hibernation: we expect the host to send - * exactly the same offers that we had before the hibernation. + * We're resuming from hibernation: all the sub-channel and + * hv_sock channels we had before the hibernation should have + * been cleaned up, and now we must be seeing a re-offered + * primary channel that we had before the hibernation. */ - offer_sz = sizeof(*offer); - if (memcmp(offer, &oldchannel->offermsg, offer_sz) == 0) - return; - pr_debug("Mismatched offer from the host (relid=%d)\n", + WARN_ON(oldchannel->offermsg.child_relid != INVALID_RELID); + /* Fix up the relid. */ + oldchannel->offermsg.child_relid = offer->child_relid; + + offer_sz = sizeof(*offer); + if (memcmp(offer, &oldchannel->offermsg, offer_sz) == 0) { + check_ready_for_resume_event(); + return; + } + + /* + * This is not an error, since the host can also change the + * other field(s) of the offer, e.g. on WS RS5 (Build 17763), + * the offer->connection_id of the Mellanox VF vmbus device + * can change when the host reoffers the device upon resume. + */ + pr_debug("vmbus offer changed: relid=%d\n", offer->child_relid); print_hex_dump_debug("Old vmbus offer: ", DUMP_PREFIX_OFFSET, @@ -917,6 +970,12 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr) false); print_hex_dump_debug("New vmbus offer: ", DUMP_PREFIX_OFFSET, 16, 4, offer, offer_sz, false); + + /* Fix up the old channel. */ + vmbus_setup_channel_state(oldchannel, offer); + + check_ready_for_resume_event(); + return; } @@ -929,21 +988,7 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr) return; } - /* - * Setup state for signalling the host. - */ - newchannel->sig_event = VMBUS_EVENT_CONNECTION_ID; - - if (vmbus_proto_version != VERSION_WS2008) { - newchannel->is_dedicated_interrupt = - (offer->is_dedicated_interrupt != 0); - newchannel->sig_event = offer->connection_id; - } - - memcpy(&newchannel->offermsg, offer, - sizeof(struct vmbus_channel_offer_channel)); - newchannel->monitor_grp = (u8)offer->monitorid / 32; - newchannel->monitor_bit = (u8)offer->monitorid % 32; + vmbus_setup_channel_state(newchannel, offer); vmbus_process_offer(newchannel); } diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c index 99851ea682eb..6e4c015783ff 100644 --- a/drivers/hv/connection.c +++ b/drivers/hv/connection.c @@ -29,6 +29,8 @@ struct vmbus_connection vmbus_connection = { .ready_for_suspend_event= COMPLETION_INITIALIZER( vmbus_connection.ready_for_suspend_event), + .ready_for_resume_event = COMPLETION_INITIALIZER( + vmbus_connection.ready_for_resume_event), }; EXPORT_SYMBOL_GPL(vmbus_connection); diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 974b747ca1fc..f7a5f5615f34 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -272,6 +272,20 @@ struct vmbus_connection { * drop to zero. */ struct completion ready_for_suspend_event; + + /* + * The number of primary channels that should be "fixed up" + * upon resume: these channels are re-offered upon resume, and some + * fields of the channel offers (i.e. child_relid and connection_id) + * can change, so the old offermsg must be fixed up, before the resume + * callbacks of the VSC drivers start to further touch the channels. + */ + atomic_t nr_chan_fixup_on_resume; + /* + * vmbus_bus_resume() waits for "nr_chan_fixup_on_resume" to + * drop to zero. + */ + struct completion ready_for_resume_event; }; diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 32ec951d334f..391f0b225c9a 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -2164,9 +2164,17 @@ static int vmbus_bus_suspend(struct device *dev) if (atomic_read(&vmbus_connection.nr_chan_close_on_suspend) > 0) wait_for_completion(&vmbus_connection.ready_for_suspend_event); + WARN_ON(atomic_read(&vmbus_connection.nr_chan_fixup_on_resume) != 0); + mutex_lock(&vmbus_connection.channel_mutex); list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { + /* + * Invalidate the field. Upon resume, vmbus_onoffer() will fix + * up the field, and the other fields (if necessary). + */ + channel->offermsg.child_relid = INVALID_RELID; + if (is_hvsock_channel(channel)) { if (!channel->rescind) { pr_err("hv_sock channel not rescinded!\n"); @@ -2181,6 +2189,8 @@ static int vmbus_bus_suspend(struct device *dev) WARN_ON_ONCE(1); } spin_unlock_irqrestore(&channel->lock, flags); + + atomic_inc(&vmbus_connection.nr_chan_fixup_on_resume); } mutex_unlock(&vmbus_connection.channel_mutex); @@ -2189,6 +2199,9 @@ static int vmbus_bus_suspend(struct device *dev) vmbus_connection.conn_state = DISCONNECTED; + /* Reset the event for the next resume. */ + reinit_completion(&vmbus_connection.ready_for_resume_event); + return 0; } @@ -2223,8 +2236,12 @@ static int vmbus_bus_resume(struct device *dev) if (ret != 0) return ret; + WARN_ON(atomic_read(&vmbus_connection.nr_chan_fixup_on_resume) == 0); + vmbus_request_offers(); + wait_for_completion(&vmbus_connection.ready_for_resume_event); + /* Reset the event for the next suspend. */ reinit_completion(&vmbus_connection.ready_for_suspend_event); diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 8a60e7766037..a3aa9e9ef6f2 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -426,6 +426,9 @@ enum vmbus_channel_message_type { CHANNELMSG_COUNT }; +/* Hyper-V supports about 2048 channels, and the RELIDs start with 1. */ +#define INVALID_RELID U32_MAX + struct vmbus_channel_message_header { enum vmbus_channel_message_type msgtype; u32 padding; From c30da2e981a703c6b1d49911511f7ade8dac20be Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 25 Mar 2019 16:38:31 +0000 Subject: [PATCH 289/721] fuse: convert to use the new mount API Convert the fuse filesystem to the new internal mount API as the old one will be obsoleted and removed. This allows greater flexibility in communication of mount parameters between userspace, the VFS and the filesystem. See Documentation/filesystems/mount_api.txt for more information. Signed-off-by: David Howells Signed-off-by: Al Viro Signed-off-by: Miklos Szeredi --- fs/fuse/inode.c | 292 +++++++++++++++++++++++++++--------------------- 1 file changed, 167 insertions(+), 125 deletions(-) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 4bb885b0f032..c334f95e799a 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -15,7 +15,8 @@ #include #include #include -#include +#include +#include #include #include #include @@ -59,7 +60,13 @@ MODULE_PARM_DESC(max_user_congthresh, /** Congestion starts at 75% of maximum */ #define FUSE_DEFAULT_CONGESTION_THRESHOLD (FUSE_DEFAULT_MAX_BACKGROUND * 3 / 4) -struct fuse_mount_data { +#ifdef CONFIG_BLOCK +static struct file_system_type fuseblk_fs_type; +#endif + +struct fuse_fs_context { + const char *subtype; + bool is_bdev; int fd; unsigned rootmode; kuid_t user_id; @@ -443,6 +450,8 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) } enum { + OPT_SOURCE, + OPT_SUBTYPE, OPT_FD, OPT_ROOTMODE, OPT_USER_ID, @@ -454,111 +463,110 @@ enum { OPT_ERR }; -static const match_table_t tokens = { - {OPT_FD, "fd=%u"}, - {OPT_ROOTMODE, "rootmode=%o"}, - {OPT_USER_ID, "user_id=%u"}, - {OPT_GROUP_ID, "group_id=%u"}, - {OPT_DEFAULT_PERMISSIONS, "default_permissions"}, - {OPT_ALLOW_OTHER, "allow_other"}, - {OPT_MAX_READ, "max_read=%u"}, - {OPT_BLKSIZE, "blksize=%u"}, - {OPT_ERR, NULL} +static const struct fs_parameter_spec fuse_param_specs[] = { + fsparam_string ("source", OPT_SOURCE), + fsparam_u32 ("fd", OPT_FD), + fsparam_u32oct ("rootmode", OPT_ROOTMODE), + fsparam_u32 ("user_id", OPT_USER_ID), + fsparam_u32 ("group_id", OPT_GROUP_ID), + fsparam_flag ("default_permissions", OPT_DEFAULT_PERMISSIONS), + fsparam_flag ("allow_other", OPT_ALLOW_OTHER), + fsparam_u32 ("max_read", OPT_MAX_READ), + fsparam_u32 ("blksize", OPT_BLKSIZE), + __fsparam(fs_param_is_string, "subtype", OPT_SUBTYPE, + fs_param_v_optional), + {} }; -static int fuse_match_uint(substring_t *s, unsigned int *res) +static const struct fs_parameter_description fuse_fs_parameters = { + .name = "fuse", + .specs = fuse_param_specs, +}; + +static int fuse_parse_param(struct fs_context *fc, struct fs_parameter *param) { - int err = -ENOMEM; - char *buf = match_strdup(s); - if (buf) { - err = kstrtouint(buf, 10, res); - kfree(buf); - } - return err; -} + struct fs_parse_result result; + struct fuse_fs_context *ctx = fc->fs_private; + int opt; -static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev, - struct user_namespace *user_ns) -{ - char *p; - memset(d, 0, sizeof(struct fuse_mount_data)); - d->max_read = ~0; - d->blksize = FUSE_DEFAULT_BLKSIZE; + opt = fs_parse(fc, &fuse_fs_parameters, param, &result); + if (opt < 0) + return opt; - while ((p = strsep(&opt, ",")) != NULL) { - int token; - int value; - unsigned uv; - substring_t args[MAX_OPT_ARGS]; - if (!*p) - continue; + switch (opt) { + case OPT_SOURCE: + if (fc->source) + return invalf(fc, "fuse: Multiple sources specified"); + fc->source = param->string; + param->string = NULL; + break; - token = match_token(p, tokens, args); - switch (token) { - case OPT_FD: - if (match_int(&args[0], &value)) - return 0; - d->fd = value; - d->fd_present = 1; - break; - - case OPT_ROOTMODE: - if (match_octal(&args[0], &value)) - return 0; - if (!fuse_valid_type(value)) - return 0; - d->rootmode = value; - d->rootmode_present = 1; - break; - - case OPT_USER_ID: - if (fuse_match_uint(&args[0], &uv)) - return 0; - d->user_id = make_kuid(user_ns, uv); - if (!uid_valid(d->user_id)) - return 0; - d->user_id_present = 1; - break; - - case OPT_GROUP_ID: - if (fuse_match_uint(&args[0], &uv)) - return 0; - d->group_id = make_kgid(user_ns, uv); - if (!gid_valid(d->group_id)) - return 0; - d->group_id_present = 1; - break; - - case OPT_DEFAULT_PERMISSIONS: - d->default_permissions = 1; - break; - - case OPT_ALLOW_OTHER: - d->allow_other = 1; - break; - - case OPT_MAX_READ: - if (match_int(&args[0], &value)) - return 0; - d->max_read = value; - break; - - case OPT_BLKSIZE: - if (!is_bdev || match_int(&args[0], &value)) - return 0; - d->blksize = value; - break; - - default: - return 0; - } - } - - if (!d->fd_present || !d->rootmode_present || - !d->user_id_present || !d->group_id_present) + case OPT_SUBTYPE: + if (ctx->subtype) + return invalf(fc, "fuse: Multiple subtypes specified"); + ctx->subtype = param->string; + param->string = NULL; return 0; - return 1; + case OPT_FD: + ctx->fd = result.uint_32; + ctx->fd_present = 1; + break; + + case OPT_ROOTMODE: + if (!fuse_valid_type(result.uint_32)) + return invalf(fc, "fuse: Invalid rootmode"); + ctx->rootmode = result.uint_32; + ctx->rootmode_present = 1; + break; + + case OPT_USER_ID: + ctx->user_id = make_kuid(fc->user_ns, result.uint_32); + if (!uid_valid(ctx->user_id)) + return invalf(fc, "fuse: Invalid user_id"); + ctx->user_id_present = 1; + break; + + case OPT_GROUP_ID: + ctx->group_id = make_kgid(fc->user_ns, result.uint_32); + if (!gid_valid(ctx->group_id)) + return invalf(fc, "fuse: Invalid group_id"); + ctx->group_id_present = 1; + break; + + case OPT_DEFAULT_PERMISSIONS: + ctx->default_permissions = 1; + break; + + case OPT_ALLOW_OTHER: + ctx->allow_other = 1; + break; + + case OPT_MAX_READ: + ctx->max_read = result.uint_32; + break; + + case OPT_BLKSIZE: + if (!ctx->is_bdev) + return invalf(fc, "fuse: blksize only supported for fuseblk"); + ctx->blksize = result.uint_32; + break; + + default: + return -EINVAL; + } + + return 0; +} + +static void fuse_free_fc(struct fs_context *fc) +{ + struct fuse_fs_context *ctx = fc->fs_private; + + if (ctx) { + kfree(ctx->subtype); + kfree(ctx); + } } static int fuse_show_options(struct seq_file *m, struct dentry *root) @@ -1075,12 +1083,12 @@ void fuse_dev_free(struct fuse_dev *fud) } EXPORT_SYMBOL_GPL(fuse_dev_free); -static int fuse_fill_super(struct super_block *sb, void *data, int silent) +static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) { + struct fuse_fs_context *ctx = fsc->fs_private; struct fuse_dev *fud; struct fuse_conn *fc; struct inode *root; - struct fuse_mount_data d; struct file *file; struct dentry *root_dentry; struct fuse_req *init_req; @@ -1093,19 +1101,19 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION); - if (!parse_fuse_opt(data, &d, is_bdev, sb->s_user_ns)) - goto err; - if (is_bdev) { #ifdef CONFIG_BLOCK err = -EINVAL; - if (!sb_set_blocksize(sb, d.blksize)) + if (!sb_set_blocksize(sb, ctx->blksize)) goto err; #endif } else { sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; } + + sb->s_subtype = ctx->subtype; + ctx->subtype = NULL; sb->s_magic = FUSE_SUPER_MAGIC; sb->s_op = &fuse_super_operations; sb->s_xattr = fuse_xattr_handlers; @@ -1116,7 +1124,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (sb->s_user_ns != &init_user_ns) sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER; - file = fget(d.fd); + file = fget(ctx->fd); err = -EINVAL; if (!file) goto err; @@ -1159,17 +1167,17 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) fc->dont_mask = 1; sb->s_flags |= SB_POSIXACL; - fc->default_permissions = d.default_permissions; - fc->allow_other = d.allow_other; - fc->user_id = d.user_id; - fc->group_id = d.group_id; - fc->max_read = max_t(unsigned, 4096, d.max_read); + fc->default_permissions = ctx->default_permissions; + fc->allow_other = ctx->allow_other; + fc->user_id = ctx->user_id; + fc->group_id = ctx->group_id; + fc->max_read = max_t(unsigned, 4096, ctx->max_read); /* Used by get_root_inode() */ sb->s_fs_info = fc; err = -ENOMEM; - root = fuse_get_root_inode(sb, d.rootmode); + root = fuse_get_root_inode(sb, ctx->rootmode); sb->s_d_op = &fuse_root_dentry_operations; root_dentry = d_make_root(root); if (!root_dentry) @@ -1229,11 +1237,50 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) return err; } -static struct dentry *fuse_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *raw_data) +static int fuse_get_tree(struct fs_context *fc) { - return mount_nodev(fs_type, flags, raw_data, fuse_fill_super); + struct fuse_fs_context *ctx = fc->fs_private; + + if (!ctx->fd_present || !ctx->rootmode_present || + !ctx->user_id_present || !ctx->group_id_present) + return -EINVAL; + +#ifdef CONFIG_BLOCK + if (ctx->is_bdev) + return get_tree_bdev(fc, fuse_fill_super); +#endif + + return get_tree_nodev(fc, fuse_fill_super); +} + +static const struct fs_context_operations fuse_context_ops = { + .free = fuse_free_fc, + .parse_param = fuse_parse_param, + .get_tree = fuse_get_tree, +}; + +/* + * Set up the filesystem mount context. + */ +static int fuse_init_fs_context(struct fs_context *fc) +{ + struct fuse_fs_context *ctx; + + ctx = kzalloc(sizeof(struct fuse_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->max_read = ~0; + ctx->blksize = FUSE_DEFAULT_BLKSIZE; + +#ifdef CONFIG_BLOCK + if (fc->fs_type == &fuseblk_fs_type) + ctx->is_bdev = true; +#endif + + fc->fs_private = ctx; + fc->ops = &fuse_context_ops; + return 0; } static void fuse_sb_destroy(struct super_block *sb) @@ -1262,19 +1309,13 @@ static struct file_system_type fuse_fs_type = { .owner = THIS_MODULE, .name = "fuse", .fs_flags = FS_HAS_SUBTYPE | FS_USERNS_MOUNT, - .mount = fuse_mount, + .init_fs_context = fuse_init_fs_context, + .parameters = &fuse_fs_parameters, .kill_sb = fuse_kill_sb_anon, }; MODULE_ALIAS_FS("fuse"); #ifdef CONFIG_BLOCK -static struct dentry *fuse_mount_blk(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *raw_data) -{ - return mount_bdev(fs_type, flags, dev_name, raw_data, fuse_fill_super); -} - static void fuse_kill_sb_blk(struct super_block *sb) { fuse_sb_destroy(sb); @@ -1284,7 +1325,8 @@ static void fuse_kill_sb_blk(struct super_block *sb) static struct file_system_type fuseblk_fs_type = { .owner = THIS_MODULE, .name = "fuseblk", - .mount = fuse_mount_blk, + .init_fs_context = fuse_init_fs_context, + .parameters = &fuse_fs_parameters, .kill_sb = fuse_kill_sb_blk, .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE, }; From c7eb6869632a5d33b41d0a00d683b8395392b7ee Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 25 Mar 2019 16:38:31 +0000 Subject: [PATCH 290/721] vfs: subtype handling moved to fuse The unused vfs code can be removed. Don't pass empty subtype (same as if ->parse callback isn't called). The bits that are left involve determining whether it's permitted to split the filesystem type string passed in to mount(2). Consequently, this means that we cannot get rid of the FS_HAS_SUBTYPE flag unless we define that a type string with a dot in it always indicates a subtype specification. Signed-off-by: David Howells Signed-off-by: Al Viro Signed-off-by: Miklos Szeredi --- fs/fs_context.c | 14 -------------- fs/fuse/inode.c | 3 +-- fs/namespace.c | 2 -- fs/proc_namespace.c | 2 +- fs/super.c | 5 ----- include/linux/fs_context.h | 1 - 6 files changed, 2 insertions(+), 25 deletions(-) diff --git a/fs/fs_context.c b/fs/fs_context.c index 87c2c9687d90..138b5b4d621d 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -504,7 +504,6 @@ void put_fs_context(struct fs_context *fc) put_net(fc->net_ns); put_user_ns(fc->user_ns); put_cred(fc->cred); - kfree(fc->subtype); put_fc_log(fc); put_filesystem(fc->fs_type); kfree(fc->source); @@ -571,17 +570,6 @@ static int legacy_parse_param(struct fs_context *fc, struct fs_parameter *param) return 0; } - if ((fc->fs_type->fs_flags & FS_HAS_SUBTYPE) && - strcmp(param->key, "subtype") == 0) { - if (param->type != fs_value_is_string) - return invalf(fc, "VFS: Legacy: Non-string subtype"); - if (fc->subtype) - return invalf(fc, "VFS: Legacy: Multiple subtype"); - fc->subtype = param->string; - param->string = NULL; - return 0; - } - if (ctx->param_type == LEGACY_FS_MONOLITHIC_PARAMS) return invalf(fc, "VFS: Legacy: Can't mix monolithic and individual options"); @@ -738,8 +726,6 @@ void vfs_clean_context(struct fs_context *fc) fc->s_fs_info = NULL; fc->sb_flags = 0; security_free_mnt_opts(&fc->security); - kfree(fc->subtype); - fc->subtype = NULL; kfree(fc->source); fc->source = NULL; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index c334f95e799a..2183967261a4 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -473,8 +473,7 @@ static const struct fs_parameter_spec fuse_param_specs[] = { fsparam_flag ("allow_other", OPT_ALLOW_OTHER), fsparam_u32 ("max_read", OPT_MAX_READ), fsparam_u32 ("blksize", OPT_BLKSIZE), - __fsparam(fs_param_is_string, "subtype", OPT_SUBTYPE, - fs_param_v_optional), + fsparam_string ("subtype", OPT_SUBTYPE), {} }; diff --git a/fs/namespace.c b/fs/namespace.c index d28d30b13043..105f995543f6 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2768,8 +2768,6 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags, put_filesystem(type); return -EINVAL; } - } else { - subtype = ""; } } diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index e16fb8f2049e..273ee82d8aa9 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -88,7 +88,7 @@ static inline void mangle(struct seq_file *m, const char *s) static void show_type(struct seq_file *m, struct super_block *sb) { mangle(m, sb->s_type->name); - if (sb->s_subtype && sb->s_subtype[0]) { + if (sb->s_subtype) { seq_putc(m, '.'); mangle(m, sb->s_subtype); } diff --git a/fs/super.c b/fs/super.c index da223b4cfbca..a6ceed7bcd89 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1530,11 +1530,6 @@ int vfs_get_tree(struct fs_context *fc) sb = fc->root->d_sb; WARN_ON(!sb->s_bdi); - if (fc->subtype && !sb->s_subtype) { - sb->s_subtype = fc->subtype; - fc->subtype = NULL; - } - /* * Write barrier is for super_cache_count(). We place it before setting * SB_BORN as the data dependency between the two functions is the diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h index 84a5eaa09f19..f6c86d58ea91 100644 --- a/include/linux/fs_context.h +++ b/include/linux/fs_context.h @@ -95,7 +95,6 @@ struct fs_context { const struct cred *cred; /* The mounter's credentials */ struct fc_log *log; /* Logging buffer */ const char *source; /* The source name (eg. dev path) */ - const char *subtype; /* The subtype to set on the superblock */ void *security; /* Linux S&M options */ void *s_fs_info; /* Proposed s_fs_info */ unsigned int sb_flags; /* Proposed superblock flags (SB_*) */ From 984998e3404e9073479281dbba8af36b104e8c00 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Thu, 22 Aug 2019 11:55:52 +0300 Subject: [PATCH 291/721] PCI: Make pcie_downstream_port() available outside of access.c pcie_downstream_port() is useful in other places where code needs to determine whether the PCIe port is downstream so make it available outside of access.c. Link: https://lore.kernel.org/r/20190822085553.62697-1-mika.westerberg@linux.intel.com Signed-off-by: Mika Westerberg Signed-off-by: Bjorn Helgaas Reviewed-by: Andy Shevchenko --- drivers/pci/access.c | 9 --------- drivers/pci/pci.h | 9 +++++++++ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/pci/access.c b/drivers/pci/access.c index 544922f097c0..2fccb5762c76 100644 --- a/drivers/pci/access.c +++ b/drivers/pci/access.c @@ -336,15 +336,6 @@ static inline int pcie_cap_version(const struct pci_dev *dev) return pcie_caps_reg(dev) & PCI_EXP_FLAGS_VERS; } -static bool pcie_downstream_port(const struct pci_dev *dev) -{ - int type = pci_pcie_type(dev); - - return type == PCI_EXP_TYPE_ROOT_PORT || - type == PCI_EXP_TYPE_DOWNSTREAM || - type == PCI_EXP_TYPE_PCIE_BRIDGE; -} - bool pcie_cap_has_lnkctl(const struct pci_dev *dev) { int type = pci_pcie_type(dev); diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 97180274c60b..ce9f4d3ca075 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -118,6 +118,15 @@ static inline bool pci_power_manageable(struct pci_dev *pci_dev) return !pci_has_subordinate(pci_dev) || pci_dev->bridge_d3; } +static inline bool pcie_downstream_port(const struct pci_dev *dev) +{ + int type = pci_pcie_type(dev); + + return type == PCI_EXP_TYPE_ROOT_PORT || + type == PCI_EXP_TYPE_DOWNSTREAM || + type == PCI_EXP_TYPE_PCIE_BRIDGE; +} + int pci_vpd_init(struct pci_dev *dev); void pci_vpd_release(struct pci_dev *dev); void pcie_vpd_create_sysfs_dev_files(struct pci_dev *dev); From ca78410403dd64ac0ee0e3cc8646b38335271bfd Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Thu, 22 Aug 2019 11:55:53 +0300 Subject: [PATCH 292/721] PCI: Get rid of dev->has_secondary_link flag In some systems, the Device/Port Type in the PCI Express Capabilities register incorrectly identifies upstream ports as downstream ports. d0751b98dfa3 ("PCI: Add dev->has_secondary_link to track downstream PCIe links") addressed this by adding pci_dev.has_secondary_link, which is set for downstream ports. But this is confusing because pci_pcie_type() sometimes gives the wrong answer, and it's not obvious that we should use pci_dev.has_secondary_link instead. Reduce the confusion by correcting the type of the port itself so that pci_pcie_type() returns the actual type regardless of what the Device/Port Type register claims it is. Update the users to call pci_pcie_type() and pcie_downstream_port() accordingly, and remove pci_dev.has_secondary_link completely. Link: https://lore.kernel.org/linux-pci/20190703133953.GK128603@google.com/ Suggested-by: Bjorn Helgaas Link: https://lore.kernel.org/r/20190822085553.62697-2-mika.westerberg@linux.intel.com Signed-off-by: Mika Westerberg Signed-off-by: Bjorn Helgaas Reviewed-by: Andy Shevchenko --- drivers/pci/pci.c | 2 +- drivers/pci/pcie/aspm.c | 8 +++---- drivers/pci/pcie/err.c | 2 +- drivers/pci/probe.c | 48 ++++++++++++++++++++++++----------------- drivers/pci/vc.c | 4 +++- include/linux/pci.h | 1 - 6 files changed, 37 insertions(+), 28 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 29ed5ec1ac27..97e7f6e0821e 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -3576,7 +3576,7 @@ int pci_enable_atomic_ops_to_root(struct pci_dev *dev, u32 cap_mask) } /* Ensure upstream ports don't block AtomicOps on egress */ - if (!bridge->has_secondary_link) { + if (pci_pcie_type(bridge) == PCI_EXP_TYPE_UPSTREAM) { pcie_capability_read_dword(bridge, PCI_EXP_DEVCTL2, &ctl2); if (ctl2 & PCI_EXP_DEVCTL2_ATOMIC_EGRESS_BLOCK) diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c index e44af7f4d37f..dcda96818eb6 100644 --- a/drivers/pci/pcie/aspm.c +++ b/drivers/pci/pcie/aspm.c @@ -913,10 +913,10 @@ void pcie_aspm_init_link_state(struct pci_dev *pdev) /* * We allocate pcie_link_state for the component on the upstream - * end of a Link, so there's nothing to do unless this device has a - * Link on its secondary side. + * end of a Link, so there's nothing to do unless this device is + * downstream port. */ - if (!pdev->has_secondary_link) + if (!pcie_downstream_port(pdev)) return; /* VIA has a strange chipset, root port is under a bridge */ @@ -1070,7 +1070,7 @@ static int __pci_disable_link_state(struct pci_dev *pdev, int state, bool sem) if (!pci_is_pcie(pdev)) return 0; - if (pdev->has_secondary_link) + if (pcie_downstream_port(pdev)) parent = pdev; if (!parent || !parent->link_state) return -EINVAL; diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c index 773197a12568..b0e6048a9208 100644 --- a/drivers/pci/pcie/err.c +++ b/drivers/pci/pcie/err.c @@ -166,7 +166,7 @@ static pci_ers_result_t reset_link(struct pci_dev *dev, u32 service) driver = pcie_port_find_service(dev, service); if (driver && driver->reset_link) { status = driver->reset_link(dev); - } else if (dev->has_secondary_link) { + } else if (pcie_downstream_port(dev)) { status = default_reset_link(dev); } else { pci_printk(KERN_DEBUG, dev, "no link-reset support at upstream device %s\n", diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index cdf34a3c531a..3bfa57d58402 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1431,26 +1431,38 @@ void set_pcie_port_type(struct pci_dev *pdev) pci_read_config_word(pdev, pos + PCI_EXP_DEVCAP, ®16); pdev->pcie_mpss = reg16 & PCI_EXP_DEVCAP_PAYLOAD; + parent = pci_upstream_bridge(pdev); + if (!parent) + return; + /* - * A Root Port or a PCI-to-PCIe bridge is always the upstream end - * of a Link. No PCIe component has two Links. Two Links are - * connected by a Switch that has a Port on each Link and internal - * logic to connect the two Ports. + * Some systems do not identify their upstream/downstream ports + * correctly so detect impossible configurations here and correct + * the port type accordingly. */ type = pci_pcie_type(pdev); - if (type == PCI_EXP_TYPE_ROOT_PORT || - type == PCI_EXP_TYPE_PCIE_BRIDGE) - pdev->has_secondary_link = 1; - else if (type == PCI_EXP_TYPE_UPSTREAM || - type == PCI_EXP_TYPE_DOWNSTREAM) { - parent = pci_upstream_bridge(pdev); - + if (type == PCI_EXP_TYPE_DOWNSTREAM) { /* - * Usually there's an upstream device (Root Port or Switch - * Downstream Port), but we can't assume one exists. + * If pdev claims to be downstream port but the parent + * device is also downstream port assume pdev is actually + * upstream port. */ - if (parent && !parent->has_secondary_link) - pdev->has_secondary_link = 1; + if (pcie_downstream_port(parent)) { + pci_info(pdev, "claims to be downstream port but is acting as upstream port, correcting type\n"); + pdev->pcie_flags_reg &= ~PCI_EXP_FLAGS_TYPE; + pdev->pcie_flags_reg |= PCI_EXP_TYPE_UPSTREAM; + } + } else if (type == PCI_EXP_TYPE_UPSTREAM) { + /* + * If pdev claims to be upstream port but the parent + * device is also upstream port assume pdev is actually + * downstream port. + */ + if (pci_pcie_type(parent) == PCI_EXP_TYPE_UPSTREAM) { + pci_info(pdev, "claims to be upstream port but is acting as downstream port, correcting type\n"); + pdev->pcie_flags_reg &= ~PCI_EXP_FLAGS_TYPE; + pdev->pcie_flags_reg |= PCI_EXP_TYPE_DOWNSTREAM; + } } } @@ -2488,12 +2500,8 @@ static int only_one_child(struct pci_bus *bus) * A PCIe Downstream Port normally leads to a Link with only Device * 0 on it (PCIe spec r3.1, sec 7.3.1). As an optimization, scan * only for Device 0 in that situation. - * - * Checking has_secondary_link is a hack to identify Downstream - * Ports because sometimes Switches are configured such that the - * PCIe Port Type labels are backwards. */ - if (bridge && pci_is_pcie(bridge) && bridge->has_secondary_link) + if (bridge && pci_is_pcie(bridge) && pcie_downstream_port(bridge)) return 1; return 0; diff --git a/drivers/pci/vc.c b/drivers/pci/vc.c index 5acd9c02683a..9ae9fb9339e8 100644 --- a/drivers/pci/vc.c +++ b/drivers/pci/vc.c @@ -13,6 +13,8 @@ #include #include +#include "pci.h" + /** * pci_vc_save_restore_dwords - Save or restore a series of dwords * @dev: device @@ -105,7 +107,7 @@ static void pci_vc_enable(struct pci_dev *dev, int pos, int res) struct pci_dev *link = NULL; /* Enable VCs from the downstream device */ - if (!dev->has_secondary_link) + if (!pci_is_pcie(dev) || !pcie_downstream_port(dev)) return; ctrl_pos = pos + PCI_VC_RES_CTRL + (res * PCI_CAP_VC_PER_VC_SIZEOF); diff --git a/include/linux/pci.h b/include/linux/pci.h index 9e700d9f9f28..0153ede227e5 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -418,7 +418,6 @@ struct pci_dev { unsigned int broken_intx_masking:1; /* INTx masking can't be used */ unsigned int io_window_1k:1; /* Intel bridge 1K I/O windows */ unsigned int irq_managed:1; - unsigned int has_secondary_link:1; unsigned int non_compliant_bars:1; /* Broken BARs; ignore them */ unsigned int is_probed:1; /* Device probing in progress */ unsigned int link_active_reporting:1;/* Device capable of reporting link active */ From 56e15a238d92788a2d09e0c5c26a5de1b3156931 Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Tue, 13 Aug 2019 17:06:27 +0530 Subject: [PATCH 293/721] PCI: tegra: Add Tegra194 PCIe support Add support for Synopsys DesignWare core IP based PCIe host controller present in the Tegra194 SoC. Signed-off-by: Vidya Sagar Signed-off-by: Lorenzo Pieralisi Acked-by: Thierry Reding --- drivers/pci/controller/dwc/Kconfig | 10 + drivers/pci/controller/dwc/Makefile | 1 + drivers/pci/controller/dwc/pcie-designware.c | 2 +- drivers/pci/controller/dwc/pcie-tegra194.c | 1641 ++++++++++++++++++ 4 files changed, 1653 insertions(+), 1 deletion(-) create mode 100644 drivers/pci/controller/dwc/pcie-tegra194.c diff --git a/drivers/pci/controller/dwc/Kconfig b/drivers/pci/controller/dwc/Kconfig index 6ea778ae4877..49475f5c42c3 100644 --- a/drivers/pci/controller/dwc/Kconfig +++ b/drivers/pci/controller/dwc/Kconfig @@ -220,6 +220,16 @@ config PCI_MESON and therefore the driver re-uses the DesignWare core functions to implement the driver. +config PCIE_TEGRA194 + tristate "NVIDIA Tegra194 (and later) PCIe controller" + depends on ARCH_TEGRA_194_SOC || COMPILE_TEST + depends on PCI_MSI_IRQ_DOMAIN + select PCIE_DW_HOST + select PHY_TEGRA194_P2U + help + Say Y here if you want support for DesignWare core based PCIe host + controller found in NVIDIA Tegra194 SoC. + config PCIE_UNIPHIER bool "Socionext UniPhier PCIe controllers" depends on ARCH_UNIPHIER || COMPILE_TEST diff --git a/drivers/pci/controller/dwc/Makefile b/drivers/pci/controller/dwc/Makefile index b085dfd4fab7..b30336181d46 100644 --- a/drivers/pci/controller/dwc/Makefile +++ b/drivers/pci/controller/dwc/Makefile @@ -15,6 +15,7 @@ obj-$(CONFIG_PCIE_ARTPEC6) += pcie-artpec6.o obj-$(CONFIG_PCIE_KIRIN) += pcie-kirin.o obj-$(CONFIG_PCIE_HISI_STB) += pcie-histb.o obj-$(CONFIG_PCI_MESON) += pci-meson.o +obj-$(CONFIG_PCIE_TEGRA194) += pcie-tegra194.o obj-$(CONFIG_PCIE_UNIPHIER) += pcie-uniphier.o # The following drivers are for devices that use the generic ACPI diff --git a/drivers/pci/controller/dwc/pcie-designware.c b/drivers/pci/controller/dwc/pcie-designware.c index 59eaeeb21dbe..4d6690b6ca36 100644 --- a/drivers/pci/controller/dwc/pcie-designware.c +++ b/drivers/pci/controller/dwc/pcie-designware.c @@ -456,7 +456,7 @@ int dw_pcie_wait_for_link(struct dw_pcie *pci) usleep_range(LINK_WAIT_USLEEP_MIN, LINK_WAIT_USLEEP_MAX); } - dev_err(pci->dev, "Phy link never came up\n"); + dev_info(pci->dev, "Phy link never came up\n"); return -ETIMEDOUT; } diff --git a/drivers/pci/controller/dwc/pcie-tegra194.c b/drivers/pci/controller/dwc/pcie-tegra194.c new file mode 100644 index 000000000000..6056414c07d4 --- /dev/null +++ b/drivers/pci/controller/dwc/pcie-tegra194.c @@ -0,0 +1,1641 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * PCIe host controller driver for Tegra194 SoC + * + * Copyright (C) 2019 NVIDIA Corporation. + * + * Author: Vidya Sagar + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "pcie-designware.h" +#include +#include +#include "../../pci.h" + +#define APPL_PINMUX 0x0 +#define APPL_PINMUX_PEX_RST BIT(0) +#define APPL_PINMUX_CLKREQ_OVERRIDE_EN BIT(2) +#define APPL_PINMUX_CLKREQ_OVERRIDE BIT(3) +#define APPL_PINMUX_CLK_OUTPUT_IN_OVERRIDE_EN BIT(4) +#define APPL_PINMUX_CLK_OUTPUT_IN_OVERRIDE BIT(5) +#define APPL_PINMUX_CLKREQ_OUT_OVRD_EN BIT(9) +#define APPL_PINMUX_CLKREQ_OUT_OVRD BIT(10) + +#define APPL_CTRL 0x4 +#define APPL_CTRL_SYS_PRE_DET_STATE BIT(6) +#define APPL_CTRL_LTSSM_EN BIT(7) +#define APPL_CTRL_HW_HOT_RST_EN BIT(20) +#define APPL_CTRL_HW_HOT_RST_MODE_MASK GENMASK(1, 0) +#define APPL_CTRL_HW_HOT_RST_MODE_SHIFT 22 +#define APPL_CTRL_HW_HOT_RST_MODE_IMDT_RST 0x1 + +#define APPL_INTR_EN_L0_0 0x8 +#define APPL_INTR_EN_L0_0_LINK_STATE_INT_EN BIT(0) +#define APPL_INTR_EN_L0_0_MSI_RCV_INT_EN BIT(4) +#define APPL_INTR_EN_L0_0_INT_INT_EN BIT(8) +#define APPL_INTR_EN_L0_0_CDM_REG_CHK_INT_EN BIT(19) +#define APPL_INTR_EN_L0_0_SYS_INTR_EN BIT(30) +#define APPL_INTR_EN_L0_0_SYS_MSI_INTR_EN BIT(31) + +#define APPL_INTR_STATUS_L0 0xC +#define APPL_INTR_STATUS_L0_LINK_STATE_INT BIT(0) +#define APPL_INTR_STATUS_L0_INT_INT BIT(8) +#define APPL_INTR_STATUS_L0_CDM_REG_CHK_INT BIT(18) + +#define APPL_INTR_EN_L1_0_0 0x1C +#define APPL_INTR_EN_L1_0_0_LINK_REQ_RST_NOT_INT_EN BIT(1) + +#define APPL_INTR_STATUS_L1_0_0 0x20 +#define APPL_INTR_STATUS_L1_0_0_LINK_REQ_RST_NOT_CHGED BIT(1) + +#define APPL_INTR_STATUS_L1_1 0x2C +#define APPL_INTR_STATUS_L1_2 0x30 +#define APPL_INTR_STATUS_L1_3 0x34 +#define APPL_INTR_STATUS_L1_6 0x3C +#define APPL_INTR_STATUS_L1_7 0x40 + +#define APPL_INTR_EN_L1_8_0 0x44 +#define APPL_INTR_EN_L1_8_BW_MGT_INT_EN BIT(2) +#define APPL_INTR_EN_L1_8_AUTO_BW_INT_EN BIT(3) +#define APPL_INTR_EN_L1_8_INTX_EN BIT(11) +#define APPL_INTR_EN_L1_8_AER_INT_EN BIT(15) + +#define APPL_INTR_STATUS_L1_8_0 0x4C +#define APPL_INTR_STATUS_L1_8_0_EDMA_INT_MASK GENMASK(11, 6) +#define APPL_INTR_STATUS_L1_8_0_BW_MGT_INT_STS BIT(2) +#define APPL_INTR_STATUS_L1_8_0_AUTO_BW_INT_STS BIT(3) + +#define APPL_INTR_STATUS_L1_9 0x54 +#define APPL_INTR_STATUS_L1_10 0x58 +#define APPL_INTR_STATUS_L1_11 0x64 +#define APPL_INTR_STATUS_L1_13 0x74 +#define APPL_INTR_STATUS_L1_14 0x78 +#define APPL_INTR_STATUS_L1_15 0x7C +#define APPL_INTR_STATUS_L1_17 0x88 + +#define APPL_INTR_EN_L1_18 0x90 +#define APPL_INTR_EN_L1_18_CDM_REG_CHK_CMPLT BIT(2) +#define APPL_INTR_EN_L1_18_CDM_REG_CHK_CMP_ERR BIT(1) +#define APPL_INTR_EN_L1_18_CDM_REG_CHK_LOGIC_ERR BIT(0) + +#define APPL_INTR_STATUS_L1_18 0x94 +#define APPL_INTR_STATUS_L1_18_CDM_REG_CHK_CMPLT BIT(2) +#define APPL_INTR_STATUS_L1_18_CDM_REG_CHK_CMP_ERR BIT(1) +#define APPL_INTR_STATUS_L1_18_CDM_REG_CHK_LOGIC_ERR BIT(0) + +#define APPL_MSI_CTRL_2 0xB0 + +#define APPL_LTR_MSG_1 0xC4 +#define LTR_MSG_REQ BIT(15) +#define LTR_MST_NO_SNOOP_SHIFT 16 + +#define APPL_LTR_MSG_2 0xC8 +#define APPL_LTR_MSG_2_LTR_MSG_REQ_STATE BIT(3) + +#define APPL_LINK_STATUS 0xCC +#define APPL_LINK_STATUS_RDLH_LINK_UP BIT(0) + +#define APPL_DEBUG 0xD0 +#define APPL_DEBUG_PM_LINKST_IN_L2_LAT BIT(21) +#define APPL_DEBUG_PM_LINKST_IN_L0 0x11 +#define APPL_DEBUG_LTSSM_STATE_MASK GENMASK(8, 3) +#define APPL_DEBUG_LTSSM_STATE_SHIFT 3 +#define LTSSM_STATE_PRE_DETECT 5 + +#define APPL_RADM_STATUS 0xE4 +#define APPL_PM_XMT_TURNOFF_STATE BIT(0) + +#define APPL_DM_TYPE 0x100 +#define APPL_DM_TYPE_MASK GENMASK(3, 0) +#define APPL_DM_TYPE_RP 0x4 +#define APPL_DM_TYPE_EP 0x0 + +#define APPL_CFG_BASE_ADDR 0x104 +#define APPL_CFG_BASE_ADDR_MASK GENMASK(31, 12) + +#define APPL_CFG_IATU_DMA_BASE_ADDR 0x108 +#define APPL_CFG_IATU_DMA_BASE_ADDR_MASK GENMASK(31, 18) + +#define APPL_CFG_MISC 0x110 +#define APPL_CFG_MISC_SLV_EP_MODE BIT(14) +#define APPL_CFG_MISC_ARCACHE_MASK GENMASK(13, 10) +#define APPL_CFG_MISC_ARCACHE_SHIFT 10 +#define APPL_CFG_MISC_ARCACHE_VAL 3 + +#define APPL_CFG_SLCG_OVERRIDE 0x114 +#define APPL_CFG_SLCG_OVERRIDE_SLCG_EN_MASTER BIT(0) + +#define APPL_CAR_RESET_OVRD 0x12C +#define APPL_CAR_RESET_OVRD_CYA_OVERRIDE_CORE_RST_N BIT(0) + +#define IO_BASE_IO_DECODE BIT(0) +#define IO_BASE_IO_DECODE_BIT8 BIT(8) + +#define CFG_PREF_MEM_LIMIT_BASE_MEM_DECODE BIT(0) +#define CFG_PREF_MEM_LIMIT_BASE_MEM_LIMIT_DECODE BIT(16) + +#define CFG_TIMER_CTRL_MAX_FUNC_NUM_OFF 0x718 +#define CFG_TIMER_CTRL_ACK_NAK_SHIFT (19) + +#define EVENT_COUNTER_ALL_CLEAR 0x3 +#define EVENT_COUNTER_ENABLE_ALL 0x7 +#define EVENT_COUNTER_ENABLE_SHIFT 2 +#define EVENT_COUNTER_EVENT_SEL_MASK GENMASK(7, 0) +#define EVENT_COUNTER_EVENT_SEL_SHIFT 16 +#define EVENT_COUNTER_EVENT_Tx_L0S 0x2 +#define EVENT_COUNTER_EVENT_Rx_L0S 0x3 +#define EVENT_COUNTER_EVENT_L1 0x5 +#define EVENT_COUNTER_EVENT_L1_1 0x7 +#define EVENT_COUNTER_EVENT_L1_2 0x8 +#define EVENT_COUNTER_GROUP_SEL_SHIFT 24 +#define EVENT_COUNTER_GROUP_5 0x5 + +#define PORT_LOGIC_ACK_F_ASPM_CTRL 0x70C +#define ENTER_ASPM BIT(30) +#define L0S_ENTRANCE_LAT_SHIFT 24 +#define L0S_ENTRANCE_LAT_MASK GENMASK(26, 24) +#define L1_ENTRANCE_LAT_SHIFT 27 +#define L1_ENTRANCE_LAT_MASK GENMASK(29, 27) +#define N_FTS_SHIFT 8 +#define N_FTS_MASK GENMASK(7, 0) +#define N_FTS_VAL 52 + +#define PORT_LOGIC_GEN2_CTRL 0x80C +#define PORT_LOGIC_GEN2_CTRL_DIRECT_SPEED_CHANGE BIT(17) +#define FTS_MASK GENMASK(7, 0) +#define FTS_VAL 52 + +#define PORT_LOGIC_MSI_CTRL_INT_0_EN 0x828 + +#define GEN3_EQ_CONTROL_OFF 0x8a8 +#define GEN3_EQ_CONTROL_OFF_PSET_REQ_VEC_SHIFT 8 +#define GEN3_EQ_CONTROL_OFF_PSET_REQ_VEC_MASK GENMASK(23, 8) +#define GEN3_EQ_CONTROL_OFF_FB_MODE_MASK GENMASK(3, 0) + +#define GEN3_RELATED_OFF 0x890 +#define GEN3_RELATED_OFF_GEN3_ZRXDC_NONCOMPL BIT(0) +#define GEN3_RELATED_OFF_GEN3_EQ_DISABLE BIT(16) +#define GEN3_RELATED_OFF_RATE_SHADOW_SEL_SHIFT 24 +#define GEN3_RELATED_OFF_RATE_SHADOW_SEL_MASK GENMASK(25, 24) + +#define PORT_LOGIC_AMBA_ERROR_RESPONSE_DEFAULT 0x8D0 +#define AMBA_ERROR_RESPONSE_CRS_SHIFT 3 +#define AMBA_ERROR_RESPONSE_CRS_MASK GENMASK(1, 0) +#define AMBA_ERROR_RESPONSE_CRS_OKAY 0 +#define AMBA_ERROR_RESPONSE_CRS_OKAY_FFFFFFFF 1 +#define AMBA_ERROR_RESPONSE_CRS_OKAY_FFFF0001 2 + +#define PORT_LOGIC_MSIX_DOORBELL 0x948 + +#define CAP_SPCIE_CAP_OFF 0x154 +#define CAP_SPCIE_CAP_OFF_DSP_TX_PRESET0_MASK GENMASK(3, 0) +#define CAP_SPCIE_CAP_OFF_USP_TX_PRESET0_MASK GENMASK(11, 8) +#define CAP_SPCIE_CAP_OFF_USP_TX_PRESET0_SHIFT 8 + +#define PME_ACK_TIMEOUT 10000 + +#define LTSSM_TIMEOUT 50000 /* 50ms */ + +#define GEN3_GEN4_EQ_PRESET_INIT 5 + +#define GEN1_CORE_CLK_FREQ 62500000 +#define GEN2_CORE_CLK_FREQ 125000000 +#define GEN3_CORE_CLK_FREQ 250000000 +#define GEN4_CORE_CLK_FREQ 500000000 + +static const unsigned int pcie_gen_freq[] = { + GEN1_CORE_CLK_FREQ, + GEN2_CORE_CLK_FREQ, + GEN3_CORE_CLK_FREQ, + GEN4_CORE_CLK_FREQ +}; + +static const u32 event_cntr_ctrl_offset[] = { + 0x1d8, + 0x1a8, + 0x1a8, + 0x1a8, + 0x1c4, + 0x1d8 +}; + +static const u32 event_cntr_data_offset[] = { + 0x1dc, + 0x1ac, + 0x1ac, + 0x1ac, + 0x1c8, + 0x1dc +}; + +struct tegra_pcie_dw { + struct device *dev; + struct resource *appl_res; + struct resource *dbi_res; + struct resource *atu_dma_res; + void __iomem *appl_base; + struct clk *core_clk; + struct reset_control *core_apb_rst; + struct reset_control *core_rst; + struct dw_pcie pci; + struct tegra_bpmp *bpmp; + + bool supports_clkreq; + bool enable_cdm_check; + bool link_state; + bool update_fc_fixup; + u8 init_link_width; + u32 msi_ctrl_int; + u32 num_lanes; + u32 max_speed; + u32 cid; + u32 cfg_link_cap_l1sub; + u32 pcie_cap_base; + u32 aspm_cmrt; + u32 aspm_pwr_on_t; + u32 aspm_l0s_enter_lat; + + struct regulator *pex_ctl_supply; + + unsigned int phy_count; + struct phy **phys; + + struct dentry *debugfs; +}; + +static inline struct tegra_pcie_dw *to_tegra_pcie(struct dw_pcie *pci) +{ + return container_of(pci, struct tegra_pcie_dw, pci); +} + +static inline void appl_writel(struct tegra_pcie_dw *pcie, const u32 value, + const u32 reg) +{ + writel_relaxed(value, pcie->appl_base + reg); +} + +static inline u32 appl_readl(struct tegra_pcie_dw *pcie, const u32 reg) +{ + return readl_relaxed(pcie->appl_base + reg); +} + +struct tegra_pcie_soc { + enum dw_pcie_device_mode mode; +}; + +static void apply_bad_link_workaround(struct pcie_port *pp) +{ + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + struct tegra_pcie_dw *pcie = to_tegra_pcie(pci); + u32 current_link_width; + u16 val; + + /* + * NOTE:- Since this scenario is uncommon and link as such is not + * stable anyway, not waiting to confirm if link is really + * transitioning to Gen-2 speed + */ + val = dw_pcie_readw_dbi(pci, pcie->pcie_cap_base + PCI_EXP_LNKSTA); + if (val & PCI_EXP_LNKSTA_LBMS) { + current_link_width = (val & PCI_EXP_LNKSTA_NLW) >> + PCI_EXP_LNKSTA_NLW_SHIFT; + if (pcie->init_link_width > current_link_width) { + dev_warn(pci->dev, "PCIe link is bad, width reduced\n"); + val = dw_pcie_readw_dbi(pci, pcie->pcie_cap_base + + PCI_EXP_LNKCTL2); + val &= ~PCI_EXP_LNKCTL2_TLS; + val |= PCI_EXP_LNKCTL2_TLS_2_5GT; + dw_pcie_writew_dbi(pci, pcie->pcie_cap_base + + PCI_EXP_LNKCTL2, val); + + val = dw_pcie_readw_dbi(pci, pcie->pcie_cap_base + + PCI_EXP_LNKCTL); + val |= PCI_EXP_LNKCTL_RL; + dw_pcie_writew_dbi(pci, pcie->pcie_cap_base + + PCI_EXP_LNKCTL, val); + } + } +} + +static irqreturn_t tegra_pcie_rp_irq_handler(struct tegra_pcie_dw *pcie) +{ + struct dw_pcie *pci = &pcie->pci; + struct pcie_port *pp = &pci->pp; + u32 val, tmp; + u16 val_w; + + val = appl_readl(pcie, APPL_INTR_STATUS_L0); + if (val & APPL_INTR_STATUS_L0_LINK_STATE_INT) { + val = appl_readl(pcie, APPL_INTR_STATUS_L1_0_0); + if (val & APPL_INTR_STATUS_L1_0_0_LINK_REQ_RST_NOT_CHGED) { + appl_writel(pcie, val, APPL_INTR_STATUS_L1_0_0); + + /* SBR & Surprise Link Down WAR */ + val = appl_readl(pcie, APPL_CAR_RESET_OVRD); + val &= ~APPL_CAR_RESET_OVRD_CYA_OVERRIDE_CORE_RST_N; + appl_writel(pcie, val, APPL_CAR_RESET_OVRD); + udelay(1); + val = appl_readl(pcie, APPL_CAR_RESET_OVRD); + val |= APPL_CAR_RESET_OVRD_CYA_OVERRIDE_CORE_RST_N; + appl_writel(pcie, val, APPL_CAR_RESET_OVRD); + + val = dw_pcie_readl_dbi(pci, PORT_LOGIC_GEN2_CTRL); + val |= PORT_LOGIC_GEN2_CTRL_DIRECT_SPEED_CHANGE; + dw_pcie_writel_dbi(pci, PORT_LOGIC_GEN2_CTRL, val); + } + } + + if (val & APPL_INTR_STATUS_L0_INT_INT) { + val = appl_readl(pcie, APPL_INTR_STATUS_L1_8_0); + if (val & APPL_INTR_STATUS_L1_8_0_AUTO_BW_INT_STS) { + appl_writel(pcie, + APPL_INTR_STATUS_L1_8_0_AUTO_BW_INT_STS, + APPL_INTR_STATUS_L1_8_0); + apply_bad_link_workaround(pp); + } + if (val & APPL_INTR_STATUS_L1_8_0_BW_MGT_INT_STS) { + appl_writel(pcie, + APPL_INTR_STATUS_L1_8_0_BW_MGT_INT_STS, + APPL_INTR_STATUS_L1_8_0); + + val_w = dw_pcie_readw_dbi(pci, pcie->pcie_cap_base + + PCI_EXP_LNKSTA); + dev_dbg(pci->dev, "Link Speed : Gen-%u\n", val_w & + PCI_EXP_LNKSTA_CLS); + } + } + + val = appl_readl(pcie, APPL_INTR_STATUS_L0); + if (val & APPL_INTR_STATUS_L0_CDM_REG_CHK_INT) { + val = appl_readl(pcie, APPL_INTR_STATUS_L1_18); + tmp = dw_pcie_readl_dbi(pci, PCIE_PL_CHK_REG_CONTROL_STATUS); + if (val & APPL_INTR_STATUS_L1_18_CDM_REG_CHK_CMPLT) { + dev_info(pci->dev, "CDM check complete\n"); + tmp |= PCIE_PL_CHK_REG_CHK_REG_COMPLETE; + } + if (val & APPL_INTR_STATUS_L1_18_CDM_REG_CHK_CMP_ERR) { + dev_err(pci->dev, "CDM comparison mismatch\n"); + tmp |= PCIE_PL_CHK_REG_CHK_REG_COMPARISON_ERROR; + } + if (val & APPL_INTR_STATUS_L1_18_CDM_REG_CHK_LOGIC_ERR) { + dev_err(pci->dev, "CDM Logic error\n"); + tmp |= PCIE_PL_CHK_REG_CHK_REG_LOGIC_ERROR; + } + dw_pcie_writel_dbi(pci, PCIE_PL_CHK_REG_CONTROL_STATUS, tmp); + tmp = dw_pcie_readl_dbi(pci, PCIE_PL_CHK_REG_ERR_ADDR); + dev_err(pci->dev, "CDM Error Address Offset = 0x%08X\n", tmp); + } + + return IRQ_HANDLED; +} + +static irqreturn_t tegra_pcie_irq_handler(int irq, void *arg) +{ + struct tegra_pcie_dw *pcie = arg; + + return tegra_pcie_rp_irq_handler(pcie); +} + +static int tegra_pcie_dw_rd_own_conf(struct pcie_port *pp, int where, int size, + u32 *val) +{ + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + + /* + * This is an endpoint mode specific register happen to appear even + * when controller is operating in root port mode and system hangs + * when it is accessed with link being in ASPM-L1 state. + * So skip accessing it altogether + */ + if (where == PORT_LOGIC_MSIX_DOORBELL) { + *val = 0x00000000; + return PCIBIOS_SUCCESSFUL; + } + + return dw_pcie_read(pci->dbi_base + where, size, val); +} + +static int tegra_pcie_dw_wr_own_conf(struct pcie_port *pp, int where, int size, + u32 val) +{ + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + + /* + * This is an endpoint mode specific register happen to appear even + * when controller is operating in root port mode and system hangs + * when it is accessed with link being in ASPM-L1 state. + * So skip accessing it altogether + */ + if (where == PORT_LOGIC_MSIX_DOORBELL) + return PCIBIOS_SUCCESSFUL; + + return dw_pcie_write(pci->dbi_base + where, size, val); +} + +#if defined(CONFIG_PCIEASPM) +static void disable_aspm_l11(struct tegra_pcie_dw *pcie) +{ + u32 val; + + val = dw_pcie_readl_dbi(&pcie->pci, pcie->cfg_link_cap_l1sub); + val &= ~PCI_L1SS_CAP_ASPM_L1_1; + dw_pcie_writel_dbi(&pcie->pci, pcie->cfg_link_cap_l1sub, val); +} + +static void disable_aspm_l12(struct tegra_pcie_dw *pcie) +{ + u32 val; + + val = dw_pcie_readl_dbi(&pcie->pci, pcie->cfg_link_cap_l1sub); + val &= ~PCI_L1SS_CAP_ASPM_L1_2; + dw_pcie_writel_dbi(&pcie->pci, pcie->cfg_link_cap_l1sub, val); +} + +static inline u32 event_counter_prog(struct tegra_pcie_dw *pcie, u32 event) +{ + u32 val; + + val = dw_pcie_readl_dbi(&pcie->pci, event_cntr_ctrl_offset[pcie->cid]); + val &= ~(EVENT_COUNTER_EVENT_SEL_MASK << EVENT_COUNTER_EVENT_SEL_SHIFT); + val |= EVENT_COUNTER_GROUP_5 << EVENT_COUNTER_GROUP_SEL_SHIFT; + val |= event << EVENT_COUNTER_EVENT_SEL_SHIFT; + val |= EVENT_COUNTER_ENABLE_ALL << EVENT_COUNTER_ENABLE_SHIFT; + dw_pcie_writel_dbi(&pcie->pci, event_cntr_ctrl_offset[pcie->cid], val); + val = dw_pcie_readl_dbi(&pcie->pci, event_cntr_data_offset[pcie->cid]); + + return val; +} + +static int aspm_state_cnt(struct seq_file *s, void *data) +{ + struct tegra_pcie_dw *pcie = (struct tegra_pcie_dw *) + dev_get_drvdata(s->private); + u32 val; + + seq_printf(s, "Tx L0s entry count : %u\n", + event_counter_prog(pcie, EVENT_COUNTER_EVENT_Tx_L0S)); + + seq_printf(s, "Rx L0s entry count : %u\n", + event_counter_prog(pcie, EVENT_COUNTER_EVENT_Rx_L0S)); + + seq_printf(s, "Link L1 entry count : %u\n", + event_counter_prog(pcie, EVENT_COUNTER_EVENT_L1)); + + seq_printf(s, "Link L1.1 entry count : %u\n", + event_counter_prog(pcie, EVENT_COUNTER_EVENT_L1_1)); + + seq_printf(s, "Link L1.2 entry count : %u\n", + event_counter_prog(pcie, EVENT_COUNTER_EVENT_L1_2)); + + /* Clear all counters */ + dw_pcie_writel_dbi(&pcie->pci, event_cntr_ctrl_offset[pcie->cid], + EVENT_COUNTER_ALL_CLEAR); + + /* Re-enable counting */ + val = EVENT_COUNTER_ENABLE_ALL << EVENT_COUNTER_ENABLE_SHIFT; + val |= EVENT_COUNTER_GROUP_5 << EVENT_COUNTER_GROUP_SEL_SHIFT; + dw_pcie_writel_dbi(&pcie->pci, event_cntr_ctrl_offset[pcie->cid], val); + + return 0; +} + +static void init_host_aspm(struct tegra_pcie_dw *pcie) +{ + struct dw_pcie *pci = &pcie->pci; + u32 val; + + val = dw_pcie_find_ext_capability(pci, PCI_EXT_CAP_ID_L1SS); + pcie->cfg_link_cap_l1sub = val + PCI_L1SS_CAP; + + /* Enable ASPM counters */ + val = EVENT_COUNTER_ENABLE_ALL << EVENT_COUNTER_ENABLE_SHIFT; + val |= EVENT_COUNTER_GROUP_5 << EVENT_COUNTER_GROUP_SEL_SHIFT; + dw_pcie_writel_dbi(pci, event_cntr_ctrl_offset[pcie->cid], val); + + /* Program T_cmrt and T_pwr_on values */ + val = dw_pcie_readl_dbi(pci, pcie->cfg_link_cap_l1sub); + val &= ~(PCI_L1SS_CAP_CM_RESTORE_TIME | PCI_L1SS_CAP_P_PWR_ON_VALUE); + val |= (pcie->aspm_cmrt << 8); + val |= (pcie->aspm_pwr_on_t << 19); + dw_pcie_writel_dbi(pci, pcie->cfg_link_cap_l1sub, val); + + /* Program L0s and L1 entrance latencies */ + val = dw_pcie_readl_dbi(pci, PORT_LOGIC_ACK_F_ASPM_CTRL); + val &= ~L0S_ENTRANCE_LAT_MASK; + val |= (pcie->aspm_l0s_enter_lat << L0S_ENTRANCE_LAT_SHIFT); + val |= ENTER_ASPM; + dw_pcie_writel_dbi(pci, PORT_LOGIC_ACK_F_ASPM_CTRL, val); +} + +static int init_debugfs(struct tegra_pcie_dw *pcie) +{ + struct dentry *d; + + d = debugfs_create_devm_seqfile(pcie->dev, "aspm_state_cnt", + pcie->debugfs, aspm_state_cnt); + if (IS_ERR_OR_NULL(d)) + dev_err(pcie->dev, + "Failed to create debugfs file \"aspm_state_cnt\"\n"); + + return 0; +} +#else +static inline void disable_aspm_l12(struct tegra_pcie_dw *pcie) { return; } +static inline void disable_aspm_l11(struct tegra_pcie_dw *pcie) { return; } +static inline void init_host_aspm(struct tegra_pcie_dw *pcie) { return; } +static inline int init_debugfs(struct tegra_pcie_dw *pcie) { return 0; } +#endif + +static void tegra_pcie_enable_system_interrupts(struct pcie_port *pp) +{ + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + struct tegra_pcie_dw *pcie = to_tegra_pcie(pci); + u32 val; + u16 val_w; + + val = appl_readl(pcie, APPL_INTR_EN_L0_0); + val |= APPL_INTR_EN_L0_0_LINK_STATE_INT_EN; + appl_writel(pcie, val, APPL_INTR_EN_L0_0); + + val = appl_readl(pcie, APPL_INTR_EN_L1_0_0); + val |= APPL_INTR_EN_L1_0_0_LINK_REQ_RST_NOT_INT_EN; + appl_writel(pcie, val, APPL_INTR_EN_L1_0_0); + + if (pcie->enable_cdm_check) { + val = appl_readl(pcie, APPL_INTR_EN_L0_0); + val |= APPL_INTR_EN_L0_0_CDM_REG_CHK_INT_EN; + appl_writel(pcie, val, APPL_INTR_EN_L0_0); + + val = appl_readl(pcie, APPL_INTR_EN_L1_18); + val |= APPL_INTR_EN_L1_18_CDM_REG_CHK_CMP_ERR; + val |= APPL_INTR_EN_L1_18_CDM_REG_CHK_LOGIC_ERR; + appl_writel(pcie, val, APPL_INTR_EN_L1_18); + } + + val_w = dw_pcie_readw_dbi(&pcie->pci, pcie->pcie_cap_base + + PCI_EXP_LNKSTA); + pcie->init_link_width = (val_w & PCI_EXP_LNKSTA_NLW) >> + PCI_EXP_LNKSTA_NLW_SHIFT; + + val_w = dw_pcie_readw_dbi(&pcie->pci, pcie->pcie_cap_base + + PCI_EXP_LNKCTL); + val_w |= PCI_EXP_LNKCTL_LBMIE; + dw_pcie_writew_dbi(&pcie->pci, pcie->pcie_cap_base + PCI_EXP_LNKCTL, + val_w); +} + +static void tegra_pcie_enable_legacy_interrupts(struct pcie_port *pp) +{ + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + struct tegra_pcie_dw *pcie = to_tegra_pcie(pci); + u32 val; + + /* Enable legacy interrupt generation */ + val = appl_readl(pcie, APPL_INTR_EN_L0_0); + val |= APPL_INTR_EN_L0_0_SYS_INTR_EN; + val |= APPL_INTR_EN_L0_0_INT_INT_EN; + appl_writel(pcie, val, APPL_INTR_EN_L0_0); + + val = appl_readl(pcie, APPL_INTR_EN_L1_8_0); + val |= APPL_INTR_EN_L1_8_INTX_EN; + val |= APPL_INTR_EN_L1_8_AUTO_BW_INT_EN; + val |= APPL_INTR_EN_L1_8_BW_MGT_INT_EN; + if (IS_ENABLED(CONFIG_PCIEAER)) + val |= APPL_INTR_EN_L1_8_AER_INT_EN; + appl_writel(pcie, val, APPL_INTR_EN_L1_8_0); +} + +static void tegra_pcie_enable_msi_interrupts(struct pcie_port *pp) +{ + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + struct tegra_pcie_dw *pcie = to_tegra_pcie(pci); + u32 val; + + dw_pcie_msi_init(pp); + + /* Enable MSI interrupt generation */ + val = appl_readl(pcie, APPL_INTR_EN_L0_0); + val |= APPL_INTR_EN_L0_0_SYS_MSI_INTR_EN; + val |= APPL_INTR_EN_L0_0_MSI_RCV_INT_EN; + appl_writel(pcie, val, APPL_INTR_EN_L0_0); +} + +static void tegra_pcie_enable_interrupts(struct pcie_port *pp) +{ + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + struct tegra_pcie_dw *pcie = to_tegra_pcie(pci); + + /* Clear interrupt statuses before enabling interrupts */ + appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L0); + appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_0_0); + appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_1); + appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_2); + appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_3); + appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_6); + appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_7); + appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_8_0); + appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_9); + appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_10); + appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_11); + appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_13); + appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_14); + appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_15); + appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_17); + + tegra_pcie_enable_system_interrupts(pp); + tegra_pcie_enable_legacy_interrupts(pp); + if (IS_ENABLED(CONFIG_PCI_MSI)) + tegra_pcie_enable_msi_interrupts(pp); +} + +static void config_gen3_gen4_eq_presets(struct tegra_pcie_dw *pcie) +{ + struct dw_pcie *pci = &pcie->pci; + u32 val, offset, i; + + /* Program init preset */ + for (i = 0; i < pcie->num_lanes; i++) { + dw_pcie_read(pci->dbi_base + CAP_SPCIE_CAP_OFF + + (i * 2), 2, &val); + val &= ~CAP_SPCIE_CAP_OFF_DSP_TX_PRESET0_MASK; + val |= GEN3_GEN4_EQ_PRESET_INIT; + val &= ~CAP_SPCIE_CAP_OFF_USP_TX_PRESET0_MASK; + val |= (GEN3_GEN4_EQ_PRESET_INIT << + CAP_SPCIE_CAP_OFF_USP_TX_PRESET0_SHIFT); + dw_pcie_write(pci->dbi_base + CAP_SPCIE_CAP_OFF + + (i * 2), 2, val); + + offset = dw_pcie_find_ext_capability(pci, + PCI_EXT_CAP_ID_PL_16GT) + + PCI_PL_16GT_LE_CTRL; + dw_pcie_read(pci->dbi_base + offset + i, 1, &val); + val &= ~PCI_PL_16GT_LE_CTRL_DSP_TX_PRESET_MASK; + val |= GEN3_GEN4_EQ_PRESET_INIT; + val &= ~PCI_PL_16GT_LE_CTRL_USP_TX_PRESET_MASK; + val |= (GEN3_GEN4_EQ_PRESET_INIT << + PCI_PL_16GT_LE_CTRL_USP_TX_PRESET_SHIFT); + dw_pcie_write(pci->dbi_base + offset + i, 1, val); + } + + val = dw_pcie_readl_dbi(pci, GEN3_RELATED_OFF); + val &= ~GEN3_RELATED_OFF_RATE_SHADOW_SEL_MASK; + dw_pcie_writel_dbi(pci, GEN3_RELATED_OFF, val); + + val = dw_pcie_readl_dbi(pci, GEN3_EQ_CONTROL_OFF); + val &= ~GEN3_EQ_CONTROL_OFF_PSET_REQ_VEC_MASK; + val |= (0x3ff << GEN3_EQ_CONTROL_OFF_PSET_REQ_VEC_SHIFT); + val &= ~GEN3_EQ_CONTROL_OFF_FB_MODE_MASK; + dw_pcie_writel_dbi(pci, GEN3_EQ_CONTROL_OFF, val); + + val = dw_pcie_readl_dbi(pci, GEN3_RELATED_OFF); + val &= ~GEN3_RELATED_OFF_RATE_SHADOW_SEL_MASK; + val |= (0x1 << GEN3_RELATED_OFF_RATE_SHADOW_SEL_SHIFT); + dw_pcie_writel_dbi(pci, GEN3_RELATED_OFF, val); + + val = dw_pcie_readl_dbi(pci, GEN3_EQ_CONTROL_OFF); + val &= ~GEN3_EQ_CONTROL_OFF_PSET_REQ_VEC_MASK; + val |= (0x360 << GEN3_EQ_CONTROL_OFF_PSET_REQ_VEC_SHIFT); + val &= ~GEN3_EQ_CONTROL_OFF_FB_MODE_MASK; + dw_pcie_writel_dbi(pci, GEN3_EQ_CONTROL_OFF, val); + + val = dw_pcie_readl_dbi(pci, GEN3_RELATED_OFF); + val &= ~GEN3_RELATED_OFF_RATE_SHADOW_SEL_MASK; + dw_pcie_writel_dbi(pci, GEN3_RELATED_OFF, val); +} + +static void tegra_pcie_prepare_host(struct pcie_port *pp) +{ + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + struct tegra_pcie_dw *pcie = to_tegra_pcie(pci); + u32 val; + + val = dw_pcie_readl_dbi(pci, PCI_IO_BASE); + val &= ~(IO_BASE_IO_DECODE | IO_BASE_IO_DECODE_BIT8); + dw_pcie_writel_dbi(pci, PCI_IO_BASE, val); + + val = dw_pcie_readl_dbi(pci, PCI_PREF_MEMORY_BASE); + val |= CFG_PREF_MEM_LIMIT_BASE_MEM_DECODE; + val |= CFG_PREF_MEM_LIMIT_BASE_MEM_LIMIT_DECODE; + dw_pcie_writel_dbi(pci, PCI_PREF_MEMORY_BASE, val); + + dw_pcie_writel_dbi(pci, PCI_BASE_ADDRESS_0, 0); + + /* Configure FTS */ + val = dw_pcie_readl_dbi(pci, PORT_LOGIC_ACK_F_ASPM_CTRL); + val &= ~(N_FTS_MASK << N_FTS_SHIFT); + val |= N_FTS_VAL << N_FTS_SHIFT; + dw_pcie_writel_dbi(pci, PORT_LOGIC_ACK_F_ASPM_CTRL, val); + + val = dw_pcie_readl_dbi(pci, PORT_LOGIC_GEN2_CTRL); + val &= ~FTS_MASK; + val |= FTS_VAL; + dw_pcie_writel_dbi(pci, PORT_LOGIC_GEN2_CTRL, val); + + /* Enable as 0xFFFF0001 response for CRS */ + val = dw_pcie_readl_dbi(pci, PORT_LOGIC_AMBA_ERROR_RESPONSE_DEFAULT); + val &= ~(AMBA_ERROR_RESPONSE_CRS_MASK << AMBA_ERROR_RESPONSE_CRS_SHIFT); + val |= (AMBA_ERROR_RESPONSE_CRS_OKAY_FFFF0001 << + AMBA_ERROR_RESPONSE_CRS_SHIFT); + dw_pcie_writel_dbi(pci, PORT_LOGIC_AMBA_ERROR_RESPONSE_DEFAULT, val); + + /* Configure Max Speed from DT */ + if (pcie->max_speed && pcie->max_speed != -EINVAL) { + val = dw_pcie_readl_dbi(pci, pcie->pcie_cap_base + + PCI_EXP_LNKCAP); + val &= ~PCI_EXP_LNKCAP_SLS; + val |= pcie->max_speed; + dw_pcie_writel_dbi(pci, pcie->pcie_cap_base + PCI_EXP_LNKCAP, + val); + } + + /* Configure Max lane width from DT */ + val = dw_pcie_readl_dbi(pci, pcie->pcie_cap_base + PCI_EXP_LNKCAP); + val &= ~PCI_EXP_LNKCAP_MLW; + val |= (pcie->num_lanes << PCI_EXP_LNKSTA_NLW_SHIFT); + dw_pcie_writel_dbi(pci, pcie->pcie_cap_base + PCI_EXP_LNKCAP, val); + + config_gen3_gen4_eq_presets(pcie); + + init_host_aspm(pcie); + + val = dw_pcie_readl_dbi(pci, GEN3_RELATED_OFF); + val &= ~GEN3_RELATED_OFF_GEN3_ZRXDC_NONCOMPL; + dw_pcie_writel_dbi(pci, GEN3_RELATED_OFF, val); + + if (pcie->update_fc_fixup) { + val = dw_pcie_readl_dbi(pci, CFG_TIMER_CTRL_MAX_FUNC_NUM_OFF); + val |= 0x1 << CFG_TIMER_CTRL_ACK_NAK_SHIFT; + dw_pcie_writel_dbi(pci, CFG_TIMER_CTRL_MAX_FUNC_NUM_OFF, val); + } + + dw_pcie_setup_rc(pp); + + clk_set_rate(pcie->core_clk, GEN4_CORE_CLK_FREQ); + + /* Assert RST */ + val = appl_readl(pcie, APPL_PINMUX); + val &= ~APPL_PINMUX_PEX_RST; + appl_writel(pcie, val, APPL_PINMUX); + + usleep_range(100, 200); + + /* Enable LTSSM */ + val = appl_readl(pcie, APPL_CTRL); + val |= APPL_CTRL_LTSSM_EN; + appl_writel(pcie, val, APPL_CTRL); + + /* De-assert RST */ + val = appl_readl(pcie, APPL_PINMUX); + val |= APPL_PINMUX_PEX_RST; + appl_writel(pcie, val, APPL_PINMUX); + + msleep(100); +} + +static int tegra_pcie_dw_host_init(struct pcie_port *pp) +{ + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + struct tegra_pcie_dw *pcie = to_tegra_pcie(pci); + u32 val, tmp, offset, speed; + + tegra_pcie_prepare_host(pp); + + if (dw_pcie_wait_for_link(pci)) { + /* + * There are some endpoints which can't get the link up if + * root port has Data Link Feature (DLF) enabled. + * Refer Spec rev 4.0 ver 1.0 sec 3.4.2 & 7.7.4 for more info + * on Scaled Flow Control and DLF. + * So, need to confirm that is indeed the case here and attempt + * link up once again with DLF disabled. + */ + val = appl_readl(pcie, APPL_DEBUG); + val &= APPL_DEBUG_LTSSM_STATE_MASK; + val >>= APPL_DEBUG_LTSSM_STATE_SHIFT; + tmp = appl_readl(pcie, APPL_LINK_STATUS); + tmp &= APPL_LINK_STATUS_RDLH_LINK_UP; + if (!(val == 0x11 && !tmp)) { + /* Link is down for all good reasons */ + return 0; + } + + dev_info(pci->dev, "Link is down in DLL"); + dev_info(pci->dev, "Trying again with DLFE disabled\n"); + /* Disable LTSSM */ + val = appl_readl(pcie, APPL_CTRL); + val &= ~APPL_CTRL_LTSSM_EN; + appl_writel(pcie, val, APPL_CTRL); + + reset_control_assert(pcie->core_rst); + reset_control_deassert(pcie->core_rst); + + offset = dw_pcie_find_ext_capability(pci, PCI_EXT_CAP_ID_DLF); + val = dw_pcie_readl_dbi(pci, offset + PCI_DLF_CAP); + val &= ~PCI_DLF_EXCHANGE_ENABLE; + dw_pcie_writel_dbi(pci, offset, val); + + tegra_pcie_prepare_host(pp); + + if (dw_pcie_wait_for_link(pci)) + return 0; + } + + speed = dw_pcie_readw_dbi(pci, pcie->pcie_cap_base + PCI_EXP_LNKSTA) & + PCI_EXP_LNKSTA_CLS; + clk_set_rate(pcie->core_clk, pcie_gen_freq[speed - 1]); + + tegra_pcie_enable_interrupts(pp); + + return 0; +} + +static int tegra_pcie_dw_link_up(struct dw_pcie *pci) +{ + struct tegra_pcie_dw *pcie = to_tegra_pcie(pci); + u32 val = dw_pcie_readw_dbi(pci, pcie->pcie_cap_base + PCI_EXP_LNKSTA); + + return !!(val & PCI_EXP_LNKSTA_DLLLA); +} + +static void tegra_pcie_set_msi_vec_num(struct pcie_port *pp) +{ + pp->num_vectors = MAX_MSI_IRQS; +} + +static const struct dw_pcie_ops tegra_dw_pcie_ops = { + .link_up = tegra_pcie_dw_link_up, +}; + +static struct dw_pcie_host_ops tegra_pcie_dw_host_ops = { + .rd_own_conf = tegra_pcie_dw_rd_own_conf, + .wr_own_conf = tegra_pcie_dw_wr_own_conf, + .host_init = tegra_pcie_dw_host_init, + .set_num_vectors = tegra_pcie_set_msi_vec_num, +}; + +static void tegra_pcie_disable_phy(struct tegra_pcie_dw *pcie) +{ + unsigned int phy_count = pcie->phy_count; + + while (phy_count--) { + phy_power_off(pcie->phys[phy_count]); + phy_exit(pcie->phys[phy_count]); + } +} + +static int tegra_pcie_enable_phy(struct tegra_pcie_dw *pcie) +{ + unsigned int i; + int ret; + + for (i = 0; i < pcie->phy_count; i++) { + ret = phy_init(pcie->phys[i]); + if (ret < 0) + goto phy_power_off; + + ret = phy_power_on(pcie->phys[i]); + if (ret < 0) + goto phy_exit; + } + + return 0; + +phy_power_off: + while (i--) { + phy_power_off(pcie->phys[i]); +phy_exit: + phy_exit(pcie->phys[i]); + } + + return ret; +} + +static int tegra_pcie_dw_parse_dt(struct tegra_pcie_dw *pcie) +{ + struct device_node *np = pcie->dev->of_node; + int ret; + + ret = of_property_read_u32(np, "nvidia,aspm-cmrt-us", &pcie->aspm_cmrt); + if (ret < 0) { + dev_info(pcie->dev, "Failed to read ASPM T_cmrt: %d\n", ret); + return ret; + } + + ret = of_property_read_u32(np, "nvidia,aspm-pwr-on-t-us", + &pcie->aspm_pwr_on_t); + if (ret < 0) + dev_info(pcie->dev, "Failed to read ASPM Power On time: %d\n", + ret); + + ret = of_property_read_u32(np, "nvidia,aspm-l0s-entrance-latency-us", + &pcie->aspm_l0s_enter_lat); + if (ret < 0) + dev_info(pcie->dev, + "Failed to read ASPM L0s Entrance latency: %d\n", ret); + + ret = of_property_read_u32(np, "num-lanes", &pcie->num_lanes); + if (ret < 0) { + dev_err(pcie->dev, "Failed to read num-lanes: %d\n", ret); + return ret; + } + + pcie->max_speed = of_pci_get_max_link_speed(np); + + ret = of_property_read_u32_index(np, "nvidia,bpmp", 1, &pcie->cid); + if (ret) { + dev_err(pcie->dev, "Failed to read Controller-ID: %d\n", ret); + return ret; + } + + ret = of_property_count_strings(np, "phy-names"); + if (ret < 0) { + dev_err(pcie->dev, "Failed to find PHY entries: %d\n", + ret); + return ret; + } + pcie->phy_count = ret; + + if (of_property_read_bool(np, "nvidia,update-fc-fixup")) + pcie->update_fc_fixup = true; + + pcie->supports_clkreq = + of_property_read_bool(pcie->dev->of_node, "supports-clkreq"); + + pcie->enable_cdm_check = + of_property_read_bool(np, "snps,enable-cdm-check"); + + return 0; +} + +static int tegra_pcie_bpmp_set_ctrl_state(struct tegra_pcie_dw *pcie, + bool enable) +{ + struct mrq_uphy_response resp; + struct tegra_bpmp_message msg; + struct mrq_uphy_request req; + + /* Controller-5 doesn't need to have its state set by BPMP-FW */ + if (pcie->cid == 5) + return 0; + + memset(&req, 0, sizeof(req)); + memset(&resp, 0, sizeof(resp)); + + req.cmd = CMD_UPHY_PCIE_CONTROLLER_STATE; + req.controller_state.pcie_controller = pcie->cid; + req.controller_state.enable = enable; + + memset(&msg, 0, sizeof(msg)); + msg.mrq = MRQ_UPHY; + msg.tx.data = &req; + msg.tx.size = sizeof(req); + msg.rx.data = &resp; + msg.rx.size = sizeof(resp); + + return tegra_bpmp_transfer(pcie->bpmp, &msg); +} + +static void tegra_pcie_downstream_dev_to_D0(struct tegra_pcie_dw *pcie) +{ + struct pcie_port *pp = &pcie->pci.pp; + struct pci_bus *child, *root_bus = NULL; + struct pci_dev *pdev; + + /* + * link doesn't go into L2 state with some of the endpoints with Tegra + * if they are not in D0 state. So, need to make sure that immediate + * downstream devices are in D0 state before sending PME_TurnOff to put + * link into L2 state. + * This is as per PCI Express Base r4.0 v1.0 September 27-2017, + * 5.2 Link State Power Management (Page #428). + */ + + list_for_each_entry(child, &pp->root_bus->children, node) { + /* Bring downstream devices to D0 if they are not already in */ + if (child->parent == pp->root_bus) { + root_bus = child; + break; + } + } + + if (!root_bus) { + dev_err(pcie->dev, "Failed to find downstream devices\n"); + return; + } + + list_for_each_entry(pdev, &root_bus->devices, bus_list) { + if (PCI_SLOT(pdev->devfn) == 0) { + if (pci_set_power_state(pdev, PCI_D0)) + dev_err(pcie->dev, + "Failed to transition %s to D0 state\n", + dev_name(&pdev->dev)); + } + } +} + +static int tegra_pcie_config_controller(struct tegra_pcie_dw *pcie, + bool en_hw_hot_rst) +{ + int ret; + u32 val; + + ret = tegra_pcie_bpmp_set_ctrl_state(pcie, true); + if (ret) { + dev_err(pcie->dev, + "Failed to enable controller %u: %d\n", pcie->cid, ret); + return ret; + } + + ret = regulator_enable(pcie->pex_ctl_supply); + if (ret < 0) { + dev_err(pcie->dev, "Failed to enable regulator: %d\n", ret); + goto fail_reg_en; + } + + ret = clk_prepare_enable(pcie->core_clk); + if (ret) { + dev_err(pcie->dev, "Failed to enable core clock: %d\n", ret); + goto fail_core_clk; + } + + ret = reset_control_deassert(pcie->core_apb_rst); + if (ret) { + dev_err(pcie->dev, "Failed to deassert core APB reset: %d\n", + ret); + goto fail_core_apb_rst; + } + + if (en_hw_hot_rst) { + /* Enable HW_HOT_RST mode */ + val = appl_readl(pcie, APPL_CTRL); + val &= ~(APPL_CTRL_HW_HOT_RST_MODE_MASK << + APPL_CTRL_HW_HOT_RST_MODE_SHIFT); + val |= APPL_CTRL_HW_HOT_RST_EN; + appl_writel(pcie, val, APPL_CTRL); + } + + ret = tegra_pcie_enable_phy(pcie); + if (ret) { + dev_err(pcie->dev, "Failed to enable PHY: %d\n", ret); + goto fail_phy; + } + + /* Update CFG base address */ + appl_writel(pcie, pcie->dbi_res->start & APPL_CFG_BASE_ADDR_MASK, + APPL_CFG_BASE_ADDR); + + /* Configure this core for RP mode operation */ + appl_writel(pcie, APPL_DM_TYPE_RP, APPL_DM_TYPE); + + appl_writel(pcie, 0x0, APPL_CFG_SLCG_OVERRIDE); + + val = appl_readl(pcie, APPL_CTRL); + appl_writel(pcie, val | APPL_CTRL_SYS_PRE_DET_STATE, APPL_CTRL); + + val = appl_readl(pcie, APPL_CFG_MISC); + val |= (APPL_CFG_MISC_ARCACHE_VAL << APPL_CFG_MISC_ARCACHE_SHIFT); + appl_writel(pcie, val, APPL_CFG_MISC); + + if (!pcie->supports_clkreq) { + val = appl_readl(pcie, APPL_PINMUX); + val |= APPL_PINMUX_CLKREQ_OUT_OVRD_EN; + val |= APPL_PINMUX_CLKREQ_OUT_OVRD; + appl_writel(pcie, val, APPL_PINMUX); + } + + /* Update iATU_DMA base address */ + appl_writel(pcie, + pcie->atu_dma_res->start & APPL_CFG_IATU_DMA_BASE_ADDR_MASK, + APPL_CFG_IATU_DMA_BASE_ADDR); + + reset_control_deassert(pcie->core_rst); + + pcie->pcie_cap_base = dw_pcie_find_capability(&pcie->pci, + PCI_CAP_ID_EXP); + + /* Disable ASPM-L1SS advertisement as there is no CLKREQ routing */ + if (!pcie->supports_clkreq) { + disable_aspm_l11(pcie); + disable_aspm_l12(pcie); + } + + return ret; + +fail_phy: + reset_control_assert(pcie->core_apb_rst); +fail_core_apb_rst: + clk_disable_unprepare(pcie->core_clk); +fail_core_clk: + regulator_disable(pcie->pex_ctl_supply); +fail_reg_en: + tegra_pcie_bpmp_set_ctrl_state(pcie, false); + + return ret; +} + +static int __deinit_controller(struct tegra_pcie_dw *pcie) +{ + int ret; + + ret = reset_control_assert(pcie->core_rst); + if (ret) { + dev_err(pcie->dev, "Failed to assert \"core\" reset: %d\n", + ret); + return ret; + } + + tegra_pcie_disable_phy(pcie); + + ret = reset_control_assert(pcie->core_apb_rst); + if (ret) { + dev_err(pcie->dev, "Failed to assert APB reset: %d\n", ret); + return ret; + } + + clk_disable_unprepare(pcie->core_clk); + + ret = regulator_disable(pcie->pex_ctl_supply); + if (ret) { + dev_err(pcie->dev, "Failed to disable regulator: %d\n", ret); + return ret; + } + + ret = tegra_pcie_bpmp_set_ctrl_state(pcie, false); + if (ret) { + dev_err(pcie->dev, "Failed to disable controller %d: %d\n", + pcie->cid, ret); + return ret; + } + + return ret; +} + +static int tegra_pcie_init_controller(struct tegra_pcie_dw *pcie) +{ + struct dw_pcie *pci = &pcie->pci; + struct pcie_port *pp = &pci->pp; + int ret; + + ret = tegra_pcie_config_controller(pcie, false); + if (ret < 0) + return ret; + + pp->ops = &tegra_pcie_dw_host_ops; + + ret = dw_pcie_host_init(pp); + if (ret < 0) { + dev_err(pcie->dev, "Failed to add PCIe port: %d\n", ret); + goto fail_host_init; + } + + return 0; + +fail_host_init: + return __deinit_controller(pcie); +} + +static int tegra_pcie_try_link_l2(struct tegra_pcie_dw *pcie) +{ + u32 val; + + if (!tegra_pcie_dw_link_up(&pcie->pci)) + return 0; + + val = appl_readl(pcie, APPL_RADM_STATUS); + val |= APPL_PM_XMT_TURNOFF_STATE; + appl_writel(pcie, val, APPL_RADM_STATUS); + + return readl_poll_timeout_atomic(pcie->appl_base + APPL_DEBUG, val, + val & APPL_DEBUG_PM_LINKST_IN_L2_LAT, + 1, PME_ACK_TIMEOUT); +} + +static void tegra_pcie_dw_pme_turnoff(struct tegra_pcie_dw *pcie) +{ + u32 data; + int err; + + if (!tegra_pcie_dw_link_up(&pcie->pci)) { + dev_dbg(pcie->dev, "PCIe link is not up...!\n"); + return; + } + + if (tegra_pcie_try_link_l2(pcie)) { + dev_info(pcie->dev, "Link didn't transition to L2 state\n"); + /* + * TX lane clock freq will reset to Gen1 only if link is in L2 + * or detect state. + * So apply pex_rst to end point to force RP to go into detect + * state + */ + data = appl_readl(pcie, APPL_PINMUX); + data &= ~APPL_PINMUX_PEX_RST; + appl_writel(pcie, data, APPL_PINMUX); + + err = readl_poll_timeout_atomic(pcie->appl_base + APPL_DEBUG, + data, + ((data & + APPL_DEBUG_LTSSM_STATE_MASK) >> + APPL_DEBUG_LTSSM_STATE_SHIFT) == + LTSSM_STATE_PRE_DETECT, + 1, LTSSM_TIMEOUT); + if (err) { + dev_info(pcie->dev, "Link didn't go to detect state\n"); + } else { + /* Disable LTSSM after link is in detect state */ + data = appl_readl(pcie, APPL_CTRL); + data &= ~APPL_CTRL_LTSSM_EN; + appl_writel(pcie, data, APPL_CTRL); + } + } + /* + * DBI registers may not be accessible after this as PLL-E would be + * down depending on how CLKREQ is pulled by end point + */ + data = appl_readl(pcie, APPL_PINMUX); + data |= (APPL_PINMUX_CLKREQ_OVERRIDE_EN | APPL_PINMUX_CLKREQ_OVERRIDE); + /* Cut REFCLK to slot */ + data |= APPL_PINMUX_CLK_OUTPUT_IN_OVERRIDE_EN; + data &= ~APPL_PINMUX_CLK_OUTPUT_IN_OVERRIDE; + appl_writel(pcie, data, APPL_PINMUX); +} + +static int tegra_pcie_deinit_controller(struct tegra_pcie_dw *pcie) +{ + tegra_pcie_downstream_dev_to_D0(pcie); + dw_pcie_host_deinit(&pcie->pci.pp); + tegra_pcie_dw_pme_turnoff(pcie); + + return __deinit_controller(pcie); +} + +static int tegra_pcie_config_rp(struct tegra_pcie_dw *pcie) +{ + struct pcie_port *pp = &pcie->pci.pp; + struct device *dev = pcie->dev; + char *name; + int ret; + + if (IS_ENABLED(CONFIG_PCI_MSI)) { + pp->msi_irq = of_irq_get_byname(dev->of_node, "msi"); + if (!pp->msi_irq) { + dev_err(dev, "Failed to get MSI interrupt\n"); + return -ENODEV; + } + } + + pm_runtime_enable(dev); + + ret = pm_runtime_get_sync(dev); + if (ret < 0) { + dev_err(dev, "Failed to get runtime sync for PCIe dev: %d\n", + ret); + pm_runtime_disable(dev); + return ret; + } + + tegra_pcie_init_controller(pcie); + + pcie->link_state = tegra_pcie_dw_link_up(&pcie->pci); + if (!pcie->link_state) { + ret = -ENOMEDIUM; + goto fail_host_init; + } + + name = devm_kasprintf(dev, GFP_KERNEL, "%pOFP", dev->of_node); + if (!name) { + ret = -ENOMEM; + goto fail_host_init; + } + + pcie->debugfs = debugfs_create_dir(name, NULL); + if (!pcie->debugfs) + dev_err(dev, "Failed to create debugfs\n"); + else + init_debugfs(pcie); + + return ret; + +fail_host_init: + tegra_pcie_deinit_controller(pcie); + pm_runtime_put_sync(dev); + pm_runtime_disable(dev); + return ret; +} + +static int tegra_pcie_dw_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct resource *atu_dma_res; + struct tegra_pcie_dw *pcie; + struct resource *dbi_res; + struct pcie_port *pp; + struct dw_pcie *pci; + struct phy **phys; + char *name; + int ret; + u32 i; + + pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL); + if (!pcie) + return -ENOMEM; + + pci = &pcie->pci; + pci->dev = &pdev->dev; + pci->ops = &tegra_dw_pcie_ops; + pp = &pci->pp; + pcie->dev = &pdev->dev; + + ret = tegra_pcie_dw_parse_dt(pcie); + if (ret < 0) { + dev_err(dev, "Failed to parse device tree: %d\n", ret); + return ret; + } + + pcie->pex_ctl_supply = devm_regulator_get(dev, "vddio-pex-ctl"); + if (IS_ERR(pcie->pex_ctl_supply)) { + ret = PTR_ERR(pcie->pex_ctl_supply); + if (ret != -EPROBE_DEFER) + dev_err(dev, "Failed to get regulator: %ld\n", + PTR_ERR(pcie->pex_ctl_supply)); + return ret; + } + + pcie->core_clk = devm_clk_get(dev, "core"); + if (IS_ERR(pcie->core_clk)) { + dev_err(dev, "Failed to get core clock: %ld\n", + PTR_ERR(pcie->core_clk)); + return PTR_ERR(pcie->core_clk); + } + + pcie->appl_res = platform_get_resource_byname(pdev, IORESOURCE_MEM, + "appl"); + if (!pcie->appl_res) { + dev_err(dev, "Failed to find \"appl\" region\n"); + return -ENODEV; + } + + pcie->appl_base = devm_ioremap_resource(dev, pcie->appl_res); + if (IS_ERR(pcie->appl_base)) + return PTR_ERR(pcie->appl_base); + + pcie->core_apb_rst = devm_reset_control_get(dev, "apb"); + if (IS_ERR(pcie->core_apb_rst)) { + dev_err(dev, "Failed to get APB reset: %ld\n", + PTR_ERR(pcie->core_apb_rst)); + return PTR_ERR(pcie->core_apb_rst); + } + + phys = devm_kcalloc(dev, pcie->phy_count, sizeof(*phys), GFP_KERNEL); + if (!phys) + return -ENOMEM; + + for (i = 0; i < pcie->phy_count; i++) { + name = kasprintf(GFP_KERNEL, "p2u-%u", i); + if (!name) { + dev_err(dev, "Failed to create P2U string\n"); + return -ENOMEM; + } + phys[i] = devm_phy_get(dev, name); + kfree(name); + if (IS_ERR(phys[i])) { + ret = PTR_ERR(phys[i]); + if (ret != -EPROBE_DEFER) + dev_err(dev, "Failed to get PHY: %d\n", ret); + return ret; + } + } + + pcie->phys = phys; + + dbi_res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "dbi"); + if (!dbi_res) { + dev_err(dev, "Failed to find \"dbi\" region\n"); + return -ENODEV; + } + pcie->dbi_res = dbi_res; + + pci->dbi_base = devm_ioremap_resource(dev, dbi_res); + if (IS_ERR(pci->dbi_base)) + return PTR_ERR(pci->dbi_base); + + /* Tegra HW locates DBI2 at a fixed offset from DBI */ + pci->dbi_base2 = pci->dbi_base + 0x1000; + + atu_dma_res = platform_get_resource_byname(pdev, IORESOURCE_MEM, + "atu_dma"); + if (!atu_dma_res) { + dev_err(dev, "Failed to find \"atu_dma\" region\n"); + return -ENODEV; + } + pcie->atu_dma_res = atu_dma_res; + + pci->atu_base = devm_ioremap_resource(dev, atu_dma_res); + if (IS_ERR(pci->atu_base)) + return PTR_ERR(pci->atu_base); + + pcie->core_rst = devm_reset_control_get(dev, "core"); + if (IS_ERR(pcie->core_rst)) { + dev_err(dev, "Failed to get core reset: %ld\n", + PTR_ERR(pcie->core_rst)); + return PTR_ERR(pcie->core_rst); + } + + pp->irq = platform_get_irq_byname(pdev, "intr"); + if (!pp->irq) { + dev_err(dev, "Failed to get \"intr\" interrupt\n"); + return -ENODEV; + } + + ret = devm_request_irq(dev, pp->irq, tegra_pcie_irq_handler, + IRQF_SHARED, "tegra-pcie-intr", pcie); + if (ret) { + dev_err(dev, "Failed to request IRQ %d: %d\n", pp->irq, ret); + return ret; + } + + pcie->bpmp = tegra_bpmp_get(dev); + if (IS_ERR(pcie->bpmp)) + return PTR_ERR(pcie->bpmp); + + platform_set_drvdata(pdev, pcie); + + ret = tegra_pcie_config_rp(pcie); + if (ret && ret != -ENOMEDIUM) + goto fail; + else + return 0; + +fail: + tegra_bpmp_put(pcie->bpmp); + return ret; +} + +static int tegra_pcie_dw_remove(struct platform_device *pdev) +{ + struct tegra_pcie_dw *pcie = platform_get_drvdata(pdev); + + if (!pcie->link_state) + return 0; + + debugfs_remove_recursive(pcie->debugfs); + tegra_pcie_deinit_controller(pcie); + pm_runtime_put_sync(pcie->dev); + pm_runtime_disable(pcie->dev); + tegra_bpmp_put(pcie->bpmp); + + return 0; +} + +static int tegra_pcie_dw_suspend_late(struct device *dev) +{ + struct tegra_pcie_dw *pcie = dev_get_drvdata(dev); + u32 val; + + if (!pcie->link_state) + return 0; + + /* Enable HW_HOT_RST mode */ + val = appl_readl(pcie, APPL_CTRL); + val &= ~(APPL_CTRL_HW_HOT_RST_MODE_MASK << + APPL_CTRL_HW_HOT_RST_MODE_SHIFT); + val |= APPL_CTRL_HW_HOT_RST_EN; + appl_writel(pcie, val, APPL_CTRL); + + return 0; +} + +static int tegra_pcie_dw_suspend_noirq(struct device *dev) +{ + struct tegra_pcie_dw *pcie = dev_get_drvdata(dev); + + if (!pcie->link_state) + return 0; + + /* Save MSI interrupt vector */ + pcie->msi_ctrl_int = dw_pcie_readl_dbi(&pcie->pci, + PORT_LOGIC_MSI_CTRL_INT_0_EN); + tegra_pcie_downstream_dev_to_D0(pcie); + tegra_pcie_dw_pme_turnoff(pcie); + + return __deinit_controller(pcie); +} + +static int tegra_pcie_dw_resume_noirq(struct device *dev) +{ + struct tegra_pcie_dw *pcie = dev_get_drvdata(dev); + int ret; + + if (!pcie->link_state) + return 0; + + ret = tegra_pcie_config_controller(pcie, true); + if (ret < 0) + return ret; + + ret = tegra_pcie_dw_host_init(&pcie->pci.pp); + if (ret < 0) { + dev_err(dev, "Failed to init host: %d\n", ret); + goto fail_host_init; + } + + /* Restore MSI interrupt vector */ + dw_pcie_writel_dbi(&pcie->pci, PORT_LOGIC_MSI_CTRL_INT_0_EN, + pcie->msi_ctrl_int); + + return 0; + +fail_host_init: + return __deinit_controller(pcie); +} + +static int tegra_pcie_dw_resume_early(struct device *dev) +{ + struct tegra_pcie_dw *pcie = dev_get_drvdata(dev); + u32 val; + + if (!pcie->link_state) + return 0; + + /* Disable HW_HOT_RST mode */ + val = appl_readl(pcie, APPL_CTRL); + val &= ~(APPL_CTRL_HW_HOT_RST_MODE_MASK << + APPL_CTRL_HW_HOT_RST_MODE_SHIFT); + val |= APPL_CTRL_HW_HOT_RST_MODE_IMDT_RST << + APPL_CTRL_HW_HOT_RST_MODE_SHIFT; + val &= ~APPL_CTRL_HW_HOT_RST_EN; + appl_writel(pcie, val, APPL_CTRL); + + return 0; +} + +static void tegra_pcie_dw_shutdown(struct platform_device *pdev) +{ + struct tegra_pcie_dw *pcie = platform_get_drvdata(pdev); + + if (!pcie->link_state) + return; + + debugfs_remove_recursive(pcie->debugfs); + tegra_pcie_downstream_dev_to_D0(pcie); + + disable_irq(pcie->pci.pp.irq); + if (IS_ENABLED(CONFIG_PCI_MSI)) + disable_irq(pcie->pci.pp.msi_irq); + + tegra_pcie_dw_pme_turnoff(pcie); + __deinit_controller(pcie); +} + +static const struct of_device_id tegra_pcie_dw_of_match[] = { + { + .compatible = "nvidia,tegra194-pcie", + }, + {}, +}; + +static const struct dev_pm_ops tegra_pcie_dw_pm_ops = { + .suspend_late = tegra_pcie_dw_suspend_late, + .suspend_noirq = tegra_pcie_dw_suspend_noirq, + .resume_noirq = tegra_pcie_dw_resume_noirq, + .resume_early = tegra_pcie_dw_resume_early, +}; + +static struct platform_driver tegra_pcie_dw_driver = { + .probe = tegra_pcie_dw_probe, + .remove = tegra_pcie_dw_remove, + .shutdown = tegra_pcie_dw_shutdown, + .driver = { + .name = "tegra194-pcie", + .pm = &tegra_pcie_dw_pm_ops, + .of_match_table = tegra_pcie_dw_of_match, + }, +}; +module_platform_driver(tegra_pcie_dw_driver); + +MODULE_DEVICE_TABLE(of, tegra_pcie_dw_of_match); + +MODULE_AUTHOR("Vidya Sagar "); +MODULE_DESCRIPTION("NVIDIA PCIe host controller driver"); +MODULE_LICENSE("GPL v2"); From 151481ef5e360af4ca68835943a036339d3c7826 Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Thu, 5 Sep 2019 16:15:48 +0530 Subject: [PATCH 294/721] dt-bindings: PCI: tegra: Add sideband pins configuration entries Add optional bindings "pinctrl-names" and "pinctrl-0" to describe pin configuration information of a particular PCIe controller. Signed-off-by: Vidya Sagar Signed-off-by: Lorenzo Pieralisi Reviewed-by: Andrew Murray Reviewed-by: Rob Herring Acked-by: Thierry Reding --- .../devicetree/bindings/pci/nvidia,tegra194-pcie.txt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Documentation/devicetree/bindings/pci/nvidia,tegra194-pcie.txt b/Documentation/devicetree/bindings/pci/nvidia,tegra194-pcie.txt index 674e5adb2895..0ac1b867ac24 100644 --- a/Documentation/devicetree/bindings/pci/nvidia,tegra194-pcie.txt +++ b/Documentation/devicetree/bindings/pci/nvidia,tegra194-pcie.txt @@ -83,6 +83,11 @@ Required properties: - vddio-pex-ctl-supply: Regulator supply for PCIe side band signals Optional properties: +- pinctrl-names: A list of pinctrl state names. + It is mandatory for C5 controller and optional for other controllers. + - "default": Configures PCIe I/O for proper operation. +- pinctrl-0: phandle for the 'default' state of pin configuration. + It is mandatory for C5 controller and optional for other controllers. - supports-clkreq: Refer to Documentation/devicetree/bindings/pci/pci.txt - nvidia,update-fc-fixup: This is a boolean property and needs to be present to improve performance when a platform is designed in such a way that it @@ -120,6 +125,9 @@ Tegra194: num-lanes = <8>; linux,pci-domain = <0>; + pinctrl-names = "default"; + pinctrl-0 = <&pex_rst_c5_out_state>, <&clkreq_c5_bi_dir_state>; + clocks = <&bpmp TEGRA194_CLK_PEX0_CORE_0>; clock-names = "core"; From 7ed106d8fdfa4e6cbf9226f49ce921a681c9f240 Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Thu, 5 Sep 2019 16:15:49 +0530 Subject: [PATCH 295/721] dt-bindings: PCI: tegra: Add PCIe slot supplies regulator entries Add optional bindings "vpcie3v3-supply" and "vpcie12v-supply" to describe regulators of a PCIe slot's supplies 3.3V and 12V provided the platform is designed to have regulator controlled slot supplies. Signed-off-by: Vidya Sagar Signed-off-by: Lorenzo Pieralisi Reviewed-by: Andrew Murray Reviewed-by: Rob Herring Acked-by: Thierry Reding --- .../devicetree/bindings/pci/nvidia,tegra194-pcie.txt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Documentation/devicetree/bindings/pci/nvidia,tegra194-pcie.txt b/Documentation/devicetree/bindings/pci/nvidia,tegra194-pcie.txt index 0ac1b867ac24..b739f92da58e 100644 --- a/Documentation/devicetree/bindings/pci/nvidia,tegra194-pcie.txt +++ b/Documentation/devicetree/bindings/pci/nvidia,tegra194-pcie.txt @@ -104,6 +104,12 @@ Optional properties: specified in microseconds - nvidia,aspm-l0s-entrance-latency-us: ASPM L0s entrance latency to be specified in microseconds +- vpcie3v3-supply: A phandle to the regulator node that supplies 3.3V to the slot + if the platform has one such slot. (Ex:- x16 slot owned by C5 controller + in p2972-0000 platform). +- vpcie12v-supply: A phandle to the regulator node that supplies 12V to the slot + if the platform has one such slot. (Ex:- x16 slot owned by C5 controller + in p2972-0000 platform). Examples: ========= @@ -156,6 +162,8 @@ Tegra194: 0xc2000000 0x18 0x00000000 0x18 0x00000000 0x4 0x00000000>; /* prefetchable memory (16GB) */ vddio-pex-ctl-supply = <&vdd_1v8ao>; + vpcie3v3-supply = <&vdd_3v3_pcie>; + vpcie12v-supply = <&vdd_12v_pcie>; phys = <&p2u_hsio_2>, <&p2u_hsio_3>, <&p2u_hsio_4>, <&p2u_hsio_5>; From f73f8a504e27959576a2f4d85182202561e426f2 Mon Sep 17 00:00:00 2001 From: Haiyang Zhang Date: Thu, 15 Aug 2019 17:01:45 +0000 Subject: [PATCH 296/721] PCI: hv: Use bytes 4 and 5 from instance ID as the PCI domain numbers As recommended by Azure host team, the bytes 4, 5 have more uniqueness (info entropy) than bytes 8, 9 so use them as the PCI domain numbers. On older hosts, bytes 4, 5 can also be used -- no backward compatibility issues are introduced and the chance of collision is greatly reduced. In the rare cases of collision, the driver code detects and finds another number that is not in use. Suggested-by: Michael Kelley Signed-off-by: Haiyang Zhang Signed-off-by: Lorenzo Pieralisi Acked-by: Sasha Levin --- drivers/pci/controller/pci-hyperv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index 4caa3388692a..3a56de6b2ec2 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -2590,7 +2590,7 @@ static int hv_pci_probe(struct hv_device *hdev, * (2) There will be no overlap between domains (after fixing possible * collisions) in the same VM. */ - dom_req = hdev->dev_instance.b[8] << 8 | hdev->dev_instance.b[9]; + dom_req = hdev->dev_instance.b[5] << 8 | hdev->dev_instance.b[4]; dom = hv_get_dom_num(dom_req); if (dom == HVPCI_DOM_INVALID) { From 76e43c8ccaa35c30d5df853013561145a0f750a5 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 8 Sep 2019 20:15:18 -0700 Subject: [PATCH 297/721] fuse: fix deadlock with aio poll and fuse_iqueue::waitq.lock When IOCB_CMD_POLL is used on the FUSE device, aio_poll() disables IRQs and takes kioctx::ctx_lock, then fuse_iqueue::waitq.lock. This may have to wait for fuse_iqueue::waitq.lock to be released by one of many places that take it with IRQs enabled. Since the IRQ handler may take kioctx::ctx_lock, lockdep reports that a deadlock is possible. Fix it by protecting the state of struct fuse_iqueue with a separate spinlock, and only accessing fuse_iqueue::waitq using the versions of the waitqueue functions which do IRQ-safe locking internally. Reproducer: #include #include #include #include #include #include #include int main() { char opts[128]; int fd = open("/dev/fuse", O_RDWR); aio_context_t ctx = 0; struct iocb cb = { .aio_lio_opcode = IOCB_CMD_POLL, .aio_fildes = fd }; struct iocb *cbp = &cb; sprintf(opts, "fd=%d,rootmode=040000,user_id=0,group_id=0", fd); mkdir("mnt", 0700); mount("foo", "mnt", "fuse", 0, opts); syscall(__NR_io_setup, 1, &ctx); syscall(__NR_io_submit, ctx, 1, &cbp); } Beginning of lockdep output: ===================================================== WARNING: SOFTIRQ-safe -> SOFTIRQ-unsafe lock order detected 5.3.0-rc5 #9 Not tainted ----------------------------------------------------- syz_fuse/135 [HC0[0]:SC0[0]:HE0:SE1] is trying to acquire: 000000003590ceda (&fiq->waitq){+.+.}, at: spin_lock include/linux/spinlock.h:338 [inline] 000000003590ceda (&fiq->waitq){+.+.}, at: aio_poll fs/aio.c:1751 [inline] 000000003590ceda (&fiq->waitq){+.+.}, at: __io_submit_one.constprop.0+0x203/0x5b0 fs/aio.c:1825 and this task is already holding: 0000000075037284 (&(&ctx->ctx_lock)->rlock){..-.}, at: spin_lock_irq include/linux/spinlock.h:363 [inline] 0000000075037284 (&(&ctx->ctx_lock)->rlock){..-.}, at: aio_poll fs/aio.c:1749 [inline] 0000000075037284 (&(&ctx->ctx_lock)->rlock){..-.}, at: __io_submit_one.constprop.0+0x1f4/0x5b0 fs/aio.c:1825 which would create a new lock dependency: (&(&ctx->ctx_lock)->rlock){..-.} -> (&fiq->waitq){+.+.} but this new dependency connects a SOFTIRQ-irq-safe lock: (&(&ctx->ctx_lock)->rlock){..-.} [...] Reported-by: syzbot+af05535bb79520f95431@syzkaller.appspotmail.com Reported-by: syzbot+d86c4426a01f60feddc7@syzkaller.appspotmail.com Fixes: bfe4037e722e ("aio: implement IOCB_CMD_POLL") Cc: # v4.19+ Cc: Christoph Hellwig Signed-off-by: Eric Biggers Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 93 +++++++++++++++++++++++++----------------------- fs/fuse/fuse_i.h | 3 ++ fs/fuse/inode.c | 1 + 3 files changed, 52 insertions(+), 45 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index fdb85895737b..5a5dd6b2f91e 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -377,7 +377,7 @@ static void queue_request(struct fuse_iqueue *fiq, struct fuse_req *req) req->in.h.len = sizeof(struct fuse_in_header) + len_args(req->in.numargs, (struct fuse_arg *) req->in.args); list_add_tail(&req->list, &fiq->pending); - wake_up_locked(&fiq->waitq); + wake_up(&fiq->waitq); kill_fasync(&fiq->fasync, SIGIO, POLL_IN); } @@ -389,16 +389,16 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, forget->forget_one.nodeid = nodeid; forget->forget_one.nlookup = nlookup; - spin_lock(&fiq->waitq.lock); + spin_lock(&fiq->lock); if (fiq->connected) { fiq->forget_list_tail->next = forget; fiq->forget_list_tail = forget; - wake_up_locked(&fiq->waitq); + wake_up(&fiq->waitq); kill_fasync(&fiq->fasync, SIGIO, POLL_IN); } else { kfree(forget); } - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); } static void flush_bg_queue(struct fuse_conn *fc) @@ -412,10 +412,10 @@ static void flush_bg_queue(struct fuse_conn *fc) req = list_first_entry(&fc->bg_queue, struct fuse_req, list); list_del(&req->list); fc->active_background++; - spin_lock(&fiq->waitq.lock); + spin_lock(&fiq->lock); req->in.h.unique = fuse_get_unique(fiq); queue_request(fiq, req); - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); } } @@ -439,9 +439,9 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) * smp_mb() from queue_interrupt(). */ if (!list_empty(&req->intr_entry)) { - spin_lock(&fiq->waitq.lock); + spin_lock(&fiq->lock); list_del_init(&req->intr_entry); - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); } WARN_ON(test_bit(FR_PENDING, &req->flags)); WARN_ON(test_bit(FR_SENT, &req->flags)); @@ -483,10 +483,10 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) { - spin_lock(&fiq->waitq.lock); + spin_lock(&fiq->lock); /* Check for we've sent request to interrupt this req */ if (unlikely(!test_bit(FR_INTERRUPTED, &req->flags))) { - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); return -EINVAL; } @@ -499,13 +499,13 @@ static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) smp_mb(); if (test_bit(FR_FINISHED, &req->flags)) { list_del_init(&req->intr_entry); - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); return 0; } - wake_up_locked(&fiq->waitq); + wake_up(&fiq->waitq); kill_fasync(&fiq->fasync, SIGIO, POLL_IN); } - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); return 0; } @@ -535,16 +535,16 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) if (!err) return; - spin_lock(&fiq->waitq.lock); + spin_lock(&fiq->lock); /* Request is not yet in userspace, bail out */ if (test_bit(FR_PENDING, &req->flags)) { list_del(&req->list); - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); __fuse_put_request(req); req->out.h.error = -EINTR; return; } - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); } /* @@ -559,9 +559,9 @@ static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) struct fuse_iqueue *fiq = &fc->iq; BUG_ON(test_bit(FR_BACKGROUND, &req->flags)); - spin_lock(&fiq->waitq.lock); + spin_lock(&fiq->lock); if (!fiq->connected) { - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); req->out.h.error = -ENOTCONN; } else { req->in.h.unique = fuse_get_unique(fiq); @@ -569,7 +569,7 @@ static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) /* acquire extra reference, since request is still needed after request_end() */ __fuse_get_request(req); - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); request_wait_answer(fc, req); /* Pairs with smp_wmb() in request_end() */ @@ -700,12 +700,12 @@ static int fuse_request_send_notify_reply(struct fuse_conn *fc, __clear_bit(FR_ISREPLY, &req->flags); req->in.h.unique = unique; - spin_lock(&fiq->waitq.lock); + spin_lock(&fiq->lock); if (fiq->connected) { queue_request(fiq, req); err = 0; } - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); return err; } @@ -1149,12 +1149,12 @@ static int request_pending(struct fuse_iqueue *fiq) * Unlike other requests this is assembled on demand, without a need * to allocate a separate fuse_req structure. * - * Called with fiq->waitq.lock held, releases it + * Called with fiq->lock held, releases it */ static int fuse_read_interrupt(struct fuse_iqueue *fiq, struct fuse_copy_state *cs, size_t nbytes, struct fuse_req *req) -__releases(fiq->waitq.lock) +__releases(fiq->lock) { struct fuse_in_header ih; struct fuse_interrupt_in arg; @@ -1169,7 +1169,7 @@ __releases(fiq->waitq.lock) ih.unique = (req->in.h.unique | FUSE_INT_REQ_BIT); arg.unique = req->in.h.unique; - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); if (nbytes < reqsize) return -EINVAL; @@ -1206,7 +1206,7 @@ static struct fuse_forget_link *dequeue_forget(struct fuse_iqueue *fiq, static int fuse_read_single_forget(struct fuse_iqueue *fiq, struct fuse_copy_state *cs, size_t nbytes) -__releases(fiq->waitq.lock) +__releases(fiq->lock) { int err; struct fuse_forget_link *forget = dequeue_forget(fiq, 1, NULL); @@ -1220,7 +1220,7 @@ __releases(fiq->waitq.lock) .len = sizeof(ih) + sizeof(arg), }; - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); kfree(forget); if (nbytes < ih.len) return -EINVAL; @@ -1238,7 +1238,7 @@ __releases(fiq->waitq.lock) static int fuse_read_batch_forget(struct fuse_iqueue *fiq, struct fuse_copy_state *cs, size_t nbytes) -__releases(fiq->waitq.lock) +__releases(fiq->lock) { int err; unsigned max_forgets; @@ -1252,13 +1252,13 @@ __releases(fiq->waitq.lock) }; if (nbytes < ih.len) { - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); return -EINVAL; } max_forgets = (nbytes - ih.len) / sizeof(struct fuse_forget_one); head = dequeue_forget(fiq, max_forgets, &count); - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); arg.count = count; ih.len += count * sizeof(struct fuse_forget_one); @@ -1288,7 +1288,7 @@ __releases(fiq->waitq.lock) static int fuse_read_forget(struct fuse_conn *fc, struct fuse_iqueue *fiq, struct fuse_copy_state *cs, size_t nbytes) -__releases(fiq->waitq.lock) +__releases(fiq->lock) { if (fc->minor < 16 || fiq->forget_list_head.next->next == NULL) return fuse_read_single_forget(fiq, cs, nbytes); @@ -1336,16 +1336,19 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, return -EINVAL; restart: - spin_lock(&fiq->waitq.lock); - err = -EAGAIN; - if ((file->f_flags & O_NONBLOCK) && fiq->connected && - !request_pending(fiq)) - goto err_unlock; + for (;;) { + spin_lock(&fiq->lock); + if (!fiq->connected || request_pending(fiq)) + break; + spin_unlock(&fiq->lock); - err = wait_event_interruptible_exclusive_locked(fiq->waitq, + if (file->f_flags & O_NONBLOCK) + return -EAGAIN; + err = wait_event_interruptible_exclusive(fiq->waitq, !fiq->connected || request_pending(fiq)); - if (err) - goto err_unlock; + if (err) + return err; + } if (!fiq->connected) { err = fc->aborted ? -ECONNABORTED : -ENODEV; @@ -1369,7 +1372,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, req = list_entry(fiq->pending.next, struct fuse_req, list); clear_bit(FR_PENDING, &req->flags); list_del_init(&req->list); - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); in = &req->in; reqsize = in->h.len; @@ -1427,7 +1430,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, return err; err_unlock: - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); return err; } @@ -2139,12 +2142,12 @@ static __poll_t fuse_dev_poll(struct file *file, poll_table *wait) fiq = &fud->fc->iq; poll_wait(file, &fiq->waitq, wait); - spin_lock(&fiq->waitq.lock); + spin_lock(&fiq->lock); if (!fiq->connected) mask = EPOLLERR; else if (request_pending(fiq)) mask |= EPOLLIN | EPOLLRDNORM; - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); return mask; } @@ -2239,15 +2242,15 @@ void fuse_abort_conn(struct fuse_conn *fc) flush_bg_queue(fc); spin_unlock(&fc->bg_lock); - spin_lock(&fiq->waitq.lock); + spin_lock(&fiq->lock); fiq->connected = 0; list_for_each_entry(req, &fiq->pending, list) clear_bit(FR_PENDING, &req->flags); list_splice_tail_init(&fiq->pending, &to_end); while (forget_pending(fiq)) kfree(dequeue_forget(fiq, 1, NULL)); - wake_up_all_locked(&fiq->waitq); - spin_unlock(&fiq->waitq.lock); + wake_up_all(&fiq->waitq); + spin_unlock(&fiq->lock); kill_fasync(&fiq->fasync, SIGIO, POLL_IN); end_polls(fc); wake_up_all(&fc->blocked_waitq); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 24dbca777775..89bdc41e0d86 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -450,6 +450,9 @@ struct fuse_iqueue { /** Connection established */ unsigned connected; + /** Lock protecting accesses to members of this structure */ + spinlock_t lock; + /** Readers of the connection are waiting on this */ wait_queue_head_t waitq; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 2183967261a4..9ae9fdd5a014 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -589,6 +589,7 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root) static void fuse_iqueue_init(struct fuse_iqueue *fiq) { memset(fiq, 0, sizeof(struct fuse_iqueue)); + spin_lock_init(&fiq->lock); init_waitqueue_head(&fiq->waitq); INIT_LIST_HEAD(&fiq->pending); INIT_LIST_HEAD(&fiq->interrupts); From d5b4854357f47899ea5b0336b41b04e81b62b11d Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:08 +0200 Subject: [PATCH 298/721] fuse: flatten 'struct fuse_args' ...to make future expansion simpler. The hiearachical structure is a historical thing that does not serve any practical purpose. The generated code is excatly the same before and after the patch. Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 38 +++++----- fs/fuse/dir.c | 176 +++++++++++++++++++++++------------------------ fs/fuse/file.c | 116 +++++++++++++++---------------- fs/fuse/fuse_i.h | 21 ++---- fs/fuse/inode.c | 12 ++-- fs/fuse/xattr.c | 76 ++++++++++---------- 6 files changed, 216 insertions(+), 223 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 5a5dd6b2f91e..e865c6b61652 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -590,32 +590,32 @@ EXPORT_SYMBOL_GPL(fuse_request_send); static void fuse_adjust_compat(struct fuse_conn *fc, struct fuse_args *args) { - if (fc->minor < 4 && args->in.h.opcode == FUSE_STATFS) - args->out.args[0].size = FUSE_COMPAT_STATFS_SIZE; + if (fc->minor < 4 && args->opcode == FUSE_STATFS) + args->out_args[0].size = FUSE_COMPAT_STATFS_SIZE; if (fc->minor < 9) { - switch (args->in.h.opcode) { + switch (args->opcode) { case FUSE_LOOKUP: case FUSE_CREATE: case FUSE_MKNOD: case FUSE_MKDIR: case FUSE_SYMLINK: case FUSE_LINK: - args->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE; + args->out_args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE; break; case FUSE_GETATTR: case FUSE_SETATTR: - args->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE; + args->out_args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE; break; } } if (fc->minor < 12) { - switch (args->in.h.opcode) { + switch (args->opcode) { case FUSE_CREATE: - args->in.args[0].size = sizeof(struct fuse_open_in); + args->in_args[0].size = sizeof(struct fuse_open_in); break; case FUSE_MKNOD: - args->in.args[0].size = FUSE_COMPAT_MKNOD_IN_SIZE; + args->in_args[0].size = FUSE_COMPAT_MKNOD_IN_SIZE; break; } } @@ -633,19 +633,19 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) /* Needs to be done after fuse_get_req() so that fc->minor is valid */ fuse_adjust_compat(fc, args); - req->in.h.opcode = args->in.h.opcode; - req->in.h.nodeid = args->in.h.nodeid; - req->in.numargs = args->in.numargs; - memcpy(req->in.args, args->in.args, - args->in.numargs * sizeof(struct fuse_in_arg)); - req->out.argvar = args->out.argvar; - req->out.numargs = args->out.numargs; - memcpy(req->out.args, args->out.args, - args->out.numargs * sizeof(struct fuse_arg)); + req->in.h.opcode = args->opcode; + req->in.h.nodeid = args->nodeid; + req->in.numargs = args->in_numargs; + memcpy(req->in.args, args->in_args, + args->in_numargs * sizeof(struct fuse_in_arg)); + req->out.argvar = args->out_argvar; + req->out.numargs = args->out_numargs; + memcpy(req->out.args, args->out_args, + args->out_numargs * sizeof(struct fuse_arg)); fuse_request_send(fc, req); ret = req->out.h.error; - if (!ret && args->out.argvar) { - BUG_ON(args->out.numargs != 1); + if (!ret && args->out_argvar) { + BUG_ON(args->out_numargs != 1); ret = req->out.args[0].size; } fuse_put_request(fc, req); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index dd0f64f7bc06..30a96098e64a 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -139,14 +139,14 @@ static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_args *args, struct fuse_entry_out *outarg) { memset(outarg, 0, sizeof(struct fuse_entry_out)); - args->in.h.opcode = FUSE_LOOKUP; - args->in.h.nodeid = nodeid; - args->in.numargs = 1; - args->in.args[0].size = name->len + 1; - args->in.args[0].value = name->name; - args->out.numargs = 1; - args->out.args[0].size = sizeof(struct fuse_entry_out); - args->out.args[0].value = outarg; + args->opcode = FUSE_LOOKUP; + args->nodeid = nodeid; + args->in_numargs = 1; + args->in_args[0].size = name->len + 1; + args->in_args[0].value = name->name; + args->out_numargs = 1; + args->out_args[0].size = sizeof(struct fuse_entry_out); + args->out_args[0].value = outarg; } /* @@ -410,18 +410,18 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, inarg.flags = flags; inarg.mode = mode; inarg.umask = current_umask(); - args.in.h.opcode = FUSE_CREATE; - args.in.h.nodeid = get_node_id(dir); - args.in.numargs = 2; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.in.args[1].size = entry->d_name.len + 1; - args.in.args[1].value = entry->d_name.name; - args.out.numargs = 2; - args.out.args[0].size = sizeof(outentry); - args.out.args[0].value = &outentry; - args.out.args[1].size = sizeof(outopen); - args.out.args[1].value = &outopen; + args.opcode = FUSE_CREATE; + args.nodeid = get_node_id(dir); + args.in_numargs = 2; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.in_args[1].size = entry->d_name.len + 1; + args.in_args[1].value = entry->d_name.name; + args.out_numargs = 2; + args.out_args[0].size = sizeof(outentry); + args.out_args[0].value = &outentry; + args.out_args[1].size = sizeof(outopen); + args.out_args[1].value = &outopen; err = fuse_simple_request(fc, &args); if (err) goto out_free_ff; @@ -526,10 +526,10 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args, return -ENOMEM; memset(&outarg, 0, sizeof(outarg)); - args->in.h.nodeid = get_node_id(dir); - args->out.numargs = 1; - args->out.args[0].size = sizeof(outarg); - args->out.args[0].value = &outarg; + args->nodeid = get_node_id(dir); + args->out_numargs = 1; + args->out_args[0].size = sizeof(outarg); + args->out_args[0].value = &outarg; err = fuse_simple_request(fc, args); if (err) goto out_put_forget_req; @@ -582,12 +582,12 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode, inarg.mode = mode; inarg.rdev = new_encode_dev(rdev); inarg.umask = current_umask(); - args.in.h.opcode = FUSE_MKNOD; - args.in.numargs = 2; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.in.args[1].size = entry->d_name.len + 1; - args.in.args[1].value = entry->d_name.name; + args.opcode = FUSE_MKNOD; + args.in_numargs = 2; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.in_args[1].size = entry->d_name.len + 1; + args.in_args[1].value = entry->d_name.name; return create_new_entry(fc, &args, dir, entry, mode); } @@ -609,12 +609,12 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode) memset(&inarg, 0, sizeof(inarg)); inarg.mode = mode; inarg.umask = current_umask(); - args.in.h.opcode = FUSE_MKDIR; - args.in.numargs = 2; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.in.args[1].size = entry->d_name.len + 1; - args.in.args[1].value = entry->d_name.name; + args.opcode = FUSE_MKDIR; + args.in_numargs = 2; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.in_args[1].size = entry->d_name.len + 1; + args.in_args[1].value = entry->d_name.name; return create_new_entry(fc, &args, dir, entry, S_IFDIR); } @@ -625,12 +625,12 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry, unsigned len = strlen(link) + 1; FUSE_ARGS(args); - args.in.h.opcode = FUSE_SYMLINK; - args.in.numargs = 2; - args.in.args[0].size = entry->d_name.len + 1; - args.in.args[0].value = entry->d_name.name; - args.in.args[1].size = len; - args.in.args[1].value = link; + args.opcode = FUSE_SYMLINK; + args.in_numargs = 2; + args.in_args[0].size = entry->d_name.len + 1; + args.in_args[0].value = entry->d_name.name; + args.in_args[1].size = len; + args.in_args[1].value = link; return create_new_entry(fc, &args, dir, entry, S_IFLNK); } @@ -648,11 +648,11 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) struct fuse_conn *fc = get_fuse_conn(dir); FUSE_ARGS(args); - args.in.h.opcode = FUSE_UNLINK; - args.in.h.nodeid = get_node_id(dir); - args.in.numargs = 1; - args.in.args[0].size = entry->d_name.len + 1; - args.in.args[0].value = entry->d_name.name; + args.opcode = FUSE_UNLINK; + args.nodeid = get_node_id(dir); + args.in_numargs = 1; + args.in_args[0].size = entry->d_name.len + 1; + args.in_args[0].value = entry->d_name.name; err = fuse_simple_request(fc, &args); if (!err) { struct inode *inode = d_inode(entry); @@ -684,11 +684,11 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) struct fuse_conn *fc = get_fuse_conn(dir); FUSE_ARGS(args); - args.in.h.opcode = FUSE_RMDIR; - args.in.h.nodeid = get_node_id(dir); - args.in.numargs = 1; - args.in.args[0].size = entry->d_name.len + 1; - args.in.args[0].value = entry->d_name.name; + args.opcode = FUSE_RMDIR; + args.nodeid = get_node_id(dir); + args.in_numargs = 1; + args.in_args[0].size = entry->d_name.len + 1; + args.in_args[0].value = entry->d_name.name; err = fuse_simple_request(fc, &args); if (!err) { clear_nlink(d_inode(entry)); @@ -711,15 +711,15 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, memset(&inarg, 0, argsize); inarg.newdir = get_node_id(newdir); inarg.flags = flags; - args.in.h.opcode = opcode; - args.in.h.nodeid = get_node_id(olddir); - args.in.numargs = 3; - args.in.args[0].size = argsize; - args.in.args[0].value = &inarg; - args.in.args[1].size = oldent->d_name.len + 1; - args.in.args[1].value = oldent->d_name.name; - args.in.args[2].size = newent->d_name.len + 1; - args.in.args[2].value = newent->d_name.name; + args.opcode = opcode; + args.nodeid = get_node_id(olddir); + args.in_numargs = 3; + args.in_args[0].size = argsize; + args.in_args[0].value = &inarg; + args.in_args[1].size = oldent->d_name.len + 1; + args.in_args[1].value = oldent->d_name.name; + args.in_args[2].size = newent->d_name.len + 1; + args.in_args[2].value = newent->d_name.name; err = fuse_simple_request(fc, &args); if (!err) { /* ctime changes */ @@ -796,12 +796,12 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, memset(&inarg, 0, sizeof(inarg)); inarg.oldnodeid = get_node_id(inode); - args.in.h.opcode = FUSE_LINK; - args.in.numargs = 2; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.in.args[1].size = newent->d_name.len + 1; - args.in.args[1].value = newent->d_name.name; + args.opcode = FUSE_LINK; + args.in_numargs = 2; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.in_args[1].size = newent->d_name.len + 1; + args.in_args[1].value = newent->d_name.name; err = create_new_entry(fc, &args, newdir, newent, inode->i_mode); /* Contrary to "normal" filesystems it can happen that link makes two "logical" inodes point to the same "physical" @@ -884,14 +884,14 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, inarg.getattr_flags |= FUSE_GETATTR_FH; inarg.fh = ff->fh; } - args.in.h.opcode = FUSE_GETATTR; - args.in.h.nodeid = get_node_id(inode); - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.out.numargs = 1; - args.out.args[0].size = sizeof(outarg); - args.out.args[0].value = &outarg; + args.opcode = FUSE_GETATTR; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; err = fuse_simple_request(fc, &args); if (!err) { if ((inode->i_mode ^ outarg.attr.mode) & S_IFMT) { @@ -1056,11 +1056,11 @@ static int fuse_access(struct inode *inode, int mask) memset(&inarg, 0, sizeof(inarg)); inarg.mask = mask & (MAY_READ | MAY_WRITE | MAY_EXEC); - args.in.h.opcode = FUSE_ACCESS; - args.in.h.nodeid = get_node_id(inode); - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; + args.opcode = FUSE_ACCESS; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; err = fuse_simple_request(fc, &args); if (err == -ENOSYS) { fc->no_access = 1; @@ -1383,14 +1383,14 @@ static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_args *args, struct fuse_setattr_in *inarg_p, struct fuse_attr_out *outarg_p) { - args->in.h.opcode = FUSE_SETATTR; - args->in.h.nodeid = get_node_id(inode); - args->in.numargs = 1; - args->in.args[0].size = sizeof(*inarg_p); - args->in.args[0].value = inarg_p; - args->out.numargs = 1; - args->out.args[0].size = sizeof(*outarg_p); - args->out.args[0].value = outarg_p; + args->opcode = FUSE_SETATTR; + args->nodeid = get_node_id(inode); + args->in_numargs = 1; + args->in_args[0].size = sizeof(*inarg_p); + args->in_args[0].value = inarg_p; + args->out_numargs = 1; + args->out_args[0].size = sizeof(*outarg_p); + args->out_args[0].value = outarg_p; } /* diff --git a/fs/fuse/file.c b/fs/fuse/file.c index e076c2cf65b0..1f90722f0ee8 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -29,14 +29,14 @@ static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file, inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY); if (!fc->atomic_o_trunc) inarg.flags &= ~O_TRUNC; - args.in.h.opcode = opcode; - args.in.h.nodeid = nodeid; - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.out.numargs = 1; - args.out.args[0].size = sizeof(*outargp); - args.out.args[0].value = outargp; + args.opcode = opcode; + args.nodeid = nodeid; + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(*outargp); + args.out_args[0].value = outargp; return fuse_simple_request(fc, &args); } @@ -464,11 +464,11 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; inarg.fsync_flags = datasync ? FUSE_FSYNC_FDATASYNC : 0; - args.in.h.opcode = opcode; - args.in.h.nodeid = get_node_id(inode); - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; + args.opcode = opcode; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; return fuse_simple_request(fc, &args); } @@ -2221,11 +2221,11 @@ static void fuse_lk_fill(struct fuse_args *args, struct file *file, inarg->lk.pid = pid; if (flock) inarg->lk_flags |= FUSE_LK_FLOCK; - args->in.h.opcode = opcode; - args->in.h.nodeid = get_node_id(inode); - args->in.numargs = 1; - args->in.args[0].size = sizeof(*inarg); - args->in.args[0].value = inarg; + args->opcode = opcode; + args->nodeid = get_node_id(inode); + args->in_numargs = 1; + args->in_args[0].size = sizeof(*inarg); + args->in_args[0].value = inarg; } static int fuse_getlk(struct file *file, struct file_lock *fl) @@ -2238,9 +2238,9 @@ static int fuse_getlk(struct file *file, struct file_lock *fl) int err; fuse_lk_fill(&args, file, fl, FUSE_GETLK, 0, 0, &inarg); - args.out.numargs = 1; - args.out.args[0].size = sizeof(outarg); - args.out.args[0].value = &outarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; err = fuse_simple_request(fc, &args); if (!err) err = convert_fuse_file_lock(fc, &outarg.lk, fl); @@ -2335,14 +2335,14 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block) memset(&inarg, 0, sizeof(inarg)); inarg.block = block; inarg.blocksize = inode->i_sb->s_blocksize; - args.in.h.opcode = FUSE_BMAP; - args.in.h.nodeid = get_node_id(inode); - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.out.numargs = 1; - args.out.args[0].size = sizeof(outarg); - args.out.args[0].value = &outarg; + args.opcode = FUSE_BMAP; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; err = fuse_simple_request(fc, &args); if (err == -ENOSYS) fc->no_bmap = 1; @@ -2367,14 +2367,14 @@ static loff_t fuse_lseek(struct file *file, loff_t offset, int whence) if (fc->no_lseek) goto fallback; - args.in.h.opcode = FUSE_LSEEK; - args.in.h.nodeid = ff->nodeid; - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.out.numargs = 1; - args.out.args[0].size = sizeof(outarg); - args.out.args[0].value = &outarg; + args.opcode = FUSE_LSEEK; + args.nodeid = ff->nodeid; + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; err = fuse_simple_request(fc, &args); if (err) { if (err == -ENOSYS) { @@ -2860,14 +2860,14 @@ __poll_t fuse_file_poll(struct file *file, poll_table *wait) fuse_register_polled_file(fc, ff); } - args.in.h.opcode = FUSE_POLL; - args.in.h.nodeid = ff->nodeid; - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.out.numargs = 1; - args.out.args[0].size = sizeof(outarg); - args.out.args[0].value = &outarg; + args.opcode = FUSE_POLL; + args.nodeid = ff->nodeid; + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; err = fuse_simple_request(fc, &args); if (!err) @@ -3075,11 +3075,11 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, if (!(mode & FALLOC_FL_KEEP_SIZE)) set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); - args.in.h.opcode = FUSE_FALLOCATE; - args.in.h.nodeid = ff->nodeid; - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; + args.opcode = FUSE_FALLOCATE; + args.nodeid = ff->nodeid; + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; err = fuse_simple_request(fc, &args); if (err == -ENOSYS) { fc->no_fallocate = 1; @@ -3167,14 +3167,14 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, if (is_unstable) set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state); - args.in.h.opcode = FUSE_COPY_FILE_RANGE; - args.in.h.nodeid = ff_in->nodeid; - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.out.numargs = 1; - args.out.args[0].size = sizeof(outarg); - args.out.args[0].value = &outarg; + args.opcode = FUSE_COPY_FILE_RANGE; + args.nodeid = ff_in->nodeid; + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; err = fuse_simple_request(fc, &args); if (err == -ENOSYS) { fc->no_copy_file_range = 1; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 89bdc41e0d86..835c0671320c 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -287,20 +287,13 @@ struct fuse_page_desc { }; struct fuse_args { - struct { - struct { - uint32_t opcode; - uint64_t nodeid; - } h; - unsigned numargs; - struct fuse_in_arg args[3]; - - } in; - struct { - unsigned argvar:1; - unsigned numargs; - struct fuse_arg args[2]; - } out; + uint32_t opcode; + uint64_t nodeid; + unsigned int in_numargs; + struct fuse_in_arg in_args[3]; + unsigned int out_argvar:1; + unsigned int out_numargs; + struct fuse_arg out_args[2]; }; #define FUSE_ARGS(args) struct fuse_args args = {} diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 9ae9fdd5a014..4eaea0b29965 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -437,12 +437,12 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) } memset(&outarg, 0, sizeof(outarg)); - args.in.numargs = 0; - args.in.h.opcode = FUSE_STATFS; - args.in.h.nodeid = get_node_id(d_inode(dentry)); - args.out.numargs = 1; - args.out.args[0].size = sizeof(outarg); - args.out.args[0].value = &outarg; + args.in_numargs = 0; + args.opcode = FUSE_STATFS; + args.nodeid = get_node_id(d_inode(dentry)); + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; err = fuse_simple_request(fc, &args); if (!err) convert_fuse_statfs(buf, &outarg.st); diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c index 433717640f78..2e02486e46e6 100644 --- a/fs/fuse/xattr.c +++ b/fs/fuse/xattr.c @@ -25,15 +25,15 @@ int fuse_setxattr(struct inode *inode, const char *name, const void *value, memset(&inarg, 0, sizeof(inarg)); inarg.size = size; inarg.flags = flags; - args.in.h.opcode = FUSE_SETXATTR; - args.in.h.nodeid = get_node_id(inode); - args.in.numargs = 3; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.in.args[1].size = strlen(name) + 1; - args.in.args[1].value = name; - args.in.args[2].size = size; - args.in.args[2].value = value; + args.opcode = FUSE_SETXATTR; + args.nodeid = get_node_id(inode); + args.in_numargs = 3; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.in_args[1].size = strlen(name) + 1; + args.in_args[1].value = name; + args.in_args[2].size = size; + args.in_args[2].value = value; err = fuse_simple_request(fc, &args); if (err == -ENOSYS) { fc->no_setxattr = 1; @@ -60,22 +60,22 @@ ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value, memset(&inarg, 0, sizeof(inarg)); inarg.size = size; - args.in.h.opcode = FUSE_GETXATTR; - args.in.h.nodeid = get_node_id(inode); - args.in.numargs = 2; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.in.args[1].size = strlen(name) + 1; - args.in.args[1].value = name; + args.opcode = FUSE_GETXATTR; + args.nodeid = get_node_id(inode); + args.in_numargs = 2; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.in_args[1].size = strlen(name) + 1; + args.in_args[1].value = name; /* This is really two different operations rolled into one */ - args.out.numargs = 1; + args.out_numargs = 1; if (size) { - args.out.argvar = 1; - args.out.args[0].size = size; - args.out.args[0].value = value; + args.out_argvar = 1; + args.out_args[0].size = size; + args.out_args[0].value = value; } else { - args.out.args[0].size = sizeof(outarg); - args.out.args[0].value = &outarg; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; } ret = fuse_simple_request(fc, &args); if (!ret && !size) @@ -121,20 +121,20 @@ ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) memset(&inarg, 0, sizeof(inarg)); inarg.size = size; - args.in.h.opcode = FUSE_LISTXATTR; - args.in.h.nodeid = get_node_id(inode); - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; + args.opcode = FUSE_LISTXATTR; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; /* This is really two different operations rolled into one */ - args.out.numargs = 1; + args.out_numargs = 1; if (size) { - args.out.argvar = 1; - args.out.args[0].size = size; - args.out.args[0].value = list; + args.out_argvar = 1; + args.out_args[0].size = size; + args.out_args[0].value = list; } else { - args.out.args[0].size = sizeof(outarg); - args.out.args[0].value = &outarg; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; } ret = fuse_simple_request(fc, &args); if (!ret && !size) @@ -157,11 +157,11 @@ int fuse_removexattr(struct inode *inode, const char *name) if (fc->no_removexattr) return -EOPNOTSUPP; - args.in.h.opcode = FUSE_REMOVEXATTR; - args.in.h.nodeid = get_node_id(inode); - args.in.numargs = 1; - args.in.args[0].size = strlen(name) + 1; - args.in.args[0].value = name; + args.opcode = FUSE_REMOVEXATTR; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = strlen(name) + 1; + args.in_args[0].value = name; err = fuse_simple_request(fc, &args); if (err == -ENOSYS) { fc->no_removexattr = 1; From 1f4e9d03d1fbff428a0e864d5456e0a2dace6f81 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:08 +0200 Subject: [PATCH 299/721] fuse: rearrange and resize fuse_args fields This makes the structure better packed. Signed-off-by: Miklos Szeredi --- fs/fuse/fuse_i.h | 8 ++++---- fs/fuse/xattr.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 835c0671320c..a89362ee46d9 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -287,12 +287,12 @@ struct fuse_page_desc { }; struct fuse_args { - uint32_t opcode; uint64_t nodeid; - unsigned int in_numargs; + uint32_t opcode; + unsigned short in_numargs; + unsigned short out_numargs; + bool out_argvar:1; struct fuse_in_arg in_args[3]; - unsigned int out_argvar:1; - unsigned int out_numargs; struct fuse_arg out_args[2]; }; diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c index 2e02486e46e6..20d052e08b3b 100644 --- a/fs/fuse/xattr.c +++ b/fs/fuse/xattr.c @@ -70,7 +70,7 @@ ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value, /* This is really two different operations rolled into one */ args.out_numargs = 1; if (size) { - args.out_argvar = 1; + args.out_argvar = true; args.out_args[0].size = size; args.out_args[0].value = value; } else { @@ -129,7 +129,7 @@ ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) /* This is really two different operations rolled into one */ args.out_numargs = 1; if (size) { - args.out_argvar = 1; + args.out_argvar = true; args.out_args[0].size = size; args.out_args[0].value = list; } else { From 40ac7ab2d02176f8a70e37b88e41637ed97b304b Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:08 +0200 Subject: [PATCH 300/721] fuse: simplify 'nofail' request Instead of complex games with a reserved request, just use __GFP_NOFAIL. Both calers (flush, readdir) guarantee that connection was already initialized, so no need to wait for fc->initialized. Also remove unneeded clearing of FR_BACKGROUND flag. Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 64 +++--------------------------------------------- fs/fuse/file.c | 2 +- fs/fuse/fuse_i.h | 8 +----- fs/fuse/inode.c | 1 - 4 files changed, 6 insertions(+), 69 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index e865c6b61652..7833aee97565 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -242,52 +242,6 @@ struct fuse_req *fuse_get_req_for_background(struct fuse_conn *fc, } EXPORT_SYMBOL_GPL(fuse_get_req_for_background); -/* - * Return request in fuse_file->reserved_req. However that may - * currently be in use. If that is the case, wait for it to become - * available. - */ -static struct fuse_req *get_reserved_req(struct fuse_conn *fc, - struct file *file) -{ - struct fuse_req *req = NULL; - struct fuse_inode *fi = get_fuse_inode(file_inode(file)); - struct fuse_file *ff = file->private_data; - - do { - wait_event(fc->reserved_req_waitq, ff->reserved_req); - spin_lock(&fi->lock); - if (ff->reserved_req) { - req = ff->reserved_req; - ff->reserved_req = NULL; - req->stolen_file = get_file(file); - } - spin_unlock(&fi->lock); - } while (!req); - - return req; -} - -/* - * Put stolen request back into fuse_file->reserved_req - */ -static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req) -{ - struct file *file = req->stolen_file; - struct fuse_inode *fi = get_fuse_inode(file_inode(file)); - struct fuse_file *ff = file->private_data; - - WARN_ON(req->max_pages); - spin_lock(&fi->lock); - memset(req, 0, sizeof(*req)); - fuse_request_init(req, NULL, NULL, 0); - BUG_ON(ff->reserved_req); - ff->reserved_req = req; - wake_up_all(&fc->reserved_req_waitq); - spin_unlock(&fi->lock); - fput(file); -} - /* * Gets a requests for a file operation, always succeeds * @@ -301,25 +255,18 @@ static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req) * filesystem should not have it's own file open. If deadlock is * intentional, it can still be broken by "aborting" the filesystem. */ -struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc, - struct file *file) +struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc) { struct fuse_req *req; atomic_inc(&fc->num_waiting); - wait_event(fc->blocked_waitq, fc->initialized); - /* Matches smp_wmb() in fuse_set_initialized() */ - smp_rmb(); - req = fuse_request_alloc(0); - if (!req) - req = get_reserved_req(fc, file); + req = __fuse_request_alloc(0, GFP_KERNEL | __GFP_NOFAIL); req->in.h.uid = from_kuid_munged(fc->user_ns, current_fsuid()); req->in.h.gid = from_kgid_munged(fc->user_ns, current_fsgid()); req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns); __set_bit(FR_WAITING, &req->flags); - __clear_bit(FR_BACKGROUND, &req->flags); return req; } @@ -342,10 +289,7 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) fuse_drop_waiting(fc); } - if (req->stolen_file) - put_reserved_req(fc, req); - else - fuse_request_free(req); + fuse_request_free(req); } } EXPORT_SYMBOL_GPL(fuse_put_request); @@ -719,7 +663,7 @@ void fuse_force_forget(struct file *file, u64 nodeid) memset(&inarg, 0, sizeof(inarg)); inarg.nlookup = 1; - req = fuse_get_req_nofail_nopages(fc, file); + req = fuse_get_req_nofail_nopages(fc); req->in.h.opcode = FUSE_FORGET; req->in.h.nodeid = nodeid; req->in.numargs = 1; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 1f90722f0ee8..7d12c1d27132 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -432,7 +432,7 @@ static int fuse_flush(struct file *file, fl_owner_t id) if (err) return err; - req = fuse_get_req_nofail_nopages(fc, file); + req = fuse_get_req_nofail_nopages(fc); memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; inarg.lock_owner = fuse_lock_owner_id(fc, id); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index a89362ee46d9..dd199391d6b9 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -435,8 +435,6 @@ struct fuse_req { /** Request completion callback */ void (*end)(struct fuse_conn *, struct fuse_req *); - /** Request is stolen from fuse_file->reserved_req */ - struct file *stolen_file; }; struct fuse_iqueue { @@ -580,9 +578,6 @@ struct fuse_conn { /** waitq for blocked connection */ wait_queue_head_t blocked_waitq; - /** waitq for reserved requests */ - wait_queue_head_t reserved_req_waitq; - /** Connection established, cleared on umount, connection abort and device release */ unsigned connected; @@ -927,8 +922,7 @@ void __fuse_get_request(struct fuse_req *req); /** * Gets a requests for a file operation, always succeeds */ -struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc, - struct file *file); +struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc); /** * Decrement reference count of a request. If count goes to zero free diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 4eaea0b29965..2b9cc19fedcb 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -617,7 +617,6 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns) refcount_set(&fc->count, 1); atomic_set(&fc->dev_count, 1); init_waitqueue_head(&fc->blocked_waitq); - init_waitqueue_head(&fc->reserved_req_waitq); fuse_iqueue_init(&fc->iq); INIT_LIST_HEAD(&fc->bg_queue); INIT_LIST_HEAD(&fc->entry); From c500ebaa908dbf6b3c562778a25d7e945b04f40f Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:08 +0200 Subject: [PATCH 301/721] fuse: convert flush to simple api Add 'force' to fuse_args and use fuse_get_req_nofail_nopages() to allocate the request in that case. Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 13 +++++++++---- fs/fuse/file.c | 20 +++++++++----------- fs/fuse/fuse_i.h | 6 +----- 3 files changed, 19 insertions(+), 20 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 7833aee97565..a7be13b5d6ef 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -255,7 +255,7 @@ EXPORT_SYMBOL_GPL(fuse_get_req_for_background); * filesystem should not have it's own file open. If deadlock is * intentional, it can still be broken by "aborting" the filesystem. */ -struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc) +static struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc) { struct fuse_req *req; @@ -570,9 +570,14 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) struct fuse_req *req; ssize_t ret; - req = fuse_get_req(fc, 0); - if (IS_ERR(req)) - return PTR_ERR(req); + if (args->force) { + req = fuse_get_req_nofail_nopages(fc); + __set_bit(FR_FORCE, &req->flags); + } else { + req = fuse_get_req(fc, 0); + if (IS_ERR(req)) + return PTR_ERR(req); + } /* Needs to be done after fuse_get_req() so that fc->minor is valid */ fuse_adjust_compat(fc, args); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 7d12c1d27132..f404e15357a6 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -410,8 +410,8 @@ static int fuse_flush(struct file *file, fl_owner_t id) struct inode *inode = file_inode(file); struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_file *ff = file->private_data; - struct fuse_req *req; struct fuse_flush_in inarg; + FUSE_ARGS(args); int err; if (is_bad_inode(inode)) @@ -432,19 +432,17 @@ static int fuse_flush(struct file *file, fl_owner_t id) if (err) return err; - req = fuse_get_req_nofail_nopages(fc); memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; inarg.lock_owner = fuse_lock_owner_id(fc, id); - req->in.h.opcode = FUSE_FLUSH; - req->in.h.nodeid = get_node_id(inode); - req->in.numargs = 1; - req->in.args[0].size = sizeof(inarg); - req->in.args[0].value = &inarg; - __set_bit(FR_FORCE, &req->flags); - fuse_request_send(fc, req); - err = req->out.h.error; - fuse_put_request(fc, req); + args.opcode = FUSE_FLUSH; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.force = true; + + err = fuse_simple_request(fc, &args); if (err == -ENOSYS) { fc->no_flush = 1; err = 0; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index dd199391d6b9..24269f470c0e 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -291,6 +291,7 @@ struct fuse_args { uint32_t opcode; unsigned short in_numargs; unsigned short out_numargs; + bool force:1; bool out_argvar:1; struct fuse_in_arg in_args[3]; struct fuse_arg out_args[2]; @@ -919,11 +920,6 @@ struct fuse_req *fuse_get_req_for_background(struct fuse_conn *fc, */ void __fuse_get_request(struct fuse_req *req); -/** - * Gets a requests for a file operation, always succeeds - */ -struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc); - /** * Decrement reference count of a request. If count goes to zero free * the request. From 454a7613f54e64a36b257812ae879ef8567c675e Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:08 +0200 Subject: [PATCH 302/721] fuse: add noreply to fuse_args This will be used by fuse_force_forget(). We can expand fuse_request_send() into fuse_simple_request(). The FR_WAITING bit has already been set, no need to check. Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 4 +++- fs/fuse/fuse_i.h | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index a7be13b5d6ef..85ed1abb9235 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -591,7 +591,9 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) req->out.numargs = args->out_numargs; memcpy(req->out.args, args->out_args, args->out_numargs * sizeof(struct fuse_arg)); - fuse_request_send(fc, req); + if (!args->noreply) + __set_bit(FR_ISREPLY, &req->flags); + __fuse_request_send(fc, req); ret = req->out.h.error; if (!ret && args->out_argvar) { BUG_ON(args->out_numargs != 1); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 24269f470c0e..ed6538e01feb 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -292,6 +292,7 @@ struct fuse_args { unsigned short in_numargs; unsigned short out_numargs; bool force:1; + bool noreply:1; bool out_argvar:1; struct fuse_in_arg in_args[3]; struct fuse_arg out_args[2]; From 3545fe21128262d425dd476b6d31f9a9e4d1b7a7 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:08 +0200 Subject: [PATCH 303/721] fuse: convert fuse_force_forget() to simple api Move this function to the readdir.c where its only caller resides. Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 21 --------------------- fs/fuse/fuse_i.h | 3 --- fs/fuse/readdir.c | 21 +++++++++++++++++++++ 3 files changed, 21 insertions(+), 24 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 85ed1abb9235..19114b23e95f 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -661,27 +661,6 @@ static int fuse_request_send_notify_reply(struct fuse_conn *fc, return err; } -void fuse_force_forget(struct file *file, u64 nodeid) -{ - struct inode *inode = file_inode(file); - struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req; - struct fuse_forget_in inarg; - - memset(&inarg, 0, sizeof(inarg)); - inarg.nlookup = 1; - req = fuse_get_req_nofail_nopages(fc); - req->in.h.opcode = FUSE_FORGET; - req->in.h.nodeid = nodeid; - req->in.numargs = 1; - req->in.args[0].size = sizeof(inarg); - req->in.args[0].value = &inarg; - __clear_bit(FR_ISREPLY, &req->flags); - __fuse_request_send(fc, req); - /* ignore errors */ - fuse_put_request(fc, req); -} - /* * Lock the request. Up to the next unlock_request() there mustn't be * anything that could cause a page-fault. If the request was already diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index ed6538e01feb..e6a8e47d0e0b 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -813,9 +813,6 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, struct fuse_forget_link *fuse_alloc_forget(void); -/* Used by READDIRPLUS */ -void fuse_force_forget(struct file *file, u64 nodeid); - /** * Initialize READ or READDIR request */ diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index 574d03f8a573..56a19faa855d 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -249,6 +249,27 @@ static int fuse_direntplus_link(struct file *file, return 0; } +static void fuse_force_forget(struct file *file, u64 nodeid) +{ + struct inode *inode = file_inode(file); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_forget_in inarg; + FUSE_ARGS(args); + + memset(&inarg, 0, sizeof(inarg)); + inarg.nlookup = 1; + args.opcode = FUSE_FORGET; + args.nodeid = nodeid; + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.force = true; + args.noreply = true; + + fuse_simple_request(fc, &args); + /* ignore errors */ +} + static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, struct dir_context *ctx, u64 attr_version) { From e413754b267e0e47ed38b3eacfa7178f21aca883 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:08 +0200 Subject: [PATCH 304/721] fuse: add nocreds to fuse_args In some cases it makes no sense to set pid/uid/gid fields in the request header. Allow fuse_simple_background() to omit these. This is only required in the "force" case, so for now just WARN if set otherwise. Fold fuse_get_req_nofail_nopages() into its only caller. Comment is obsolete anyway. Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 44 +++++++++++++++----------------------------- fs/fuse/fuse_i.h | 1 + 2 files changed, 16 insertions(+), 29 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 19114b23e95f..9f1549166a5d 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -242,34 +242,6 @@ struct fuse_req *fuse_get_req_for_background(struct fuse_conn *fc, } EXPORT_SYMBOL_GPL(fuse_get_req_for_background); -/* - * Gets a requests for a file operation, always succeeds - * - * This is used for sending the FLUSH request, which must get to - * userspace, due to POSIX locks which may need to be unlocked. - * - * If allocation fails due to OOM, use the reserved request in - * fuse_file. - * - * This is very unlikely to deadlock accidentally, since the - * filesystem should not have it's own file open. If deadlock is - * intentional, it can still be broken by "aborting" the filesystem. - */ -static struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc) -{ - struct fuse_req *req; - - atomic_inc(&fc->num_waiting); - req = __fuse_request_alloc(0, GFP_KERNEL | __GFP_NOFAIL); - - req->in.h.uid = from_kuid_munged(fc->user_ns, current_fsuid()); - req->in.h.gid = from_kgid_munged(fc->user_ns, current_fsgid()); - req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns); - - __set_bit(FR_WAITING, &req->flags); - return req; -} - void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) { if (refcount_dec_and_test(&req->count)) { @@ -565,15 +537,29 @@ static void fuse_adjust_compat(struct fuse_conn *fc, struct fuse_args *args) } } +static void fuse_force_creds(struct fuse_conn *fc, struct fuse_req *req) +{ + req->in.h.uid = from_kuid_munged(fc->user_ns, current_fsuid()); + req->in.h.gid = from_kgid_munged(fc->user_ns, current_fsgid()); + req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns); +} + ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) { struct fuse_req *req; ssize_t ret; if (args->force) { - req = fuse_get_req_nofail_nopages(fc); + atomic_inc(&fc->num_waiting); + req = __fuse_request_alloc(0, GFP_KERNEL | __GFP_NOFAIL); + + if (!args->nocreds) + fuse_force_creds(fc, req); + + __set_bit(FR_WAITING, &req->flags); __set_bit(FR_FORCE, &req->flags); } else { + WARN_ON(args->nocreds); req = fuse_get_req(fc, 0); if (IS_ERR(req)) return PTR_ERR(req); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index e6a8e47d0e0b..43ae5b7f4dd4 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -293,6 +293,7 @@ struct fuse_args { unsigned short out_numargs; bool force:1; bool noreply:1; + bool nocreds:1; bool out_argvar:1; struct fuse_in_arg in_args[3]; struct fuse_arg out_args[2]; From 1ccd1ea24962276ca0548386889ef7bf57479c5d Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:09 +0200 Subject: [PATCH 305/721] fuse: convert destroy to simple api We can use the "force" flag to make sure the DESTROY request is always sent to userspace. So no need to keep it allocated during the lifetime of the filesystem. Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 2 +- fs/fuse/fuse_i.h | 6 +++--- fs/fuse/inode.c | 28 ++++++++++------------------ 3 files changed, 14 insertions(+), 22 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index f404e15357a6..53f2cc6970f1 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -279,7 +279,7 @@ void fuse_release_common(struct file *file, bool isdir) * synchronous RELEASE is allowed (and desirable) in this case * because the server can be trusted not to screw up. */ - fuse_file_put(ff, ff->fc->destroy_req != NULL, isdir); + fuse_file_put(ff, ff->fc->destroy, isdir); } static int fuse_open(struct inode *inode, struct file *file) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 43ae5b7f4dd4..73f70f3872e7 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -715,6 +715,9 @@ struct fuse_conn { /** Does the filesystem support copy_file_range? */ unsigned no_copy_file_range:1; + /* Send DESTROY request */ + unsigned int destroy:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; @@ -736,9 +739,6 @@ struct fuse_conn { /** Key for lock owner ID scrambling */ u32 scramble_key[4]; - /** Reserved request for the DESTROY message */ - struct fuse_req *destroy_req; - /** Version counter for attribute changes */ atomic64_t attr_version; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 2b9cc19fedcb..127984642a71 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -386,14 +386,13 @@ static void fuse_umount_begin(struct super_block *sb) static void fuse_send_destroy(struct fuse_conn *fc) { - struct fuse_req *req = fc->destroy_req; - if (req && fc->conn_init) { - fc->destroy_req = NULL; - req->in.h.opcode = FUSE_DESTROY; - __set_bit(FR_FORCE, &req->flags); - __clear_bit(FR_BACKGROUND, &req->flags); - fuse_request_send(fc, req); - fuse_put_request(fc, req); + if (fc->conn_init) { + FUSE_ARGS(args); + + args.opcode = FUSE_DESTROY; + args.force = true; + args.nocreds = true; + fuse_simple_request(fc, &args); } } @@ -640,8 +639,6 @@ EXPORT_SYMBOL_GPL(fuse_conn_init); void fuse_conn_put(struct fuse_conn *fc) { if (refcount_dec_and_test(&fc->count)) { - if (fc->destroy_req) - fuse_request_free(fc->destroy_req); put_pid_ns(fc->pid_ns); put_user_ns(fc->user_ns); fc->release(fc); @@ -1171,6 +1168,7 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) fc->user_id = ctx->user_id; fc->group_id = ctx->group_id; fc->max_read = max_t(unsigned, 4096, ctx->max_read); + fc->destroy = is_bdev; /* Used by get_root_inode() */ sb->s_fs_info = fc; @@ -1189,12 +1187,6 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) goto err_put_root; __set_bit(FR_BACKGROUND, &init_req->flags); - if (is_bdev) { - fc->destroy_req = fuse_request_alloc(0); - if (!fc->destroy_req) - goto err_free_init_req; - } - mutex_lock(&fuse_mutex); err = -EINVAL; if (file->private_data) @@ -1221,7 +1213,6 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) err_unlock: mutex_unlock(&fuse_mutex); - err_free_init_req: fuse_request_free(init_req); err_put_root: dput(root_dentry); @@ -1287,7 +1278,8 @@ static void fuse_sb_destroy(struct super_block *sb) struct fuse_conn *fc = get_fuse_conn_super(sb); if (fc) { - fuse_send_destroy(fc); + if (fc->destroy) + fuse_send_destroy(fc); fuse_abort_conn(fc); fuse_wait_aborted(fc); From 68583165f962793a6fe7d11f54713045f43fc8bb Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:09 +0200 Subject: [PATCH 306/721] fuse: add pages to fuse_args Derive fuse_args_pages from fuse_args. This is used to handle requests which use pages for input or output. The related flags are added to fuse_args. New FR_ALLOC_PAGES flags is added to indicate whether the page arrays in fuse_req need to be freed by fuse_put_request() or not. Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 41 +++++++++++++++++++++++++++++++---------- fs/fuse/fuse_i.h | 12 ++++++++++++ 2 files changed, 43 insertions(+), 10 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 9f1549166a5d..3ef85c957122 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -81,6 +81,7 @@ static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags) kmem_cache_free(fuse_req_cachep, req); return NULL; } + __set_bit(FR_ALLOC_PAGES, &req->flags); } else if (npages) { pages = req->inline_pages; page_descs = req->inline_page_descs; @@ -104,7 +105,7 @@ struct fuse_req *fuse_request_alloc_nofs(unsigned npages) static void fuse_req_pages_free(struct fuse_req *req) { - if (req->pages != req->inline_pages) + if (test_bit(FR_ALLOC_PAGES, &req->flags)) kfree(req->pages); } @@ -127,6 +128,7 @@ bool fuse_req_realloc_pages(struct fuse_conn *fc, struct fuse_req *req, memcpy(page_descs, req->page_descs, sizeof(struct fuse_page_desc) * req->max_pages); fuse_req_pages_free(req); + __set_bit(FR_ALLOC_PAGES, &req->flags); req->pages = pages; req->page_descs = page_descs; req->max_pages = npages; @@ -544,6 +546,33 @@ static void fuse_force_creds(struct fuse_conn *fc, struct fuse_req *req) req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns); } +void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args) +{ + struct fuse_args_pages *ap = container_of(args, typeof(*ap), args); + + req->in.h.opcode = args->opcode; + req->in.h.nodeid = args->nodeid; + req->in.numargs = args->in_numargs; + memcpy(req->in.args, args->in_args, + args->in_numargs * sizeof(struct fuse_in_arg)); + req->out.argvar = args->out_argvar; + req->out.numargs = args->out_numargs; + memcpy(req->out.args, args->out_args, + args->out_numargs * sizeof(struct fuse_arg)); + + if (args->in_pages || args->out_pages) { + req->in.argpages = args->in_pages; + req->out.argpages = args->out_pages; + req->out.page_zeroing = args->page_zeroing; + req->out.page_replace = args->page_replace; + + req->pages = ap->pages; + req->page_descs = ap->descs; + req->num_pages = ap->num_pages; + } + +} + ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) { struct fuse_req *req; @@ -567,16 +596,8 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) /* Needs to be done after fuse_get_req() so that fc->minor is valid */ fuse_adjust_compat(fc, args); + fuse_args_to_req(req, args); - req->in.h.opcode = args->opcode; - req->in.h.nodeid = args->nodeid; - req->in.numargs = args->in_numargs; - memcpy(req->in.args, args->in_args, - args->in_numargs * sizeof(struct fuse_in_arg)); - req->out.argvar = args->out_argvar; - req->out.numargs = args->out_numargs; - memcpy(req->out.args, args->out_args, - args->out_numargs * sizeof(struct fuse_arg)); if (!args->noreply) __set_bit(FR_ISREPLY, &req->flags); __fuse_request_send(fc, req); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 73f70f3872e7..1998d6ab4025 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -294,11 +294,22 @@ struct fuse_args { bool force:1; bool noreply:1; bool nocreds:1; + bool in_pages:1; + bool out_pages:1; bool out_argvar:1; + bool page_zeroing:1; + bool page_replace:1; struct fuse_in_arg in_args[3]; struct fuse_arg out_args[2]; }; +struct fuse_args_pages { + struct fuse_args args; + struct page **pages; + struct fuse_page_desc *descs; + unsigned int num_pages; +}; + #define FUSE_ARGS(args) struct fuse_args args = {} /** The request IO state (for asynchronous processing) */ @@ -352,6 +363,7 @@ enum fuse_req_flag { FR_SENT, FR_FINISHED, FR_PRIVATE, + FR_ALLOC_PAGES, }; /** From 4c29afece8729c6f6c1dd4865b6b7c972b7b3bbd Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:09 +0200 Subject: [PATCH 307/721] fuse: convert readlink to simple api Also turn BUG_ON into gracefully recovered WARN_ON. Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 54 +++++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 28 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 30a96098e64a..11334ee01dc9 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1152,38 +1152,36 @@ static int fuse_permission(struct inode *inode, int mask) static int fuse_readlink_page(struct inode *inode, struct page *page) { struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req; - int err; + struct fuse_page_desc desc = { .length = PAGE_SIZE - 1 }; + struct fuse_args_pages ap = { + .num_pages = 1, + .pages = &page, + .descs = &desc, + }; + char *link; + ssize_t res; - req = fuse_get_req(fc, 1); - if (IS_ERR(req)) - return PTR_ERR(req); + ap.args.opcode = FUSE_READLINK; + ap.args.nodeid = get_node_id(inode); + ap.args.out_pages = true; + ap.args.out_argvar = true; + ap.args.page_zeroing = true; + ap.args.out_numargs = 1; + ap.args.out_args[0].size = desc.length; + res = fuse_simple_request(fc, &ap.args); - req->out.page_zeroing = 1; - req->out.argpages = 1; - req->num_pages = 1; - req->pages[0] = page; - req->page_descs[0].length = PAGE_SIZE - 1; - req->in.h.opcode = FUSE_READLINK; - req->in.h.nodeid = get_node_id(inode); - req->out.argvar = 1; - req->out.numargs = 1; - req->out.args[0].size = PAGE_SIZE - 1; - fuse_request_send(fc, req); - err = req->out.h.error; - - if (!err) { - char *link = page_address(page); - size_t len = req->out.args[0].size; - - BUG_ON(len >= PAGE_SIZE); - link[len] = '\0'; - } - - fuse_put_request(fc, req); fuse_invalidate_atime(inode); - return err; + if (res < 0) + return res; + + if (WARN_ON(res >= PAGE_SIZE)) + return -EIO; + + link = page_address(page); + link[res] = '\0'; + + return 0; } static const char *fuse_get_link(struct dentry *dentry, struct inode *inode, From 4c4f03f78ca9ce41a158710b87ad7e6d363e881a Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:09 +0200 Subject: [PATCH 308/721] fuse: move page alloc fuse_req_pages_alloc() is moved to file.c, since its internal use by the device code will eventually be removed. Rename to fuse_pages_alloc() to signify that it's not only usable for fuse_req page array. Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 17 ++--------------- fs/fuse/file.c | 12 ++++++++++++ fs/fuse/fuse_i.h | 2 ++ 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 3ef85c957122..0a6624aeced9 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -54,18 +54,6 @@ static void fuse_request_init(struct fuse_req *req, struct page **pages, __set_bit(FR_PENDING, &req->flags); } -static struct page **fuse_req_pages_alloc(unsigned int npages, gfp_t flags, - struct fuse_page_desc **desc) -{ - struct page **pages; - - pages = kzalloc(npages * (sizeof(struct page *) + - sizeof(struct fuse_page_desc)), flags); - *desc = (void *) pages + npages * sizeof(struct page *); - - return pages; -} - static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags) { struct fuse_req *req = kmem_cache_zalloc(fuse_req_cachep, flags); @@ -75,8 +63,7 @@ static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags) WARN_ON(npages > FUSE_MAX_MAX_PAGES); if (npages > FUSE_REQ_INLINE_PAGES) { - pages = fuse_req_pages_alloc(npages, flags, - &page_descs); + pages = fuse_pages_alloc(npages, flags, &page_descs); if (!pages) { kmem_cache_free(fuse_req_cachep, req); return NULL; @@ -120,7 +107,7 @@ bool fuse_req_realloc_pages(struct fuse_conn *fc, struct fuse_req *req, fc->max_pages); WARN_ON(npages <= req->max_pages); - pages = fuse_req_pages_alloc(npages, flags, &page_descs); + pages = fuse_pages_alloc(npages, flags, &page_descs); if (!pages) return false; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 53f2cc6970f1..c3e95002f489 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -19,6 +19,18 @@ #include #include +struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags, + struct fuse_page_desc **desc) +{ + struct page **pages; + + pages = kzalloc(npages * (sizeof(struct page *) + + sizeof(struct fuse_page_desc)), flags); + *desc = (void *) (pages + npages); + + return pages; +} + static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file, int opcode, struct fuse_open_out *outargp) { diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 1998d6ab4025..b62a3e37ea4c 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -909,6 +909,8 @@ struct fuse_req *fuse_request_alloc(unsigned npages); struct fuse_req *fuse_request_alloc_nofs(unsigned npages); +struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags, + struct fuse_page_desc **desc); bool fuse_req_realloc_pages(struct fuse_conn *fc, struct fuse_req *req, gfp_t flags); From 093f38a2c1a81405f261ef1ac681a9003b061db5 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:09 +0200 Subject: [PATCH 309/721] fuse: convert ioctl to simple api fuse_simple_request() is converted to return length of last (instead of single) out arg, since FUSE_IOCTL_OUT has two out args, the second of which is variable length. Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 4 +-- fs/fuse/file.c | 97 ++++++++++++++++++++++---------------------------- 2 files changed, 45 insertions(+), 56 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 0a6624aeced9..29723e143666 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -590,8 +590,8 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) __fuse_request_send(fc, req); ret = req->out.h.error; if (!ret && args->out_argvar) { - BUG_ON(args->out_numargs != 1); - ret = req->out.args[0].size; + BUG_ON(args->out_numargs == 0); + ret = req->out.args[args->out_numargs - 1].size; } fuse_put_request(fc, req); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index c3e95002f489..1847cc53c416 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1267,14 +1267,14 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) return written ? written : err; } -static inline void fuse_page_descs_length_init(struct fuse_req *req, - unsigned index, unsigned nr_pages) +static inline void fuse_page_descs_length_init(struct fuse_page_desc *descs, + unsigned int index, + unsigned int nr_pages) { int i; for (i = index; i < index + nr_pages; i++) - req->page_descs[i].length = PAGE_SIZE - - req->page_descs[i].offset; + descs[i].length = PAGE_SIZE - descs[i].offset; } static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii) @@ -1326,7 +1326,8 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii, npages = (ret + PAGE_SIZE - 1) / PAGE_SIZE; req->page_descs[req->num_pages].offset = start; - fuse_page_descs_length_init(req, req->num_pages, npages); + fuse_page_descs_length_init(req->page_descs, req->num_pages, + npages); req->num_pages += npages; req->page_descs[req->num_pages - 1].length -= @@ -2582,14 +2583,14 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, .flags = flags }; struct fuse_ioctl_out outarg; - struct fuse_req *req = NULL; - struct page **pages = NULL; struct iovec *iov_page = NULL; struct iovec *in_iov = NULL, *out_iov = NULL; - unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages; - size_t in_size, out_size, transferred, c; + unsigned int in_iovs = 0, out_iovs = 0, max_pages; + size_t in_size, out_size, c; + ssize_t transferred; int err, i; struct iov_iter ii; + struct fuse_args_pages ap = {}; #if BITS_PER_LONG == 32 inarg.flags |= FUSE_IOCTL_32BIT; @@ -2607,11 +2608,13 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); err = -ENOMEM; - pages = kcalloc(fc->max_pages, sizeof(pages[0]), GFP_KERNEL); + ap.pages = fuse_pages_alloc(fc->max_pages, GFP_KERNEL, &ap.descs); iov_page = (struct iovec *) __get_free_page(GFP_KERNEL); - if (!pages || !iov_page) + if (!ap.pages || !iov_page) goto out; + fuse_page_descs_length_init(ap.descs, 0, fc->max_pages); + /* * If restricted, initialize IO parameters as encoded in @cmd. * RETRY from server is not allowed. @@ -2648,56 +2651,44 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, err = -ENOMEM; if (max_pages > fc->max_pages) goto out; - while (num_pages < max_pages) { - pages[num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); - if (!pages[num_pages]) + while (ap.num_pages < max_pages) { + ap.pages[ap.num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); + if (!ap.pages[ap.num_pages]) goto out; - num_pages++; + ap.num_pages++; } - req = fuse_get_req(fc, num_pages); - if (IS_ERR(req)) { - err = PTR_ERR(req); - req = NULL; - goto out; - } - memcpy(req->pages, pages, sizeof(req->pages[0]) * num_pages); - req->num_pages = num_pages; - fuse_page_descs_length_init(req, 0, req->num_pages); /* okay, let's send it to the client */ - req->in.h.opcode = FUSE_IOCTL; - req->in.h.nodeid = ff->nodeid; - req->in.numargs = 1; - req->in.args[0].size = sizeof(inarg); - req->in.args[0].value = &inarg; + ap.args.opcode = FUSE_IOCTL; + ap.args.nodeid = ff->nodeid; + ap.args.in_numargs = 1; + ap.args.in_args[0].size = sizeof(inarg); + ap.args.in_args[0].value = &inarg; if (in_size) { - req->in.numargs++; - req->in.args[1].size = in_size; - req->in.argpages = 1; + ap.args.in_numargs++; + ap.args.in_args[1].size = in_size; + ap.args.in_pages = true; err = -EFAULT; iov_iter_init(&ii, WRITE, in_iov, in_iovs, in_size); - for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= num_pages); i++) { - c = copy_page_from_iter(pages[i], 0, PAGE_SIZE, &ii); + for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) { + c = copy_page_from_iter(ap.pages[i], 0, PAGE_SIZE, &ii); if (c != PAGE_SIZE && iov_iter_count(&ii)) goto out; } } - req->out.numargs = 2; - req->out.args[0].size = sizeof(outarg); - req->out.args[0].value = &outarg; - req->out.args[1].size = out_size; - req->out.argpages = 1; - req->out.argvar = 1; + ap.args.out_numargs = 2; + ap.args.out_args[0].size = sizeof(outarg); + ap.args.out_args[0].value = &outarg; + ap.args.out_args[1].size = out_size; + ap.args.out_pages = true; + ap.args.out_argvar = true; - fuse_request_send(fc, req); - err = req->out.h.error; - transferred = req->out.args[1].size; - fuse_put_request(fc, req); - req = NULL; - if (err) + transferred = fuse_simple_request(fc, &ap.args); + err = transferred; + if (transferred < 0) goto out; /* did it ask for retry? */ @@ -2722,7 +2713,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV) goto out; - vaddr = kmap_atomic(pages[0]); + vaddr = kmap_atomic(ap.pages[0]); err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr, transferred, in_iovs + out_iovs, (flags & FUSE_IOCTL_COMPAT) != 0); @@ -2750,19 +2741,17 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, err = -EFAULT; iov_iter_init(&ii, READ, out_iov, out_iovs, transferred); - for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= num_pages); i++) { - c = copy_page_to_iter(pages[i], 0, PAGE_SIZE, &ii); + for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) { + c = copy_page_to_iter(ap.pages[i], 0, PAGE_SIZE, &ii); if (c != PAGE_SIZE && iov_iter_count(&ii)) goto out; } err = 0; out: - if (req) - fuse_put_request(fc, req); free_page((unsigned long) iov_page); - while (num_pages) - __free_page(pages[--num_pages]); - kfree(pages); + while (ap.num_pages) + __free_page(ap.pages[--ap.num_pages]); + kfree(ap.pages); return err ? err : outarg.result; } From a0d45d84f4c9d119e0eaede4778211a789457a7a Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:09 +0200 Subject: [PATCH 310/721] fuse: fuse_short_read(): don't take fuse_req as argument This will allow the use of this function when converting to the simple api (which doesn't use fuse_req). Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 1847cc53c416..8e67add0f37f 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -705,10 +705,9 @@ static void fuse_read_update_size(struct inode *inode, loff_t size, spin_unlock(&fi->lock); } -static void fuse_short_read(struct fuse_req *req, struct inode *inode, - u64 attr_ver) +static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read, + struct page **pages, unsigned int num_pages) { - size_t num_read = req->out.args[0].size; struct fuse_conn *fc = get_fuse_conn(inode); if (fc->writeback_cache) { @@ -721,12 +720,12 @@ static void fuse_short_read(struct fuse_req *req, struct inode *inode, int start_idx = num_read >> PAGE_SHIFT; size_t off = num_read & (PAGE_SIZE - 1); - for (i = start_idx; i < req->num_pages; i++) { - zero_user_segment(req->pages[i], off, PAGE_SIZE); + for (i = start_idx; i < num_pages; i++) { + zero_user_segment(pages[i], off, PAGE_SIZE); off = 0; } } else { - loff_t pos = page_offset(req->pages[0]) + num_read; + loff_t pos = page_offset(pages[0]) + num_read; fuse_read_update_size(inode, pos, attr_ver); } } @@ -772,7 +771,8 @@ static int fuse_do_readpage(struct file *file, struct page *page) * Short read means EOF. If file size is larger, truncate it */ if (num_read < count) - fuse_short_read(req, inode, attr_ver); + fuse_short_read(inode, attr_ver, num_read, req->pages, + req->num_pages); SetPageUptodate(page); } @@ -815,7 +815,8 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req) * Short read means EOF. If file size is larger, truncate it */ if (!req->out.h.error && num_read < count) - fuse_short_read(req, inode, req->misc.read.attr_ver); + fuse_short_read(inode, req->misc.read.attr_ver, + num_read, req->pages, req->num_pages); fuse_invalidate_atime(inode); } From 00793ca5d4439a22b18c0939522c81fa8661a362 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:09 +0200 Subject: [PATCH 311/721] fuse: covert readpage to simple api Derive fuse_io_args from struct fuse_args_pages. This will be used for both synchronous and asynchronous read/write requests. Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 80 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 48 insertions(+), 32 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 8e67add0f37f..d927c336683a 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -552,6 +552,33 @@ void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos, req->out.args[0].size = count; } +struct fuse_io_args { + struct { + struct fuse_read_in in; + } read; + struct fuse_args_pages ap; +}; + +void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, + size_t count, int opcode) +{ + struct fuse_file *ff = file->private_data; + struct fuse_args *args = &ia->ap.args; + + ia->read.in.fh = ff->fh; + ia->read.in.offset = pos; + ia->read.in.size = count; + ia->read.in.flags = file->f_flags; + args->opcode = opcode; + args->nodeid = ff->nodeid; + args->in_numargs = 1; + args->in_args[0].size = sizeof(ia->read.in); + args->in_args[0].value = &ia->read.in; + args->out_argvar = true; + args->out_numargs = 1; + args->out_args[0].size = count; +} + static void fuse_release_user_pages(struct fuse_req *req, bool should_dirty) { unsigned i; @@ -732,16 +759,19 @@ static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read, static int fuse_do_readpage(struct file *file, struct page *page) { - struct kiocb iocb; - struct fuse_io_priv io; struct inode *inode = page->mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req; - size_t num_read; loff_t pos = page_offset(page); - size_t count = PAGE_SIZE; + struct fuse_page_desc desc = { .length = PAGE_SIZE }; + struct fuse_io_args ia = { + .ap.args.page_zeroing = true, + .ap.args.out_pages = true, + .ap.num_pages = 1, + .ap.pages = &page, + .ap.descs = &desc, + }; + ssize_t res; u64 attr_ver; - int err; /* * Page writeback can extend beyond the lifetime of the @@ -750,36 +780,22 @@ static int fuse_do_readpage(struct file *file, struct page *page) */ fuse_wait_on_page_writeback(inode, page->index); - req = fuse_get_req(fc, 1); - if (IS_ERR(req)) - return PTR_ERR(req); - attr_ver = fuse_get_attr_version(fc); - req->out.page_zeroing = 1; - req->out.argpages = 1; - req->num_pages = 1; - req->pages[0] = page; - req->page_descs[0].length = count; - init_sync_kiocb(&iocb, file); - io = (struct fuse_io_priv) FUSE_IO_PRIV_SYNC(&iocb); - num_read = fuse_send_read(req, &io, pos, count, NULL); - err = req->out.h.error; + fuse_read_args_fill(&ia, file, pos, desc.length, FUSE_READ); + res = fuse_simple_request(fc, &ia.ap.args); + if (res < 0) + return res; + /* + * Short read means EOF. If file size is larger, truncate it + */ + if (res < desc.length) + fuse_short_read(inode, attr_ver, res, ia.ap.pages, + ia.ap.num_pages); - if (!err) { - /* - * Short read means EOF. If file size is larger, truncate it - */ - if (num_read < count) - fuse_short_read(inode, attr_ver, num_read, req->pages, - req->num_pages); + SetPageUptodate(page); - SetPageUptodate(page); - } - - fuse_put_request(fc, req); - - return err; + return 0; } static int fuse_readpage(struct file *file, struct page *page) From 338f2e3f3341a9a844331b1bcb159e68638d5eef Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:09 +0200 Subject: [PATCH 312/721] fuse: convert sync write to simple api Extract a fuse_write_flags() helper that converts ki_flags relevant write to open flags. The other parts of fuse_send_write() aren't used in the fuse_perform_write() case. Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 130 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 86 insertions(+), 44 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index d927c336683a..1dc499ad8606 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -553,9 +553,15 @@ void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos, } struct fuse_io_args { - struct { - struct fuse_read_in in; - } read; + union { + struct { + struct fuse_read_in in; + } read; + struct { + struct fuse_write_in in; + struct fuse_write_out out; + } write; + }; struct fuse_args_pages ap; }; @@ -1001,6 +1007,40 @@ static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff, req->out.args[0].value = outarg; } +static void fuse_write_args_fill(struct fuse_io_args *ia, struct fuse_file *ff, + loff_t pos, size_t count) +{ + struct fuse_args *args = &ia->ap.args; + + ia->write.in.fh = ff->fh; + ia->write.in.offset = pos; + ia->write.in.size = count; + args->opcode = FUSE_WRITE; + args->nodeid = ff->nodeid; + args->in_numargs = 2; + if (ff->fc->minor < 9) + args->in_args[0].size = FUSE_COMPAT_WRITE_IN_SIZE; + else + args->in_args[0].size = sizeof(ia->write.in); + args->in_args[0].value = &ia->write.in; + args->in_args[1].size = count; + args->out_numargs = 1; + args->out_args[0].size = sizeof(ia->write.out); + args->out_args[0].value = &ia->write.out; +} + +static unsigned int fuse_write_flags(struct kiocb *iocb) +{ + unsigned int flags = iocb->ki_filp->f_flags; + + if (iocb->ki_flags & IOCB_DSYNC) + flags |= O_DSYNC; + if (iocb->ki_flags & IOCB_SYNC) + flags |= O_SYNC; + + return flags; +} + static size_t fuse_send_write(struct fuse_req *req, struct fuse_io_priv *io, loff_t pos, size_t count, fl_owner_t owner) { @@ -1011,11 +1051,7 @@ static size_t fuse_send_write(struct fuse_req *req, struct fuse_io_priv *io, struct fuse_write_in *inarg = &req->misc.write.in; fuse_write_fill(req, ff, pos, count); - inarg->flags = file->f_flags; - if (iocb->ki_flags & IOCB_DSYNC) - inarg->flags |= O_DSYNC; - if (iocb->ki_flags & IOCB_SYNC) - inarg->flags |= O_SYNC; + inarg->flags = fuse_write_flags(iocb); if (owner != NULL) { inarg->write_flags |= FUSE_WRITE_LOCKOWNER; inarg->lock_owner = fuse_lock_owner_id(fc, owner); @@ -1045,26 +1081,31 @@ bool fuse_write_update_size(struct inode *inode, loff_t pos) return ret; } -static size_t fuse_send_write_pages(struct fuse_req *req, struct kiocb *iocb, - struct inode *inode, loff_t pos, - size_t count) +static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, + struct kiocb *iocb, struct inode *inode, + loff_t pos, size_t count) { - size_t res; - unsigned offset; - unsigned i; - struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); + struct fuse_args_pages *ap = &ia->ap; + struct file *file = iocb->ki_filp; + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = ff->fc; + unsigned int offset, i; + int err; - for (i = 0; i < req->num_pages; i++) - fuse_wait_on_page_writeback(inode, req->pages[i]->index); + for (i = 0; i < ap->num_pages; i++) + fuse_wait_on_page_writeback(inode, ap->pages[i]->index); - res = fuse_send_write(req, &io, pos, count, NULL); + fuse_write_args_fill(ia, ff, pos, count); + ia->write.in.flags = fuse_write_flags(iocb); - offset = req->page_descs[0].offset; - count = res; - for (i = 0; i < req->num_pages; i++) { - struct page *page = req->pages[i]; + err = fuse_simple_request(fc, &ap->args); - if (!req->out.h.error && !offset && count >= PAGE_SIZE) + offset = ap->descs[0].offset; + count = ia->write.out.size; + for (i = 0; i < ap->num_pages; i++) { + struct page *page = ap->pages[i]; + + if (!err && !offset && count >= PAGE_SIZE) SetPageUptodate(page); if (count > PAGE_SIZE - offset) @@ -1077,20 +1118,21 @@ static size_t fuse_send_write_pages(struct fuse_req *req, struct kiocb *iocb, put_page(page); } - return res; + return err; } -static ssize_t fuse_fill_write_pages(struct fuse_req *req, - struct address_space *mapping, - struct iov_iter *ii, loff_t pos) +static ssize_t fuse_fill_write_pages(struct fuse_args_pages *ap, + struct address_space *mapping, + struct iov_iter *ii, loff_t pos, + unsigned int max_pages) { struct fuse_conn *fc = get_fuse_conn(mapping->host); unsigned offset = pos & (PAGE_SIZE - 1); size_t count = 0; int err; - req->in.argpages = 1; - req->page_descs[0].offset = offset; + ap->args.in_pages = true; + ap->descs[0].offset = offset; do { size_t tmp; @@ -1126,9 +1168,9 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, } err = 0; - req->pages[req->num_pages] = page; - req->page_descs[req->num_pages].length = tmp; - req->num_pages++; + ap->pages[ap->num_pages] = page; + ap->descs[ap->num_pages].length = tmp; + ap->num_pages++; count += tmp; pos += tmp; @@ -1139,7 +1181,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, if (!fc->big_writes) break; } while (iov_iter_count(ii) && count < fc->max_write && - req->num_pages < req->max_pages && offset == 0); + ap->num_pages < max_pages && offset == 0); return count > 0 ? count : err; } @@ -1167,27 +1209,27 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); do { - struct fuse_req *req; ssize_t count; + struct fuse_io_args ia = {}; + struct fuse_args_pages *ap = &ia.ap; unsigned int nr_pages = fuse_wr_pages(pos, iov_iter_count(ii), fc->max_pages); - req = fuse_get_req(fc, nr_pages); - if (IS_ERR(req)) { - err = PTR_ERR(req); + ap->pages = fuse_pages_alloc(nr_pages, GFP_KERNEL, &ap->descs); + if (!ap->pages) { + err = -ENOMEM; break; } - count = fuse_fill_write_pages(req, mapping, ii, pos); + count = fuse_fill_write_pages(ap, mapping, ii, pos, nr_pages); if (count <= 0) { err = count; } else { - size_t num_written; - - num_written = fuse_send_write_pages(req, iocb, inode, - pos, count); - err = req->out.h.error; + err = fuse_send_write_pages(&ia, iocb, inode, + pos, count); if (!err) { + size_t num_written = ia.write.out.size; + res += num_written; pos += num_written; @@ -1196,7 +1238,7 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, err = -EIO; } } - fuse_put_request(fc, req); + kfree(ap->pages); } while (!err && iov_iter_count(ii)); if (res > 0) From 1259728731a7902c4965952c55fa16014b4fd0e7 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:10 +0200 Subject: [PATCH 313/721] fuse: add simple background helper Create a helper named fuse_simple_background() that is similar to fuse_simple_request(). Unlike the latter, it returns immediately and calls the supplied 'end' callback when the reply is received. The supplied 'args' pointer is stored in 'fuse_req' which allows the callback to interpret the output arguments decoded from the reply. Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ fs/fuse/fuse_i.h | 6 ++++++ 2 files changed, 51 insertions(+) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 29723e143666..a52a7380baa4 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -626,6 +626,51 @@ bool fuse_request_queue_background(struct fuse_conn *fc, struct fuse_req *req) return queued; } +static void fuse_simple_end(struct fuse_conn *fc, struct fuse_req *req) +{ + struct fuse_args *args = req->args; + int err = req->out.h.error; + + if (!err && args->out_argvar) { + BUG_ON(args->out_numargs == 0); + args->out_args[args->out_numargs - 1].size = + req->out.args[args->out_numargs - 1].size; + } + req->args->end(fc, req->args, req->out.h.error); +} + +int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args, + gfp_t gfp_flags) +{ + struct fuse_req *req; + + if (args->force) { + WARN_ON(!args->nocreds); + req = __fuse_request_alloc(0, gfp_flags); + if (!req) + return -ENOMEM; + __set_bit(FR_BACKGROUND, &req->flags); + } else { + WARN_ON(args->nocreds); + req = __fuse_get_req(fc, 0, true); + if (IS_ERR(req)) + return PTR_ERR(req); + } + + fuse_args_to_req(req, args); + + req->args = args; + req->end = fuse_simple_end; + + if (!fuse_request_queue_background(fc, req)) { + fuse_put_request(fc, req); + return -ENOTCONN; + } + + return 0; +} +EXPORT_SYMBOL_GPL(fuse_simple_background); + void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req) { WARN_ON(!req->end); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index b62a3e37ea4c..8f46c5549e57 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -301,6 +301,7 @@ struct fuse_args { bool page_replace:1; struct fuse_in_arg in_args[3]; struct fuse_arg out_args[2]; + void (*end)(struct fuse_conn *fc, struct fuse_args *args, int error); }; struct fuse_args_pages { @@ -381,6 +382,9 @@ struct fuse_req { /** Entry on the interrupts list */ struct list_head intr_entry; + /* Input/output arguments */ + struct fuse_args *args; + /** refcount */ refcount_t count; @@ -948,6 +952,8 @@ void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req); * Simple request sending that does request allocation and freeing */ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args); +int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args, + gfp_t gfp_flags); /** * Send a request in the background From 45ac96ed7c369112039289cc7c325a5682ef7c03 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:10 +0200 Subject: [PATCH 314/721] fuse: convert direct_io to simple api Change of semantics in fuse_async_req_send/fuse_send_(read|write): these can now return error, in which case the 'end' callback isn't called, so the fuse_io_args object needs to be freed. Added verification that the return value is sane (less than or equal to the requested read/write size). Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 227 ++++++++++++++++++++++++++++--------------------- 1 file changed, 128 insertions(+), 99 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 1dc499ad8606..ee0817a3d5f1 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -563,6 +563,7 @@ struct fuse_io_args { } write; }; struct fuse_args_pages ap; + struct fuse_io_priv *io; }; void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, @@ -585,15 +586,15 @@ void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, args->out_args[0].size = count; } -static void fuse_release_user_pages(struct fuse_req *req, bool should_dirty) +static void fuse_release_user_pages(struct fuse_args_pages *ap, + bool should_dirty) { - unsigned i; + unsigned int i; - for (i = 0; i < req->num_pages; i++) { - struct page *page = req->pages[i]; + for (i = 0; i < ap->num_pages; i++) { if (should_dirty) - set_page_dirty_lock(page); - put_page(page); + set_page_dirty_lock(ap->pages[i]); + put_page(ap->pages[i]); } } @@ -663,64 +664,94 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos) kref_put(&io->refcnt, fuse_io_release); } -static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_req *req) +static struct fuse_io_args *fuse_io_alloc(struct fuse_io_priv *io, + unsigned int npages) { - struct fuse_io_priv *io = req->io; - ssize_t pos = -1; + struct fuse_io_args *ia; - fuse_release_user_pages(req, io->should_dirty); - - if (io->write) { - if (req->misc.write.in.size != req->misc.write.out.size) - pos = req->misc.write.in.offset - io->offset + - req->misc.write.out.size; - } else { - if (req->misc.read.in.size != req->out.args[0].size) - pos = req->misc.read.in.offset - io->offset + - req->out.args[0].size; + ia = kzalloc(sizeof(*ia), GFP_KERNEL); + if (ia) { + ia->io = io; + ia->ap.pages = fuse_pages_alloc(npages, GFP_KERNEL, + &ia->ap.descs); + if (!ia->ap.pages) { + kfree(ia); + ia = NULL; + } } - - fuse_aio_complete(io, req->out.h.error, pos); + return ia; } -static size_t fuse_async_req_send(struct fuse_conn *fc, struct fuse_req *req, - size_t num_bytes, struct fuse_io_priv *io) +static void fuse_io_free(struct fuse_io_args *ia) { + kfree(ia->ap.pages); + kfree(ia); +} + +static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_args *args, + int err) +{ + struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args); + struct fuse_io_priv *io = ia->io; + ssize_t pos = -1; + + fuse_release_user_pages(&ia->ap, io->should_dirty); + + if (err) { + /* Nothing */ + } else if (io->write) { + if (ia->write.out.size > ia->write.in.size) { + err = -EIO; + } else if (ia->write.in.size != ia->write.out.size) { + pos = ia->write.in.offset - io->offset + + ia->write.out.size; + } + } else { + u32 outsize = args->out_args[0].size; + + if (ia->read.in.size != outsize) + pos = ia->read.in.offset - io->offset + outsize; + } + + fuse_aio_complete(io, err, pos); + fuse_io_free(ia); +} + +static ssize_t fuse_async_req_send(struct fuse_conn *fc, + struct fuse_io_args *ia, size_t num_bytes) +{ + ssize_t err; + struct fuse_io_priv *io = ia->io; + spin_lock(&io->lock); kref_get(&io->refcnt); io->size += num_bytes; io->reqs++; spin_unlock(&io->lock); - req->io = io; - req->end = fuse_aio_complete_req; + ia->ap.args.end = fuse_aio_complete_req; + err = fuse_simple_background(fc, &ia->ap.args, GFP_KERNEL); - __fuse_get_request(req); - fuse_request_send_background(fc, req); - - return num_bytes; + return err ?: num_bytes; } -static size_t fuse_send_read(struct fuse_req *req, struct fuse_io_priv *io, - loff_t pos, size_t count, fl_owner_t owner) +static ssize_t fuse_send_read(struct fuse_io_args *ia, loff_t pos, size_t count, + fl_owner_t owner) { - struct file *file = io->iocb->ki_filp; + struct file *file = ia->io->iocb->ki_filp; struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fc; - fuse_read_fill(req, file, pos, count, FUSE_READ); + fuse_read_args_fill(ia, file, pos, count, FUSE_READ); if (owner != NULL) { - struct fuse_read_in *inarg = &req->misc.read.in; - - inarg->read_flags |= FUSE_READ_LOCKOWNER; - inarg->lock_owner = fuse_lock_owner_id(fc, owner); + ia->read.in.read_flags |= FUSE_READ_LOCKOWNER; + ia->read.in.lock_owner = fuse_lock_owner_id(fc, owner); } - if (io->async) - return fuse_async_req_send(fc, req, count, io); + if (ia->io->async) + return fuse_async_req_send(fc, ia, count); - fuse_request_send(fc, req); - return req->out.args[0].size; + return fuse_simple_request(fc, &ia->ap.args); } static void fuse_read_update_size(struct inode *inode, loff_t size, @@ -1041,27 +1072,31 @@ static unsigned int fuse_write_flags(struct kiocb *iocb) return flags; } -static size_t fuse_send_write(struct fuse_req *req, struct fuse_io_priv *io, - loff_t pos, size_t count, fl_owner_t owner) +static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos, + size_t count, fl_owner_t owner) { - struct kiocb *iocb = io->iocb; + struct kiocb *iocb = ia->io->iocb; struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fc; - struct fuse_write_in *inarg = &req->misc.write.in; + struct fuse_write_in *inarg = &ia->write.in; + ssize_t err; - fuse_write_fill(req, ff, pos, count); + fuse_write_args_fill(ia, ff, pos, count); inarg->flags = fuse_write_flags(iocb); if (owner != NULL) { inarg->write_flags |= FUSE_WRITE_LOCKOWNER; inarg->lock_owner = fuse_lock_owner_id(fc, owner); } - if (io->async) - return fuse_async_req_send(fc, req, count, io); + if (ia->io->async) + return fuse_async_req_send(fc, ia, count); - fuse_request_send(fc, req); - return req->misc.write.out.size; + err = fuse_simple_request(fc, &ia->ap.args); + if (!err && ia->write.out.size > count) + err = -EIO; + + return err ?: ia->write.out.size; } bool fuse_write_update_size(struct inode *inode, loff_t pos) @@ -1347,8 +1382,9 @@ static inline size_t fuse_get_frag_size(const struct iov_iter *ii, return min(iov_iter_single_seg_count(ii), max_size); } -static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii, - size_t *nbytesp, int write) +static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, + size_t *nbytesp, int write, + unsigned int max_pages) { size_t nbytes = 0; /* # bytes already packed in req */ ssize_t ret = 0; @@ -1359,21 +1395,21 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii, size_t frag_size = fuse_get_frag_size(ii, *nbytesp); if (write) - req->in.args[1].value = (void *) user_addr; + ap->args.in_args[1].value = (void *) user_addr; else - req->out.args[0].value = (void *) user_addr; + ap->args.out_args[0].value = (void *) user_addr; iov_iter_advance(ii, frag_size); *nbytesp = frag_size; return 0; } - while (nbytes < *nbytesp && req->num_pages < req->max_pages) { + while (nbytes < *nbytesp && ap->num_pages < max_pages) { unsigned npages; size_t start; - ret = iov_iter_get_pages(ii, &req->pages[req->num_pages], + ret = iov_iter_get_pages(ii, &ap->pages[ap->num_pages], *nbytesp - nbytes, - req->max_pages - req->num_pages, + max_pages - ap->num_pages, &start); if (ret < 0) break; @@ -1384,19 +1420,18 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii, ret += start; npages = (ret + PAGE_SIZE - 1) / PAGE_SIZE; - req->page_descs[req->num_pages].offset = start; - fuse_page_descs_length_init(req->page_descs, req->num_pages, - npages); + ap->descs[ap->num_pages].offset = start; + fuse_page_descs_length_init(ap->descs, ap->num_pages, npages); - req->num_pages += npages; - req->page_descs[req->num_pages - 1].length -= + ap->num_pages += npages; + ap->descs[ap->num_pages - 1].length -= (PAGE_SIZE - ret) & (PAGE_SIZE - 1); } if (write) - req->in.argpages = 1; + ap->args.in_pages = 1; else - req->out.argpages = 1; + ap->args.out_pages = 1; *nbytesp = nbytes; @@ -1418,17 +1453,16 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, pgoff_t idx_from = pos >> PAGE_SHIFT; pgoff_t idx_to = (pos + count - 1) >> PAGE_SHIFT; ssize_t res = 0; - struct fuse_req *req; int err = 0; + struct fuse_io_args *ia; + unsigned int max_pages; - if (io->async) - req = fuse_get_req_for_background(fc, iov_iter_npages(iter, - fc->max_pages)); - else - req = fuse_get_req(fc, iov_iter_npages(iter, fc->max_pages)); - if (IS_ERR(req)) - return PTR_ERR(req); + max_pages = iov_iter_npages(iter, fc->max_pages); + ia = fuse_io_alloc(io, max_pages); + if (!ia) + return -ENOMEM; + ia->io = io; if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) { if (!write) inode_lock(inode); @@ -1439,54 +1473,49 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, io->should_dirty = !write && iter_is_iovec(iter); while (count) { - size_t nres; + ssize_t nres; fl_owner_t owner = current->files; size_t nbytes = min(count, nmax); - err = fuse_get_user_pages(req, iter, &nbytes, write); + + err = fuse_get_user_pages(&ia->ap, iter, &nbytes, write, + max_pages); if (err && !nbytes) break; if (write) { - if (!capable(CAP_FSETID)) { - struct fuse_write_in *inarg; + if (!capable(CAP_FSETID)) + ia->write.in.write_flags |= FUSE_WRITE_KILL_PRIV; - inarg = &req->misc.write.in; - inarg->write_flags |= FUSE_WRITE_KILL_PRIV; - } - nres = fuse_send_write(req, io, pos, nbytes, owner); + nres = fuse_send_write(ia, pos, nbytes, owner); } else { - nres = fuse_send_read(req, io, pos, nbytes, owner); + nres = fuse_send_read(ia, pos, nbytes, owner); } - if (!io->async) - fuse_release_user_pages(req, io->should_dirty); - if (req->out.h.error) { - err = req->out.h.error; - break; - } else if (nres > nbytes) { - res = 0; - err = -EIO; + if (!io->async || nres < 0) { + fuse_release_user_pages(&ia->ap, io->should_dirty); + fuse_io_free(ia); + } + ia = NULL; + if (nres < 0) { + err = nres; break; } + WARN_ON(nres > nbytes); + count -= nres; res += nres; pos += nres; if (nres != nbytes) break; if (count) { - fuse_put_request(fc, req); - if (io->async) - req = fuse_get_req_for_background(fc, - iov_iter_npages(iter, fc->max_pages)); - else - req = fuse_get_req(fc, iov_iter_npages(iter, - fc->max_pages)); - if (IS_ERR(req)) + max_pages = iov_iter_npages(iter, fc->max_pages); + ia = fuse_io_alloc(io, max_pages); + if (!ia) break; } } - if (!IS_ERR(req)) - fuse_put_request(fc, req); + if (ia) + fuse_io_free(ia); if (res > 0) *ppos = pos; From 134831e36bbdd6ce6c3ef623ddf6d9238f244ed3 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:10 +0200 Subject: [PATCH 315/721] fuse: convert readpages to simple api Need to extend fuse_io_args with 'attr_ver' and 'ff' members, that take the functionality of the same named members in fuse_req. fuse_short_read() can now take struct fuse_args_pages. Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 137 +++++++++++++++++++++++++------------------------ 1 file changed, 71 insertions(+), 66 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index ee0817a3d5f1..ac2323443c09 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -556,6 +556,7 @@ struct fuse_io_args { union { struct { struct fuse_read_in in; + u64 attr_ver; } read; struct { struct fuse_write_in in; @@ -564,6 +565,7 @@ struct fuse_io_args { }; struct fuse_args_pages ap; struct fuse_io_priv *io; + struct fuse_file *ff; }; void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, @@ -770,7 +772,7 @@ static void fuse_read_update_size(struct inode *inode, loff_t size, } static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read, - struct page **pages, unsigned int num_pages) + struct fuse_args_pages *ap) { struct fuse_conn *fc = get_fuse_conn(inode); @@ -784,12 +786,12 @@ static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read, int start_idx = num_read >> PAGE_SHIFT; size_t off = num_read & (PAGE_SIZE - 1); - for (i = start_idx; i < num_pages; i++) { - zero_user_segment(pages[i], off, PAGE_SIZE); + for (i = start_idx; i < ap->num_pages; i++) { + zero_user_segment(ap->pages[i], off, PAGE_SIZE); off = 0; } } else { - loff_t pos = page_offset(pages[0]) + num_read; + loff_t pos = page_offset(ap->pages[0]) + num_read; fuse_read_update_size(inode, pos, attr_ver); } } @@ -827,8 +829,7 @@ static int fuse_do_readpage(struct file *file, struct page *page) * Short read means EOF. If file size is larger, truncate it */ if (res < desc.length) - fuse_short_read(inode, attr_ver, res, ia.ap.pages, - ia.ap.num_pages); + fuse_short_read(inode, attr_ver, res, &ia.ap); SetPageUptodate(page); @@ -851,15 +852,18 @@ static int fuse_readpage(struct file *file, struct page *page) return err; } -static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req) +static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_args *args, + int err) { int i; - size_t count = req->misc.read.in.size; - size_t num_read = req->out.args[0].size; + struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args); + struct fuse_args_pages *ap = &ia->ap; + size_t count = ia->read.in.size; + size_t num_read = args->out_args[0].size; struct address_space *mapping = NULL; - for (i = 0; mapping == NULL && i < req->num_pages; i++) - mapping = req->pages[i]->mapping; + for (i = 0; mapping == NULL && i < ap->num_pages; i++) + mapping = ap->pages[i]->mapping; if (mapping) { struct inode *inode = mapping->host; @@ -867,94 +871,97 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req) /* * Short read means EOF. If file size is larger, truncate it */ - if (!req->out.h.error && num_read < count) - fuse_short_read(inode, req->misc.read.attr_ver, - num_read, req->pages, req->num_pages); + if (!err && num_read < count) + fuse_short_read(inode, ia->read.attr_ver, num_read, ap); fuse_invalidate_atime(inode); } - for (i = 0; i < req->num_pages; i++) { - struct page *page = req->pages[i]; - if (!req->out.h.error) + for (i = 0; i < ap->num_pages; i++) { + struct page *page = ap->pages[i]; + + if (!err) SetPageUptodate(page); else SetPageError(page); unlock_page(page); put_page(page); } - if (req->ff) - fuse_file_put(req->ff, false, false); + if (ia->ff) + fuse_file_put(ia->ff, false, false); + + fuse_io_free(ia); } -static void fuse_send_readpages(struct fuse_req *req, struct file *file) +static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) { struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fc; - loff_t pos = page_offset(req->pages[0]); - size_t count = req->num_pages << PAGE_SHIFT; + struct fuse_args_pages *ap = &ia->ap; + loff_t pos = page_offset(ap->pages[0]); + size_t count = ap->num_pages << PAGE_SHIFT; + int err; - req->out.argpages = 1; - req->out.page_zeroing = 1; - req->out.page_replace = 1; - fuse_read_fill(req, file, pos, count, FUSE_READ); - req->misc.read.attr_ver = fuse_get_attr_version(fc); + ap->args.out_pages = true; + ap->args.page_zeroing = true; + ap->args.page_replace = true; + fuse_read_args_fill(ia, file, pos, count, FUSE_READ); + ia->read.attr_ver = fuse_get_attr_version(fc); if (fc->async_read) { - req->ff = fuse_file_get(ff); - req->end = fuse_readpages_end; - fuse_request_send_background(fc, req); + ia->ff = fuse_file_get(ff); + ap->args.end = fuse_readpages_end; + err = fuse_simple_background(fc, &ap->args, GFP_KERNEL); + if (!err) + return; } else { - fuse_request_send(fc, req); - fuse_readpages_end(fc, req); - fuse_put_request(fc, req); + err = fuse_simple_request(fc, &ap->args); } + fuse_readpages_end(fc, &ap->args, err); } struct fuse_fill_data { - struct fuse_req *req; + struct fuse_io_args *ia; struct file *file; struct inode *inode; - unsigned nr_pages; + unsigned int nr_pages; + unsigned int max_pages; }; static int fuse_readpages_fill(void *_data, struct page *page) { struct fuse_fill_data *data = _data; - struct fuse_req *req = data->req; + struct fuse_io_args *ia = data->ia; + struct fuse_args_pages *ap = &ia->ap; struct inode *inode = data->inode; struct fuse_conn *fc = get_fuse_conn(inode); fuse_wait_on_page_writeback(inode, page->index); - if (req->num_pages && - (req->num_pages == fc->max_pages || - (req->num_pages + 1) * PAGE_SIZE > fc->max_read || - req->pages[req->num_pages - 1]->index + 1 != page->index)) { - unsigned int nr_alloc = min_t(unsigned int, data->nr_pages, - fc->max_pages); - fuse_send_readpages(req, data->file); - if (fc->async_read) - req = fuse_get_req_for_background(fc, nr_alloc); - else - req = fuse_get_req(fc, nr_alloc); - - data->req = req; - if (IS_ERR(req)) { + if (ap->num_pages && + (ap->num_pages == fc->max_pages || + (ap->num_pages + 1) * PAGE_SIZE > fc->max_read || + ap->pages[ap->num_pages - 1]->index + 1 != page->index)) { + data->max_pages = min_t(unsigned int, data->nr_pages, + fc->max_pages); + fuse_send_readpages(ia, data->file); + data->ia = ia = fuse_io_alloc(NULL, data->max_pages); + if (!ia) { unlock_page(page); - return PTR_ERR(req); + return -ENOMEM; } + ap = &ia->ap; } - if (WARN_ON(req->num_pages >= req->max_pages)) { + if (WARN_ON(ap->num_pages >= data->max_pages)) { unlock_page(page); - fuse_put_request(fc, req); + fuse_io_free(ia); return -EIO; } get_page(page); - req->pages[req->num_pages] = page; - req->page_descs[req->num_pages].length = PAGE_SIZE; - req->num_pages++; + ap->pages[ap->num_pages] = page; + ap->descs[ap->num_pages].length = PAGE_SIZE; + ap->num_pages++; data->nr_pages--; return 0; } @@ -966,7 +973,6 @@ static int fuse_readpages(struct file *file, struct address_space *mapping, struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_fill_data data; int err; - unsigned int nr_alloc = min_t(unsigned int, nr_pages, fc->max_pages); err = -EIO; if (is_bad_inode(inode)) @@ -974,21 +980,20 @@ static int fuse_readpages(struct file *file, struct address_space *mapping, data.file = file; data.inode = inode; - if (fc->async_read) - data.req = fuse_get_req_for_background(fc, nr_alloc); - else - data.req = fuse_get_req(fc, nr_alloc); data.nr_pages = nr_pages; - err = PTR_ERR(data.req); - if (IS_ERR(data.req)) + data.max_pages = min_t(unsigned int, nr_pages, fc->max_pages); +; + data.ia = fuse_io_alloc(NULL, data.max_pages); + err = -ENOMEM; + if (!data.ia) goto out; err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data); if (!err) { - if (data.req->num_pages) - fuse_send_readpages(data.req, file); + if (data.ia->ap.num_pages) + fuse_send_readpages(data.ia, file); else - fuse_put_request(fc, data.req); + fuse_io_free(data.ia); } out: return err; From 43f5098eb82b1dbf3988cab0a26e729e88a004fc Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:10 +0200 Subject: [PATCH 316/721] fuse: convert readdir to simple api The old fuse_read_fill() helper can be deleted, now that the last user is gone. The fuse_io_args struct is moved to fuse_i.h so it can be shared between readdir/read code. Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 36 ------------------------------------ fs/fuse/fuse_i.h | 23 ++++++++++++++++++++--- fs/fuse/readdir.c | 47 ++++++++++++++++++++--------------------------- 3 files changed, 40 insertions(+), 66 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index ac2323443c09..3cb98ff267cf 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -532,42 +532,6 @@ static int fuse_fsync(struct file *file, loff_t start, loff_t end, return err; } -void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos, - size_t count, int opcode) -{ - struct fuse_read_in *inarg = &req->misc.read.in; - struct fuse_file *ff = file->private_data; - - inarg->fh = ff->fh; - inarg->offset = pos; - inarg->size = count; - inarg->flags = file->f_flags; - req->in.h.opcode = opcode; - req->in.h.nodeid = ff->nodeid; - req->in.numargs = 1; - req->in.args[0].size = sizeof(struct fuse_read_in); - req->in.args[0].value = inarg; - req->out.argvar = 1; - req->out.numargs = 1; - req->out.args[0].size = count; -} - -struct fuse_io_args { - union { - struct { - struct fuse_read_in in; - u64 attr_ver; - } read; - struct { - struct fuse_write_in in; - struct fuse_write_out out; - } write; - }; - struct fuse_args_pages ap; - struct fuse_io_priv *io; - struct fuse_file *ff; -}; - void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, size_t count, int opcode) { diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 8f46c5549e57..4d7cd20967c2 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -830,11 +830,28 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, struct fuse_forget_link *fuse_alloc_forget(void); -/** +/* * Initialize READ or READDIR request */ -void fuse_read_fill(struct fuse_req *req, struct file *file, - loff_t pos, size_t count, int opcode); +struct fuse_io_args { + union { + struct { + struct fuse_read_in in; + u64 attr_ver; + } read; + struct { + struct fuse_write_in in; + struct fuse_write_out out; + } write; + }; + struct fuse_args_pages ap; + struct fuse_io_priv *io; + struct fuse_file *ff; +}; + +void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, + size_t count, int opcode); + /** * Send OPEN or OPENDIR request diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index 56a19faa855d..fae025db06f4 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -316,62 +316,55 @@ static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) { - int plus, err; - size_t nbytes; + int plus; + ssize_t res; struct page *page; struct inode *inode = file_inode(file); struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req; + struct fuse_io_args ia = {}; + struct fuse_args_pages *ap = &ia.ap; + struct fuse_page_desc desc = { .length = PAGE_SIZE }; u64 attr_version = 0; bool locked; - req = fuse_get_req(fc, 1); - if (IS_ERR(req)) - return PTR_ERR(req); - page = alloc_page(GFP_KERNEL); - if (!page) { - fuse_put_request(fc, req); + if (!page) return -ENOMEM; - } plus = fuse_use_readdirplus(inode, ctx); - req->out.argpages = 1; - req->num_pages = 1; - req->pages[0] = page; - req->page_descs[0].length = PAGE_SIZE; + ap->args.out_pages = 1; + ap->num_pages = 1; + ap->pages = &page; + ap->descs = &desc; if (plus) { attr_version = fuse_get_attr_version(fc); - fuse_read_fill(req, file, ctx->pos, PAGE_SIZE, - FUSE_READDIRPLUS); + fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE, + FUSE_READDIRPLUS); } else { - fuse_read_fill(req, file, ctx->pos, PAGE_SIZE, - FUSE_READDIR); + fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE, + FUSE_READDIR); } locked = fuse_lock_inode(inode); - fuse_request_send(fc, req); + res = fuse_simple_request(fc, &ap->args); fuse_unlock_inode(inode, locked); - nbytes = req->out.args[0].size; - err = req->out.h.error; - fuse_put_request(fc, req); - if (!err) { - if (!nbytes) { + if (res >= 0) { + if (!res) { struct fuse_file *ff = file->private_data; if (ff->open_flags & FOPEN_CACHE_DIR) fuse_readdir_cache_end(file, ctx->pos); } else if (plus) { - err = parse_dirplusfile(page_address(page), nbytes, + res = parse_dirplusfile(page_address(page), res, file, ctx, attr_version); } else { - err = parse_dirfile(page_address(page), nbytes, file, + res = parse_dirfile(page_address(page), res, file, ctx); } } __free_page(page); fuse_invalidate_atime(inode); - return err; + return res; } enum fuse_parse_result { From 33826ebbbe4b45ccecf2f5a08b3457f5d59c6282 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:10 +0200 Subject: [PATCH 317/721] fuse: convert writepages to simple api Derive fuse_writepage_args from fuse_io_args. Sending the request is tricky since it was done with fi->lock held, hence we must either use atomic allocation or release the lock. Both are possible so try atomic first and if it fails, release the lock and do the regular allocation with GFP_NOFS and __GFP_NOFAIL. Both flags are necessary for correct operation. Move the page realloc function from dev.c to file.c and convert to using fuse_writepage_args. The last caller of fuse_write_fill() is gone, so get rid of it. Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 27 ---- fs/fuse/file.c | 361 +++++++++++++++++++++++++++-------------------- fs/fuse/fuse_i.h | 3 - 3 files changed, 206 insertions(+), 185 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index a52a7380baa4..d87accc9df9d 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -96,33 +96,6 @@ static void fuse_req_pages_free(struct fuse_req *req) kfree(req->pages); } -bool fuse_req_realloc_pages(struct fuse_conn *fc, struct fuse_req *req, - gfp_t flags) -{ - struct page **pages; - struct fuse_page_desc *page_descs; - unsigned int npages = min_t(unsigned int, - max_t(unsigned int, req->max_pages * 2, - FUSE_DEFAULT_MAX_PAGES_PER_REQ), - fc->max_pages); - WARN_ON(npages <= req->max_pages); - - pages = fuse_pages_alloc(npages, flags, &page_descs); - if (!pages) - return false; - - memcpy(pages, req->pages, sizeof(struct page *) * req->max_pages); - memcpy(page_descs, req->page_descs, - sizeof(struct fuse_page_desc) * req->max_pages); - fuse_req_pages_free(req); - __set_bit(FR_ALLOC_PAGES, &req->flags); - req->pages = pages; - req->page_descs = page_descs; - req->max_pages = npages; - - return true; -} - void fuse_request_free(struct fuse_req *req) { fuse_req_pages_free(req); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 3cb98ff267cf..399b89b29bb4 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -347,19 +347,27 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id) return (u64) v0 + ((u64) v1 << 32); } -static struct fuse_req *fuse_find_writeback(struct fuse_inode *fi, +struct fuse_writepage_args { + struct fuse_io_args ia; + struct list_head writepages_entry; + struct list_head queue_entry; + struct fuse_writepage_args *next; + struct inode *inode; +}; + +static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi, pgoff_t idx_from, pgoff_t idx_to) { - struct fuse_req *req; + struct fuse_writepage_args *wpa; - list_for_each_entry(req, &fi->writepages, writepages_entry) { + list_for_each_entry(wpa, &fi->writepages, writepages_entry) { pgoff_t curr_index; - WARN_ON(get_fuse_inode(req->inode) != fi); - curr_index = req->misc.write.in.offset >> PAGE_SHIFT; - if (idx_from < curr_index + req->num_pages && + WARN_ON(get_fuse_inode(wpa->inode) != fi); + curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT; + if (idx_from < curr_index + wpa->ia.ap.num_pages && curr_index <= idx_to) { - return req; + return wpa; } } return NULL; @@ -984,29 +992,6 @@ static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to) return generic_file_read_iter(iocb, to); } -static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff, - loff_t pos, size_t count) -{ - struct fuse_write_in *inarg = &req->misc.write.in; - struct fuse_write_out *outarg = &req->misc.write.out; - - inarg->fh = ff->fh; - inarg->offset = pos; - inarg->size = count; - req->in.h.opcode = FUSE_WRITE; - req->in.h.nodeid = ff->nodeid; - req->in.numargs = 2; - if (ff->fc->minor < 9) - req->in.args[0].size = FUSE_COMPAT_WRITE_IN_SIZE; - else - req->in.args[0].size = sizeof(struct fuse_write_in); - req->in.args[0].value = inarg; - req->in.args[1].size = count; - req->out.numargs = 1; - req->out.args[0].size = sizeof(struct fuse_write_out); - req->out.args[0].value = outarg; -} - static void fuse_write_args_fill(struct fuse_io_args *ia, struct fuse_file *ff, loff_t pos, size_t count) { @@ -1576,45 +1561,53 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) return fuse_direct_write_iter(iocb, from); } -static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req) +static void fuse_writepage_free(struct fuse_writepage_args *wpa) { + struct fuse_args_pages *ap = &wpa->ia.ap; int i; - for (i = 0; i < req->num_pages; i++) - __free_page(req->pages[i]); + for (i = 0; i < ap->num_pages; i++) + __free_page(ap->pages[i]); - if (req->ff) - fuse_file_put(req->ff, false, false); + if (wpa->ia.ff) + fuse_file_put(wpa->ia.ff, false, false); + + kfree(ap->pages); + kfree(wpa); } -static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) +static void fuse_writepage_finish(struct fuse_conn *fc, + struct fuse_writepage_args *wpa) { - struct inode *inode = req->inode; + struct fuse_args_pages *ap = &wpa->ia.ap; + struct inode *inode = wpa->inode; struct fuse_inode *fi = get_fuse_inode(inode); struct backing_dev_info *bdi = inode_to_bdi(inode); int i; - list_del(&req->writepages_entry); - for (i = 0; i < req->num_pages; i++) { + list_del(&wpa->writepages_entry); + for (i = 0; i < ap->num_pages; i++) { dec_wb_stat(&bdi->wb, WB_WRITEBACK); - dec_node_page_state(req->pages[i], NR_WRITEBACK_TEMP); + dec_node_page_state(ap->pages[i], NR_WRITEBACK_TEMP); wb_writeout_inc(&bdi->wb); } wake_up(&fi->page_waitq); } /* Called under fi->lock, may release and reacquire it */ -static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req, - loff_t size) +static void fuse_send_writepage(struct fuse_conn *fc, + struct fuse_writepage_args *wpa, loff_t size) __releases(fi->lock) __acquires(fi->lock) { - struct fuse_req *aux, *next; - struct fuse_inode *fi = get_fuse_inode(req->inode); - struct fuse_write_in *inarg = &req->misc.write.in; - __u64 data_size = req->num_pages * PAGE_SIZE; - bool queued; + struct fuse_writepage_args *aux, *next; + struct fuse_inode *fi = get_fuse_inode(wpa->inode); + struct fuse_write_in *inarg = &wpa->ia.write.in; + struct fuse_args *args = &wpa->ia.ap.args; + __u64 data_size = wpa->ia.ap.num_pages * PAGE_SIZE; + int err; + fi->writectr++; if (inarg->offset + data_size <= size) { inarg->size = data_size; } else if (inarg->offset < size) { @@ -1624,29 +1617,36 @@ __acquires(fi->lock) goto out_free; } - req->in.args[1].size = inarg->size; - queued = fuse_request_queue_background(fc, req); + args->in_args[1].size = inarg->size; + args->force = true; + args->nocreds = true; + + err = fuse_simple_background(fc, args, GFP_ATOMIC); + if (err == -ENOMEM) { + spin_unlock(&fi->lock); + err = fuse_simple_background(fc, args, GFP_NOFS | __GFP_NOFAIL); + spin_lock(&fi->lock); + } + /* Fails on broken connection only */ - if (unlikely(!queued)) + if (unlikely(err)) goto out_free; - fi->writectr++; return; out_free: - fuse_writepage_finish(fc, req); + fi->writectr--; + fuse_writepage_finish(fc, wpa); spin_unlock(&fi->lock); /* After fuse_writepage_finish() aux request list is private */ - for (aux = req->misc.write.next; aux; aux = next) { - next = aux->misc.write.next; - aux->misc.write.next = NULL; - fuse_writepage_free(fc, aux); - fuse_put_request(fc, aux); + for (aux = wpa->next; aux; aux = next) { + next = aux->next; + aux->next = NULL; + fuse_writepage_free(aux); } - fuse_writepage_free(fc, req); - fuse_put_request(fc, req); + fuse_writepage_free(wpa); spin_lock(&fi->lock); } @@ -1663,29 +1663,34 @@ __acquires(fi->lock) struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); loff_t crop = i_size_read(inode); - struct fuse_req *req; + struct fuse_writepage_args *wpa; while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) { - req = list_entry(fi->queued_writes.next, struct fuse_req, list); - list_del_init(&req->list); - fuse_send_writepage(fc, req, crop); + wpa = list_entry(fi->queued_writes.next, + struct fuse_writepage_args, queue_entry); + list_del_init(&wpa->queue_entry); + fuse_send_writepage(fc, wpa, crop); } } -static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req) +static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args, + int error) { - struct inode *inode = req->inode; + struct fuse_writepage_args *wpa = + container_of(args, typeof(*wpa), ia.ap.args); + struct inode *inode = wpa->inode; struct fuse_inode *fi = get_fuse_inode(inode); - mapping_set_error(inode->i_mapping, req->out.h.error); + mapping_set_error(inode->i_mapping, error); spin_lock(&fi->lock); - while (req->misc.write.next) { + while (wpa->next) { struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_write_in *inarg = &req->misc.write.in; - struct fuse_req *next = req->misc.write.next; - req->misc.write.next = next->misc.write.next; - next->misc.write.next = NULL; - next->ff = fuse_file_get(req->ff); + struct fuse_write_in *inarg = &wpa->ia.write.in; + struct fuse_writepage_args *next = wpa->next; + + wpa->next = next->next; + next->next = NULL; + next->ia.ff = fuse_file_get(wpa->ia.ff); list_add(&next->writepages_entry, &fi->writepages); /* @@ -1714,9 +1719,9 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req) fuse_send_writepage(fc, next, inarg->offset + inarg->size); } fi->writectr--; - fuse_writepage_finish(fc, req); + fuse_writepage_finish(fc, wpa); spin_unlock(&fi->lock); - fuse_writepage_free(fc, req); + fuse_writepage_free(wpa); } static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc, @@ -1758,52 +1763,71 @@ int fuse_write_inode(struct inode *inode, struct writeback_control *wbc) return err; } +static struct fuse_writepage_args *fuse_writepage_args_alloc(void) +{ + struct fuse_writepage_args *wpa; + struct fuse_args_pages *ap; + + wpa = kzalloc(sizeof(*wpa), GFP_NOFS); + if (wpa) { + ap = &wpa->ia.ap; + ap->num_pages = 0; + ap->pages = fuse_pages_alloc(1, GFP_NOFS, &ap->descs); + if (!ap->pages) { + kfree(wpa); + wpa = NULL; + } + } + return wpa; + +} + static int fuse_writepage_locked(struct page *page) { struct address_space *mapping = page->mapping; struct inode *inode = mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); - struct fuse_req *req; + struct fuse_writepage_args *wpa; + struct fuse_args_pages *ap; struct page *tmp_page; int error = -ENOMEM; set_page_writeback(page); - req = fuse_request_alloc_nofs(1); - if (!req) + wpa = fuse_writepage_args_alloc(); + if (!wpa) goto err; + ap = &wpa->ia.ap; - /* writeback always goes to bg_queue */ - __set_bit(FR_BACKGROUND, &req->flags); tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); if (!tmp_page) goto err_free; error = -EIO; - req->ff = fuse_write_file_get(fc, fi); - if (!req->ff) + wpa->ia.ff = fuse_write_file_get(fc, fi); + if (!wpa->ia.ff) goto err_nofile; - fuse_write_fill(req, req->ff, page_offset(page), 0); + fuse_write_args_fill(&wpa->ia, wpa->ia.ff, page_offset(page), 0); copy_highpage(tmp_page, page); - req->misc.write.in.write_flags |= FUSE_WRITE_CACHE; - req->misc.write.next = NULL; - req->in.argpages = 1; - req->num_pages = 1; - req->pages[0] = tmp_page; - req->page_descs[0].offset = 0; - req->page_descs[0].length = PAGE_SIZE; - req->end = fuse_writepage_end; - req->inode = inode; + wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE; + wpa->next = NULL; + ap->args.in_pages = true; + ap->num_pages = 1; + ap->pages[0] = tmp_page; + ap->descs[0].offset = 0; + ap->descs[0].length = PAGE_SIZE; + ap->args.end = fuse_writepage_end; + wpa->inode = inode; inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); spin_lock(&fi->lock); - list_add(&req->writepages_entry, &fi->writepages); - list_add_tail(&req->list, &fi->queued_writes); + list_add(&wpa->writepages_entry, &fi->writepages); + list_add_tail(&wpa->queue_entry, &fi->queued_writes); fuse_flush_writepages(inode); spin_unlock(&fi->lock); @@ -1814,7 +1838,7 @@ static int fuse_writepage_locked(struct page *page) err_nofile: __free_page(tmp_page); err_free: - fuse_request_free(req); + kfree(wpa); err: mapping_set_error(page->mapping, error); end_page_writeback(page); @@ -1844,23 +1868,50 @@ static int fuse_writepage(struct page *page, struct writeback_control *wbc) } struct fuse_fill_wb_data { - struct fuse_req *req; + struct fuse_writepage_args *wpa; struct fuse_file *ff; struct inode *inode; struct page **orig_pages; + unsigned int max_pages; }; +static bool fuse_pages_realloc(struct fuse_fill_wb_data *data) +{ + struct fuse_args_pages *ap = &data->wpa->ia.ap; + struct fuse_conn *fc = get_fuse_conn(data->inode); + struct page **pages; + struct fuse_page_desc *descs; + unsigned int npages = min_t(unsigned int, + max_t(unsigned int, data->max_pages * 2, + FUSE_DEFAULT_MAX_PAGES_PER_REQ), + fc->max_pages); + WARN_ON(npages <= data->max_pages); + + pages = fuse_pages_alloc(npages, GFP_NOFS, &descs); + if (!pages) + return false; + + memcpy(pages, ap->pages, sizeof(struct page *) * ap->num_pages); + memcpy(descs, ap->descs, sizeof(struct fuse_page_desc) * ap->num_pages); + kfree(ap->pages); + ap->pages = pages; + ap->descs = descs; + data->max_pages = npages; + + return true; +} + static void fuse_writepages_send(struct fuse_fill_wb_data *data) { - struct fuse_req *req = data->req; + struct fuse_writepage_args *wpa = data->wpa; struct inode *inode = data->inode; struct fuse_inode *fi = get_fuse_inode(inode); - int num_pages = req->num_pages; + int num_pages = wpa->ia.ap.num_pages; int i; - req->ff = fuse_file_get(data->ff); + wpa->ia.ff = fuse_file_get(data->ff); spin_lock(&fi->lock); - list_add_tail(&req->list, &fi->queued_writes); + list_add_tail(&wpa->queue_entry, &fi->queued_writes); fuse_flush_writepages(inode); spin_unlock(&fi->lock); @@ -1875,54 +1926,52 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data) * this new request onto the auxiliary list, otherwise reuse the existing one by * copying the new page contents over to the old temporary page. */ -static bool fuse_writepage_in_flight(struct fuse_req *new_req, +static bool fuse_writepage_in_flight(struct fuse_writepage_args *new_wpa, struct page *page) { - struct fuse_conn *fc = get_fuse_conn(new_req->inode); - struct fuse_inode *fi = get_fuse_inode(new_req->inode); - struct fuse_req *tmp; - struct fuse_req *old_req; + struct fuse_inode *fi = get_fuse_inode(new_wpa->inode); + struct fuse_writepage_args *tmp; + struct fuse_writepage_args *old_wpa; + struct fuse_args_pages *new_ap = &new_wpa->ia.ap; - WARN_ON(new_req->num_pages != 0); + WARN_ON(new_ap->num_pages != 0); spin_lock(&fi->lock); - list_del(&new_req->writepages_entry); - old_req = fuse_find_writeback(fi, page->index, page->index); - if (!old_req) { - list_add(&new_req->writepages_entry, &fi->writepages); + list_del(&new_wpa->writepages_entry); + old_wpa = fuse_find_writeback(fi, page->index, page->index); + if (!old_wpa) { + list_add(&new_wpa->writepages_entry, &fi->writepages); spin_unlock(&fi->lock); return false; } - new_req->num_pages = 1; - for (tmp = old_req->misc.write.next; tmp; tmp = tmp->misc.write.next) { + new_ap->num_pages = 1; + for (tmp = old_wpa->next; tmp; tmp = tmp->next) { pgoff_t curr_index; - WARN_ON(tmp->inode != new_req->inode); - curr_index = tmp->misc.write.in.offset >> PAGE_SHIFT; + WARN_ON(tmp->inode != new_wpa->inode); + curr_index = tmp->ia.write.in.offset >> PAGE_SHIFT; if (curr_index == page->index) { - WARN_ON(tmp->num_pages != 1); - WARN_ON(!test_bit(FR_PENDING, &tmp->flags)); - swap(tmp->pages[0], new_req->pages[0]); + WARN_ON(tmp->ia.ap.num_pages != 1); + swap(tmp->ia.ap.pages[0], new_ap->pages[0]); break; } } if (!tmp) { - new_req->misc.write.next = old_req->misc.write.next; - old_req->misc.write.next = new_req; + new_wpa->next = old_wpa->next; + old_wpa->next = new_wpa; } spin_unlock(&fi->lock); if (tmp) { - struct backing_dev_info *bdi = inode_to_bdi(new_req->inode); + struct backing_dev_info *bdi = inode_to_bdi(new_wpa->inode); dec_wb_stat(&bdi->wb, WB_WRITEBACK); - dec_node_page_state(new_req->pages[0], NR_WRITEBACK_TEMP); + dec_node_page_state(new_ap->pages[0], NR_WRITEBACK_TEMP); wb_writeout_inc(&bdi->wb); - fuse_writepage_free(fc, new_req); - fuse_request_free(new_req); + fuse_writepage_free(new_wpa); } return true; @@ -1932,7 +1981,8 @@ static int fuse_writepages_fill(struct page *page, struct writeback_control *wbc, void *_data) { struct fuse_fill_wb_data *data = _data; - struct fuse_req *req = data->req; + struct fuse_writepage_args *wpa = data->wpa; + struct fuse_args_pages *ap = &wpa->ia.ap; struct inode *inode = data->inode; struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); @@ -1955,16 +2005,16 @@ static int fuse_writepages_fill(struct page *page, */ is_writeback = fuse_page_is_writeback(inode, page->index); - if (req && req->num_pages && - (is_writeback || req->num_pages == fc->max_pages || - (req->num_pages + 1) * PAGE_SIZE > fc->max_write || - data->orig_pages[req->num_pages - 1]->index + 1 != page->index)) { + if (wpa && ap->num_pages && + (is_writeback || ap->num_pages == fc->max_pages || + (ap->num_pages + 1) * PAGE_SIZE > fc->max_write || + data->orig_pages[ap->num_pages - 1]->index + 1 != page->index)) { fuse_writepages_send(data); - data->req = NULL; - } else if (req && req->num_pages == req->max_pages) { - if (!fuse_req_realloc_pages(fc, req, GFP_NOFS)) { + data->wpa = NULL; + } else if (wpa && ap->num_pages == data->max_pages) { + if (!fuse_pages_realloc(data)) { fuse_writepages_send(data); - req = data->req = NULL; + data->wpa = NULL; } } @@ -1982,59 +2032,60 @@ static int fuse_writepages_fill(struct page *page, * This is ensured by holding the page lock in page_mkwrite() while * checking fuse_page_is_writeback(). We already hold the page lock * since clear_page_dirty_for_io() and keep it held until we add the - * request to the fi->writepages list and increment req->num_pages. + * request to the fi->writepages list and increment ap->num_pages. * After this fuse_page_is_writeback() will indicate that the page is * under writeback, so we can release the page lock. */ - if (data->req == NULL) { + if (data->wpa == NULL) { struct fuse_inode *fi = get_fuse_inode(inode); err = -ENOMEM; - req = fuse_request_alloc_nofs(FUSE_REQ_INLINE_PAGES); - if (!req) { + wpa = fuse_writepage_args_alloc(); + if (!wpa) { __free_page(tmp_page); goto out_unlock; } + data->max_pages = 1; - fuse_write_fill(req, data->ff, page_offset(page), 0); - req->misc.write.in.write_flags |= FUSE_WRITE_CACHE; - req->misc.write.next = NULL; - req->in.argpages = 1; - __set_bit(FR_BACKGROUND, &req->flags); - req->num_pages = 0; - req->end = fuse_writepage_end; - req->inode = inode; + ap = &wpa->ia.ap; + fuse_write_args_fill(&wpa->ia, data->ff, page_offset(page), 0); + wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE; + wpa->next = NULL; + ap->args.in_pages = true; + ap->args.end = fuse_writepage_end; + ap->num_pages = 0; + wpa->inode = inode; spin_lock(&fi->lock); - list_add(&req->writepages_entry, &fi->writepages); + list_add(&wpa->writepages_entry, &fi->writepages); spin_unlock(&fi->lock); - data->req = req; + data->wpa = wpa; } set_page_writeback(page); copy_highpage(tmp_page, page); - req->pages[req->num_pages] = tmp_page; - req->page_descs[req->num_pages].offset = 0; - req->page_descs[req->num_pages].length = PAGE_SIZE; + ap->pages[ap->num_pages] = tmp_page; + ap->descs[ap->num_pages].offset = 0; + ap->descs[ap->num_pages].length = PAGE_SIZE; inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); err = 0; - if (is_writeback && fuse_writepage_in_flight(req, page)) { + if (is_writeback && fuse_writepage_in_flight(wpa, page)) { end_page_writeback(page); - data->req = NULL; + data->wpa = NULL; goto out_unlock; } - data->orig_pages[req->num_pages] = page; + data->orig_pages[ap->num_pages] = page; /* * Protected by fi->lock against concurrent access by * fuse_page_is_writeback(). */ spin_lock(&fi->lock); - req->num_pages++; + ap->num_pages++; spin_unlock(&fi->lock); out_unlock: @@ -2056,7 +2107,7 @@ static int fuse_writepages(struct address_space *mapping, goto out; data.inode = inode; - data.req = NULL; + data.wpa = NULL; data.ff = NULL; err = -ENOMEM; @@ -2067,9 +2118,9 @@ static int fuse_writepages(struct address_space *mapping, goto out; err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data); - if (data.req) { + if (data.wpa) { /* Ignore errors if we can write at least one page */ - BUG_ON(!data.req->num_pages); + WARN_ON(!data.wpa->ia.ap.num_pages); fuse_writepages_send(&data); err = 0; } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 4d7cd20967c2..a9d707bf2e69 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -932,9 +932,6 @@ struct fuse_req *fuse_request_alloc_nofs(unsigned npages); struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags, struct fuse_page_desc **desc); -bool fuse_req_realloc_pages(struct fuse_conn *fc, struct fuse_req *req, - gfp_t flags); - /** * Free a request From 615047eff10886bb5ef8e97232b431348bad06d1 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:10 +0200 Subject: [PATCH 318/721] fuse: convert init to simple api Bypass the fc->initialized check by setting the force flag. Signed-off-by: Miklos Szeredi --- fs/fuse/inode.c | 67 ++++++++++++++++++++++++++++--------------------- 1 file changed, 38 insertions(+), 29 deletions(-) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 127984642a71..2cec25ad54b7 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -874,11 +874,19 @@ static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg) spin_unlock(&fc->bg_lock); } -static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) -{ - struct fuse_init_out *arg = &req->misc.init_out; +struct fuse_init_args { + struct fuse_args args; + struct fuse_init_in in; + struct fuse_init_out out; +}; - if (req->out.h.error || arg->major != FUSE_KERNEL_VERSION) +static void process_init_reply(struct fuse_conn *fc, struct fuse_args *args, + int error) +{ + struct fuse_init_args *ia = container_of(args, typeof(*ia), args); + struct fuse_init_out *arg = &ia->out; + + if (error || arg->major != FUSE_KERNEL_VERSION) fc->conn_error = 1; else { unsigned long ra_pages; @@ -955,18 +963,23 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->max_write = max_t(unsigned, 4096, fc->max_write); fc->conn_init = 1; } + kfree(ia); + fuse_set_initialized(fc); wake_up_all(&fc->blocked_waitq); } -static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) +static void fuse_send_init(struct fuse_conn *fc) { - struct fuse_init_in *arg = &req->misc.init_in; + struct fuse_init_args *ia; - arg->major = FUSE_KERNEL_VERSION; - arg->minor = FUSE_KERNEL_MINOR_VERSION; - arg->max_readahead = fc->sb->s_bdi->ra_pages * PAGE_SIZE; - arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | + ia = kzalloc(sizeof(*ia), GFP_KERNEL | __GFP_NOFAIL); + + ia->in.major = FUSE_KERNEL_VERSION; + ia->in.minor = FUSE_KERNEL_MINOR_VERSION; + ia->in.max_readahead = fc->sb->s_bdi->ra_pages * PAGE_SIZE; + ia->in.flags |= + FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ | FUSE_FLOCK_LOCKS | FUSE_HAS_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | @@ -975,19 +988,23 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL | FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS | FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA; - req->in.h.opcode = FUSE_INIT; - req->in.numargs = 1; - req->in.args[0].size = sizeof(*arg); - req->in.args[0].value = arg; - req->out.numargs = 1; + ia->args.opcode = FUSE_INIT; + ia->args.in_numargs = 1; + ia->args.in_args[0].size = sizeof(ia->in); + ia->args.in_args[0].value = &ia->in; + ia->args.out_numargs = 1; /* Variable length argument used for backward compatibility with interface version < 7.5. Rest of init_out is zeroed by do_get_request(), so a short reply is not a problem */ - req->out.argvar = 1; - req->out.args[0].size = sizeof(struct fuse_init_out); - req->out.args[0].value = &req->misc.init_out; - req->end = process_init_reply; - fuse_request_send_background(fc, req); + ia->args.out_argvar = 1; + ia->args.out_args[0].size = sizeof(ia->out); + ia->args.out_args[0].value = &ia->out; + ia->args.force = true; + ia->args.nocreds = true; + ia->args.end = process_init_reply; + + if (fuse_simple_background(fc, &ia->args, GFP_KERNEL) != 0) + process_init_reply(fc, &ia->args, -ENOTCONN); } static void fuse_free_conn(struct fuse_conn *fc) @@ -1087,7 +1104,6 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) struct inode *root; struct file *file; struct dentry *root_dentry; - struct fuse_req *init_req; int err; int is_bdev = sb->s_bdev != NULL; @@ -1182,11 +1198,6 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) /* Root dentry doesn't have .d_revalidate */ sb->s_d_op = &fuse_dentry_operations; - init_req = fuse_request_alloc(0); - if (!init_req) - goto err_put_root; - __set_bit(FR_BACKGROUND, &init_req->flags); - mutex_lock(&fuse_mutex); err = -EINVAL; if (file->private_data) @@ -1207,14 +1218,12 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) */ fput(file); - fuse_send_init(fc, init_req); + fuse_send_init(fc); return 0; err_unlock: mutex_unlock(&fuse_mutex); - fuse_request_free(init_req); - err_put_root: dput(root_dentry); err_dev_free: fuse_dev_free(fud); From b50ef7c52ad7c954b743cd1cb181e9bf03537bab Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:10 +0200 Subject: [PATCH 319/721] cuse: convert init to simple api This is a straightforward conversion. Signed-off-by: Miklos Szeredi --- fs/fuse/cuse.c | 91 ++++++++++++++++++++++++++------------------------ 1 file changed, 47 insertions(+), 44 deletions(-) diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index d011a1ad1d4f..2332e7f960a8 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -298,6 +298,14 @@ static void cuse_gendev_release(struct device *dev) kfree(dev); } +struct cuse_init_args { + struct fuse_args_pages ap; + struct cuse_init_in in; + struct cuse_init_out out; + struct page *page; + struct fuse_page_desc desc; +}; + /** * cuse_process_init_reply - finish initializing CUSE channel * @@ -305,21 +313,22 @@ static void cuse_gendev_release(struct device *dev) * required data structures for it. Please read the comment at the * top of this file for high level overview. */ -static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req) +static void cuse_process_init_reply(struct fuse_conn *fc, + struct fuse_args *args, int error) { + struct cuse_init_args *ia = container_of(args, typeof(*ia), ap.args); + struct fuse_args_pages *ap = &ia->ap; struct cuse_conn *cc = fc_to_cc(fc), *pos; - struct cuse_init_out *arg = req->out.args[0].value; - struct page *page = req->pages[0]; + struct cuse_init_out *arg = &ia->out; + struct page *page = ap->pages[0]; struct cuse_devinfo devinfo = { }; struct device *dev; struct cdev *cdev; dev_t devt; int rc, i; - if (req->out.h.error || - arg->major != FUSE_KERNEL_VERSION || arg->minor < 11) { + if (error || arg->major != FUSE_KERNEL_VERSION || arg->minor < 11) goto err; - } fc->minor = arg->minor; fc->max_read = max_t(unsigned, arg->max_read, 4096); @@ -328,7 +337,7 @@ static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req) /* parse init reply */ cc->unrestricted_ioctl = arg->flags & CUSE_UNRESTRICTED_IOCTL; - rc = cuse_parse_devinfo(page_address(page), req->out.args[1].size, + rc = cuse_parse_devinfo(page_address(page), ap->args.out_args[1].size, &devinfo); if (rc) goto err; @@ -395,7 +404,7 @@ static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req) dev_set_uevent_suppress(dev, 0); kobject_uevent(&dev->kobj, KOBJ_ADD); out: - kfree(arg); + kfree(ia); __free_page(page); return; @@ -414,55 +423,49 @@ static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req) static int cuse_send_init(struct cuse_conn *cc) { int rc; - struct fuse_req *req; struct page *page; struct fuse_conn *fc = &cc->fc; - struct cuse_init_in *arg; - void *outarg; + struct cuse_init_args *ia; + struct fuse_args_pages *ap; BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE); - req = fuse_get_req_for_background(fc, 1); - if (IS_ERR(req)) { - rc = PTR_ERR(req); - goto err; - } - rc = -ENOMEM; page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!page) - goto err_put_req; + goto err; - outarg = kzalloc(sizeof(struct cuse_init_out), GFP_KERNEL); - if (!outarg) + ia = kzalloc(sizeof(*ia), GFP_KERNEL); + if (!ia) goto err_free_page; - arg = &req->misc.cuse_init_in; - arg->major = FUSE_KERNEL_VERSION; - arg->minor = FUSE_KERNEL_MINOR_VERSION; - arg->flags |= CUSE_UNRESTRICTED_IOCTL; - req->in.h.opcode = CUSE_INIT; - req->in.numargs = 1; - req->in.args[0].size = sizeof(struct cuse_init_in); - req->in.args[0].value = arg; - req->out.numargs = 2; - req->out.args[0].size = sizeof(struct cuse_init_out); - req->out.args[0].value = outarg; - req->out.args[1].size = CUSE_INIT_INFO_MAX; - req->out.argvar = 1; - req->out.argpages = 1; - req->pages[0] = page; - req->page_descs[0].length = req->out.args[1].size; - req->num_pages = 1; - req->end = cuse_process_init_reply; - fuse_request_send_background(fc, req); - - return 0; + ap = &ia->ap; + ia->in.major = FUSE_KERNEL_VERSION; + ia->in.minor = FUSE_KERNEL_MINOR_VERSION; + ia->in.flags |= CUSE_UNRESTRICTED_IOCTL; + ap->args.opcode = CUSE_INIT; + ap->args.in_numargs = 1; + ap->args.in_args[0].size = sizeof(ia->in); + ap->args.in_args[0].value = &ia->in; + ap->args.out_numargs = 2; + ap->args.out_args[0].size = sizeof(ia->out); + ap->args.out_args[0].value = &ia->out; + ap->args.out_args[1].size = CUSE_INIT_INFO_MAX; + ap->args.out_argvar = 1; + ap->args.out_pages = 1; + ap->num_pages = 1; + ap->pages = &ia->page; + ap->descs = &ia->desc; + ia->page = page; + ia->desc.length = ap->args.out_args[1].size; + ap->args.end = cuse_process_init_reply; + rc = fuse_simple_background(fc, &ap->args, GFP_KERNEL); + if (rc) { + kfree(ia); err_free_page: - __free_page(page); -err_put_req: - fuse_put_request(fc, req); + __free_page(page); + } err: return rc; } From 4cb548666e4c13699904a063f6909b169d3f900c Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:10 +0200 Subject: [PATCH 320/721] fuse: convert release to simple api Since we cannot reserve the request structure up-front, make sure that the request allocation doesn't fail using __GFP_NOFAIL. Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 75 +++++++++++++++++++++++++----------------------- fs/fuse/fuse_i.h | 8 ++---- 2 files changed, 42 insertions(+), 41 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 399b89b29bb4..56ed45ef3058 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -53,6 +53,12 @@ static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file, return fuse_simple_request(fc, &args); } +struct fuse_release_args { + struct fuse_args args; + struct fuse_release_in inarg; + struct inode *inode; +}; + struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) { struct fuse_file *ff; @@ -62,8 +68,8 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) return NULL; ff->fc = fc; - ff->reserved_req = fuse_request_alloc(0); - if (unlikely(!ff->reserved_req)) { + ff->release_args = kzalloc(sizeof(*ff->release_args), GFP_KERNEL); + if (!ff->release_args) { kfree(ff); return NULL; } @@ -81,7 +87,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) void fuse_file_free(struct fuse_file *ff) { - fuse_request_free(ff->reserved_req); + kfree(ff->release_args); mutex_destroy(&ff->readdir.lock); kfree(ff); } @@ -92,34 +98,31 @@ static struct fuse_file *fuse_file_get(struct fuse_file *ff) return ff; } -static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req) +static void fuse_release_end(struct fuse_conn *fc, struct fuse_args *args, + int error) { - iput(req->misc.release.inode); + struct fuse_release_args *ra = container_of(args, typeof(*ra), args); + + iput(ra->inode); + kfree(ra); } static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir) { if (refcount_dec_and_test(&ff->count)) { - struct fuse_req *req = ff->reserved_req; + struct fuse_args *args = &ff->release_args->args; if (isdir ? ff->fc->no_opendir : ff->fc->no_open) { - /* - * Drop the release request when client does not - * implement 'open' - */ - __clear_bit(FR_BACKGROUND, &req->flags); - iput(req->misc.release.inode); - fuse_put_request(ff->fc, req); + /* Do nothing when client does not implement 'open' */ + fuse_release_end(ff->fc, args, 0); } else if (sync) { - __set_bit(FR_FORCE, &req->flags); - __clear_bit(FR_BACKGROUND, &req->flags); - fuse_request_send(ff->fc, req); - iput(req->misc.release.inode); - fuse_put_request(ff->fc, req); + fuse_simple_request(ff->fc, args); + fuse_release_end(ff->fc, args, 0); } else { - req->end = fuse_release_end; - __set_bit(FR_BACKGROUND, &req->flags); - fuse_request_send_background(ff->fc, req); + args->end = fuse_release_end; + if (fuse_simple_background(ff->fc, args, + GFP_KERNEL | __GFP_NOFAIL)) + fuse_release_end(ff->fc, args, -ENOTCONN); } kfree(ff); } @@ -239,8 +242,7 @@ static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff, int flags, int opcode) { struct fuse_conn *fc = ff->fc; - struct fuse_req *req = ff->reserved_req; - struct fuse_release_in *inarg = &req->misc.release.in; + struct fuse_release_args *ra = ff->release_args; /* Inode is NULL on error path of fuse_create_open() */ if (likely(fi)) { @@ -255,32 +257,33 @@ static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff, wake_up_interruptible_all(&ff->poll_wait); - inarg->fh = ff->fh; - inarg->flags = flags; - req->in.h.opcode = opcode; - req->in.h.nodeid = ff->nodeid; - req->in.numargs = 1; - req->in.args[0].size = sizeof(struct fuse_release_in); - req->in.args[0].value = inarg; + ra->inarg.fh = ff->fh; + ra->inarg.flags = flags; + ra->args.in_numargs = 1; + ra->args.in_args[0].size = sizeof(struct fuse_release_in); + ra->args.in_args[0].value = &ra->inarg; + ra->args.opcode = opcode; + ra->args.nodeid = ff->nodeid; + ra->args.force = true; + ra->args.nocreds = true; } void fuse_release_common(struct file *file, bool isdir) { struct fuse_inode *fi = get_fuse_inode(file_inode(file)); struct fuse_file *ff = file->private_data; - struct fuse_req *req = ff->reserved_req; + struct fuse_release_args *ra = ff->release_args; int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE; fuse_prepare_release(fi, ff, file->f_flags, opcode); if (ff->flock) { - struct fuse_release_in *inarg = &req->misc.release.in; - inarg->release_flags |= FUSE_RELEASE_FLOCK_UNLOCK; - inarg->lock_owner = fuse_lock_owner_id(ff->fc, - (fl_owner_t) file); + ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK; + ra->inarg.lock_owner = fuse_lock_owner_id(ff->fc, + (fl_owner_t) file); } /* Hold inode until release is finished */ - req->misc.release.inode = igrab(file_inode(file)); + ra->inode = igrab(file_inode(file)); /* * Normally this will send the RELEASE request, however if diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index a9d707bf2e69..a055a45361ef 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -164,17 +164,15 @@ enum { }; struct fuse_conn; +struct fuse_release_args; /** FUSE specific file data */ struct fuse_file { /** Fuse connection for this file */ struct fuse_conn *fc; - /* - * Request reserved for flush and release. - * Modified under relative fuse_inode::lock. - */ - struct fuse_req *reserved_req; + /* Argument space reserved for release */ + struct fuse_release_args *release_args; /** Kernel file handle guaranteed to be unique */ u64 kh; From 75b399dda5be1e9ad6952ffcec4c1a057ffae860 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:11 +0200 Subject: [PATCH 321/721] fuse: convert retrieve to simple api Rename fuse_request_send_notify_reply() to fuse_simple_notify_reply() and convert to passing fuse_args instead of fuse_req. Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 92 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 62 insertions(+), 30 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index d87accc9df9d..42db40750a2a 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -655,18 +655,32 @@ void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req) } EXPORT_SYMBOL_GPL(fuse_request_send_background); -static int fuse_request_send_notify_reply(struct fuse_conn *fc, - struct fuse_req *req, u64 unique) +static int fuse_simple_notify_reply(struct fuse_conn *fc, + struct fuse_args *args, u64 unique) { - int err = -ENODEV; + struct fuse_req *req; struct fuse_iqueue *fiq = &fc->iq; + int err = 0; + + req = fuse_get_req(fc, 0); + if (IS_ERR(req)) + return PTR_ERR(req); __clear_bit(FR_ISREPLY, &req->flags); req->in.h.unique = unique; + + fuse_args_to_req(req, args); + req->args = args; + req->end = fuse_simple_end; + spin_lock(&fiq->lock); if (fiq->connected) { queue_request(fiq, req); - err = 0; + spin_unlock(&fiq->lock); + } else { + err = -ENODEV; + spin_unlock(&fiq->lock); + fuse_put_request(fc, req); } spin_unlock(&fiq->lock); @@ -1691,9 +1705,19 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, return err; } -static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req) +struct fuse_retrieve_args { + struct fuse_args_pages ap; + struct fuse_notify_retrieve_in inarg; +}; + +static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_args *args, + int error) { - release_pages(req->pages, req->num_pages); + struct fuse_retrieve_args *ra = + container_of(args, typeof(*ra), ap.args); + + release_pages(ra->ap.pages, ra->ap.num_pages); + kfree(ra); } static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, @@ -1701,13 +1725,16 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, { int err; struct address_space *mapping = inode->i_mapping; - struct fuse_req *req; pgoff_t index; loff_t file_size; unsigned int num; unsigned int offset; size_t total_len = 0; unsigned int num_pages; + struct fuse_retrieve_args *ra; + size_t args_size = sizeof(*ra); + struct fuse_args_pages *ap; + struct fuse_args *args; offset = outarg->offset & ~PAGE_MASK; file_size = i_size_read(inode); @@ -1721,19 +1748,26 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, num_pages = (num + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; num_pages = min(num_pages, fc->max_pages); - req = fuse_get_req(fc, num_pages); - if (IS_ERR(req)) - return PTR_ERR(req); + args_size += num_pages * (sizeof(ap->pages[0]) + sizeof(ap->descs[0])); - req->in.h.opcode = FUSE_NOTIFY_REPLY; - req->in.h.nodeid = outarg->nodeid; - req->in.numargs = 2; - req->in.argpages = 1; - req->end = fuse_retrieve_end; + ra = kzalloc(args_size, GFP_KERNEL); + if (!ra) + return -ENOMEM; + + ap = &ra->ap; + ap->pages = (void *) (ra + 1); + ap->descs = (void *) (ap->pages + num_pages); + + args = &ap->args; + args->nodeid = outarg->nodeid; + args->opcode = FUSE_NOTIFY_REPLY; + args->in_numargs = 2; + args->in_pages = true; + args->end = fuse_retrieve_end; index = outarg->offset >> PAGE_SHIFT; - while (num && req->num_pages < num_pages) { + while (num && ap->num_pages < num_pages) { struct page *page; unsigned int this_num; @@ -1742,27 +1776,25 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, break; this_num = min_t(unsigned, num, PAGE_SIZE - offset); - req->pages[req->num_pages] = page; - req->page_descs[req->num_pages].offset = offset; - req->page_descs[req->num_pages].length = this_num; - req->num_pages++; + ap->pages[ap->num_pages] = page; + ap->descs[ap->num_pages].offset = offset; + ap->descs[ap->num_pages].length = this_num; + ap->num_pages++; offset = 0; num -= this_num; total_len += this_num; index++; } - req->misc.retrieve_in.offset = outarg->offset; - req->misc.retrieve_in.size = total_len; - req->in.args[0].size = sizeof(req->misc.retrieve_in); - req->in.args[0].value = &req->misc.retrieve_in; - req->in.args[1].size = total_len; + ra->inarg.offset = outarg->offset; + ra->inarg.size = total_len; + args->in_args[0].size = sizeof(ra->inarg); + args->in_args[0].value = &ra->inarg; + args->in_args[1].size = total_len; - err = fuse_request_send_notify_reply(fc, req, outarg->notify_unique); - if (err) { - fuse_retrieve_end(fc, req); - fuse_put_request(fc, req); - } + err = fuse_simple_notify_reply(fc, args, outarg->notify_unique); + if (err) + fuse_retrieve_end(fc, args, err); return err; } From 66abc3599c3c8795861470f21ae149520a57153d Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:11 +0200 Subject: [PATCH 322/721] fuse: unexport request ops All requests are now sent with one of the fuse_simple_... helpers. Get rid of the old api from the fuse internal header. Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 51 +++++++++--------------------------------------- fs/fuse/fuse_i.h | 39 ------------------------------------ 2 files changed, 9 insertions(+), 81 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 42db40750a2a..1ff18d319949 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -79,16 +79,10 @@ static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags) return req; } -struct fuse_req *fuse_request_alloc(unsigned npages) +static struct fuse_req *fuse_request_alloc(unsigned int npages) { return __fuse_request_alloc(npages, GFP_KERNEL); } -EXPORT_SYMBOL_GPL(fuse_request_alloc); - -struct fuse_req *fuse_request_alloc_nofs(unsigned npages) -{ - return __fuse_request_alloc(npages, GFP_NOFS); -} static void fuse_req_pages_free(struct fuse_req *req) { @@ -96,13 +90,13 @@ static void fuse_req_pages_free(struct fuse_req *req) kfree(req->pages); } -void fuse_request_free(struct fuse_req *req) +static void fuse_request_free(struct fuse_req *req) { fuse_req_pages_free(req); kmem_cache_free(fuse_req_cachep, req); } -void __fuse_get_request(struct fuse_req *req) +static void __fuse_get_request(struct fuse_req *req) { refcount_inc(&req->count); } @@ -139,6 +133,8 @@ static void fuse_drop_waiting(struct fuse_conn *fc) } } +static void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req); + static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages, bool for_background) { @@ -191,20 +187,12 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages, return ERR_PTR(err); } -struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages) +static struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned int npages) { return __fuse_get_req(fc, npages, false); } -EXPORT_SYMBOL_GPL(fuse_get_req); -struct fuse_req *fuse_get_req_for_background(struct fuse_conn *fc, - unsigned npages) -{ - return __fuse_get_req(fc, npages, true); -} -EXPORT_SYMBOL_GPL(fuse_get_req_for_background); - -void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) +static void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) { if (refcount_dec_and_test(&req->count)) { if (test_bit(FR_BACKGROUND, &req->flags)) { @@ -455,17 +443,6 @@ static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) } } -void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) -{ - __set_bit(FR_ISREPLY, &req->flags); - if (!test_bit(FR_WAITING, &req->flags)) { - __set_bit(FR_WAITING, &req->flags); - atomic_inc(&fc->num_waiting); - } - __fuse_request_send(fc, req); -} -EXPORT_SYMBOL_GPL(fuse_request_send); - static void fuse_adjust_compat(struct fuse_conn *fc, struct fuse_args *args) { if (fc->minor < 4 && args->opcode == FUSE_STATFS) @@ -571,7 +548,8 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) return ret; } -bool fuse_request_queue_background(struct fuse_conn *fc, struct fuse_req *req) +static bool fuse_request_queue_background(struct fuse_conn *fc, + struct fuse_req *req) { bool queued = false; @@ -644,17 +622,6 @@ int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args, } EXPORT_SYMBOL_GPL(fuse_simple_background); -void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req) -{ - WARN_ON(!req->end); - if (!fuse_request_queue_background(fc, req)) { - req->out.h.error = -ENOTCONN; - req->end(fc, req); - fuse_put_request(fc, req); - } -} -EXPORT_SYMBOL_GPL(fuse_request_send_background); - static int fuse_simple_notify_reply(struct fuse_conn *fc, struct fuse_args *args, u64 unique) { diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index a055a45361ef..ffbc2cd649e3 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -924,42 +924,9 @@ void __exit fuse_ctl_cleanup(void); /** * Allocate a request */ -struct fuse_req *fuse_request_alloc(unsigned npages); - -struct fuse_req *fuse_request_alloc_nofs(unsigned npages); - struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags, struct fuse_page_desc **desc); -/** - * Free a request - */ -void fuse_request_free(struct fuse_req *req); - -/** - * Get a request, may fail with -ENOMEM, - * caller should specify # elements in req->pages[] explicitly - */ -struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages); -struct fuse_req *fuse_get_req_for_background(struct fuse_conn *fc, - unsigned npages); - -/* - * Increment reference count on request - */ -void __fuse_get_request(struct fuse_req *req); - -/** - * Decrement reference count of a request. If count goes to zero free - * the request. - */ -void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req); - -/** - * Send a request (synchronous) - */ -void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req); - /** * Simple request sending that does request allocation and freeing */ @@ -967,12 +934,6 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args); int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args, gfp_t gfp_flags); -/** - * Send a request in the background - */ -void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req); -bool fuse_request_queue_background(struct fuse_conn *fc, struct fuse_req *req); - /* Abort all requests */ void fuse_abort_conn(struct fuse_conn *fc); void fuse_wait_aborted(struct fuse_conn *fc); From 7213394c4e184b002d8011c13d916e7ac6d17520 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:11 +0200 Subject: [PATCH 323/721] fuse: simplify request allocation Page arrays are not allocated together with the request anymore. Get rid of the dead code Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 61 +++++++++--------------------------------------- fs/fuse/file.c | 4 ++-- fs/fuse/fuse_i.h | 7 ------ 3 files changed, 13 insertions(+), 59 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 1ff18d319949..f240d541edfc 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -40,59 +40,26 @@ static struct fuse_dev *fuse_get_dev(struct file *file) return READ_ONCE(file->private_data); } -static void fuse_request_init(struct fuse_req *req, struct page **pages, - struct fuse_page_desc *page_descs, - unsigned npages) +static void fuse_request_init(struct fuse_req *req) { INIT_LIST_HEAD(&req->list); INIT_LIST_HEAD(&req->intr_entry); init_waitqueue_head(&req->waitq); refcount_set(&req->count, 1); - req->pages = pages; - req->page_descs = page_descs; - req->max_pages = npages; __set_bit(FR_PENDING, &req->flags); } -static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags) +static struct fuse_req *fuse_request_alloc(gfp_t flags) { struct fuse_req *req = kmem_cache_zalloc(fuse_req_cachep, flags); - if (req) { - struct page **pages = NULL; - struct fuse_page_desc *page_descs = NULL; + if (req) + fuse_request_init(req); - WARN_ON(npages > FUSE_MAX_MAX_PAGES); - if (npages > FUSE_REQ_INLINE_PAGES) { - pages = fuse_pages_alloc(npages, flags, &page_descs); - if (!pages) { - kmem_cache_free(fuse_req_cachep, req); - return NULL; - } - __set_bit(FR_ALLOC_PAGES, &req->flags); - } else if (npages) { - pages = req->inline_pages; - page_descs = req->inline_page_descs; - } - - fuse_request_init(req, pages, page_descs, npages); - } return req; } -static struct fuse_req *fuse_request_alloc(unsigned int npages) -{ - return __fuse_request_alloc(npages, GFP_KERNEL); -} - -static void fuse_req_pages_free(struct fuse_req *req) -{ - if (test_bit(FR_ALLOC_PAGES, &req->flags)) - kfree(req->pages); -} - static void fuse_request_free(struct fuse_req *req) { - fuse_req_pages_free(req); kmem_cache_free(fuse_req_cachep, req); } @@ -135,8 +102,7 @@ static void fuse_drop_waiting(struct fuse_conn *fc) static void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req); -static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages, - bool for_background) +static struct fuse_req *fuse_get_req(struct fuse_conn *fc, bool for_background) { struct fuse_req *req; int err; @@ -159,7 +125,7 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages, if (fc->conn_error) goto out; - req = fuse_request_alloc(npages); + req = fuse_request_alloc(GFP_KERNEL); err = -ENOMEM; if (!req) { if (for_background) @@ -187,11 +153,6 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages, return ERR_PTR(err); } -static struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned int npages) -{ - return __fuse_get_req(fc, npages, false); -} - static void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) { if (refcount_dec_and_test(&req->count)) { @@ -517,7 +478,7 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) if (args->force) { atomic_inc(&fc->num_waiting); - req = __fuse_request_alloc(0, GFP_KERNEL | __GFP_NOFAIL); + req = fuse_request_alloc(GFP_KERNEL | __GFP_NOFAIL); if (!args->nocreds) fuse_force_creds(fc, req); @@ -526,7 +487,7 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) __set_bit(FR_FORCE, &req->flags); } else { WARN_ON(args->nocreds); - req = fuse_get_req(fc, 0); + req = fuse_get_req(fc, false); if (IS_ERR(req)) return PTR_ERR(req); } @@ -597,13 +558,13 @@ int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args, if (args->force) { WARN_ON(!args->nocreds); - req = __fuse_request_alloc(0, gfp_flags); + req = fuse_request_alloc(gfp_flags); if (!req) return -ENOMEM; __set_bit(FR_BACKGROUND, &req->flags); } else { WARN_ON(args->nocreds); - req = __fuse_get_req(fc, 0, true); + req = fuse_get_req(fc, true); if (IS_ERR(req)) return PTR_ERR(req); } @@ -629,7 +590,7 @@ static int fuse_simple_notify_reply(struct fuse_conn *fc, struct fuse_iqueue *fiq = &fc->iq; int err = 0; - req = fuse_get_req(fc, 0); + req = fuse_get_req(fc, false); if (IS_ERR(req)) return PTR_ERR(req); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 56ed45ef3058..a2ea347c4d2c 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -19,8 +19,8 @@ #include #include -struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags, - struct fuse_page_desc **desc) +static struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags, + struct fuse_page_desc **desc) { struct page **pages; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index ffbc2cd649e3..90cb82d5d62d 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -362,7 +362,6 @@ enum fuse_req_flag { FR_SENT, FR_FINISHED, FR_PRIVATE, - FR_ALLOC_PAGES, }; /** @@ -921,12 +920,6 @@ void fuse_dev_cleanup(void); int fuse_ctl_init(void); void __exit fuse_ctl_cleanup(void); -/** - * Allocate a request - */ -struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags, - struct fuse_page_desc **desc); - /** * Simple request sending that does request allocation and freeing */ From 145b673bd208af97b2f98572f286111ab8e7bc59 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:11 +0200 Subject: [PATCH 324/721] fuse: clean up fuse_req Get rid of request specific fields in fuse_req that are not used anymore. Signed-off-by: Miklos Szeredi --- fs/fuse/fuse_i.h | 45 --------------------------------------------- 1 file changed, 45 deletions(-) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 90cb82d5d62d..a81fdbe62df9 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -47,9 +47,6 @@ /** Number of dentries for each connection in the control filesystem */ #define FUSE_CTL_NUM_DENTRIES 5 -/** Number of page pointers embedded in fuse_req */ -#define FUSE_REQ_INLINE_PAGES 1 - /** List of active connections */ extern struct list_head fuse_conn_list; @@ -397,57 +394,15 @@ struct fuse_req { /** Used to wake up the task waiting for completion of request*/ wait_queue_head_t waitq; - /** Data for asynchronous requests */ - union { - struct { - struct fuse_release_in in; - struct inode *inode; - } release; - struct fuse_init_in init_in; - struct fuse_init_out init_out; - struct cuse_init_in cuse_init_in; - struct { - struct fuse_read_in in; - u64 attr_ver; - } read; - struct { - struct fuse_write_in in; - struct fuse_write_out out; - struct fuse_req *next; - } write; - struct fuse_notify_retrieve_in retrieve_in; - } misc; - /** page vector */ struct page **pages; /** page-descriptor vector */ struct fuse_page_desc *page_descs; - /** size of the 'pages' array */ - unsigned max_pages; - - /** inline page vector */ - struct page *inline_pages[FUSE_REQ_INLINE_PAGES]; - - /** inline page-descriptor vector */ - struct fuse_page_desc inline_page_descs[FUSE_REQ_INLINE_PAGES]; - /** number of pages in vector */ unsigned num_pages; - /** File used in the request (or NULL) */ - struct fuse_file *ff; - - /** Inode used in the request or NULL */ - struct inode *inode; - - /** AIO control block */ - struct fuse_io_priv *io; - - /** Link on fi->writepages */ - struct list_head writepages_entry; - /** Request completion callback */ void (*end)(struct fuse_conn *, struct fuse_req *); From d49937749fef2597f6bcaf2a0ed67e88e347b7fb Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:11 +0200 Subject: [PATCH 325/721] fuse: stop copying args to fuse_req No need to duplicate the argument arrays in fuse_req, so just dereference req->args instead of copying to the fuse_req internal ones. This allows further cleanup of the fuse_req structure. Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 80 ++++++++++++++++-------------------------------- fs/fuse/fuse_i.h | 60 +++++------------------------------- 2 files changed, 34 insertions(+), 106 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index f240d541edfc..dfc0658d990b 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -202,7 +202,8 @@ static unsigned int fuse_req_hash(u64 unique) static void queue_request(struct fuse_iqueue *fiq, struct fuse_req *req) { req->in.h.len = sizeof(struct fuse_in_header) + - len_args(req->in.numargs, (struct fuse_arg *) req->in.args); + len_args(req->args->in_numargs, + (struct fuse_arg *) req->args->in_args); list_add_tail(&req->list, &fiq->pending); wake_up(&fiq->waitq); kill_fasync(&fiq->fasync, SIGIO, POLL_IN); @@ -257,6 +258,7 @@ static void flush_bg_queue(struct fuse_conn *fc) static void request_end(struct fuse_conn *fc, struct fuse_req *req) { struct fuse_iqueue *fiq = &fc->iq; + bool async = req->args->end; if (test_and_set_bit(FR_FINISHED, &req->flags)) goto put_request; @@ -302,8 +304,8 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) wake_up(&req->waitq); } - if (req->end) - req->end(fc, req); + if (async) + req->args->end(fc, req->args, req->out.h.error); put_request: fuse_put_request(fc, req); } @@ -450,25 +452,12 @@ void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args) req->in.h.opcode = args->opcode; req->in.h.nodeid = args->nodeid; - req->in.numargs = args->in_numargs; - memcpy(req->in.args, args->in_args, - args->in_numargs * sizeof(struct fuse_in_arg)); - req->out.argvar = args->out_argvar; - req->out.numargs = args->out_numargs; - memcpy(req->out.args, args->out_args, - args->out_numargs * sizeof(struct fuse_arg)); - if (args->in_pages || args->out_pages) { - req->in.argpages = args->in_pages; - req->out.argpages = args->out_pages; - req->out.page_zeroing = args->page_zeroing; - req->out.page_replace = args->page_replace; - req->pages = ap->pages; req->page_descs = ap->descs; req->num_pages = ap->num_pages; } - + req->args = args; } ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) @@ -502,7 +491,7 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) ret = req->out.h.error; if (!ret && args->out_argvar) { BUG_ON(args->out_numargs == 0); - ret = req->out.args[args->out_numargs - 1].size; + ret = args->out_args[args->out_numargs - 1].size; } fuse_put_request(fc, req); @@ -538,19 +527,6 @@ static bool fuse_request_queue_background(struct fuse_conn *fc, return queued; } -static void fuse_simple_end(struct fuse_conn *fc, struct fuse_req *req) -{ - struct fuse_args *args = req->args; - int err = req->out.h.error; - - if (!err && args->out_argvar) { - BUG_ON(args->out_numargs == 0); - args->out_args[args->out_numargs - 1].size = - req->out.args[args->out_numargs - 1].size; - } - req->args->end(fc, req->args, req->out.h.error); -} - int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args, gfp_t gfp_flags) { @@ -571,9 +547,6 @@ int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args, fuse_args_to_req(req, args); - req->args = args; - req->end = fuse_simple_end; - if (!fuse_request_queue_background(fc, req)) { fuse_put_request(fc, req); return -ENOTCONN; @@ -598,8 +571,6 @@ static int fuse_simple_notify_reply(struct fuse_conn *fc, req->in.h.unique = unique; fuse_args_to_req(req, args); - req->args = args; - req->end = fuse_simple_end; spin_lock(&fiq->lock); if (fiq->connected) { @@ -1197,7 +1168,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, struct fuse_iqueue *fiq = &fc->iq; struct fuse_pqueue *fpq = &fud->pq; struct fuse_req *req; - struct fuse_in *in; + struct fuse_args *args; unsigned reqsize; unsigned int hash; @@ -1258,14 +1229,14 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, list_del_init(&req->list); spin_unlock(&fiq->lock); - in = &req->in; - reqsize = in->h.len; + args = req->args; + reqsize = req->in.h.len; /* If request is too large, reply with an error and restart the read */ if (nbytes < reqsize) { req->out.h.error = -EIO; /* SETXATTR is special, since it may contain too large data */ - if (in->h.opcode == FUSE_SETXATTR) + if (args->opcode == FUSE_SETXATTR) req->out.h.error = -E2BIG; request_end(fc, req); goto restart; @@ -1274,10 +1245,10 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, list_add(&req->list, &fpq->io); spin_unlock(&fpq->lock); cs->req = req; - err = fuse_copy_one(cs, &in->h, sizeof(in->h)); + err = fuse_copy_one(cs, &req->in.h, sizeof(req->in.h)); if (!err) - err = fuse_copy_args(cs, in->numargs, in->argpages, - (struct fuse_arg *) in->args, 0); + err = fuse_copy_args(cs, args->in_numargs, args->in_pages, + (struct fuse_arg *) args->in_args, 0); fuse_copy_finish(cs); spin_lock(&fpq->lock); clear_bit(FR_LOCKED, &req->flags); @@ -1808,27 +1779,25 @@ static struct fuse_req *request_find(struct fuse_pqueue *fpq, u64 unique) return NULL; } -static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out, +static int copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args, unsigned nbytes) { unsigned reqsize = sizeof(struct fuse_out_header); - if (out->h.error) - return nbytes != reqsize ? -EINVAL : 0; + reqsize += len_args(args->out_numargs, args->out_args); - reqsize += len_args(out->numargs, out->args); - - if (reqsize < nbytes || (reqsize > nbytes && !out->argvar)) + if (reqsize < nbytes || (reqsize > nbytes && !args->out_argvar)) return -EINVAL; else if (reqsize > nbytes) { - struct fuse_arg *lastarg = &out->args[out->numargs-1]; + struct fuse_arg *lastarg = &args->out_args[args->out_numargs-1]; unsigned diffsize = reqsize - nbytes; + if (diffsize > lastarg->size) return -EINVAL; lastarg->size -= diffsize; } - return fuse_copy_args(cs, out->numargs, out->argpages, out->args, - out->page_zeroing); + return fuse_copy_args(cs, args->out_numargs, args->out_pages, + args->out_args, args->page_zeroing); } /* @@ -1907,10 +1876,13 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, set_bit(FR_LOCKED, &req->flags); spin_unlock(&fpq->lock); cs->req = req; - if (!req->out.page_replace) + if (!req->args->page_replace) cs->move_pages = 0; - err = copy_out_args(cs, &req->out, nbytes); + if (oh.error) + err = nbytes != sizeof(oh) ? -EINVAL : 0; + else + err = copy_out_args(cs, req->args, nbytes); fuse_copy_finish(cs); spin_lock(&fpq->lock); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index a81fdbe62df9..9454f2328bd0 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -224,57 +224,12 @@ struct fuse_in_arg { const void *value; }; -/** The request input */ -struct fuse_in { - /** The request header */ - struct fuse_in_header h; - - /** True if the data for the last argument is in req->pages */ - unsigned argpages:1; - - /** Number of arguments */ - unsigned numargs; - - /** Array of arguments */ - struct fuse_in_arg args[3]; -}; - /** One output argument of a request */ struct fuse_arg { unsigned size; void *value; }; -/** The request output */ -struct fuse_out { - /** Header returned from userspace */ - struct fuse_out_header h; - - /* - * The following bitfields are not changed during the request - * processing - */ - - /** Last argument is variable length (can be shorter than - arg->size) */ - unsigned argvar:1; - - /** Last argument is a list of pages to copy data to */ - unsigned argpages:1; - - /** Zero partially or not copied pages */ - unsigned page_zeroing:1; - - /** Pages may be replaced with new ones */ - unsigned page_replace:1; - - /** Number or arguments */ - unsigned numargs; - - /** Array of arguments */ - struct fuse_arg args[2]; -}; - /** FUSE page descriptor */ struct fuse_page_desc { unsigned int length; @@ -385,11 +340,15 @@ struct fuse_req { /* Request flags, updated with test/set/clear_bit() */ unsigned long flags; - /** The request input */ - struct fuse_in in; + /* The request input header */ + struct { + struct fuse_in_header h; + } in; - /** The request output */ - struct fuse_out out; + /* The request output header */ + struct { + struct fuse_out_header h; + } out; /** Used to wake up the task waiting for completion of request*/ wait_queue_head_t waitq; @@ -403,9 +362,6 @@ struct fuse_req { /** number of pages in vector */ unsigned num_pages; - /** Request completion callback */ - void (*end)(struct fuse_conn *, struct fuse_req *); - }; struct fuse_iqueue { From 05ea48cc2b098c533193bb058b82aa016a8361bc Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 10 Sep 2019 15:04:11 +0200 Subject: [PATCH 326/721] fuse: stop copying pages to fuse_req The page array pointers are also duplicated across fuse_args_pages and fuse_req. Get rid of the fuse_req ones. Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 18 ++++++------------ fs/fuse/fuse_i.h | 9 --------- 2 files changed, 6 insertions(+), 21 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index dfc0658d990b..d7cae9001ca7 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -448,15 +448,8 @@ static void fuse_force_creds(struct fuse_conn *fc, struct fuse_req *req) void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args) { - struct fuse_args_pages *ap = container_of(args, typeof(*ap), args); - req->in.h.opcode = args->opcode; req->in.h.nodeid = args->nodeid; - if (args->in_pages || args->out_pages) { - req->pages = ap->pages; - req->page_descs = ap->descs; - req->num_pages = ap->num_pages; - } req->args = args; } @@ -939,14 +932,15 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes, { unsigned i; struct fuse_req *req = cs->req; + struct fuse_args_pages *ap = container_of(req->args, typeof(*ap), args); - for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) { + + for (i = 0; i < ap->num_pages && (nbytes || zeroing); i++) { int err; - unsigned offset = req->page_descs[i].offset; - unsigned count = min(nbytes, req->page_descs[i].length); + unsigned int offset = ap->descs[i].offset; + unsigned int count = min(nbytes, ap->descs[i].length); - err = fuse_copy_page(cs, &req->pages[i], offset, count, - zeroing); + err = fuse_copy_page(cs, &ap->pages[i], offset, count, zeroing); if (err) return err; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 9454f2328bd0..378f1fe69d07 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -353,15 +353,6 @@ struct fuse_req { /** Used to wake up the task waiting for completion of request*/ wait_queue_head_t waitq; - /** page vector */ - struct page **pages; - - /** page-descriptor vector */ - struct fuse_page_desc *page_descs; - - /** number of pages in vector */ - unsigned num_pages; - }; struct fuse_iqueue { From f22f812d5ce75a18b56073a7a63862e6ea764070 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 12 Sep 2019 14:28:13 +0200 Subject: [PATCH 327/721] fuse: fix request limit The size of struct fuse_req was reduced from 392B to 144B on a non-debug config, thus the sanitize_global_limit() helper was setting a larger default limit. This doesn't really reflect reduction in the memory used by requests, since the fields removed from fuse_req were added to fuse_args derived structs; e.g. sizeof(struct fuse_writepages_args) is 248B, thus resulting in slightly more memory being used for writepage requests overalll (due to using 256B slabs). Make the calculatation ignore the size of fuse_req and use the old 392B value. Signed-off-by: Miklos Szeredi --- fs/fuse/inode.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 2cec25ad54b7..4404d21649ff 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -826,9 +826,12 @@ static const struct super_operations fuse_super_operations = { static void sanitize_global_limit(unsigned *limit) { + /* + * The default maximum number of async requests is calculated to consume + * 1/2^13 of the total memory, assuming 392 bytes per request. + */ if (*limit == 0) - *limit = ((totalram_pages() << PAGE_SHIFT) >> 13) / - sizeof(struct fuse_req); + *limit = ((totalram_pages() << PAGE_SHIFT) >> 13) / 392; if (*limit >= 1 << 16) *limit = (1 << 16) - 1; From 04ec5af0776e9baefed59891f12adbcb5fa71a23 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 21 Jun 2018 09:33:40 +0100 Subject: [PATCH 328/721] fuse: export fuse_end_request() virtio-fs will need to complete requests from outside fs/fuse/dev.c. Make the symbol visible. Signed-off-by: Stefan Hajnoczi Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 19 ++++++++++--------- fs/fuse/fuse_i.h | 5 +++++ 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index d7cae9001ca7..2696f000df67 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -255,7 +255,7 @@ static void flush_bg_queue(struct fuse_conn *fc) * the 'end' callback is called if given, else the reference to the * request is released */ -static void request_end(struct fuse_conn *fc, struct fuse_req *req) +void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req) { struct fuse_iqueue *fiq = &fc->iq; bool async = req->args->end; @@ -309,6 +309,7 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) put_request: fuse_put_request(fc, req); } +EXPORT_SYMBOL_GPL(fuse_request_end); static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) { @@ -396,12 +397,12 @@ static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) req->in.h.unique = fuse_get_unique(fiq); queue_request(fiq, req); /* acquire extra reference, since request is still needed - after request_end() */ + after fuse_request_end() */ __fuse_get_request(req); spin_unlock(&fiq->lock); request_wait_answer(fc, req); - /* Pairs with smp_wmb() in request_end() */ + /* Pairs with smp_wmb() in fuse_request_end() */ smp_rmb(); } } @@ -1151,7 +1152,7 @@ __releases(fiq->lock) * the pending list and copies request data to userspace buffer. If * no reply is needed (FORGET) or request has been aborted or there * was an error during the copying then it's finished by calling - * request_end(). Otherwise add it to the processing list, and set + * fuse_request_end(). Otherwise add it to the processing list, and set * the 'sent' flag. */ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, @@ -1232,7 +1233,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, /* SETXATTR is special, since it may contain too large data */ if (args->opcode == FUSE_SETXATTR) req->out.h.error = -E2BIG; - request_end(fc, req); + fuse_request_end(fc, req); goto restart; } spin_lock(&fpq->lock); @@ -1275,7 +1276,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, if (!test_bit(FR_PRIVATE, &req->flags)) list_del_init(&req->list); spin_unlock(&fpq->lock); - request_end(fc, req); + fuse_request_end(fc, req); return err; err_unlock: @@ -1799,7 +1800,7 @@ static int copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args, * the write buffer. The request is then searched on the processing * list by the unique ID found in the header. If found, then remove * it from the list and copy the rest of the buffer to the request. - * The request is finished by calling request_end() + * The request is finished by calling fuse_request_end(). */ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, struct fuse_copy_state *cs, size_t nbytes) @@ -1889,7 +1890,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, list_del_init(&req->list); spin_unlock(&fpq->lock); - request_end(fc, req); + fuse_request_end(fc, req); out: return err ? err : nbytes; @@ -2029,7 +2030,7 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head) req->out.h.error = -ECONNABORTED; clear_bit(FR_SENT, &req->flags); list_del_init(&req->list); - request_end(fc, req); + fuse_request_end(fc, req); } } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 378f1fe69d07..5c50d3da1b0d 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -829,6 +829,11 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args); int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args, gfp_t gfp_flags); +/** + * End a finished request + */ +void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req); + /* Abort all requests */ void fuse_abort_conn(struct fuse_conn *fc); void fuse_wait_aborted(struct fuse_conn *fc); From 14d46d7abc3973a47e8eb0eb5eb87ee8d910a505 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 21 Jun 2018 09:34:25 +0100 Subject: [PATCH 329/721] fuse: export fuse_len_args() virtio-fs will need to query the length of fuse_arg lists. Make the symbol visible. Signed-off-by: Stefan Hajnoczi Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 9 +++++---- fs/fuse/fuse_i.h | 5 +++++ 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 2696f000df67..8f36e6485bb2 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -177,7 +177,7 @@ static void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) } EXPORT_SYMBOL_GPL(fuse_put_request); -static unsigned len_args(unsigned numargs, struct fuse_arg *args) +unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args) { unsigned nbytes = 0; unsigned i; @@ -187,6 +187,7 @@ static unsigned len_args(unsigned numargs, struct fuse_arg *args) return nbytes; } +EXPORT_SYMBOL_GPL(fuse_len_args); static u64 fuse_get_unique(struct fuse_iqueue *fiq) { @@ -202,8 +203,8 @@ static unsigned int fuse_req_hash(u64 unique) static void queue_request(struct fuse_iqueue *fiq, struct fuse_req *req) { req->in.h.len = sizeof(struct fuse_in_header) + - len_args(req->args->in_numargs, - (struct fuse_arg *) req->args->in_args); + fuse_len_args(req->args->in_numargs, + (struct fuse_arg *) req->args->in_args); list_add_tail(&req->list, &fiq->pending); wake_up(&fiq->waitq); kill_fasync(&fiq->fasync, SIGIO, POLL_IN); @@ -1779,7 +1780,7 @@ static int copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args, { unsigned reqsize = sizeof(struct fuse_out_header); - reqsize += len_args(args->out_numargs, args->out_args); + reqsize += fuse_len_args(args->out_numargs, args->out_args); if (reqsize < nbytes || (reqsize > nbytes && !args->out_argvar)) return -EINVAL; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 5c50d3da1b0d..39ef981a618c 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -971,4 +971,9 @@ int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type); /* readdir.c */ int fuse_readdir(struct file *file, struct dir_context *ctx); +/** + * Return the number of bytes in an arguments list + */ +unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args); + #endif /* _FS_FUSE_I_H */ From 95a84cdb11c26315a6d34664846f82c438c961a1 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 6 Mar 2019 16:51:39 -0500 Subject: [PATCH 330/721] fuse: export fuse_send_init_request() This will be used by virtio-fs to send init request to fuse server after initialization of virt queues. Signed-off-by: Vivek Goyal Signed-off-by: Miklos Szeredi --- fs/fuse/fuse_i.h | 1 + fs/fuse/inode.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 39ef981a618c..7192080a06d0 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -867,6 +867,7 @@ void fuse_conn_put(struct fuse_conn *fc); struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc); void fuse_dev_free(struct fuse_dev *fud); +void fuse_send_init(struct fuse_conn *fc); /** * Add connection to control filesystem diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 4404d21649ff..5d455f4d6195 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -972,7 +972,7 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_args *args, wake_up_all(&fc->blocked_waitq); } -static void fuse_send_init(struct fuse_conn *fc) +void fuse_send_init(struct fuse_conn *fc) { struct fuse_init_args *ia; @@ -1009,6 +1009,7 @@ static void fuse_send_init(struct fuse_conn *fc) if (fuse_simple_background(fc, &ia->args, GFP_KERNEL) != 0) process_init_reply(fc, &ia->args, -ENOTCONN); } +EXPORT_SYMBOL_GPL(fuse_send_init); static void fuse_free_conn(struct fuse_conn *fc) { From 79d96efffda7597b41968d5d8813b39fc2965f1b Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Fri, 22 Jun 2018 13:48:30 +0100 Subject: [PATCH 331/721] fuse: export fuse_get_unique() virtio-fs will need unique IDs for FORGET requests from outside fs/fuse/dev.c. Make the symbol visible. Signed-off-by: Stefan Hajnoczi Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 3 ++- fs/fuse/fuse_i.h | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 8f36e6485bb2..151176a99dc2 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -189,11 +189,12 @@ unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args) } EXPORT_SYMBOL_GPL(fuse_len_args); -static u64 fuse_get_unique(struct fuse_iqueue *fiq) +u64 fuse_get_unique(struct fuse_iqueue *fiq) { fiq->reqctr += FUSE_REQ_ID_STEP; return fiq->reqctr; } +EXPORT_SYMBOL_GPL(fuse_get_unique); static unsigned int fuse_req_hash(u64 unique) { diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 7192080a06d0..b868b71d47d9 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -977,4 +977,9 @@ int fuse_readdir(struct file *file, struct dir_context *ctx); */ unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args); +/** + * Get the next unique ID for a request + */ +u64 fuse_get_unique(struct fuse_iqueue *fiq); + #endif /* _FS_FUSE_I_H */ From 4388c5aac4bae5c83a2c66882043942002ba09a2 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 5 Jun 2019 15:50:43 -0400 Subject: [PATCH 332/721] fuse: export fuse_dequeue_forget() function File systems like virtio-fs need to do not have to play directly with forget list data structures. There is a helper function use that instead. Rename dequeue_forget() to fuse_dequeue_forget() and export it so that stacked filesystems can use it. Signed-off-by: Vivek Goyal Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 13 +++++++------ fs/fuse/fuse_i.h | 4 ++++ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 151176a99dc2..943bc5cf941a 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1033,9 +1033,9 @@ __releases(fiq->lock) return err ? err : reqsize; } -static struct fuse_forget_link *dequeue_forget(struct fuse_iqueue *fiq, - unsigned max, - unsigned *countp) +struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq, + unsigned int max, + unsigned int *countp) { struct fuse_forget_link *head = fiq->forget_list_head.next; struct fuse_forget_link **newhead = &head; @@ -1054,6 +1054,7 @@ static struct fuse_forget_link *dequeue_forget(struct fuse_iqueue *fiq, return head; } +EXPORT_SYMBOL(fuse_dequeue_forget); static int fuse_read_single_forget(struct fuse_iqueue *fiq, struct fuse_copy_state *cs, @@ -1061,7 +1062,7 @@ static int fuse_read_single_forget(struct fuse_iqueue *fiq, __releases(fiq->lock) { int err; - struct fuse_forget_link *forget = dequeue_forget(fiq, 1, NULL); + struct fuse_forget_link *forget = fuse_dequeue_forget(fiq, 1, NULL); struct fuse_forget_in arg = { .nlookup = forget->forget_one.nlookup, }; @@ -1109,7 +1110,7 @@ __releases(fiq->lock) } max_forgets = (nbytes - ih.len) / sizeof(struct fuse_forget_one); - head = dequeue_forget(fiq, max_forgets, &count); + head = fuse_dequeue_forget(fiq, max_forgets, &count); spin_unlock(&fiq->lock); arg.count = count; @@ -2119,7 +2120,7 @@ void fuse_abort_conn(struct fuse_conn *fc) clear_bit(FR_PENDING, &req->flags); list_splice_tail_init(&fiq->pending, &to_end); while (forget_pending(fiq)) - kfree(dequeue_forget(fiq, 1, NULL)); + kfree(fuse_dequeue_forget(fiq, 1, NULL)); wake_up_all(&fiq->waitq); spin_unlock(&fiq->lock); kill_fasync(&fiq->fasync, SIGIO, POLL_IN); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index b868b71d47d9..5f910c99e8dd 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -729,6 +729,10 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, struct fuse_forget_link *fuse_alloc_forget(void); +struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq, + unsigned int max, + unsigned int *countp); + /* * Initialize READ or READDIR request */ From 0cc2656cdb0b1f234e6d29378cb061e29d7522bc Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Wed, 13 Jun 2018 10:23:04 +0100 Subject: [PATCH 333/721] fuse: extract fuse_fill_super_common() fuse_fill_super() includes code to process the fd= option and link the struct fuse_dev to the fd's struct file. In virtio-fs there is no file descriptor because /dev/fuse is not used. This patch extracts fuse_fill_super_common() so that both classic fuse and virtio-fs can share the code to initialize a mount. Signed-off-by: Stefan Hajnoczi Signed-off-by: Miklos Szeredi --- fs/fuse/fuse_i.h | 27 +++++++++++ fs/fuse/inode.c | 114 ++++++++++++++++++++++------------------------- 2 files changed, 81 insertions(+), 60 deletions(-) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 5f910c99e8dd..1902148281cc 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -416,6 +416,26 @@ struct fuse_dev { struct list_head entry; }; +struct fuse_fs_context { + int fd; + unsigned int rootmode; + kuid_t user_id; + kgid_t group_id; + bool is_bdev:1; + bool fd_present:1; + bool rootmode_present:1; + bool user_id_present:1; + bool group_id_present:1; + bool default_permissions:1; + bool allow_other:1; + unsigned int max_read; + unsigned int blksize; + const char *subtype; + + /* fuse_dev pointer to fill in, should contain NULL on entry */ + void **fudptr; +}; + /** * A Fuse connection. * @@ -873,6 +893,13 @@ struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc); void fuse_dev_free(struct fuse_dev *fud); void fuse_send_init(struct fuse_conn *fc); +/** + * Fill in superblock and initialize fuse connection + * @sb: partially-initialized superblock to fill in + * @ctx: mount context + */ +int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx); + /** * Add connection to control filesystem */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 5d455f4d6195..30d92e633ece 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -64,23 +64,6 @@ MODULE_PARM_DESC(max_user_congthresh, static struct file_system_type fuseblk_fs_type; #endif -struct fuse_fs_context { - const char *subtype; - bool is_bdev; - int fd; - unsigned rootmode; - kuid_t user_id; - kgid_t group_id; - unsigned fd_present:1; - unsigned rootmode_present:1; - unsigned user_id_present:1; - unsigned group_id_present:1; - unsigned default_permissions:1; - unsigned allow_other:1; - unsigned max_read; - unsigned blksize; -}; - struct fuse_forget_link *fuse_alloc_forget(void) { return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL); @@ -1100,16 +1083,13 @@ void fuse_dev_free(struct fuse_dev *fud) } EXPORT_SYMBOL_GPL(fuse_dev_free); -static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) +int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) { - struct fuse_fs_context *ctx = fsc->fs_private; struct fuse_dev *fud; - struct fuse_conn *fc; + struct fuse_conn *fc = get_fuse_conn_super(sb); struct inode *root; - struct file *file; struct dentry *root_dentry; int err; - int is_bdev = sb->s_bdev != NULL; err = -EINVAL; if (sb->s_flags & SB_MANDLOCK) @@ -1117,7 +1097,7 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION); - if (is_bdev) { + if (ctx->is_bdev) { #ifdef CONFIG_BLOCK err = -EINVAL; if (!sb_set_blocksize(sb, ctx->blksize)) @@ -1140,19 +1120,6 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) if (sb->s_user_ns != &init_user_ns) sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER; - file = fget(ctx->fd); - err = -EINVAL; - if (!file) - goto err; - - /* - * Require mount to happen from the same user namespace which - * opened /dev/fuse to prevent potential attacks. - */ - if (file->f_op != &fuse_dev_operations || - file->f_cred->user_ns != sb->s_user_ns) - goto err_fput; - /* * If we are not in the initial user namespace posix * acls must be translated. @@ -1160,17 +1127,9 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) if (sb->s_user_ns != &init_user_ns) sb->s_xattr = fuse_no_acl_xattr_handlers; - fc = kmalloc(sizeof(*fc), GFP_KERNEL); - err = -ENOMEM; - if (!fc) - goto err_fput; - - fuse_conn_init(fc, sb->s_user_ns); - fc->release = fuse_free_conn; - fud = fuse_dev_alloc(fc); if (!fud) - goto err_put_conn; + goto err; fc->dev = sb->s_dev; fc->sb = sb; @@ -1188,10 +1147,7 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) fc->user_id = ctx->user_id; fc->group_id = ctx->group_id; fc->max_read = max_t(unsigned, 4096, ctx->max_read); - fc->destroy = is_bdev; - - /* Used by get_root_inode() */ - sb->s_fs_info = fc; + fc->destroy = ctx->is_bdev; err = -ENOMEM; root = fuse_get_root_inode(sb, ctx->rootmode); @@ -1204,7 +1160,7 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) mutex_lock(&fuse_mutex); err = -EINVAL; - if (file->private_data) + if (*ctx->fudptr) goto err_unlock; err = fuse_ctl_add_conn(fc); @@ -1213,17 +1169,8 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) list_add_tail(&fc->entry, &fuse_conn_list); sb->s_root = root_dentry; - file->private_data = fud; + *ctx->fudptr = fud; mutex_unlock(&fuse_mutex); - /* - * atomic_dec_and_test() in fput() provides the necessary - * memory barrier for file->private_data to be visible on all - * CPUs after this - */ - fput(file); - - fuse_send_init(fc); - return 0; err_unlock: @@ -1231,6 +1178,53 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) dput(root_dentry); err_dev_free: fuse_dev_free(fud); + err: + return err; +} +EXPORT_SYMBOL_GPL(fuse_fill_super_common); + +static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) +{ + struct fuse_fs_context *ctx = fsc->fs_private; + struct file *file; + int err; + struct fuse_conn *fc; + + err = -EINVAL; + file = fget(ctx->fd); + if (!file) + goto err; + + /* + * Require mount to happen from the same user namespace which + * opened /dev/fuse to prevent potential attacks. + */ + if ((file->f_op != &fuse_dev_operations) || + (file->f_cred->user_ns != sb->s_user_ns)) + goto err_fput; + ctx->fudptr = &file->private_data; + + fc = kmalloc(sizeof(*fc), GFP_KERNEL); + err = -ENOMEM; + if (!fc) + goto err_fput; + + fuse_conn_init(fc, sb->s_user_ns); + fc->release = fuse_free_conn; + sb->s_fs_info = fc; + + err = fuse_fill_super_common(sb, ctx); + if (err) + goto err_put_conn; + /* + * atomic_dec_and_test() in fput() provides the necessary + * memory barrier for file->private_data to be visible on all + * CPUs after this + */ + fput(file); + fuse_send_init(get_fuse_conn_super(sb)); + return 0; + err_put_conn: fuse_conn_put(fc); sb->s_fs_info = NULL; From ae3aad77f46fbba56eff7141b2fc49870b60827e Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 18 Jun 2018 15:53:19 +0100 Subject: [PATCH 334/721] fuse: add fuse_iqueue_ops callbacks The /dev/fuse device uses fiq->waitq and fasync to signal that requests are available. These mechanisms do not apply to virtio-fs. This patch introduces callbacks so alternative behavior can be used. Note that queue_interrupt() changes along these lines: spin_lock(&fiq->waitq.lock); wake_up_locked(&fiq->waitq); + kill_fasync(&fiq->fasync, SIGIO, POLL_IN); spin_unlock(&fiq->waitq.lock); - kill_fasync(&fiq->fasync, SIGIO, POLL_IN); Since queue_request() and queue_forget() also call kill_fasync() inside the spinlock this should be safe. Signed-off-by: Stefan Hajnoczi Signed-off-by: Miklos Szeredi --- fs/fuse/cuse.c | 2 +- fs/fuse/dev.c | 46 ++++++++++++++++++++++++++++++---------------- fs/fuse/fuse_i.h | 42 +++++++++++++++++++++++++++++++++++++++++- fs/fuse/inode.c | 13 +++++++++---- 4 files changed, 81 insertions(+), 22 deletions(-) diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 2332e7f960a8..6a0de0ce4403 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -506,7 +506,7 @@ static int cuse_channel_open(struct inode *inode, struct file *file) * Limit the cuse channel to requests that can * be represented in file->f_cred->user_ns. */ - fuse_conn_init(&cc->fc, file->f_cred->user_ns); + fuse_conn_init(&cc->fc, file->f_cred->user_ns, &fuse_dev_fiq_ops, NULL); fud = fuse_dev_alloc(&cc->fc); if (!fud) { diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 943bc5cf941a..358a01435058 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -201,14 +201,33 @@ static unsigned int fuse_req_hash(u64 unique) return hash_long(unique & ~FUSE_INT_REQ_BIT, FUSE_PQ_HASH_BITS); } -static void queue_request(struct fuse_iqueue *fiq, struct fuse_req *req) +/** + * A new request is available, wake fiq->waitq + */ +static void fuse_dev_wake_and_unlock(struct fuse_iqueue *fiq) +__releases(fiq->lock) +{ + wake_up(&fiq->waitq); + kill_fasync(&fiq->fasync, SIGIO, POLL_IN); + spin_unlock(&fiq->lock); +} + +const struct fuse_iqueue_ops fuse_dev_fiq_ops = { + .wake_forget_and_unlock = fuse_dev_wake_and_unlock, + .wake_interrupt_and_unlock = fuse_dev_wake_and_unlock, + .wake_pending_and_unlock = fuse_dev_wake_and_unlock, +}; +EXPORT_SYMBOL_GPL(fuse_dev_fiq_ops); + +static void queue_request_and_unlock(struct fuse_iqueue *fiq, + struct fuse_req *req) +__releases(fiq->lock) { req->in.h.len = sizeof(struct fuse_in_header) + fuse_len_args(req->args->in_numargs, (struct fuse_arg *) req->args->in_args); list_add_tail(&req->list, &fiq->pending); - wake_up(&fiq->waitq); - kill_fasync(&fiq->fasync, SIGIO, POLL_IN); + fiq->ops->wake_pending_and_unlock(fiq); } void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, @@ -223,12 +242,11 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, if (fiq->connected) { fiq->forget_list_tail->next = forget; fiq->forget_list_tail = forget; - wake_up(&fiq->waitq); - kill_fasync(&fiq->fasync, SIGIO, POLL_IN); + fiq->ops->wake_forget_and_unlock(fiq); } else { kfree(forget); + spin_unlock(&fiq->lock); } - spin_unlock(&fiq->lock); } static void flush_bg_queue(struct fuse_conn *fc) @@ -244,8 +262,7 @@ static void flush_bg_queue(struct fuse_conn *fc) fc->active_background++; spin_lock(&fiq->lock); req->in.h.unique = fuse_get_unique(fiq); - queue_request(fiq, req); - spin_unlock(&fiq->lock); + queue_request_and_unlock(fiq, req); } } @@ -334,10 +351,10 @@ static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) spin_unlock(&fiq->lock); return 0; } - wake_up(&fiq->waitq); - kill_fasync(&fiq->fasync, SIGIO, POLL_IN); + fiq->ops->wake_interrupt_and_unlock(fiq); + } else { + spin_unlock(&fiq->lock); } - spin_unlock(&fiq->lock); return 0; } @@ -397,11 +414,10 @@ static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) req->out.h.error = -ENOTCONN; } else { req->in.h.unique = fuse_get_unique(fiq); - queue_request(fiq, req); /* acquire extra reference, since request is still needed after fuse_request_end() */ __fuse_get_request(req); - spin_unlock(&fiq->lock); + queue_request_and_unlock(fiq, req); request_wait_answer(fc, req); /* Pairs with smp_wmb() in fuse_request_end() */ @@ -570,14 +586,12 @@ static int fuse_simple_notify_reply(struct fuse_conn *fc, spin_lock(&fiq->lock); if (fiq->connected) { - queue_request(fiq, req); - spin_unlock(&fiq->lock); + queue_request_and_unlock(fiq, req); } else { err = -ENODEV; spin_unlock(&fiq->lock); fuse_put_request(fc, req); } - spin_unlock(&fiq->lock); return err; } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 1902148281cc..8c13865955d4 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -355,6 +355,39 @@ struct fuse_req { }; +struct fuse_iqueue; + +/** + * Input queue callbacks + * + * Input queue signalling is device-specific. For example, the /dev/fuse file + * uses fiq->waitq and fasync to wake processes that are waiting on queue + * readiness. These callbacks allow other device types to respond to input + * queue activity. + */ +struct fuse_iqueue_ops { + /** + * Signal that a forget has been queued + */ + void (*wake_forget_and_unlock)(struct fuse_iqueue *fiq) + __releases(fiq->lock); + + /** + * Signal that an INTERRUPT request has been queued + */ + void (*wake_interrupt_and_unlock)(struct fuse_iqueue *fiq) + __releases(fiq->lock); + + /** + * Signal that a request has been queued + */ + void (*wake_pending_and_unlock)(struct fuse_iqueue *fiq) + __releases(fiq->lock); +}; + +/** /dev/fuse input queue operations */ +extern const struct fuse_iqueue_ops fuse_dev_fiq_ops; + struct fuse_iqueue { /** Connection established */ unsigned connected; @@ -383,6 +416,12 @@ struct fuse_iqueue { /** O_ASYNC requests */ struct fasync_struct *fasync; + + /** Device-specific callbacks */ + const struct fuse_iqueue_ops *ops; + + /** Device-specific state */ + void *priv; }; #define FUSE_PQ_HASH_BITS 8 @@ -882,7 +921,8 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc); /** * Initialize fuse_conn */ -void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns); +void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns, + const struct fuse_iqueue_ops *fiq_ops, void *fiq_priv); /** * Release reference to fuse_conn diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 30d92e633ece..734fdd597c3e 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -568,7 +568,9 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root) return 0; } -static void fuse_iqueue_init(struct fuse_iqueue *fiq) +static void fuse_iqueue_init(struct fuse_iqueue *fiq, + const struct fuse_iqueue_ops *ops, + void *priv) { memset(fiq, 0, sizeof(struct fuse_iqueue)); spin_lock_init(&fiq->lock); @@ -577,6 +579,8 @@ static void fuse_iqueue_init(struct fuse_iqueue *fiq) INIT_LIST_HEAD(&fiq->interrupts); fiq->forget_list_tail = &fiq->forget_list_head; fiq->connected = 1; + fiq->ops = ops; + fiq->priv = priv; } static void fuse_pqueue_init(struct fuse_pqueue *fpq) @@ -590,7 +594,8 @@ static void fuse_pqueue_init(struct fuse_pqueue *fpq) fpq->connected = 1; } -void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns) +void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns, + const struct fuse_iqueue_ops *fiq_ops, void *fiq_priv) { memset(fc, 0, sizeof(*fc)); spin_lock_init(&fc->lock); @@ -599,7 +604,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns) refcount_set(&fc->count, 1); atomic_set(&fc->dev_count, 1); init_waitqueue_head(&fc->blocked_waitq); - fuse_iqueue_init(&fc->iq); + fuse_iqueue_init(&fc->iq, fiq_ops, fiq_priv); INIT_LIST_HEAD(&fc->bg_queue); INIT_LIST_HEAD(&fc->entry); INIT_LIST_HEAD(&fc->devices); @@ -1209,7 +1214,7 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) if (!fc) goto err_fput; - fuse_conn_init(fc, sb->s_user_ns); + fuse_conn_init(fc, sb->s_user_ns, &fuse_dev_fiq_ops, NULL); fc->release = fuse_free_conn; sb->s_fs_info = fc; From 0cd1eb9a4160a96e0ec9b93b2e7b489f449bf22d Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 6 Mar 2019 16:51:40 -0500 Subject: [PATCH 335/721] fuse: separate fuse device allocation and installation in fuse_conn As of now fuse_dev_alloc() both allocates a fuse device and installs it in fuse_conn list. fuse_dev_alloc() can fail if fuse_device allocation fails. virtio-fs needs to initialize multiple fuse devices (one per virtio queue). It initializes one fuse device as part of call to fuse_fill_super_common() and rest of the devices are allocated and installed after that. But, we can't afford to fail after calling fuse_fill_super_common() as we don't have a way to undo all the actions done by fuse_fill_super_common(). So to avoid failures after the call to fuse_fill_super_common(), pre-allocate all fuse devices early and install them into fuse connection later. This patch provides two separate helpers for fuse device allocation and fuse device installation in fuse_conn. Signed-off-by: Vivek Goyal Signed-off-by: Miklos Szeredi --- fs/fuse/cuse.c | 2 +- fs/fuse/dev.c | 2 +- fs/fuse/fuse_i.h | 4 +++- fs/fuse/inode.c | 31 ++++++++++++++++++++++++------- 4 files changed, 29 insertions(+), 10 deletions(-) diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 6a0de0ce4403..45762bb7a934 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -508,7 +508,7 @@ static int cuse_channel_open(struct inode *inode, struct file *file) */ fuse_conn_init(&cc->fc, file->f_cred->user_ns, &fuse_dev_fiq_ops, NULL); - fud = fuse_dev_alloc(&cc->fc); + fud = fuse_dev_alloc_install(&cc->fc); if (!fud) { kfree(cc); return -ENOMEM; diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 358a01435058..b2e18861c59c 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2203,7 +2203,7 @@ static int fuse_device_clone(struct fuse_conn *fc, struct file *new) if (new->private_data) return -EINVAL; - fud = fuse_dev_alloc(fc); + fud = fuse_dev_alloc_install(fc); if (!fud) return -ENOMEM; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 8c13865955d4..242d47752e78 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -929,7 +929,9 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns, */ void fuse_conn_put(struct fuse_conn *fc); -struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc); +struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc); +struct fuse_dev *fuse_dev_alloc(void); +void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc); void fuse_dev_free(struct fuse_dev *fud); void fuse_send_init(struct fuse_conn *fc); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 734fdd597c3e..6a0f31faaeaa 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1045,7 +1045,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) return 0; } -struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc) +struct fuse_dev *fuse_dev_alloc(void) { struct fuse_dev *fud; struct list_head *pq; @@ -1061,17 +1061,34 @@ struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc) } fud->pq.processing = pq; - fud->fc = fuse_conn_get(fc); fuse_pqueue_init(&fud->pq); - spin_lock(&fc->lock); - list_add_tail(&fud->entry, &fc->devices); - spin_unlock(&fc->lock); - return fud; } EXPORT_SYMBOL_GPL(fuse_dev_alloc); +void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc) +{ + fud->fc = fuse_conn_get(fc); + spin_lock(&fc->lock); + list_add_tail(&fud->entry, &fc->devices); + spin_unlock(&fc->lock); +} +EXPORT_SYMBOL_GPL(fuse_dev_install); + +struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc) +{ + struct fuse_dev *fud; + + fud = fuse_dev_alloc(); + if (!fud) + return NULL; + + fuse_dev_install(fud, fc); + return fud; +} +EXPORT_SYMBOL_GPL(fuse_dev_alloc_install); + void fuse_dev_free(struct fuse_dev *fud) { struct fuse_conn *fc = fud->fc; @@ -1132,7 +1149,7 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) if (sb->s_user_ns != &init_user_ns) sb->s_xattr = fuse_no_acl_xattr_handlers; - fud = fuse_dev_alloc(fc); + fud = fuse_dev_alloc_install(fc); if (!fud) goto err; From 8fab010644363f8f80194322aa7a81e38c867af3 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 15 Aug 2018 17:42:34 +0200 Subject: [PATCH 336/721] fuse: delete dentry if timeout is zero Don't hold onto dentry in lru list if need to re-lookup it anyway at next access. Only do this if explicitly enabled, otherwise it could result in performance regression. More advanced version of this patch would periodically flush out dentries from the lru which have gone stale. Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 28 +++++++++++++++++++++++++--- fs/fuse/fuse_i.h | 3 +++ 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 11334ee01dc9..ba0a175d7578 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -29,12 +29,28 @@ union fuse_dentry { struct rcu_head rcu; }; -static inline void fuse_dentry_settime(struct dentry *entry, u64 time) +static void fuse_dentry_settime(struct dentry *dentry, u64 time) { - ((union fuse_dentry *) entry->d_fsdata)->time = time; + struct fuse_conn *fc = get_fuse_conn_super(dentry->d_sb); + bool delete = !time && fc->delete_stale; + /* + * Mess with DCACHE_OP_DELETE because dput() will be faster without it. + * Don't care about races, either way it's just an optimization + */ + if ((!delete && (dentry->d_flags & DCACHE_OP_DELETE)) || + (delete && !(dentry->d_flags & DCACHE_OP_DELETE))) { + spin_lock(&dentry->d_lock); + if (!delete) + dentry->d_flags &= ~DCACHE_OP_DELETE; + else + dentry->d_flags |= DCACHE_OP_DELETE; + spin_unlock(&dentry->d_lock); + } + + ((union fuse_dentry *) dentry->d_fsdata)->time = time; } -static inline u64 fuse_dentry_time(struct dentry *entry) +static inline u64 fuse_dentry_time(const struct dentry *entry) { return ((union fuse_dentry *) entry->d_fsdata)->time; } @@ -255,8 +271,14 @@ static void fuse_dentry_release(struct dentry *dentry) kfree_rcu(fd, rcu); } +static int fuse_dentry_delete(const struct dentry *dentry) +{ + return time_before64(fuse_dentry_time(dentry), get_jiffies_64()); +} + const struct dentry_operations fuse_dentry_operations = { .d_revalidate = fuse_dentry_revalidate, + .d_delete = fuse_dentry_delete, .d_init = fuse_dentry_init, .d_release = fuse_dentry_release, }; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 242d47752e78..fed68a427a4c 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -692,6 +692,9 @@ struct fuse_conn { /* Send DESTROY request */ unsigned int destroy:1; + /* Delete dentries that have gone stale */ + unsigned int delete_stale:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; From 783863d6476ce9f27fa87227f76ae9134caf43fa Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 29 Aug 2019 11:01:20 +0200 Subject: [PATCH 337/721] fuse: dissociate DESTROY from fuseblk Allow virtio-fs to also send DESTROY request. Signed-off-by: Miklos Szeredi --- fs/fuse/fuse_i.h | 9 +++++++++ fs/fuse/inode.c | 12 ++++++++---- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index fed68a427a4c..48d214df9172 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -467,6 +467,7 @@ struct fuse_fs_context { bool group_id_present:1; bool default_permissions:1; bool allow_other:1; + bool destroy:1; unsigned int max_read; unsigned int blksize; const char *subtype; @@ -945,6 +946,13 @@ void fuse_send_init(struct fuse_conn *fc); */ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx); +/** + * Disassociate fuse connection from superblock and kill the superblock + * + * Calls kill_anon_super(), do not use with bdev mounts. + */ +void fuse_kill_sb_anon(struct super_block *sb); + /** * Add connection to control filesystem */ @@ -1057,5 +1065,6 @@ unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args); * Get the next unique ID for a request */ u64 fuse_get_unique(struct fuse_iqueue *fiq); +void fuse_free_conn(struct fuse_conn *fc); #endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 6a0f31faaeaa..cd6ffb3fe69e 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -999,11 +999,12 @@ void fuse_send_init(struct fuse_conn *fc) } EXPORT_SYMBOL_GPL(fuse_send_init); -static void fuse_free_conn(struct fuse_conn *fc) +void fuse_free_conn(struct fuse_conn *fc) { WARN_ON(!list_empty(&fc->devices)); kfree_rcu(fc, rcu); } +EXPORT_SYMBOL_GPL(fuse_free_conn); static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) { @@ -1169,7 +1170,7 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) fc->user_id = ctx->user_id; fc->group_id = ctx->group_id; fc->max_read = max_t(unsigned, 4096, ctx->max_read); - fc->destroy = ctx->is_bdev; + fc->destroy = ctx->destroy; err = -ENOMEM; root = fuse_get_root_inode(sb, ctx->rootmode); @@ -1293,8 +1294,10 @@ static int fuse_init_fs_context(struct fs_context *fc) ctx->blksize = FUSE_DEFAULT_BLKSIZE; #ifdef CONFIG_BLOCK - if (fc->fs_type == &fuseblk_fs_type) + if (fc->fs_type == &fuseblk_fs_type) { ctx->is_bdev = true; + ctx->destroy = true; + } #endif fc->fs_private = ctx; @@ -1319,11 +1322,12 @@ static void fuse_sb_destroy(struct super_block *sb) } } -static void fuse_kill_sb_anon(struct super_block *sb) +void fuse_kill_sb_anon(struct super_block *sb) { fuse_sb_destroy(sb); kill_anon_super(sb); } +EXPORT_SYMBOL_GPL(fuse_kill_sb_anon); static struct file_system_type fuse_fs_type = { .owner = THIS_MODULE, From 15c8e72e88e0b707ffefd524ca33f28cdb3db487 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 6 May 2019 15:35:43 -0400 Subject: [PATCH 338/721] fuse: allow skipping control interface and forced unmount virtio-fs does not support aborting requests which are being processed. That is requests which have been sent to fuse daemon on host. Signed-off-by: Vivek Goyal Signed-off-by: Miklos Szeredi --- fs/fuse/fuse_i.h | 8 ++++++++ fs/fuse/inode.c | 7 ++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 48d214df9172..fc89cb40e874 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -468,6 +468,8 @@ struct fuse_fs_context { bool default_permissions:1; bool allow_other:1; bool destroy:1; + bool no_control:1; + bool no_force_umount:1; unsigned int max_read; unsigned int blksize; const char *subtype; @@ -696,6 +698,12 @@ struct fuse_conn { /* Delete dentries that have gone stale */ unsigned int delete_stale:1; + /** Do not create entry in fusectl fs */ + unsigned int no_control:1; + + /** Do not allow MNT_FORCE umount */ + unsigned int no_force_umount:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index cd6ffb3fe69e..10d193b24fb8 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -364,7 +364,10 @@ void fuse_unlock_inode(struct inode *inode, bool locked) static void fuse_umount_begin(struct super_block *sb) { - fuse_abort_conn(get_fuse_conn_super(sb)); + struct fuse_conn *fc = get_fuse_conn_super(sb); + + if (!fc->no_force_umount) + fuse_abort_conn(fc); } static void fuse_send_destroy(struct fuse_conn *fc) @@ -1171,6 +1174,8 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) fc->group_id = ctx->group_id; fc->max_read = max_t(unsigned, 4096, ctx->max_read); fc->destroy = ctx->destroy; + fc->no_control = ctx->no_control; + fc->no_force_umount = ctx->no_force_umount; err = -ENOMEM; root = fuse_get_root_inode(sb, ctx->rootmode); From 501ae8ecae2ba5122774dee4445003505a7fd01b Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 4 Sep 2019 08:36:33 -0400 Subject: [PATCH 339/721] fuse: reserve byteswapped init opcodes virtio fs tunnels fuse over a virtio channel. One issue is two sides might be speaking different endian-ness. To detects this, host side looks at the opcode value in the FUSE_INIT command. Works fine at the moment but might fail if a future version of fuse will use such an opcode for initialization. Let's reserve this opcode so we remember and don't do this. Same for CUSE_INIT. Signed-off-by: Michael S. Tsirkin Signed-off-by: Miklos Szeredi --- include/uapi/linux/fuse.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 2971d29a42e4..df2e12fb3381 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -425,6 +425,10 @@ enum fuse_opcode { /* CUSE specific operations */ CUSE_INIT = 4096, + + /* Reserved opcodes: helpful to detect structure endian-ness */ + CUSE_INIT_BSWAP_RESERVED = 1048576, /* CUSE_INIT << 8 */ + FUSE_INIT_BSWAP_RESERVED = 436207616, /* FUSE_INIT << 24 */ }; enum fuse_notify_code { From a4098bc6eed5e31e0391bcc068e61804c98138df Mon Sep 17 00:00:00 2001 From: Igor Druzhinin Date: Thu, 12 Sep 2019 19:31:51 +0100 Subject: [PATCH 340/721] xen/pci: reserve MCFG areas earlier If MCFG area is not reserved in E820, Xen by default will defer its usage until Dom0 registers it explicitly after ACPI parser recognizes it as a reserved resource in DSDT. Having it reserved in E820 is not mandatory according to "PCI Firmware Specification, rev 3.2" (par. 4.1.2) and firmware is free to keep a hole in E820 in that place. Xen doesn't know what exactly is inside this hole since it lacks full ACPI view of the platform therefore it's potentially harmful to access MCFG region without additional checks as some machines are known to provide inconsistent information on the size of the region. Now xen_mcfg_late() runs after acpi_init() which is too late as some basic PCI enumeration starts exactly there as well. Trying to register a device prior to MCFG reservation causes multiple problems with PCIe extended capability initializations in Xen (e.g. SR-IOV VF BAR sizing). There are no convenient hooks for us to subscribe to so register MCFG areas earlier upon the first invocation of xen_add_device(). It should be safe to do once since all the boot time buses must have their MCFG areas in MCFG table already and we don't support PCI bus hot-plug. Signed-off-by: Igor Druzhinin Reviewed-by: Boris Ostrovsky Signed-off-by: Boris Ostrovsky --- drivers/xen/pci.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/drivers/xen/pci.c b/drivers/xen/pci.c index 3eeb9bea7630..224df03ce42e 100644 --- a/drivers/xen/pci.c +++ b/drivers/xen/pci.c @@ -17,6 +17,8 @@ #include "../pci/pci.h" #ifdef CONFIG_PCI_MMCONFIG #include + +static int xen_mcfg_late(void); #endif static bool __read_mostly pci_seg_supported = true; @@ -28,7 +30,18 @@ static int xen_add_device(struct device *dev) #ifdef CONFIG_PCI_IOV struct pci_dev *physfn = pci_dev->physfn; #endif - +#ifdef CONFIG_PCI_MMCONFIG + static bool pci_mcfg_reserved = false; + /* + * Reserve MCFG areas in Xen on first invocation due to this being + * potentially called from inside of acpi_init immediately after + * MCFG table has been finally parsed. + */ + if (!pci_mcfg_reserved) { + xen_mcfg_late(); + pci_mcfg_reserved = true; + } +#endif if (pci_seg_supported) { struct { struct physdev_pci_device_add add; @@ -201,7 +214,7 @@ static int __init register_xen_pci_notifier(void) arch_initcall(register_xen_pci_notifier); #ifdef CONFIG_PCI_MMCONFIG -static int __init xen_mcfg_late(void) +static int xen_mcfg_late(void) { struct pci_mmcfg_region *cfg; int rc; @@ -240,8 +253,4 @@ static int __init xen_mcfg_late(void) } return 0; } -/* - * Needs to be done after acpi_init which are subsys_initcall. - */ -subsys_initcall_sync(xen_mcfg_late); #endif From 716c6a228ec9635e4b524f9c01aac089c5dd6ed5 Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Fri, 13 Sep 2019 12:33:07 +0100 Subject: [PATCH 341/721] MAINTAINERS: Add PCI native host/endpoint controllers designated reviewer Add Andrew Murray as designated reviewer for PCI native host and endpoint controller drivers. Signed-off-by: Lorenzo Pieralisi Cc: Andrew Murray Cc: Bjorn Helgaas --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 783569e3c4b4..08d97988034d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12442,6 +12442,7 @@ F: arch/x86/kernel/early-quirks.c PCI NATIVE HOST BRIDGE AND ENDPOINT DRIVERS M: Lorenzo Pieralisi +R: Andrew Murray L: linux-pci@vger.kernel.org Q: http://patchwork.ozlabs.org/project/linux-pci/list/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/lpieralisi/pci.git/ From 11330a9fef049db195f50300627dd972e19e0f8e Mon Sep 17 00:00:00 2001 From: Chuanhua Han Date: Fri, 6 Sep 2019 15:53:19 +0800 Subject: [PATCH 342/721] i2c: imx: ACPI support for NXP i2c controller Enable NXP i2c controller to boot with ACPI. Signed-off-by: Meenakshi Aggarwal Signed-off-by: Udit Kumar Signed-off-by: Chuanhua Han Signed-off-by: Biwen Li Reviewed-by: Andy Shevchenko Acked-by: Oleksij Rempel Tested-by: Oleksij Rempel Acked-by: Rafael J. Wysocki Signed-off-by: Wolfram Sang --- drivers/acpi/acpi_apd.c | 7 +++++++ drivers/i2c/busses/i2c-imx.c | 17 +++++++++++++---- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/drivers/acpi/acpi_apd.c b/drivers/acpi/acpi_apd.c index 7cd0c9ac71ea..71511ae2dfcd 100644 --- a/drivers/acpi/acpi_apd.c +++ b/drivers/acpi/acpi_apd.c @@ -160,11 +160,17 @@ static const struct apd_device_desc hip08_i2c_desc = { .setup = acpi_apd_setup, .fixed_clk_rate = 250000000, }; + static const struct apd_device_desc thunderx2_i2c_desc = { .setup = acpi_apd_setup, .fixed_clk_rate = 125000000, }; +static const struct apd_device_desc nxp_i2c_desc = { + .setup = acpi_apd_setup, + .fixed_clk_rate = 350000000, +}; + static const struct apd_device_desc hip08_spi_desc = { .setup = acpi_apd_setup, .fixed_clk_rate = 250000000, @@ -238,6 +244,7 @@ static const struct acpi_device_id acpi_apd_device_ids[] = { { "HISI02A1", APD_ADDR(hip07_i2c_desc) }, { "HISI02A2", APD_ADDR(hip08_i2c_desc) }, { "HISI0173", APD_ADDR(hip08_spi_desc) }, + { "NXP0001", APD_ADDR(nxp_i2c_desc) }, #endif { } }; diff --git a/drivers/i2c/busses/i2c-imx.c b/drivers/i2c/busses/i2c-imx.c index b1b8b938d7f4..40692fc70d92 100644 --- a/drivers/i2c/busses/i2c-imx.c +++ b/drivers/i2c/busses/i2c-imx.c @@ -20,6 +20,7 @@ * */ +#include #include #include #include @@ -255,6 +256,12 @@ static const struct of_device_id i2c_imx_dt_ids[] = { }; MODULE_DEVICE_TABLE(of, i2c_imx_dt_ids); +static const struct acpi_device_id i2c_imx_acpi_ids[] = { + {"NXP0001", .driver_data = (kernel_ulong_t)&vf610_i2c_hwdata}, + { } +}; +MODULE_DEVICE_TABLE(acpi, i2c_imx_acpi_ids); + static inline int is_imx1_i2c(struct imx_i2c_struct *i2c_imx) { return i2c_imx->hwdata->devtype == IMX1_I2C; @@ -1050,14 +1057,13 @@ static const struct i2c_algorithm i2c_imx_algo = { static int i2c_imx_probe(struct platform_device *pdev) { - const struct of_device_id *of_id = of_match_device(i2c_imx_dt_ids, - &pdev->dev); struct imx_i2c_struct *i2c_imx; struct resource *res; struct imxi2c_platform_data *pdata = dev_get_platdata(&pdev->dev); void __iomem *base; int irq, ret; dma_addr_t phy_addr; + const struct imx_i2c_hwdata *match; dev_dbg(&pdev->dev, "<%s>\n", __func__); @@ -1077,8 +1083,9 @@ static int i2c_imx_probe(struct platform_device *pdev) if (!i2c_imx) return -ENOMEM; - if (of_id) - i2c_imx->hwdata = of_id->data; + match = device_get_match_data(&pdev->dev); + if (match) + i2c_imx->hwdata = match; else i2c_imx->hwdata = (struct imx_i2c_hwdata *) platform_get_device_id(pdev)->driver_data; @@ -1091,6 +1098,7 @@ static int i2c_imx_probe(struct platform_device *pdev) i2c_imx->adapter.nr = pdev->id; i2c_imx->adapter.dev.of_node = pdev->dev.of_node; i2c_imx->base = base; + ACPI_COMPANION_SET(&i2c_imx->adapter.dev, ACPI_COMPANION(&pdev->dev)); /* Get I2C clock */ i2c_imx->clk = devm_clk_get(&pdev->dev, NULL); @@ -1253,6 +1261,7 @@ static struct platform_driver i2c_imx_driver = { .name = DRIVER_NAME, .pm = &i2c_imx_pm_ops, .of_match_table = i2c_imx_dt_ids, + .acpi_match_table = i2c_imx_acpi_ids, }, .id_table = imx_i2c_devtype, }; From 8ebf15e9c869e764b3aab4928938ee68c0e2bd6d Mon Sep 17 00:00:00 2001 From: Jon Hunter Date: Tue, 10 Sep 2019 10:29:17 +0100 Subject: [PATCH 343/721] i2c: tegra: Move suspend handling to NOIRQ phase Commit acc8abcb2a9c ("i2c: tegra: Add suspend-resume support") added suspend support for the Tegra I2C driver and following this change on Tegra30 the following WARNING is seen on entering suspend ... WARNING: CPU: 2 PID: 689 at /dvs/git/dirty/git-master_l4t-upstream/kernel/drivers/i2c/i2c-core.h:54 __i2c_transfer+0x35c/0x70c i2c i2c-4: Transfer while suspended Modules linked in: brcmfmac brcmutil CPU: 2 PID: 689 Comm: rtcwake Not tainted 5.3.0-rc7-g089cf7f6ecb2 #1 Hardware name: NVIDIA Tegra SoC (Flattened Device Tree) [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [] (show_stack) from [] (dump_stack+0xb4/0xc8) [] (dump_stack) from [] (__warn+0xe0/0xf8) [] (__warn) from [] (warn_slowpath_fmt+0x48/0x6c) [] (warn_slowpath_fmt) from [] (__i2c_transfer+0x35c/0x70c) [] (__i2c_transfer) from [] (i2c_transfer+0x58/0xf4) [] (i2c_transfer) from [] (i2c_transfer_buffer_flags+0x4c/0x70) [] (i2c_transfer_buffer_flags) from [] (regmap_i2c_write+0x14/0x30) [] (regmap_i2c_write) from [] (_regmap_raw_write_impl+0x35c/0x868) [] (_regmap_raw_write_impl) from [] (_regmap_update_bits+0xe4/0xec) [] (_regmap_update_bits) from [] (regmap_update_bits_base+0x50/0x74) [] (regmap_update_bits_base) from [] (regulator_disable_regmap+0x44/0x54) [] (regulator_disable_regmap) from [] (_regulator_do_disable+0xf8/0x268) [] (_regulator_do_disable) from [] (_regulator_disable+0xf4/0x19c) [] (_regulator_disable) from [] (regulator_disable+0x34/0x64) [] (regulator_disable) from [] (regulator_bulk_disable+0x28/0xb4) [] (regulator_bulk_disable) from [] (tegra_pcie_power_off+0x64/0xa8) [] (tegra_pcie_power_off) from [] (tegra_pcie_pm_suspend+0x25c/0x3f4) [] (tegra_pcie_pm_suspend) from [] (dpm_run_callback+0x38/0x1d4) [] (dpm_run_callback) from [] (__device_suspend_noirq+0xc0/0x2b8) [] (__device_suspend_noirq) from [] (dpm_noirq_suspend_devices+0x100/0x37c) [] (dpm_noirq_suspend_devices) from [] (dpm_suspend_noirq+0x1c/0x48) [] (dpm_suspend_noirq) from [] (suspend_devices_and_enter+0x1d0/0xa00) [] (suspend_devices_and_enter) from [] (pm_suspend+0x220/0x74c) [] (pm_suspend) from [] (state_store+0x6c/0xc8) [] (state_store) from [] (kernfs_fop_write+0xe8/0x1c4) [] (kernfs_fop_write) from [] (__vfs_write+0x2c/0x1c4) [] (__vfs_write) from [] (vfs_write+0xa4/0x184) [] (vfs_write) from [] (ksys_write+0x9c/0xdc) [] (ksys_write) from [] (ret_fast_syscall+0x0/0x54) Exception stack(0xe9f21fa8 to 0xe9f21ff0) 1fa0: 0000006c 004b2438 00000004 004b2438 00000004 00000000 1fc0: 0000006c 004b2438 004b1228 00000004 00000004 00000004 0049e78c 004b1228 1fe0: 00000004 be9809b8 b6f0bc0b b6e96206 The problem is that the Tegra PCIe driver indirectly uses I2C for controlling some regulators and the I2C driver is now being suspended before the PCIe driver causing the PCIe suspend to fail. The Tegra PCIe driver is suspended during the NOIRQ phase and this cannot be changed due to other dependencies. Therefore, we also need to move the suspend handling for the Tegra I2C driver to the NOIRQ phase as well. In order to move the I2C suspend handling to the NOIRQ phase we also need to avoid calling pm_runtime_get/put() because per commit 1e2ef05bb8cf ("PM: Limit race conditions between runtime PM and system sleep (v2)") these cannot be called early in resume. The function tegra_i2c_init(), called during resume, calls pm_runtime_get/put() and so move these calls outside of tegra_i2c_init(), so this function can be used during the NOIRQ resume phase. Fixes: acc8abcb2a9c ("i2c: tegra: Add suspend-resume support") Signed-off-by: Jon Hunter Acked-by: Thierry Reding Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-tegra.c | 40 +++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/drivers/i2c/busses/i2c-tegra.c b/drivers/i2c/busses/i2c-tegra.c index 18f0ceed9f1b..c1683f9338b4 100644 --- a/drivers/i2c/busses/i2c-tegra.c +++ b/drivers/i2c/busses/i2c-tegra.c @@ -713,12 +713,6 @@ static int tegra_i2c_init(struct tegra_i2c_dev *i2c_dev, bool clk_reinit) u32 tsu_thd; u8 tlow, thigh; - err = pm_runtime_get_sync(i2c_dev->dev); - if (err < 0) { - dev_err(i2c_dev->dev, "runtime resume failed %d\n", err); - return err; - } - reset_control_assert(i2c_dev->rst); udelay(2); reset_control_deassert(i2c_dev->rst); @@ -772,7 +766,7 @@ static int tegra_i2c_init(struct tegra_i2c_dev *i2c_dev, bool clk_reinit) if (err) { dev_err(i2c_dev->dev, "failed changing clock rate: %d\n", err); - goto err; + return err; } } @@ -787,23 +781,21 @@ static int tegra_i2c_init(struct tegra_i2c_dev *i2c_dev, bool clk_reinit) err = tegra_i2c_flush_fifos(i2c_dev); if (err) - goto err; + return err; if (i2c_dev->is_multimaster_mode && i2c_dev->hw->has_slcg_override_reg) i2c_writel(i2c_dev, I2C_MST_CORE_CLKEN_OVR, I2C_CLKEN_OVERRIDE); err = tegra_i2c_wait_for_config_load(i2c_dev); if (err) - goto err; + return err; if (i2c_dev->irq_disabled) { i2c_dev->irq_disabled = false; enable_irq(i2c_dev->irq); } -err: - pm_runtime_put(i2c_dev->dev); - return err; + return 0; } static int tegra_i2c_disable_packet_mode(struct tegra_i2c_dev *i2c_dev) @@ -1616,12 +1608,14 @@ static int tegra_i2c_probe(struct platform_device *pdev) } pm_runtime_enable(&pdev->dev); - if (!pm_runtime_enabled(&pdev->dev)) { + if (!pm_runtime_enabled(&pdev->dev)) ret = tegra_i2c_runtime_resume(&pdev->dev); - if (ret < 0) { - dev_err(&pdev->dev, "runtime resume failed\n"); - goto unprepare_div_clk; - } + else + ret = pm_runtime_get_sync(i2c_dev->dev); + + if (ret < 0) { + dev_err(&pdev->dev, "runtime resume failed\n"); + goto unprepare_div_clk; } if (i2c_dev->is_multimaster_mode) { @@ -1666,6 +1660,8 @@ static int tegra_i2c_probe(struct platform_device *pdev) if (ret) goto release_dma; + pm_runtime_put(&pdev->dev); + return 0; release_dma: @@ -1725,17 +1721,25 @@ static int __maybe_unused tegra_i2c_resume(struct device *dev) struct tegra_i2c_dev *i2c_dev = dev_get_drvdata(dev); int err; + err = tegra_i2c_runtime_resume(dev); + if (err) + return err; + err = tegra_i2c_init(i2c_dev, false); if (err) return err; + err = tegra_i2c_runtime_suspend(dev); + if (err) + return err; + i2c_mark_adapter_resumed(&i2c_dev->adapter); return 0; } static const struct dev_pm_ops tegra_i2c_pm = { - SET_SYSTEM_SLEEP_PM_OPS(tegra_i2c_suspend, tegra_i2c_resume) + SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(tegra_i2c_suspend, tegra_i2c_resume) SET_RUNTIME_PM_OPS(tegra_i2c_runtime_suspend, tegra_i2c_runtime_resume, NULL) }; From 85624f90c8fc7214cd37e064635d17541d657732 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Sun, 15 Sep 2019 16:46:23 +0300 Subject: [PATCH 344/721] platform/x86: asus-wmi: Make it depend on ACPI battery API When driver has been switched to use ACPI battery API in the commit 7973353e92ee ("Refactor charge threshold to use the battery hooking API") it makes it implicitly dependent to a corresponding kernel configuration option. Make this dependency explicit in Kconfig. Fixes: 7973353e92ee ("Refactor charge threshold to use the battery hooking API") Reported-by: kbuild test robot Cc: Kristian Klausen Signed-off-by: Andy Shevchenko --- drivers/platform/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index 1b67bb578f9f..ae21d08c65e8 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -674,6 +674,7 @@ config EEEPC_LAPTOP config ASUS_WMI tristate "ASUS WMI Driver" depends on ACPI_WMI + depends on ACPI_BATTERY depends on INPUT depends on HWMON depends on BACKLIGHT_CLASS_DEVICE From 750670341a24cb714e624e0fd7da30900ad93752 Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Tue, 23 Jul 2019 16:50:20 +0100 Subject: [PATCH 345/721] ceph: fix directories inode i_blkbits initialization When filling an inode with info from the MDS, i_blkbits is being initialized using fl_stripe_unit, which contains the stripe unit in bytes. Unfortunately, this doesn't make sense for directories as they have fl_stripe_unit set to '0'. This means that i_blkbits will be set to 0xff, causing an UBSAN undefined behaviour in i_blocksize(): UBSAN: Undefined behaviour in ./include/linux/fs.h:731:12 shift exponent 255 is too large for 32-bit type 'int' Fix this by initializing i_blkbits to CEPH_BLOCK_SHIFT if fl_stripe_unit is zero. Signed-off-by: Luis Henriques Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/inode.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 18500edefc56..3b537e7038c7 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -801,7 +801,12 @@ static int fill_inode(struct inode *inode, struct page *locked_page, /* update inode */ inode->i_rdev = le32_to_cpu(info->rdev); - inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; + /* directories have fl_stripe_unit set to zero */ + if (le32_to_cpu(info->layout.fl_stripe_unit)) + inode->i_blkbits = + fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; + else + inode->i_blkbits = CEPH_BLOCK_SHIFT; __ceph_update_quota(ci, iinfo->max_bytes, iinfo->max_files); From e1e44602021358336fb524affe53ce3c09b34beb Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 24 Jul 2019 07:59:51 -0400 Subject: [PATCH 346/721] ceph: allow copy_file_range when src and dst inode are same There is no reason to prevent this. The OSD should be able to handle this as long as the objects are different, and the existing code falls back when the offset into the object is different. Signed-off-by: Jeff Layton Acked-by: Luis Henriques Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 685a03cc4b77..f657fabcb3ee 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1913,8 +1913,6 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, int src_got = 0, dst_got = 0, err, dirty; bool do_final_copy = false; - if (src_inode == dst_inode) - return -EINVAL; if (src_inode->i_sb != dst_inode->i_sb) return -EXDEV; if (ceph_snap(dst_inode) != CEPH_NOSNAP) From e09580b343aa117fd07c1bb7f7dfc5bc630a2953 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 24 Jul 2019 12:46:20 -0400 Subject: [PATCH 347/721] ceph: don't list vxattrs in listxattr() Most filesystems that provide virtual xattrs (e.g. CIFS) don't display them via listxattr(). Ceph does, and that causes some of the tests in xfstests to fail. Have cephfs stop listing vxattrs in listxattr. Userspace can always query them directly when the name is known. Signed-off-by: Jeff Layton Acked-by: David Disseldorp Signed-off-by: Ilya Dryomov --- fs/ceph/xattr.c | 29 ----------------------------- 1 file changed, 29 deletions(-) diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 939eab7aa219..2fba06b50f25 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -903,11 +903,9 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) { struct inode *inode = d_inode(dentry); struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_vxattr *vxattrs = ceph_inode_vxattrs(inode); bool len_only = (size == 0); u32 namelen; int err; - int i; spin_lock(&ci->i_ceph_lock); dout("listxattr %p ver=%lld index_ver=%lld\n", inode, @@ -936,33 +934,6 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) names = __copy_xattr_names(ci, names); size -= namelen; } - - - /* virtual xattr names, too */ - if (vxattrs) { - for (i = 0; vxattrs[i].name; i++) { - size_t this_len; - - if (vxattrs[i].flags & VXATTR_FLAG_HIDDEN) - continue; - if (vxattrs[i].exists_cb && !vxattrs[i].exists_cb(ci)) - continue; - - this_len = strlen(vxattrs[i].name) + 1; - namelen += this_len; - if (len_only) - continue; - - if (this_len > size) { - err = -ERANGE; - goto out; - } - - memcpy(names, vxattrs[i].name, this_len); - names += this_len; - size -= this_len; - } - } err = namelen; out: spin_unlock(&ci->i_ceph_lock); From 120a75ea9f4ba02f852171e75d44f29139b9c83e Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 25 Jul 2019 20:16:39 +0800 Subject: [PATCH 348/721] libceph: add function that reset client's entity addr This function also re-open connections to OSD/MON, and re-send in-flight OSD requests after re-opening connections to OSD. Signed-off-by: "Yan, Zheng" Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- include/linux/ceph/libceph.h | 1 + include/linux/ceph/messenger.h | 1 + include/linux/ceph/mon_client.h | 1 + include/linux/ceph/osd_client.h | 1 + net/ceph/ceph_common.c | 8 ++++++++ net/ceph/messenger.c | 6 ++++++ net/ceph/mon_client.c | 7 +++++++ net/ceph/osd_client.c | 18 ++++++++++++++++++ 8 files changed, 43 insertions(+) diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 82156da3c650..b9dbda1c26aa 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -293,6 +293,7 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private); struct ceph_entity_addr *ceph_client_addr(struct ceph_client *client); u64 ceph_client_gid(struct ceph_client *client); extern void ceph_destroy_client(struct ceph_client *client); +extern void ceph_reset_client_addr(struct ceph_client *client); extern int __ceph_open_session(struct ceph_client *client, unsigned long started); extern int ceph_open_session(struct ceph_client *client); diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 23895d178149..c4458dc6a757 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -337,6 +337,7 @@ extern void ceph_msgr_flush(void); extern void ceph_messenger_init(struct ceph_messenger *msgr, struct ceph_entity_addr *myaddr); extern void ceph_messenger_fini(struct ceph_messenger *msgr); +extern void ceph_messenger_reset_nonce(struct ceph_messenger *msgr); extern void ceph_con_init(struct ceph_connection *con, void *private, const struct ceph_connection_operations *ops, diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h index b4d134d3312a..dbb8a6959a73 100644 --- a/include/linux/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h @@ -109,6 +109,7 @@ extern int ceph_monmap_contains(struct ceph_monmap *m, extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl); extern void ceph_monc_stop(struct ceph_mon_client *monc); +extern void ceph_monc_reopen_session(struct ceph_mon_client *monc); enum { CEPH_SUB_MONMAP = 0, diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index ad7fe5d10dcd..d1c3e829f30d 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -381,6 +381,7 @@ extern void ceph_osdc_cleanup(void); extern int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client); extern void ceph_osdc_stop(struct ceph_osd_client *osdc); +extern void ceph_osdc_reopen_osds(struct ceph_osd_client *osdc); extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg); diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 4eeea4d5c3ef..b412a3ccc4fc 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -694,6 +694,14 @@ void ceph_destroy_client(struct ceph_client *client) } EXPORT_SYMBOL(ceph_destroy_client); +void ceph_reset_client_addr(struct ceph_client *client) +{ + ceph_messenger_reset_nonce(&client->msgr); + ceph_monc_reopen_session(&client->monc); + ceph_osdc_reopen_osds(&client->osdc); +} +EXPORT_SYMBOL(ceph_reset_client_addr); + /* * true if we have the mon map (and have thus joined the cluster) */ diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 962f521c863e..e4cb3db2ee77 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -3031,6 +3031,12 @@ static void con_fault(struct ceph_connection *con) } +void ceph_messenger_reset_nonce(struct ceph_messenger *msgr) +{ + u32 nonce = le32_to_cpu(msgr->inst.addr.nonce) + 1000000; + msgr->inst.addr.nonce = cpu_to_le32(nonce); + encode_my_addr(msgr); +} /* * initialize a new messenger instance diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 0520bf9825aa..7256c402ebaa 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -213,6 +213,13 @@ static void reopen_session(struct ceph_mon_client *monc) __open_session(monc); } +void ceph_monc_reopen_session(struct ceph_mon_client *monc) +{ + mutex_lock(&monc->mutex); + reopen_session(monc); + mutex_unlock(&monc->mutex); +} + static void un_backoff(struct ceph_mon_client *monc) { monc->hunt_mult /= 2; /* reduce by 50% */ diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 78ae6e8c953d..a252ba80dd82 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -5086,6 +5086,24 @@ int ceph_osdc_call(struct ceph_osd_client *osdc, } EXPORT_SYMBOL(ceph_osdc_call); +/* + * reset all osd connections + */ +void ceph_osdc_reopen_osds(struct ceph_osd_client *osdc) +{ + struct rb_node *n; + + down_write(&osdc->lock); + for (n = rb_first(&osdc->osds); n; ) { + struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); + + n = rb_next(n); + if (!reopen_osd(osd)) + kick_osd_requests(osd); + } + up_write(&osdc->lock); +} + /* * init, shutdown */ From 2cef0ba8032ce0df3f29621921fe7d6372f68d3e Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 25 Jul 2019 20:16:40 +0800 Subject: [PATCH 349/721] libceph: add function that clears osd client's abort_err Signed-off-by: "Yan, Zheng" Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 1 + net/ceph/osd_client.c | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index d1c3e829f30d..eaffbdddf89a 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -389,6 +389,7 @@ extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg); void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb); void ceph_osdc_abort_requests(struct ceph_osd_client *osdc, int err); +void ceph_osdc_clear_abort_err(struct ceph_osd_client *osdc); #define osd_req_op_data(oreq, whch, typ, fld) \ ({ \ diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index a252ba80dd82..6f49c59e5e58 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -2476,6 +2476,14 @@ void ceph_osdc_abort_requests(struct ceph_osd_client *osdc, int err) } EXPORT_SYMBOL(ceph_osdc_abort_requests); +void ceph_osdc_clear_abort_err(struct ceph_osd_client *osdc) +{ + down_write(&osdc->lock); + osdc->abort_err = 0; + up_write(&osdc->lock); +} +EXPORT_SYMBOL(ceph_osdc_clear_abort_err); + static void update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb) { if (likely(eb > osdc->epoch_barrier)) { From 7e6906c1e670f0b40554f42ba74cc043d094877c Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 25 Jul 2019 20:16:41 +0800 Subject: [PATCH 350/721] ceph: allow closing session in restarting/reconnect state CEPH_MDS_SESSION_{RESTARTING,RECONNECTING} are for for mds failover, they are sub-states of CEPH_MDS_SESSION_OPEN. So __close_session() should send close request for session in these two state. Signed-off-by: "Yan, Zheng" Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index f7c8603484fe..810fd8689dff 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -148,9 +148,9 @@ enum { CEPH_MDS_SESSION_OPENING = 2, CEPH_MDS_SESSION_OPEN = 3, CEPH_MDS_SESSION_HUNG = 4, - CEPH_MDS_SESSION_CLOSING = 5, - CEPH_MDS_SESSION_RESTARTING = 6, - CEPH_MDS_SESSION_RECONNECTING = 7, + CEPH_MDS_SESSION_RESTARTING = 5, + CEPH_MDS_SESSION_RECONNECTING = 6, + CEPH_MDS_SESSION_CLOSING = 7, CEPH_MDS_SESSION_REJECTED = 8, }; From f4b97866223b8dddd1bcb9d2a9546c5a5e430249 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 25 Jul 2019 20:16:42 +0800 Subject: [PATCH 351/721] ceph: track and report error of async metadata operation Use errseq_t to track and report errors of async metadata operations, similar to how kernel handles errors during writeback. If any dirty caps or any unsafe request gets dropped during session eviction, record -EIO in corresponding inode's i_meta_err. The error will be reported by subsequent fsync, Signed-off-by: "Yan, Zheng" Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 24 +++++++++++++++++------- fs/ceph/file.c | 6 ++++-- fs/ceph/inode.c | 2 ++ fs/ceph/mds_client.c | 40 +++++++++++++++++++++++++++------------- fs/ceph/super.h | 4 ++++ 5 files changed, 54 insertions(+), 22 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index ce0f5658720a..321ba9b30968 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2261,35 +2261,45 @@ static int unsafe_request_wait(struct inode *inode) int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) { + struct ceph_file_info *fi = file->private_data; struct inode *inode = file->f_mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); u64 flush_tid; - int ret; + int ret, err; int dirty; dout("fsync %p%s\n", inode, datasync ? " datasync" : ""); ret = file_write_and_wait_range(file, start, end); - if (ret < 0) - goto out; - if (datasync) goto out; dirty = try_flush_caps(inode, &flush_tid); dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); - ret = unsafe_request_wait(inode); + err = unsafe_request_wait(inode); /* * only wait on non-file metadata writeback (the mds * can recover size and mtime, so we don't need to * wait for that) */ - if (!ret && (dirty & ~CEPH_CAP_ANY_FILE_WR)) { - ret = wait_event_interruptible(ci->i_cap_wq, + if (!err && (dirty & ~CEPH_CAP_ANY_FILE_WR)) { + err = wait_event_interruptible(ci->i_cap_wq, caps_are_flushed(inode, flush_tid)); } + + if (err < 0) + ret = err; + + if (errseq_check(&ci->i_meta_err, READ_ONCE(fi->meta_err))) { + spin_lock(&file->f_lock); + err = errseq_check_and_advance(&ci->i_meta_err, + &fi->meta_err); + spin_unlock(&file->f_lock); + if (err < 0) + ret = err; + } out: dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret); return ret; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index f657fabcb3ee..75a1d12ec46d 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -201,6 +201,7 @@ prepare_open_request(struct super_block *sb, int flags, int create_mode) static int ceph_init_file_info(struct inode *inode, struct file *file, int fmode, bool isdir) { + struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_file_info *fi; dout("%s %p %p 0%o (%s)\n", __func__, inode, file, @@ -211,7 +212,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, struct ceph_dir_file_info *dfi = kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL); if (!dfi) { - ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ + ceph_put_fmode(ci, fmode); /* clean up */ return -ENOMEM; } @@ -222,7 +223,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, } else { fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL); if (!fi) { - ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ + ceph_put_fmode(ci, fmode); /* clean up */ return -ENOMEM; } @@ -232,6 +233,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, fi->fmode = fmode; spin_lock_init(&fi->rw_contexts_lock); INIT_LIST_HEAD(&fi->rw_contexts); + fi->meta_err = errseq_sample(&ci->i_meta_err); return 0; } diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 3b537e7038c7..332433b490f5 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -515,6 +515,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ceph_fscache_inode_init(ci); + ci->i_meta_err = 0; + return &ci->vfs_inode; } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 920e9f048bd8..df4bea231017 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1270,6 +1270,7 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc, { struct ceph_mds_request *req; struct rb_node *p; + struct ceph_inode_info *ci; dout("cleanup_session_requests mds%d\n", session->s_mds); mutex_lock(&mdsc->mutex); @@ -1278,6 +1279,16 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc, struct ceph_mds_request, r_unsafe_item); pr_warn_ratelimited(" dropping unsafe request %llu\n", req->r_tid); + if (req->r_target_inode) { + /* dropping unsafe change of inode's attributes */ + ci = ceph_inode(req->r_target_inode); + errseq_set(&ci->i_meta_err, -EIO); + } + if (req->r_unsafe_dir) { + /* dropping unsafe directory operation */ + ci = ceph_inode(req->r_unsafe_dir); + errseq_set(&ci->i_meta_err, -EIO); + } __unregister_request(mdsc, req); } /* zero r_attempts, so kick_requests() will re-send requests */ @@ -1370,7 +1381,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg; struct ceph_inode_info *ci = ceph_inode(inode); LIST_HEAD(to_remove); - bool drop = false; + bool dirty_dropped = false; bool invalidate = false; dout("removing cap %p, ci is %p, inode is %p\n", @@ -1405,7 +1416,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, inode, ceph_ino(inode)); ci->i_dirty_caps = 0; list_del_init(&ci->i_dirty_item); - drop = true; + dirty_dropped = true; } if (!list_empty(&ci->i_flushing_item)) { pr_warn_ratelimited( @@ -1415,10 +1426,22 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, ci->i_flushing_caps = 0; list_del_init(&ci->i_flushing_item); mdsc->num_cap_flushing--; - drop = true; + dirty_dropped = true; } spin_unlock(&mdsc->cap_dirty_lock); + if (dirty_dropped) { + errseq_set(&ci->i_meta_err, -EIO); + + if (ci->i_wrbuffer_ref_head == 0 && + ci->i_wr_ref == 0 && + ci->i_dirty_caps == 0 && + ci->i_flushing_caps == 0) { + ceph_put_snap_context(ci->i_head_snapc); + ci->i_head_snapc = NULL; + } + } + if (atomic_read(&ci->i_filelock_ref) > 0) { /* make further file lock syscall return -EIO */ ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK; @@ -1430,15 +1453,6 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove); ci->i_prealloc_cap_flush = NULL; } - - if (drop && - ci->i_wrbuffer_ref_head == 0 && - ci->i_wr_ref == 0 && - ci->i_dirty_caps == 0 && - ci->i_flushing_caps == 0) { - ceph_put_snap_context(ci->i_head_snapc); - ci->i_head_snapc = NULL; - } } spin_unlock(&ci->i_ceph_lock); while (!list_empty(&to_remove)) { @@ -1452,7 +1466,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, wake_up_all(&ci->i_cap_wq); if (invalidate) ceph_queue_invalidate(inode); - if (drop) + if (dirty_dropped) iput(inode); return 0; } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 6b9f1ee7de85..66dfe5ebf006 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -395,6 +395,8 @@ struct ceph_inode_info { struct fscache_cookie *fscache; u32 i_fscache_gen; #endif + errseq_t i_meta_err; + struct inode vfs_inode; /* at end */ }; @@ -703,6 +705,8 @@ struct ceph_file_info { spinlock_t rw_contexts_lock; struct list_head rw_contexts; + + errseq_t meta_err; }; struct ceph_dir_file_info { From 5e3ded1bb642f2d7a6ded6deeafb155d5b5312f2 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 25 Jul 2019 20:16:43 +0800 Subject: [PATCH 352/721] ceph: pass filp to ceph_get_caps() Also change several other functions' arguments, no logical changes. This is preparetion for later patch that checks filp error. Signed-off-by: "Yan, Zheng" Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 15 +++++++++------ fs/ceph/caps.c | 32 +++++++++++++++++--------------- fs/ceph/file.c | 35 +++++++++++++++++------------------ fs/ceph/super.h | 6 +++--- 4 files changed, 46 insertions(+), 42 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index b3c8b886bf64..2d6e23e32e30 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -323,7 +323,8 @@ static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx, /* caller of readpages does not hold buffer and read caps * (fadvise, madvise and readahead cases) */ int want = CEPH_CAP_FILE_CACHE; - ret = ceph_try_get_caps(ci, CEPH_CAP_FILE_RD, want, true, &got); + ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, + true, &got); if (ret < 0) { dout("start_read %p, error getting cap\n", inode); } else if (!(got & want)) { @@ -1452,7 +1453,8 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf) want = CEPH_CAP_FILE_CACHE; got = 0; - err = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page); + err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_RD, want, -1, + &got, &pinned_page); if (err < 0) goto out_restore; @@ -1568,7 +1570,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) want = CEPH_CAP_FILE_BUFFER; got = 0; - err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len, + err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_WR, want, off + len, &got, NULL); if (err < 0) goto out_free; @@ -1989,10 +1991,11 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, return err; } -int ceph_pool_perm_check(struct ceph_inode_info *ci, int need) +int ceph_pool_perm_check(struct inode *inode, int need) { - s64 pool; + struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_string *pool_ns; + s64 pool; int ret, flags; if (ci->i_vino.snap != CEPH_NOSNAP) { @@ -2004,7 +2007,7 @@ int ceph_pool_perm_check(struct ceph_inode_info *ci, int need) return 0; } - if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode), + if (ceph_test_mount_opt(ceph_inode_to_client(inode), NOPOOLPERM)) return 0; diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 321ba9b30968..bde81aaa3750 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2570,10 +2570,10 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got, * * FIXME: how does a 0 return differ from -EAGAIN? */ -static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, +static int try_get_cap_refs(struct inode *inode, int need, int want, loff_t endoff, bool nonblock, int *got) { - struct inode *inode = &ci->vfs_inode; + struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; int ret = 0; int have, implemented; @@ -2741,18 +2741,18 @@ static void check_max_size(struct inode *inode, loff_t endoff) ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); } -int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want, +int ceph_try_get_caps(struct inode *inode, int need, int want, bool nonblock, int *got) { int ret; BUG_ON(need & ~CEPH_CAP_FILE_RD); BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO|CEPH_CAP_FILE_SHARED)); - ret = ceph_pool_perm_check(ci, need); + ret = ceph_pool_perm_check(inode, need); if (ret < 0) return ret; - ret = try_get_cap_refs(ci, need, want, 0, nonblock, got); + ret = try_get_cap_refs(inode, need, want, 0, nonblock, got); return ret == -EAGAIN ? 0 : ret; } @@ -2761,21 +2761,23 @@ int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want, * due to a small max_size, make sure we check_max_size (and possibly * ask the mds) so we don't get hung up indefinitely. */ -int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, +int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got, struct page **pinned_page) { + struct inode *inode = file_inode(filp); + struct ceph_inode_info *ci = ceph_inode(inode); int _got, ret; - ret = ceph_pool_perm_check(ci, need); + ret = ceph_pool_perm_check(inode, need); if (ret < 0) return ret; while (true) { if (endoff > 0) - check_max_size(&ci->vfs_inode, endoff); + check_max_size(inode, endoff); _got = 0; - ret = try_get_cap_refs(ci, need, want, endoff, + ret = try_get_cap_refs(inode, need, want, endoff, false, &_got); if (ret == -EAGAIN) continue; @@ -2783,8 +2785,8 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, DEFINE_WAIT_FUNC(wait, woken_wake_function); add_wait_queue(&ci->i_cap_wq, &wait); - while (!(ret = try_get_cap_refs(ci, need, want, endoff, - true, &_got))) { + while (!(ret = try_get_cap_refs(inode, need, want, + endoff, true, &_got))) { if (signal_pending(current)) { ret = -ERESTARTSYS; break; @@ -2799,7 +2801,7 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, if (ret < 0) { if (ret == -ESTALE) { /* session was killed, try renew caps */ - ret = ceph_renew_caps(&ci->vfs_inode); + ret = ceph_renew_caps(inode); if (ret == 0) continue; } @@ -2808,9 +2810,9 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, if (ci->i_inline_version != CEPH_INLINE_NONE && (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && - i_size_read(&ci->vfs_inode) > 0) { + i_size_read(inode) > 0) { struct page *page = - find_get_page(ci->vfs_inode.i_mapping, 0); + find_get_page(inode->i_mapping, 0); if (page) { if (PageUptodate(page)) { *pinned_page = page; @@ -2829,7 +2831,7 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, * getattr request will bring inline data into * page cache */ - ret = __ceph_do_getattr(&ci->vfs_inode, NULL, + ret = __ceph_do_getattr(inode, NULL, CEPH_STAT_CAP_INLINE_DATA, true); if (ret < 0) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 75a1d12ec46d..fb007b75fb17 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1262,7 +1262,8 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to) want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; else want = CEPH_CAP_FILE_CACHE; - ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page); + ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1, + &got, &pinned_page); if (ret < 0) return ret; @@ -1459,7 +1460,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) else want = CEPH_CAP_FILE_BUFFER; got = 0; - err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, pos + count, + err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count, &got, NULL); if (err < 0) goto out; @@ -1783,7 +1784,7 @@ static long ceph_fallocate(struct file *file, int mode, else want = CEPH_CAP_FILE_BUFFER; - ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, endoff, &got, NULL); + ret = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, endoff, &got, NULL); if (ret < 0) goto unlock; @@ -1812,16 +1813,15 @@ static long ceph_fallocate(struct file *file, int mode, * src_ci. Two attempts are made to obtain both caps, and an error is return if * this fails; zero is returned on success. */ -static int get_rd_wr_caps(struct ceph_inode_info *src_ci, - loff_t src_endoff, int *src_got, - struct ceph_inode_info *dst_ci, +static int get_rd_wr_caps(struct file *src_filp, int *src_got, + struct file *dst_filp, loff_t dst_endoff, int *dst_got) { int ret = 0; bool retrying = false; retry_caps: - ret = ceph_get_caps(dst_ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, + ret = ceph_get_caps(dst_filp, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, dst_endoff, dst_got, NULL); if (ret < 0) return ret; @@ -1831,24 +1831,24 @@ static int get_rd_wr_caps(struct ceph_inode_info *src_ci, * we would risk a deadlock by using ceph_get_caps. Thus, we'll do some * retry dance instead to try to get both capabilities. */ - ret = ceph_try_get_caps(src_ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_SHARED, + ret = ceph_try_get_caps(file_inode(src_filp), + CEPH_CAP_FILE_RD, CEPH_CAP_FILE_SHARED, false, src_got); if (ret <= 0) { /* Start by dropping dst_ci caps and getting src_ci caps */ - ceph_put_cap_refs(dst_ci, *dst_got); + ceph_put_cap_refs(ceph_inode(file_inode(dst_filp)), *dst_got); if (retrying) { if (!ret) /* ceph_try_get_caps masks EAGAIN */ ret = -EAGAIN; return ret; } - ret = ceph_get_caps(src_ci, CEPH_CAP_FILE_RD, - CEPH_CAP_FILE_SHARED, src_endoff, - src_got, NULL); + ret = ceph_get_caps(src_filp, CEPH_CAP_FILE_RD, + CEPH_CAP_FILE_SHARED, -1, src_got, NULL); if (ret < 0) return ret; /*... drop src_ci caps too, and retry */ - ceph_put_cap_refs(src_ci, *src_got); + ceph_put_cap_refs(ceph_inode(file_inode(src_filp)), *src_got); retrying = true; goto retry_caps; } @@ -1960,8 +1960,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, * clients may have dirty data in their caches. And OSDs know nothing * about caps, so they can't safely do the remote object copies. */ - err = get_rd_wr_caps(src_ci, (src_off + len), &src_got, - dst_ci, (dst_off + len), &dst_got); + err = get_rd_wr_caps(src_file, &src_got, + dst_file, (dst_off + len), &dst_got); if (err < 0) { dout("get_rd_wr_caps returned %d\n", err); ret = -EOPNOTSUPP; @@ -2018,9 +2018,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, goto out; } len -= ret; - err = get_rd_wr_caps(src_ci, (src_off + len), - &src_got, dst_ci, - (dst_off + len), &dst_got); + err = get_rd_wr_caps(src_file, &src_got, + dst_file, (dst_off + len), &dst_got); if (err < 0) goto out; err = is_file_size_ok(src_inode, dst_inode, diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 66dfe5ebf006..1452f62445c3 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1062,9 +1062,9 @@ extern int ceph_encode_dentry_release(void **p, struct dentry *dn, struct inode *dir, int mds, int drop, int unless); -extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, +extern int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got, struct page **pinned_page); -extern int ceph_try_get_caps(struct ceph_inode_info *ci, +extern int ceph_try_get_caps(struct inode *inode, int need, int want, bool nonblock, int *got); /* for counting open files by mode */ @@ -1075,7 +1075,7 @@ extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode); extern const struct address_space_operations ceph_aops; extern int ceph_mmap(struct file *file, struct vm_area_struct *vma); extern int ceph_uninline_data(struct file *filp, struct page *locked_page); -extern int ceph_pool_perm_check(struct ceph_inode_info *ci, int need); +extern int ceph_pool_perm_check(struct inode *inode, int need); extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc); /* file.c */ From d468e729b74eafdfc8306ca8f77e1f26478d67da Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 25 Jul 2019 20:16:44 +0800 Subject: [PATCH 353/721] ceph: add helper function that forcibly reconnects to ceph cluster. It closes mds sessions, drop all caps and invalidates page caches, then use new entity address to reconnect to the cluster. After reconnect, all dirty data/metadata are dropped, file locks get lost sliently. Open files continue to work because client will try renewing caps on later read/write. Signed-off-by: "Yan, Zheng" Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 15 ++++++++++++--- fs/ceph/super.c | 26 +++++++++++++++++++++++++- fs/ceph/super.h | 3 ++- 3 files changed, 39 insertions(+), 5 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index df4bea231017..ed4d20a54af7 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1394,9 +1394,12 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, struct ceph_cap_flush *cf; struct ceph_mds_client *mdsc = fsc->mdsc; - if (ci->i_wrbuffer_ref > 0 && - READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) - invalidate = true; + if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + if (inode->i_data.nrpages > 0) + invalidate = true; + if (ci->i_wrbuffer_ref > 0) + mapping_set_error(&inode->i_data, -EIO); + } while (!list_empty(&ci->i_cap_flush_list)) { cf = list_first_entry(&ci->i_cap_flush_list, @@ -4369,7 +4372,12 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc) session = __ceph_lookup_mds_session(mdsc, mds); if (!session) continue; + + if (session->s_state == CEPH_MDS_SESSION_REJECTED) + __unregister_session(mdsc, session); + __wake_requests(mdsc, &session->s_waiting); mutex_unlock(&mdsc->mutex); + mutex_lock(&session->s_mutex); __close_session(mdsc, session); if (session->s_state == CEPH_MDS_SESSION_CLOSING) { @@ -4378,6 +4386,7 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc) } mutex_unlock(&session->s_mutex); ceph_put_mds_session(session); + mutex_lock(&mdsc->mutex); kick_requests(mdsc, mds); } diff --git a/fs/ceph/super.c b/fs/ceph/super.c index ab4868c7308e..a95dd13bb628 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -829,7 +829,6 @@ static void ceph_umount_begin(struct super_block *sb) fsc->mount_state = CEPH_MOUNT_SHUTDOWN; ceph_osdc_abort_requests(&fsc->client->osdc, -EIO); ceph_mdsc_force_umount(fsc->mdsc); - return; } static int ceph_remount(struct super_block *sb, int *flags, char *data) @@ -1152,6 +1151,31 @@ static struct file_system_type ceph_fs_type = { }; MODULE_ALIAS_FS("ceph"); +int ceph_force_reconnect(struct super_block *sb) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(sb); + int err = 0; + + ceph_umount_begin(sb); + + /* Make sure all page caches get invalidated. + * see remove_session_caps_cb() */ + flush_workqueue(fsc->inode_wq); + + /* In case that we were blacklisted. This also reset + * all mon/osd connections */ + ceph_reset_client_addr(fsc->client); + + ceph_osdc_clear_abort_err(&fsc->client->osdc); + fsc->mount_state = CEPH_MOUNT_MOUNTED; + + if (sb->s_root) { + err = __ceph_do_getattr(d_inode(sb->s_root), NULL, + CEPH_STAT_CAP_INODE, true); + } + return err; +} + static int __init init_ceph(void) { int ret = init_caches(); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 1452f62445c3..7af7f2ba8da5 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -846,7 +846,8 @@ static inline int default_congestion_kb(void) } - +/* super.c */ +extern int ceph_force_reconnect(struct super_block *sb); /* snap.c */ struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, u64 ino); From ff5d913dfc7142974eb1694d5fd6284658e46bc6 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 25 Jul 2019 20:16:45 +0800 Subject: [PATCH 354/721] ceph: return -EIO if read/write against filp that lost file locks After mds evicts session, file locks get lost sliently. It's not safe to let programs continue to do read/write. Signed-off-by: "Yan, Zheng" Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 28 ++++++++++++++++++++++------ fs/ceph/locks.c | 8 ++++++-- fs/ceph/super.h | 1 + 3 files changed, 29 insertions(+), 8 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index bde81aaa3750..102192c90edd 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2570,8 +2570,13 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got, * * FIXME: how does a 0 return differ from -EAGAIN? */ +enum { + NON_BLOCKING = 1, + CHECK_FILELOCK = 2, +}; + static int try_get_cap_refs(struct inode *inode, int need, int want, - loff_t endoff, bool nonblock, int *got) + loff_t endoff, int flags, int *got) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; @@ -2586,6 +2591,13 @@ static int try_get_cap_refs(struct inode *inode, int need, int want, again: spin_lock(&ci->i_ceph_lock); + if ((flags & CHECK_FILELOCK) && + (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK)) { + dout("try_get_cap_refs %p error filelock\n", inode); + ret = -EIO; + goto out_unlock; + } + /* make sure file is actually open */ file_wanted = __ceph_caps_file_wanted(ci); if ((file_wanted & need) != need) { @@ -2647,7 +2659,7 @@ static int try_get_cap_refs(struct inode *inode, int need, int want, * we can not call down_read() when * task isn't in TASK_RUNNING state */ - if (nonblock) { + if (flags & NON_BLOCKING) { ret = -EAGAIN; goto out_unlock; } @@ -2752,7 +2764,8 @@ int ceph_try_get_caps(struct inode *inode, int need, int want, if (ret < 0) return ret; - ret = try_get_cap_refs(inode, need, want, 0, nonblock, got); + ret = try_get_cap_refs(inode, need, want, 0, + (nonblock ? NON_BLOCKING : 0), got); return ret == -EAGAIN ? 0 : ret; } @@ -2764,9 +2777,10 @@ int ceph_try_get_caps(struct inode *inode, int need, int want, int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got, struct page **pinned_page) { + struct ceph_file_info *fi = filp->private_data; struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); - int _got, ret; + int ret, _got, flags; ret = ceph_pool_perm_check(inode, need); if (ret < 0) @@ -2776,17 +2790,19 @@ int ceph_get_caps(struct file *filp, int need, int want, if (endoff > 0) check_max_size(inode, endoff); + flags = atomic_read(&fi->num_locks) ? CHECK_FILELOCK : 0; _got = 0; ret = try_get_cap_refs(inode, need, want, endoff, - false, &_got); + flags, &_got); if (ret == -EAGAIN) continue; if (!ret) { DEFINE_WAIT_FUNC(wait, woken_wake_function); add_wait_queue(&ci->i_cap_wq, &wait); + flags |= NON_BLOCKING; while (!(ret = try_get_cap_refs(inode, need, want, - endoff, true, &_got))) { + endoff, flags, &_got))) { if (signal_pending(current)) { ret = -ERESTARTSYS; break; diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 5083e238ad15..544e9e85b120 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -32,14 +32,18 @@ void __init ceph_flock_init(void) static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src) { - struct inode *inode = file_inode(src->fl_file); + struct ceph_file_info *fi = dst->fl_file->private_data; + struct inode *inode = file_inode(dst->fl_file); atomic_inc(&ceph_inode(inode)->i_filelock_ref); + atomic_inc(&fi->num_locks); } static void ceph_fl_release_lock(struct file_lock *fl) { + struct ceph_file_info *fi = fl->fl_file->private_data; struct inode *inode = file_inode(fl->fl_file); struct ceph_inode_info *ci = ceph_inode(inode); + atomic_dec(&fi->num_locks); if (atomic_dec_and_test(&ci->i_filelock_ref)) { /* clear error when all locks are released */ spin_lock(&ci->i_ceph_lock); @@ -73,7 +77,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode, * window. Caller function will decrease the counter. */ fl->fl_ops = &ceph_fl_lock_ops; - atomic_inc(&ceph_inode(inode)->i_filelock_ref); + fl->fl_ops->fl_copy_lock(fl, NULL); } if (operation != CEPH_MDS_OP_SETFILELOCK || cmd == CEPH_LOCK_UNLOCK) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 7af7f2ba8da5..736318210bc9 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -707,6 +707,7 @@ struct ceph_file_info { struct list_head rw_contexts; errseq_t meta_err; + atomic_t num_locks; }; struct ceph_dir_file_info { From 81f148a910045cd0a139f589a0b42764b172f8f5 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 25 Jul 2019 20:16:46 +0800 Subject: [PATCH 355/721] ceph: invalidate all write mode filp after reconnect Signed-off-by: "Yan, Zheng" Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 13 +++++++++++++ fs/ceph/file.c | 1 + fs/ceph/super.c | 2 ++ fs/ceph/super.h | 3 +++ 4 files changed, 19 insertions(+) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 102192c90edd..d17bde5d4f9a 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2780,12 +2780,17 @@ int ceph_get_caps(struct file *filp, int need, int want, struct ceph_file_info *fi = filp->private_data; struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); int ret, _got, flags; ret = ceph_pool_perm_check(inode, need); if (ret < 0) return ret; + if ((fi->fmode & CEPH_FILE_MODE_WR) && + fi->filp_gen != READ_ONCE(fsc->filp_gen)) + return -EBADF; + while (true) { if (endoff > 0) check_max_size(inode, endoff); @@ -2814,6 +2819,14 @@ int ceph_get_caps(struct file *filp, int need, int want, if (ret == -EAGAIN) continue; } + + if ((fi->fmode & CEPH_FILE_MODE_WR) && + fi->filp_gen != READ_ONCE(fsc->filp_gen)) { + if (ret >= 0 && _got) + ceph_put_cap_refs(ci, _got); + return -EBADF; + } + if (ret < 0) { if (ret == -ESTALE) { /* session was killed, try renew caps */ diff --git a/fs/ceph/file.c b/fs/ceph/file.c index fb007b75fb17..779bf68afd60 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -234,6 +234,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, spin_lock_init(&fi->rw_contexts_lock); INIT_LIST_HEAD(&fi->rw_contexts); fi->meta_err = errseq_sample(&ci->i_meta_err); + fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen); return 0; } diff --git a/fs/ceph/super.c b/fs/ceph/super.c index a95dd13bb628..630549ac4f48 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -664,6 +664,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, fsc->sb = NULL; fsc->mount_state = CEPH_MOUNT_MOUNTING; + fsc->filp_gen = 1; atomic_long_set(&fsc->writeback_count, 0); @@ -829,6 +830,7 @@ static void ceph_umount_begin(struct super_block *sb) fsc->mount_state = CEPH_MOUNT_SHUTDOWN; ceph_osdc_abort_requests(&fsc->client->osdc, -EIO); ceph_mdsc_force_umount(fsc->mdsc); + fsc->filp_gen++; // invalidate open files } static int ceph_remount(struct super_block *sb, int *flags, char *data) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 736318210bc9..f5e5f6a6bfb8 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -101,6 +101,8 @@ struct ceph_fs_client { struct ceph_client *client; unsigned long mount_state; + + u32 filp_gen; loff_t max_file_size; struct ceph_mds_client *mdsc; @@ -707,6 +709,7 @@ struct ceph_file_info { struct list_head rw_contexts; errseq_t meta_err; + u32 filp_gen; atomic_t num_locks; }; From 131d7eb4faa1fc06b08b633aff0b59ae85f1938e Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 25 Jul 2019 20:16:47 +0800 Subject: [PATCH 356/721] ceph: auto reconnect after blacklisted Make client use osd reply and session message to infer if itself is blacklisted. Client reconnect to cluster using new entity addr if it is blacklisted. Auto reconnect is limited to once every 30 minutes. Auto reconnect is disabled by default. It can be enabled/disabled by recover_session= mount option. In 'clean' mode, client drops any dirty data/metadata, invalidates page caches and invalidates all writable file handles. After reconnect, file locks become stale because MDS loses track of them. If an inode contains any stale file locks, read/write on the indoe are not allowed until applications release all stale file locks. Signed-off-by: "Yan, Zheng" Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- Documentation/filesystems/ceph.txt | 14 ++++++++++++ fs/ceph/addr.c | 22 ++++++++++++++----- fs/ceph/file.c | 8 ++++++- fs/ceph/mds_client.c | 34 ++++++++++++++++++++++++++++-- fs/ceph/super.c | 19 +++++++++++++++++ fs/ceph/super.h | 4 ++++ 6 files changed, 93 insertions(+), 8 deletions(-) diff --git a/Documentation/filesystems/ceph.txt b/Documentation/filesystems/ceph.txt index d2c6a5ccf0f5..b19b6a03f91c 100644 --- a/Documentation/filesystems/ceph.txt +++ b/Documentation/filesystems/ceph.txt @@ -158,6 +158,20 @@ Mount Options copies. Currently, it's only used in copy_file_range, which will revert to the default VFS implementation if this option is used. + recover_session= + Set auto reconnect mode in the case where the client is blacklisted. The + available modes are "no" and "clean". The default is "no". + + * no: never attempt to reconnect when client detects that it has been + blacklisted. Operations will generally fail after being blacklisted. + + * clean: client reconnects to the ceph cluster automatically when it + detects that it has been blacklisted. During reconnect, client drops + dirty data/metadata, invalidates page caches and writable file handles. + After reconnect, file locks become stale because the MDS loses track + of them. If an inode contains any stale file locks, read/write on the + inode is not allowed until applications release all stale file locks. + More Information ================ diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 2d6e23e32e30..62602283557f 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -189,8 +189,7 @@ static int ceph_do_readpage(struct file *filp, struct page *page) { struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_osd_client *osdc = - &ceph_inode_to_client(inode)->client->osdc; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); int err = 0; u64 off = page_offset(page); u64 len = PAGE_SIZE; @@ -219,8 +218,8 @@ static int ceph_do_readpage(struct file *filp, struct page *page) dout("readpage inode %p file %p page %p index %lu\n", inode, filp, page, page->index); - err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, - off, &len, + err = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), + &ci->i_layout, off, &len, ci->i_truncate_seq, ci->i_truncate_size, &page, 1, 0); if (err == -ENOENT) @@ -228,6 +227,8 @@ static int ceph_do_readpage(struct file *filp, struct page *page) if (err < 0) { SetPageError(page); ceph_fscache_readpage_cancel(inode, page); + if (err == -EBLACKLISTED) + fsc->blacklisted = true; goto out; } if (err < PAGE_SIZE) @@ -266,6 +267,8 @@ static void finish_read(struct ceph_osd_request *req) int i; dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); + if (rc == -EBLACKLISTED) + ceph_inode_to_client(inode)->blacklisted = true; /* unlock all pages, zeroing any data we didn't read */ osd_data = osd_req_op_extent_osd_data(req, 0); @@ -641,6 +644,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) end_page_writeback(page); return err; } + if (err == -EBLACKLISTED) + fsc->blacklisted = true; dout("writepage setting page/mapping error %d %p\n", err, page); SetPageError(page); @@ -721,6 +726,8 @@ static void writepages_finish(struct ceph_osd_request *req) if (rc < 0) { mapping_set_error(mapping, rc); ceph_set_error_write(ci); + if (rc == -EBLACKLISTED) + fsc->blacklisted = true; } else { ceph_clear_error_write(ci); } @@ -1948,12 +1955,17 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, if (err >= 0 || err == -ENOENT) have |= POOL_READ; - else if (err != -EPERM) + else if (err != -EPERM) { + if (err == -EBLACKLISTED) + fsc->blacklisted = true; goto out_unlock; + } if (err2 == 0 || err2 == -EEXIST) have |= POOL_WRITE; else if (err2 != -EPERM) { + if (err2 == -EBLACKLISTED) + fsc->blacklisted = true; err = err2; goto out_unlock; } diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 779bf68afd60..5182e1a49d6f 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -698,7 +698,13 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, ceph_release_page_vector(pages, num_pages); } - if (ret <= 0 || off >= i_size || !more) + if (ret < 0) { + if (ret == -EBLACKLISTED) + fsc->blacklisted = true; + break; + } + + if (off >= i_size || !more) break; } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index ed4d20a54af7..5bfbff8f2f64 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -3032,18 +3032,23 @@ static void handle_forward(struct ceph_mds_client *mdsc, pr_err("mdsc_handle_forward decode error err=%d\n", err); } -static int __decode_and_drop_session_metadata(void **p, void *end) +static int __decode_session_metadata(void **p, void *end, + bool *blacklisted) { /* map */ u32 n; + bool err_str; ceph_decode_32_safe(p, end, n, bad); while (n-- > 0) { u32 len; ceph_decode_32_safe(p, end, len, bad); ceph_decode_need(p, end, len, bad); + err_str = !strncmp(*p, "error_string", len); *p += len; ceph_decode_32_safe(p, end, len, bad); ceph_decode_need(p, end, len, bad); + if (err_str && strnstr(*p, "blacklisted", len)) + *blacklisted = true; *p += len; } return 0; @@ -3067,6 +3072,7 @@ static void handle_session(struct ceph_mds_session *session, u64 seq; unsigned long features = 0; int wake = 0; + bool blacklisted = false; /* decode */ ceph_decode_need(&p, end, sizeof(*h), bad); @@ -3079,7 +3085,7 @@ static void handle_session(struct ceph_mds_session *session, if (msg_version >= 3) { u32 len; /* version >= 2, metadata */ - if (__decode_and_drop_session_metadata(&p, end) < 0) + if (__decode_session_metadata(&p, end, &blacklisted) < 0) goto bad; /* version >= 3, feature bits */ ceph_decode_32_safe(&p, end, len, bad); @@ -3166,6 +3172,8 @@ static void handle_session(struct ceph_mds_session *session, session->s_state = CEPH_MDS_SESSION_REJECTED; cleanup_session_requests(mdsc, session); remove_session_caps(session); + if (blacklisted) + mdsc->fsc->blacklisted = true; wake = 2; /* for good measure */ break; @@ -4015,7 +4023,27 @@ static void lock_unlock_sessions(struct ceph_mds_client *mdsc) mutex_unlock(&mdsc->mutex); } +static void maybe_recover_session(struct ceph_mds_client *mdsc) +{ + struct ceph_fs_client *fsc = mdsc->fsc; + if (!ceph_test_mount_opt(fsc, CLEANRECOVER)) + return; + + if (READ_ONCE(fsc->mount_state) != CEPH_MOUNT_MOUNTED) + return; + + if (!READ_ONCE(fsc->blacklisted)) + return; + + if (fsc->last_auto_reconnect && + time_before(jiffies, fsc->last_auto_reconnect + HZ * 60 * 30)) + return; + + pr_info("auto reconnect after blacklisted\n"); + fsc->last_auto_reconnect = jiffies; + ceph_force_reconnect(fsc->sb); +} /* * delayed work -- periodically trim expired leases, renew caps with mds @@ -4089,6 +4117,8 @@ static void delayed_work(struct work_struct *work) ceph_trim_snapid_map(mdsc); + maybe_recover_session(mdsc); + schedule_delayed(mdsc); } diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 630549ac4f48..03b63b1cd32c 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -143,6 +143,7 @@ enum { Opt_snapdirname, Opt_mds_namespace, Opt_fscache_uniq, + Opt_recover_session, Opt_last_string, /* string args above */ Opt_dirstat, @@ -184,6 +185,7 @@ static match_table_t fsopt_tokens = { /* int args above */ {Opt_snapdirname, "snapdirname=%s"}, {Opt_mds_namespace, "mds_namespace=%s"}, + {Opt_recover_session, "recover_session=%s"}, {Opt_fscache_uniq, "fsc=%s"}, /* string args above */ {Opt_dirstat, "dirstat"}, @@ -254,6 +256,17 @@ static int parse_fsopt_token(char *c, void *private) if (!fsopt->mds_namespace) return -ENOMEM; break; + case Opt_recover_session: + if (!strncmp(argstr[0].from, "no", + argstr[0].to - argstr[0].from)) { + fsopt->flags &= ~CEPH_MOUNT_OPT_CLEANRECOVER; + } else if (!strncmp(argstr[0].from, "clean", + argstr[0].to - argstr[0].from)) { + fsopt->flags |= CEPH_MOUNT_OPT_CLEANRECOVER; + } else { + return -EINVAL; + } + break; case Opt_fscache_uniq: kfree(fsopt->fscache_uniq); fsopt->fscache_uniq = kstrndup(argstr[0].from, @@ -576,6 +589,10 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) if (fsopt->mds_namespace) seq_show_option(m, "mds_namespace", fsopt->mds_namespace); + + if (fsopt->flags & CEPH_MOUNT_OPT_CLEANRECOVER) + seq_show_option(m, "recover_session", "clean"); + if (fsopt->wsize != CEPH_MAX_WRITE_SIZE) seq_printf(m, ",wsize=%d", fsopt->wsize); if (fsopt->rsize != CEPH_MAX_READ_SIZE) @@ -1169,6 +1186,8 @@ int ceph_force_reconnect(struct super_block *sb) ceph_reset_client_addr(fsc->client); ceph_osdc_clear_abort_err(&fsc->client->osdc); + + fsc->blacklisted = false; fsc->mount_state = CEPH_MOUNT_MOUNTED; if (sb->s_root) { diff --git a/fs/ceph/super.h b/fs/ceph/super.h index f5e5f6a6bfb8..2105c2c7a2a5 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -31,6 +31,7 @@ #define CEPH_BLOCK_SHIFT 22 /* 4 MB */ #define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) +#define CEPH_MOUNT_OPT_CLEANRECOVER (1<<1) /* auto reonnect (clean mode) after blacklisted */ #define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ #define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */ #define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ @@ -102,6 +103,9 @@ struct ceph_fs_client { unsigned long mount_state; + unsigned long last_auto_reconnect; + bool blacklisted; + u32 filp_gen; loff_t max_file_size; From b72b13eb203835a8a07c280939dd17584c6a688e Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 2 Jul 2019 12:35:52 -0400 Subject: [PATCH 357/721] ceph: don't SetPageError on writepage errors We already mark the mapping in that case, and doing this can cause false positives to occur at fsync time, as well as spurious read errors. Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 62602283557f..90e8f8487aaf 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -573,7 +573,7 @@ static u64 get_writepages_data_length(struct inode *inode, /* * Write a single page, but leave the page locked. * - * If we get a write error, set the page error bit, but still adjust the + * If we get a write error, mark the mapping for error, but still adjust the * dirty page accounting (i.e., page is no longer dirty). */ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) @@ -648,7 +648,6 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) fsc->blacklisted = true; dout("writepage setting page/mapping error %d %p\n", err, page); - SetPageError(page); mapping_set_error(&inode->i_data, err); wbc->pages_skipped++; } else { From 5de16b30d3121eca26c8c898cc0eff090e21dda6 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 23 Jul 2019 15:09:50 -0400 Subject: [PATCH 358/721] ceph: remove ceph_get_cap_mds and __ceph_get_cap_mds Nothing calls these routines. Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 31 ------------------------------- fs/ceph/super.h | 1 - 2 files changed, 32 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index d17bde5d4f9a..4615f2501e15 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -457,37 +457,6 @@ struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds) return cap; } -/* - * Return id of any MDS with a cap, preferably FILE_WR|BUFFER|EXCL, else -1. - */ -static int __ceph_get_cap_mds(struct ceph_inode_info *ci) -{ - struct ceph_cap *cap; - int mds = -1; - struct rb_node *p; - - /* prefer mds with WR|BUFFER|EXCL caps */ - for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { - cap = rb_entry(p, struct ceph_cap, ci_node); - mds = cap->mds; - if (cap->issued & (CEPH_CAP_FILE_WR | - CEPH_CAP_FILE_BUFFER | - CEPH_CAP_FILE_EXCL)) - break; - } - return mds; -} - -int ceph_get_cap_mds(struct inode *inode) -{ - struct ceph_inode_info *ci = ceph_inode(inode); - int mds; - spin_lock(&ci->i_ceph_lock); - mds = __ceph_get_cap_mds(ceph_inode(inode)); - spin_unlock(&ci->i_ceph_lock); - return mds; -} - /* * Called under i_ceph_lock. */ diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 2105c2c7a2a5..cf16308800fa 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1052,7 +1052,6 @@ extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds); -extern int ceph_get_cap_mds(struct inode *inode); extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps); extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had); extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, From 606d102327a45a49d293557527802ee7fbfd7af1 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 22 Jul 2019 13:12:01 -0400 Subject: [PATCH 359/721] ceph: fetch cap_gen under spinlock in ceph_add_cap It's protected by the s_gen_ttl_lock, so we should fetch under it and ensure that we're using the same generation in both places. Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 4615f2501e15..bdfec8978479 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -614,6 +614,7 @@ void ceph_add_cap(struct inode *inode, struct ceph_cap *cap; int mds = session->s_mds; int actual_wanted; + u32 gen; dout("add_cap %p mds%d cap %llx %s seq %d\n", inode, session->s_mds, cap_id, ceph_cap_string(issued), seq); @@ -625,6 +626,10 @@ void ceph_add_cap(struct inode *inode, if (fmode >= 0) wanted |= ceph_caps_for_mode(fmode); + spin_lock(&session->s_gen_ttl_lock); + gen = session->s_cap_gen; + spin_unlock(&session->s_gen_ttl_lock); + cap = __get_cap_for_mds(ci, mds); if (!cap) { cap = *new_cap; @@ -650,7 +655,7 @@ void ceph_add_cap(struct inode *inode, list_move_tail(&cap->session_caps, &session->s_caps); spin_unlock(&session->s_cap_lock); - if (cap->cap_gen < session->s_cap_gen) + if (cap->cap_gen < gen) cap->issued = cap->implemented = CEPH_CAP_PIN; /* @@ -744,7 +749,7 @@ void ceph_add_cap(struct inode *inode, cap->seq = seq; cap->issue_seq = seq; cap->mseq = mseq; - cap->cap_gen = session->s_cap_gen; + cap->cap_gen = gen; if (fmode >= 0) __ceph_get_fmode(ci, fmode); From 533a2818dd1a00cdd32d638fea0178e25a683053 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 19 Jul 2019 15:22:28 -0400 Subject: [PATCH 360/721] ceph: eliminate session->s_trim_caps It's only used to keep count of caps being trimmed, but that requires that we hold the session->s_mutex to prevent multiple trimming operations from running concurrently. We can achieve the same effect using an integer on the stack, which allows us to (eventually) not need the s_mutex. Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 17 ++++++++--------- fs/ceph/mds_client.h | 2 +- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 5bfbff8f2f64..959dcf2ab0b8 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -639,7 +639,6 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, s->s_renew_seq = 0; INIT_LIST_HEAD(&s->s_caps); s->s_nr_caps = 0; - s->s_trim_caps = 0; refcount_set(&s->s_ref, 1); INIT_LIST_HEAD(&s->s_waiting); INIT_LIST_HEAD(&s->s_unsafe); @@ -1722,11 +1721,11 @@ static bool drop_negative_children(struct dentry *dentry) */ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) { - struct ceph_mds_session *session = arg; + int *remaining = arg; struct ceph_inode_info *ci = ceph_inode(inode); int used, wanted, oissued, mine; - if (session->s_trim_caps <= 0) + if (*remaining <= 0) return -1; spin_lock(&ci->i_ceph_lock); @@ -1763,7 +1762,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) if (oissued) { /* we aren't the only cap.. just remove us */ __ceph_remove_cap(cap, true); - session->s_trim_caps--; + (*remaining)--; } else { struct dentry *dentry; /* try dropping referring dentries */ @@ -1775,7 +1774,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) d_prune_aliases(inode); count = atomic_read(&inode->i_count); if (count == 1) - session->s_trim_caps--; + (*remaining)--; dout("trim_caps_cb %p cap %p pruned, count now %d\n", inode, cap, count); } else { @@ -1801,12 +1800,12 @@ int ceph_trim_caps(struct ceph_mds_client *mdsc, dout("trim_caps mds%d start: %d / %d, trim %d\n", session->s_mds, session->s_nr_caps, max_caps, trim_caps); if (trim_caps > 0) { - session->s_trim_caps = trim_caps; - ceph_iterate_session_caps(session, trim_caps_cb, session); + int remaining = trim_caps; + + ceph_iterate_session_caps(session, trim_caps_cb, &remaining); dout("trim_caps mds%d done: %d / %d, trimmed %d\n", session->s_mds, session->s_nr_caps, max_caps, - trim_caps - session->s_trim_caps); - session->s_trim_caps = 0; + trim_caps - remaining); } ceph_flush_cap_releases(mdsc, session); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 810fd8689dff..5cd131b41d84 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -176,7 +176,7 @@ struct ceph_mds_session { spinlock_t s_cap_lock; struct list_head s_caps; /* all caps issued by this session */ struct ceph_cap *s_cap_iterator; - int s_nr_caps, s_trim_caps; + int s_nr_caps; int s_num_cap_releases; int s_cap_reconnect; int s_readonly; From 354c63a0033f1ad4bcb208e796f997bfaf946e6d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 19 Jul 2019 09:41:02 -0400 Subject: [PATCH 361/721] ceph: fix comments over ceph_add_cap We actually need the ci->i_ceph_lock here. The necessity of the s_mutex is less clear. Also add a lockdep assertion for the i_ceph_lock. Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index bdfec8978479..2563f42445e6 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -597,7 +597,7 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, /* * Add a capability under the given MDS session. * - * Caller should hold session snap_rwsem (read) and s_mutex. + * Caller should hold session snap_rwsem (read) and ci->i_ceph_lock * * @fmode is the open file mode, if we are opening a file, otherwise * it is < 0. (This is so we can atomically add the cap and add an @@ -616,6 +616,8 @@ void ceph_add_cap(struct inode *inode, int actual_wanted; u32 gen; + lockdep_assert_held(&ci->i_ceph_lock); + dout("add_cap %p mds%d cap %llx %s seq %d\n", inode, session->s_mds, cap_id, ceph_cap_string(issued), seq); From 9f3345d8ec5e226d116ddb7fb86ee71a08a3b9a7 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 8 Jul 2019 12:27:57 -0400 Subject: [PATCH 362/721] ceph: have __mark_caps_flushing return flush_tid Currently, this function returns ci->i_dirty_caps, but the callers have to check that that isn't 0 before calling this function. Have the callers grab that value directly out of the inode, and have __mark_caps_flushing return the flush_tid instead. Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 2563f42445e6..6b8300d72cac 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1722,11 +1722,11 @@ static bool __finish_cap_flush(struct ceph_mds_client *mdsc, * Add dirty inode to the flushing list. Assigned a seq number so we * can wait for caps to flush without starving. * - * Called under i_ceph_lock. + * Called under i_ceph_lock. Returns the flush tid. */ -static int __mark_caps_flushing(struct inode *inode, +static u64 __mark_caps_flushing(struct inode *inode, struct ceph_mds_session *session, bool wake, - u64 *flush_tid, u64 *oldest_flush_tid) + u64 *oldest_flush_tid) { struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); @@ -1765,8 +1765,7 @@ static int __mark_caps_flushing(struct inode *inode, list_add_tail(&cf->i_list, &ci->i_cap_flush_list); - *flush_tid = cf->tid; - return flushing; + return cf->tid; } /* @@ -2056,9 +2055,9 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, } if (cap == ci->i_auth_cap && ci->i_dirty_caps) { - flushing = __mark_caps_flushing(inode, session, false, - &flush_tid, - &oldest_flush_tid); + flushing = ci->i_dirty_caps; + flush_tid = __mark_caps_flushing(inode, session, false, + &oldest_flush_tid); } else { flushing = 0; flush_tid = 0; @@ -2137,8 +2136,9 @@ static int try_flush_caps(struct inode *inode, u64 *ptid) goto retry_locked; } - flushing = __mark_caps_flushing(inode, session, true, - &flush_tid, &oldest_flush_tid); + flushing = ci->i_dirty_caps; + flush_tid = __mark_caps_flushing(inode, session, true, + &oldest_flush_tid); /* __send_cap drops i_ceph_lock */ delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, From 27b0a392095d30e452772d4ae636150fd7f2172d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 5 Jul 2019 13:26:29 -0400 Subject: [PATCH 363/721] ceph: remove unneeded test in try_flush_caps cap->session is always non-NULL, so we can just do a single test for equality w/o testing explicitly for a NULL pointer. Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 6b8300d72cac..bb91abaf7559 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2114,7 +2114,7 @@ static int try_flush_caps(struct inode *inode, u64 *ptid) struct ceph_cap *cap = ci->i_auth_cap; int delayed; - if (!session || session != cap->session) { + if (session != cap->session) { spin_unlock(&ci->i_ceph_lock); if (session) mutex_unlock(&session->s_mutex); From daca8bda95d868b528c2fcf01e3d74c3eaf03889 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 5 Jul 2019 10:55:38 -0400 Subject: [PATCH 364/721] ceph: remove CEPH_I_NOFLUSH Nothing sets this flag. Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 10 ---------- fs/ceph/super.h | 19 +++++++++---------- 2 files changed, 9 insertions(+), 20 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index bb91abaf7559..b1c80d837d0d 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2003,11 +2003,6 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, } ack: - if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { - dout(" skipping %p I_NOFLUSH set\n", inode); - continue; - } - if (session && session != cap->session) { dout("oops, wrong session %p mutex\n", session); mutex_unlock(&session->s_mutex); @@ -2105,11 +2100,6 @@ static int try_flush_caps(struct inode *inode, u64 *ptid) retry: spin_lock(&ci->i_ceph_lock); retry_locked: - if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { - spin_unlock(&ci->i_ceph_lock); - dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode); - goto out; - } if (ci->i_dirty_caps && ci->i_auth_cap) { struct ceph_cap *cap = ci->i_auth_cap; int delayed; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index cf16308800fa..03e4828c7635 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -507,16 +507,15 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, #define CEPH_I_DIR_ORDERED (1 << 0) /* dentries in dir are ordered */ #define CEPH_I_NODELAY (1 << 1) /* do not delay cap release */ #define CEPH_I_FLUSH (1 << 2) /* do not delay flush of dirty metadata */ -#define CEPH_I_NOFLUSH (1 << 3) /* do not flush dirty caps */ -#define CEPH_I_POOL_PERM (1 << 4) /* pool rd/wr bits are valid */ -#define CEPH_I_POOL_RD (1 << 5) /* can read from pool */ -#define CEPH_I_POOL_WR (1 << 6) /* can write to pool */ -#define CEPH_I_SEC_INITED (1 << 7) /* security initialized */ -#define CEPH_I_CAP_DROPPED (1 << 8) /* caps were forcibly dropped */ -#define CEPH_I_KICK_FLUSH (1 << 9) /* kick flushing caps */ -#define CEPH_I_FLUSH_SNAPS (1 << 10) /* need flush snapss */ -#define CEPH_I_ERROR_WRITE (1 << 11) /* have seen write errors */ -#define CEPH_I_ERROR_FILELOCK (1 << 12) /* have seen file lock errors */ +#define CEPH_I_POOL_PERM (1 << 3) /* pool rd/wr bits are valid */ +#define CEPH_I_POOL_RD (1 << 4) /* can read from pool */ +#define CEPH_I_POOL_WR (1 << 5) /* can write to pool */ +#define CEPH_I_SEC_INITED (1 << 6) /* security initialized */ +#define CEPH_I_CAP_DROPPED (1 << 7) /* caps were forcibly dropped */ +#define CEPH_I_KICK_FLUSH (1 << 8) /* kick flushing caps */ +#define CEPH_I_FLUSH_SNAPS (1 << 9) /* need flush snapss */ +#define CEPH_I_ERROR_WRITE (1 << 10) /* have seen write errors */ +#define CEPH_I_ERROR_FILELOCK (1 << 11) /* have seen file lock errors */ /* From 98cd281a76bdf482dd47b282e277c76840ba8eef Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 5 Jul 2019 13:11:51 -0400 Subject: [PATCH 365/721] ceph: remove incorrect comment above __send_cap It doesn't do anything to invalidate the cache when dropping RD caps. Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index b1c80d837d0d..d3b9c9d5c1bd 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1260,10 +1260,6 @@ void __ceph_remove_caps(struct ceph_inode_info *ci) * Make note of max_size reported/requested from mds, revoked caps * that have now been implemented. * - * Make half-hearted attempt ot to invalidate page cache if we are - * dropping RDCACHE. Note that this will leave behind locked pages - * that we'll then need to deal with elsewhere. - * * Return non-zero if delayed release, or we experienced an error * such that the caller should requeue + retry later. * From 0ed26f3693fbe0aec107e2e97a053b3eabebfc92 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 29 Jul 2019 11:33:08 +0200 Subject: [PATCH 366/721] ceph: fix indentation in __get_snap_name() Reported-by: kbuild test robot Reported-by: Julia Lawall Signed-off-by: Ilya Dryomov --- fs/ceph/export.c | 48 ++++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 15ff1b09cfa2..020d39a85ecc 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -458,33 +458,33 @@ static int __get_snap_name(struct dentry *parent, char *name, if (err < 0) goto out; - rinfo = &req->r_reply_info; - for (i = 0; i < rinfo->dir_nr; i++) { - rde = rinfo->dir_entries + i; - BUG_ON(!rde->inode.in); - if (ceph_snap(inode) == - le64_to_cpu(rde->inode.in->snapid)) { - memcpy(name, rde->name, rde->name_len); - name[rde->name_len] = '\0'; - err = 0; - goto out; - } - } + rinfo = &req->r_reply_info; + for (i = 0; i < rinfo->dir_nr; i++) { + rde = rinfo->dir_entries + i; + BUG_ON(!rde->inode.in); + if (ceph_snap(inode) == + le64_to_cpu(rde->inode.in->snapid)) { + memcpy(name, rde->name, rde->name_len); + name[rde->name_len] = '\0'; + err = 0; + goto out; + } + } - if (rinfo->dir_end) - break; + if (rinfo->dir_end) + break; - BUG_ON(rinfo->dir_nr <= 0); - rde = rinfo->dir_entries + (rinfo->dir_nr - 1); - next_offset += rinfo->dir_nr; - last_name = kstrndup(rde->name, rde->name_len, GFP_KERNEL); - if (!last_name) { - err = -ENOMEM; - goto out; - } + BUG_ON(rinfo->dir_nr <= 0); + rde = rinfo->dir_entries + (rinfo->dir_nr - 1); + next_offset += rinfo->dir_nr; + last_name = kstrndup(rde->name, rde->name_len, GFP_KERNEL); + if (!last_name) { + err = -ENOMEM; + goto out; + } - ceph_mdsc_put_request(req); - req = NULL; + ceph_mdsc_put_request(req); + req = NULL; } err = -ENOENT; out: From c62498d7f9d37d5e60d61ca2a4e1f88211af7645 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 25 Jul 2019 13:03:32 -0400 Subject: [PATCH 367/721] ceph: update the mtime when truncating up If we have Fx caps, and the we're truncating the size to be larger, then we'll cache the size attribute change, but the mtime won't be updated. Move the size handling before the mtime, and add ATTR_MTIME to ia_valid in that case to make sure the mtime also gets updated. This fixes xfstest generic/313. Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/inode.c | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 332433b490f5..9f135624ae47 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1989,7 +1989,7 @@ static const struct inode_operations ceph_symlink_iops = { int __ceph_setattr(struct inode *inode, struct iattr *attr) { struct ceph_inode_info *ci = ceph_inode(inode); - const unsigned int ia_valid = attr->ia_valid; + unsigned int ia_valid = attr->ia_valid; struct ceph_mds_request *req; struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_cap_flush *prealloc_cf; @@ -2094,6 +2094,26 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; } } + if (ia_valid & ATTR_SIZE) { + dout("setattr %p size %lld -> %lld\n", inode, + inode->i_size, attr->ia_size); + if ((issued & CEPH_CAP_FILE_EXCL) && + attr->ia_size > inode->i_size) { + i_size_write(inode, attr->ia_size); + inode->i_blocks = calc_inode_blocks(attr->ia_size); + ci->i_reported_size = attr->ia_size; + dirtied |= CEPH_CAP_FILE_EXCL; + ia_valid |= ATTR_MTIME; + } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || + attr->ia_size != inode->i_size) { + req->r_args.setattr.size = cpu_to_le64(attr->ia_size); + req->r_args.setattr.old_size = + cpu_to_le64(inode->i_size); + mask |= CEPH_SETATTR_SIZE; + release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL | + CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; + } + } if (ia_valid & ATTR_MTIME) { dout("setattr %p mtime %lld.%ld -> %lld.%ld\n", inode, inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec, @@ -2116,25 +2136,6 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; } } - if (ia_valid & ATTR_SIZE) { - dout("setattr %p size %lld -> %lld\n", inode, - inode->i_size, attr->ia_size); - if ((issued & CEPH_CAP_FILE_EXCL) && - attr->ia_size > inode->i_size) { - i_size_write(inode, attr->ia_size); - inode->i_blocks = calc_inode_blocks(attr->ia_size); - ci->i_reported_size = attr->ia_size; - dirtied |= CEPH_CAP_FILE_EXCL; - } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || - attr->ia_size != inode->i_size) { - req->r_args.setattr.size = cpu_to_le64(attr->ia_size); - req->r_args.setattr.old_size = - cpu_to_le64(inode->i_size); - mask |= CEPH_SETATTR_SIZE; - release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL | - CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; - } - } /* these do nothing */ if (ia_valid & ATTR_CTIME) { From 249c1df59a508969f5263ac7e3aedfce7f52c556 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 1 Aug 2019 10:06:40 -0400 Subject: [PATCH 368/721] ceph: don't freeze during write page faults Prevent freezing operations during write page faults. This is good practice for most filesystems, but especially for ceph since we're monkeying with the signal table here. Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 90e8f8487aaf..9efd51926792 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1548,6 +1548,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) if (!prealloc_cf) return VM_FAULT_OOM; + sb_start_pagefault(inode->i_sb); ceph_block_sigs(&oldset); if (ci->i_inline_version != CEPH_INLINE_NONE) { @@ -1622,6 +1623,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) ceph_put_cap_refs(ci, got); out_free: ceph_restore_sigs(&oldset); + sb_end_pagefault(inode->i_sb); ceph_free_cap_flush(prealloc_cf); if (err < 0) ret = vmf_error(err); From 3e8730fac951bf89b3adbb040445bb688af1fd22 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Thu, 1 Aug 2019 18:06:58 -0700 Subject: [PATCH 369/721] ceph: don't return a value from void function This fixes a build warning to that effect. Fixes: 1a829ff2a6c3 ("ceph: no need to check return value of debugfs_create functions") Signed-off-by: John Hubbard Signed-off-by: Ilya Dryomov --- fs/ceph/debugfs.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 2eb88ed22993..facb387c2735 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -294,7 +294,6 @@ void ceph_fs_debugfs_init(struct ceph_fs_client *fsc) void ceph_fs_debugfs_init(struct ceph_fs_client *fsc) { - return 0; } void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc) From 4766815b1179dbe4fe263a5f95795710e29276e2 Mon Sep 17 00:00:00 2001 From: David Disseldorp Date: Wed, 3 Jul 2019 17:59:20 +0200 Subject: [PATCH 370/721] libceph: handle OSD op ceph_pagelist_append() errors osd_req_op_cls_init() and osd_req_op_xattr_init() currently propagate ceph_pagelist_alloc() ENOMEM errors but ignore ceph_pagelist_append() memory allocation failures. Add these checks and cleanup on error. Signed-off-by: David Disseldorp Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- net/ceph/osd_client.c | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 6f49c59e5e58..2f44971e1347 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -841,6 +841,7 @@ int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, struct ceph_pagelist *pagelist; size_t payload_len = 0; size_t size; + int ret; op = _osd_req_op_init(osd_req, which, CEPH_OSD_OP_CALL, 0); @@ -852,20 +853,27 @@ int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, size = strlen(class); BUG_ON(size > (size_t) U8_MAX); op->cls.class_len = size; - ceph_pagelist_append(pagelist, class, size); + ret = ceph_pagelist_append(pagelist, class, size); + if (ret) + goto err_pagelist_free; payload_len += size; op->cls.method_name = method; size = strlen(method); BUG_ON(size > (size_t) U8_MAX); op->cls.method_len = size; - ceph_pagelist_append(pagelist, method, size); + ret = ceph_pagelist_append(pagelist, method, size); + if (ret) + goto err_pagelist_free; payload_len += size; osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist); - op->indata_len = payload_len; return 0; + +err_pagelist_free: + ceph_pagelist_release(pagelist); + return ret; } EXPORT_SYMBOL(osd_req_op_cls_init); @@ -877,6 +885,7 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, opcode, 0); struct ceph_pagelist *pagelist; size_t payload_len; + int ret; BUG_ON(opcode != CEPH_OSD_OP_SETXATTR && opcode != CEPH_OSD_OP_CMPXATTR); @@ -886,10 +895,14 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, payload_len = strlen(name); op->xattr.name_len = payload_len; - ceph_pagelist_append(pagelist, name, payload_len); + ret = ceph_pagelist_append(pagelist, name, payload_len); + if (ret) + goto err_pagelist_free; op->xattr.value_len = size; - ceph_pagelist_append(pagelist, value, size); + ret = ceph_pagelist_append(pagelist, value, size); + if (ret) + goto err_pagelist_free; payload_len += size; op->xattr.cmp_op = cmp_op; @@ -898,6 +911,10 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, ceph_osd_data_pagelist_init(&op->xattr.osd_data, pagelist); op->indata_len = payload_len; return 0; + +err_pagelist_free: + ceph_pagelist_release(pagelist); + return ret; } EXPORT_SYMBOL(osd_req_op_xattr_init); From 321fe13c939876b55fbb7780a243c86e577a2151 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 2 Aug 2019 13:15:39 -0400 Subject: [PATCH 371/721] ceph: add buffered/direct exclusionary locking for reads and writes xfstest generic/451 intermittently fails. The test does O_DIRECT writes to a file, and then reads back the result using buffered I/O, while running a separate set of tasks that are also doing buffered reads. The client will invalidate the cache prior to a direct write, but it's easy for one of the other readers' replies to race in and reinstantiate the invalidated range with stale data. To fix this, we must to serialize direct I/O writes and buffered reads. We could just sprinkle in some shared locks on the i_rwsem for reads, and increase the exclusive footprint on the write side, but that would cause O_DIRECT writes to end up serialized vs. other direct requests. Instead, borrow the scheme used by nfs.ko. Buffered writes take the i_rwsem exclusively, but buffered reads take a shared lock, allowing them to run in parallel. O_DIRECT requests also take a shared lock, but we need for them to not run in parallel with buffered reads. A flag on the ceph_inode_info is used to indicate whether it's in direct or buffered I/O mode. When a conflicting request is submitted, it will block until the inode can be flipped to the necessary mode. Link: https://tracker.ceph.com/issues/40985 Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/Makefile | 2 +- fs/ceph/file.c | 35 ++++++---- fs/ceph/io.c | 163 +++++++++++++++++++++++++++++++++++++++++++++++ fs/ceph/io.h | 12 ++++ fs/ceph/super.h | 4 +- 5 files changed, 200 insertions(+), 16 deletions(-) create mode 100644 fs/ceph/io.c create mode 100644 fs/ceph/io.h diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile index a699e320393f..c1da294418d1 100644 --- a/fs/ceph/Makefile +++ b/fs/ceph/Makefile @@ -6,7 +6,7 @@ obj-$(CONFIG_CEPH_FS) += ceph.o ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ - export.o caps.o snap.o xattr.o quota.o \ + export.o caps.o snap.o xattr.o quota.o io.o \ mds_client.o mdsmap.o strings.o ceph_frag.o \ debugfs.o diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 5182e1a49d6f..ff17a81bf2e2 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -15,6 +15,7 @@ #include "super.h" #include "mds_client.h" #include "cache.h" +#include "io.h" static __le32 ceph_flags_sys2wire(u32 flags) { @@ -930,7 +931,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, struct ceph_aio_request *aio_req = NULL; int num_pages = 0; int flags; - int ret; + int ret = 0; struct timespec64 mtime = current_time(inode); size_t count = iov_iter_count(iter); loff_t pos = iocb->ki_pos; @@ -944,11 +945,6 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, (write ? "write" : "read"), file, pos, (unsigned)count, snapc, snapc ? snapc->seq : 0); - ret = filemap_write_and_wait_range(inode->i_mapping, - pos, pos + count - 1); - if (ret < 0) - return ret; - if (write) { int ret2 = invalidate_inode_pages2_range(inode->i_mapping, pos >> PAGE_SHIFT, @@ -1284,12 +1280,16 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to) if (ci->i_inline_version == CEPH_INLINE_NONE) { if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) { + ceph_start_io_direct(inode); ret = ceph_direct_read_write(iocb, to, NULL, NULL); + ceph_end_io_direct(inode); if (ret >= 0 && ret < len) retry_op = CHECK_EOF; } else { + ceph_start_io_read(inode); ret = ceph_sync_read(iocb, to, &retry_op); + ceph_end_io_read(inode); } } else { retry_op = READ_INLINE; @@ -1300,7 +1300,9 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to) inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, ceph_cap_string(got)); ceph_add_rw_context(fi, &rw_ctx); + ceph_start_io_read(inode); ret = generic_file_read_iter(iocb, to); + ceph_end_io_read(inode); ceph_del_rw_context(fi, &rw_ctx); } dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", @@ -1409,7 +1411,10 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) return -ENOMEM; retry_snap: - inode_lock(inode); + if (iocb->ki_flags & IOCB_DIRECT) + ceph_start_io_direct(inode); + else + ceph_start_io_write(inode); /* We can write back this queue in page reclaim */ current->backing_dev_info = inode_to_bdi(inode); @@ -1480,7 +1485,6 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) (ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) { struct ceph_snap_context *snapc; struct iov_iter data; - inode_unlock(inode); spin_lock(&ci->i_ceph_lock); if (__ceph_have_pending_cap_snap(ci)) { @@ -1497,11 +1501,14 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) /* we might need to revert back to that point */ data = *from; - if (iocb->ki_flags & IOCB_DIRECT) + if (iocb->ki_flags & IOCB_DIRECT) { written = ceph_direct_read_write(iocb, &data, snapc, &prealloc_cf); - else + ceph_end_io_direct(inode); + } else { written = ceph_sync_write(iocb, &data, pos, snapc); + ceph_end_io_write(inode); + } if (written > 0) iov_iter_advance(from, written); ceph_put_snap_context(snapc); @@ -1516,7 +1523,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) written = generic_perform_write(file, from, pos); if (likely(written >= 0)) iocb->ki_pos = pos + written; - inode_unlock(inode); + ceph_end_io_write(inode); } if (written >= 0) { @@ -1551,9 +1558,11 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) } goto out_unlocked; - out: - inode_unlock(inode); + if (iocb->ki_flags & IOCB_DIRECT) + ceph_end_io_direct(inode); + else + ceph_end_io_write(inode); out_unlocked: ceph_free_cap_flush(prealloc_cf); current->backing_dev_info = NULL; diff --git a/fs/ceph/io.c b/fs/ceph/io.c new file mode 100644 index 000000000000..97602ea92ff4 --- /dev/null +++ b/fs/ceph/io.c @@ -0,0 +1,163 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2016 Trond Myklebust + * Copyright (c) 2019 Jeff Layton + * + * I/O and data path helper functionality. + * + * Heavily borrowed from equivalent code in fs/nfs/io.c + */ + +#include + +#include +#include +#include +#include + +#include "super.h" +#include "io.h" + +/* Call with exclusively locked inode->i_rwsem */ +static void ceph_block_o_direct(struct ceph_inode_info *ci, struct inode *inode) +{ + lockdep_assert_held_write(&inode->i_rwsem); + + if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT) { + spin_lock(&ci->i_ceph_lock); + ci->i_ceph_flags &= ~CEPH_I_ODIRECT; + spin_unlock(&ci->i_ceph_lock); + inode_dio_wait(inode); + } +} + +/** + * ceph_start_io_read - declare the file is being used for buffered reads + * @inode: file inode + * + * Declare that a buffered read operation is about to start, and ensure + * that we block all direct I/O. + * On exit, the function ensures that the CEPH_I_ODIRECT flag is unset, + * and holds a shared lock on inode->i_rwsem to ensure that the flag + * cannot be changed. + * In practice, this means that buffered read operations are allowed to + * execute in parallel, thanks to the shared lock, whereas direct I/O + * operations need to wait to grab an exclusive lock in order to set + * CEPH_I_ODIRECT. + * Note that buffered writes and truncates both take a write lock on + * inode->i_rwsem, meaning that those are serialised w.r.t. the reads. + */ +void +ceph_start_io_read(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + /* Be an optimist! */ + down_read(&inode->i_rwsem); + if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)) + return; + up_read(&inode->i_rwsem); + /* Slow path.... */ + down_write(&inode->i_rwsem); + ceph_block_o_direct(ci, inode); + downgrade_write(&inode->i_rwsem); +} + +/** + * ceph_end_io_read - declare that the buffered read operation is done + * @inode: file inode + * + * Declare that a buffered read operation is done, and release the shared + * lock on inode->i_rwsem. + */ +void +ceph_end_io_read(struct inode *inode) +{ + up_read(&inode->i_rwsem); +} + +/** + * ceph_start_io_write - declare the file is being used for buffered writes + * @inode: file inode + * + * Declare that a buffered write operation is about to start, and ensure + * that we block all direct I/O. + */ +void +ceph_start_io_write(struct inode *inode) +{ + down_write(&inode->i_rwsem); + ceph_block_o_direct(ceph_inode(inode), inode); +} + +/** + * ceph_end_io_write - declare that the buffered write operation is done + * @inode: file inode + * + * Declare that a buffered write operation is done, and release the + * lock on inode->i_rwsem. + */ +void +ceph_end_io_write(struct inode *inode) +{ + up_write(&inode->i_rwsem); +} + +/* Call with exclusively locked inode->i_rwsem */ +static void ceph_block_buffered(struct ceph_inode_info *ci, struct inode *inode) +{ + lockdep_assert_held_write(&inode->i_rwsem); + + if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)) { + spin_lock(&ci->i_ceph_lock); + ci->i_ceph_flags |= CEPH_I_ODIRECT; + spin_unlock(&ci->i_ceph_lock); + /* FIXME: unmap_mapping_range? */ + filemap_write_and_wait(inode->i_mapping); + } +} + +/** + * ceph_end_io_direct - declare the file is being used for direct i/o + * @inode: file inode + * + * Declare that a direct I/O operation is about to start, and ensure + * that we block all buffered I/O. + * On exit, the function ensures that the CEPH_I_ODIRECT flag is set, + * and holds a shared lock on inode->i_rwsem to ensure that the flag + * cannot be changed. + * In practice, this means that direct I/O operations are allowed to + * execute in parallel, thanks to the shared lock, whereas buffered I/O + * operations need to wait to grab an exclusive lock in order to clear + * CEPH_I_ODIRECT. + * Note that buffered writes and truncates both take a write lock on + * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT. + */ +void +ceph_start_io_direct(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + /* Be an optimist! */ + down_read(&inode->i_rwsem); + if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT) + return; + up_read(&inode->i_rwsem); + /* Slow path.... */ + down_write(&inode->i_rwsem); + ceph_block_buffered(ci, inode); + downgrade_write(&inode->i_rwsem); +} + +/** + * ceph_end_io_direct - declare that the direct i/o operation is done + * @inode: file inode + * + * Declare that a direct I/O operation is done, and release the shared + * lock on inode->i_rwsem. + */ +void +ceph_end_io_direct(struct inode *inode) +{ + up_read(&inode->i_rwsem); +} diff --git a/fs/ceph/io.h b/fs/ceph/io.h new file mode 100644 index 000000000000..fa594cd77348 --- /dev/null +++ b/fs/ceph/io.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _FS_CEPH_IO_H +#define _FS_CEPH_IO_H + +void ceph_start_io_read(struct inode *inode); +void ceph_end_io_read(struct inode *inode); +void ceph_start_io_write(struct inode *inode); +void ceph_end_io_write(struct inode *inode); +void ceph_start_io_direct(struct inode *inode); +void ceph_end_io_direct(struct inode *inode); + +#endif /* FS_CEPH_IO_H */ diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 03e4828c7635..98d7190289c8 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -513,10 +513,10 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, #define CEPH_I_SEC_INITED (1 << 6) /* security initialized */ #define CEPH_I_CAP_DROPPED (1 << 7) /* caps were forcibly dropped */ #define CEPH_I_KICK_FLUSH (1 << 8) /* kick flushing caps */ -#define CEPH_I_FLUSH_SNAPS (1 << 9) /* need flush snapss */ +#define CEPH_I_FLUSH_SNAPS (1 << 9) /* need flush snapss */ #define CEPH_I_ERROR_WRITE (1 << 10) /* have seen write errors */ #define CEPH_I_ERROR_FILELOCK (1 << 11) /* have seen file lock errors */ - +#define CEPH_I_ODIRECT (1 << 12) /* inode in direct I/O mode */ /* * Masks of ceph inode work. From 668959a53578076d836905c0bb51f5cfadf1c343 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 6 Aug 2019 09:07:51 -0400 Subject: [PATCH 372/721] ceph: turn ceph_security_invalidate_secctx into static inline No need to do an extra jump here. Also add some comments on the endifs. Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/super.h | 6 +++++- fs/ceph/xattr.c | 9 ++------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 98d7190289c8..f98d9247f9cb 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -16,6 +16,7 @@ #include #include #include +#include #include @@ -971,7 +972,10 @@ static inline bool ceph_security_xattr_wanted(struct inode *in) #ifdef CONFIG_CEPH_FS_SECURITY_LABEL extern int ceph_security_init_secctx(struct dentry *dentry, umode_t mode, struct ceph_acl_sec_ctx *ctx); -extern void ceph_security_invalidate_secctx(struct inode *inode); +static inline void ceph_security_invalidate_secctx(struct inode *inode) +{ + security_inode_invalidate_secctx(inode); +} #else static inline int ceph_security_init_secctx(struct dentry *dentry, umode_t mode, struct ceph_acl_sec_ctx *ctx) diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 2fba06b50f25..5c608caf0190 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -1265,11 +1265,6 @@ int ceph_security_init_secctx(struct dentry *dentry, umode_t mode, return err; } -void ceph_security_invalidate_secctx(struct inode *inode) -{ - security_inode_invalidate_secctx(inode); -} - static int ceph_xattr_set_security_label(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *key, const void *buf, @@ -1298,8 +1293,8 @@ static const struct xattr_handler ceph_security_label_handler = { .get = ceph_xattr_get_security_label, .set = ceph_xattr_set_security_label, }; -#endif -#endif +#endif /* CONFIG_CEPH_FS_SECURITY_LABEL */ +#endif /* CONFIG_SECURITY */ void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx) { From 026105ebb0364fb2155ef3eba65f250efc7aeb45 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 6 Aug 2019 10:41:40 -0400 Subject: [PATCH 373/721] ceph: only set CEPH_I_SEC_INITED if we got a MAC label __ceph_getxattr will set the CEPH_I_SEC_INITED flag whenever it gets any xattr that starts with "security.". We only want to set that flag when fetching the MAC label for the currently-active LSM, however. Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/xattr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 5c608caf0190..410eaf1ba211 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -892,7 +892,8 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, memcpy(value, xattr->val, xattr->val_len); if (current->journal_info && - !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) + !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) && + security_ismaclabel(name + XATTR_SECURITY_PREFIX_LEN)) ci->i_ceph_flags |= CEPH_I_SEC_INITED; out: spin_unlock(&ci->i_ceph_lock); From b8fe918b090442447d821b32b7dd6e17d5b5dfc1 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 6 Aug 2019 10:55:41 -0400 Subject: [PATCH 374/721] ceph: allow arbitrary security.* xattrs Most filesystems don't limit what security.* xattrs can be set or fetched. I see no reason that we need to limit that on cephfs either. Drop the special xattr handler for "security." xattrs, and allow the "other" xattr handler to handle security xattrs as well. In addition to fixing xfstest generic/093, this allows us to support per-file capabilities (a'la setcap(8)). Link: https://tracker.ceph.com/issues/41135 Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/xattr.c | 35 ++--------------------------------- 1 file changed, 2 insertions(+), 33 deletions(-) diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 410eaf1ba211..cb18ee637cb7 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -20,7 +20,8 @@ static int __remove_xattr(struct ceph_inode_info *ci, static bool ceph_is_valid_xattr(const char *name) { - return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) || + return !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) || + !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) || !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); } @@ -1265,35 +1266,6 @@ int ceph_security_init_secctx(struct dentry *dentry, umode_t mode, ceph_pagelist_release(pagelist); return err; } - -static int ceph_xattr_set_security_label(const struct xattr_handler *handler, - struct dentry *unused, struct inode *inode, - const char *key, const void *buf, - size_t buflen, int flags) -{ - if (security_ismaclabel(key)) { - const char *name = xattr_full_name(handler, key); - return __ceph_setxattr(inode, name, buf, buflen, flags); - } - return -EOPNOTSUPP; -} - -static int ceph_xattr_get_security_label(const struct xattr_handler *handler, - struct dentry *unused, struct inode *inode, - const char *key, void *buf, size_t buflen) -{ - if (security_ismaclabel(key)) { - const char *name = xattr_full_name(handler, key); - return __ceph_getxattr(inode, name, buf, buflen); - } - return -EOPNOTSUPP; -} - -static const struct xattr_handler ceph_security_label_handler = { - .prefix = XATTR_SECURITY_PREFIX, - .get = ceph_xattr_get_security_label, - .set = ceph_xattr_set_security_label, -}; #endif /* CONFIG_CEPH_FS_SECURITY_LABEL */ #endif /* CONFIG_SECURITY */ @@ -1318,9 +1290,6 @@ const struct xattr_handler *ceph_xattr_handlers[] = { #ifdef CONFIG_CEPH_FS_POSIX_ACL &posix_acl_access_xattr_handler, &posix_acl_default_xattr_handler, -#endif -#ifdef CONFIG_CEPH_FS_SECURITY_LABEL - &ceph_security_label_handler, #endif &ceph_other_xattr_handler, NULL, From 5435d2069503e2aa89c34a94154f4f2fa4a0c9c4 Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Fri, 9 Aug 2019 07:05:27 +0000 Subject: [PATCH 375/721] rbd: fix response length parameter for encoded strings rbd_dev_image_id() allocates space for length but passes a smaller value to rbd_obj_method_sync(). rbd_dev_v2_object_prefix() doesn't allocate space for length. Fix both to be consistent. Signed-off-by: Dongsheng Yang Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index c8fb886aebd4..69db7385c8df 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -5669,17 +5669,20 @@ static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev) static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) { + size_t size; void *reply_buf; int ret; void *p; - reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL); + /* Response will be an encoded string, which includes a length */ + size = sizeof(__le32) + RBD_OBJ_PREFIX_LEN_MAX; + reply_buf = kzalloc(size, GFP_KERNEL); if (!reply_buf) return -ENOMEM; ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, &rbd_dev->header_oloc, "get_object_prefix", - NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX); + NULL, 0, reply_buf, size); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) goto out; @@ -6696,7 +6699,6 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) dout("rbd id object name is %s\n", oid.name); /* Response will be an encoded string, which includes a length */ - size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX; response = kzalloc(size, GFP_NOIO); if (!response) { @@ -6708,7 +6710,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc, "get_id", NULL, 0, - response, RBD_IMAGE_ID_LEN_MAX); + response, size); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret == -ENOENT) { image_id = kstrdup("", GFP_KERNEL); From 96ac9158a230e467d4be737c82702355c6838fc4 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Thu, 8 Aug 2019 20:56:47 -0700 Subject: [PATCH 376/721] ceph: use release_pages() directly release_pages() has been available to modules since Oct, 2010, when commit 0be8557bcd34 ("fuse: use release_pages()") added EXPORT_SYMBOL(release_pages). However, this ceph code was still using a workaround. Remove the workaround, and call release_pages() directly. Signed-off-by: John Hubbard Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 9efd51926792..7ab616601141 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -684,23 +684,6 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc) return err; } -/* - * lame release_pages helper. release_pages() isn't exported to - * modules. - */ -static void ceph_release_pages(struct page **pages, int num) -{ - struct pagevec pvec; - int i; - - pagevec_init(&pvec); - for (i = 0; i < num; i++) { - if (pagevec_add(&pvec, pages[i]) == 0) - pagevec_release(&pvec); - } - pagevec_release(&pvec); -} - /* * async writeback completion handler. * @@ -776,7 +759,7 @@ static void writepages_finish(struct ceph_osd_request *req) dout("writepages_finish %p wrote %llu bytes cleaned %d pages\n", inode, osd_data->length, rc >= 0 ? num_pages : 0); - ceph_release_pages(osd_data->pages, num_pages); + release_pages(osd_data->pages, num_pages); } ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc); From 8edf84ba4d0ecad7d1c5b393ba004b959dae76ea Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 21 Aug 2019 13:57:18 +0200 Subject: [PATCH 377/721] libceph: drop unused con parameter of calc_target() This bit was omitted from a561372405cf ("libceph: fix PG split vs OSD (re)connect race") to avoid backport conflicts. Signed-off-by: Ilya Dryomov --- net/ceph/osd_client.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 2f44971e1347..ba45b074a362 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1505,7 +1505,6 @@ enum calc_target_result { static enum calc_target_result calc_target(struct ceph_osd_client *osdc, struct ceph_osd_request_target *t, - struct ceph_connection *con, bool any_change) { struct ceph_pg_pool_info *pi; @@ -2289,7 +2288,7 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked) dout("%s req %p wrlocked %d\n", __func__, req, wrlocked); again: - ct_res = calc_target(osdc, &req->r_t, NULL, false); + ct_res = calc_target(osdc, &req->r_t, false); if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked) goto promote; @@ -3112,7 +3111,7 @@ static void linger_submit(struct ceph_osd_linger_request *lreq) lreq->reg_req->r_ops[0].notify.cookie = lreq->linger_id; } - calc_target(osdc, &lreq->t, NULL, false); + calc_target(osdc, &lreq->t, false); osd = lookup_create_osd(osdc, lreq->t.osd, true); link_linger(osd, lreq); @@ -3729,7 +3728,7 @@ recalc_linger_target(struct ceph_osd_linger_request *lreq) struct ceph_osd_client *osdc = lreq->osdc; enum calc_target_result ct_res; - ct_res = calc_target(osdc, &lreq->t, NULL, true); + ct_res = calc_target(osdc, &lreq->t, true); if (ct_res == CALC_TARGET_NEED_RESEND) { struct ceph_osd *osd; @@ -3801,8 +3800,7 @@ static void scan_requests(struct ceph_osd *osd, n = rb_next(n); /* unlink_request(), check_pool_dne() */ dout("%s req %p tid %llu\n", __func__, req, req->r_tid); - ct_res = calc_target(osdc, &req->r_t, &req->r_osd->o_con, - false); + ct_res = calc_target(osdc, &req->r_t, false); switch (ct_res) { case CALC_TARGET_NO_ACTION: force_resend_writes = cleared_full || @@ -3911,7 +3909,7 @@ static void kick_requests(struct ceph_osd_client *osdc, n = rb_next(n); if (req->r_t.epoch < osdc->osdmap->epoch) { - ct_res = calc_target(osdc, &req->r_t, NULL, false); + ct_res = calc_target(osdc, &req->r_t, false); if (ct_res == CALC_TARGET_POOL_DNE) { erase_request(need_resend, req); check_pool_dne(req); From 71a228bc8d65900179e37ac309e678f8c523f133 Mon Sep 17 00:00:00 2001 From: Erqi Chen Date: Wed, 28 Aug 2019 21:22:45 +0800 Subject: [PATCH 378/721] ceph: reconnect connection if session hang in opening state If client mds session is evicted in CEPH_MDS_SESSION_OPENING state, mds won't send session msg to client, and delayed_work skip CEPH_MDS_SESSION_OPENING state session, the session hang forever. Allow ceph_con_keepalive to reconnect a session in OPENING to avoid session hang. Also, ensure that we skip sessions in RESTARTING and REJECTED states since those states can't be resurrected by issuing a keepalive. Link: https://tracker.ceph.com/issues/41551 Signed-off-by: Erqi Chen chenerqi@gmail.com Reviewed-by: "Yan, Zheng" Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 959dcf2ab0b8..a8a8f84f3bbf 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -4088,7 +4088,9 @@ static void delayed_work(struct work_struct *work) pr_info("mds%d hung\n", s->s_mds); } } - if (s->s_state < CEPH_MDS_SESSION_OPEN) { + if (s->s_state == CEPH_MDS_SESSION_NEW || + s->s_state == CEPH_MDS_SESSION_RESTARTING || + s->s_state == CEPH_MDS_SESSION_REJECTED) { /* this mds is failed or recovering, just wait */ ceph_put_mds_session(s); continue; From 21ed05a8bae76b5163996e0bbcaa35cd9eff41d7 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 30 Aug 2019 17:31:06 +0200 Subject: [PATCH 379/721] rbd: pull rbd_img_request_create() dout out into the callers Make it more informative: log op_type, offset and length for block layer requests and initiating obj_req for child requests. Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 69db7385c8df..7c4350c0fb77 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1754,8 +1754,6 @@ static struct rbd_img_request *rbd_img_request_create( mutex_init(&img_request->state_mutex); kref_init(&img_request->kref); - dout("%s: rbd_dev %p %s -> img %p\n", __func__, rbd_dev, - obj_op_name(op_type), img_request); return img_request; } @@ -2944,6 +2942,9 @@ static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req) __set_bit(IMG_REQ_CHILD, &child_img_req->flags); child_img_req->obj_request = obj_req; + dout("%s child_img_req %p for obj_req %p\n", __func__, child_img_req, + obj_req); + if (!rbd_img_is_write(img_req)) { switch (img_req->data_type) { case OBJ_REQUEST_BIO: @@ -4877,6 +4878,9 @@ static void rbd_queue_workfn(struct work_struct *work) img_request->rq = rq; snapc = NULL; /* img_request consumes a ref */ + dout("%s rbd_dev %p img_req %p %s %llu~%llu\n", __func__, rbd_dev, + img_request, obj_op_name(op_type), offset, length); + if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT) result = rbd_img_fill_nodata(img_request, offset, length); else From 536cc331a4a714289334f0a0ce84318bb7e680d4 Mon Sep 17 00:00:00 2001 From: Krzysztof Wilczynski Date: Sat, 31 Aug 2019 23:57:12 +0200 Subject: [PATCH 380/721] ceph: move static keyword to the front of declarations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the static keyword to the front of declarations of snap_handle_length, handle_length and connected_handle_length, and resolve the following compiler warnings that can be seen when building with warnings enabled (W=1): fs/ceph/export.c:38:2: warning: ‘static’ is not at beginning of declaration [-Wold-style-declaration] fs/ceph/export.c:88:2: warning: ‘static’ is not at beginning of declaration [-Wold-style-declaration] fs/ceph/export.c:90:2: warning: ‘static’ is not at beginning of declaration [-Wold-style-declaration] Signed-off-by: Krzysztof Wilczynski Signed-off-by: Ilya Dryomov --- fs/ceph/export.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 020d39a85ecc..b6bfa94332c3 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -35,7 +35,7 @@ struct ceph_nfs_snapfh { static int ceph_encode_snapfh(struct inode *inode, u32 *rawfh, int *max_len, struct inode *parent_inode) { - const static int snap_handle_length = + static const int snap_handle_length = sizeof(struct ceph_nfs_snapfh) >> 2; struct ceph_nfs_snapfh *sfh = (void *)rawfh; u64 snapid = ceph_snap(inode); @@ -85,9 +85,9 @@ static int ceph_encode_snapfh(struct inode *inode, u32 *rawfh, int *max_len, static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, struct inode *parent_inode) { - const static int handle_length = + static const int handle_length = sizeof(struct ceph_nfs_fh) >> 2; - const static int connected_handle_length = + static const int connected_handle_length = sizeof(struct ceph_nfs_confh) >> 2; int type; From 48f930ea6de6f48fd20be54ec4716c545751a6d9 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 5 Sep 2019 17:29:29 +0200 Subject: [PATCH 381/721] ceph: include ceph_debug.h in cache.c Any file that uses dout() should include ceph_debug.h at the top. Signed-off-by: Ilya Dryomov --- fs/ceph/cache.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index bc90cf6ad7ed..b2ec29eeb4c4 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -6,6 +6,8 @@ * Written by Milosz Tanski (milosz@adfin.com) */ +#include + #include "super.h" #include "cache.h" From 6fd4e634835208ddb331234bfa51d75396a5c42c Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Mon, 9 Sep 2019 16:48:54 +0100 Subject: [PATCH 382/721] ceph: allow object copies across different filesystems in the same cluster OSDs are able to perform object copies across different pools. Thus, there's no need to prevent copy_file_range from doing remote copies if the source and destination superblocks are different. Only return -EXDEV if they have different fsid (the cluster ID). Signed-off-by: Luis Henriques Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index ff17a81bf2e2..d277f71abe0b 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1922,6 +1922,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, struct ceph_inode_info *src_ci = ceph_inode(src_inode); struct ceph_inode_info *dst_ci = ceph_inode(dst_inode); struct ceph_cap_flush *prealloc_cf; + struct ceph_fs_client *src_fsc = ceph_inode_to_client(src_inode); struct ceph_object_locator src_oloc, dst_oloc; struct ceph_object_id src_oid, dst_oid; loff_t endoff = 0, size; @@ -1931,8 +1932,16 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, int src_got = 0, dst_got = 0, err, dirty; bool do_final_copy = false; - if (src_inode->i_sb != dst_inode->i_sb) - return -EXDEV; + if (src_inode->i_sb != dst_inode->i_sb) { + struct ceph_fs_client *dst_fsc = ceph_inode_to_client(dst_inode); + + if (ceph_fsid_compare(&src_fsc->client->fsid, + &dst_fsc->client->fsid)) { + dout("Copying files across clusters: src: %pU dst: %pU\n", + &src_fsc->client->fsid, &dst_fsc->client->fsid); + return -EXDEV; + } + } if (ceph_snap(dst_inode) != CEPH_NOSNAP) return -EROFS; @@ -1944,7 +1953,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, * efficient). */ - if (ceph_test_mount_opt(ceph_inode_to_client(src_inode), NOCOPYFROM)) + if (ceph_test_mount_opt(src_fsc, NOCOPYFROM)) return -EOPNOTSUPP; if ((src_ci->i_layout.stripe_unit != dst_ci->i_layout.stripe_unit) || @@ -2059,7 +2068,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, dst_ci->i_vino.ino, dst_objnum); /* Do an object remote copy */ err = ceph_osdc_copy_from( - &ceph_inode_to_client(src_inode)->client->osdc, + &src_fsc->client->osdc, src_ci->i_vino.snap, 0, &src_oid, &src_oloc, CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | From 10c12851a022662bf6085bd4384b4ebed4c447ce Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 30 Aug 2019 20:44:24 +0200 Subject: [PATCH 383/721] libceph: avoid a __vmalloc() deadlock in ceph_kvmalloc() The vmalloc allocator doesn't fully respect the specified gfp mask: while the actual pages are allocated as requested, the page table pages are always allocated with GFP_KERNEL. ceph_kvmalloc() may be called with GFP_NOFS and GFP_NOIO (for ceph and rbd respectively), so this may result in a deadlock. There is no real reason for the current PAGE_ALLOC_COSTLY_ORDER logic, it's just something that seemed sensible at the time (ceph_kvmalloc() predates kvmalloc()). kvmalloc() is smarter: in an attempt to reduce long term fragmentation, it first tries to kmalloc non-disruptively. Switch to kvmalloc() and set the respective PF_MEMALLOC_* flag using the scope API to avoid the deadlock. Note that kvmalloc() needs to be passed GFP_KERNEL to enable the fallback. Signed-off-by: Ilya Dryomov Reviewed-by: Jeff Layton --- net/ceph/ceph_common.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index b412a3ccc4fc..2d568246803f 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -185,18 +186,34 @@ int ceph_compare_options(struct ceph_options *new_opt, } EXPORT_SYMBOL(ceph_compare_options); +/* + * kvmalloc() doesn't fall back to the vmalloc allocator unless flags are + * compatible with (a superset of) GFP_KERNEL. This is because while the + * actual pages are allocated with the specified flags, the page table pages + * are always allocated with GFP_KERNEL. map_vm_area() doesn't even take + * flags because GFP_KERNEL is hard-coded in {p4d,pud,pmd,pte}_alloc(). + * + * ceph_kvmalloc() may be called with GFP_KERNEL, GFP_NOFS or GFP_NOIO. + */ void *ceph_kvmalloc(size_t size, gfp_t flags) { - if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { - void *ptr = kmalloc(size, flags | __GFP_NOWARN); - if (ptr) - return ptr; + void *p; + + if ((flags & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) { + p = kvmalloc(size, flags); + } else if ((flags & (__GFP_IO | __GFP_FS)) == __GFP_IO) { + unsigned int nofs_flag = memalloc_nofs_save(); + p = kvmalloc(size, GFP_KERNEL); + memalloc_nofs_restore(nofs_flag); + } else { + unsigned int noio_flag = memalloc_noio_save(); + p = kvmalloc(size, GFP_KERNEL); + memalloc_noio_restore(noio_flag); } - return __vmalloc(size, flags, PAGE_KERNEL); + return p; } - static int parse_fsid(const char *str, struct ceph_fsid *fsid) { int i = 0; From cf73d882cc51c1f245a890cccb79952a260302d3 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 4 Sep 2019 15:40:03 +0200 Subject: [PATCH 384/721] libceph: use ceph_kvmalloc() for osdmap arrays osdmap has a bunch of arrays that grow linearly with the number of OSDs. osd_state, osd_weight and osd_primary_affinity take 4 bytes per OSD. osd_addr takes 136 bytes per OSD because of sockaddr_storage. The CRUSH workspace area also grows linearly with the number of OSDs. Normally these arrays are allocated at client startup. The osdmap is usually updated in small incrementals, but once in a while a full map may need to be processed. For a cluster with 10000 OSDs, this means a bunch of 40K allocations followed by a 1.3M allocation, all of which are currently required to be physically contiguous. This results in sporadic ENOMEM errors, hanging the client. Go back to manually (re)allocating arrays and use ceph_kvmalloc() to fall back to non-contiguous allocation when necessary. Link: https://tracker.ceph.com/issues/40481 Signed-off-by: Ilya Dryomov Reviewed-by: Jeff Layton --- net/ceph/osdmap.c | 69 +++++++++++++++++++++++++++++------------------ 1 file changed, 43 insertions(+), 26 deletions(-) diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 90437906b7bc..4e0de14f80bb 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -973,11 +973,11 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) struct ceph_pg_pool_info, node); __remove_pg_pool(&map->pg_pools, pi); } - kfree(map->osd_state); - kfree(map->osd_weight); - kfree(map->osd_addr); - kfree(map->osd_primary_affinity); - kfree(map->crush_workspace); + kvfree(map->osd_state); + kvfree(map->osd_weight); + kvfree(map->osd_addr); + kvfree(map->osd_primary_affinity); + kvfree(map->crush_workspace); kfree(map); } @@ -986,28 +986,41 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) * * The new elements are properly initialized. */ -static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) +static int osdmap_set_max_osd(struct ceph_osdmap *map, u32 max) { u32 *state; u32 *weight; struct ceph_entity_addr *addr; + u32 to_copy; int i; - state = krealloc(map->osd_state, max*sizeof(*state), GFP_NOFS); - if (!state) + dout("%s old %u new %u\n", __func__, map->max_osd, max); + if (max == map->max_osd) + return 0; + + state = ceph_kvmalloc(array_size(max, sizeof(*state)), GFP_NOFS); + weight = ceph_kvmalloc(array_size(max, sizeof(*weight)), GFP_NOFS); + addr = ceph_kvmalloc(array_size(max, sizeof(*addr)), GFP_NOFS); + if (!state || !weight || !addr) { + kvfree(state); + kvfree(weight); + kvfree(addr); return -ENOMEM; + } + + to_copy = min(map->max_osd, max); + if (map->osd_state) { + memcpy(state, map->osd_state, to_copy * sizeof(*state)); + memcpy(weight, map->osd_weight, to_copy * sizeof(*weight)); + memcpy(addr, map->osd_addr, to_copy * sizeof(*addr)); + kvfree(map->osd_state); + kvfree(map->osd_weight); + kvfree(map->osd_addr); + } + map->osd_state = state; - - weight = krealloc(map->osd_weight, max*sizeof(*weight), GFP_NOFS); - if (!weight) - return -ENOMEM; map->osd_weight = weight; - - addr = krealloc(map->osd_addr, max*sizeof(*addr), GFP_NOFS); - if (!addr) - return -ENOMEM; map->osd_addr = addr; - for (i = map->max_osd; i < max; i++) { map->osd_state[i] = 0; map->osd_weight[i] = CEPH_OSD_OUT; @@ -1017,12 +1030,16 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) if (map->osd_primary_affinity) { u32 *affinity; - affinity = krealloc(map->osd_primary_affinity, - max*sizeof(*affinity), GFP_NOFS); + affinity = ceph_kvmalloc(array_size(max, sizeof(*affinity)), + GFP_NOFS); if (!affinity) return -ENOMEM; - map->osd_primary_affinity = affinity; + memcpy(affinity, map->osd_primary_affinity, + to_copy * sizeof(*affinity)); + kvfree(map->osd_primary_affinity); + + map->osd_primary_affinity = affinity; for (i = map->max_osd; i < max; i++) map->osd_primary_affinity[i] = CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; @@ -1043,7 +1060,7 @@ static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush) work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE); dout("%s work_size %zu bytes\n", __func__, work_size); - workspace = kmalloc(work_size, GFP_NOIO); + workspace = ceph_kvmalloc(work_size, GFP_NOIO); if (!workspace) { crush_destroy(crush); return -ENOMEM; @@ -1052,7 +1069,7 @@ static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush) if (map->crush) crush_destroy(map->crush); - kfree(map->crush_workspace); + kvfree(map->crush_workspace); map->crush = crush; map->crush_workspace = workspace; return 0; @@ -1298,9 +1315,9 @@ static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff) if (!map->osd_primary_affinity) { int i; - map->osd_primary_affinity = kmalloc_array(map->max_osd, - sizeof(u32), - GFP_NOFS); + map->osd_primary_affinity = ceph_kvmalloc( + array_size(map->max_osd, sizeof(*map->osd_primary_affinity)), + GFP_NOFS); if (!map->osd_primary_affinity) return -ENOMEM; @@ -1321,7 +1338,7 @@ static int decode_primary_affinity(void **p, void *end, ceph_decode_32_safe(p, end, len, e_inval); if (len == 0) { - kfree(map->osd_primary_affinity); + kvfree(map->osd_primary_affinity); map->osd_primary_affinity = NULL; return 0; } From 3ee5a7015c8b7cb4de21f7345f8381946f2fce55 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 12 Sep 2019 08:07:56 -0400 Subject: [PATCH 385/721] ceph: call ceph_mdsc_destroy from destroy_fs_client They're always called in succession. Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/super.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 03b63b1cd32c..7173679c8ed7 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -731,6 +731,7 @@ static void destroy_fs_client(struct ceph_fs_client *fsc) { dout("destroy_fs_client %p\n", fsc); + ceph_mdsc_destroy(fsc); destroy_workqueue(fsc->inode_wq); destroy_workqueue(fsc->cap_wq); @@ -1105,7 +1106,6 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, } if (ceph_sb_to_client(sb) != fsc) { - ceph_mdsc_destroy(fsc); destroy_fs_client(fsc); fsc = ceph_sb_to_client(sb); dout("get_sb got existing client %p\n", fsc); @@ -1131,7 +1131,6 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, goto out_final; out: - ceph_mdsc_destroy(fsc); destroy_fs_client(fsc); out_final: dout("ceph_mount fail %ld\n", PTR_ERR(res)); @@ -1155,8 +1154,6 @@ static void ceph_kill_sb(struct super_block *s) ceph_fscache_unregister_fs(fsc); - ceph_mdsc_destroy(fsc); - destroy_fs_client(fsc); free_anon_bdev(dev); } From 4a36a60c34f42f75e8b4f8cd24fcfade26111334 Mon Sep 17 00:00:00 2001 From: Jonathan Chocron Date: Thu, 12 Sep 2019 16:00:39 +0300 Subject: [PATCH 386/721] PCI: Add Amazon's Annapurna Labs vendor ID Add Amazon's Annapurna Labs vendor ID to pci_ids.h. Signed-off-by: Jonathan Chocron Signed-off-by: Lorenzo Pieralisi Reviewed-by: Andrew Murray Acked-by: Bjorn Helgaas --- include/linux/pci_ids.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index c842735a4f45..5ce83544f38b 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2570,6 +2570,8 @@ #define PCI_VENDOR_ID_ASMEDIA 0x1b21 +#define PCI_VENDOR_ID_AMAZON_ANNAPURNA_LABS 0x1c36 + #define PCI_VENDOR_ID_CIRCUITCO 0x1cc8 #define PCI_SUBSYSTEM_ID_CIRCUITCO_MINNOWBOARD 0x0001 From 76e67e9e0f0f2debf9df192502a759bdd0cb4dab Mon Sep 17 00:00:00 2001 From: Ali Saidi Date: Thu, 12 Sep 2019 16:00:40 +0300 Subject: [PATCH 387/721] PCI: Add ACS quirk for Amazon Annapurna Labs root ports The Amazon's Annapurna Labs root ports don't advertise an ACS capability, but they don't allow peer-to-peer transactions and do validate bus numbers through the SMMU. Additionally, it's not possible for one RP to pass traffic to another RP. Signed-off-by: Ali Saidi Signed-off-by: Jonathan Chocron Signed-off-by: Lorenzo Pieralisi Reviewed-by: Gustavo Pimentel Reviewed-by: Andrew Murray Acked-by: Bjorn Helgaas --- drivers/pci/quirks.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 208aacf39329..e96c03b4e494 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -4366,6 +4366,24 @@ static int pci_quirk_qcom_rp_acs(struct pci_dev *dev, u16 acs_flags) return ret; } +static int pci_quirk_al_acs(struct pci_dev *dev, u16 acs_flags) +{ + if (pci_pcie_type(dev) != PCI_EXP_TYPE_ROOT_PORT) + return -ENOTTY; + + /* + * Amazon's Annapurna Labs root ports don't include an ACS capability, + * but do include ACS-like functionality. The hardware doesn't support + * peer-to-peer transactions via the root port and each has a unique + * segment number. + * + * Additionally, the root ports cannot send traffic to each other. + */ + acs_flags &= ~(PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF); + + return acs_flags ? 0 : 1; +} + /* * Sunrise Point PCH root ports implement ACS, but unfortunately as shown in * the datasheet (Intel 100 Series Chipset Family PCH Datasheet, Vol. 2, @@ -4559,6 +4577,8 @@ static const struct pci_dev_acs_enabled { { PCI_VENDOR_ID_AMPERE, 0xE00A, pci_quirk_xgene_acs }, { PCI_VENDOR_ID_AMPERE, 0xE00B, pci_quirk_xgene_acs }, { PCI_VENDOR_ID_AMPERE, 0xE00C, pci_quirk_xgene_acs }, + /* Amazon Annapurna Labs */ + { PCI_VENDOR_ID_AMAZON_ANNAPURNA_LABS, 0x0031, pci_quirk_al_acs }, { 0 } }; From a638b5de205af40bdadd867b1cb77320bbb2628e Mon Sep 17 00:00:00 2001 From: Jonathan Chocron Date: Thu, 12 Sep 2019 16:00:41 +0300 Subject: [PATCH 388/721] PCI/VPD: Prevent VPD access for Amazon's Annapurna Labs Root Port The Amazon Annapurna Labs PCIe Root Port exposes the VPD capability, but there is no actual support for it. Trying to access the VPD (for example, as part of lspci -vv or when reading the vpd sysfs file), results in the following warning print: pcieport 0001:00:00.0: VPD access failed. This is likely a firmware bug on this device. Contact the card vendor for a firmware update Signed-off-by: Jonathan Chocron Signed-off-by: Lorenzo Pieralisi Reviewed-by: Gustavo Pimentel Reviewed-by: Andrew Murray Acked-by: Bjorn Helgaas --- drivers/pci/vpd.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/pci/vpd.c b/drivers/pci/vpd.c index 4963c2e2bd4c..7915d10f9aa1 100644 --- a/drivers/pci/vpd.c +++ b/drivers/pci/vpd.c @@ -571,6 +571,12 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_LSI_LOGIC, 0x005f, quirk_blacklist_vpd); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATTANSIC, PCI_ANY_ID, quirk_blacklist_vpd); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_QLOGIC, 0x2261, quirk_blacklist_vpd); +/* + * The Amazon Annapurna Labs 0x0031 device id is reused for other non Root Port + * device types, so the quirk is registered for the PCI_CLASS_BRIDGE_PCI class. + */ +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_AMAZON_ANNAPURNA_LABS, 0x0031, + PCI_CLASS_BRIDGE_PCI, 8, quirk_blacklist_vpd); /* * For Broadcom 5706, 5708, 5709 rev. A nics, any read beyond the From 738cb37b013e5ba6abd725a22abbd8baebc95082 Mon Sep 17 00:00:00 2001 From: Jonathan Chocron Date: Thu, 12 Sep 2019 16:00:42 +0300 Subject: [PATCH 389/721] PCI: Add quirk to disable MSI-X support for Amazon's Annapurna Labs Root Port The Root Port (identified by [1c36:0031]) doesn't support MSI-X. On some platforms it is configured to not advertise the capability at all, while on others it (mistakenly) does. This causes a panic during initialization by the pcieport driver, since it tries to configure the MSI-X capability. Specifically, when trying to access the MSI-X table a "non-existing addr" exception occurs. Example stacktrace snippet: SError Interrupt on CPU2, code 0xbf000000 -- SError CPU: 2 PID: 1 Comm: swapper/0 Not tainted 5.2.0-rc1-Jonny-14847-ge76f1d4a1828-dirty #33 Hardware name: Annapurna Labs Alpine V3 EVP (DT) pstate: 80000005 (Nzcv daif -PAN -UAO) pc : __pci_enable_msix_range+0x4e4/0x608 lr : __pci_enable_msix_range+0x498/0x608 sp : ffffff80117db700 x29: ffffff80117db700 x28: 0000000000000001 x27: 0000000000000001 x26: 0000000000000000 x25: ffffffd3e9d8c0b0 x24: 0000000000000000 x23: 0000000000000000 x22: 0000000000000000 x21: 0000000000000001 x20: 0000000000000000 x19: ffffffd3e9d8c000 x18: ffffffffffffffff x17: 0000000000000000 x16: 0000000000000000 x15: ffffff80116496c8 x14: ffffffd3e9844503 x13: ffffffd3e9844502 x12: 0000000000000038 x11: ffffffffffffff00 x10: 0000000000000040 x9 : ffffff801165e270 x8 : ffffff801165e268 x7 : 0000000000000002 x6 : 00000000000000b2 x5 : ffffffd3e9d8c2c0 x4 : 0000000000000000 x3 : 0000000000000000 x2 : 0000000000000000 x1 : 0000000000000000 x0 : ffffffd3e9844680 Kernel panic - not syncing: Asynchronous SError Interrupt CPU: 2 PID: 1 Comm: swapper/0 Not tainted 5.2.0-rc1-Jonny-14847-ge76f1d4a1828-dirty #33 Hardware name: Annapurna Labs Alpine V3 EVP (DT) Call trace: dump_backtrace+0x0/0x140 show_stack+0x14/0x20 dump_stack+0xa8/0xcc panic+0x140/0x334 nmi_panic+0x6c/0x70 arm64_serror_panic+0x74/0x88 __pte_error+0x0/0x28 el1_error+0x84/0xf8 __pci_enable_msix_range+0x4e4/0x608 pci_alloc_irq_vectors_affinity+0xdc/0x150 pcie_port_device_register+0x2b8/0x4e0 pcie_portdrv_probe+0x34/0xf0 Notice that this quirk also disables MSI (which may work, but hasn't been tested nor has a current use case), since currently there is no standard way to disable only MSI-X. Signed-off-by: Jonathan Chocron Signed-off-by: Lorenzo Pieralisi Reviewed-by: Gustavo Pimentel Reviewed-by: Andrew Murray --- drivers/pci/quirks.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index e96c03b4e494..44524d351a26 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -2925,6 +2925,24 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATTANSIC, 0x10a1, quirk_msi_intx_disable_qca_bug); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATTANSIC, 0xe091, quirk_msi_intx_disable_qca_bug); + +/* + * Amazon's Annapurna Labs 1c36:0031 Root Ports don't support MSI-X, so it + * should be disabled on platforms where the device (mistakenly) advertises it. + * + * Notice that this quirk also disables MSI (which may work, but hasn't been + * tested), since currently there is no standard way to disable only MSI-X. + * + * The 0031 device id is reused for other non Root Port device types, + * therefore the quirk is registered for the PCI_CLASS_BRIDGE_PCI class. + */ +static void quirk_al_msi_disable(struct pci_dev *dev) +{ + dev->no_msi = 1; + pci_warn(dev, "Disabling MSI/MSI-X\n"); +} +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_AMAZON_ANNAPURNA_LABS, 0x0031, + PCI_CLASS_BRIDGE_PCI, 8, quirk_al_msi_disable); #endif /* CONFIG_PCI_MSI */ /* From ed4381da34d4765626aefc9b9ef084cd8e6bb749 Mon Sep 17 00:00:00 2001 From: Jonathan Chocron Date: Thu, 12 Sep 2019 16:02:36 +0300 Subject: [PATCH 390/721] dt-bindings: PCI: Add Amazon's Annapurna Labs PCIe host bridge binding Document Amazon's Annapurna Labs PCIe host bridge. Signed-off-by: Jonathan Chocron Signed-off-by: Lorenzo Pieralisi Reviewed-by: Andrew Murray Reviewed-by: Rob Herring --- .../devicetree/bindings/pci/pcie-al.txt | 46 +++++++++++++++++++ MAINTAINERS | 3 +- 2 files changed, 48 insertions(+), 1 deletion(-) create mode 100644 Documentation/devicetree/bindings/pci/pcie-al.txt diff --git a/Documentation/devicetree/bindings/pci/pcie-al.txt b/Documentation/devicetree/bindings/pci/pcie-al.txt new file mode 100644 index 000000000000..557a5089229d --- /dev/null +++ b/Documentation/devicetree/bindings/pci/pcie-al.txt @@ -0,0 +1,46 @@ +* Amazon Annapurna Labs PCIe host bridge + +Amazon's Annapurna Labs PCIe Host Controller is based on the Synopsys DesignWare +PCI core. It inherits common properties defined in +Documentation/devicetree/bindings/pci/designware-pcie.txt. + +Properties of the host controller node that differ from it are: + +- compatible: + Usage: required + Value type: + Definition: Value should contain + - "amazon,al-alpine-v2-pcie" for alpine_v2 + - "amazon,al-alpine-v3-pcie" for alpine_v3 + +- reg: + Usage: required + Value type: + Definition: Register ranges as listed in the reg-names property + +- reg-names: + Usage: required + Value type: + Definition: Must include the following entries + - "config" PCIe ECAM space + - "controller" AL proprietary registers + - "dbi" Designware PCIe registers + +Example: + + pcie-external0: pcie@fb600000 { + compatible = "amazon,al-alpine-v3-pcie"; + reg = <0x0 0xfb600000 0x0 0x00100000 + 0x0 0xfd800000 0x0 0x00010000 + 0x0 0xfd810000 0x0 0x00001000>; + reg-names = "config", "controller", "dbi"; + bus-range = <0 255>; + device_type = "pci"; + #address-cells = <3>; + #size-cells = <2>; + #interrupt-cells = <1>; + interrupts = ; + interrupt-map-mask = <0x00 0 0 7>; + interrupt-map = <0x0000 0 0 1 &gic GIC_SPI 41 IRQ_TYPE_LEVEL_HIGH>; /* INTa */ + ranges = <0x02000000 0x0 0xc0010000 0x0 0xc0010000 0x0 0x07ff0000>; + }; diff --git a/MAINTAINERS b/MAINTAINERS index 783569e3c4b4..d200b16fa95c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12448,10 +12448,11 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/lpieralisi/pci.git/ S: Supported F: drivers/pci/controller/ -PCIE DRIVER FOR ANNAPURNA LABS +PCIE DRIVER FOR AMAZON ANNAPURNA LABS M: Jonathan Chocron L: linux-pci@vger.kernel.org S: Maintained +F: Documentation/devicetree/bindings/pci/pcie-al.txt F: drivers/pci/controller/dwc/pcie-al.c PCIE DRIVER FOR AMLOGIC MESON From a8daea94754989f6c48dafda840482cbc9f882f9 Mon Sep 17 00:00:00 2001 From: Jonathan Chocron Date: Thu, 12 Sep 2019 16:02:37 +0300 Subject: [PATCH 391/721] PCI: dwc: al: Add Amazon Annapurna Labs PCIe controller driver This driver is DT based and utilizes the DesignWare APIs. It allows using a smaller ECAM range for a larger bus range - usually an entire bus uses 1MB of address space, but the driver can use it for a larger number of buses. This is achieved by using a HW mechanism which allows changing the BUS part of the "final" outgoing config transaction. There are 2 HW regs, one which is basically a bitmask determining which bits to take from the AXI transaction itself and another which holds the complementary part programmed by the driver. All link initializations are handled by the boot FW. Signed-off-by: Jonathan Chocron Signed-off-by: Lorenzo Pieralisi Reviewed-by: Gustavo Pimentel Reviewed-by: Andrew Murray --- drivers/pci/controller/dwc/Kconfig | 12 + drivers/pci/controller/dwc/pcie-al.c | 365 +++++++++++++++++++++++++++ 2 files changed, 377 insertions(+) diff --git a/drivers/pci/controller/dwc/Kconfig b/drivers/pci/controller/dwc/Kconfig index 6ea778ae4877..3c6094cbcc3b 100644 --- a/drivers/pci/controller/dwc/Kconfig +++ b/drivers/pci/controller/dwc/Kconfig @@ -230,4 +230,16 @@ config PCIE_UNIPHIER Say Y here if you want PCIe controller support on UniPhier SoCs. This driver supports LD20 and PXs3 SoCs. +config PCIE_AL + bool "Amazon Annapurna Labs PCIe controller" + depends on OF && (ARM64 || COMPILE_TEST) + depends on PCI_MSI_IRQ_DOMAIN + select PCIE_DW_HOST + help + Say Y here to enable support of the Amazon's Annapurna Labs PCIe + controller IP on Amazon SoCs. The PCIe controller uses the DesignWare + core plus Annapurna Labs proprietary hardware wrappers. This is + required only for DT-based platforms. ACPI platforms with the + Annapurna Labs PCIe controller don't need to enable this. + endmenu diff --git a/drivers/pci/controller/dwc/pcie-al.c b/drivers/pci/controller/dwc/pcie-al.c index 3ab58f0584a8..1eeda2f6371f 100644 --- a/drivers/pci/controller/dwc/pcie-al.c +++ b/drivers/pci/controller/dwc/pcie-al.c @@ -91,3 +91,368 @@ struct pci_ecam_ops al_pcie_ops = { }; #endif /* defined(CONFIG_ACPI) && defined(CONFIG_PCI_QUIRKS) */ + +#ifdef CONFIG_PCIE_AL + +#include +#include "pcie-designware.h" + +#define AL_PCIE_REV_ID_2 2 +#define AL_PCIE_REV_ID_3 3 +#define AL_PCIE_REV_ID_4 4 + +#define AXI_BASE_OFFSET 0x0 + +#define DEVICE_ID_OFFSET 0x16c + +#define DEVICE_REV_ID 0x0 +#define DEVICE_REV_ID_DEV_ID_MASK GENMASK(31, 16) + +#define DEVICE_REV_ID_DEV_ID_X4 0 +#define DEVICE_REV_ID_DEV_ID_X8 2 +#define DEVICE_REV_ID_DEV_ID_X16 4 + +#define OB_CTRL_REV1_2_OFFSET 0x0040 +#define OB_CTRL_REV3_5_OFFSET 0x0030 + +#define CFG_TARGET_BUS 0x0 +#define CFG_TARGET_BUS_MASK_MASK GENMASK(7, 0) +#define CFG_TARGET_BUS_BUSNUM_MASK GENMASK(15, 8) + +#define CFG_CONTROL 0x4 +#define CFG_CONTROL_SUBBUS_MASK GENMASK(15, 8) +#define CFG_CONTROL_SEC_BUS_MASK GENMASK(23, 16) + +struct al_pcie_reg_offsets { + unsigned int ob_ctrl; +}; + +struct al_pcie_target_bus_cfg { + u8 reg_val; + u8 reg_mask; + u8 ecam_mask; +}; + +struct al_pcie { + struct dw_pcie *pci; + void __iomem *controller_base; /* base of PCIe unit (not DW core) */ + struct device *dev; + resource_size_t ecam_size; + unsigned int controller_rev_id; + struct al_pcie_reg_offsets reg_offsets; + struct al_pcie_target_bus_cfg target_bus_cfg; +}; + +#define PCIE_ECAM_DEVFN(x) (((x) & 0xff) << 12) + +#define to_al_pcie(x) dev_get_drvdata((x)->dev) + +static inline u32 al_pcie_controller_readl(struct al_pcie *pcie, u32 offset) +{ + return readl_relaxed(pcie->controller_base + offset); +} + +static inline void al_pcie_controller_writel(struct al_pcie *pcie, u32 offset, + u32 val) +{ + writel_relaxed(val, pcie->controller_base + offset); +} + +static int al_pcie_rev_id_get(struct al_pcie *pcie, unsigned int *rev_id) +{ + u32 dev_rev_id_val; + u32 dev_id_val; + + dev_rev_id_val = al_pcie_controller_readl(pcie, AXI_BASE_OFFSET + + DEVICE_ID_OFFSET + + DEVICE_REV_ID); + dev_id_val = FIELD_GET(DEVICE_REV_ID_DEV_ID_MASK, dev_rev_id_val); + + switch (dev_id_val) { + case DEVICE_REV_ID_DEV_ID_X4: + *rev_id = AL_PCIE_REV_ID_2; + break; + case DEVICE_REV_ID_DEV_ID_X8: + *rev_id = AL_PCIE_REV_ID_3; + break; + case DEVICE_REV_ID_DEV_ID_X16: + *rev_id = AL_PCIE_REV_ID_4; + break; + default: + dev_err(pcie->dev, "Unsupported dev_id_val (0x%x)\n", + dev_id_val); + return -EINVAL; + } + + dev_dbg(pcie->dev, "dev_id_val: 0x%x\n", dev_id_val); + + return 0; +} + +static int al_pcie_reg_offsets_set(struct al_pcie *pcie) +{ + switch (pcie->controller_rev_id) { + case AL_PCIE_REV_ID_2: + pcie->reg_offsets.ob_ctrl = OB_CTRL_REV1_2_OFFSET; + break; + case AL_PCIE_REV_ID_3: + case AL_PCIE_REV_ID_4: + pcie->reg_offsets.ob_ctrl = OB_CTRL_REV3_5_OFFSET; + break; + default: + dev_err(pcie->dev, "Unsupported controller rev_id: 0x%x\n", + pcie->controller_rev_id); + return -EINVAL; + } + + return 0; +} + +static inline void al_pcie_target_bus_set(struct al_pcie *pcie, + u8 target_bus, + u8 mask_target_bus) +{ + u32 reg; + + reg = FIELD_PREP(CFG_TARGET_BUS_MASK_MASK, mask_target_bus) | + FIELD_PREP(CFG_TARGET_BUS_BUSNUM_MASK, target_bus); + + al_pcie_controller_writel(pcie, AXI_BASE_OFFSET + + pcie->reg_offsets.ob_ctrl + CFG_TARGET_BUS, + reg); +} + +static void __iomem *al_pcie_conf_addr_map(struct al_pcie *pcie, + unsigned int busnr, + unsigned int devfn) +{ + struct al_pcie_target_bus_cfg *target_bus_cfg = &pcie->target_bus_cfg; + unsigned int busnr_ecam = busnr & target_bus_cfg->ecam_mask; + unsigned int busnr_reg = busnr & target_bus_cfg->reg_mask; + struct pcie_port *pp = &pcie->pci->pp; + void __iomem *pci_base_addr; + + pci_base_addr = (void __iomem *)((uintptr_t)pp->va_cfg0_base + + (busnr_ecam << 20) + + PCIE_ECAM_DEVFN(devfn)); + + if (busnr_reg != target_bus_cfg->reg_val) { + dev_dbg(pcie->pci->dev, "Changing target bus busnum val from 0x%x to 0x%x\n", + target_bus_cfg->reg_val, busnr_reg); + target_bus_cfg->reg_val = busnr_reg; + al_pcie_target_bus_set(pcie, + target_bus_cfg->reg_val, + target_bus_cfg->reg_mask); + } + + return pci_base_addr; +} + +static int al_pcie_rd_other_conf(struct pcie_port *pp, struct pci_bus *bus, + unsigned int devfn, int where, int size, + u32 *val) +{ + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + struct al_pcie *pcie = to_al_pcie(pci); + unsigned int busnr = bus->number; + void __iomem *pci_addr; + int rc; + + pci_addr = al_pcie_conf_addr_map(pcie, busnr, devfn); + + rc = dw_pcie_read(pci_addr + where, size, val); + + dev_dbg(pci->dev, "%d-byte config read from %04x:%02x:%02x.%d offset 0x%x (pci_addr: 0x%px) - val:0x%x\n", + size, pci_domain_nr(bus), bus->number, + PCI_SLOT(devfn), PCI_FUNC(devfn), where, + (pci_addr + where), *val); + + return rc; +} + +static int al_pcie_wr_other_conf(struct pcie_port *pp, struct pci_bus *bus, + unsigned int devfn, int where, int size, + u32 val) +{ + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + struct al_pcie *pcie = to_al_pcie(pci); + unsigned int busnr = bus->number; + void __iomem *pci_addr; + int rc; + + pci_addr = al_pcie_conf_addr_map(pcie, busnr, devfn); + + rc = dw_pcie_write(pci_addr + where, size, val); + + dev_dbg(pci->dev, "%d-byte config write to %04x:%02x:%02x.%d offset 0x%x (pci_addr: 0x%px) - val:0x%x\n", + size, pci_domain_nr(bus), bus->number, + PCI_SLOT(devfn), PCI_FUNC(devfn), where, + (pci_addr + where), val); + + return rc; +} + +static void al_pcie_config_prepare(struct al_pcie *pcie) +{ + struct al_pcie_target_bus_cfg *target_bus_cfg; + struct pcie_port *pp = &pcie->pci->pp; + unsigned int ecam_bus_mask; + u32 cfg_control_offset; + u8 subordinate_bus; + u8 secondary_bus; + u32 cfg_control; + u32 reg; + + target_bus_cfg = &pcie->target_bus_cfg; + + ecam_bus_mask = (pcie->ecam_size >> 20) - 1; + if (ecam_bus_mask > 255) { + dev_warn(pcie->dev, "ECAM window size is larger than 256MB. Cutting off at 256\n"); + ecam_bus_mask = 255; + } + + /* This portion is taken from the transaction address */ + target_bus_cfg->ecam_mask = ecam_bus_mask; + /* This portion is taken from the cfg_target_bus reg */ + target_bus_cfg->reg_mask = ~target_bus_cfg->ecam_mask; + target_bus_cfg->reg_val = pp->busn->start & target_bus_cfg->reg_mask; + + al_pcie_target_bus_set(pcie, target_bus_cfg->reg_val, + target_bus_cfg->reg_mask); + + secondary_bus = pp->busn->start + 1; + subordinate_bus = pp->busn->end; + + /* Set the valid values of secondary and subordinate buses */ + cfg_control_offset = AXI_BASE_OFFSET + pcie->reg_offsets.ob_ctrl + + CFG_CONTROL; + + cfg_control = al_pcie_controller_readl(pcie, cfg_control_offset); + + reg = cfg_control & + ~(CFG_CONTROL_SEC_BUS_MASK | CFG_CONTROL_SUBBUS_MASK); + + reg |= FIELD_PREP(CFG_CONTROL_SUBBUS_MASK, subordinate_bus) | + FIELD_PREP(CFG_CONTROL_SEC_BUS_MASK, secondary_bus); + + al_pcie_controller_writel(pcie, cfg_control_offset, reg); +} + +static int al_pcie_host_init(struct pcie_port *pp) +{ + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + struct al_pcie *pcie = to_al_pcie(pci); + int rc; + + rc = al_pcie_rev_id_get(pcie, &pcie->controller_rev_id); + if (rc) + return rc; + + rc = al_pcie_reg_offsets_set(pcie); + if (rc) + return rc; + + al_pcie_config_prepare(pcie); + + return 0; +} + +static const struct dw_pcie_host_ops al_pcie_host_ops = { + .rd_other_conf = al_pcie_rd_other_conf, + .wr_other_conf = al_pcie_wr_other_conf, + .host_init = al_pcie_host_init, +}; + +static int al_add_pcie_port(struct pcie_port *pp, + struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + int ret; + + pp->ops = &al_pcie_host_ops; + + ret = dw_pcie_host_init(pp); + if (ret) { + dev_err(dev, "failed to initialize host\n"); + return ret; + } + + return 0; +} + +static const struct dw_pcie_ops dw_pcie_ops = { +}; + +static int al_pcie_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct resource *controller_res; + struct resource *ecam_res; + struct resource *dbi_res; + struct al_pcie *al_pcie; + struct dw_pcie *pci; + + al_pcie = devm_kzalloc(dev, sizeof(*al_pcie), GFP_KERNEL); + if (!al_pcie) + return -ENOMEM; + + pci = devm_kzalloc(dev, sizeof(*pci), GFP_KERNEL); + if (!pci) + return -ENOMEM; + + pci->dev = dev; + pci->ops = &dw_pcie_ops; + + al_pcie->pci = pci; + al_pcie->dev = dev; + + dbi_res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "dbi"); + pci->dbi_base = devm_pci_remap_cfg_resource(dev, dbi_res); + if (IS_ERR(pci->dbi_base)) { + dev_err(dev, "couldn't remap dbi base %pR\n", dbi_res); + return PTR_ERR(pci->dbi_base); + } + + ecam_res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "config"); + if (!ecam_res) { + dev_err(dev, "couldn't find 'config' reg in DT\n"); + return -ENOENT; + } + al_pcie->ecam_size = resource_size(ecam_res); + + controller_res = platform_get_resource_byname(pdev, IORESOURCE_MEM, + "controller"); + al_pcie->controller_base = devm_ioremap_resource(dev, controller_res); + if (IS_ERR(al_pcie->controller_base)) { + dev_err(dev, "couldn't remap controller base %pR\n", + controller_res); + return PTR_ERR(al_pcie->controller_base); + } + + dev_dbg(dev, "From DT: dbi_base: %pR, controller_base: %pR\n", + dbi_res, controller_res); + + platform_set_drvdata(pdev, al_pcie); + + return al_add_pcie_port(&pci->pp, pdev); +} + +static const struct of_device_id al_pcie_of_match[] = { + { .compatible = "amazon,al-alpine-v2-pcie", + }, + { .compatible = "amazon,al-alpine-v3-pcie", + }, + {}, +}; + +static struct platform_driver al_pcie_driver = { + .driver = { + .name = "al-pcie", + .of_match_table = al_pcie_of_match, + .suppress_bind_attrs = true, + }, + .probe = al_pcie_probe, +}; +builtin_platform_driver(al_pcie_driver); + +#endif /* CONFIG_PCIE_AL*/ From 0b24134f7888175c9638e6fd1900e23e44fc172f Mon Sep 17 00:00:00 2001 From: Jonathan Chocron Date: Thu, 12 Sep 2019 16:02:38 +0300 Subject: [PATCH 392/721] PCI: dwc: Add validation that PCIe core is set to correct mode Some PCIe controllers can be set to either Host or EP according to some early boot FW. To make sure there is no discrepancy (e.g. FW configured the port to EP mode while the DT specifies it as a host bridge or vice versa), a check has been added for each mode. Signed-off-by: Jonathan Chocron Signed-off-by: Lorenzo Pieralisi Reviewed-by: Andrew Murray Acked-by: Gustavo Pimentel --- drivers/pci/controller/dwc/pcie-designware-ep.c | 8 ++++++++ .../pci/controller/dwc/pcie-designware-host.c | 16 ++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c b/drivers/pci/controller/dwc/pcie-designware-ep.c index 2bf5a35c0570..0b9a9b27175c 100644 --- a/drivers/pci/controller/dwc/pcie-designware-ep.c +++ b/drivers/pci/controller/dwc/pcie-designware-ep.c @@ -531,6 +531,7 @@ int dw_pcie_ep_init(struct dw_pcie_ep *ep) int ret; u32 reg; void *addr; + u8 hdr_type; unsigned int nbars; unsigned int offset; struct pci_epc *epc; @@ -595,6 +596,13 @@ int dw_pcie_ep_init(struct dw_pcie_ep *ep) if (ep->ops->ep_init) ep->ops->ep_init(ep); + hdr_type = dw_pcie_readb_dbi(pci, PCI_HEADER_TYPE); + if (hdr_type != PCI_HEADER_TYPE_NORMAL) { + dev_err(pci->dev, "PCIe controller is not set to EP mode (hdr_type:0x%x)!\n", + hdr_type); + return -EIO; + } + ret = of_property_read_u8(np, "max-functions", &epc->max_functions); if (ret < 0) epc->max_functions = 1; diff --git a/drivers/pci/controller/dwc/pcie-designware-host.c b/drivers/pci/controller/dwc/pcie-designware-host.c index f93252d0da5b..bb275b5e787a 100644 --- a/drivers/pci/controller/dwc/pcie-designware-host.c +++ b/drivers/pci/controller/dwc/pcie-designware-host.c @@ -323,6 +323,7 @@ int dw_pcie_host_init(struct pcie_port *pp) struct pci_bus *child; struct pci_host_bridge *bridge; struct resource *cfg_res; + u32 hdr_type; int ret; raw_spin_lock_init(&pci->pp.lock); @@ -464,6 +465,21 @@ int dw_pcie_host_init(struct pcie_port *pp) goto err_free_msi; } + ret = dw_pcie_rd_own_conf(pp, PCI_HEADER_TYPE, 1, &hdr_type); + if (ret != PCIBIOS_SUCCESSFUL) { + dev_err(pci->dev, "Failed reading PCI_HEADER_TYPE cfg space reg (ret: 0x%x)\n", + ret); + ret = pcibios_err_to_errno(ret); + goto err_free_msi; + } + if (hdr_type != PCI_HEADER_TYPE_BRIDGE) { + dev_err(pci->dev, + "PCIe controller is not set to bridge type (hdr_type: 0x%x)!\n", + hdr_type); + ret = -EIO; + goto err_free_msi; + } + pp->root_bus_nr = pp->busn->start; bridge->dev.parent = dev; From 3a9236e97207f2469254b4098995159b80174d95 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Mon, 16 Sep 2019 19:18:51 +0900 Subject: [PATCH 393/721] ALSA: dice: fix wrong packet parameter for Alesis iO26 At higher sampling rate (e.g. 192.0 kHz), Alesis iO26 transfers 4 data channels per data block in CIP. Both iO14 and iO26 have the same contents in their configuration ROM. For this reason, ALSA Dice driver attempts to distinguish them according to the value of TX0_AUDIO register at probe callback. Although the way is valid at lower and middle sampling rate, it's lastly invalid at higher sampling rate because because the two models returns the same value for read transaction to the register. In the most cases, users just plug-in the device and ALSA dice driver detects it. In the case, the device runs at lower sampling rate and the driver detects expectedly. For this reason, this commit leaves the way to detect as is. Fixes: 28b208f600a3 ("ALSA: dice: add parameters of stream formats for models produced by Alesis") Cc: # v4.18+ Signed-off-by: Takashi Sakamoto Link: https://lore.kernel.org/r/20190916101851.30409-1-o-takashi@sakamocchi.jp Signed-off-by: Takashi Iwai --- sound/firewire/dice/dice-alesis.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/firewire/dice/dice-alesis.c b/sound/firewire/dice/dice-alesis.c index 218292bdace6..f5b325263b67 100644 --- a/sound/firewire/dice/dice-alesis.c +++ b/sound/firewire/dice/dice-alesis.c @@ -15,7 +15,7 @@ alesis_io14_tx_pcm_chs[MAX_STREAMS][SND_DICE_RATE_MODE_COUNT] = { static const unsigned int alesis_io26_tx_pcm_chs[MAX_STREAMS][SND_DICE_RATE_MODE_COUNT] = { - {10, 10, 8}, /* Tx0 = Analog + S/PDIF. */ + {10, 10, 4}, /* Tx0 = Analog + S/PDIF. */ {16, 8, 0}, /* Tx1 = ADAT1 + ADAT2. */ }; From 543242211879ebe021b00f7e33eb4e733bcd6c47 Mon Sep 17 00:00:00 2001 From: James McDonnell Date: Mon, 16 Sep 2019 14:53:38 +0000 Subject: [PATCH 394/721] ALSA: hda/realtek - Fix alienware headset mic Headset microphone quirk for alienware 15r3. Without this using a headset with mic attached will not work. Signed-off-by: James McDonnell Link: https://lore.kernel.org/r/QB1PR01MB2337D0367C2E3ADB0010134F808C0@QB1PR01MB2337.CANPRD01.PROD.OUTLOOK.COM Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index da1695418731..e27ef434b60d 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -5817,6 +5817,7 @@ enum { ALC292_FIXUP_DELL_E7X, ALC292_FIXUP_DISABLE_AAMIX, ALC293_FIXUP_DISABLE_AAMIX_MULTIJACK, + ALC298_FIXUP_ALIENWARE_MIC_NO_PRESENCE, ALC298_FIXUP_DELL1_MIC_NO_PRESENCE, ALC298_FIXUP_DELL_AIO_MIC_NO_PRESENCE, ALC275_FIXUP_DELL_XPS, @@ -6506,6 +6507,15 @@ static const struct hda_fixup alc269_fixups[] = { .chained = true, .chain_id = ALC292_FIXUP_DISABLE_AAMIX }, + [ALC298_FIXUP_ALIENWARE_MIC_NO_PRESENCE] = { + .type = HDA_FIXUP_PINS, + .v.pins = (const struct hda_pintbl[]) { + { 0x18, 0x01a1913c }, /* headset mic w/o jack detect */ + { } + }, + .chained_before = true, + .chain_id = ALC269_FIXUP_HEADSET_MODE, + }, [ALC298_FIXUP_DELL1_MIC_NO_PRESENCE] = { .type = HDA_FIXUP_PINS, .v.pins = (const struct hda_pintbl[]) { @@ -7770,6 +7780,11 @@ static const struct snd_hda_pin_quirk alc269_pin_fixup_tbl[] = { {0x17, 0x90170110}, {0x1a, 0x03011020}, {0x21, 0x03211030}), + SND_HDA_PIN_QUIRK(0x10ec0298, 0x1028, "Dell", ALC298_FIXUP_ALIENWARE_MIC_NO_PRESENCE, + {0x12, 0xb7a60140}, + {0x17, 0x90170110}, + {0x1a, 0x03a11030}, + {0x21, 0x03211020}), SND_HDA_PIN_QUIRK(0x10ec0299, 0x1028, "Dell", ALC269_FIXUP_DELL4_MIC_NO_PRESENCE, ALC225_STANDARD_PINS, {0x12, 0xb7a60130}, From 029d2c0fd61eac74700fb4ffff36fc63bfff7e5e Mon Sep 17 00:00:00 2001 From: Ilya Pshonkin Date: Tue, 17 Sep 2019 10:49:34 +0300 Subject: [PATCH 395/721] ALSA: usb-audio: Add Hiby device family to quirks for native DSD support This patch adds quirk VID ID for Hiby portable players family with native DSD playback support. Signed-off-by: Ilya Pshonkin Cc: Link: https://lore.kernel.org/r/20190917074937.157802-1-ilya.pshonkin@netforce.ua Signed-off-by: Takashi Iwai --- sound/usb/quirks.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index 25faf2d3c639..5fd4ccce452d 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -1658,6 +1658,7 @@ u64 snd_usb_interface_dsd_format_quirks(struct snd_usb_audio *chip, case 0x25ce: /* Mytek devices */ case 0x278b: /* Rotel? */ case 0x2ab6: /* T+A devices */ + case 0xc502: /* HiBy devices */ if (fp->dsd_raw) return SNDRV_PCM_FMTBIT_DSD_U32_BE; break; From e75f4940e8ad0dd76527302a10c06b58bf7eb590 Mon Sep 17 00:00:00 2001 From: Mihai Serban Date: Fri, 13 Sep 2019 22:28:05 +0300 Subject: [PATCH 396/721] ASoC: fsl_sai: Fix noise when using EDMA EDMA requires the period size to be multiple of maxburst. Otherwise the remaining bytes are not transferred and thus noise is produced. We can handle this issue by adding a constraint on SNDRV_PCM_HW_PARAM_PERIOD_SIZE to be multiple of tx/rx maxburst value. Signed-off-by: Mihai Serban Signed-off-by: Daniel Baluta Acked-by: Nicolin Chen Link: https://lore.kernel.org/r/20190913192807.8423-2-daniel.baluta@nxp.com Signed-off-by: Mark Brown --- sound/soc/fsl/fsl_sai.c | 15 +++++++++++++++ sound/soc/fsl/fsl_sai.h | 1 + 2 files changed, 16 insertions(+) diff --git a/sound/soc/fsl/fsl_sai.c b/sound/soc/fsl/fsl_sai.c index ef0b74693093..b517e4bc1b87 100644 --- a/sound/soc/fsl/fsl_sai.c +++ b/sound/soc/fsl/fsl_sai.c @@ -628,6 +628,16 @@ static int fsl_sai_startup(struct snd_pcm_substream *substream, FSL_SAI_CR3_TRCE_MASK, FSL_SAI_CR3_TRCE); + /* + * EDMA controller needs period size to be a multiple of + * tx/rx maxburst + */ + if (sai->soc_data->use_edma) + snd_pcm_hw_constraint_step(substream->runtime, 0, + SNDRV_PCM_HW_PARAM_PERIOD_SIZE, + tx ? sai->dma_params_tx.maxburst : + sai->dma_params_rx.maxburst); + ret = snd_pcm_hw_constraint_list(substream->runtime, 0, SNDRV_PCM_HW_PARAM_RATE, &fsl_sai_rate_constraints); @@ -1026,30 +1036,35 @@ static int fsl_sai_remove(struct platform_device *pdev) static const struct fsl_sai_soc_data fsl_sai_vf610_data = { .use_imx_pcm = false, + .use_edma = false, .fifo_depth = 32, .reg_offset = 0, }; static const struct fsl_sai_soc_data fsl_sai_imx6sx_data = { .use_imx_pcm = true, + .use_edma = false, .fifo_depth = 32, .reg_offset = 0, }; static const struct fsl_sai_soc_data fsl_sai_imx7ulp_data = { .use_imx_pcm = true, + .use_edma = false, .fifo_depth = 16, .reg_offset = 8, }; static const struct fsl_sai_soc_data fsl_sai_imx8mq_data = { .use_imx_pcm = true, + .use_edma = false, .fifo_depth = 128, .reg_offset = 8, }; static const struct fsl_sai_soc_data fsl_sai_imx8qm_data = { .use_imx_pcm = true, + .use_edma = true, .fifo_depth = 64, .reg_offset = 0, }; diff --git a/sound/soc/fsl/fsl_sai.h b/sound/soc/fsl/fsl_sai.h index b12cb578f6d0..76b15deea80c 100644 --- a/sound/soc/fsl/fsl_sai.h +++ b/sound/soc/fsl/fsl_sai.h @@ -157,6 +157,7 @@ struct fsl_sai_soc_data { bool use_imx_pcm; + bool use_edma; unsigned int fifo_depth; unsigned int reg_offset; }; From a0a4bf57a977ed37bcbdfc8027a44485fe086a3d Mon Sep 17 00:00:00 2001 From: Bard Liao Date: Tue, 17 Sep 2019 05:03:53 +0800 Subject: [PATCH 397/721] ASoC: core: delete component->card_list in soc_remove_component only We add component->card_list in the end of soc_probe_component(). In other words, component->card_list will not be added if there is an error in the soc_probe_component() function. So we can't delete component->card_list in the error handling of soc_probe_component(). Fixes: 22d1423187e5 ("ASoC: soc-core: add soc_cleanup_component()") Signed-off-by: Bard Liao Reviewed-by: Pierre-Louis Bossart Link: https://lore.kernel.org/r/20190916210353.6318-1-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/soc-core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index 35f48e9c5ead..aff4b4bf4d07 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -978,7 +978,6 @@ static void soc_cleanup_component(struct snd_soc_component *component) /* For framework level robustness */ snd_soc_component_set_jack(component, NULL, NULL); - list_del(&component->card_list); snd_soc_dapm_free(snd_soc_component_get_dapm(component)); soc_cleanup_component_debugfs(component); component->card = NULL; @@ -991,7 +990,7 @@ static void soc_remove_component(struct snd_soc_component *component) return; snd_soc_component_remove(component); - + list_del(&component->card_list); soc_cleanup_component(component); } From e3dffa4f6c3612dea337c9c59191bd418afc941b Mon Sep 17 00:00:00 2001 From: Jon Derrick Date: Mon, 16 Sep 2019 07:54:34 -0600 Subject: [PATCH 398/721] PCI: vmd: Fix config addressing when using bus offsets VMD maps child device config spaces to the VMD Config BAR linearly regardless of the starting bus offset. Because of this, the config address decode must ignore starting bus offsets when mapping the BDF to the config space address. Fixes: 2a5a9c9a20f9 ("PCI: vmd: Add offset to bus numbers if necessary") Signed-off-by: Jon Derrick Signed-off-by: Lorenzo Pieralisi Cc: stable@vger.kernel.org # v5.2+ --- drivers/pci/controller/vmd.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index 4575e0c6dc4b..2e4da3f51d6b 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -94,6 +94,7 @@ struct vmd_dev { struct resource resources[3]; struct irq_domain *irq_domain; struct pci_bus *bus; + u8 busn_start; struct dma_map_ops dma_ops; struct dma_domain dma_domain; @@ -440,7 +441,8 @@ static char __iomem *vmd_cfg_addr(struct vmd_dev *vmd, struct pci_bus *bus, unsigned int devfn, int reg, int len) { char __iomem *addr = vmd->cfgbar + - (bus->number << 20) + (devfn << 12) + reg; + ((bus->number - vmd->busn_start) << 20) + + (devfn << 12) + reg; if ((addr - vmd->cfgbar) + len >= resource_size(&vmd->dev->resource[VMD_CFGBAR])) @@ -563,7 +565,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) unsigned long flags; LIST_HEAD(resources); resource_size_t offset[2] = {0}; - resource_size_t membar2_offset = 0x2000, busn_start = 0; + resource_size_t membar2_offset = 0x2000; struct pci_bus *child; /* @@ -606,14 +608,14 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) pci_read_config_dword(vmd->dev, PCI_REG_VMCONFIG, &vmconfig); if (BUS_RESTRICT_CAP(vmcap) && (BUS_RESTRICT_CFG(vmconfig) == 0x1)) - busn_start = 128; + vmd->busn_start = 128; } res = &vmd->dev->resource[VMD_CFGBAR]; vmd->resources[0] = (struct resource) { .name = "VMD CFGBAR", - .start = busn_start, - .end = busn_start + (resource_size(res) >> 20) - 1, + .start = vmd->busn_start, + .end = vmd->busn_start + (resource_size(res) >> 20) - 1, .flags = IORESOURCE_BUS | IORESOURCE_PCI_FIXED, }; @@ -681,8 +683,8 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) pci_add_resource_offset(&resources, &vmd->resources[1], offset[0]); pci_add_resource_offset(&resources, &vmd->resources[2], offset[1]); - vmd->bus = pci_create_root_bus(&vmd->dev->dev, busn_start, &vmd_ops, - sd, &resources); + vmd->bus = pci_create_root_bus(&vmd->dev->dev, vmd->busn_start, + &vmd_ops, sd, &resources); if (!vmd->bus) { pci_free_resource_list(&resources); irq_domain_remove(vmd->irq_domain); From a1a30170138c9c5157bd514ccd4d76b47060f29b Mon Sep 17 00:00:00 2001 From: Jon Derrick Date: Mon, 16 Sep 2019 07:54:35 -0600 Subject: [PATCH 399/721] PCI: vmd: Fix shadow offsets to reflect spec changes The shadow offset scratchpad was moved to 0x2000-0x2010. Update the location to get the correct shadow offset. Fixes: 6788958e4f3c ("PCI: vmd: Assign membar addresses from shadow registers") Signed-off-by: Jon Derrick Signed-off-by: Lorenzo Pieralisi Cc: stable@vger.kernel.org # v5.2+ --- drivers/pci/controller/vmd.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index 2e4da3f51d6b..a35d3f3996d7 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -31,6 +31,9 @@ #define PCI_REG_VMLOCK 0x70 #define MB2_SHADOW_EN(vmlock) (vmlock & 0x2) +#define MB2_SHADOW_OFFSET 0x2000 +#define MB2_SHADOW_SIZE 16 + enum vmd_features { /* * Device may contain registers which hint the physical location of the @@ -578,7 +581,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) u32 vmlock; int ret; - membar2_offset = 0x2018; + membar2_offset = MB2_SHADOW_OFFSET + MB2_SHADOW_SIZE; ret = pci_read_config_dword(vmd->dev, PCI_REG_VMLOCK, &vmlock); if (ret || vmlock == ~0) return -ENODEV; @@ -590,9 +593,9 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) if (!membar2) return -ENOMEM; offset[0] = vmd->dev->resource[VMD_MEMBAR1].start - - readq(membar2 + 0x2008); + readq(membar2 + MB2_SHADOW_OFFSET); offset[1] = vmd->dev->resource[VMD_MEMBAR2].start - - readq(membar2 + 0x2010); + readq(membar2 + MB2_SHADOW_OFFSET + 8); pci_iounmap(vmd->dev, membar2); } } From 21ab8580b383f27b7f59b84ac1699cb26d6c3d69 Mon Sep 17 00:00:00 2001 From: Micah Morton Date: Tue, 17 Sep 2019 11:27:05 -0700 Subject: [PATCH 400/721] LSM: SafeSetID: Stop releasing uninitialized ruleset The first time a rule set is configured for SafeSetID, we shouldn't be trying to release the previously configured ruleset, since there isn't one. Currently, the pointer that would point to a previously configured ruleset is uninitialized on first rule set configuration, leading to a crash when we try to call release_ruleset with that pointer. Acked-by: Jann Horn Signed-off-by: Micah Morton --- security/safesetid/securityfs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/security/safesetid/securityfs.c b/security/safesetid/securityfs.c index d568e17dd773..74a13d432ed8 100644 --- a/security/safesetid/securityfs.c +++ b/security/safesetid/securityfs.c @@ -187,7 +187,8 @@ static ssize_t handle_policy_update(struct file *file, out_free_buf: kfree(buf); out_free_pol: - release_ruleset(pol); + if (pol) + release_ruleset(pol); return err; } From cc204d01262a69218b2d0db5cdea371de85871d9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 10 Sep 2019 13:01:35 -0400 Subject: [PATCH 401/721] SUNRPC: Dequeue the request from the receive queue while we're re-encoding Ensure that we dequeue the request from the transport receive queue while we're re-encoding to prevent issues like use-after-free when we release the bvec. Fixes: 7536908982047 ("SUNRPC: Ensure the bvecs are reset when we re-encode...") Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org # v4.20+ Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xprt.h | 1 + net/sunrpc/clnt.c | 6 ++--- net/sunrpc/xprt.c | 54 +++++++++++++++++++++---------------- 3 files changed, 35 insertions(+), 26 deletions(-) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 13e108bcc9eb..d783e15ba898 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -352,6 +352,7 @@ bool xprt_prepare_transmit(struct rpc_task *task); void xprt_request_enqueue_transmit(struct rpc_task *task); void xprt_request_enqueue_receive(struct rpc_task *task); void xprt_request_wait_receive(struct rpc_task *task); +void xprt_request_dequeue_xprt(struct rpc_task *task); bool xprt_request_need_retransmit(struct rpc_task *task); void xprt_transmit(struct rpc_task *task); void xprt_end_transmit(struct rpc_task *task); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index d8679b6027e9..0359466947e2 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1862,6 +1862,7 @@ rpc_xdr_encode(struct rpc_task *task) req->rq_rbuffer, req->rq_rcvsize); + req->rq_reply_bytes_recvd = 0; req->rq_snd_buf.head[0].iov_len = 0; xdr_init_encode(&xdr, &req->rq_snd_buf, req->rq_snd_buf.head[0].iov_base, req); @@ -1881,6 +1882,8 @@ call_encode(struct rpc_task *task) if (!rpc_task_need_encode(task)) goto out; dprint_status(task); + /* Dequeue task from the receive queue while we're encoding */ + xprt_request_dequeue_xprt(task); /* Encode here so that rpcsec_gss can use correct sequence number. */ rpc_xdr_encode(task); /* Did the encode result in an error condition? */ @@ -2501,9 +2504,6 @@ call_decode(struct rpc_task *task) return; case -EAGAIN: task->tk_status = 0; - xdr_free_bvec(&req->rq_rcv_buf); - req->rq_reply_bytes_recvd = 0; - req->rq_rcv_buf.len = 0; if (task->tk_client->cl_discrtry) xprt_conditional_disconnect(req->rq_xprt, req->rq_connect_cookie); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 783748dc5e6f..02d5b2125c07 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1323,6 +1323,36 @@ xprt_request_dequeue_transmit(struct rpc_task *task) spin_unlock(&xprt->queue_lock); } +/** + * xprt_request_dequeue_xprt - remove a task from the transmit+receive queue + * @task: pointer to rpc_task + * + * Remove a task from the transmit and receive queues, and ensure that + * it is not pinned by the receive work item. + */ +void +xprt_request_dequeue_xprt(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + + if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate) || + test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate) || + xprt_is_pinned_rqst(req)) { + spin_lock(&xprt->queue_lock); + xprt_request_dequeue_transmit_locked(task); + xprt_request_dequeue_receive_locked(task); + while (xprt_is_pinned_rqst(req)) { + set_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate); + spin_unlock(&xprt->queue_lock); + xprt_wait_on_pinned_rqst(req); + spin_lock(&xprt->queue_lock); + clear_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate); + } + spin_unlock(&xprt->queue_lock); + } +} + /** * xprt_request_prepare - prepare an encoded request for transport * @req: pointer to rpc_rqst @@ -1754,28 +1784,6 @@ void xprt_retry_reserve(struct rpc_task *task) xprt_do_reserve(xprt, task); } -static void -xprt_request_dequeue_all(struct rpc_task *task, struct rpc_rqst *req) -{ - struct rpc_xprt *xprt = req->rq_xprt; - - if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate) || - test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate) || - xprt_is_pinned_rqst(req)) { - spin_lock(&xprt->queue_lock); - xprt_request_dequeue_transmit_locked(task); - xprt_request_dequeue_receive_locked(task); - while (xprt_is_pinned_rqst(req)) { - set_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate); - spin_unlock(&xprt->queue_lock); - xprt_wait_on_pinned_rqst(req); - spin_lock(&xprt->queue_lock); - clear_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate); - } - spin_unlock(&xprt->queue_lock); - } -} - /** * xprt_release - release an RPC request slot * @task: task which is finished with the slot @@ -1795,7 +1803,7 @@ void xprt_release(struct rpc_task *task) } xprt = req->rq_xprt; - xprt_request_dequeue_all(task, req); + xprt_request_dequeue_xprt(task); spin_lock(&xprt->transport_lock); xprt->ops->release_xprt(xprt, task); if (xprt->ops->release_request) From 45835a63d039fc3bfb1d6c72cedaf785cd920e4a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 12 Sep 2019 08:04:25 -0400 Subject: [PATCH 402/721] SUNRPC: Don't receive TCP data into a request buffer that has been reset If we've removed the request from the receive list, and have added it back after resetting the request receive buffer, then we should only receive message data if it is a new reply (i.e. if transport->recv.copied is zero). Fixes: 277e4ab7d530b ("SUNRPC: Simplify TCP receive code by switching...") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- net/sunrpc/xprtsock.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index e2176c167a57..9ac88722fa83 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -562,10 +562,14 @@ xs_read_stream_call(struct sock_xprt *transport, struct msghdr *msg, int flags) printk(KERN_WARNING "Callback slot table overflowed\n"); return -ESHUTDOWN; } + if (transport->recv.copied && !req->rq_private_buf.len) + return -ESHUTDOWN; ret = xs_read_stream_request(transport, msg, flags, req); if (msg->msg_flags & (MSG_EOR|MSG_TRUNC)) xprt_complete_bc_request(req, transport->recv.copied); + else + req->rq_private_buf.len = transport->recv.copied; return ret; } @@ -587,7 +591,7 @@ xs_read_stream_reply(struct sock_xprt *transport, struct msghdr *msg, int flags) /* Look up and lock the request corresponding to the given XID */ spin_lock(&xprt->queue_lock); req = xprt_lookup_rqst(xprt, transport->recv.xid); - if (!req) { + if (!req || (transport->recv.copied && !req->rq_private_buf.len)) { msg->msg_flags |= MSG_TRUNC; goto out; } @@ -599,6 +603,8 @@ xs_read_stream_reply(struct sock_xprt *transport, struct msghdr *msg, int flags) spin_lock(&xprt->queue_lock); if (msg->msg_flags & (MSG_EOR|MSG_TRUNC)) xprt_complete_rqst(req->rq_task, transport->recv.copied); + else + req->rq_private_buf.len = transport->recv.copied; xprt_unpin_rqst(req); out: spin_unlock(&xprt->queue_lock); From 714fbc73888f59321854e7f6c2f224213923bcad Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 12 Sep 2019 08:06:51 -0400 Subject: [PATCH 403/721] SUNRPC: RPC level errors should always set task->tk_rpc_status Ensure that we set task->tk_rpc_status for all RPC level errors so that the caller can distinguish between those and server reply status errors. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- net/sunrpc/clnt.c | 6 +++--- net/sunrpc/sched.c | 5 ++++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 0359466947e2..44d5cf318813 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1837,7 +1837,7 @@ call_allocate(struct rpc_task *task) return; } - rpc_exit(task, -ERESTARTSYS); + rpc_call_rpcerror(task, -ERESTARTSYS); } static int @@ -2544,7 +2544,7 @@ rpc_encode_header(struct rpc_task *task, struct xdr_stream *xdr) return 0; out_fail: trace_rpc_bad_callhdr(task); - rpc_exit(task, error); + rpc_call_rpcerror(task, error); return error; } @@ -2611,7 +2611,7 @@ rpc_decode_header(struct rpc_task *task, struct xdr_stream *xdr) return -EAGAIN; } out_err: - rpc_exit(task, error); + rpc_call_rpcerror(task, error); return error; out_unparsable: diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index f25c4b9ba185..360afe153193 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -911,8 +911,10 @@ static void __rpc_execute(struct rpc_task *task) /* * Signalled tasks should exit rather than sleep. */ - if (RPC_SIGNALLED(task)) + if (RPC_SIGNALLED(task)) { + task->tk_rpc_status = -ERESTARTSYS; rpc_exit(task, -ERESTARTSYS); + } /* * The queue->lock protects against races with @@ -948,6 +950,7 @@ static void __rpc_execute(struct rpc_task *task) */ dprintk("RPC: %5u got signal\n", task->tk_pid); set_bit(RPC_TASK_SIGNALLED, &task->tk_runstate); + task->tk_rpc_status = -ERESTARTSYS; rpc_exit(task, -ERESTARTSYS); } dprintk("RPC: %5u sync task resuming\n", task->tk_pid); From 5eaed68dd38c14907be2ce5a45c73a5f2acb75f4 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Mon, 16 Sep 2019 18:44:28 +0300 Subject: [PATCH 404/721] block: use symbolic constants for t10_pi type Replace all hard-coded values with T10_PI_TYPES to make the code more readable. Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Signed-off-by: Max Gurtovoy Signed-off-by: Jens Axboe --- block/t10-pi.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/block/t10-pi.c b/block/t10-pi.c index 0c0094609dd6..7fed58705d1a 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -27,7 +27,7 @@ static __be16 t10_pi_ip_fn(void *data, unsigned int len) * tag. */ static blk_status_t t10_pi_generate(struct blk_integrity_iter *iter, - csum_fn *fn, unsigned int type) + csum_fn *fn, enum t10_dif_type type) { unsigned int i; @@ -37,7 +37,7 @@ static blk_status_t t10_pi_generate(struct blk_integrity_iter *iter, pi->guard_tag = fn(iter->data_buf, iter->interval); pi->app_tag = 0; - if (type == 1) + if (type == T10_PI_TYPE1_PROTECTION) pi->ref_tag = cpu_to_be32(lower_32_bits(iter->seed)); else pi->ref_tag = 0; @@ -51,7 +51,7 @@ static blk_status_t t10_pi_generate(struct blk_integrity_iter *iter, } static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, - csum_fn *fn, unsigned int type) + csum_fn *fn, enum t10_dif_type type) { unsigned int i; @@ -60,8 +60,8 @@ static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, __be16 csum; switch (type) { - case 1: - case 2: + case T10_PI_TYPE1_PROTECTION: + case T10_PI_TYPE2_PROTECTION: if (pi->app_tag == T10_PI_APP_ESCAPE) goto next; @@ -74,7 +74,7 @@ static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, return BLK_STS_PROTECTION; } break; - case 3: + case T10_PI_TYPE3_PROTECTION: if (pi->app_tag == T10_PI_APP_ESCAPE && pi->ref_tag == T10_PI_REF_ESCAPE) goto next; @@ -102,42 +102,42 @@ static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, static blk_status_t t10_pi_type1_generate_crc(struct blk_integrity_iter *iter) { - return t10_pi_generate(iter, t10_pi_crc_fn, 1); + return t10_pi_generate(iter, t10_pi_crc_fn, T10_PI_TYPE1_PROTECTION); } static blk_status_t t10_pi_type1_generate_ip(struct blk_integrity_iter *iter) { - return t10_pi_generate(iter, t10_pi_ip_fn, 1); + return t10_pi_generate(iter, t10_pi_ip_fn, T10_PI_TYPE1_PROTECTION); } static blk_status_t t10_pi_type1_verify_crc(struct blk_integrity_iter *iter) { - return t10_pi_verify(iter, t10_pi_crc_fn, 1); + return t10_pi_verify(iter, t10_pi_crc_fn, T10_PI_TYPE1_PROTECTION); } static blk_status_t t10_pi_type1_verify_ip(struct blk_integrity_iter *iter) { - return t10_pi_verify(iter, t10_pi_ip_fn, 1); + return t10_pi_verify(iter, t10_pi_ip_fn, T10_PI_TYPE1_PROTECTION); } static blk_status_t t10_pi_type3_generate_crc(struct blk_integrity_iter *iter) { - return t10_pi_generate(iter, t10_pi_crc_fn, 3); + return t10_pi_generate(iter, t10_pi_crc_fn, T10_PI_TYPE3_PROTECTION); } static blk_status_t t10_pi_type3_generate_ip(struct blk_integrity_iter *iter) { - return t10_pi_generate(iter, t10_pi_ip_fn, 3); + return t10_pi_generate(iter, t10_pi_ip_fn, T10_PI_TYPE3_PROTECTION); } static blk_status_t t10_pi_type3_verify_crc(struct blk_integrity_iter *iter) { - return t10_pi_verify(iter, t10_pi_crc_fn, 3); + return t10_pi_verify(iter, t10_pi_crc_fn, T10_PI_TYPE3_PROTECTION); } static blk_status_t t10_pi_type3_verify_ip(struct blk_integrity_iter *iter) { - return t10_pi_verify(iter, t10_pi_ip_fn, 3); + return t10_pi_verify(iter, t10_pi_ip_fn, T10_PI_TYPE3_PROTECTION); } const struct blk_integrity_profile t10_pi_type1_crc = { From 54d4e6ab91eb24b47a58403d8561206e916f0242 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Mon, 16 Sep 2019 18:44:29 +0300 Subject: [PATCH 405/721] block: centralize PI remapping logic to the block layer Currently t10_pi_prepare/t10_pi_complete functions are called during the NVMe and SCSi layers command preparetion/completion, but their actual place should be the block layer since T10-PI is a general data integrity feature that is used by block storage protocols. Introduce .prepare_fn and .complete_fn callbacks within the integrity profile that each type can implement according to its needs. Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Suggested-by: Martin K. Petersen Reviewed-by: Martin K. Petersen Signed-off-by: Max Gurtovoy Fixed to not call queue integrity functions if BLK_DEV_INTEGRITY isn't defined in the config. Signed-off-by: Jens Axboe --- block/blk-core.c | 7 ++ block/blk-integrity.c | 11 +++ block/blk-mq.c | 6 ++ block/t10-pi.c | 144 ++++++++++++++++++++------------------ drivers/md/dm-integrity.c | 10 +++ drivers/nvme/host/core.c | 9 --- drivers/scsi/sd.c | 8 --- include/linux/blkdev.h | 4 ++ include/linux/t10-pi.h | 14 ---- 9 files changed, 114 insertions(+), 99 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 875e8d105067..d5e668ec751b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -1436,6 +1437,12 @@ bool blk_update_request(struct request *req, blk_status_t error, if (!req->bio) return false; +#ifdef CONFIG_BLK_DEV_INTEGRITY + if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ && + error == BLK_STS_OK) + req->q->integrity.profile->complete_fn(req, nr_bytes); +#endif + if (unlikely(error && !blk_rq_is_passthrough(req) && !(req->rq_flags & RQF_QUIET))) print_req_error(req, error, __func__); diff --git a/block/blk-integrity.c b/block/blk-integrity.c index ca39b4624cf8..ff1070edbb40 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -368,10 +368,21 @@ static blk_status_t blk_integrity_nop_fn(struct blk_integrity_iter *iter) return BLK_STS_OK; } +static void blk_integrity_nop_prepare(struct request *rq) +{ +} + +static void blk_integrity_nop_complete(struct request *rq, + unsigned int nr_bytes) +{ +} + static const struct blk_integrity_profile nop_profile = { .name = "nop", .generate_fn = blk_integrity_nop_fn, .verify_fn = blk_integrity_nop_fn, + .prepare_fn = blk_integrity_nop_prepare, + .complete_fn = blk_integrity_nop_complete, }; /** diff --git a/block/blk-mq.c b/block/blk-mq.c index 20a49be536b5..29275f5a996f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -30,6 +30,7 @@ #include #include +#include #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" @@ -700,6 +701,11 @@ void blk_mq_start_request(struct request *rq) */ rq->nr_phys_segments++; } + +#ifdef CONFIG_BLK_DEV_INTEGRITY + if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE) + q->integrity.profile->prepare_fn(rq); +#endif } EXPORT_SYMBOL(blk_mq_start_request); diff --git a/block/t10-pi.c b/block/t10-pi.c index 7fed58705d1a..0c0120a672f9 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -120,76 +120,22 @@ static blk_status_t t10_pi_type1_verify_ip(struct blk_integrity_iter *iter) return t10_pi_verify(iter, t10_pi_ip_fn, T10_PI_TYPE1_PROTECTION); } -static blk_status_t t10_pi_type3_generate_crc(struct blk_integrity_iter *iter) -{ - return t10_pi_generate(iter, t10_pi_crc_fn, T10_PI_TYPE3_PROTECTION); -} - -static blk_status_t t10_pi_type3_generate_ip(struct blk_integrity_iter *iter) -{ - return t10_pi_generate(iter, t10_pi_ip_fn, T10_PI_TYPE3_PROTECTION); -} - -static blk_status_t t10_pi_type3_verify_crc(struct blk_integrity_iter *iter) -{ - return t10_pi_verify(iter, t10_pi_crc_fn, T10_PI_TYPE3_PROTECTION); -} - -static blk_status_t t10_pi_type3_verify_ip(struct blk_integrity_iter *iter) -{ - return t10_pi_verify(iter, t10_pi_ip_fn, T10_PI_TYPE3_PROTECTION); -} - -const struct blk_integrity_profile t10_pi_type1_crc = { - .name = "T10-DIF-TYPE1-CRC", - .generate_fn = t10_pi_type1_generate_crc, - .verify_fn = t10_pi_type1_verify_crc, -}; -EXPORT_SYMBOL(t10_pi_type1_crc); - -const struct blk_integrity_profile t10_pi_type1_ip = { - .name = "T10-DIF-TYPE1-IP", - .generate_fn = t10_pi_type1_generate_ip, - .verify_fn = t10_pi_type1_verify_ip, -}; -EXPORT_SYMBOL(t10_pi_type1_ip); - -const struct blk_integrity_profile t10_pi_type3_crc = { - .name = "T10-DIF-TYPE3-CRC", - .generate_fn = t10_pi_type3_generate_crc, - .verify_fn = t10_pi_type3_verify_crc, -}; -EXPORT_SYMBOL(t10_pi_type3_crc); - -const struct blk_integrity_profile t10_pi_type3_ip = { - .name = "T10-DIF-TYPE3-IP", - .generate_fn = t10_pi_type3_generate_ip, - .verify_fn = t10_pi_type3_verify_ip, -}; -EXPORT_SYMBOL(t10_pi_type3_ip); - /** - * t10_pi_prepare - prepare PI prior submitting request to device + * t10_pi_type1_prepare - prepare PI prior submitting request to device * @rq: request with PI that should be prepared - * @protection_type: PI type (Type 1/Type 2/Type 3) * * For Type 1/Type 2, the virtual start sector is the one that was * originally submitted by the block layer for the ref_tag usage. Due to * partitioning, MD/DM cloning, etc. the actual physical start sector is * likely to be different. Remap protection information to match the * physical LBA. - * - * Type 3 does not have a reference tag so no remapping is required. */ -void t10_pi_prepare(struct request *rq, u8 protection_type) +static void t10_pi_type1_prepare(struct request *rq) { const int tuple_sz = rq->q->integrity.tuple_size; u32 ref_tag = t10_pi_ref_tag(rq); struct bio *bio; - if (protection_type == T10_PI_TYPE3_PROTECTION) - return; - __rq_for_each_bio(bio, rq) { struct bio_integrity_payload *bip = bio_integrity(bio); u32 virt = bip_get_seed(bip) & 0xffffffff; @@ -222,13 +168,11 @@ void t10_pi_prepare(struct request *rq, u8 protection_type) bip->bip_flags |= BIP_MAPPED_INTEGRITY; } } -EXPORT_SYMBOL(t10_pi_prepare); /** - * t10_pi_complete - prepare PI prior returning request to the block layer + * t10_pi_type1_complete - prepare PI prior returning request to the blk layer * @rq: request with PI that should be prepared - * @protection_type: PI type (Type 1/Type 2/Type 3) - * @intervals: total elements to prepare + * @nr_bytes: total bytes to prepare * * For Type 1/Type 2, the virtual start sector is the one that was * originally submitted by the block layer for the ref_tag usage. Due to @@ -236,19 +180,14 @@ EXPORT_SYMBOL(t10_pi_prepare); * likely to be different. Since the physical start sector was submitted * to the device, we should remap it back to virtual values expected by the * block layer. - * - * Type 3 does not have a reference tag so no remapping is required. */ -void t10_pi_complete(struct request *rq, u8 protection_type, - unsigned int intervals) +static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes) { + unsigned intervals = nr_bytes >> rq->q->integrity.interval_exp; const int tuple_sz = rq->q->integrity.tuple_size; u32 ref_tag = t10_pi_ref_tag(rq); struct bio *bio; - if (protection_type == T10_PI_TYPE3_PROTECTION) - return; - __rq_for_each_bio(bio, rq) { struct bio_integrity_payload *bip = bio_integrity(bio); u32 virt = bip_get_seed(bip) & 0xffffffff; @@ -276,4 +215,73 @@ void t10_pi_complete(struct request *rq, u8 protection_type, } } } -EXPORT_SYMBOL(t10_pi_complete); + +static blk_status_t t10_pi_type3_generate_crc(struct blk_integrity_iter *iter) +{ + return t10_pi_generate(iter, t10_pi_crc_fn, T10_PI_TYPE3_PROTECTION); +} + +static blk_status_t t10_pi_type3_generate_ip(struct blk_integrity_iter *iter) +{ + return t10_pi_generate(iter, t10_pi_ip_fn, T10_PI_TYPE3_PROTECTION); +} + +static blk_status_t t10_pi_type3_verify_crc(struct blk_integrity_iter *iter) +{ + return t10_pi_verify(iter, t10_pi_crc_fn, T10_PI_TYPE3_PROTECTION); +} + +static blk_status_t t10_pi_type3_verify_ip(struct blk_integrity_iter *iter) +{ + return t10_pi_verify(iter, t10_pi_ip_fn, T10_PI_TYPE3_PROTECTION); +} + +/** + * Type 3 does not have a reference tag so no remapping is required. + */ +static void t10_pi_type3_prepare(struct request *rq) +{ +} + +/** + * Type 3 does not have a reference tag so no remapping is required. + */ +static void t10_pi_type3_complete(struct request *rq, unsigned int nr_bytes) +{ +} + +const struct blk_integrity_profile t10_pi_type1_crc = { + .name = "T10-DIF-TYPE1-CRC", + .generate_fn = t10_pi_type1_generate_crc, + .verify_fn = t10_pi_type1_verify_crc, + .prepare_fn = t10_pi_type1_prepare, + .complete_fn = t10_pi_type1_complete, +}; +EXPORT_SYMBOL(t10_pi_type1_crc); + +const struct blk_integrity_profile t10_pi_type1_ip = { + .name = "T10-DIF-TYPE1-IP", + .generate_fn = t10_pi_type1_generate_ip, + .verify_fn = t10_pi_type1_verify_ip, + .prepare_fn = t10_pi_type1_prepare, + .complete_fn = t10_pi_type1_complete, +}; +EXPORT_SYMBOL(t10_pi_type1_ip); + +const struct blk_integrity_profile t10_pi_type3_crc = { + .name = "T10-DIF-TYPE3-CRC", + .generate_fn = t10_pi_type3_generate_crc, + .verify_fn = t10_pi_type3_verify_crc, + .prepare_fn = t10_pi_type3_prepare, + .complete_fn = t10_pi_type3_complete, +}; +EXPORT_SYMBOL(t10_pi_type3_crc); + +const struct blk_integrity_profile t10_pi_type3_ip = { + .name = "T10-DIF-TYPE3-IP", + .generate_fn = t10_pi_type3_generate_ip, + .verify_fn = t10_pi_type3_verify_ip, + .prepare_fn = t10_pi_type3_prepare, + .complete_fn = t10_pi_type3_complete, +}; +EXPORT_SYMBOL(t10_pi_type3_ip); diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 9118ab85cb3a..dab4446fe7d8 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -345,6 +345,14 @@ static void __DEBUG_bytes(__u8 *bytes, size_t len, const char *msg, ...) #define DEBUG_bytes(bytes, len, msg, ...) do { } while (0) #endif +static void dm_integrity_prepare(struct request *rq) +{ +} + +static void dm_integrity_complete(struct request *rq, unsigned int nr_bytes) +{ +} + /* * DM Integrity profile, protection is performed layer above (dm-crypt) */ @@ -352,6 +360,8 @@ static const struct blk_integrity_profile dm_integrity_profile = { .name = "DM-DIF-EXT-TAG", .generate_fn = NULL, .verify_fn = NULL, + .prepare_fn = dm_integrity_prepare, + .complete_fn = dm_integrity_complete, }; static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1ede1763a5ee..108f60b46804 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -666,8 +666,6 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, if (WARN_ON_ONCE(!nvme_ns_has_pi(ns))) return BLK_STS_NOTSUPP; control |= NVME_RW_PRINFO_PRACT; - } else if (req_op(req) == REQ_OP_WRITE) { - t10_pi_prepare(req, ns->pi_type); } switch (ns->pi_type) { @@ -690,13 +688,6 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, void nvme_cleanup_cmd(struct request *req) { - if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ && - nvme_req(req)->status == 0) { - struct nvme_ns *ns = req->rq_disk->private_data; - - t10_pi_complete(req, ns->pi_type, - blk_rq_bytes(req) >> ns->lba_shift); - } if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { struct nvme_ns *ns = req->rq_disk->private_data; struct page *page = req->special_vec.bv_page; diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 4b925552458f..0e8941caaa0d 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1211,9 +1211,6 @@ static blk_status_t sd_setup_read_write_cmnd(struct scsi_cmnd *cmd) dix = scsi_prot_sg_count(cmd); dif = scsi_host_dif_capable(cmd->device->host, sdkp->protection_type); - if (write && dix) - t10_pi_prepare(cmd->request, sdkp->protection_type); - if (dif || dix) protect = sd_setup_protect_cmnd(cmd, dix, dif); else @@ -2054,11 +2051,6 @@ static int sd_done(struct scsi_cmnd *SCpnt) "sd_done: completed %d of %d bytes\n", good_bytes, scsi_bufflen(SCpnt))); - if (rq_data_dir(SCpnt->request) == READ && scsi_prot_sg_count(SCpnt) && - good_bytes) - t10_pi_complete(SCpnt->request, sdkp->protection_type, - good_bytes / scsi_prot_interval(SCpnt)); - return good_bytes; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 3094f2d513b2..6032bb740cf4 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1522,10 +1522,14 @@ struct blk_integrity_iter { }; typedef blk_status_t (integrity_processing_fn) (struct blk_integrity_iter *); +typedef void (integrity_prepare_fn) (struct request *); +typedef void (integrity_complete_fn) (struct request *, unsigned int); struct blk_integrity_profile { integrity_processing_fn *generate_fn; integrity_processing_fn *verify_fn; + integrity_prepare_fn *prepare_fn; + integrity_complete_fn *complete_fn; const char *name; }; diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h index 3e2a80cc7b56..96305a64a5a7 100644 --- a/include/linux/t10-pi.h +++ b/include/linux/t10-pi.h @@ -53,18 +53,4 @@ extern const struct blk_integrity_profile t10_pi_type1_ip; extern const struct blk_integrity_profile t10_pi_type3_crc; extern const struct blk_integrity_profile t10_pi_type3_ip; -#ifdef CONFIG_BLK_DEV_INTEGRITY -extern void t10_pi_prepare(struct request *rq, u8 protection_type); -extern void t10_pi_complete(struct request *rq, u8 protection_type, - unsigned int intervals); -#else -static inline void t10_pi_complete(struct request *rq, u8 protection_type, - unsigned int intervals) -{ -} -static inline void t10_pi_prepare(struct request *rq, u8 protection_type) -{ -} -#endif - #endif From 23ed570accc94cf9de9036b71102aa7c0e0e8dc6 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Thu, 22 Aug 2019 17:20:34 +0200 Subject: [PATCH 406/721] block, bfq: update inject limit only after injection occurred BFQ updates the injection limit of each bfq_queue as a function of how much the limit inflates the service times experienced by the I/O requests of the queue. So only service times affected by injection must be taken into account. Unfortunately, in the current implementation of this update scheme, the service time of an I/O request rq not affected by injection may happen to be considered in the following case: there is no I/O request in service when rq arrives. This commit fixes this issue by making sure that only service times affected by injection are considered for updating the injection limit. In particular, the service time of an I/O request rq is now considered only if at least one of the following two conditions holds: - the destination bfq_queue for rq underwent injection before rq arrival, and there is still I/O in service in the drive on rq arrival (the service of such unfinished I/O may delay the service of rq); - injection occurs between the arrival and the completion time of rq. Tested-by: Oleksandr Natalenko Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index b33be928d164..5a2bbd8613a8 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2025,7 +2025,21 @@ static void bfq_add_request(struct request *rq) * be set when rq will be dispatched. */ bfqd->wait_dispatch = true; - bfqd->rqs_injected = false; + /* + * If there is no I/O in service in the drive, + * then possible injection occurred before the + * arrival of rq will not affect the total + * service time of rq. So the injection limit + * must not be updated as a function of such + * total service time, unless new injection + * occurs before rq is completed. To have the + * injection limit updated only in the latter + * case, reset rqs_injected here (rqs_injected + * will be set in case injection is performed + * on bfqq before rq is completed). + */ + if (bfqd->rq_in_driver == 0) + bfqd->rqs_injected = false; } } @@ -5784,7 +5798,7 @@ static void bfq_update_inject_limit(struct bfq_data *bfqd, u64 tot_time_ns = ktime_get_ns() - bfqd->last_empty_occupied_ns; unsigned int old_limit = bfqq->inject_limit; - if (bfqq->last_serv_time_ns > 0) { + if (bfqq->last_serv_time_ns > 0 && bfqd->rqs_injected) { u64 threshold = (bfqq->last_serv_time_ns * 3)>>1; if (tot_time_ns >= threshold && old_limit > 0) { @@ -5830,6 +5844,7 @@ static void bfq_update_inject_limit(struct bfq_data *bfqd, /* update complete, not waiting for any request completion any longer */ bfqd->waited_rq = NULL; + bfqd->rqs_injected = false; } /* From c1e0a18228822258cbeb49a923d93f8f01c74a76 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Thu, 22 Aug 2019 17:20:35 +0200 Subject: [PATCH 407/721] block, bfq: reduce upper bound for inject limit to max_rq_in_driver+1 Upon an increment attempt of the injection limit, the latter is constrained not to become higher than twice the maximum number max_rq_in_driver of I/O requests that have happened to be in service in the drive. This high bound allows the injection limit to grow beyond max_rq_in_driver, which may then cause max_rq_in_driver itself to grow. However, since the limit is incremented by only one unit at a time, there is no need for such a high bound, and just max_rq_in_driver+1 is enough. Tested-by: Oleksandr Natalenko Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 5a2bbd8613a8..e114282204f6 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5805,7 +5805,7 @@ static void bfq_update_inject_limit(struct bfq_data *bfqd, bfqq->inject_limit--; bfqq->decrease_time_jif = jiffies; } else if (tot_time_ns < threshold && - old_limit < bfqd->max_rq_in_driver<<1) + old_limit <= bfqd->max_rq_in_driver) bfqq->inject_limit++; } From 17c3d2660268501d36b75c7c7083dc8f2bbb93c3 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Thu, 22 Aug 2019 17:20:36 +0200 Subject: [PATCH 408/721] block, bfq: increase update frequency of inject limit The update period of the injection limit has been tentatively set to 100 ms, to reduce fluctuations. This value however proved to cause, occasionally, the limit to be decremented for some bfq_queue only after the queue underwent excessive injection for a lot of time. This commit reduces the period to 10 ms. Tested-by: Oleksandr Natalenko Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index e114282204f6..ddac93e910fa 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2016,7 +2016,7 @@ static void bfq_add_request(struct request *rq) (bfqq->last_serv_time_ns > 0 && bfqd->rqs_injected && bfqd->rq_in_driver > 0)) && time_is_before_eq_jiffies(bfqq->decrease_time_jif + - msecs_to_jiffies(100))) { + msecs_to_jiffies(10))) { bfqd->last_empty_occupied_ns = ktime_get_ns(); /* * Start the state machine for measuring the From 58494c980f40274c465ebfdece02d401def088bf Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Thu, 22 Aug 2019 17:20:37 +0200 Subject: [PATCH 409/721] block, bfq: push up injection only after setting service time If equal to 0, the injection limit for a bfq_queue is pushed to 1 after a first sample of the total service time of the I/O requests of the queue is computed (to allow injection to start). Yet, because of a mistake in the branch that performs this action, the push may happen also in some other case. This commit fixes this issue. Tested-by: Oleksandr Natalenko Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index ddac93e910fa..0319d6339822 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5823,12 +5823,14 @@ static void bfq_update_inject_limit(struct bfq_data *bfqd, */ if ((bfqq->last_serv_time_ns == 0 && bfqd->rq_in_driver == 1) || tot_time_ns < bfqq->last_serv_time_ns) { + if (bfqq->last_serv_time_ns == 0) { + /* + * Now we certainly have a base value: make sure we + * start trying injection. + */ + bfqq->inject_limit = max_t(unsigned int, 1, old_limit); + } bfqq->last_serv_time_ns = tot_time_ns; - /* - * Now we certainly have a base value: make sure we - * start trying injection. - */ - bfqq->inject_limit = max_t(unsigned int, 1, old_limit); } else if (!bfqd->rqs_injected && bfqd->rq_in_driver == 1) /* * No I/O injected and no request still in service in From ec76a7b922e42df1437e39b44c564ba892676f0e Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Tue, 17 Sep 2019 17:26:05 +0530 Subject: [PATCH 410/721] nbd: rename the runtime flags as NBD_RT_ prefixed Preparing for the destory when disconnecting crash fixing. Reviewed-by: Josef Bacik Signed-off-by: Xiubo Li Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 74 ++++++++++++++++++++++----------------------- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index a8e3815295fe..7e0501c47153 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -71,14 +71,14 @@ struct link_dead_args { int index; }; -#define NBD_TIMEDOUT 0 -#define NBD_DISCONNECT_REQUESTED 1 -#define NBD_DISCONNECTED 2 -#define NBD_HAS_PID_FILE 3 -#define NBD_HAS_CONFIG_REF 4 -#define NBD_BOUND 5 -#define NBD_DESTROY_ON_DISCONNECT 6 -#define NBD_DISCONNECT_ON_CLOSE 7 +#define NBD_RT_TIMEDOUT 0 +#define NBD_RT_DISCONNECT_REQUESTED 1 +#define NBD_RT_DISCONNECTED 2 +#define NBD_RT_HAS_PID_FILE 3 +#define NBD_RT_HAS_CONFIG_REF 4 +#define NBD_RT_BOUND 5 +#define NBD_RT_DESTROY_ON_DISCONNECT 6 +#define NBD_RT_DISCONNECT_ON_CLOSE 7 struct nbd_config { u32 flags; @@ -238,8 +238,8 @@ static void nbd_put(struct nbd_device *nbd) static int nbd_disconnected(struct nbd_config *config) { - return test_bit(NBD_DISCONNECTED, &config->runtime_flags) || - test_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags); + return test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags) || + test_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags); } static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock, @@ -257,9 +257,9 @@ static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock, if (!nsock->dead) { kernel_sock_shutdown(nsock->sock, SHUT_RDWR); if (atomic_dec_return(&nbd->config->live_connections) == 0) { - if (test_and_clear_bit(NBD_DISCONNECT_REQUESTED, + if (test_and_clear_bit(NBD_RT_DISCONNECT_REQUESTED, &nbd->config->runtime_flags)) { - set_bit(NBD_DISCONNECTED, + set_bit(NBD_RT_DISCONNECTED, &nbd->config->runtime_flags); dev_info(nbd_to_dev(nbd), "Disconnected due to user request.\n"); @@ -333,7 +333,7 @@ static void sock_shutdown(struct nbd_device *nbd) if (config->num_connections == 0) return; - if (test_and_set_bit(NBD_DISCONNECTED, &config->runtime_flags)) + if (test_and_set_bit(NBD_RT_DISCONNECTED, &config->runtime_flags)) return; for (i = 0; i < config->num_connections; i++) { @@ -427,7 +427,7 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, } dev_err_ratelimited(nbd_to_dev(nbd), "Connection timed out\n"); - set_bit(NBD_TIMEDOUT, &config->runtime_flags); + set_bit(NBD_RT_TIMEDOUT, &config->runtime_flags); cmd->status = BLK_STS_IOERR; mutex_unlock(&cmd->lock); sock_shutdown(nbd); @@ -795,7 +795,7 @@ static int find_fallback(struct nbd_device *nbd, int index) struct nbd_sock *nsock = config->socks[index]; int fallback = nsock->fallback_index; - if (test_bit(NBD_DISCONNECTED, &config->runtime_flags)) + if (test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags)) return new_index; if (config->num_connections <= 1) { @@ -836,7 +836,7 @@ static int wait_for_reconnect(struct nbd_device *nbd) struct nbd_config *config = nbd->config; if (!config->dead_conn_timeout) return 0; - if (test_bit(NBD_DISCONNECTED, &config->runtime_flags)) + if (test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags)) return 0; return wait_event_timeout(config->conn_wait, atomic_read(&config->live_connections) > 0, @@ -969,12 +969,12 @@ static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg, return err; if (!netlink && !nbd->task_setup && - !test_bit(NBD_BOUND, &config->runtime_flags)) + !test_bit(NBD_RT_BOUND, &config->runtime_flags)) nbd->task_setup = current; if (!netlink && (nbd->task_setup != current || - test_bit(NBD_BOUND, &config->runtime_flags))) { + test_bit(NBD_RT_BOUND, &config->runtime_flags))) { dev_err(disk_to_dev(nbd->disk), "Device being setup by another task"); sockfd_put(sock); @@ -1053,7 +1053,7 @@ static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg) mutex_unlock(&nsock->tx_lock); sockfd_put(old); - clear_bit(NBD_DISCONNECTED, &config->runtime_flags); + clear_bit(NBD_RT_DISCONNECTED, &config->runtime_flags); /* We take the tx_mutex in an error path in the recv_work, so we * need to queue_work outside of the tx_mutex. @@ -1124,7 +1124,7 @@ static int nbd_disconnect(struct nbd_device *nbd) struct nbd_config *config = nbd->config; dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n"); - set_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags); + set_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags); send_disconnects(nbd); return 0; } @@ -1143,7 +1143,7 @@ static void nbd_config_put(struct nbd_device *nbd) struct nbd_config *config = nbd->config; nbd_dev_dbg_close(nbd); nbd_size_clear(nbd); - if (test_and_clear_bit(NBD_HAS_PID_FILE, + if (test_and_clear_bit(NBD_RT_HAS_PID_FILE, &config->runtime_flags)) device_remove_file(disk_to_dev(nbd->disk), &pid_attr); nbd->task_recv = NULL; @@ -1209,7 +1209,7 @@ static int nbd_start_device(struct nbd_device *nbd) dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n"); return error; } - set_bit(NBD_HAS_PID_FILE, &config->runtime_flags); + set_bit(NBD_RT_HAS_PID_FILE, &config->runtime_flags); nbd_dev_dbg_init(nbd); for (i = 0; i < num_connections; i++) { @@ -1256,9 +1256,9 @@ static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *b mutex_lock(&nbd->config_lock); nbd_bdev_reset(bdev); /* user requested, ignore socket errors */ - if (test_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags)) + if (test_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags)) ret = 0; - if (test_bit(NBD_TIMEDOUT, &config->runtime_flags)) + if (test_bit(NBD_RT_TIMEDOUT, &config->runtime_flags)) ret = -ETIMEDOUT; return ret; } @@ -1269,7 +1269,7 @@ static void nbd_clear_sock_ioctl(struct nbd_device *nbd, sock_shutdown(nbd); __invalidate_device(bdev, true); nbd_bdev_reset(bdev); - if (test_and_clear_bit(NBD_HAS_CONFIG_REF, + if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF, &nbd->config->runtime_flags)) nbd_config_put(nbd); } @@ -1364,7 +1364,7 @@ static int nbd_ioctl(struct block_device *bdev, fmode_t mode, /* Don't allow ioctl operations on a nbd device that was created with * netlink, unless it's DISCONNECT or CLEAR_SOCK, which are fine. */ - if (!test_bit(NBD_BOUND, &config->runtime_flags) || + if (!test_bit(NBD_RT_BOUND, &config->runtime_flags) || (cmd == NBD_DISCONNECT || cmd == NBD_CLEAR_SOCK)) error = __nbd_ioctl(bdev, nbd, cmd, arg); else @@ -1435,7 +1435,7 @@ static void nbd_release(struct gendisk *disk, fmode_t mode) struct nbd_device *nbd = disk->private_data; struct block_device *bdev = bdget_disk(disk, 0); - if (test_bit(NBD_DISCONNECT_ON_CLOSE, &nbd->config->runtime_flags) && + if (test_bit(NBD_RT_DISCONNECT_ON_CLOSE, &nbd->config->runtime_flags) && bdev->bd_openers == 0) nbd_disconnect_and_put(nbd); @@ -1833,7 +1833,7 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info) return -ENOMEM; } refcount_set(&nbd->config_refs, 1); - set_bit(NBD_BOUND, &config->runtime_flags); + set_bit(NBD_RT_BOUND, &config->runtime_flags); ret = nbd_genl_size_set(info, nbd); if (ret) @@ -1853,12 +1853,12 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) { u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]); if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) { - set_bit(NBD_DESTROY_ON_DISCONNECT, + set_bit(NBD_RT_DESTROY_ON_DISCONNECT, &config->runtime_flags); put_dev = true; } if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) { - set_bit(NBD_DISCONNECT_ON_CLOSE, + set_bit(NBD_RT_DISCONNECT_ON_CLOSE, &config->runtime_flags); } } @@ -1897,7 +1897,7 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info) out: mutex_unlock(&nbd->config_lock); if (!ret) { - set_bit(NBD_HAS_CONFIG_REF, &config->runtime_flags); + set_bit(NBD_RT_HAS_CONFIG_REF, &config->runtime_flags); refcount_inc(&nbd->config_refs); nbd_connect_reply(info, nbd->index); } @@ -1919,7 +1919,7 @@ static void nbd_disconnect_and_put(struct nbd_device *nbd) * queue. */ flush_workqueue(nbd->recv_workq); - if (test_and_clear_bit(NBD_HAS_CONFIG_REF, + if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF, &nbd->config->runtime_flags)) nbd_config_put(nbd); } @@ -2003,7 +2003,7 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info) mutex_lock(&nbd->config_lock); config = nbd->config; - if (!test_bit(NBD_BOUND, &config->runtime_flags) || + if (!test_bit(NBD_RT_BOUND, &config->runtime_flags) || !nbd->task_recv) { dev_err(nbd_to_dev(nbd), "not configured, cannot reconfigure\n"); @@ -2026,20 +2026,20 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) { u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]); if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) { - if (!test_and_set_bit(NBD_DESTROY_ON_DISCONNECT, + if (!test_and_set_bit(NBD_RT_DESTROY_ON_DISCONNECT, &config->runtime_flags)) put_dev = true; } else { - if (test_and_clear_bit(NBD_DESTROY_ON_DISCONNECT, + if (test_and_clear_bit(NBD_RT_DESTROY_ON_DISCONNECT, &config->runtime_flags)) refcount_inc(&nbd->refs); } if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) { - set_bit(NBD_DISCONNECT_ON_CLOSE, + set_bit(NBD_RT_DISCONNECT_ON_CLOSE, &config->runtime_flags); } else { - clear_bit(NBD_DISCONNECT_ON_CLOSE, + clear_bit(NBD_RT_DISCONNECT_ON_CLOSE, &config->runtime_flags); } } From 8454d68563d400fa09b63dc636361b6702ceb8af Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Tue, 17 Sep 2019 17:26:06 +0530 Subject: [PATCH 411/721] nbd: fix possible page fault for nbd disk When the NBD_CFLAG_DESTROY_ON_DISCONNECT flag is set and at the same time when the socket is closed due to the server daemon is restarted, just before the last DISCONNET is totally done if we start a new connection by using the old nbd_index, there will be crashing randomly, like: <3>[ 110.151949] block nbd1: Receive control failed (result -32) <1>[ 110.152024] BUG: unable to handle page fault for address: 0000058000000840 <1>[ 110.152063] #PF: supervisor read access in kernel mode <1>[ 110.152083] #PF: error_code(0x0000) - not-present page <6>[ 110.152094] PGD 0 P4D 0 <4>[ 110.152106] Oops: 0000 [#1] SMP PTI <4>[ 110.152120] CPU: 0 PID: 6698 Comm: kworker/u5:1 Kdump: loaded Not tainted 5.3.0-rc4+ #2 <4>[ 110.152136] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 <4>[ 110.152166] Workqueue: knbd-recv recv_work [nbd] <4>[ 110.152187] RIP: 0010:__dev_printk+0xd/0x67 <4>[ 110.152206] Code: 10 e8 c5 fd ff ff 48 8b 4c 24 18 65 48 33 0c 25 28 00 [...] <4>[ 110.152244] RSP: 0018:ffffa41581f13d18 EFLAGS: 00010206 <4>[ 110.152256] RAX: ffffa41581f13d30 RBX: ffff96dd7374e900 RCX: 0000000000000000 <4>[ 110.152271] RDX: ffffa41581f13d20 RSI: 00000580000007f0 RDI: ffffffff970ec24f <4>[ 110.152285] RBP: ffffa41581f13d80 R08: ffff96dd7fc17908 R09: 0000000000002e56 <4>[ 110.152299] R10: ffffffff970ec24f R11: 0000000000000003 R12: ffff96dd7374e900 <4>[ 110.152313] R13: 0000000000000000 R14: ffff96dd7374e9d8 R15: ffff96dd6e3b02c8 <4>[ 110.152329] FS: 0000000000000000(0000) GS:ffff96dd7fc00000(0000) knlGS:0000000000000000 <4>[ 110.152362] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 <4>[ 110.152383] CR2: 0000058000000840 CR3: 0000000067cc6002 CR4: 00000000001606f0 <4>[ 110.152401] Call Trace: <4>[ 110.152422] _dev_err+0x6c/0x83 <4>[ 110.152435] nbd_read_stat.cold+0xda/0x578 [nbd] <4>[ 110.152448] ? __switch_to_asm+0x34/0x70 <4>[ 110.152468] ? __switch_to_asm+0x40/0x70 <4>[ 110.152478] ? __switch_to_asm+0x34/0x70 <4>[ 110.152491] ? __switch_to_asm+0x40/0x70 <4>[ 110.152501] ? __switch_to_asm+0x34/0x70 <4>[ 110.152511] ? __switch_to_asm+0x40/0x70 <4>[ 110.152522] ? __switch_to_asm+0x34/0x70 <4>[ 110.152533] recv_work+0x35/0x9e [nbd] <4>[ 110.152547] process_one_work+0x19d/0x340 <4>[ 110.152558] worker_thread+0x50/0x3b0 <4>[ 110.152568] kthread+0xfb/0x130 <4>[ 110.152577] ? process_one_work+0x340/0x340 <4>[ 110.152609] ? kthread_park+0x80/0x80 <4>[ 110.152637] ret_from_fork+0x35/0x40 This is very easy to reproduce by running the nbd-runner. Reviewed-by: Josef Bacik Signed-off-by: Xiubo Li Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 7e0501c47153..ac07e8c94c79 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -80,6 +81,9 @@ struct link_dead_args { #define NBD_RT_DESTROY_ON_DISCONNECT 6 #define NBD_RT_DISCONNECT_ON_CLOSE 7 +#define NBD_DESTROY_ON_DISCONNECT 0 +#define NBD_DISCONNECT_REQUESTED 1 + struct nbd_config { u32 flags; unsigned long runtime_flags; @@ -113,6 +117,9 @@ struct nbd_device { struct list_head list; struct task_struct *task_recv; struct task_struct *task_setup; + + struct completion *destroy_complete; + unsigned long flags; }; #define NBD_CMD_REQUEUED 1 @@ -223,6 +230,16 @@ static void nbd_dev_remove(struct nbd_device *nbd) disk->private_data = NULL; put_disk(disk); } + + /* + * Place this in the last just before the nbd is freed to + * make sure that the disk and the related kobject are also + * totally removed to avoid duplicate creation of the same + * one. + */ + if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) && nbd->destroy_complete) + complete(nbd->destroy_complete); + kfree(nbd); } @@ -1125,6 +1142,7 @@ static int nbd_disconnect(struct nbd_device *nbd) dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n"); set_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags); + set_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags); send_disconnects(nbd); return 0; } @@ -1636,6 +1654,7 @@ static int nbd_dev_add(int index) nbd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING; nbd->tag_set.driver_data = nbd; + nbd->destroy_complete = NULL; err = blk_mq_alloc_tag_set(&nbd->tag_set); if (err) @@ -1750,6 +1769,7 @@ static int nbd_genl_size_set(struct genl_info *info, struct nbd_device *nbd) static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info) { + DECLARE_COMPLETION_ONSTACK(destroy_complete); struct nbd_device *nbd = NULL; struct nbd_config *config; int index = -1; @@ -1801,6 +1821,17 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info) mutex_unlock(&nbd_index_mutex); return -EINVAL; } + + if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) && + test_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags)) { + nbd->destroy_complete = &destroy_complete; + mutex_unlock(&nbd_index_mutex); + + /* Wait untill the the nbd stuff is totally destroyed */ + wait_for_completion(&destroy_complete); + goto again; + } + if (!refcount_inc_not_zero(&nbd->refs)) { mutex_unlock(&nbd_index_mutex); if (index == -1) @@ -1855,7 +1886,10 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info) if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) { set_bit(NBD_RT_DESTROY_ON_DISCONNECT, &config->runtime_flags); + set_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags); put_dev = true; + } else { + clear_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags); } if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) { set_bit(NBD_RT_DISCONNECT_ON_CLOSE, @@ -2029,10 +2063,12 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info) if (!test_and_set_bit(NBD_RT_DESTROY_ON_DISCONNECT, &config->runtime_flags)) put_dev = true; + set_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags); } else { if (test_and_clear_bit(NBD_RT_DESTROY_ON_DISCONNECT, &config->runtime_flags)) refcount_inc(&nbd->refs); + clear_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags); } if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) { From 6a090e97972dc7bb6f2661eda3eabd8ae21e9b6d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 16 Sep 2019 18:39:44 -0700 Subject: [PATCH 412/721] arch/microblaze: support get_user() of size 8 bytes arch/microblaze/ is missing support for get_user() of size 8 bytes, so add it by using __copy_from_user(). While there, also drop a lot of the code duplication. Fixes these build errors: drivers/infiniband/core/uverbs_main.o: In function `ib_uverbs_write': drivers/infiniband/core/.tmp_gl_uverbs_main.o:(.text+0x13a4): undefined reference to `__user_bad' drivers/android/binder.o: In function `binder_thread_write': drivers/android/.tmp_gl_binder.o:(.text+0xda6c): undefined reference to `__user_bad' drivers/android/.tmp_gl_binder.o:(.text+0xda98): undefined reference to `__user_bad' drivers/android/.tmp_gl_binder.o:(.text+0xdf10): undefined reference to `__user_bad' drivers/android/.tmp_gl_binder.o:(.text+0xe498): undefined reference to `__user_bad' drivers/android/binder.o:drivers/android/.tmp_gl_binder.o:(.text+0xea78): more undefined references to `__user_bad' follow 'make allmodconfig' now builds successfully for arch/microblaze/. Fixes: 538722ca3b76 ("microblaze: fix get_user/put_user side-effects") Reported-by: kbuild test robot Signed-off-by: Randy Dunlap Acked-by: Linus Torvalds Cc: Al Viro Cc: Steven J. Magnani Cc: Michal Simek Cc: Jason Gunthorpe Cc: Leon Romanovsky Cc: Andrew Morton Cc: Doug Ledford Signed-off-by: Michal Simek --- arch/microblaze/include/asm/uaccess.h | 42 ++++++--------------------- 1 file changed, 9 insertions(+), 33 deletions(-) diff --git a/arch/microblaze/include/asm/uaccess.h b/arch/microblaze/include/asm/uaccess.h index bff2a71c828a..a1f206b90753 100644 --- a/arch/microblaze/include/asm/uaccess.h +++ b/arch/microblaze/include/asm/uaccess.h @@ -163,44 +163,15 @@ extern long __user_bad(void); * Returns zero on success, or -EFAULT on error. * On error, the variable @x is set to zero. */ -#define get_user(x, ptr) \ - __get_user_check((x), (ptr), sizeof(*(ptr))) - -#define __get_user_check(x, ptr, size) \ -({ \ - unsigned long __gu_val = 0; \ - const typeof(*(ptr)) __user *__gu_addr = (ptr); \ - int __gu_err = 0; \ - \ - if (access_ok(__gu_addr, size)) { \ - switch (size) { \ - case 1: \ - __get_user_asm("lbu", __gu_addr, __gu_val, \ - __gu_err); \ - break; \ - case 2: \ - __get_user_asm("lhu", __gu_addr, __gu_val, \ - __gu_err); \ - break; \ - case 4: \ - __get_user_asm("lw", __gu_addr, __gu_val, \ - __gu_err); \ - break; \ - default: \ - __gu_err = __user_bad(); \ - break; \ - } \ - } else { \ - __gu_err = -EFAULT; \ - } \ - x = (__force typeof(*(ptr)))__gu_val; \ - __gu_err; \ +#define get_user(x, ptr) ({ \ + const typeof(*(ptr)) __user *__gu_ptr = (ptr); \ + access_ok(__gu_ptr, sizeof(*__gu_ptr)) ? \ + __get_user(x, __gu_ptr) : -EFAULT; \ }) #define __get_user(x, ptr) \ ({ \ unsigned long __gu_val = 0; \ - /*unsigned long __gu_ptr = (unsigned long)(ptr);*/ \ long __gu_err; \ switch (sizeof(*(ptr))) { \ case 1: \ @@ -212,6 +183,11 @@ extern long __user_bad(void); case 4: \ __get_user_asm("lw", (ptr), __gu_val, __gu_err); \ break; \ + case 8: \ + __gu_err = __copy_from_user(&__gu_val, ptr, 8); \ + if (__gu_err) \ + __gu_err = -EFAULT; \ + break; \ default: \ /* __gu_val = 0; __gu_err = -EINVAL;*/ __gu_err = __user_bad();\ } \ From 6be76fd94ba31dae3f62e77d687614ddb75d6b3a Mon Sep 17 00:00:00 2001 From: Michal Simek Date: Wed, 4 Sep 2019 10:16:22 +0200 Subject: [PATCH 413/721] microblaze: Enable Xilinx AXI emac driver by default Enable Xilinx AXI emac ethernet driver for Microblaze. Signed-off-by: Michal Simek --- arch/microblaze/configs/mmu_defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/microblaze/configs/mmu_defconfig b/arch/microblaze/configs/mmu_defconfig index 92fd4e95b488..990f23d5fe50 100644 --- a/arch/microblaze/configs/mmu_defconfig +++ b/arch/microblaze/configs/mmu_defconfig @@ -41,6 +41,7 @@ CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=8192 CONFIG_NETDEVICES=y CONFIG_XILINX_EMACLITE=y +CONFIG_XILINX_AXI_EMAC=y CONFIG_XILINX_LL_TEMAC=y # CONFIG_INPUT is not set # CONFIG_SERIO is not set From 1c62ed908363e249f3b0e1879d9d982e8431fb3f Mon Sep 17 00:00:00 2001 From: Michal Simek Date: Wed, 4 Sep 2019 10:42:12 +0200 Subject: [PATCH 414/721] microblaze: defconfig synchronization Kconfig structure has changed that's why make sense to do synchronization. Signed-off-by: Michal Simek --- arch/microblaze/configs/mmu_defconfig | 19 ++++++++----------- arch/microblaze/configs/nommu_defconfig | 23 +++++++++++------------ 2 files changed, 19 insertions(+), 23 deletions(-) diff --git a/arch/microblaze/configs/mmu_defconfig b/arch/microblaze/configs/mmu_defconfig index 990f23d5fe50..b83aff1e86cd 100644 --- a/arch/microblaze/configs/mmu_defconfig +++ b/arch/microblaze/configs/mmu_defconfig @@ -5,15 +5,10 @@ CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_SYSFS_DEPRECATED=y CONFIG_SYSFS_DEPRECATED_V2=y -CONFIG_KALLSYMS_ALL=y # CONFIG_BASE_FULL is not set +CONFIG_KALLSYMS_ALL=y CONFIG_EMBEDDED=y CONFIG_SLAB=y -CONFIG_MODULES=y -CONFIG_MODULE_UNLOAD=y -# CONFIG_BLK_DEV_BSG is not set -CONFIG_PARTITION_ADVANCED=y -# CONFIG_EFI_PARTITION is not set CONFIG_XILINX_MICROBLAZE0_USE_MSR_INSTR=1 CONFIG_XILINX_MICROBLAZE0_USE_PCMP_INSTR=1 CONFIG_XILINX_MICROBLAZE0_USE_BARREL=1 @@ -25,14 +20,19 @@ CONFIG_MMU=y CONFIG_CMDLINE_BOOL=y CONFIG_CMDLINE_FORCE=y CONFIG_HIGHMEM=y -CONFIG_PCI=y CONFIG_PCI_XILINX=y +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y +# CONFIG_BLK_DEV_BSG is not set +CONFIG_PARTITION_ADVANCED=y +# CONFIG_EFI_PARTITION is not set CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y CONFIG_INET=y # CONFIG_IPV6 is not set CONFIG_BRIDGE=m +CONFIG_PCI=y CONFIG_MTD=y CONFIG_MTD_CFI=y CONFIG_MTD_CFI_INTELEXT=y @@ -75,8 +75,8 @@ CONFIG_CRAMFS=y CONFIG_ROMFS_FS=y CONFIG_NFS_FS=y CONFIG_CIFS=y -CONFIG_CIFS_STATS=y CONFIG_CIFS_STATS2=y +CONFIG_ENCRYPTED_KEYS=y CONFIG_DEBUG_INFO=y CONFIG_DEBUG_SLAB=y CONFIG_DETECT_HUNG_TASK=y @@ -84,6 +84,3 @@ CONFIG_DEBUG_SPINLOCK=y CONFIG_KGDB=y CONFIG_KGDB_TESTS=y CONFIG_KGDB_KDB=y -CONFIG_EARLY_PRINTK=y -CONFIG_KEYS=y -CONFIG_ENCRYPTED_KEYS=y diff --git a/arch/microblaze/configs/nommu_defconfig b/arch/microblaze/configs/nommu_defconfig index 06d69a6e192d..dc102eb65fca 100644 --- a/arch/microblaze/configs/nommu_defconfig +++ b/arch/microblaze/configs/nommu_defconfig @@ -7,15 +7,10 @@ CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_SYSFS_DEPRECATED=y CONFIG_SYSFS_DEPRECATED_V2=y -CONFIG_KALLSYMS_ALL=y # CONFIG_BASE_FULL is not set +CONFIG_KALLSYMS_ALL=y CONFIG_EMBEDDED=y CONFIG_SLAB=y -CONFIG_MODULES=y -CONFIG_MODULE_UNLOAD=y -# CONFIG_BLK_DEV_BSG is not set -CONFIG_PARTITION_ADVANCED=y -# CONFIG_EFI_PARTITION is not set CONFIG_XILINX_MICROBLAZE0_USE_MSR_INSTR=1 CONFIG_XILINX_MICROBLAZE0_USE_PCMP_INSTR=1 CONFIG_XILINX_MICROBLAZE0_USE_BARREL=1 @@ -25,13 +20,18 @@ CONFIG_XILINX_MICROBLAZE0_USE_FPU=2 CONFIG_HZ_100=y CONFIG_CMDLINE_BOOL=y CONFIG_CMDLINE_FORCE=y -CONFIG_PCI=y CONFIG_PCI_XILINX=y +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y +# CONFIG_BLK_DEV_BSG is not set +CONFIG_PARTITION_ADVANCED=y +# CONFIG_EFI_PARTITION is not set CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y CONFIG_INET=y # CONFIG_IPV6 is not set +CONFIG_PCI=y CONFIG_MTD=y CONFIG_MTD_CMDLINE_PARTS=y CONFIG_MTD_BLOCK=y @@ -75,11 +75,6 @@ CONFIG_ROMFS_FS=y CONFIG_NFS_FS=y CONFIG_NFS_V3_ACL=y CONFIG_NLS=y -CONFIG_DEBUG_INFO=y -CONFIG_DEBUG_SLAB=y -CONFIG_DETECT_HUNG_TASK=y -CONFIG_DEBUG_SPINLOCK=y -CONFIG_EARLY_PRINTK=y CONFIG_KEYS=y CONFIG_ENCRYPTED_KEYS=y CONFIG_CRYPTO_ECB=y @@ -87,3 +82,7 @@ CONFIG_CRYPTO_MD4=y CONFIG_CRYPTO_MD5=y CONFIG_CRYPTO_ARC4=y CONFIG_CRYPTO_DES=y +CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_SLAB=y +CONFIG_DETECT_HUNG_TASK=y +CONFIG_DEBUG_SPINLOCK=y From 0dce49efc70536a8c3b4bb5354a71b727ba31b80 Mon Sep 17 00:00:00 2001 From: Gregory CLEMENT Date: Wed, 18 Sep 2019 12:03:44 +0200 Subject: [PATCH 415/721] ASoC: atmel_ssc_dai: Remove wrong spinlock usage A potential bug was reported in the email "[BUG] atmel_ssc_dai: a possible sleep-in-atomic bug in atmel_ssc_shutdown"[1] Indeed in the function atmel_ssc_shutdown() free_irq() was called in a critical section protected by spinlock. However this spinlock is only used in atmel_ssc_shutdown() and atmel_ssc_startup() functions. After further analysis, it occurred that the call to these function are already protected by mutex used on the calling functions. Then we can remove the spinlock which will fix this bug as a side effect. Thanks to this patch the following message disappears: "BUG: sleeping function called from invalid context at kernel/locking/mutex.c:909" [1]: https://www.spinics.net/lists/alsa-devel/msg71286.html Reviewed-by: Alexandre Belloni Signed-off-by: Gregory CLEMENT Link: https://lore.kernel.org/r/20190918100344.23629-1-gregory.clement@bootlin.com Signed-off-by: Mark Brown --- sound/soc/atmel/atmel_ssc_dai.c | 12 ++---------- sound/soc/atmel/atmel_ssc_dai.h | 1 - 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/sound/soc/atmel/atmel_ssc_dai.c b/sound/soc/atmel/atmel_ssc_dai.c index 48e9eef34c0f..ca603397651c 100644 --- a/sound/soc/atmel/atmel_ssc_dai.c +++ b/sound/soc/atmel/atmel_ssc_dai.c @@ -116,19 +116,16 @@ static struct atmel_pcm_dma_params ssc_dma_params[NUM_SSC_DEVICES][2] = { static struct atmel_ssc_info ssc_info[NUM_SSC_DEVICES] = { { .name = "ssc0", - .lock = __SPIN_LOCK_UNLOCKED(ssc_info[0].lock), .dir_mask = SSC_DIR_MASK_UNUSED, .initialized = 0, }, { .name = "ssc1", - .lock = __SPIN_LOCK_UNLOCKED(ssc_info[1].lock), .dir_mask = SSC_DIR_MASK_UNUSED, .initialized = 0, }, { .name = "ssc2", - .lock = __SPIN_LOCK_UNLOCKED(ssc_info[2].lock), .dir_mask = SSC_DIR_MASK_UNUSED, .initialized = 0, }, @@ -317,13 +314,10 @@ static int atmel_ssc_startup(struct snd_pcm_substream *substream, snd_soc_dai_set_dma_data(dai, substream, dma_params); - spin_lock_irq(&ssc_p->lock); - if (ssc_p->dir_mask & dir_mask) { - spin_unlock_irq(&ssc_p->lock); + if (ssc_p->dir_mask & dir_mask) return -EBUSY; - } + ssc_p->dir_mask |= dir_mask; - spin_unlock_irq(&ssc_p->lock); return 0; } @@ -355,7 +349,6 @@ static void atmel_ssc_shutdown(struct snd_pcm_substream *substream, dir_mask = 1 << dir; - spin_lock_irq(&ssc_p->lock); ssc_p->dir_mask &= ~dir_mask; if (!ssc_p->dir_mask) { if (ssc_p->initialized) { @@ -369,7 +362,6 @@ static void atmel_ssc_shutdown(struct snd_pcm_substream *substream, ssc_p->cmr_div = ssc_p->tcmr_period = ssc_p->rcmr_period = 0; ssc_p->forced_divider = 0; } - spin_unlock_irq(&ssc_p->lock); /* Shutdown the SSC clock. */ pr_debug("atmel_ssc_dai: Stopping clock\n"); diff --git a/sound/soc/atmel/atmel_ssc_dai.h b/sound/soc/atmel/atmel_ssc_dai.h index ae764cb541c7..3470b966e449 100644 --- a/sound/soc/atmel/atmel_ssc_dai.h +++ b/sound/soc/atmel/atmel_ssc_dai.h @@ -93,7 +93,6 @@ struct atmel_ssc_state { struct atmel_ssc_info { char *name; struct ssc_device *ssc; - spinlock_t lock; /* lock for dir_mask */ unsigned short dir_mask; /* 0=unused, 1=playback, 2=capture */ unsigned short initialized; /* true if SSC has been initialized */ unsigned short daifmt; From 337c22ab1d4f316f37e7a7c78bdea3d768270542 Mon Sep 17 00:00:00 2001 From: Joshua Clayton Date: Mon, 12 Aug 2019 09:20:20 -0600 Subject: [PATCH 416/721] HID: core: reformat and reduce hid_printk macros Reformat hid_printk macros to use standard __VA_ARGS__ syntax. Per Joe Perches hid_printk(), hid_emerg(), hid_crit(), and hid_alert() are unlikely ever to be used. Remove them. Signed-off-by: Joshua Clayton Signed-off-by: Benjamin Tissoires --- include/linux/hid.h | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/include/linux/hid.h b/include/linux/hid.h index d770ab1a0479..e6c7efdb0458 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -1154,29 +1154,21 @@ int hid_pidff_init(struct hid_device *hid); #define hid_pidff_init NULL #endif -#define dbg_hid(format, arg...) \ +#define dbg_hid(fmt, ...) \ do { \ if (hid_debug) \ - printk(KERN_DEBUG "%s: " format, __FILE__, ##arg); \ + printk(KERN_DEBUG "%s: " fmt, __FILE__, ##__VA_ARGS__); \ } while (0) -#define hid_printk(level, hid, fmt, arg...) \ - dev_printk(level, &(hid)->dev, fmt, ##arg) -#define hid_emerg(hid, fmt, arg...) \ - dev_emerg(&(hid)->dev, fmt, ##arg) -#define hid_crit(hid, fmt, arg...) \ - dev_crit(&(hid)->dev, fmt, ##arg) -#define hid_alert(hid, fmt, arg...) \ - dev_alert(&(hid)->dev, fmt, ##arg) -#define hid_err(hid, fmt, arg...) \ - dev_err(&(hid)->dev, fmt, ##arg) -#define hid_notice(hid, fmt, arg...) \ - dev_notice(&(hid)->dev, fmt, ##arg) -#define hid_warn(hid, fmt, arg...) \ - dev_warn(&(hid)->dev, fmt, ##arg) -#define hid_info(hid, fmt, arg...) \ - dev_info(&(hid)->dev, fmt, ##arg) -#define hid_dbg(hid, fmt, arg...) \ - dev_dbg(&(hid)->dev, fmt, ##arg) +#define hid_err(hid, fmt, ...) \ + dev_err(&(hid)->dev, fmt, ##__VA_ARGS__) +#define hid_notice(hid, fmt, ...) \ + dev_notice(&(hid)->dev, fmt, ##__VA_ARGS__) +#define hid_warn(hid, fmt, ...) \ + dev_warn(&(hid)->dev, fmt, ##__VA_ARGS__) +#define hid_info(hid, fmt, ...) \ + dev_info(&(hid)->dev, fmt, ##__VA_ARGS__) +#define hid_dbg(hid, fmt, ...) \ + dev_dbg(&(hid)->dev, fmt, ##__VA_ARGS__) #endif From aaeabb121a6271775cef35ec3197e9ef124fdb2a Mon Sep 17 00:00:00 2001 From: Joshua Clayton Date: Mon, 12 Aug 2019 09:20:21 -0600 Subject: [PATCH 417/721] HID: core: Add printk_once variants to hid_warn() etc hid_warn_once() is needed. Add the others as part of the block. Signed-off-by: Joshua Clayton Signed-off-by: Benjamin Tissoires --- include/linux/hid.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/include/linux/hid.h b/include/linux/hid.h index e6c7efdb0458..cd41f209043f 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -1171,4 +1171,15 @@ do { \ #define hid_dbg(hid, fmt, ...) \ dev_dbg(&(hid)->dev, fmt, ##__VA_ARGS__) +#define hid_err_once(hid, fmt, ...) \ + dev_err_once(&(hid)->dev, fmt, ##__VA_ARGS__) +#define hid_notice_once(hid, fmt, ...) \ + dev_notice_once(&(hid)->dev, fmt, ##__VA_ARGS__) +#define hid_warn_once(hid, fmt, ...) \ + dev_warn_once(&(hid)->dev, fmt, ##__VA_ARGS__) +#define hid_info_once(hid, fmt, ...) \ + dev_info_once(&(hid)->dev, fmt, ##__VA_ARGS__) +#define hid_dbg_once(hid, fmt, ...) \ + dev_dbg_once(&(hid)->dev, fmt, ##__VA_ARGS__) + #endif From 0af10eed9b7308187c7865024248b2a2a5aa382a Mon Sep 17 00:00:00 2001 From: Joshua Clayton Date: Mon, 12 Aug 2019 09:20:22 -0600 Subject: [PATCH 418/721] HID: core: fix dmesg flooding if report field larger than 32bit Only warn once of oversize hid report value field On HP spectre x360 convertible the message: hid-sensor-hub 001F:8087:0AC2.0002: hid_field_extract() called with n (192) > 32! (kworker/1:2) is continually printed many times per second, crowding out all else. Protect dmesg by printing the warning only one time. The size of the hid report field data structure should probably be increased. The data structure is treated as a u32 in Linux, but an unlimited number of bits in the USB hid spec, so there is some rearchitecture needed now that devices are sending more than 32 bits. Signed-off-by: Joshua Clayton Signed-off-by: Benjamin Tissoires --- drivers/hid/hid-core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index 210b81a56e1a..3eaee2c37931 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -1311,8 +1311,8 @@ u32 hid_field_extract(const struct hid_device *hid, u8 *report, unsigned offset, unsigned n) { if (n > 32) { - hid_warn(hid, "hid_field_extract() called with n (%d) > 32! (%s)\n", - n, current->comm); + hid_warn_once(hid, "%s() called with n (%d) > 32! (%s)\n", + __func__, n, current->comm); n = 32; } From 954dab193d19cbbff8f83b58c9360bf00ddb273c Mon Sep 17 00:00:00 2001 From: Jackie Liu Date: Wed, 18 Sep 2019 10:37:52 +0800 Subject: [PATCH 419/721] io_uring: use kmemdup instead of kmalloc and memcpy Just clean up the code, no function changes. Signed-off-by: Jackie Liu Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 0dadbdbead0f..42a684ef578a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2098,13 +2098,11 @@ static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { struct io_uring_sqe *sqe_copy; - sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL); + sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL); if (sqe_copy) { struct async_list *list; - memcpy(sqe_copy, s->sqe, sizeof(*sqe_copy)); s->sqe = sqe_copy; - memcpy(&req->submit, s, sizeof(*s)); list = io_async_list_from_sqe(ctx, s->sqe); if (!io_add_to_prev_work(list, req)) { From 5f5ad9ced33621d353be6429c3900f8a526fcae8 Mon Sep 17 00:00:00 2001 From: Jackie Liu Date: Wed, 18 Sep 2019 10:37:53 +0800 Subject: [PATCH 420/721] io_uring: fix use-after-free of shadow_req There is a potential dangling pointer problem. we never clean shadow_req, if there are multiple link lists in this series of sqes, then the shadow_req will not reallocate, and continue to use the last one. but in the previous, his memory has been released, thus forming a dangling pointer. let's clean up him and make sure that every new link list can reapply for a new shadow_req. Fixes: 4fe2c963154c ("io_uring: add support for link with drain") Signed-off-by: Jackie Liu Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 42a684ef578a..1621243da1ea 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2357,6 +2357,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes, io_queue_link_head(ctx, link, &link->submit, shadow_req, true); link = NULL; + shadow_req = NULL; } prev_was_link = (sqes[i].sqe->flags & IOSQE_IO_LINK) != 0; @@ -2543,6 +2544,7 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit, io_queue_link_head(ctx, link, &link->submit, shadow_req, force_nonblock); link = NULL; + shadow_req = NULL; } prev_was_link = (s.sqe->flags & IOSQE_IO_LINK) != 0; From 6cc47d1d2a9b631f62405f56df651975c7587a97 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 18 Sep 2019 11:18:23 -0600 Subject: [PATCH 421/721] io_uring: ensure poll commands clear ->sqe If we end up getting woken in poll (due to a signal), then we may need to punt the poll request to an async worker. When we do that, we look up the list to queue at, deferefencing req->submit.sqe, however that is only set for requests we initially decided to queue async. This fixes a crash with poll command usage and wakeups that need to punt to async context. Fixes: 54a91f3bb9b9 ("io_uring: limit parallelism of buffered writes") Signed-off-by: Jens Axboe --- fs/io_uring.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 1621243da1ea..f9139f5bd158 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -446,16 +446,15 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx) static inline void io_queue_async_work(struct io_ring_ctx *ctx, struct io_kiocb *req) { - int rw; + int rw = 0; - switch (req->submit.sqe->opcode) { - case IORING_OP_WRITEV: - case IORING_OP_WRITE_FIXED: - rw = !(req->rw.ki_flags & IOCB_DIRECT); - break; - default: - rw = 0; - break; + if (req->submit.sqe) { + switch (req->submit.sqe->opcode) { + case IORING_OP_WRITEV: + case IORING_OP_WRITE_FIXED: + rw = !(req->rw.ki_flags & IOCB_DIRECT); + break; + } } queue_work(ctx->sqo_wq[rw], &req->work); @@ -1714,6 +1713,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (!poll->file) return -EBADF; + req->submit.sqe = NULL; INIT_WORK(&req->work, io_poll_complete_work); events = READ_ONCE(sqe->poll_events); poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; From a1041c27b64ce744632147e19701c95fed14fab1 Mon Sep 17 00:00:00 2001 From: Jackie Liu Date: Wed, 18 Sep 2019 17:25:52 +0800 Subject: [PATCH 422/721] io_uring: fix potential crash issue due to io_get_req failure Sometimes io_get_req will return a NUL, then we need to do the correct error handling, otherwise it will cause the kernel null pointer exception. Fixes: 4fe2c963154c ("io_uring: add support for link with drain") Signed-off-by: Jackie Liu Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index f9139f5bd158..854dedd885fa 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2364,12 +2364,15 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes, if (link && (sqes[i].sqe->flags & IOSQE_IO_DRAIN)) { if (!shadow_req) { shadow_req = io_get_req(ctx, NULL); + if (unlikely(!shadow_req)) + goto out; shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN); refcount_dec(&shadow_req->refs); } shadow_req->sequence = sqes[i].sequence; } +out: if (unlikely(mm_fault)) { io_cqring_add_event(ctx, sqes[i].sqe->user_data, -EFAULT); @@ -2551,12 +2554,15 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit, if (link && (s.sqe->flags & IOSQE_IO_DRAIN)) { if (!shadow_req) { shadow_req = io_get_req(ctx, NULL); + if (unlikely(!shadow_req)) + goto out; shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN); refcount_dec(&shadow_req->refs); } shadow_req->sequence = s.sequence; } +out: s.has_user = true; s.needs_lock = false; s.needs_fixed_file = false; From 9831a90ce64362f8429e8fd23838a9db2cdf7803 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 19 Sep 2019 09:48:55 -0600 Subject: [PATCH 423/721] io_uring: use cond_resched() in sqthread If preempt isn't enabled in the kernel, we can run into hang issues with sqthread submissions. Use cond_resched() to play nice instead of cpu_relax(), if we end up starting the loop and not having any events pending for submissions. Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 854dedd885fa..05a299e80159 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2438,7 +2438,7 @@ static int io_sq_thread(void *data) * to sleep. */ if (inflight || !time_after(jiffies, timeout)) { - cpu_relax(); + cond_resched(); continue; } From 5262f567987d3c30052b22e78c35c2313d07b230 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 17 Sep 2019 12:26:57 -0600 Subject: [PATCH 424/721] io_uring: IORING_OP_TIMEOUT support There's been a few requests for functionality similar to io_getevents() and epoll_wait(), where the user can specify a timeout for waiting on events. I deliberately did not add support for this through the system call initially to avoid overloading the args, but I can see that the use cases for this are valid. This adds support for IORING_OP_TIMEOUT. If a user wants to get woken when waiting for events, simply submit one of these timeout commands with your wait call (or before). This ensures that the application sleeping on the CQ ring waiting for events will get woken. The timeout command is passed in as a pointer to a struct timespec. Timeouts are relative. The timeout command also includes a way to auto-cancel after N events has passed. Signed-off-by: Jens Axboe --- fs/io_uring.c | 149 ++++++++++++++++++++++++++++++++-- include/uapi/linux/io_uring.h | 2 + 2 files changed, 146 insertions(+), 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 05a299e80159..9d8e703bc851 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -200,6 +200,7 @@ struct io_ring_ctx { struct io_uring_sqe *sq_sqes; struct list_head defer_list; + struct list_head timeout_list; } ____cacheline_aligned_in_smp; /* IO offload */ @@ -216,6 +217,7 @@ struct io_ring_ctx { struct wait_queue_head cq_wait; struct fasync_struct *cq_fasync; struct eventfd_ctx *cq_ev_fd; + atomic_t cq_timeouts; } ____cacheline_aligned_in_smp; struct io_rings *rings; @@ -283,6 +285,11 @@ struct io_poll_iocb { struct wait_queue_entry wait; }; +struct io_timeout { + struct file *file; + struct hrtimer timer; +}; + /* * NOTE! Each of the iocb union members has the file pointer * as the first entry in their struct definition. So you can @@ -294,6 +301,7 @@ struct io_kiocb { struct file *file; struct kiocb rw; struct io_poll_iocb poll; + struct io_timeout timeout; }; struct sqe_submit submit; @@ -313,6 +321,7 @@ struct io_kiocb { #define REQ_F_LINK_DONE 128 /* linked sqes done */ #define REQ_F_FAIL_LINK 256 /* fail rest of links */ #define REQ_F_SHADOW_DRAIN 512 /* link-drain shadow req */ +#define REQ_F_TIMEOUT 1024 /* timeout request */ u64 user_data; u32 result; u32 sequence; @@ -344,6 +353,8 @@ struct io_submit_state { }; static void io_sq_wq_submit_work(struct work_struct *work); +static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data, + long res); static void __io_free_req(struct io_kiocb *req); static struct kmem_cache *req_cachep; @@ -400,26 +411,30 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->poll_list); INIT_LIST_HEAD(&ctx->cancel_list); INIT_LIST_HEAD(&ctx->defer_list); + INIT_LIST_HEAD(&ctx->timeout_list); return ctx; } static inline bool io_sequence_defer(struct io_ring_ctx *ctx, struct io_kiocb *req) { - if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN) + /* timeout requests always honor sequence */ + if (!(req->flags & REQ_F_TIMEOUT) && + (req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN) return false; return req->sequence != ctx->cached_cq_tail + ctx->rings->sq_dropped; } -static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx) +static struct io_kiocb *__io_get_deferred_req(struct io_ring_ctx *ctx, + struct list_head *list) { struct io_kiocb *req; - if (list_empty(&ctx->defer_list)) + if (list_empty(list)) return NULL; - req = list_first_entry(&ctx->defer_list, struct io_kiocb, list); + req = list_first_entry(list, struct io_kiocb, list); if (!io_sequence_defer(ctx, req)) { list_del_init(&req->list); return req; @@ -428,6 +443,16 @@ static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx) return NULL; } +static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx) +{ + return __io_get_deferred_req(ctx, &ctx->defer_list); +} + +static struct io_kiocb *io_get_timeout_req(struct io_ring_ctx *ctx) +{ + return __io_get_deferred_req(ctx, &ctx->timeout_list); +} + static void __io_commit_cqring(struct io_ring_ctx *ctx) { struct io_rings *rings = ctx->rings; @@ -460,10 +485,36 @@ static inline void io_queue_async_work(struct io_ring_ctx *ctx, queue_work(ctx->sqo_wq[rw], &req->work); } +static void io_kill_timeout(struct io_kiocb *req) +{ + int ret; + + ret = hrtimer_try_to_cancel(&req->timeout.timer); + if (ret != -1) { + atomic_inc(&req->ctx->cq_timeouts); + list_del(&req->list); + io_cqring_fill_event(req->ctx, req->user_data, 0); + __io_free_req(req); + } +} + +static void io_kill_timeouts(struct io_ring_ctx *ctx) +{ + struct io_kiocb *req, *tmp; + + spin_lock_irq(&ctx->completion_lock); + list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list) + io_kill_timeout(req); + spin_unlock_irq(&ctx->completion_lock); +} + static void io_commit_cqring(struct io_ring_ctx *ctx) { struct io_kiocb *req; + while ((req = io_get_timeout_req(ctx)) != NULL) + io_kill_timeout(req); + __io_commit_cqring(ctx); while ((req = io_get_deferred_req(ctx)) != NULL) { @@ -1765,6 +1816,81 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe) return ipt.error; } +static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) +{ + struct io_ring_ctx *ctx; + struct io_kiocb *req; + unsigned long flags; + + req = container_of(timer, struct io_kiocb, timeout.timer); + ctx = req->ctx; + atomic_inc(&ctx->cq_timeouts); + + spin_lock_irqsave(&ctx->completion_lock, flags); + list_del(&req->list); + + io_cqring_fill_event(ctx, req->user_data, -ETIME); + io_commit_cqring(ctx); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + + io_cqring_ev_posted(ctx); + + io_put_req(req); + return HRTIMER_NORESTART; +} + +static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + unsigned count, req_dist, tail_index; + struct io_ring_ctx *ctx = req->ctx; + struct list_head *entry; + struct timespec ts; + + if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->timeout_flags || + sqe->len != 1) + return -EINVAL; + if (copy_from_user(&ts, (void __user *) (unsigned long) sqe->addr, + sizeof(ts))) + return -EFAULT; + + /* + * sqe->off holds how many events that need to occur for this + * timeout event to be satisfied. + */ + count = READ_ONCE(sqe->off); + if (!count) + count = 1; + + req->sequence = ctx->cached_sq_head + count - 1; + req->flags |= REQ_F_TIMEOUT; + + /* + * Insertion sort, ensuring the first entry in the list is always + * the one we need first. + */ + tail_index = ctx->cached_cq_tail - ctx->rings->sq_dropped; + req_dist = req->sequence - tail_index; + spin_lock_irq(&ctx->completion_lock); + list_for_each_prev(entry, &ctx->timeout_list) { + struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list); + unsigned dist; + + dist = nxt->sequence - tail_index; + if (req_dist >= dist) + break; + } + list_add(&req->list, entry); + spin_unlock_irq(&ctx->completion_lock); + + hrtimer_init(&req->timeout.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + req->timeout.timer.function = io_timeout_fn; + hrtimer_start(&req->timeout.timer, timespec_to_ktime(ts), + HRTIMER_MODE_REL); + return 0; +} + static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -1842,6 +1968,9 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, case IORING_OP_RECVMSG: ret = io_recvmsg(req, s->sqe, force_nonblock); break; + case IORING_OP_TIMEOUT: + ret = io_timeout(req, s->sqe); + break; default: ret = -EINVAL; break; @@ -2599,6 +2728,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, const sigset_t __user *sig, size_t sigsz) { struct io_rings *rings = ctx->rings; + unsigned nr_timeouts; int ret; if (io_cqring_events(rings) >= min_events) @@ -2617,7 +2747,15 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return ret; } - ret = wait_event_interruptible(ctx->wait, io_cqring_events(rings) >= min_events); + nr_timeouts = atomic_read(&ctx->cq_timeouts); + /* + * Return if we have enough events, or if a timeout occured since + * we started waiting. For timeouts, we always want to return to + * userspace. + */ + ret = wait_event_interruptible(ctx->wait, + io_cqring_events(rings) >= min_events || + atomic_read(&ctx->cq_timeouts) != nr_timeouts); restore_saved_sigmask_unless(ret == -ERESTARTSYS); if (ret == -ERESTARTSYS) ret = -EINTR; @@ -3288,6 +3426,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) percpu_ref_kill(&ctx->refs); mutex_unlock(&ctx->uring_lock); + io_kill_timeouts(ctx); io_poll_remove_all(ctx); io_iopoll_reap_events(ctx); wait_for_completion(&ctx->ctx_done); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 96ee9d94b73e..ea57526a5b89 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -28,6 +28,7 @@ struct io_uring_sqe { __u16 poll_events; __u32 sync_range_flags; __u32 msg_flags; + __u32 timeout_flags; }; __u64 user_data; /* data to be passed back at completion time */ union { @@ -61,6 +62,7 @@ struct io_uring_sqe { #define IORING_OP_SYNC_FILE_RANGE 8 #define IORING_OP_SENDMSG 9 #define IORING_OP_RECVMSG 10 +#define IORING_OP_TIMEOUT 11 /* * sqe->fsync_flags From 947ec14c7369a87625f03abaab5b3f4d33ac73ba Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Wed, 18 Sep 2019 15:02:38 +0900 Subject: [PATCH 425/721] ASoC: rsnd: do error check after rsnd_channel_normalization() SSI need to use rsnd_channel_normalization() for TDM-split mode, thus, channel check need to do after that. Signed-off-by: Kuninori Morimoto Link: https://lore.kernel.org/r/874l1aw39d.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- sound/soc/sh/rcar/ssi.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sound/soc/sh/rcar/ssi.c b/sound/soc/sh/rcar/ssi.c index f6a7466622ea..fc5d089868df 100644 --- a/sound/soc/sh/rcar/ssi.c +++ b/sound/soc/sh/rcar/ssi.c @@ -286,6 +286,11 @@ static int rsnd_ssi_master_clk_start(struct rsnd_mod *mod, if (rsnd_ssi_is_multi_slave(mod, io)) return 0; + if (rsnd_runtime_is_tdm_split(io)) + chan = rsnd_io_converted_chan(io); + + chan = rsnd_channel_normalization(chan); + if (ssi->usrcnt > 0) { if (ssi->rate != rate) { dev_err(dev, "SSI parent/child should use same rate\n"); @@ -300,11 +305,6 @@ static int rsnd_ssi_master_clk_start(struct rsnd_mod *mod, return 0; } - if (rsnd_runtime_is_tdm_split(io)) - chan = rsnd_io_converted_chan(io); - - chan = rsnd_channel_normalization(chan); - main_rate = rsnd_ssi_clk_query(rdai, rate, chan, &idx); if (!main_rate) { dev_err(dev, "unsupported clock rate\n"); From d2935de7e4fd3f873976f1872b505584b727574c Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 22 Mar 2019 14:58:36 +0000 Subject: [PATCH 426/721] vfs: Convert bpf to use the new mount API Convert the bpf filesystem to the new internal mount API as the old one will be obsoleted and removed. This allows greater flexibility in communication of mount parameters between userspace, the VFS and the filesystem. See Documentation/filesystems/mount_api.txt for more information. Signed-off-by: David Howells cc: Alexei Starovoitov cc: Daniel Borkmann cc: Martin KaFai Lau cc: Song Liu cc: Yonghong Song cc: netdev@vger.kernel.org cc: bpf@vger.kernel.org Signed-off-by: Al Viro --- kernel/bpf/inode.c | 92 +++++++++++++++++++++++++++++----------------- 1 file changed, 58 insertions(+), 34 deletions(-) diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index cc0d0cf114e3..a70f7209cda3 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -14,8 +14,9 @@ #include #include #include +#include +#include #include -#include #include #include #include @@ -583,58 +584,52 @@ static const struct super_operations bpf_super_ops = { enum { OPT_MODE, - OPT_ERR, }; -static const match_table_t bpf_mount_tokens = { - { OPT_MODE, "mode=%o" }, - { OPT_ERR, NULL }, +static const struct fs_parameter_spec bpf_param_specs[] = { + fsparam_u32oct ("mode", OPT_MODE), + {} +}; + +static const struct fs_parameter_description bpf_fs_parameters = { + .name = "bpf", + .specs = bpf_param_specs, }; struct bpf_mount_opts { umode_t mode; }; -static int bpf_parse_options(char *data, struct bpf_mount_opts *opts) +static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) { - substring_t args[MAX_OPT_ARGS]; - int option, token; - char *ptr; + struct bpf_mount_opts *opts = fc->fs_private; + struct fs_parse_result result; + int opt; - opts->mode = S_IRWXUGO; - - while ((ptr = strsep(&data, ",")) != NULL) { - if (!*ptr) - continue; - - token = match_token(ptr, bpf_mount_tokens, args); - switch (token) { - case OPT_MODE: - if (match_octal(&args[0], &option)) - return -EINVAL; - opts->mode = option & S_IALLUGO; - break; + opt = fs_parse(fc, &bpf_fs_parameters, param, &result); + if (opt < 0) /* We might like to report bad mount options here, but * traditionally we've ignored all mount options, so we'd * better continue to ignore non-existing options for bpf. */ - } + return opt == -ENOPARAM ? 0 : opt; + + switch (opt) { + case OPT_MODE: + opts->mode = result.uint_32 & S_IALLUGO; + break; } return 0; } -static int bpf_fill_super(struct super_block *sb, void *data, int silent) +static int bpf_fill_super(struct super_block *sb, struct fs_context *fc) { static const struct tree_descr bpf_rfiles[] = { { "" } }; - struct bpf_mount_opts opts; + struct bpf_mount_opts *opts = fc->fs_private; struct inode *inode; int ret; - ret = bpf_parse_options(data, &opts); - if (ret) - return ret; - ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles); if (ret) return ret; @@ -644,21 +639,50 @@ static int bpf_fill_super(struct super_block *sb, void *data, int silent) inode = sb->s_root->d_inode; inode->i_op = &bpf_dir_iops; inode->i_mode &= ~S_IALLUGO; - inode->i_mode |= S_ISVTX | opts.mode; + inode->i_mode |= S_ISVTX | opts->mode; return 0; } -static struct dentry *bpf_mount(struct file_system_type *type, int flags, - const char *dev_name, void *data) +static int bpf_get_tree(struct fs_context *fc) { - return mount_nodev(type, flags, data, bpf_fill_super); + return get_tree_nodev(fc, bpf_fill_super); +} + +static void bpf_free_fc(struct fs_context *fc) +{ + kfree(fc->fs_private); +} + +static const struct fs_context_operations bpf_context_ops = { + .free = bpf_free_fc, + .parse_param = bpf_parse_param, + .get_tree = bpf_get_tree, +}; + +/* + * Set up the filesystem mount context. + */ +static int bpf_init_fs_context(struct fs_context *fc) +{ + struct bpf_mount_opts *opts; + + opts = kzalloc(sizeof(struct bpf_mount_opts), GFP_KERNEL); + if (!opts) + return -ENOMEM; + + opts->mode = S_IRWXUGO; + + fc->fs_private = opts; + fc->ops = &bpf_context_ops; + return 0; } static struct file_system_type bpf_fs_type = { .owner = THIS_MODULE, .name = "bpf", - .mount = bpf_mount, + .init_fs_context = bpf_init_fs_context, + .parameters = &bpf_fs_parameters, .kill_sb = kill_litter_super, }; From dec90f61f125568088a6ada61832b366c1670e9f Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 20 Mar 2019 23:31:05 +0000 Subject: [PATCH 427/721] vfs: Convert functionfs to use the new mount API Convert the functionfs filesystem to the new internal mount API as the old one will be obsoleted and removed. This allows greater flexibility in communication of mount parameters between userspace, the VFS and the filesystem. See Documentation/filesystems/mount_api.txt for more information. Signed-off-by: David Howells Acked-by: Felipe Balbi Acked-by: Michal Nazarewicz cc: linux-usb@vger.kernel.org Signed-off-by: Al Viro --- drivers/usb/gadget/function/f_fs.c | 237 +++++++++++++++-------------- 1 file changed, 122 insertions(+), 115 deletions(-) diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index 213ff03c8a9f..59d9d512dcda 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -1451,9 +1452,9 @@ struct ffs_sb_fill_data { struct ffs_data *ffs_data; }; -static int ffs_sb_fill(struct super_block *sb, void *_data, int silent) +static int ffs_sb_fill(struct super_block *sb, struct fs_context *fc) { - struct ffs_sb_fill_data *data = _data; + struct ffs_sb_fill_data *data = fc->fs_private; struct inode *inode; struct ffs_data *ffs = data->ffs_data; @@ -1486,147 +1487,152 @@ static int ffs_sb_fill(struct super_block *sb, void *_data, int silent) return 0; } -static int ffs_fs_parse_opts(struct ffs_sb_fill_data *data, char *opts) +enum { + Opt_no_disconnect, + Opt_rmode, + Opt_fmode, + Opt_mode, + Opt_uid, + Opt_gid, +}; + +static const struct fs_parameter_spec ffs_fs_param_specs[] = { + fsparam_bool ("no_disconnect", Opt_no_disconnect), + fsparam_u32 ("rmode", Opt_rmode), + fsparam_u32 ("fmode", Opt_fmode), + fsparam_u32 ("mode", Opt_mode), + fsparam_u32 ("uid", Opt_uid), + fsparam_u32 ("gid", Opt_gid), + {} +}; + +static const struct fs_parameter_description ffs_fs_fs_parameters = { + .name = "kAFS", + .specs = ffs_fs_param_specs, +}; + +static int ffs_fs_parse_param(struct fs_context *fc, struct fs_parameter *param) { + struct ffs_sb_fill_data *data = fc->fs_private; + struct fs_parse_result result; + int opt; + ENTER(); - if (!opts || !*opts) - return 0; + opt = fs_parse(fc, &ffs_fs_fs_parameters, param, &result); + if (opt < 0) + return opt; - for (;;) { - unsigned long value; - char *eq, *comma; + switch (opt) { + case Opt_no_disconnect: + data->no_disconnect = result.boolean; + break; + case Opt_rmode: + data->root_mode = (result.uint_32 & 0555) | S_IFDIR; + break; + case Opt_fmode: + data->perms.mode = (result.uint_32 & 0666) | S_IFREG; + break; + case Opt_mode: + data->root_mode = (result.uint_32 & 0555) | S_IFDIR; + data->perms.mode = (result.uint_32 & 0666) | S_IFREG; + break; - /* Option limit */ - comma = strchr(opts, ','); - if (comma) - *comma = 0; + case Opt_uid: + data->perms.uid = make_kuid(current_user_ns(), result.uint_32); + if (!uid_valid(data->perms.uid)) + goto unmapped_value; + break; + case Opt_gid: + data->perms.gid = make_kgid(current_user_ns(), result.uint_32); + if (!gid_valid(data->perms.gid)) + goto unmapped_value; + break; - /* Value limit */ - eq = strchr(opts, '='); - if (unlikely(!eq)) { - pr_err("'=' missing in %s\n", opts); - return -EINVAL; - } - *eq = 0; - - /* Parse value */ - if (kstrtoul(eq + 1, 0, &value)) { - pr_err("%s: invalid value: %s\n", opts, eq + 1); - return -EINVAL; - } - - /* Interpret option */ - switch (eq - opts) { - case 13: - if (!memcmp(opts, "no_disconnect", 13)) - data->no_disconnect = !!value; - else - goto invalid; - break; - case 5: - if (!memcmp(opts, "rmode", 5)) - data->root_mode = (value & 0555) | S_IFDIR; - else if (!memcmp(opts, "fmode", 5)) - data->perms.mode = (value & 0666) | S_IFREG; - else - goto invalid; - break; - - case 4: - if (!memcmp(opts, "mode", 4)) { - data->root_mode = (value & 0555) | S_IFDIR; - data->perms.mode = (value & 0666) | S_IFREG; - } else { - goto invalid; - } - break; - - case 3: - if (!memcmp(opts, "uid", 3)) { - data->perms.uid = make_kuid(current_user_ns(), value); - if (!uid_valid(data->perms.uid)) { - pr_err("%s: unmapped value: %lu\n", opts, value); - return -EINVAL; - } - } else if (!memcmp(opts, "gid", 3)) { - data->perms.gid = make_kgid(current_user_ns(), value); - if (!gid_valid(data->perms.gid)) { - pr_err("%s: unmapped value: %lu\n", opts, value); - return -EINVAL; - } - } else { - goto invalid; - } - break; - - default: -invalid: - pr_err("%s: invalid option\n", opts); - return -EINVAL; - } - - /* Next iteration */ - if (!comma) - break; - opts = comma + 1; + default: + return -ENOPARAM; } return 0; + +unmapped_value: + return invalf(fc, "%s: unmapped value: %u", param->key, result.uint_32); } -/* "mount -t functionfs dev_name /dev/function" ends up here */ - -static struct dentry * -ffs_fs_mount(struct file_system_type *t, int flags, - const char *dev_name, void *opts) +/* + * Set up the superblock for a mount. + */ +static int ffs_fs_get_tree(struct fs_context *fc) { - struct ffs_sb_fill_data data = { - .perms = { - .mode = S_IFREG | 0600, - .uid = GLOBAL_ROOT_UID, - .gid = GLOBAL_ROOT_GID, - }, - .root_mode = S_IFDIR | 0500, - .no_disconnect = false, - }; - struct dentry *rv; - int ret; + struct ffs_sb_fill_data *ctx = fc->fs_private; void *ffs_dev; struct ffs_data *ffs; ENTER(); - ret = ffs_fs_parse_opts(&data, opts); - if (unlikely(ret < 0)) - return ERR_PTR(ret); + if (!fc->source) + return invalf(fc, "No source specified"); - ffs = ffs_data_new(dev_name); + ffs = ffs_data_new(fc->source); if (unlikely(!ffs)) - return ERR_PTR(-ENOMEM); - ffs->file_perms = data.perms; - ffs->no_disconnect = data.no_disconnect; + return -ENOMEM; + ffs->file_perms = ctx->perms; + ffs->no_disconnect = ctx->no_disconnect; - ffs->dev_name = kstrdup(dev_name, GFP_KERNEL); + ffs->dev_name = kstrdup(fc->source, GFP_KERNEL); if (unlikely(!ffs->dev_name)) { ffs_data_put(ffs); - return ERR_PTR(-ENOMEM); + return -ENOMEM; } - ffs_dev = ffs_acquire_dev(dev_name); + ffs_dev = ffs_acquire_dev(ffs->dev_name); if (IS_ERR(ffs_dev)) { ffs_data_put(ffs); - return ERR_CAST(ffs_dev); + return PTR_ERR(ffs_dev); } - ffs->private_data = ffs_dev; - data.ffs_data = ffs; - rv = mount_nodev(t, flags, &data, ffs_sb_fill); - if (IS_ERR(rv) && data.ffs_data) { - ffs_release_dev(data.ffs_data); - ffs_data_put(data.ffs_data); + ffs->private_data = ffs_dev; + ctx->ffs_data = ffs; + return get_tree_nodev(fc, ffs_sb_fill); +} + +static void ffs_fs_free_fc(struct fs_context *fc) +{ + struct ffs_sb_fill_data *ctx = fc->fs_private; + + if (ctx) { + if (ctx->ffs_data) { + ffs_release_dev(ctx->ffs_data); + ffs_data_put(ctx->ffs_data); + } + + kfree(ctx); } - return rv; +} + +static const struct fs_context_operations ffs_fs_context_ops = { + .free = ffs_fs_free_fc, + .parse_param = ffs_fs_parse_param, + .get_tree = ffs_fs_get_tree, +}; + +static int ffs_fs_init_fs_context(struct fs_context *fc) +{ + struct ffs_sb_fill_data *ctx; + + ctx = kzalloc(sizeof(struct ffs_sb_fill_data), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->perms.mode = S_IFREG | 0600; + ctx->perms.uid = GLOBAL_ROOT_UID; + ctx->perms.gid = GLOBAL_ROOT_GID; + ctx->root_mode = S_IFDIR | 0500; + ctx->no_disconnect = false; + + fc->fs_private = ctx; + fc->ops = &ffs_fs_context_ops; + return 0; } static void @@ -1644,7 +1650,8 @@ ffs_fs_kill_sb(struct super_block *sb) static struct file_system_type ffs_fs_type = { .owner = THIS_MODULE, .name = "functionfs", - .mount = ffs_fs_mount, + .init_fs_context = ffs_fs_init_fs_context, + .parameters = &ffs_fs_fs_parameters, .kill_sb = ffs_fs_kill_sb, }; MODULE_ALIAS_FS("functionfs"); From b54c64f7adeb241423cd46598f458b5486b0375e Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 21 Mar 2019 10:08:08 +0000 Subject: [PATCH 428/721] hypfs: Fix error number left in struct pointer member In hypfs_fill_super(), if hypfs_create_update_file() fails, sbi->update_file is left holding an error number. This is passed to hypfs_kill_super() which doesn't check for this. Fix this by not setting sbi->update_value until after we've checked for error. Fixes: 24bbb1faf3f0 ("[PATCH] s390_hypfs filesystem") Signed-off-by: David Howells cc: Martin Schwidefsky cc: Heiko Carstens cc: linux-s390@vger.kernel.org Signed-off-by: Al Viro --- arch/s390/hypfs/inode.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c index ccad1398abd4..b5cfcad953c2 100644 --- a/arch/s390/hypfs/inode.c +++ b/arch/s390/hypfs/inode.c @@ -269,7 +269,7 @@ static int hypfs_show_options(struct seq_file *s, struct dentry *root) static int hypfs_fill_super(struct super_block *sb, void *data, int silent) { struct inode *root_inode; - struct dentry *root_dentry; + struct dentry *root_dentry, *update_file; int rc = 0; struct hypfs_sb_info *sbi; @@ -300,9 +300,10 @@ static int hypfs_fill_super(struct super_block *sb, void *data, int silent) rc = hypfs_diag_create_files(root_dentry); if (rc) return rc; - sbi->update_file = hypfs_create_update_file(root_dentry); - if (IS_ERR(sbi->update_file)) - return PTR_ERR(sbi->update_file); + update_file = hypfs_create_update_file(root_dentry); + if (IS_ERR(update_file)) + return PTR_ERR(update_file); + sbi->update_file = update_file; hypfs_update_update(sb); pr_info("Hypervisor filesystem mounted\n"); return 0; From 8f8d8d11f76e991516e558eda7639e7b6a760e41 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 21 Mar 2019 09:35:44 +0000 Subject: [PATCH 429/721] vfs: Convert hypfs to use the new mount API Convert the hypfs filesystem to the new internal mount API as the old one will be obsoleted and removed. This allows greater flexibility in communication of mount parameters between userspace, the VFS and the filesystem. See Documentation/filesystems/mount_api.txt for more information. Signed-off-by: David Howells cc: Martin Schwidefsky cc: Heiko Carstens cc: linux-s390@vger.kernel.org Signed-off-by: Al Viro --- arch/s390/hypfs/inode.c | 124 ++++++++++++++++++++++------------------ 1 file changed, 68 insertions(+), 56 deletions(-) diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c index b5cfcad953c2..498966f6af59 100644 --- a/arch/s390/hypfs/inode.c +++ b/arch/s390/hypfs/inode.c @@ -12,17 +12,17 @@ #include #include #include +#include +#include #include #include #include #include #include -#include #include #include #include #include -#include #include #include #include "hypfs.h" @@ -207,52 +207,44 @@ static int hypfs_release(struct inode *inode, struct file *filp) return 0; } -enum { opt_uid, opt_gid, opt_err }; +enum { Opt_uid, Opt_gid, }; -static const match_table_t hypfs_tokens = { - {opt_uid, "uid=%u"}, - {opt_gid, "gid=%u"}, - {opt_err, NULL} +static const struct fs_parameter_spec hypfs_param_specs[] = { + fsparam_u32("gid", Opt_gid), + fsparam_u32("uid", Opt_uid), + {} }; -static int hypfs_parse_options(char *options, struct super_block *sb) +static const struct fs_parameter_description hypfs_fs_parameters = { + .name = "hypfs", + .specs = hypfs_param_specs, +}; + +static int hypfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { - char *str; - substring_t args[MAX_OPT_ARGS]; + struct hypfs_sb_info *hypfs_info = fc->s_fs_info; + struct fs_parse_result result; kuid_t uid; kgid_t gid; + int opt; - if (!options) - return 0; - while ((str = strsep(&options, ",")) != NULL) { - int token, option; - struct hypfs_sb_info *hypfs_info = sb->s_fs_info; + opt = fs_parse(fc, &hypfs_fs_parameters, param, &result); + if (opt < 0) + return opt; - if (!*str) - continue; - token = match_token(str, hypfs_tokens, args); - switch (token) { - case opt_uid: - if (match_int(&args[0], &option)) - return -EINVAL; - uid = make_kuid(current_user_ns(), option); - if (!uid_valid(uid)) - return -EINVAL; - hypfs_info->uid = uid; - break; - case opt_gid: - if (match_int(&args[0], &option)) - return -EINVAL; - gid = make_kgid(current_user_ns(), option); - if (!gid_valid(gid)) - return -EINVAL; - hypfs_info->gid = gid; - break; - case opt_err: - default: - pr_err("%s is not a valid mount option\n", str); - return -EINVAL; - } + switch (opt) { + case Opt_uid: + uid = make_kuid(current_user_ns(), result.uint_32); + if (!uid_valid(uid)) + return invalf(fc, "Unknown uid"); + hypfs_info->uid = uid; + break; + case Opt_gid: + gid = make_kgid(current_user_ns(), result.uint_32); + if (!gid_valid(gid)) + return invalf(fc, "Unknown gid"); + hypfs_info->gid = gid; + break; } return 0; } @@ -266,26 +258,18 @@ static int hypfs_show_options(struct seq_file *s, struct dentry *root) return 0; } -static int hypfs_fill_super(struct super_block *sb, void *data, int silent) +static int hypfs_fill_super(struct super_block *sb, struct fs_context *fc) { + struct hypfs_sb_info *sbi = sb->s_fs_info; struct inode *root_inode; struct dentry *root_dentry, *update_file; - int rc = 0; - struct hypfs_sb_info *sbi; + int rc; - sbi = kzalloc(sizeof(struct hypfs_sb_info), GFP_KERNEL); - if (!sbi) - return -ENOMEM; - mutex_init(&sbi->lock); - sbi->uid = current_uid(); - sbi->gid = current_gid(); - sb->s_fs_info = sbi; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = HYPFS_MAGIC; sb->s_op = &hypfs_s_ops; - if (hypfs_parse_options(data, sb)) - return -EINVAL; + root_inode = hypfs_make_inode(sb, S_IFDIR | 0755); if (!root_inode) return -ENOMEM; @@ -309,10 +293,37 @@ static int hypfs_fill_super(struct super_block *sb, void *data, int silent) return 0; } -static struct dentry *hypfs_mount(struct file_system_type *fst, int flags, - const char *devname, void *data) +static int hypfs_get_tree(struct fs_context *fc) { - return mount_single(fst, flags, data, hypfs_fill_super); + return get_tree_single(fc, hypfs_fill_super); +} + +static void hypfs_free_fc(struct fs_context *fc) +{ + kfree(fc->s_fs_info); +} + +static const struct fs_context_operations hypfs_context_ops = { + .free = hypfs_free_fc, + .parse_param = hypfs_parse_param, + .get_tree = hypfs_get_tree, +}; + +static int hypfs_init_fs_context(struct fs_context *fc) +{ + struct hypfs_sb_info *sbi; + + sbi = kzalloc(sizeof(struct hypfs_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + mutex_init(&sbi->lock); + sbi->uid = current_uid(); + sbi->gid = current_gid(); + + fc->s_fs_info = sbi; + fc->ops = &hypfs_context_ops; + return 0; } static void hypfs_kill_super(struct super_block *sb) @@ -443,7 +454,8 @@ static const struct file_operations hypfs_file_ops = { static struct file_system_type hypfs_type = { .owner = THIS_MODULE, .name = "s390_hypfs", - .mount = hypfs_mount, + .init_fs_context = hypfs_init_fs_context, + .parameters = &hypfs_fs_parameters, .kill_sb = hypfs_kill_super }; From d2e0981c3b9a7d31b59f81477c6acacb1bcc4513 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 21 Mar 2019 10:34:14 +0000 Subject: [PATCH 430/721] vfs: Convert spufs to use the new mount API Convert the spufs filesystem to the new internal mount API as the old one will be obsoleted and removed. This allows greater flexibility in communication of mount parameters between userspace, the VFS and the filesystem. See Documentation/filesystems/mount_api.txt for more information. Signed-off-by: David Howells cc: Jeremy Kerr cc: Arnd Bergmann cc: linuxppc-dev@lists.ozlabs.org Signed-off-by: Al Viro --- arch/powerpc/platforms/cell/spufs/inode.c | 203 ++++++++++++---------- 1 file changed, 114 insertions(+), 89 deletions(-) diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c index 065ff14b76e1..1d93e55a2de1 100644 --- a/arch/powerpc/platforms/cell/spufs/inode.c +++ b/arch/powerpc/platforms/cell/spufs/inode.c @@ -10,6 +10,8 @@ #include #include +#include +#include #include #include #include @@ -20,7 +22,6 @@ #include #include #include -#include #include #include @@ -30,7 +31,7 @@ #include "spufs.h" struct spufs_sb_info { - int debug; + bool debug; }; static struct kmem_cache *spufs_inode_cache; @@ -574,16 +575,27 @@ long spufs_create(struct path *path, struct dentry *dentry, } /* File system initialization */ -enum { - Opt_uid, Opt_gid, Opt_mode, Opt_debug, Opt_err, +struct spufs_fs_context { + kuid_t uid; + kgid_t gid; + umode_t mode; }; -static const match_table_t spufs_tokens = { - { Opt_uid, "uid=%d" }, - { Opt_gid, "gid=%d" }, - { Opt_mode, "mode=%o" }, - { Opt_debug, "debug" }, - { Opt_err, NULL }, +enum { + Opt_uid, Opt_gid, Opt_mode, Opt_debug, +}; + +static const struct fs_parameter_spec spufs_param_specs[] = { + fsparam_u32 ("gid", Opt_gid), + fsparam_u32oct ("mode", Opt_mode), + fsparam_u32 ("uid", Opt_uid), + fsparam_flag ("debug", Opt_debug), + {} +}; + +static const struct fs_parameter_description spufs_fs_parameters = { + .name = "spufs", + .specs = spufs_param_specs, }; static int spufs_show_options(struct seq_file *m, struct dentry *root) @@ -604,47 +616,41 @@ static int spufs_show_options(struct seq_file *m, struct dentry *root) return 0; } -static int -spufs_parse_options(struct super_block *sb, char *options, struct inode *root) +static int spufs_parse_param(struct fs_context *fc, struct fs_parameter *param) { - char *p; - substring_t args[MAX_OPT_ARGS]; + struct spufs_fs_context *ctx = fc->fs_private; + struct spufs_sb_info *sbi = fc->s_fs_info; + struct fs_parse_result result; + kuid_t uid; + kgid_t gid; + int opt; - while ((p = strsep(&options, ",")) != NULL) { - int token, option; + opt = fs_parse(fc, &spufs_fs_parameters, param, &result); + if (opt < 0) + return opt; - if (!*p) - continue; - - token = match_token(p, spufs_tokens, args); - switch (token) { - case Opt_uid: - if (match_int(&args[0], &option)) - return 0; - root->i_uid = make_kuid(current_user_ns(), option); - if (!uid_valid(root->i_uid)) - return 0; - break; - case Opt_gid: - if (match_int(&args[0], &option)) - return 0; - root->i_gid = make_kgid(current_user_ns(), option); - if (!gid_valid(root->i_gid)) - return 0; - break; - case Opt_mode: - if (match_octal(&args[0], &option)) - return 0; - root->i_mode = option | S_IFDIR; - break; - case Opt_debug: - spufs_get_sb_info(sb)->debug = 1; - break; - default: - return 0; - } + switch (opt) { + case Opt_uid: + uid = make_kuid(current_user_ns(), result.uint_32); + if (!uid_valid(uid)) + return invalf(fc, "Unknown uid"); + ctx->uid = uid; + break; + case Opt_gid: + gid = make_kgid(current_user_ns(), result.uint_32); + if (!gid_valid(gid)) + return invalf(fc, "Unknown gid"); + ctx->gid = gid; + break; + case Opt_mode: + ctx->mode = result.uint_32 & S_IALLUGO; + break; + case Opt_debug: + sbi->debug = true; + break; } - return 1; + + return 0; } static void spufs_exit_isolated_loader(void) @@ -678,79 +684,98 @@ spufs_init_isolated_loader(void) printk(KERN_INFO "spufs: SPU isolation mode enabled\n"); } -static int -spufs_create_root(struct super_block *sb, void *data) +static int spufs_create_root(struct super_block *sb, struct fs_context *fc) { + struct spufs_fs_context *ctx = fc->fs_private; struct inode *inode; - int ret; - ret = -ENODEV; if (!spu_management_ops) - goto out; + return -ENODEV; - ret = -ENOMEM; - inode = spufs_new_inode(sb, S_IFDIR | 0775); + inode = spufs_new_inode(sb, S_IFDIR | ctx->mode); if (!inode) - goto out; + return -ENOMEM; + inode->i_uid = ctx->uid; + inode->i_gid = ctx->gid; inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; SPUFS_I(inode)->i_ctx = NULL; inc_nlink(inode); - ret = -EINVAL; - if (!spufs_parse_options(sb, data, inode)) - goto out_iput; - - ret = -ENOMEM; sb->s_root = d_make_root(inode); if (!sb->s_root) - goto out; - + return -ENOMEM; return 0; -out_iput: - iput(inode); -out: - return ret; } -static int -spufs_fill_super(struct super_block *sb, void *data, int silent) +static const struct super_operations spufs_ops = { + .alloc_inode = spufs_alloc_inode, + .free_inode = spufs_free_inode, + .statfs = simple_statfs, + .evict_inode = spufs_evict_inode, + .show_options = spufs_show_options, +}; + +static int spufs_fill_super(struct super_block *sb, struct fs_context *fc) { - struct spufs_sb_info *info; - static const struct super_operations s_ops = { - .alloc_inode = spufs_alloc_inode, - .free_inode = spufs_free_inode, - .statfs = simple_statfs, - .evict_inode = spufs_evict_inode, - .show_options = spufs_show_options, - }; - - info = kzalloc(sizeof(*info), GFP_KERNEL); - if (!info) - return -ENOMEM; - sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = SPUFS_MAGIC; - sb->s_op = &s_ops; - sb->s_fs_info = info; + sb->s_op = &spufs_ops; - return spufs_create_root(sb, data); + return spufs_create_root(sb, fc); } -static struct dentry * -spufs_mount(struct file_system_type *fstype, int flags, - const char *name, void *data) +static int spufs_get_tree(struct fs_context *fc) { - return mount_single(fstype, flags, data, spufs_fill_super); + return get_tree_single(fc, spufs_fill_super); +} + +static void spufs_free_fc(struct fs_context *fc) +{ + kfree(fc->s_fs_info); +} + +static const struct fs_context_operations spufs_context_ops = { + .free = spufs_free_fc, + .parse_param = spufs_parse_param, + .get_tree = spufs_get_tree, +}; + +static int spufs_init_fs_context(struct fs_context *fc) +{ + struct spufs_fs_context *ctx; + struct spufs_sb_info *sbi; + + ctx = kzalloc(sizeof(struct spufs_fs_context), GFP_KERNEL); + if (!ctx) + goto nomem; + + sbi = kzalloc(sizeof(struct spufs_sb_info), GFP_KERNEL); + if (!sbi) + goto nomem_ctx; + + ctx->uid = current_uid(); + ctx->gid = current_gid(); + ctx->mode = 0755; + + fc->s_fs_info = sbi; + fc->ops = &spufs_context_ops; + return 0; + +nomem_ctx: + kfree(ctx); +nomem: + return -ENOMEM; } static struct file_system_type spufs_type = { .owner = THIS_MODULE, .name = "spufs", - .mount = spufs_mount, + .init_fs_context = spufs_init_fs_context, + .parameters = &spufs_fs_parameters, .kill_sb = kill_litter_super, }; MODULE_ALIAS_FS("spufs"); From 1f52aa08d12f8d359e71b4bfd73ca9d5d668e4da Mon Sep 17 00:00:00 2001 From: Andrew Price Date: Wed, 27 Mar 2019 14:46:00 +0000 Subject: [PATCH 431/721] gfs2: Convert gfs2 to fs_context Convert gfs2 and gfs2meta to fs_context. Removes the duplicated vfs code from gfs2_mount and instead uses the new vfs_get_block_super() before switching the ->root to the appropriate dentry. The mount option parsing has been converted to the new API and error reporting for invalid options has been made more precise at the same time. All of the mount/remount code has been moved into ops_fstype.c Signed-off-by: Andrew Price Signed-off-by: David Howells cc: cluster-devel@redhat.com Signed-off-by: Al Viro --- fs/gfs2/incore.h | 8 +- fs/gfs2/ops_fstype.c | 497 ++++++++++++++++++++++++++++++++----------- fs/gfs2/super.c | 333 +---------------------------- fs/gfs2/super.h | 3 +- 4 files changed, 381 insertions(+), 460 deletions(-) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 7a993d7c022e..0662747353e7 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -584,10 +584,10 @@ struct gfs2_args { unsigned int ar_rgrplvb:1; /* use lvbs for rgrp info */ unsigned int ar_loccookie:1; /* use location based readdir cookies */ - int ar_commit; /* Commit interval */ - int ar_statfs_quantum; /* The fast statfs interval */ - int ar_quota_quantum; /* The quota interval */ - int ar_statfs_percent; /* The % change to force sync */ + s32 ar_commit; /* Commit interval */ + s32 ar_statfs_quantum; /* The fast statfs interval */ + s32 ar_quota_quantum; /* The quota interval */ + s32 ar_statfs_percent; /* The % change to force sync */ }; struct gfs2_tune { diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 4a8e5a7310f0..0856f3c6ed8b 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "gfs2.h" #include "incore.h" @@ -1030,16 +1031,17 @@ void gfs2_online_uevent(struct gfs2_sbd *sdp) } /** - * fill_super - Read in superblock + * gfs2_fill_super - Read in superblock * @sb: The VFS superblock - * @data: Mount options + * @args: Mount options * @silent: Don't complain if it's not a GFS2 filesystem * - * Returns: errno + * Returns: -errno */ - -static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent) +static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc) { + struct gfs2_args *args = fc->fs_private; + int silent = fc->sb_flags & SB_SILENT; struct gfs2_sbd *sdp; struct gfs2_holder mount_gh; int error; @@ -1204,161 +1206,411 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent return error; } -static int set_gfs2_super(struct super_block *s, void *data) +/** + * gfs2_get_tree - Get the GFS2 superblock and root directory + * @fc: The filesystem context + * + * Returns: 0 or -errno on error + */ +static int gfs2_get_tree(struct fs_context *fc) { - s->s_bdev = data; - s->s_dev = s->s_bdev->bd_dev; - s->s_bdi = bdi_get(s->s_bdev->bd_bdi); + struct gfs2_args *args = fc->fs_private; + struct gfs2_sbd *sdp; + int error; + + error = get_tree_bdev(fc, gfs2_fill_super); + if (error) + return error; + + sdp = fc->root->d_sb->s_fs_info; + dput(fc->root); + if (args->ar_meta) + fc->root = dget(sdp->sd_master_dir); + else + fc->root = dget(sdp->sd_root_dir); return 0; } -static int test_gfs2_super(struct super_block *s, void *ptr) +static void gfs2_fc_free(struct fs_context *fc) { - struct block_device *bdev = ptr; - return (bdev == s->s_bdev); + struct gfs2_args *args = fc->fs_private; + + kfree(args); } -/** - * gfs2_mount - Get the GFS2 superblock - * @fs_type: The GFS2 filesystem type - * @flags: Mount flags - * @dev_name: The name of the device - * @data: The mount arguments - * - * Q. Why not use get_sb_bdev() ? - * A. We need to select one of two root directories to mount, independent - * of whether this is the initial, or subsequent, mount of this sb - * - * Returns: 0 or -ve on error - */ +enum gfs2_param { + Opt_lockproto, + Opt_locktable, + Opt_hostdata, + Opt_spectator, + Opt_ignore_local_fs, + Opt_localflocks, + Opt_localcaching, + Opt_debug, + Opt_upgrade, + Opt_acl, + Opt_quota, + Opt_suiddir, + Opt_data, + Opt_meta, + Opt_discard, + Opt_commit, + Opt_errors, + Opt_statfs_quantum, + Opt_statfs_percent, + Opt_quota_quantum, + Opt_barrier, + Opt_rgrplvb, + Opt_loccookie, +}; -static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data) +enum opt_quota { + Opt_quota_unset = 0, + Opt_quota_off, + Opt_quota_account, + Opt_quota_on, +}; + +static const unsigned int opt_quota_values[] = { + [Opt_quota_off] = GFS2_QUOTA_OFF, + [Opt_quota_account] = GFS2_QUOTA_ACCOUNT, + [Opt_quota_on] = GFS2_QUOTA_ON, +}; + +enum opt_data { + Opt_data_writeback = GFS2_DATA_WRITEBACK, + Opt_data_ordered = GFS2_DATA_ORDERED, +}; + +enum opt_errors { + Opt_errors_withdraw = GFS2_ERRORS_WITHDRAW, + Opt_errors_panic = GFS2_ERRORS_PANIC, +}; + +static const struct fs_parameter_spec gfs2_param_specs[] = { + fsparam_string ("lockproto", Opt_lockproto), + fsparam_string ("locktable", Opt_locktable), + fsparam_string ("hostdata", Opt_hostdata), + fsparam_flag ("spectator", Opt_spectator), + fsparam_flag ("norecovery", Opt_spectator), + fsparam_flag ("ignore_local_fs", Opt_ignore_local_fs), + fsparam_flag ("localflocks", Opt_localflocks), + fsparam_flag ("localcaching", Opt_localcaching), + fsparam_flag_no("debug", Opt_debug), + fsparam_flag ("upgrade", Opt_upgrade), + fsparam_flag_no("acl", Opt_acl), + fsparam_flag_no("suiddir", Opt_suiddir), + fsparam_enum ("data", Opt_data), + fsparam_flag ("meta", Opt_meta), + fsparam_flag_no("discard", Opt_discard), + fsparam_s32 ("commit", Opt_commit), + fsparam_enum ("errors", Opt_errors), + fsparam_s32 ("statfs_quantum", Opt_statfs_quantum), + fsparam_s32 ("statfs_percent", Opt_statfs_percent), + fsparam_s32 ("quota_quantum", Opt_quota_quantum), + fsparam_flag_no("barrier", Opt_barrier), + fsparam_flag_no("rgrplvb", Opt_rgrplvb), + fsparam_flag_no("loccookie", Opt_loccookie), + /* quota can be a flag or an enum so it gets special treatment */ + __fsparam(fs_param_is_enum, "quota", Opt_quota, fs_param_neg_with_no|fs_param_v_optional), + {} +}; + +static const struct fs_parameter_enum gfs2_param_enums[] = { + { Opt_quota, "off", Opt_quota_off }, + { Opt_quota, "account", Opt_quota_account }, + { Opt_quota, "on", Opt_quota_on }, + { Opt_data, "writeback", Opt_data_writeback }, + { Opt_data, "ordered", Opt_data_ordered }, + { Opt_errors, "withdraw", Opt_errors_withdraw }, + { Opt_errors, "panic", Opt_errors_panic }, + {} +}; + +const struct fs_parameter_description gfs2_fs_parameters = { + .name = "gfs2", + .specs = gfs2_param_specs, + .enums = gfs2_param_enums, +}; + +/* Parse a single mount parameter */ +static int gfs2_parse_param(struct fs_context *fc, struct fs_parameter *param) { - struct block_device *bdev; - struct super_block *s; - fmode_t mode = FMODE_READ | FMODE_EXCL; - int error; - struct gfs2_args args; - struct gfs2_sbd *sdp; + struct gfs2_args *args = fc->fs_private; + struct fs_parse_result result; + int o; - if (!(flags & SB_RDONLY)) - mode |= FMODE_WRITE; + o = fs_parse(fc, &gfs2_fs_parameters, param, &result); + if (o < 0) + return o; - bdev = blkdev_get_by_path(dev_name, mode, fs_type); - if (IS_ERR(bdev)) - return ERR_CAST(bdev); - - /* - * once the super is inserted into the list by sget, s_umount - * will protect the lockfs code from trying to start a snapshot - * while we are mounting - */ - mutex_lock(&bdev->bd_fsfreeze_mutex); - if (bdev->bd_fsfreeze_count > 0) { - mutex_unlock(&bdev->bd_fsfreeze_mutex); - error = -EBUSY; - goto error_bdev; + switch (o) { + case Opt_lockproto: + strlcpy(args->ar_lockproto, param->string, GFS2_LOCKNAME_LEN); + break; + case Opt_locktable: + strlcpy(args->ar_locktable, param->string, GFS2_LOCKNAME_LEN); + break; + case Opt_hostdata: + strlcpy(args->ar_hostdata, param->string, GFS2_LOCKNAME_LEN); + break; + case Opt_spectator: + args->ar_spectator = 1; + break; + case Opt_ignore_local_fs: + /* Retained for backwards compat only */ + break; + case Opt_localflocks: + args->ar_localflocks = 1; + break; + case Opt_localcaching: + /* Retained for backwards compat only */ + break; + case Opt_debug: + if (result.boolean && args->ar_errors == GFS2_ERRORS_PANIC) + return invalf(fc, "gfs2: -o debug and -o errors=panic are mutually exclusive"); + args->ar_debug = result.boolean; + break; + case Opt_upgrade: + /* Retained for backwards compat only */ + break; + case Opt_acl: + args->ar_posix_acl = result.boolean; + break; + case Opt_quota: + /* The quota option can be a flag or an enum. A non-zero int_32 + result means that we have an enum index. Otherwise we have + to rely on the 'negated' flag to tell us whether 'quota' or + 'noquota' was specified. */ + if (result.negated) + args->ar_quota = GFS2_QUOTA_OFF; + else if (result.int_32 > 0) + args->ar_quota = opt_quota_values[result.int_32]; + else + args->ar_quota = GFS2_QUOTA_ON; + break; + case Opt_suiddir: + args->ar_suiddir = result.boolean; + break; + case Opt_data: + /* The uint_32 result maps directly to GFS2_DATA_* */ + args->ar_data = result.uint_32; + break; + case Opt_meta: + args->ar_meta = 1; + break; + case Opt_discard: + args->ar_discard = result.boolean; + break; + case Opt_commit: + if (result.int_32 <= 0) + return invalf(fc, "gfs2: commit mount option requires a positive numeric argument"); + args->ar_commit = result.int_32; + break; + case Opt_statfs_quantum: + if (result.int_32 < 0) + return invalf(fc, "gfs2: statfs_quantum mount option requires a non-negative numeric argument"); + args->ar_statfs_quantum = result.int_32; + break; + case Opt_quota_quantum: + if (result.int_32 <= 0) + return invalf(fc, "gfs2: quota_quantum mount option requires a positive numeric argument"); + args->ar_quota_quantum = result.int_32; + break; + case Opt_statfs_percent: + if (result.int_32 < 0 || result.int_32 > 100) + return invalf(fc, "gfs2: statfs_percent mount option requires a numeric argument between 0 and 100"); + args->ar_statfs_percent = result.int_32; + break; + case Opt_errors: + if (args->ar_debug && result.uint_32 == GFS2_ERRORS_PANIC) + return invalf(fc, "gfs2: -o debug and -o errors=panic are mutually exclusive"); + args->ar_errors = result.uint_32; + break; + case Opt_barrier: + args->ar_nobarrier = result.boolean; + break; + case Opt_rgrplvb: + args->ar_rgrplvb = result.boolean; + break; + case Opt_loccookie: + args->ar_loccookie = result.boolean; + break; + default: + return invalf(fc, "gfs2: invalid mount option: %s", param->key); } - s = sget(fs_type, test_gfs2_super, set_gfs2_super, flags, bdev); - mutex_unlock(&bdev->bd_fsfreeze_mutex); - error = PTR_ERR(s); - if (IS_ERR(s)) - goto error_bdev; + return 0; +} - if (s->s_root) { - /* - * s_umount nests inside bd_mutex during - * __invalidate_device(). blkdev_put() acquires - * bd_mutex and can't be called under s_umount. Drop - * s_umount temporarily. This is safe as we're - * holding an active reference. - */ - up_write(&s->s_umount); - blkdev_put(bdev, mode); - down_write(&s->s_umount); - } else { - /* s_mode must be set before deactivate_locked_super calls */ - s->s_mode = mode; - } +static int gfs2_reconfigure(struct fs_context *fc) +{ + struct super_block *sb = fc->root->d_sb; + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_args *oldargs = &sdp->sd_args; + struct gfs2_args *newargs = fc->fs_private; + struct gfs2_tune *gt = &sdp->sd_tune; + int error = 0; - memset(&args, 0, sizeof(args)); - args.ar_quota = GFS2_QUOTA_DEFAULT; - args.ar_data = GFS2_DATA_DEFAULT; - args.ar_commit = 30; - args.ar_statfs_quantum = 30; - args.ar_quota_quantum = 60; - args.ar_errors = GFS2_ERRORS_DEFAULT; + sync_filesystem(sb); - error = gfs2_mount_args(&args, data); - if (error) { - pr_warn("can't parse mount arguments\n"); - goto error_super; - } - - if (s->s_root) { - error = -EBUSY; - if ((flags ^ s->s_flags) & SB_RDONLY) - goto error_super; - } else { - snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); - sb_set_blocksize(s, block_size(bdev)); - error = fill_super(s, &args, flags & SB_SILENT ? 1 : 0); - if (error) - goto error_super; - s->s_flags |= SB_ACTIVE; - bdev->bd_super = s; - } - - sdp = s->s_fs_info; - if (args.ar_meta) - return dget(sdp->sd_master_dir); + spin_lock(>->gt_spin); + oldargs->ar_commit = gt->gt_logd_secs; + oldargs->ar_quota_quantum = gt->gt_quota_quantum; + if (gt->gt_statfs_slow) + oldargs->ar_statfs_quantum = 0; else - return dget(sdp->sd_root_dir); + oldargs->ar_statfs_quantum = gt->gt_statfs_quantum; + spin_unlock(>->gt_spin); -error_super: - deactivate_locked_super(s); - return ERR_PTR(error); -error_bdev: - blkdev_put(bdev, mode); - return ERR_PTR(error); + if (strcmp(newargs->ar_lockproto, oldargs->ar_lockproto)) { + errorf(fc, "gfs2: reconfiguration of locking protocol not allowed"); + return -EINVAL; + } + if (strcmp(newargs->ar_locktable, oldargs->ar_locktable)) { + errorf(fc, "gfs2: reconfiguration of lock table not allowed"); + return -EINVAL; + } + if (strcmp(newargs->ar_hostdata, oldargs->ar_hostdata)) { + errorf(fc, "gfs2: reconfiguration of host data not allowed"); + return -EINVAL; + } + if (newargs->ar_spectator != oldargs->ar_spectator) { + errorf(fc, "gfs2: reconfiguration of spectator mode not allowed"); + return -EINVAL; + } + if (newargs->ar_localflocks != oldargs->ar_localflocks) { + errorf(fc, "gfs2: reconfiguration of localflocks not allowed"); + return -EINVAL; + } + if (newargs->ar_meta != oldargs->ar_meta) { + errorf(fc, "gfs2: switching between gfs2 and gfs2meta not allowed"); + return -EINVAL; + } + if (oldargs->ar_spectator) + fc->sb_flags |= SB_RDONLY; + + if ((sb->s_flags ^ fc->sb_flags) & SB_RDONLY) { + if (fc->sb_flags & SB_RDONLY) { + error = gfs2_make_fs_ro(sdp); + if (error) + errorf(fc, "gfs2: unable to remount read-only"); + } else { + error = gfs2_make_fs_rw(sdp); + if (error) + errorf(fc, "gfs2: unable to remount read-write"); + } + } + sdp->sd_args = *newargs; + + if (sdp->sd_args.ar_posix_acl) + sb->s_flags |= SB_POSIXACL; + else + sb->s_flags &= ~SB_POSIXACL; + if (sdp->sd_args.ar_nobarrier) + set_bit(SDF_NOBARRIERS, &sdp->sd_flags); + else + clear_bit(SDF_NOBARRIERS, &sdp->sd_flags); + spin_lock(>->gt_spin); + gt->gt_logd_secs = newargs->ar_commit; + gt->gt_quota_quantum = newargs->ar_quota_quantum; + if (newargs->ar_statfs_quantum) { + gt->gt_statfs_slow = 0; + gt->gt_statfs_quantum = newargs->ar_statfs_quantum; + } + else { + gt->gt_statfs_slow = 1; + gt->gt_statfs_quantum = 30; + } + spin_unlock(>->gt_spin); + + gfs2_online_uevent(sdp); + return error; } -static int set_meta_super(struct super_block *s, void *ptr) +static const struct fs_context_operations gfs2_context_ops = { + .free = gfs2_fc_free, + .parse_param = gfs2_parse_param, + .get_tree = gfs2_get_tree, + .reconfigure = gfs2_reconfigure, +}; + +/* Set up the filesystem mount context */ +static int gfs2_init_fs_context(struct fs_context *fc) +{ + struct gfs2_args *args; + + args = kzalloc(sizeof(*args), GFP_KERNEL); + if (args == NULL) + return -ENOMEM; + + args->ar_quota = GFS2_QUOTA_DEFAULT; + args->ar_data = GFS2_DATA_DEFAULT; + args->ar_commit = 30; + args->ar_statfs_quantum = 30; + args->ar_quota_quantum = 60; + args->ar_errors = GFS2_ERRORS_DEFAULT; + + fc->fs_private = args; + fc->ops = &gfs2_context_ops; + return 0; +} + +static int set_meta_super(struct super_block *s, struct fs_context *fc) { return -EINVAL; } -static struct dentry *gfs2_mount_meta(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int test_meta_super(struct super_block *s, struct fs_context *fc) +{ + return (fc->sget_key == s->s_bdev); +} + +static int gfs2_meta_get_tree(struct fs_context *fc) { struct super_block *s; struct gfs2_sbd *sdp; struct path path; int error; - if (!dev_name || !*dev_name) - return ERR_PTR(-EINVAL); + if (!fc->source || !*fc->source) + return -EINVAL; - error = kern_path(dev_name, LOOKUP_FOLLOW, &path); + error = kern_path(fc->source, LOOKUP_FOLLOW, &path); if (error) { pr_warn("path_lookup on %s returned error %d\n", - dev_name, error); - return ERR_PTR(error); + fc->source, error); + return error; } - s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super, flags, - path.dentry->d_sb->s_bdev); + fc->fs_type = &gfs2_fs_type; + fc->sget_key = path.dentry->d_sb->s_bdev; + s = sget_fc(fc, test_meta_super, set_meta_super); path_put(&path); if (IS_ERR(s)) { pr_warn("gfs2 mount does not exist\n"); - return ERR_CAST(s); + return PTR_ERR(s); } - if ((flags ^ s->s_flags) & SB_RDONLY) { + if ((fc->sb_flags ^ s->s_flags) & SB_RDONLY) { deactivate_locked_super(s); - return ERR_PTR(-EBUSY); + return -EBUSY; } sdp = s->s_fs_info; - return dget(sdp->sd_master_dir); + fc->root = dget(sdp->sd_master_dir); + return 0; +} + +static const struct fs_context_operations gfs2_meta_context_ops = { + .get_tree = gfs2_meta_get_tree, +}; + +static int gfs2_meta_init_fs_context(struct fs_context *fc) +{ + int ret = gfs2_init_fs_context(fc); + + if (ret) + return ret; + + fc->ops = &gfs2_meta_context_ops; + return 0; } static void gfs2_kill_sb(struct super_block *sb) @@ -1382,7 +1634,8 @@ static void gfs2_kill_sb(struct super_block *sb) struct file_system_type gfs2_fs_type = { .name = "gfs2", .fs_flags = FS_REQUIRES_DEV, - .mount = gfs2_mount, + .init_fs_context = gfs2_init_fs_context, + .parameters = &gfs2_fs_parameters, .kill_sb = gfs2_kill_sb, .owner = THIS_MODULE, }; @@ -1391,7 +1644,7 @@ MODULE_ALIAS_FS("gfs2"); struct file_system_type gfs2meta_fs_type = { .name = "gfs2meta", .fs_flags = FS_REQUIRES_DEV, - .mount = gfs2_mount_meta, + .init_fs_context = gfs2_meta_init_fs_context, .owner = THIS_MODULE, }; MODULE_ALIAS_FS("gfs2meta"); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 0acc5834f653..ffc77e5097cb 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -44,258 +44,6 @@ #include "xattr.h" #include "lops.h" -#define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x) - -enum { - Opt_lockproto, - Opt_locktable, - Opt_hostdata, - Opt_spectator, - Opt_ignore_local_fs, - Opt_localflocks, - Opt_localcaching, - Opt_debug, - Opt_nodebug, - Opt_upgrade, - Opt_acl, - Opt_noacl, - Opt_quota_off, - Opt_quota_account, - Opt_quota_on, - Opt_quota, - Opt_noquota, - Opt_suiddir, - Opt_nosuiddir, - Opt_data_writeback, - Opt_data_ordered, - Opt_meta, - Opt_discard, - Opt_nodiscard, - Opt_commit, - Opt_err_withdraw, - Opt_err_panic, - Opt_statfs_quantum, - Opt_statfs_percent, - Opt_quota_quantum, - Opt_barrier, - Opt_nobarrier, - Opt_rgrplvb, - Opt_norgrplvb, - Opt_loccookie, - Opt_noloccookie, - Opt_error, -}; - -static const match_table_t tokens = { - {Opt_lockproto, "lockproto=%s"}, - {Opt_locktable, "locktable=%s"}, - {Opt_hostdata, "hostdata=%s"}, - {Opt_spectator, "spectator"}, - {Opt_spectator, "norecovery"}, - {Opt_ignore_local_fs, "ignore_local_fs"}, - {Opt_localflocks, "localflocks"}, - {Opt_localcaching, "localcaching"}, - {Opt_debug, "debug"}, - {Opt_nodebug, "nodebug"}, - {Opt_upgrade, "upgrade"}, - {Opt_acl, "acl"}, - {Opt_noacl, "noacl"}, - {Opt_quota_off, "quota=off"}, - {Opt_quota_account, "quota=account"}, - {Opt_quota_on, "quota=on"}, - {Opt_quota, "quota"}, - {Opt_noquota, "noquota"}, - {Opt_suiddir, "suiddir"}, - {Opt_nosuiddir, "nosuiddir"}, - {Opt_data_writeback, "data=writeback"}, - {Opt_data_ordered, "data=ordered"}, - {Opt_meta, "meta"}, - {Opt_discard, "discard"}, - {Opt_nodiscard, "nodiscard"}, - {Opt_commit, "commit=%d"}, - {Opt_err_withdraw, "errors=withdraw"}, - {Opt_err_panic, "errors=panic"}, - {Opt_statfs_quantum, "statfs_quantum=%d"}, - {Opt_statfs_percent, "statfs_percent=%d"}, - {Opt_quota_quantum, "quota_quantum=%d"}, - {Opt_barrier, "barrier"}, - {Opt_nobarrier, "nobarrier"}, - {Opt_rgrplvb, "rgrplvb"}, - {Opt_norgrplvb, "norgrplvb"}, - {Opt_loccookie, "loccookie"}, - {Opt_noloccookie, "noloccookie"}, - {Opt_error, NULL} -}; - -/** - * gfs2_mount_args - Parse mount options - * @args: The structure into which the parsed options will be written - * @options: The options to parse - * - * Return: errno - */ - -int gfs2_mount_args(struct gfs2_args *args, char *options) -{ - char *o; - int token; - substring_t tmp[MAX_OPT_ARGS]; - int rv; - - /* Split the options into tokens with the "," character and - process them */ - - while (1) { - o = strsep(&options, ","); - if (o == NULL) - break; - if (*o == '\0') - continue; - - token = match_token(o, tokens, tmp); - switch (token) { - case Opt_lockproto: - match_strlcpy(args->ar_lockproto, &tmp[0], - GFS2_LOCKNAME_LEN); - break; - case Opt_locktable: - match_strlcpy(args->ar_locktable, &tmp[0], - GFS2_LOCKNAME_LEN); - break; - case Opt_hostdata: - match_strlcpy(args->ar_hostdata, &tmp[0], - GFS2_LOCKNAME_LEN); - break; - case Opt_spectator: - args->ar_spectator = 1; - break; - case Opt_ignore_local_fs: - /* Retained for backwards compat only */ - break; - case Opt_localflocks: - args->ar_localflocks = 1; - break; - case Opt_localcaching: - /* Retained for backwards compat only */ - break; - case Opt_debug: - if (args->ar_errors == GFS2_ERRORS_PANIC) { - pr_warn("-o debug and -o errors=panic are mutually exclusive\n"); - return -EINVAL; - } - args->ar_debug = 1; - break; - case Opt_nodebug: - args->ar_debug = 0; - break; - case Opt_upgrade: - /* Retained for backwards compat only */ - break; - case Opt_acl: - args->ar_posix_acl = 1; - break; - case Opt_noacl: - args->ar_posix_acl = 0; - break; - case Opt_quota_off: - case Opt_noquota: - args->ar_quota = GFS2_QUOTA_OFF; - break; - case Opt_quota_account: - args->ar_quota = GFS2_QUOTA_ACCOUNT; - break; - case Opt_quota_on: - case Opt_quota: - args->ar_quota = GFS2_QUOTA_ON; - break; - case Opt_suiddir: - args->ar_suiddir = 1; - break; - case Opt_nosuiddir: - args->ar_suiddir = 0; - break; - case Opt_data_writeback: - args->ar_data = GFS2_DATA_WRITEBACK; - break; - case Opt_data_ordered: - args->ar_data = GFS2_DATA_ORDERED; - break; - case Opt_meta: - args->ar_meta = 1; - break; - case Opt_discard: - args->ar_discard = 1; - break; - case Opt_nodiscard: - args->ar_discard = 0; - break; - case Opt_commit: - rv = match_int(&tmp[0], &args->ar_commit); - if (rv || args->ar_commit <= 0) { - pr_warn("commit mount option requires a positive numeric argument\n"); - return rv ? rv : -EINVAL; - } - break; - case Opt_statfs_quantum: - rv = match_int(&tmp[0], &args->ar_statfs_quantum); - if (rv || args->ar_statfs_quantum < 0) { - pr_warn("statfs_quantum mount option requires a non-negative numeric argument\n"); - return rv ? rv : -EINVAL; - } - break; - case Opt_quota_quantum: - rv = match_int(&tmp[0], &args->ar_quota_quantum); - if (rv || args->ar_quota_quantum <= 0) { - pr_warn("quota_quantum mount option requires a positive numeric argument\n"); - return rv ? rv : -EINVAL; - } - break; - case Opt_statfs_percent: - rv = match_int(&tmp[0], &args->ar_statfs_percent); - if (rv || args->ar_statfs_percent < 0 || - args->ar_statfs_percent > 100) { - pr_warn("statfs_percent mount option requires a numeric argument between 0 and 100\n"); - return rv ? rv : -EINVAL; - } - break; - case Opt_err_withdraw: - args->ar_errors = GFS2_ERRORS_WITHDRAW; - break; - case Opt_err_panic: - if (args->ar_debug) { - pr_warn("-o debug and -o errors=panic are mutually exclusive\n"); - return -EINVAL; - } - args->ar_errors = GFS2_ERRORS_PANIC; - break; - case Opt_barrier: - args->ar_nobarrier = 0; - break; - case Opt_nobarrier: - args->ar_nobarrier = 1; - break; - case Opt_rgrplvb: - args->ar_rgrplvb = 1; - break; - case Opt_norgrplvb: - args->ar_rgrplvb = 0; - break; - case Opt_loccookie: - args->ar_loccookie = 1; - break; - case Opt_noloccookie: - args->ar_loccookie = 0; - break; - case Opt_error: - default: - pr_warn("invalid mount option: %s\n", o); - return -EINVAL; - } - } - - return 0; -} - /** * gfs2_jindex_free - Clear all the journal index information * @sdp: The GFS2 superblock @@ -847,7 +595,7 @@ static void gfs2_dirty_inode(struct inode *inode, int flags) * Returns: errno */ -static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) +int gfs2_make_fs_ro(struct gfs2_sbd *sdp) { struct gfs2_holder freeze_gh; int error; @@ -1226,84 +974,6 @@ static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } -/** - * gfs2_remount_fs - called when the FS is remounted - * @sb: the filesystem - * @flags: the remount flags - * @data: extra data passed in (not used right now) - * - * Returns: errno - */ - -static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) -{ - struct gfs2_sbd *sdp = sb->s_fs_info; - struct gfs2_args args = sdp->sd_args; /* Default to current settings */ - struct gfs2_tune *gt = &sdp->sd_tune; - int error; - - sync_filesystem(sb); - - spin_lock(>->gt_spin); - args.ar_commit = gt->gt_logd_secs; - args.ar_quota_quantum = gt->gt_quota_quantum; - if (gt->gt_statfs_slow) - args.ar_statfs_quantum = 0; - else - args.ar_statfs_quantum = gt->gt_statfs_quantum; - spin_unlock(>->gt_spin); - error = gfs2_mount_args(&args, data); - if (error) - return error; - - /* Not allowed to change locking details */ - if (strcmp(args.ar_lockproto, sdp->sd_args.ar_lockproto) || - strcmp(args.ar_locktable, sdp->sd_args.ar_locktable) || - strcmp(args.ar_hostdata, sdp->sd_args.ar_hostdata)) - return -EINVAL; - - /* Some flags must not be changed */ - if (args_neq(&args, &sdp->sd_args, spectator) || - args_neq(&args, &sdp->sd_args, localflocks) || - args_neq(&args, &sdp->sd_args, meta)) - return -EINVAL; - - if (sdp->sd_args.ar_spectator) - *flags |= SB_RDONLY; - - if ((sb->s_flags ^ *flags) & SB_RDONLY) { - if (*flags & SB_RDONLY) - error = gfs2_make_fs_ro(sdp); - else - error = gfs2_make_fs_rw(sdp); - } - - sdp->sd_args = args; - if (sdp->sd_args.ar_posix_acl) - sb->s_flags |= SB_POSIXACL; - else - sb->s_flags &= ~SB_POSIXACL; - if (sdp->sd_args.ar_nobarrier) - set_bit(SDF_NOBARRIERS, &sdp->sd_flags); - else - clear_bit(SDF_NOBARRIERS, &sdp->sd_flags); - spin_lock(>->gt_spin); - gt->gt_logd_secs = args.ar_commit; - gt->gt_quota_quantum = args.ar_quota_quantum; - if (args.ar_statfs_quantum) { - gt->gt_statfs_slow = 0; - gt->gt_statfs_quantum = args.ar_statfs_quantum; - } - else { - gt->gt_statfs_slow = 1; - gt->gt_statfs_quantum = 30; - } - spin_unlock(>->gt_spin); - - gfs2_online_uevent(sdp); - return error; -} - /** * gfs2_drop_inode - Drop an inode (test for remote unlink) * @inode: The inode to drop @@ -1748,7 +1418,6 @@ const struct super_operations gfs2_super_ops = { .freeze_super = gfs2_freeze, .thaw_super = gfs2_unfreeze, .statfs = gfs2_statfs, - .remount_fs = gfs2_remount_fs, .drop_inode = gfs2_drop_inode, .show_options = gfs2_show_options, }; diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h index 9d49eaadb9d9..b8bf811a1305 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h @@ -24,8 +24,6 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp) extern void gfs2_jindex_free(struct gfs2_sbd *sdp); -extern int gfs2_mount_args(struct gfs2_args *args, char *data); - extern struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid); extern int gfs2_jdesc_check(struct gfs2_jdesc *jd); @@ -33,6 +31,7 @@ extern int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename, struct gfs2_inode **ipp); extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp); +extern int gfs2_make_fs_ro(struct gfs2_sbd *sdp); extern void gfs2_online_uevent(struct gfs2_sbd *sdp); extern int gfs2_statfs_init(struct gfs2_sbd *sdp); extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, From 7cca9b8b7c5bcc56d627851550840586a25aaa1b Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Fri, 23 Aug 2019 11:47:28 +0200 Subject: [PATCH 432/721] microblaze: Switch to standard restart handler The microblaze uses the legacy APIs to dig out a GPIO pin defined in the root of the device tree to issue a hard reset of the platform. Asserting a hard reset should be done using the standard DT-enabled and fully GPIO descriptor aware driver in drivers/power/reset/gpio-restart.c using the bindings from Documentation/devicetree/bindings/power/reset/gpio-restart.txt To achieve this, first make sure microblaze makes use of the standard kernel restart path utilizing do_kernel_restart() from . Put in some grace time and an emergency print if the restart does not properly assert. As this is basic platform functionality we patch the DTS file and defconfig in one go for a lockstep change. Cc: Arnd Bergmann Cc: Michal Simek Signed-off-by: Linus Walleij [ Michal: Move machine_restart back to reset.c ] Signed-off-by: Michal Simek --- arch/microblaze/boot/dts/system.dts | 16 ++++- arch/microblaze/configs/mmu_defconfig | 2 + arch/microblaze/configs/nommu_defconfig | 2 + arch/microblaze/kernel/reset.c | 87 ++++--------------------- 4 files changed, 30 insertions(+), 77 deletions(-) diff --git a/arch/microblaze/boot/dts/system.dts b/arch/microblaze/boot/dts/system.dts index 5a8a9d090c37..5b236527176e 100644 --- a/arch/microblaze/boot/dts/system.dts +++ b/arch/microblaze/boot/dts/system.dts @@ -18,7 +18,6 @@ #address-cells = <1>; #size-cells = <1>; compatible = "xlnx,microblaze"; - hard-reset-gpios = <&LEDs_8Bit 2 1>; model = "testing"; DDR2_SDRAM: memory@90000000 { device_type = "memory"; @@ -281,6 +280,21 @@ gpios = <&LEDs_8Bit 7 1>; }; } ; + + gpio-restart { + compatible = "gpio-restart"; + /* + * FIXME: is this active low or active high? + * the current flag (1) indicates active low. + * delay measures are templates, should be adjusted + * to datasheet or trial-and-error with real hardware. + */ + gpios = <&LEDs_8Bit 2 1>; + active-delay = <100>; + inactive-delay = <10>; + wait-delay = <100>; + }; + RS232_Uart_1: serial@84000000 { clock-frequency = <125000000>; compatible = "xlnx,xps-uartlite-1.00.a"; diff --git a/arch/microblaze/configs/mmu_defconfig b/arch/microblaze/configs/mmu_defconfig index b83aff1e86cd..654edfdc7867 100644 --- a/arch/microblaze/configs/mmu_defconfig +++ b/arch/microblaze/configs/mmu_defconfig @@ -60,6 +60,8 @@ CONFIG_SPI_XILINX=y CONFIG_GPIOLIB=y CONFIG_GPIO_SYSFS=y CONFIG_GPIO_XILINX=y +CONFIG_POWER_RESET=y +CONFIG_POWER_RESET_GPIO_RESTART=y # CONFIG_HWMON is not set CONFIG_WATCHDOG=y CONFIG_XILINX_WATCHDOG=y diff --git a/arch/microblaze/configs/nommu_defconfig b/arch/microblaze/configs/nommu_defconfig index dc102eb65fca..377de39ccb8c 100644 --- a/arch/microblaze/configs/nommu_defconfig +++ b/arch/microblaze/configs/nommu_defconfig @@ -62,6 +62,8 @@ CONFIG_SPI_XILINX=y CONFIG_GPIOLIB=y CONFIG_GPIO_SYSFS=y CONFIG_GPIO_XILINX=y +CONFIG_POWER_RESET=y +CONFIG_POWER_RESET_GPIO_RESTART=y # CONFIG_HWMON is not set CONFIG_WATCHDOG=y CONFIG_XILINX_WATCHDOG=y diff --git a/arch/microblaze/kernel/reset.c b/arch/microblaze/kernel/reset.c index fcbe1daf6316..5f4722908164 100644 --- a/arch/microblaze/kernel/reset.c +++ b/arch/microblaze/kernel/reset.c @@ -8,83 +8,9 @@ */ #include +#include #include - -/* Trigger specific functions */ -#ifdef CONFIG_GPIOLIB - -#include - -static int handle; /* reset pin handle */ -static unsigned int reset_val; - -static int of_platform_reset_gpio_probe(void) -{ - int ret; - handle = of_get_named_gpio(of_find_node_by_path("/"), - "hard-reset-gpios", 0); - - if (!gpio_is_valid(handle)) { - pr_info("Skipping unavailable RESET gpio %d (%s)\n", - handle, "reset"); - return -ENODEV; - } - - ret = gpio_request(handle, "reset"); - if (ret < 0) { - pr_info("GPIO pin is already allocated\n"); - return ret; - } - - /* get current setup value */ - reset_val = gpio_get_value(handle); - /* FIXME maybe worth to perform any action */ - pr_debug("Reset: Gpio output state: 0x%x\n", reset_val); - - /* Setup GPIO as output */ - ret = gpio_direction_output(handle, 0); - if (ret < 0) - goto err; - - /* Setup output direction */ - gpio_set_value(handle, 0); - - pr_info("RESET: Registered gpio device: %d, current val: %d\n", - handle, reset_val); - return 0; -err: - gpio_free(handle); - return ret; -} -device_initcall(of_platform_reset_gpio_probe); - - -static void gpio_system_reset(void) -{ - if (gpio_is_valid(handle)) - gpio_set_value(handle, 1 - reset_val); - else - pr_notice("Reset GPIO unavailable - halting!\n"); -} -#else -static void gpio_system_reset(void) -{ - pr_notice("No reset GPIO present - halting!\n"); -} - -void of_platform_reset_gpio_probe(void) -{ - return; -} -#endif - -void machine_restart(char *cmd) -{ - pr_notice("Machine restart...\n"); - gpio_system_reset(); - while (1) - ; -} +#include void machine_shutdown(void) { @@ -106,3 +32,12 @@ void machine_power_off(void) while (1) ; } + +void machine_restart(char *cmd) +{ + do_kernel_restart(cmd); + /* Give the restart hook 1 s to take us down */ + mdelay(1000); + pr_emerg("Reboot failed -- System halted\n"); + while (1); +} From f71fee2711a788b94ff0acb02fbd2bfe2de7e0a3 Mon Sep 17 00:00:00 2001 From: Ingo Franzki Date: Tue, 20 Aug 2019 14:57:20 +0200 Subject: [PATCH 433/721] s390/pkey: Add sysfs attributes to emit AES CIPHER key blobs Now that the pkey kernel module also supports CCA AES CIPHER keys: Add binary read-only sysfs attributes for the pkey module that can be used to read random CCA AES CIPHER secure keys from, similar to the already existing sysfs attributes for AES DATA and random protected keys. Keys are read from these attributes using a cat-like interface. A typical use case for those keys is to encrypt a swap device using the paes cipher. During processing of /etc/crypttab, the CCA random AES CIPHER secure key to encrypt the swap device is read from one of the attributes. The following attributes are added: ccacipher/ccacipher_aes_128 ccacipher/ccacipher_aes_192 ccacipher/ccacipher_aes_256 ccacipher/ccacipher_aes_128_xts ccacipher/ccacipher_aes_256_xts Each attribute emits a secure key blob for the corresponding key size and cipher mode. Signed-off-by: Ingo Franzki Reviewed-by: Harald Freudenberger Signed-off-by: Vasily Gorbik --- drivers/s390/crypto/pkey_api.c | 113 +++++++++++++++++++++++++++++++++ 1 file changed, 113 insertions(+) diff --git a/drivers/s390/crypto/pkey_api.c b/drivers/s390/crypto/pkey_api.c index f76a1d0f54c4..9de3d46b3253 100644 --- a/drivers/s390/crypto/pkey_api.c +++ b/drivers/s390/crypto/pkey_api.c @@ -1363,9 +1363,122 @@ static struct attribute_group ccadata_attr_group = { .bin_attrs = ccadata_attrs, }; +#define CCACIPHERTOKENSIZE (sizeof(struct cipherkeytoken) + 80) + +/* + * Sysfs attribute read function for all secure key ccacipher binary attributes. + * The implementation can not deal with partial reads, because a new random + * secure key blob is generated with each read. In case of partial reads + * (i.e. off != 0 or count < key blob size) -EINVAL is returned. + */ +static ssize_t pkey_ccacipher_aes_attr_read(enum pkey_key_size keybits, + bool is_xts, char *buf, loff_t off, + size_t count) +{ + size_t keysize; + int rc; + + if (off != 0 || count < CCACIPHERTOKENSIZE) + return -EINVAL; + if (is_xts) + if (count < 2 * CCACIPHERTOKENSIZE) + return -EINVAL; + + keysize = CCACIPHERTOKENSIZE; + rc = cca_gencipherkey(-1, -1, keybits, 0, buf, &keysize); + if (rc) + return rc; + memset(buf + keysize, 0, CCACIPHERTOKENSIZE - keysize); + + if (is_xts) { + keysize = CCACIPHERTOKENSIZE; + rc = cca_gencipherkey(-1, -1, keybits, 0, + buf + CCACIPHERTOKENSIZE, &keysize); + if (rc) + return rc; + memset(buf + CCACIPHERTOKENSIZE + keysize, 0, + CCACIPHERTOKENSIZE - keysize); + + return 2 * CCACIPHERTOKENSIZE; + } + + return CCACIPHERTOKENSIZE; +} + +static ssize_t ccacipher_aes_128_read(struct file *filp, + struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t off, + size_t count) +{ + return pkey_ccacipher_aes_attr_read(PKEY_SIZE_AES_128, false, buf, + off, count); +} + +static ssize_t ccacipher_aes_192_read(struct file *filp, + struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t off, + size_t count) +{ + return pkey_ccacipher_aes_attr_read(PKEY_SIZE_AES_192, false, buf, + off, count); +} + +static ssize_t ccacipher_aes_256_read(struct file *filp, + struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t off, + size_t count) +{ + return pkey_ccacipher_aes_attr_read(PKEY_SIZE_AES_256, false, buf, + off, count); +} + +static ssize_t ccacipher_aes_128_xts_read(struct file *filp, + struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t off, + size_t count) +{ + return pkey_ccacipher_aes_attr_read(PKEY_SIZE_AES_128, true, buf, + off, count); +} + +static ssize_t ccacipher_aes_256_xts_read(struct file *filp, + struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t off, + size_t count) +{ + return pkey_ccacipher_aes_attr_read(PKEY_SIZE_AES_256, true, buf, + off, count); +} + +static BIN_ATTR_RO(ccacipher_aes_128, CCACIPHERTOKENSIZE); +static BIN_ATTR_RO(ccacipher_aes_192, CCACIPHERTOKENSIZE); +static BIN_ATTR_RO(ccacipher_aes_256, CCACIPHERTOKENSIZE); +static BIN_ATTR_RO(ccacipher_aes_128_xts, 2 * CCACIPHERTOKENSIZE); +static BIN_ATTR_RO(ccacipher_aes_256_xts, 2 * CCACIPHERTOKENSIZE); + +static struct bin_attribute *ccacipher_attrs[] = { + &bin_attr_ccacipher_aes_128, + &bin_attr_ccacipher_aes_192, + &bin_attr_ccacipher_aes_256, + &bin_attr_ccacipher_aes_128_xts, + &bin_attr_ccacipher_aes_256_xts, + NULL +}; + +static struct attribute_group ccacipher_attr_group = { + .name = "ccacipher", + .bin_attrs = ccacipher_attrs, +}; + static const struct attribute_group *pkey_attr_groups[] = { &protkey_attr_group, &ccadata_attr_group, + &ccacipher_attr_group, NULL, }; From b91d9e67e50bfc79850a1760b8e59dc1dc56057f Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Wed, 11 Sep 2019 18:41:03 +0200 Subject: [PATCH 434/721] s390/cio: fix intparm documentation The common I/O layer is maintaining an "intparm" inspired by the hardware intparm for driver usage. This "intparm" is not only applicaple for ssch, but also for hsch/csch. The kerneldoc states that it is only updated for hsch/csch if no prior request is pending; however, this is not what the code does (whether that would actually desireable is a different issue.) Let's at least fix the kerneldoc for now. Fixes: b2ffd8e9a76e ("[S390] cio: Add docbook comments.") Signed-off-by: Cornelia Huck Signed-off-by: Sebastian Ott Signed-off-by: Vasily Gorbik --- drivers/s390/cio/device_ops.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/drivers/s390/cio/device_ops.c b/drivers/s390/cio/device_ops.c index d722458c5928..65841af15748 100644 --- a/drivers/s390/cio/device_ops.c +++ b/drivers/s390/cio/device_ops.c @@ -124,9 +124,7 @@ EXPORT_SYMBOL(ccw_device_is_multipath); /** * ccw_device_clear() - terminate I/O request processing * @cdev: target ccw device - * @intparm: interruption parameter; value is only used if no I/O is - * outstanding, otherwise the intparm associated with the I/O request - * is returned + * @intparm: interruption parameter to be returned upon conclusion of csch * * ccw_device_clear() calls csch on @cdev's subchannel. * Returns: @@ -179,6 +177,9 @@ int ccw_device_clear(struct ccw_device *cdev, unsigned long intparm) * completed during the time specified by @expires. If a timeout occurs, the * channel program is terminated via xsch, hsch or csch, and the device's * interrupt handler will be called with an irb containing ERR_PTR(-%ETIMEDOUT). + * The interruption handler will echo back the @intparm specified here, unless + * another interruption parameter is specified by a subsequent invocation of + * ccw_device_halt() or ccw_device_clear(). * Returns: * %0, if the operation was successful; * -%EBUSY, if the device is busy, or status pending; @@ -256,6 +257,9 @@ int ccw_device_start_timeout_key(struct ccw_device *cdev, struct ccw1 *cpa, * Start a S/390 channel program. When the interrupt arrives, the * IRQ handler is called, either immediately, delayed (dev-end missing, * or sense required) or never (no IRQ handler registered). + * The interruption handler will echo back the @intparm specified here, unless + * another interruption parameter is specified by a subsequent invocation of + * ccw_device_halt() or ccw_device_clear(). * Returns: * %0, if the operation was successful; * -%EBUSY, if the device is busy, or status pending; @@ -287,6 +291,9 @@ int ccw_device_start_key(struct ccw_device *cdev, struct ccw1 *cpa, * Start a S/390 channel program. When the interrupt arrives, the * IRQ handler is called, either immediately, delayed (dev-end missing, * or sense required) or never (no IRQ handler registered). + * The interruption handler will echo back the @intparm specified here, unless + * another interruption parameter is specified by a subsequent invocation of + * ccw_device_halt() or ccw_device_clear(). * Returns: * %0, if the operation was successful; * -%EBUSY, if the device is busy, or status pending; @@ -322,6 +329,9 @@ int ccw_device_start(struct ccw_device *cdev, struct ccw1 *cpa, * completed during the time specified by @expires. If a timeout occurs, the * channel program is terminated via xsch, hsch or csch, and the device's * interrupt handler will be called with an irb containing ERR_PTR(-%ETIMEDOUT). + * The interruption handler will echo back the @intparm specified here, unless + * another interruption parameter is specified by a subsequent invocation of + * ccw_device_halt() or ccw_device_clear(). * Returns: * %0, if the operation was successful; * -%EBUSY, if the device is busy, or status pending; @@ -343,11 +353,12 @@ int ccw_device_start_timeout(struct ccw_device *cdev, struct ccw1 *cpa, /** * ccw_device_halt() - halt I/O request processing * @cdev: target ccw device - * @intparm: interruption parameter; value is only used if no I/O is - * outstanding, otherwise the intparm associated with the I/O request - * is returned + * @intparm: interruption parameter to be returned upon conclusion of hsch * * ccw_device_halt() calls hsch on @cdev's subchannel. + * The interruption handler will echo back the @intparm specified here, unless + * another interruption parameter is specified by a subsequent invocation of + * ccw_device_clear(). * Returns: * %0 on success, * -%ENODEV on device not operational, From cf2957f3907e44ca40392ac19a0c22a14e3fdc18 Mon Sep 17 00:00:00 2001 From: Harald Freudenberger Date: Fri, 16 Aug 2019 11:05:58 +0200 Subject: [PATCH 435/721] s390/zcrypt: CEX7S exploitation support This patch adds CEX7 exploitation support for the AP bus code, the zcrypt device driver zoo and the vfio device driver. Signed-off-by: Harald Freudenberger Reviewed-by: Ingo Franzki Signed-off-by: Vasily Gorbik --- arch/s390/include/uapi/asm/zcrypt.h | 4 +- drivers/s390/crypto/ap_bus.c | 12 ++--- drivers/s390/crypto/ap_bus.h | 3 +- drivers/s390/crypto/vfio_ap_drv.c | 2 + drivers/s390/crypto/zcrypt_api.h | 3 +- drivers/s390/crypto/zcrypt_cex4.c | 72 +++++++++++++++++++++-------- 6 files changed, 67 insertions(+), 29 deletions(-) diff --git a/arch/s390/include/uapi/asm/zcrypt.h b/arch/s390/include/uapi/asm/zcrypt.h index 8c5755f41dde..f9e5e1f0821d 100644 --- a/arch/s390/include/uapi/asm/zcrypt.h +++ b/arch/s390/include/uapi/asm/zcrypt.h @@ -4,7 +4,7 @@ * * zcrypt 2.2.1 (user-visible header) * - * Copyright IBM Corp. 2001, 2018 + * Copyright IBM Corp. 2001, 2019 * Author(s): Robert Burroughs * Eric Rossman (edrossma@us.ibm.com) * @@ -286,7 +286,7 @@ struct zcrypt_device_matrix_ext { * 0x08: CEX3A * 0x0a: CEX4 * 0x0b: CEX5 - * 0x0c: CEX6 + * 0x0c: CEX6 and CEX7 * 0x0d: device is disabled * * ZCRYPT_QDEPTH_MASK diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c index a76b8a8bcbbb..a1915061932e 100644 --- a/drivers/s390/crypto/ap_bus.c +++ b/drivers/s390/crypto/ap_bus.c @@ -1322,24 +1322,24 @@ static int ap_get_compatible_type(ap_qid_t qid, int rawtype, unsigned int func) /* < CEX2A is not supported */ if (rawtype < AP_DEVICE_TYPE_CEX2A) return 0; - /* up to CEX6 known and fully supported */ - if (rawtype <= AP_DEVICE_TYPE_CEX6) + /* up to CEX7 known and fully supported */ + if (rawtype <= AP_DEVICE_TYPE_CEX7) return rawtype; /* - * unknown new type > CEX6, check for compatibility + * unknown new type > CEX7, check for compatibility * to the highest known and supported type which is - * currently CEX6 with the help of the QACT function. + * currently CEX7 with the help of the QACT function. */ if (ap_qact_available()) { struct ap_queue_status status; union ap_qact_ap_info apinfo = {0}; apinfo.mode = (func >> 26) & 0x07; - apinfo.cat = AP_DEVICE_TYPE_CEX6; + apinfo.cat = AP_DEVICE_TYPE_CEX7; status = ap_qact(qid, 0, &apinfo); if (status.response_code == AP_RESPONSE_NORMAL && apinfo.cat >= AP_DEVICE_TYPE_CEX2A - && apinfo.cat <= AP_DEVICE_TYPE_CEX6) + && apinfo.cat <= AP_DEVICE_TYPE_CEX7) comp_type = apinfo.cat; } if (!comp_type) diff --git a/drivers/s390/crypto/ap_bus.h b/drivers/s390/crypto/ap_bus.h index 6f3cf37776ca..433b7b64368d 100644 --- a/drivers/s390/crypto/ap_bus.h +++ b/drivers/s390/crypto/ap_bus.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0+ */ /* - * Copyright IBM Corp. 2006, 2012 + * Copyright IBM Corp. 2006, 2019 * Author(s): Cornelia Huck * Martin Schwidefsky * Ralph Wuerthner @@ -63,6 +63,7 @@ static inline int ap_test_bit(unsigned int *ptr, unsigned int nr) #define AP_DEVICE_TYPE_CEX4 10 #define AP_DEVICE_TYPE_CEX5 11 #define AP_DEVICE_TYPE_CEX6 12 +#define AP_DEVICE_TYPE_CEX7 13 /* * Known function facilities diff --git a/drivers/s390/crypto/vfio_ap_drv.c b/drivers/s390/crypto/vfio_ap_drv.c index 003662aa8060..be2520cc010b 100644 --- a/drivers/s390/crypto/vfio_ap_drv.c +++ b/drivers/s390/crypto/vfio_ap_drv.c @@ -36,6 +36,8 @@ static struct ap_device_id ap_queue_ids[] = { .match_flags = AP_DEVICE_ID_MATCH_QUEUE_TYPE }, { .dev_type = AP_DEVICE_TYPE_CEX6, .match_flags = AP_DEVICE_ID_MATCH_QUEUE_TYPE }, + { .dev_type = AP_DEVICE_TYPE_CEX7, + .match_flags = AP_DEVICE_ID_MATCH_QUEUE_TYPE }, { /* end of sibling */ }, }; diff --git a/drivers/s390/crypto/zcrypt_api.h b/drivers/s390/crypto/zcrypt_api.h index 2d3f2732344f..d464618cd84f 100644 --- a/drivers/s390/crypto/zcrypt_api.h +++ b/drivers/s390/crypto/zcrypt_api.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0+ */ /* - * Copyright IBM Corp. 2001, 2018 + * Copyright IBM Corp. 2001, 2019 * Author(s): Robert Burroughs * Eric Rossman (edrossma@us.ibm.com) * Cornelia Huck @@ -29,6 +29,7 @@ #define ZCRYPT_CEX4 10 #define ZCRYPT_CEX5 11 #define ZCRYPT_CEX6 12 +#define ZCRYPT_CEX7 13 /** * Large random numbers are pulled in 4096 byte chunks from the crypto cards diff --git a/drivers/s390/crypto/zcrypt_cex4.c b/drivers/s390/crypto/zcrypt_cex4.c index f58d8dec19dc..442e3d6162f7 100644 --- a/drivers/s390/crypto/zcrypt_cex4.c +++ b/drivers/s390/crypto/zcrypt_cex4.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright IBM Corp. 2012 + * Copyright IBM Corp. 2012, 2019 * Author(s): Holger Dengler */ @@ -38,8 +38,8 @@ #define CEX4_CLEANUP_TIME (900*HZ) MODULE_AUTHOR("IBM Corporation"); -MODULE_DESCRIPTION("CEX4/CEX5/CEX6 Cryptographic Card device driver, " \ - "Copyright IBM Corp. 2018"); +MODULE_DESCRIPTION("CEX4/CEX5/CEX6/CEX7 Cryptographic Card device driver, " \ + "Copyright IBM Corp. 2019"); MODULE_LICENSE("GPL"); static struct ap_device_id zcrypt_cex4_card_ids[] = { @@ -49,6 +49,8 @@ static struct ap_device_id zcrypt_cex4_card_ids[] = { .match_flags = AP_DEVICE_ID_MATCH_CARD_TYPE }, { .dev_type = AP_DEVICE_TYPE_CEX6, .match_flags = AP_DEVICE_ID_MATCH_CARD_TYPE }, + { .dev_type = AP_DEVICE_TYPE_CEX7, + .match_flags = AP_DEVICE_ID_MATCH_CARD_TYPE }, { /* end of list */ }, }; @@ -61,6 +63,8 @@ static struct ap_device_id zcrypt_cex4_queue_ids[] = { .match_flags = AP_DEVICE_ID_MATCH_QUEUE_TYPE }, { .dev_type = AP_DEVICE_TYPE_CEX6, .match_flags = AP_DEVICE_ID_MATCH_QUEUE_TYPE }, + { .dev_type = AP_DEVICE_TYPE_CEX7, + .match_flags = AP_DEVICE_ID_MATCH_QUEUE_TYPE }, { /* end of list */ }, }; @@ -146,7 +150,7 @@ static const struct attribute_group cca_queue_attr_group = { }; /** - * Probe function for CEX4/CEX5/CEX6 card device. It always + * Probe function for CEX4/CEX5/CEX6/CEX7 card device. It always * accepts the AP device since the bus_match already checked * the hardware type. * @ap_dev: pointer to the AP device. @@ -158,25 +162,31 @@ static int zcrypt_cex4_card_probe(struct ap_device *ap_dev) * MEX_1k, MEX_2k, MEX_4k, CRT_1k, CRT_2k, CRT_4k, RNG, SECKEY */ static const int CEX4A_SPEED_IDX[] = { - 14, 19, 249, 42, 228, 1458, 0, 0}; + 14, 19, 249, 42, 228, 1458, 0, 0}; static const int CEX5A_SPEED_IDX[] = { - 8, 9, 20, 18, 66, 458, 0, 0}; + 8, 9, 20, 18, 66, 458, 0, 0}; static const int CEX6A_SPEED_IDX[] = { - 6, 9, 20, 17, 65, 438, 0, 0}; + 6, 9, 20, 17, 65, 438, 0, 0}; + static const int CEX7A_SPEED_IDX[] = { + 6, 8, 17, 15, 54, 362, 0, 0}; static const int CEX4C_SPEED_IDX[] = { 59, 69, 308, 83, 278, 2204, 209, 40}; static const int CEX5C_SPEED_IDX[] = { - 24, 31, 50, 37, 90, 479, 27, 10}; + 24, 31, 50, 37, 90, 479, 27, 10}; static const int CEX6C_SPEED_IDX[] = { - 16, 20, 32, 27, 77, 455, 23, 9}; + 16, 20, 32, 27, 77, 455, 24, 9}; + static const int CEX7C_SPEED_IDX[] = { + 14, 16, 26, 23, 64, 376, 23, 8}; static const int CEX4P_SPEED_IDX[] = { - 224, 313, 3560, 359, 605, 2827, 0, 50}; + 0, 0, 0, 0, 0, 0, 0, 50}; static const int CEX5P_SPEED_IDX[] = { - 63, 84, 156, 83, 142, 533, 0, 10}; + 0, 0, 0, 0, 0, 0, 0, 10}; static const int CEX6P_SPEED_IDX[] = { - 55, 70, 121, 73, 129, 522, 0, 9}; + 0, 0, 0, 0, 0, 0, 0, 9}; + static const int CEX7P_SPEED_IDX[] = { + 0, 0, 0, 0, 0, 0, 0, 8}; struct ap_card *ac = to_ap_card(&ap_dev->device); struct zcrypt_card *zc; @@ -198,11 +208,19 @@ static int zcrypt_cex4_card_probe(struct ap_device *ap_dev) zc->user_space_type = ZCRYPT_CEX5; memcpy(zc->speed_rating, CEX5A_SPEED_IDX, sizeof(CEX5A_SPEED_IDX)); - } else { + } else if (ac->ap_dev.device_type == AP_DEVICE_TYPE_CEX6) { zc->type_string = "CEX6A"; zc->user_space_type = ZCRYPT_CEX6; memcpy(zc->speed_rating, CEX6A_SPEED_IDX, sizeof(CEX6A_SPEED_IDX)); + } else { + zc->type_string = "CEX7A"; + /* wrong user space type, just for compatibility + * with the ZCRYPT_STATUS_MASK ioctl. + */ + zc->user_space_type = ZCRYPT_CEX6; + memcpy(zc->speed_rating, CEX7A_SPEED_IDX, + sizeof(CEX7A_SPEED_IDX)); } zc->min_mod_size = CEX4A_MIN_MOD_SIZE; if (ap_test_bit(&ac->functions, AP_FUNC_MEX4K) && @@ -232,7 +250,7 @@ static int zcrypt_cex4_card_probe(struct ap_device *ap_dev) zc->user_space_type = ZCRYPT_CEX3C; memcpy(zc->speed_rating, CEX5C_SPEED_IDX, sizeof(CEX5C_SPEED_IDX)); - } else { + } else if (ac->ap_dev.device_type == AP_DEVICE_TYPE_CEX6) { zc->type_string = "CEX6C"; /* wrong user space type, must be CEX6 * just keep it for cca compatibility @@ -240,6 +258,14 @@ static int zcrypt_cex4_card_probe(struct ap_device *ap_dev) zc->user_space_type = ZCRYPT_CEX3C; memcpy(zc->speed_rating, CEX6C_SPEED_IDX, sizeof(CEX6C_SPEED_IDX)); + } else { + zc->type_string = "CEX7C"; + /* wrong user space type, must be CEX7 + * just keep it for cca compatibility + */ + zc->user_space_type = ZCRYPT_CEX3C; + memcpy(zc->speed_rating, CEX7C_SPEED_IDX, + sizeof(CEX7C_SPEED_IDX)); } zc->min_mod_size = CEX4C_MIN_MOD_SIZE; zc->max_mod_size = CEX4C_MAX_MOD_SIZE; @@ -255,11 +281,19 @@ static int zcrypt_cex4_card_probe(struct ap_device *ap_dev) zc->user_space_type = ZCRYPT_CEX5; memcpy(zc->speed_rating, CEX5P_SPEED_IDX, sizeof(CEX5P_SPEED_IDX)); - } else { + } else if (ac->ap_dev.device_type == AP_DEVICE_TYPE_CEX6) { zc->type_string = "CEX6P"; zc->user_space_type = ZCRYPT_CEX6; memcpy(zc->speed_rating, CEX6P_SPEED_IDX, sizeof(CEX6P_SPEED_IDX)); + } else { + zc->type_string = "CEX7P"; + /* wrong user space type, just for compatibility + * with the ZCRYPT_STATUS_MASK ioctl. + */ + zc->user_space_type = ZCRYPT_CEX6; + memcpy(zc->speed_rating, CEX7P_SPEED_IDX, + sizeof(CEX7P_SPEED_IDX)); } zc->min_mod_size = CEX4C_MIN_MOD_SIZE; zc->max_mod_size = CEX4C_MAX_MOD_SIZE; @@ -289,8 +323,8 @@ static int zcrypt_cex4_card_probe(struct ap_device *ap_dev) } /** - * This is called to remove the CEX4/CEX5/CEX6 card driver information - * if an AP card device is removed. + * This is called to remove the CEX4/CEX5/CEX6/CEX7 card driver + * information if an AP card device is removed. */ static void zcrypt_cex4_card_remove(struct ap_device *ap_dev) { @@ -311,7 +345,7 @@ static struct ap_driver zcrypt_cex4_card_driver = { }; /** - * Probe function for CEX4/CEX5/CEX6 queue device. It always + * Probe function for CEX4/CEX5/CEX6/CEX7 queue device. It always * accepts the AP device since the bus_match already checked * the hardware type. * @ap_dev: pointer to the AP device. @@ -369,7 +403,7 @@ static int zcrypt_cex4_queue_probe(struct ap_device *ap_dev) } /** - * This is called to remove the CEX4/CEX5/CEX6 queue driver + * This is called to remove the CEX4/CEX5/CEX6/CEX7 queue driver * information if an AP queue device is removed. */ static void zcrypt_cex4_queue_remove(struct ap_device *ap_dev) From 2cb549a821e9daf441b62d55ca8eb6a9c22acf95 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Mon, 16 Sep 2019 11:21:23 +0200 Subject: [PATCH 436/721] s390/cpum_sf: Support ioctl PERF_EVENT_IOC_PERIOD A perf_event can be set up to deliver overflow notifications via SIGIO signal. The setup of the event is: 1. create event with perf_event_open() 2. assign it a signal for I/O notification with fcntl() 3. Install signal handler and consume samples The initial setup of perf_event_open() determines the period/frequency time span needed to elapse before each signal is delivered to the user process. While the event is active, system call ioctl(.., PERF_EVENT_IOC_PERIOD, value) can be used the change the frequency/period time span of the active event. The remaining signal handler invocations honour the new value. This does not work on s390. In fact the time span does not change regardless of ioctl's third argument 'value'. The call succeeds but the time span does not change. Support this behavior and make it common with other platforms. This is achieved by changing the interval value of the sampling control block accordingly and feed this new value every time the event is enabled using pmu_event_enable(). Before this change the interval value was set only once at pmu_event_add() and never changed. Signed-off-by: Thomas Richter Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/perf_event.h | 2 + arch/s390/kernel/perf_cpum_sf.c | 165 ++++++++++++++++++++++------- 2 files changed, 127 insertions(+), 40 deletions(-) diff --git a/arch/s390/include/asm/perf_event.h b/arch/s390/include/asm/perf_event.h index 560d8f766ddf..4652ffffe0b2 100644 --- a/arch/s390/include/asm/perf_event.h +++ b/arch/s390/include/asm/perf_event.h @@ -60,6 +60,7 @@ struct perf_sf_sde_regs { #define PERF_CPUM_SF_MODE_MASK (PERF_CPUM_SF_BASIC_MODE| \ PERF_CPUM_SF_DIAG_MODE) #define PERF_CPUM_SF_FULL_BLOCKS 0x0004 /* Process full SDBs only */ +#define PERF_CPUM_SF_FREQ_MODE 0x0008 /* Sampling with frequency */ #define REG_NONE 0 #define REG_OVERFLOW 1 @@ -70,5 +71,6 @@ struct perf_sf_sde_regs { #define SAMPL_FLAGS(hwc) ((hwc)->config_base) #define SAMPL_DIAG_MODE(hwc) (SAMPL_FLAGS(hwc) & PERF_CPUM_SF_DIAG_MODE) #define SDB_FULL_BLOCKS(hwc) (SAMPL_FLAGS(hwc) & PERF_CPUM_SF_FULL_BLOCKS) +#define SAMPLE_FREQ_MODE(hwc) (SAMPL_FLAGS(hwc) & PERF_CPUM_SF_FREQ_MODE) #endif /* _ASM_S390_PERF_EVENT_H */ diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index 292a452cd1f3..544a02e944c6 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -673,13 +673,89 @@ static void cpumsf_output_event_pid(struct perf_event *event, rcu_read_unlock(); } +static unsigned long getrate(bool freq, unsigned long sample, + struct hws_qsi_info_block *si) +{ + unsigned long rate; + + if (freq) { + rate = freq_to_sample_rate(si, sample); + rate = hw_limit_rate(si, rate); + } else { + /* The min/max sampling rates specifies the valid range + * of sample periods. If the specified sample period is + * out of range, limit the period to the range boundary. + */ + rate = hw_limit_rate(si, sample); + + /* The perf core maintains a maximum sample rate that is + * configurable through the sysctl interface. Ensure the + * sampling rate does not exceed this value. This also helps + * to avoid throttling when pushing samples with + * perf_event_overflow(). + */ + if (sample_rate_to_freq(si, rate) > + sysctl_perf_event_sample_rate) { + debug_sprintf_event(sfdbg, 1, + "Sampling rate exceeds maximum " + "perf sample rate\n"); + rate = 0; + } + } + return rate; +} + +/* The sampling information (si) contains information about the + * min/max sampling intervals and the CPU speed. So calculate the + * correct sampling interval and avoid the whole period adjust + * feedback loop. + * + * Since the CPU Measurement sampling facility can not handle frequency + * calculate the sampling interval when frequency is specified using + * this formula: + * interval := cpu_speed * 1000000 / sample_freq + * + * Returns errno on bad input and zero on success with parameter interval + * set to the correct sampling rate. + * + * Note: This function turns off freq bit to avoid calling function + * perf_adjust_period(). This causes frequency adjustment in the common + * code part which causes tremendous variations in the counter values. + */ +static int __hw_perf_event_init_rate(struct perf_event *event, + struct hws_qsi_info_block *si) +{ + struct perf_event_attr *attr = &event->attr; + struct hw_perf_event *hwc = &event->hw; + unsigned long rate; + + if (attr->freq) { + if (!attr->sample_freq) + return -EINVAL; + rate = getrate(attr->freq, attr->sample_freq, si); + attr->freq = 0; /* Don't call perf_adjust_period() */ + SAMPL_FLAGS(hwc) |= PERF_CPUM_SF_FREQ_MODE; + } else { + rate = getrate(attr->freq, attr->sample_period, si); + if (!rate) + return -EINVAL; + } + attr->sample_period = rate; + SAMPL_RATE(hwc) = rate; + hw_init_period(hwc, SAMPL_RATE(hwc)); + debug_sprintf_event(sfdbg, 4, "__hw_perf_event_init_rate:" + "cpu:%d period:%llx freq:%d,%#lx\n", event->cpu, + event->attr.sample_period, event->attr.freq, + SAMPLE_FREQ_MODE(hwc)); + return 0; +} + static int __hw_perf_event_init(struct perf_event *event) { struct cpu_hw_sf *cpuhw; struct hws_qsi_info_block si; struct perf_event_attr *attr = &event->attr; struct hw_perf_event *hwc = &event->hw; - unsigned long rate; int cpu, err; /* Reserve CPU-measurement sampling facility */ @@ -745,43 +821,9 @@ static int __hw_perf_event_init(struct perf_event *event) if (attr->config1 & PERF_CPUM_SF_FULL_BLOCKS) SAMPL_FLAGS(hwc) |= PERF_CPUM_SF_FULL_BLOCKS; - /* The sampling information (si) contains information about the - * min/max sampling intervals and the CPU speed. So calculate the - * correct sampling interval and avoid the whole period adjust - * feedback loop. - */ - rate = 0; - if (attr->freq) { - if (!attr->sample_freq) { - err = -EINVAL; - goto out; - } - rate = freq_to_sample_rate(&si, attr->sample_freq); - rate = hw_limit_rate(&si, rate); - attr->freq = 0; - attr->sample_period = rate; - } else { - /* The min/max sampling rates specifies the valid range - * of sample periods. If the specified sample period is - * out of range, limit the period to the range boundary. - */ - rate = hw_limit_rate(&si, hwc->sample_period); - - /* The perf core maintains a maximum sample rate that is - * configurable through the sysctl interface. Ensure the - * sampling rate does not exceed this value. This also helps - * to avoid throttling when pushing samples with - * perf_event_overflow(). - */ - if (sample_rate_to_freq(&si, rate) > - sysctl_perf_event_sample_rate) { - err = -EINVAL; - debug_sprintf_event(sfdbg, 1, "Sampling rate exceeds maximum perf sample rate\n"); - goto out; - } - } - SAMPL_RATE(hwc) = rate; - hw_init_period(hwc, SAMPL_RATE(hwc)); + err = __hw_perf_event_init_rate(event, &si); + if (err) + goto out; /* Initialize sample data overflow accounting */ hwc->extra_reg.reg = REG_OVERFLOW; @@ -904,6 +946,8 @@ static void cpumsf_pmu_enable(struct pmu *pmu) if (sfb_has_pending_allocs(&cpuhw->sfb, hwc)) extend_sampling_buffer(&cpuhw->sfb, hwc); } + /* Rate may be adjusted with ioctl() */ + cpuhw->lsctl.interval = SAMPL_RATE(&cpuhw->event->hw); } /* (Re)enable the PMU and sampling facility */ @@ -922,8 +966,9 @@ static void cpumsf_pmu_enable(struct pmu *pmu) lpp(&S390_lowcore.lpp); debug_sprintf_event(sfdbg, 6, "pmu_enable: es=%i cs=%i ed=%i cd=%i " - "tear=%p dear=%p\n", cpuhw->lsctl.es, - cpuhw->lsctl.cs, cpuhw->lsctl.ed, cpuhw->lsctl.cd, + "interval:%lx tear=%p dear=%p\n", + cpuhw->lsctl.es, cpuhw->lsctl.cs, cpuhw->lsctl.ed, + cpuhw->lsctl.cd, cpuhw->lsctl.interval, (void *) cpuhw->lsctl.tear, (void *) cpuhw->lsctl.dear); } @@ -1717,6 +1762,44 @@ static void cpumsf_pmu_read(struct perf_event *event) /* Nothing to do ... updates are interrupt-driven */ } +/* Check if the new sampling period/freqeuncy is appropriate. + * + * Return non-zero on error and zero on passed checks. + */ +static int cpumsf_pmu_check_period(struct perf_event *event, u64 value) +{ + struct hws_qsi_info_block si; + unsigned long rate; + bool do_freq; + + memset(&si, 0, sizeof(si)); + if (event->cpu == -1) { + if (qsi(&si)) + return -ENODEV; + } else { + /* Event is pinned to a particular CPU, retrieve the per-CPU + * sampling structure for accessing the CPU-specific QSI. + */ + struct cpu_hw_sf *cpuhw = &per_cpu(cpu_hw_sf, event->cpu); + + si = cpuhw->qsi; + } + + do_freq = !!SAMPLE_FREQ_MODE(&event->hw); + rate = getrate(do_freq, value, &si); + if (!rate) + return -EINVAL; + + event->attr.sample_period = rate; + SAMPL_RATE(&event->hw) = rate; + hw_init_period(&event->hw, SAMPL_RATE(&event->hw)); + debug_sprintf_event(sfdbg, 4, "cpumsf_pmu_check_period:" + "cpu:%d value:%llx period:%llx freq:%d\n", + event->cpu, value, + event->attr.sample_period, do_freq); + return 0; +} + /* Activate sampling control. * Next call of pmu_enable() starts sampling. */ @@ -1908,6 +1991,8 @@ static struct pmu cpumf_sampling = { .setup_aux = aux_buffer_setup, .free_aux = aux_buffer_free, + + .check_period = cpumsf_pmu_check_period, }; static void cpumf_measurement_alert(struct ext_code ext_code, From 93426cadc339939d723a71fa1e2fdbfb97773ef0 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Mon, 16 Sep 2019 11:31:43 +0200 Subject: [PATCH 437/721] s390/cpumf: Remove mixed white space Remove blanks in comment and replace them by tabs. Signed-off-by: Thomas Richter Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/cpu_mf.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/s390/include/asm/cpu_mf.h b/arch/s390/include/asm/cpu_mf.h index ae3e3221d4b5..ceeb552d3472 100644 --- a/arch/s390/include/asm/cpu_mf.h +++ b/arch/s390/include/asm/cpu_mf.h @@ -70,7 +70,7 @@ struct hws_qsi_info_block { /* Bit(s) */ unsigned long tear; /* 24-31: TEAR contents */ unsigned long dear; /* 32-39: DEAR contents */ unsigned int rsvrd0; /* 40-43: reserved */ - unsigned int cpu_speed; /* 44-47: CPU speed */ + unsigned int cpu_speed; /* 44-47: CPU speed */ unsigned long long rsvrd1; /* 48-55: reserved */ unsigned long long rsvrd2; /* 56-63: reserved */ } __packed; @@ -89,10 +89,10 @@ struct hws_lsctl_request_block { unsigned long tear; /* 16-23: TEAR contents */ unsigned long dear; /* 24-31: DEAR contents */ /* 32-63: */ - unsigned long rsvrd1; /* reserved */ - unsigned long rsvrd2; /* reserved */ - unsigned long rsvrd3; /* reserved */ - unsigned long rsvrd4; /* reserved */ + unsigned long rsvrd1; /* reserved */ + unsigned long rsvrd2; /* reserved */ + unsigned long rsvrd3; /* reserved */ + unsigned long rsvrd4; /* reserved */ } __packed; struct hws_basic_entry { From 44460efe44e05eae2f21e57d06d542bbbb792e65 Mon Sep 17 00:00:00 2001 From: Youquan Song Date: Sat, 14 Sep 2019 12:45:43 -0700 Subject: [PATCH 438/721] tools/power/x86/intel-speed-select: Fix high priority core mask over count If the CPU package has the less logical CPU than topo_max_cpus, but un-present CPU's punit_cpu_core will be initiated to 0 and they will be count to core 0 Like below, there are only 10 high priority cores (20 logical CPUs) in the CPU package, but it count to 27 logic CPUs. ./intel-speed-select base-freq info -l 0 | grep mask high-priority-cpu-mask:7f000179,f000179f With the fix patch: ./intel-speed-select base-freq info -l 0 high-priority-cpu-mask:00000179,f000179f Signed-off-by: Youquan Song Signed-off-by: Srinivas Pandruvada Signed-off-by: Andy Shevchenko --- tools/power/x86/intel-speed-select/isst-config.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/power/x86/intel-speed-select/isst-config.c b/tools/power/x86/intel-speed-select/isst-config.c index 59753b3917bb..83ac72902b36 100644 --- a/tools/power/x86/intel-speed-select/isst-config.c +++ b/tools/power/x86/intel-speed-select/isst-config.c @@ -402,6 +402,9 @@ void set_cpu_mask_from_punit_coremask(int cpu, unsigned long long core_mask, int j; for (j = 0; j < topo_max_cpus; ++j) { + if (!CPU_ISSET_S(j, present_cpumask_size, present_cpumask)) + continue; + if (cpu_map[j].pkg_id == pkg_id && cpu_map[j].die_id == die_id && cpu_map[j].punit_cpu_core == i) { From 3c64c81ad1f06823b603e9143193dcb4f3121a2f Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Sat, 14 Sep 2019 12:45:44 -0700 Subject: [PATCH 439/721] tools/power/x86/intel-speed-select: Allow online/offline based on tdp Using enable core mask, do online offline CPUs. There is a new option --online|-o for set-config-level. Signed-off-by: Srinivas Pandruvada Signed-off-by: Andy Shevchenko --- .../x86/intel-speed-select/isst-config.c | 60 +++++++++++++++++-- tools/power/x86/intel-speed-select/isst.h | 2 + 2 files changed, 58 insertions(+), 4 deletions(-) diff --git a/tools/power/x86/intel-speed-select/isst-config.c b/tools/power/x86/intel-speed-select/isst-config.c index 83ac72902b36..e505aaf2beef 100644 --- a/tools/power/x86/intel-speed-select/isst-config.c +++ b/tools/power/x86/intel-speed-select/isst-config.c @@ -38,6 +38,7 @@ static int fact_avx = 0xFF; static unsigned long long fact_trl; static int out_format_json; static int cmd_help; +static int force_online_offline; /* clos related */ static int current_clos = -1; @@ -138,14 +139,14 @@ int out_format_is_json(void) int get_physical_package_id(int cpu) { return parse_int_file( - 1, "/sys/devices/system/cpu/cpu%d/topology/physical_package_id", + 0, "/sys/devices/system/cpu/cpu%d/topology/physical_package_id", cpu); } int get_physical_core_id(int cpu) { return parse_int_file( - 1, "/sys/devices/system/cpu/cpu%d/topology/core_id", cpu); + 0, "/sys/devices/system/cpu/cpu%d/topology/core_id", cpu); } int get_physical_die_id(int cpu) @@ -165,6 +166,26 @@ int get_topo_max_cpus(void) return topo_max_cpus; } +static void set_cpu_online_offline(int cpu, int state) +{ + char buffer[128]; + int fd; + + snprintf(buffer, sizeof(buffer), + "/sys/devices/system/cpu/cpu%d/online", cpu); + + fd = open(buffer, O_WRONLY); + if (fd < 0) + err(-1, "%s open failed", buffer); + + if (state) + write(fd, "1\n", 2); + else + write(fd, "0\n", 2); + + close(fd); +} + #define MAX_PACKAGE_COUNT 8 #define MAX_DIE_PER_PACKAGE 2 static void for_each_online_package_in_set(void (*callback)(int, void *, void *, @@ -736,9 +757,34 @@ static void set_tdp_level_for_cpu(int cpu, void *arg1, void *arg2, void *arg3, ret = isst_set_tdp_level(cpu, tdp_level); if (ret) perror("set_tdp_level_for_cpu"); - else + else { isst_display_result(cpu, outf, "perf-profile", "set_tdp_level", ret); + if (force_online_offline) { + struct isst_pkg_ctdp_level_info ctdp_level; + int pkg_id = get_physical_package_id(cpu); + int die_id = get_physical_die_id(cpu); + + fprintf(stderr, "Option is set to online/offline\n"); + ctdp_level.core_cpumask_size = + alloc_cpu_set(&ctdp_level.core_cpumask); + isst_get_coremask_info(cpu, tdp_level, &ctdp_level); + if (ctdp_level.cpu_count) { + int i, max_cpus = get_topo_max_cpus(); + for (i = 0; i < max_cpus; ++i) { + if (pkg_id != get_physical_package_id(i) || die_id != get_physical_die_id(i)) + continue; + if (CPU_ISSET_S(i, ctdp_level.core_cpumask_size, ctdp_level.core_cpumask)) { + fprintf(stderr, "online cpu %d\n", i); + set_cpu_online_offline(i, 1); + } else { + fprintf(stderr, "offline cpu %d\n", i); + set_cpu_online_offline(i, 0); + } + } + } + } + } } static void set_tdp_level(void) @@ -747,6 +793,8 @@ static void set_tdp_level(void) fprintf(stderr, "Set Config TDP level\n"); fprintf(stderr, "\t Arguments: -l|--level : Specify tdp level\n"); + fprintf(stderr, + "\t Optional Arguments: -o | online : online/offline for the tdp level\n"); exit(0); } @@ -1319,6 +1367,7 @@ static void parse_cmd_args(int argc, int start, char **argv) static struct option long_options[] = { { "bucket", required_argument, 0, 'b' }, { "level", required_argument, 0, 'l' }, + { "online", required_argument, 0, 'o' }, { "trl-type", required_argument, 0, 'r' }, { "trl", required_argument, 0, 't' }, { "help", no_argument, 0, 'h' }, @@ -1335,7 +1384,7 @@ static void parse_cmd_args(int argc, int start, char **argv) option_index = start; optind = start + 1; - while ((opt = getopt_long(argc, argv, "b:l:t:c:d:e:n:m:p:w:h", + while ((opt = getopt_long(argc, argv, "b:l:t:c:d:e:n:m:p:w:ho", long_options, &option_index)) != -1) { switch (opt) { case 'b': @@ -1347,6 +1396,9 @@ static void parse_cmd_args(int argc, int start, char **argv) case 'l': tdp_level = atoi(optarg); break; + case 'o': + force_online_offline = 1; + break; case 't': sscanf(optarg, "0x%llx", &fact_trl); break; diff --git a/tools/power/x86/intel-speed-select/isst.h b/tools/power/x86/intel-speed-select/isst.h index 2f7f62765eb6..668f914d077f 100644 --- a/tools/power/x86/intel-speed-select/isst.h +++ b/tools/power/x86/intel-speed-select/isst.h @@ -187,6 +187,8 @@ extern int isst_send_msr_command(unsigned int cpu, unsigned int command, int write, unsigned long long *req_resp); extern int isst_get_ctdp_levels(int cpu, struct isst_pkg_ctdp *pkg_dev); +extern int isst_get_coremask_info(int cpu, int config_index, + struct isst_pkg_ctdp_level_info *ctdp_level); extern int isst_get_process_ctdp(int cpu, int tdp_level, struct isst_pkg_ctdp *pkg_dev); extern void isst_get_process_ctdp_complete(int cpu, From e118fbe366d817175b2c47bfa338959bbff5bd37 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Sat, 14 Sep 2019 12:45:45 -0700 Subject: [PATCH 440/721] tools/power/x86/intel-speed-select: Format get-assoc information Format the get-assoc command output consistant with other commands. For example: Intel(R) Speed Select Technology Executing on CPU model:142[0x8e] package-0 die-0 cpu-0 get-assoc clos:0 Signed-off-by: Srinivas Pandruvada Signed-off-by: Andy Shevchenko --- .../x86/intel-speed-select/isst-config.c | 14 +++++++---- .../x86/intel-speed-select/isst-display.c | 23 +++++++++++++++++++ tools/power/x86/intel-speed-select/isst.h | 2 +- 3 files changed, 33 insertions(+), 6 deletions(-) diff --git a/tools/power/x86/intel-speed-select/isst-config.c b/tools/power/x86/intel-speed-select/isst-config.c index e505aaf2beef..c2892d86be36 100644 --- a/tools/power/x86/intel-speed-select/isst-config.c +++ b/tools/power/x86/intel-speed-select/isst-config.c @@ -1249,7 +1249,7 @@ static void get_clos_assoc_for_cpu(int cpu, void *arg1, void *arg2, void *arg3, if (ret) perror("isst_clos_get_assoc_status"); else - isst_display_result(cpu, outf, "core-power", "get-assoc", clos); + isst_clos_display_assoc_information(cpu, outf, clos); } static void get_clos_assoc(void) @@ -1259,13 +1259,17 @@ static void get_clos_assoc(void) fprintf(stderr, "\tSpecify targeted cpu id with [--cpu|-c]\n"); exit(0); } - if (max_target_cpus) - for_each_online_target_cpu_in_set(get_clos_assoc_for_cpu, NULL, - NULL, NULL, NULL); - else { + + if (!max_target_cpus) { fprintf(stderr, "Invalid target cpu. Specify with [-c|--cpu]\n"); + exit(0); } + + isst_ctdp_display_information_start(outf); + for_each_online_target_cpu_in_set(get_clos_assoc_for_cpu, NULL, + NULL, NULL, NULL); + isst_ctdp_display_information_end(outf); } static struct process_cmd_struct isst_cmds[] = { diff --git a/tools/power/x86/intel-speed-select/isst-display.c b/tools/power/x86/intel-speed-select/isst-display.c index df4aa99c4e92..bd7aaf27e4de 100644 --- a/tools/power/x86/intel-speed-select/isst-display.c +++ b/tools/power/x86/intel-speed-select/isst-display.c @@ -503,6 +503,29 @@ void isst_clos_display_information(int cpu, FILE *outf, int clos, format_and_print(outf, 1, NULL, NULL); } +void isst_clos_display_assoc_information(int cpu, FILE *outf, int clos) +{ + char header[256]; + char value[256]; + + snprintf(header, sizeof(header), "package-%d", + get_physical_package_id(cpu)); + format_and_print(outf, 1, header, NULL); + snprintf(header, sizeof(header), "die-%d", get_physical_die_id(cpu)); + format_and_print(outf, 2, header, NULL); + snprintf(header, sizeof(header), "cpu-%d", cpu); + format_and_print(outf, 3, header, NULL); + + snprintf(header, sizeof(header), "get-assoc"); + format_and_print(outf, 4, header, NULL); + + snprintf(header, sizeof(header), "clos"); + snprintf(value, sizeof(value), "%d", clos); + format_and_print(outf, 5, header, value); + + format_and_print(outf, 1, NULL, NULL); +} + void isst_display_result(int cpu, FILE *outf, char *feature, char *cmd, int result) { diff --git a/tools/power/x86/intel-speed-select/isst.h b/tools/power/x86/intel-speed-select/isst.h index 668f914d077f..48655d0dee2d 100644 --- a/tools/power/x86/intel-speed-select/isst.h +++ b/tools/power/x86/intel-speed-select/isst.h @@ -225,7 +225,7 @@ extern int isst_clos_associate(int cpu, int clos); extern int isst_clos_get_assoc_status(int cpu, int *clos_id); extern void isst_clos_display_information(int cpu, FILE *outf, int clos, struct isst_clos_config *clos_config); - +extern void isst_clos_display_assoc_information(int cpu, FILE *outf, int clos); extern int isst_read_reg(unsigned short reg, unsigned int *val); extern int isst_write_reg(int reg, unsigned int val); From d2d1f304dc965e6a06e7f105b09bffceb477fccc Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Sat, 14 Sep 2019 12:45:46 -0700 Subject: [PATCH 441/721] tools/power/x86/intel-speed-select: Fix some debug prints Fix wrong debug print for cpu, which is displayed as CLOS. Also avoid printing clos id, when user is specify clos as parameter. Signed-off-by: Srinivas Pandruvada Signed-off-by: Andy Shevchenko --- tools/power/x86/intel-speed-select/isst-config.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/power/x86/intel-speed-select/isst-config.c b/tools/power/x86/intel-speed-select/isst-config.c index c2892d86be36..889396d676cb 100644 --- a/tools/power/x86/intel-speed-select/isst-config.c +++ b/tools/power/x86/intel-speed-select/isst-config.c @@ -508,7 +508,7 @@ int isst_send_mbox_command(unsigned int cpu, unsigned char command, int write = 0; int clos_id, core_id, ret = 0; - debug_printf("CLOS %d\n", cpu); + debug_printf("CPU %d\n", cpu); if (parameter & BIT(MBOX_CMD_WRITE_BIT)) { value = req_data; @@ -1421,7 +1421,6 @@ static void parse_cmd_args(int argc, int start, char **argv) /* CLOS related */ case 'c': current_clos = atoi(optarg); - printf("clos %d\n", current_clos); break; case 'd': clos_desired = atoi(optarg); From 188afed9db7db3aefc8c9c33150be2f8f398da9a Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Sat, 14 Sep 2019 12:45:47 -0700 Subject: [PATCH 442/721] tools/power/x86/intel-speed-select: Extend core-power command set Add additional command to get the clos enable and priority type. The current info option is actually dumping per clos QOS config, so name the command appropriately to get-config. Signed-off-by: Srinivas Pandruvada Signed-off-by: Andy Shevchenko --- .../x86/intel-speed-select/isst-config.c | 38 ++++++++++++++++++- .../power/x86/intel-speed-select/isst-core.c | 25 ++++++++++++ .../x86/intel-speed-select/isst-display.c | 28 ++++++++++++++ tools/power/x86/intel-speed-select/isst.h | 5 +++ 4 files changed, 95 insertions(+), 1 deletion(-) diff --git a/tools/power/x86/intel-speed-select/isst-config.c b/tools/power/x86/intel-speed-select/isst-config.c index 889396d676cb..6a54e165672d 100644 --- a/tools/power/x86/intel-speed-select/isst-config.c +++ b/tools/power/x86/intel-speed-select/isst-config.c @@ -1133,6 +1133,40 @@ static void dump_clos_config(void) isst_ctdp_display_information_end(outf); } +static void get_clos_info_for_cpu(int cpu, void *arg1, void *arg2, void *arg3, + void *arg4) +{ + int enable, ret, prio_type; + + ret = isst_clos_get_clos_information(cpu, &enable, &prio_type); + if (ret) + perror("isst_clos_get_info"); + else + isst_clos_display_clos_information(cpu, outf, enable, prio_type); +} + +static void dump_clos_info(void) +{ + if (cmd_help) { + fprintf(stderr, + "Print Intel Speed Select Technology core power information\n"); + fprintf(stderr, "\tSpecify targeted cpu id with [--cpu|-c]\n"); + exit(0); + } + + if (!max_target_cpus) { + fprintf(stderr, + "Invalid target cpu. Specify with [-c|--cpu]\n"); + exit(0); + } + + isst_ctdp_display_information_start(outf); + for_each_online_target_cpu_in_set(get_clos_info_for_cpu, NULL, + NULL, NULL, NULL); + isst_ctdp_display_information_end(outf); + +} + static void set_clos_config_for_cpu(int cpu, void *arg1, void *arg2, void *arg3, void *arg4) { @@ -1286,10 +1320,11 @@ static struct process_cmd_struct isst_cmds[] = { { "turbo-freq", "info", dump_fact_config }, { "turbo-freq", "enable", set_fact_enable }, { "turbo-freq", "disable", set_fact_disable }, - { "core-power", "info", dump_clos_config }, + { "core-power", "info", dump_clos_info }, { "core-power", "enable", set_clos_enable }, { "core-power", "disable", set_clos_disable }, { "core-power", "config", set_clos_config }, + { "core-power", "get-config", dump_clos_config }, { "core-power", "assoc", set_clos_assoc }, { "core-power", "get-assoc", get_clos_assoc }, { NULL, NULL, NULL } @@ -1491,6 +1526,7 @@ static void core_power_help(void) printf("\tenable\n"); printf("\tdisable\n"); printf("\tconfig\n"); + printf("\tget-config\n"); printf("\tassoc\n"); printf("\tget-assoc\n"); } diff --git a/tools/power/x86/intel-speed-select/isst-core.c b/tools/power/x86/intel-speed-select/isst-core.c index 0bf341ad9697..6dee5332c9d3 100644 --- a/tools/power/x86/intel-speed-select/isst-core.c +++ b/tools/power/x86/intel-speed-select/isst-core.c @@ -619,6 +619,31 @@ int isst_get_process_ctdp(int cpu, int tdp_level, struct isst_pkg_ctdp *pkg_dev) return 0; } +int isst_clos_get_clos_information(int cpu, int *enable, int *type) +{ + unsigned int resp; + int ret; + + ret = isst_send_mbox_command(cpu, CONFIG_CLOS, CLOS_PM_QOS_CONFIG, 0, 0, + &resp); + if (ret) + return ret; + + debug_printf("cpu:%d CLOS_PM_QOS_CONFIG resp:%x\n", cpu, resp); + + if (resp & BIT(1)) + *enable = 1; + else + *enable = 0; + + if (resp & BIT(2)) + *type = 1; + else + *type = 0; + + return 0; +} + int isst_pm_qos_config(int cpu, int enable_clos, int priority_type) { unsigned int req, resp; diff --git a/tools/power/x86/intel-speed-select/isst-display.c b/tools/power/x86/intel-speed-select/isst-display.c index bd7aaf27e4de..2e6e5fcdbd7c 100644 --- a/tools/power/x86/intel-speed-select/isst-display.c +++ b/tools/power/x86/intel-speed-select/isst-display.c @@ -503,6 +503,34 @@ void isst_clos_display_information(int cpu, FILE *outf, int clos, format_and_print(outf, 1, NULL, NULL); } +void isst_clos_display_clos_information(int cpu, FILE *outf, + int clos_enable, int type) +{ + char header[256]; + char value[256]; + + snprintf(header, sizeof(header), "package-%d", + get_physical_package_id(cpu)); + format_and_print(outf, 1, header, NULL); + snprintf(header, sizeof(header), "die-%d", get_physical_die_id(cpu)); + format_and_print(outf, 2, header, NULL); + snprintf(header, sizeof(header), "cpu-%d", cpu); + format_and_print(outf, 3, header, NULL); + + snprintf(header, sizeof(header), "core-power"); + format_and_print(outf, 4, header, NULL); + + snprintf(header, sizeof(header), "enable-status"); + snprintf(value, sizeof(value), "%d", clos_enable); + format_and_print(outf, 5, header, value); + + snprintf(header, sizeof(header), "priority-type"); + snprintf(value, sizeof(value), "%d", type); + format_and_print(outf, 5, header, value); + + format_and_print(outf, 1, NULL, NULL); +} + void isst_clos_display_assoc_information(int cpu, FILE *outf, int clos) { char header[256]; diff --git a/tools/power/x86/intel-speed-select/isst.h b/tools/power/x86/intel-speed-select/isst.h index 48655d0dee2d..09e16a41b57c 100644 --- a/tools/power/x86/intel-speed-select/isst.h +++ b/tools/power/x86/intel-speed-select/isst.h @@ -231,4 +231,9 @@ extern int isst_write_reg(int reg, unsigned int val); extern void isst_display_result(int cpu, FILE *outf, char *feature, char *cmd, int result); + +extern int isst_clos_get_clos_information(int cpu, int *enable, int *type); +extern void isst_clos_display_clos_information(int cpu, FILE *outf, + int clos_enable, int type); + #endif From b3abfd778bf1dbdd96f70bd7d00671d027f67c62 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Tue, 17 Sep 2019 17:52:37 -0700 Subject: [PATCH 443/721] tools/power/x86/intel-speed-select: Fix perf-profile command output commit "c016ae8f9fa04d361efc8629de49ad3af12b5262 "tools/power/x86/intel-speed-select: Output success/failed for command output" introduced a regression in perf-profile outputs. With this the result field is changed to string interpreting every non zero value as errors. But these commands display on zero (>0) result. For example before this commit the display was: package-1 die-0 cpu-14 get-config-levels:4 Here the get-config-levels is interpreted as error and displayed as error with the above commit: package-1 die-0 cpu-14 get-config-levels:failed(error 4) Fix this issue by not using isst_display_result() to display such results, but define a new function which formats this data and prints. Signed-off-by: Srinivas Pandruvada Signed-off-by: Andy Shevchenko --- .../x86/intel-speed-select/isst-config.c | 4 ++-- .../x86/intel-speed-select/isst-display.c | 20 +++++++++++++++++++ tools/power/x86/intel-speed-select/isst.h | 3 ++- 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/tools/power/x86/intel-speed-select/isst-config.c b/tools/power/x86/intel-speed-select/isst-config.c index 6a54e165672d..2a9890c8395a 100644 --- a/tools/power/x86/intel-speed-select/isst-config.c +++ b/tools/power/x86/intel-speed-select/isst-config.c @@ -673,8 +673,8 @@ static void exec_on_get_ctdp_cpu(int cpu, void *arg1, void *arg2, void *arg3, if (ret) perror("get_tdp_*"); else - isst_display_result(cpu, outf, "perf-profile", (char *)arg3, - *(unsigned int *)arg4); + isst_ctdp_display_core_info(cpu, outf, arg3, + *(unsigned int *)arg4); } #define _get_tdp_level(desc, suffix, object, help) \ diff --git a/tools/power/x86/intel-speed-select/isst-display.c b/tools/power/x86/intel-speed-select/isst-display.c index 2e6e5fcdbd7c..40346d534f78 100644 --- a/tools/power/x86/intel-speed-select/isst-display.c +++ b/tools/power/x86/intel-speed-select/isst-display.c @@ -287,6 +287,26 @@ static void _isst_fact_display_information(int cpu, FILE *outf, int level, format_and_print(outf, base_level + 2, header, value); } +void isst_ctdp_display_core_info(int cpu, FILE *outf, char *prefix, + unsigned int val) +{ + char header[256]; + char value[256]; + + snprintf(header, sizeof(header), "package-%d", + get_physical_package_id(cpu)); + format_and_print(outf, 1, header, NULL); + snprintf(header, sizeof(header), "die-%d", get_physical_die_id(cpu)); + format_and_print(outf, 2, header, NULL); + snprintf(header, sizeof(header), "cpu-%d", cpu); + format_and_print(outf, 3, header, NULL); + + snprintf(value, sizeof(value), "%u", val); + format_and_print(outf, 4, prefix, value); + + format_and_print(outf, 1, NULL, NULL); +} + void isst_ctdp_display_information(int cpu, FILE *outf, int tdp_level, struct isst_pkg_ctdp *pkg_dev) { diff --git a/tools/power/x86/intel-speed-select/isst.h b/tools/power/x86/intel-speed-select/isst.h index 09e16a41b57c..d280b27d600d 100644 --- a/tools/power/x86/intel-speed-select/isst.h +++ b/tools/power/x86/intel-speed-select/isst.h @@ -195,6 +195,8 @@ extern void isst_get_process_ctdp_complete(int cpu, struct isst_pkg_ctdp *pkg_dev); extern void isst_ctdp_display_information(int cpu, FILE *outf, int tdp_level, struct isst_pkg_ctdp *pkg_dev); +extern void isst_ctdp_display_core_info(int cpu, FILE *outf, char *prefix, + unsigned int val); extern void isst_ctdp_display_information_start(FILE *outf); extern void isst_ctdp_display_information_end(FILE *outf); extern void isst_pbf_display_information(int cpu, FILE *outf, int level, @@ -235,5 +237,4 @@ extern void isst_display_result(int cpu, FILE *outf, char *feature, char *cmd, extern int isst_clos_get_clos_information(int cpu, int *enable, int *type); extern void isst_clos_display_clos_information(int cpu, FILE *outf, int clos_enable, int type); - #endif From d7f76f36a8b4dc8eff0c22819e4a5d55b0dee62a Mon Sep 17 00:00:00 2001 From: Nishka Dasgupta Date: Thu, 15 Aug 2019 11:30:14 +0530 Subject: [PATCH 444/721] ata: libahci_platform: Add of_node_put() before loop exit Each iteration of for_each_child_of_node puts the previous node, but in the case of a goto from the middle of the loop, there is no put, thus causing a memory leak. Add an of_node_put before three such goto statements. Issue found with Coccinelle. Reviewed-by: Hans de Goede Signed-off-by: Nishka Dasgupta Signed-off-by: Jens Axboe --- drivers/ata/libahci_platform.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/ata/libahci_platform.c b/drivers/ata/libahci_platform.c index 9e9583a6bba9..e742780950de 100644 --- a/drivers/ata/libahci_platform.c +++ b/drivers/ata/libahci_platform.c @@ -497,6 +497,7 @@ struct ahci_host_priv *ahci_platform_get_resources(struct platform_device *pdev, if (of_property_read_u32(child, "reg", &port)) { rc = -EINVAL; + of_node_put(child); goto err_out; } @@ -514,14 +515,18 @@ struct ahci_host_priv *ahci_platform_get_resources(struct platform_device *pdev, if (port_dev) { rc = ahci_platform_get_regulator(hpriv, port, &port_dev->dev); - if (rc == -EPROBE_DEFER) + if (rc == -EPROBE_DEFER) { + of_node_put(child); goto err_out; + } } #endif rc = ahci_platform_get_phy(hpriv, port, dev, child); - if (rc) + if (rc) { + of_node_put(child); goto err_out; + } enabled_ports++; } From 6fe7b9901400152238e1b76198747f6716c78aad Mon Sep 17 00:00:00 2001 From: Matthew Bobrowski Date: Thu, 19 Sep 2019 15:32:44 -0700 Subject: [PATCH 445/721] iomap: split size and error for iomap_dio_rw ->end_io Modify the calling convention for the iomap_dio_rw ->end_io() callback. Rather than passing either dio->error or dio->size as the 'size' argument, instead pass both the dio->error and the dio->size value separately. In the instance that an error occurred during a write, we currently cannot determine whether any blocks have been allocated beyond the current EOF and data has subsequently been written to these blocks within the ->end_io() callback. As a result, we cannot judge whether we should take the truncate failed write path. Having both dio->error and dio->size will allow us to perform such checks within this callback. Signed-off-by: Matthew Bobrowski [hch: minor cleanups] Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Reviewed-by: Matthew Wilcox (Oracle) --- fs/iomap/direct-io.c | 9 +++------ fs/xfs/xfs_file.c | 8 +++++--- include/linux/iomap.h | 4 ++-- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 10517cea9682..2ccf1c6460d4 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -77,13 +77,10 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio) loff_t offset = iocb->ki_pos; ssize_t ret; - if (dio->end_io) { - ret = dio->end_io(iocb, - dio->error ? dio->error : dio->size, - dio->flags); - } else { + if (dio->end_io) + ret = dio->end_io(iocb, dio->size, dio->error, dio->flags); + else ret = dio->error; - } if (likely(!ret)) { ret = dio->size; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 28101bbc0b78..74411296f6b5 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -369,21 +369,23 @@ static int xfs_dio_write_end_io( struct kiocb *iocb, ssize_t size, + int error, unsigned flags) { struct inode *inode = file_inode(iocb->ki_filp); struct xfs_inode *ip = XFS_I(inode); loff_t offset = iocb->ki_pos; unsigned int nofs_flag; - int error = 0; trace_xfs_end_io_direct_write(ip, offset, size); if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; - if (size <= 0) - return size; + if (error) + return error; + if (!size) + return 0; /* * Capture amount written on completion as we can't reliably account diff --git a/include/linux/iomap.h b/include/linux/iomap.h index bc499ceae392..50bb746d2216 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -188,8 +188,8 @@ sector_t iomap_bmap(struct address_space *mapping, sector_t bno, */ #define IOMAP_DIO_UNWRITTEN (1 << 0) /* covers unwritten extent(s) */ #define IOMAP_DIO_COW (1 << 1) /* covers COW extent(s) */ -typedef int (iomap_dio_end_io_t)(struct kiocb *iocb, ssize_t ret, - unsigned flags); +typedef int (iomap_dio_end_io_t)(struct kiocb *iocb, ssize_t size, int error, + unsigned int flags); ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, iomap_dio_end_io_t end_io); int iomap_dio_iopoll(struct kiocb *kiocb, bool spin); From 838c4f3d7515efe9d0e32c846fb5d102b6d8a29d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 19 Sep 2019 15:32:45 -0700 Subject: [PATCH 446/721] iomap: move the iomap_dio_rw ->end_io callback into a structure Add a new iomap_dio_ops structure that for now just contains the end_io handler. This avoid storing the function pointer in a mutable structure, which is a possible exploit vector for kernel code execution, and prepares for adding a submit_io handler that btrfs needs. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/iomap/direct-io.c | 21 ++++++++++----------- fs/xfs/xfs_file.c | 6 +++++- include/linux/iomap.h | 10 +++++++--- 3 files changed, 22 insertions(+), 15 deletions(-) diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 2ccf1c6460d4..1fc28c2da279 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -24,7 +24,7 @@ struct iomap_dio { struct kiocb *iocb; - iomap_dio_end_io_t *end_io; + const struct iomap_dio_ops *dops; loff_t i_size; loff_t size; atomic_t ref; @@ -72,15 +72,14 @@ static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap, static ssize_t iomap_dio_complete(struct iomap_dio *dio) { + const struct iomap_dio_ops *dops = dio->dops; struct kiocb *iocb = dio->iocb; struct inode *inode = file_inode(iocb->ki_filp); loff_t offset = iocb->ki_pos; - ssize_t ret; + ssize_t ret = dio->error; - if (dio->end_io) - ret = dio->end_io(iocb, dio->size, dio->error, dio->flags); - else - ret = dio->error; + if (dops && dops->end_io) + ret = dops->end_io(iocb, dio->size, ret, dio->flags); if (likely(!ret)) { ret = dio->size; @@ -98,9 +97,9 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio) * one is a pretty crazy thing to do, so we don't support it 100%. If * this invalidation fails, tough, the write still worked... * - * And this page cache invalidation has to be after dio->end_io(), as - * some filesystems convert unwritten extents to real allocations in - * end_io() when necessary, otherwise a racing buffer read would cache + * And this page cache invalidation has to be after ->end_io(), as some + * filesystems convert unwritten extents to real allocations in + * ->end_io() when necessary, otherwise a racing buffer read would cache * zeros from unwritten extents. */ if (!dio->error && @@ -393,7 +392,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, */ ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, - const struct iomap_ops *ops, iomap_dio_end_io_t end_io) + const struct iomap_ops *ops, const struct iomap_dio_ops *dops) { struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = file_inode(iocb->ki_filp); @@ -418,7 +417,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, atomic_set(&dio->ref, 1); dio->size = 0; dio->i_size = i_size_read(inode); - dio->end_io = end_io; + dio->dops = dops; dio->error = 0; dio->flags = 0; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 74411296f6b5..21bd3d575aaa 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -442,6 +442,10 @@ xfs_dio_write_end_io( return error; } +static const struct iomap_dio_ops xfs_dio_write_ops = { + .end_io = xfs_dio_write_end_io, +}; + /* * xfs_file_dio_aio_write - handle direct IO writes * @@ -542,7 +546,7 @@ xfs_file_dio_aio_write( } trace_xfs_file_direct_write(ip, count, iocb->ki_pos); - ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io); + ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, &xfs_dio_write_ops); /* * If unaligned, this is the only IO in-flight. If it has not yet diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 50bb746d2216..7aa5d6117936 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -188,10 +188,14 @@ sector_t iomap_bmap(struct address_space *mapping, sector_t bno, */ #define IOMAP_DIO_UNWRITTEN (1 << 0) /* covers unwritten extent(s) */ #define IOMAP_DIO_COW (1 << 1) /* covers COW extent(s) */ -typedef int (iomap_dio_end_io_t)(struct kiocb *iocb, ssize_t size, int error, - unsigned int flags); + +struct iomap_dio_ops { + int (*end_io)(struct kiocb *iocb, ssize_t size, int error, + unsigned flags); +}; + ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, - const struct iomap_ops *ops, iomap_dio_end_io_t end_io); + const struct iomap_ops *ops, const struct iomap_dio_ops *dops); int iomap_dio_iopoll(struct kiocb *kiocb, bool spin); #ifdef CONFIG_SWAP From d2c63b7dfd06788a466d5ec8a850491f084c5fc2 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 20 Sep 2019 09:30:40 +0200 Subject: [PATCH 447/721] ALSA: hda - Apply AMD controller workaround for Raven platform It's reported that the garbled sound on HP Envy x360 13z-ag000 (Ryzen Laptop) is fixed by the same workaround applied to other AMD chips. Update the driver_data entry for Raven (1022:15e3) to use the newly introduced preset, AZX_DCAPS_PRESET_AMD_SB. Since it already contains AZX_DCAPS_PM_RUNTIME, we can drop that bit, too. Reported-and-tested-by: Dennis Padiernos Cc: Link: https://lore.kernel.org/r/20190920073040.31764-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/pci/hda/hda_intel.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index 91e71be42fa4..240f4ca76391 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -2485,8 +2485,7 @@ static const struct pci_device_id azx_ids[] = { AZX_DCAPS_PM_RUNTIME }, /* AMD Raven */ { PCI_DEVICE(0x1022, 0x15e3), - .driver_data = AZX_DRIVER_GENERIC | AZX_DCAPS_PRESET_ATI_SB | - AZX_DCAPS_PM_RUNTIME }, + .driver_data = AZX_DRIVER_GENERIC | AZX_DCAPS_PRESET_AMD_SB }, /* ATI HDMI */ { PCI_DEVICE(0x1002, 0x0002), .driver_data = AZX_DRIVER_ATIHDMI_NS | AZX_DCAPS_PRESET_ATI_HDMI_NS }, From bd9c10bc663dd2eaac8fe39dad0f18cd21527446 Mon Sep 17 00:00:00 2001 From: Jan-Marek Glogowski Date: Sun, 15 Sep 2019 16:57:28 +0200 Subject: [PATCH 448/721] ALSA: hda/realtek - PCI quirk for Medion E4254 The laptop has a combined jack to attach headsets on the right. The BIOS encodes them as two different colored jacks at the front, but otherwise it seems to be configured ok. But any adaption of the pins config on its own doesn't fix the jack detection to work in Linux. Still Windows works correct. This is somehow fixed by chaining ALC256_FIXUP_ASUS_HEADSET_MODE, which seems to register the microphone jack as a headset part and also results in fixing jack sensing, visible in dmesg as: -snd_hda_codec_realtek hdaudioC0D0: Mic=0x19 +snd_hda_codec_realtek hdaudioC0D0: Headset Mic=0x19 [ Actually the essential change is the location of the jack; the driver created "Front Mic Jack" without the matching volume / mute control element due to its jack location, which confused PA. -- tiwai ] Signed-off-by: Jan-Marek Glogowski Cc: Link: https://lore.kernel.org/r/8f4f9b20-0aeb-f8f1-c02f-fd53c09679f1@fbihome.de Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index e27ef434b60d..b000b36ac3c6 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -5872,6 +5872,7 @@ enum { ALC256_FIXUP_ASUS_MIC_NO_PRESENCE, ALC299_FIXUP_PREDATOR_SPK, ALC294_FIXUP_ASUS_INTSPK_HEADSET_MIC, + ALC256_FIXUP_MEDION_HEADSET_NO_PRESENCE, }; static const struct hda_fixup alc269_fixups[] = { @@ -6937,6 +6938,16 @@ static const struct hda_fixup alc269_fixups[] = { .chained = true, .chain_id = ALC269_FIXUP_HEADSET_MODE_NO_HP_MIC }, + [ALC256_FIXUP_MEDION_HEADSET_NO_PRESENCE] = { + .type = HDA_FIXUP_PINS, + .v.pins = (const struct hda_pintbl[]) { + { 0x19, 0x04a11040 }, + { 0x21, 0x04211020 }, + { } + }, + .chained = true, + .chain_id = ALC256_FIXUP_ASUS_HEADSET_MODE + }, }; static const struct snd_pci_quirk alc269_fixup_tbl[] = { @@ -7200,6 +7211,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x9e54, "LENOVO NB", ALC269_FIXUP_LENOVO_EAPD), SND_PCI_QUIRK(0x19e5, 0x3204, "Huawei MACH-WX9", ALC256_FIXUP_HUAWEI_MACH_WX9_PINS), SND_PCI_QUIRK(0x1b7d, 0xa831, "Ordissimo EVE2 ", ALC269VB_FIXUP_ORDISSIMO_EVE2), /* Also known as Malata PC-B1303 */ + SND_PCI_QUIRK(0x10ec, 0x118c, "Medion EE4254 MD62100", ALC256_FIXUP_MEDION_HEADSET_NO_PRESENCE), #if 0 /* Below is a quirk table taken from the old code. @@ -7368,6 +7380,7 @@ static const struct hda_model_fixup alc269_fixup_models[] = { {.id = ALC295_FIXUP_CHROME_BOOK, .name = "alc-chrome-book"}, {.id = ALC299_FIXUP_PREDATOR_SPK, .name = "predator-spk"}, {.id = ALC298_FIXUP_HUAWEI_MBX_STEREO, .name = "huawei-mbx-stereo"}, + {.id = ALC256_FIXUP_MEDION_HEADSET_NO_PRESENCE, .name = "alc256-medion-headset"}, {} }; #define ALC225_STANDARD_PINS \ From 7a5d9815cc010b055c2a99ccf418c4629365fa43 Mon Sep 17 00:00:00 2001 From: Bard liao Date: Wed, 18 Sep 2019 21:31:31 +0800 Subject: [PATCH 449/721] ASoC: core: use list_del_init and move it back to soc_cleanup_component commit a0a4bf57a977 ("ASoC: core: delete component->card_list in soc_remove_component only") was trying to fix a kernel oops when list_del was called twice without re-init the list. Use list_del_init() can solve it, too. Besides, it will be more readable if we cleanup all component related resource at soc_cleanup_component(). Suggested-by: Kuninori Morimoto Signed-off-by: Bard liao Link: https://lore.kernel.org/r/20190918133131.15045-1-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/soc-core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index aff4b4bf4d07..88978a3036c4 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -978,6 +978,7 @@ static void soc_cleanup_component(struct snd_soc_component *component) /* For framework level robustness */ snd_soc_component_set_jack(component, NULL, NULL); + list_del_init(&component->card_list); snd_soc_dapm_free(snd_soc_component_get_dapm(component)); soc_cleanup_component_debugfs(component); component->card = NULL; @@ -990,7 +991,7 @@ static void soc_remove_component(struct snd_soc_component *component) return; snd_soc_component_remove(component); - list_del(&component->card_list); + soc_cleanup_component(component); } From 7b2db65b59c30d58c129d3c8b2101feca686155a Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 19 Sep 2019 10:16:52 +0300 Subject: [PATCH 450/721] ASoC: pcm3168a: The codec does not support S32_LE 24 bits is supported in all modes and 16 bit only when the codec is slave and the DAI is set to RIGHT_J. Remove the unsupported sample format. Signed-off-by: Peter Ujfalusi Link: https://lore.kernel.org/r/20190919071652.31724-1-peter.ujfalusi@ti.com Signed-off-by: Mark Brown --- sound/soc/codecs/pcm3168a.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sound/soc/codecs/pcm3168a.c b/sound/soc/codecs/pcm3168a.c index 50ed86d45c26..88b75695fbf7 100644 --- a/sound/soc/codecs/pcm3168a.c +++ b/sound/soc/codecs/pcm3168a.c @@ -21,8 +21,7 @@ #define PCM3168A_FORMATS (SNDRV_PCM_FMTBIT_S16_LE | \ SNDRV_PCM_FMTBIT_S24_3LE | \ - SNDRV_PCM_FMTBIT_S24_LE | \ - SNDRV_PCM_FMTBIT_S32_LE) + SNDRV_PCM_FMTBIT_S24_LE) #define PCM3168A_FMT_I2S 0x0 #define PCM3168A_FMT_LEFT_J 0x1 From 147162f575152db80000fb3de26358264768ee9f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 20 Sep 2019 09:50:18 +0200 Subject: [PATCH 451/721] ASoC: ti: fix SND_SOC_DM365_VOICE_CODEC dependencies SND_SOC_DM365_VOICE_CODEC is a 'bool' option in a choice statement, meaning it cannot be set to =m, but it selects two other drivers that we may want to be loadable modules after all: WARNING: unmet direct dependencies detected for SND_SOC_CQ0093VC Depends on [m]: SOUND [=m] && !UML && SND [=m] && SND_SOC [=m] Selected by [y]: - SND_SOC_DM365_VOICE_CODEC [=y] && Selected by [m]: - SND_SOC_ALL_CODECS [=m] && SOUND [=m] && !UML && SND [=m] && SND_SOC [=m] && COMPILE_TEST [=y] Add an intermediate symbol that sets SND_SOC_CQ0093VC and MFD_DAVINCI_VOICECODEC to =m if SND_SOC=m. Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20190920075046.3210393-1-arnd@arndb.de Signed-off-by: Mark Brown --- sound/soc/ti/Kconfig | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/sound/soc/ti/Kconfig b/sound/soc/ti/Kconfig index 87a9b9dd4e98..29f61053ab62 100644 --- a/sound/soc/ti/Kconfig +++ b/sound/soc/ti/Kconfig @@ -200,11 +200,18 @@ config SND_SOC_DM365_AIC3X_CODEC config SND_SOC_DM365_VOICE_CODEC bool "Voice Codec - CQ93VC" - select MFD_DAVINCI_VOICECODEC - select SND_SOC_CQ0093VC help Say Y if you want to add support for SoC On-chip voice codec endchoice +config SND_SOC_DM365_VOICE_CODEC_MODULE + def_tristate y + depends on SND_SOC_DM365_VOICE_CODEC && SND_SOC + select MFD_DAVINCI_VOICECODEC + select SND_SOC_CQ0093VC + help + The is an internal symbol needed to ensure that the codec + and MFD driver can be built as loadable modules if necessary. + endmenu From 5fc194ea6d34dfad9833d3043ce41d6c52aff39a Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 9 Sep 2019 00:29:52 -0500 Subject: [PATCH 452/721] crypto: talitos - fix missing break in switch statement Add missing break statement in order to prevent the code from falling through to case CRYPTO_ALG_TYPE_AHASH. Fixes: aeb4c132f33d ("crypto: talitos - Convert to new AEAD interface") Cc: stable@vger.kernel.org Reported-by: kbuild test robot Signed-off-by: Gustavo A. R. Silva Reviewed-by: Christophe Leroy Signed-off-by: Herbert Xu --- drivers/crypto/talitos.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c index cb6c10b1bf36..56e3068c9947 100644 --- a/drivers/crypto/talitos.c +++ b/drivers/crypto/talitos.c @@ -3116,6 +3116,7 @@ static int talitos_remove(struct platform_device *ofdev) break; case CRYPTO_ALG_TYPE_AEAD: crypto_unregister_aead(&t_alg->algt.alg.aead); + break; case CRYPTO_ALG_TYPE_AHASH: crypto_unregister_ahash(&t_alg->algt.alg.hash); break; From 212ef6f29e5b82bfd0ff595347fa1643326589a0 Mon Sep 17 00:00:00 2001 From: Pascal van Leeuwen Date: Fri, 13 Sep 2019 11:04:40 +0200 Subject: [PATCH 453/721] crypto: inside-secure - Fix unused variable warning when CONFIG_PCI=n This patch fixes an unused variable warning from the compiler when the driver is being compiled without PCI support in the kernel. Fixes: 625f269a5a7a ("crypto: inside-secure - add support for...") Signed-off-by: Pascal van Leeuwen Signed-off-by: Herbert Xu --- drivers/crypto/inside-secure/safexcel.c | 42 ++++++++++++++++++------- 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/drivers/crypto/inside-secure/safexcel.c b/drivers/crypto/inside-secure/safexcel.c index b456b85f46d3..4ab1bde8dd9b 100644 --- a/drivers/crypto/inside-secure/safexcel.c +++ b/drivers/crypto/inside-secure/safexcel.c @@ -1789,32 +1789,50 @@ static struct pci_driver safexcel_pci_driver = { }; #endif +/* Unfortunately, we have to resort to global variables here */ +#if IS_ENABLED(CONFIG_PCI) +int pcireg_rc = -EINVAL; /* Default safe value */ +#endif +#if IS_ENABLED(CONFIG_OF) +int ofreg_rc = -EINVAL; /* Default safe value */ +#endif + static int __init safexcel_init(void) { - int rc; +#if IS_ENABLED(CONFIG_PCI) + /* Register PCI driver */ + pcireg_rc = pci_register_driver(&safexcel_pci_driver); +#endif #if IS_ENABLED(CONFIG_OF) - /* Register platform driver */ - platform_driver_register(&crypto_safexcel); + /* Register platform driver */ + ofreg_rc = platform_driver_register(&crypto_safexcel); + #if IS_ENABLED(CONFIG_PCI) + /* Return success if either PCI or OF registered OK */ + return pcireg_rc ? ofreg_rc : 0; + #else + return ofreg_rc; + #endif +#else + #if IS_ENABLED(CONFIG_PCI) + return pcireg_rc; + #else + return -EINVAL; + #endif #endif - -#if IS_ENABLED(CONFIG_PCI) - /* Register PCI driver */ - rc = pci_register_driver(&safexcel_pci_driver); -#endif - - return 0; } static void __exit safexcel_exit(void) { #if IS_ENABLED(CONFIG_OF) - /* Unregister platform driver */ + /* Unregister platform driver */ + if (!ofreg_rc) platform_driver_unregister(&crypto_safexcel); #endif #if IS_ENABLED(CONFIG_PCI) - /* Unregister PCI driver if successfully registered before */ + /* Unregister PCI driver if successfully registered before */ + if (!pcireg_rc) pci_unregister_driver(&safexcel_pci_driver); #endif } From 24fbf7bad888767bed952f540ac963bc57e47e15 Mon Sep 17 00:00:00 2001 From: Yunfeng Ye Date: Sun, 15 Sep 2019 17:26:56 +0800 Subject: [PATCH 454/721] crypto: hisilicon - Fix double free in sec_free_hw_sgl() There are two problems in sec_free_hw_sgl(): First, when sgl_current->next is valid, @hw_sgl will be freed in the first loop, but it free again after the loop. Second, sgl_current and sgl_current->next_sgl is not match when dma_pool_free() is invoked, the third parameter should be the dma address of sgl_current, but sgl_current->next_sgl is the dma address of next chain, so use sgl_current->next_sgl is wrong. Fix this by deleting the last dma_pool_free() in sec_free_hw_sgl(), modifying the condition for while loop, and matching the address for dma_pool_free(). Fixes: 915e4e8413da ("crypto: hisilicon - SEC security accelerator driver") Signed-off-by: Yunfeng Ye Signed-off-by: Herbert Xu --- drivers/crypto/hisilicon/sec/sec_algs.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/crypto/hisilicon/sec/sec_algs.c b/drivers/crypto/hisilicon/sec/sec_algs.c index e0508ea160f1..aa126332fdcb 100644 --- a/drivers/crypto/hisilicon/sec/sec_algs.c +++ b/drivers/crypto/hisilicon/sec/sec_algs.c @@ -215,17 +215,18 @@ static void sec_free_hw_sgl(struct sec_hw_sgl *hw_sgl, dma_addr_t psec_sgl, struct sec_dev_info *info) { struct sec_hw_sgl *sgl_current, *sgl_next; + dma_addr_t sgl_next_dma; - if (!hw_sgl) - return; sgl_current = hw_sgl; - while (sgl_current->next) { + while (sgl_current) { sgl_next = sgl_current->next; - dma_pool_free(info->hw_sgl_pool, sgl_current, - sgl_current->next_sgl); + sgl_next_dma = sgl_current->next_sgl; + + dma_pool_free(info->hw_sgl_pool, sgl_current, psec_sgl); + sgl_current = sgl_next; + psec_sgl = sgl_next_dma; } - dma_pool_free(info->hw_sgl_pool, hw_sgl, psec_sgl); } static int sec_alg_skcipher_setkey(struct crypto_skcipher *tfm, From e00371af1d4ce73d527d8ee69fda2febaf5a42c2 Mon Sep 17 00:00:00 2001 From: Yunfeng Ye Date: Sun, 15 Sep 2019 17:31:14 +0800 Subject: [PATCH 455/721] crypto: hisilicon - Matching the dma address for dma_pool_free() When dma_pool_zalloc() fail in sec_alloc_and_fill_hw_sgl(), dma_pool_free() is invoked, but the parameters that sgl_current and sgl_current->next_sgl is not match. Using sec_free_hw_sgl() instead of the original free routine. Fixes: 915e4e8413da ("crypto: hisilicon - SEC security accelerator driver") Signed-off-by: Yunfeng Ye Signed-off-by: Herbert Xu --- drivers/crypto/hisilicon/sec/sec_algs.c | 44 +++++++++++-------------- 1 file changed, 19 insertions(+), 25 deletions(-) diff --git a/drivers/crypto/hisilicon/sec/sec_algs.c b/drivers/crypto/hisilicon/sec/sec_algs.c index aa126332fdcb..c27e7160d2df 100644 --- a/drivers/crypto/hisilicon/sec/sec_algs.c +++ b/drivers/crypto/hisilicon/sec/sec_algs.c @@ -153,6 +153,24 @@ static void sec_alg_skcipher_init_context(struct crypto_skcipher *atfm, ctx->cipher_alg); } +static void sec_free_hw_sgl(struct sec_hw_sgl *hw_sgl, + dma_addr_t psec_sgl, struct sec_dev_info *info) +{ + struct sec_hw_sgl *sgl_current, *sgl_next; + dma_addr_t sgl_next_dma; + + sgl_current = hw_sgl; + while (sgl_current) { + sgl_next = sgl_current->next; + sgl_next_dma = sgl_current->next_sgl; + + dma_pool_free(info->hw_sgl_pool, sgl_current, psec_sgl); + + sgl_current = sgl_next; + psec_sgl = sgl_next_dma; + } +} + static int sec_alloc_and_fill_hw_sgl(struct sec_hw_sgl **sec_sgl, dma_addr_t *psec_sgl, struct scatterlist *sgl, @@ -199,36 +217,12 @@ static int sec_alloc_and_fill_hw_sgl(struct sec_hw_sgl **sec_sgl, return 0; err_free_hw_sgls: - sgl_current = *sec_sgl; - while (sgl_current) { - sgl_next = sgl_current->next; - dma_pool_free(info->hw_sgl_pool, sgl_current, - sgl_current->next_sgl); - sgl_current = sgl_next; - } + sec_free_hw_sgl(*sec_sgl, *psec_sgl, info); *psec_sgl = 0; return ret; } -static void sec_free_hw_sgl(struct sec_hw_sgl *hw_sgl, - dma_addr_t psec_sgl, struct sec_dev_info *info) -{ - struct sec_hw_sgl *sgl_current, *sgl_next; - dma_addr_t sgl_next_dma; - - sgl_current = hw_sgl; - while (sgl_current) { - sgl_next = sgl_current->next; - sgl_next_dma = sgl_current->next_sgl; - - dma_pool_free(info->hw_sgl_pool, sgl_current, psec_sgl); - - sgl_current = sgl_next; - psec_sgl = sgl_next_dma; - } -} - static int sec_alg_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen, enum sec_cipher_alg alg) From 62a9d9fc7a210005cdbbf186d6e655228497dfac Mon Sep 17 00:00:00 2001 From: Yunfeng Ye Date: Mon, 16 Sep 2019 14:38:25 +0800 Subject: [PATCH 456/721] crypto: hisilicon - Fix return value check in hisi_zip_acompress() The return valude of add_comp_head() is int, but @head_size is size_t, which is a unsigned type. size_t head_size; ... if (head_size < 0) // it will never work return -ENOMEM Modify the type of @head_size to int, then change the type to size_t when invoke hisi_zip_create_req() as a parameter. Fixes: 62c455ca853e ("crypto: hisilicon - add HiSilicon ZIP accelerator support") Signed-off-by: Yunfeng Ye Acked-by: Zhou Wang Signed-off-by: Herbert Xu --- drivers/crypto/hisilicon/zip/zip_crypto.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/hisilicon/zip/zip_crypto.c b/drivers/crypto/hisilicon/zip/zip_crypto.c index 5a3f84dcdcde..59023545a1c4 100644 --- a/drivers/crypto/hisilicon/zip/zip_crypto.c +++ b/drivers/crypto/hisilicon/zip/zip_crypto.c @@ -559,7 +559,7 @@ static int hisi_zip_acompress(struct acomp_req *acomp_req) struct hisi_zip_ctx *ctx = crypto_tfm_ctx(acomp_req->base.tfm); struct hisi_zip_qp_ctx *qp_ctx = &ctx->qp_ctx[QPC_COMP]; struct hisi_zip_req *req; - size_t head_size; + int head_size; int ret; /* let's output compression head now */ @@ -567,7 +567,7 @@ static int hisi_zip_acompress(struct acomp_req *acomp_req) if (head_size < 0) return -ENOMEM; - req = hisi_zip_create_req(acomp_req, qp_ctx, head_size, true); + req = hisi_zip_create_req(acomp_req, qp_ctx, (size_t)head_size, true); if (IS_ERR(req)) return PTR_ERR(req); From 78887832e76541f77169a24ac238fccb51059b63 Mon Sep 17 00:00:00 2001 From: Laurent Vivier Date: Tue, 17 Sep 2019 11:54:50 +0200 Subject: [PATCH 457/721] hwrng: core - don't wait on add_early_randomness() add_early_randomness() is called by hwrng_register() when the hardware is added. If this hardware and its module are present at boot, and if there is no data available the boot hangs until data are available and can't be interrupted. For instance, in the case of virtio-rng, in some cases the host can be not able to provide enough entropy for all the guests. We can have two easy ways to reproduce the problem but they rely on misconfiguration of the hypervisor or the egd daemon: - if virtio-rng device is configured to connect to the egd daemon of the host but when the virtio-rng driver asks for data the daemon is not connected, - if virtio-rng device is configured to connect to the egd daemon of the host but the egd daemon doesn't provide data. The guest kernel will hang at boot until the virtio-rng driver provides enough data. To avoid that, call rng_get_data() in non-blocking mode (wait=0) from add_early_randomness(). Signed-off-by: Laurent Vivier Fixes: d9e797261933 ("hwrng: add randomness to system from rng...") Cc: Reviewed-by: Theodore Ts'o Signed-off-by: Herbert Xu --- drivers/char/hw_random/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index bdab5d9af8d2..80b850ef1bf6 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -68,7 +68,7 @@ static void add_early_randomness(struct hwrng *rng) size_t size = min_t(size_t, 16, rng_buffer_size()); mutex_lock(&reading_mutex); - bytes_read = rng_get_data(rng, rng_buffer, size, 1); + bytes_read = rng_get_data(rng, rng_buffer, size, 0); mutex_unlock(&reading_mutex); if (bytes_read > 0) add_device_randomness(rng_buffer, bytes_read); From bf6a7a5ad6fa69e48b735be75eeb90569d9584bb Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 19 Sep 2019 16:05:52 +0200 Subject: [PATCH 458/721] crypto: hisilicon - avoid unused function warning The only caller of hisi_zip_vf_q_assign() is hidden in an #ifdef, so the function causes a warning when CONFIG_PCI_IOV is disabled: drivers/crypto/hisilicon/zip/zip_main.c:740:12: error: unused function 'hisi_zip_vf_q_assign' [-Werror,-Wunused-function] Replace the #ifdef with an IS_ENABLED() check that leads to the function being dropped based on the configuration. Fixes: 79e09f30eeba ("crypto: hisilicon - add SRIOV support for ZIP") Signed-off-by: Arnd Bergmann Signed-off-by: Herbert Xu --- drivers/crypto/hisilicon/zip/zip_main.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/crypto/hisilicon/zip/zip_main.c b/drivers/crypto/hisilicon/zip/zip_main.c index 6e0ca75585d4..1b2ee96c888d 100644 --- a/drivers/crypto/hisilicon/zip/zip_main.c +++ b/drivers/crypto/hisilicon/zip/zip_main.c @@ -785,7 +785,6 @@ static int hisi_zip_clear_vft_config(struct hisi_zip *hisi_zip) static int hisi_zip_sriov_enable(struct pci_dev *pdev, int max_vfs) { -#ifdef CONFIG_PCI_IOV struct hisi_zip *hisi_zip = pci_get_drvdata(pdev); int pre_existing_vfs, num_vfs, ret; @@ -815,9 +814,6 @@ static int hisi_zip_sriov_enable(struct pci_dev *pdev, int max_vfs) } return num_vfs; -#else - return 0; -#endif } static int hisi_zip_sriov_disable(struct pci_dev *pdev) @@ -948,7 +944,8 @@ static struct pci_driver hisi_zip_pci_driver = { .id_table = hisi_zip_dev_ids, .probe = hisi_zip_probe, .remove = hisi_zip_remove, - .sriov_configure = hisi_zip_sriov_configure, + .sriov_configure = IS_ENABLED(CONFIG_PCI_IOV) ? + hisi_zip_sriov_configure : 0, .err_handler = &hisi_zip_err_handler, }; From 7b485d175631be676424aedb8cd2f66d0c93da78 Mon Sep 17 00:00:00 2001 From: "Shih-Yuan Lee (FourDollars)" Date: Fri, 20 Sep 2019 21:40:53 +0800 Subject: [PATCH 459/721] ALSA: hda - Add laptop imic fixup for ASUS M9V laptop The same fixup to enable laptop imic is needed for ASUS M9V with AD1986A codec like another HP machine. Signed-off-by: Shih-Yuan Lee (FourDollars) Cc: Link: https://lore.kernel.org/r/20190920134052.GA8035@localhost Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_analog.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_analog.c b/sound/pci/hda/patch_analog.c index e283966bdbb1..bc9dd8e6fd86 100644 --- a/sound/pci/hda/patch_analog.c +++ b/sound/pci/hda/patch_analog.c @@ -357,6 +357,7 @@ static const struct hda_fixup ad1986a_fixups[] = { static const struct snd_pci_quirk ad1986a_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x30af, "HP B2800", AD1986A_FIXUP_LAPTOP_IMIC), + SND_PCI_QUIRK(0x1043, 0x1153, "ASUS M9V", AD1986A_FIXUP_LAPTOP_IMIC), SND_PCI_QUIRK(0x1043, 0x1443, "ASUS Z99He", AD1986A_FIXUP_EAPD), SND_PCI_QUIRK(0x1043, 0x1447, "ASUS A8JN", AD1986A_FIXUP_EAPD), SND_PCI_QUIRK_MASK(0x1043, 0xff00, 0x8100, "ASUS P5", AD1986A_FIXUP_3STACK), From f110d252ae7985def3cd0fd8b03979180a8633a0 Mon Sep 17 00:00:00 2001 From: Srikanth Krishnakar Date: Fri, 20 Sep 2019 15:01:56 +0530 Subject: [PATCH 460/721] platform/x86: pmc_atom: Add Siemens SIMATIC IPC277E to critclk_systems DMI table The SIMATIC IPC277E uses the PMC clock for on-board components and gets stuck during boot if the clock is disabled. Therefore, add this device to the critical systems list. Tested on SIMATIC IPC277E. Fixes: 648e921888ad ("clk: x86: Stop marking clocks as CLK_IS_CRITICAL") Reviewed-by: Jan Kiszka Cc: Cedric Hombourger Signed-off-by: Srikanth Krishnakar Signed-off-by: Andy Shevchenko --- drivers/platform/x86/pmc_atom.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/platform/x86/pmc_atom.c b/drivers/platform/x86/pmc_atom.c index 9aca5e7ce6d0..07d1b911e72f 100644 --- a/drivers/platform/x86/pmc_atom.c +++ b/drivers/platform/x86/pmc_atom.c @@ -422,6 +422,13 @@ static const struct dmi_system_id critclk_systems[] = { DMI_MATCH(DMI_PRODUCT_VERSION, "6ES7647-8B"), }, }, + { + .ident = "SIMATIC IPC277E", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "SIEMENS AG"), + DMI_MATCH(DMI_PRODUCT_VERSION, "6AV7882-0"), + }, + }, { /*sentinel*/ } }; From 24a8d78a9affb63e5ced313ccde6888fe96edc6e Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Fri, 20 Sep 2019 13:02:33 +0300 Subject: [PATCH 461/721] platform/x86: i2c-multi-instantiate: Derive the device name from parent When naming the new devices, instead of using the ACPI ID in the name as base, using the parent device's name. That makes it possible to support multiple multi-instance i2c devices of the same type in the same system. This fixes an issue seen on some Intel Kaby Lake based boards: sysfs: cannot create duplicate filename '/devices/pci0000:00/0000:00:15.0/i2c_designware.0/i2c-0/i2c-INT3515-tps6598x.0' Fixes: 2336dfadfb1e ("platform/x86: i2c-multi-instantiate: Allow to have same slaves") Cc: stable@vger.kernel.org Signed-off-by: Heikki Krogerus Reviewed-by: Hans de Goede Signed-off-by: Andy Shevchenko --- drivers/platform/x86/i2c-multi-instantiate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/i2c-multi-instantiate.c b/drivers/platform/x86/i2c-multi-instantiate.c index 61fe341a85aa..ea68f6ed66ae 100644 --- a/drivers/platform/x86/i2c-multi-instantiate.c +++ b/drivers/platform/x86/i2c-multi-instantiate.c @@ -90,7 +90,7 @@ static int i2c_multi_inst_probe(struct platform_device *pdev) for (i = 0; i < multi->num_clients && inst_data[i].type; i++) { memset(&board_info, 0, sizeof(board_info)); strlcpy(board_info.type, inst_data[i].type, I2C_NAME_SIZE); - snprintf(name, sizeof(name), "%s-%s.%d", match->id, + snprintf(name, sizeof(name), "%s-%s.%d", dev_name(dev), inst_data[i].type, i); board_info.dev_name = name; switch (inst_data[i].flags & IRQ_RESOURCE_TYPE) { From 5f1bc39979d868a0358c683864bec3fc8395440b Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Mon, 16 Sep 2019 07:59:37 -0400 Subject: [PATCH 462/721] SUNRPC: Fix buffer handling of GSS MIC without slack The GSS Message Integrity Check data for krb5i may lie partially in the XDR reply buffer's pages and tail. If so, we try to copy the entire MIC into free space in the tail. But as the estimations of the slack space required for authentication and verification have improved there may be less free space in the tail to complete this copy -- see commit 2c94b8eca1a2 ("SUNRPC: Use au_rslack when computing reply buffer size"). In fact, there may only be room in the tail for a single copy of the MIC, and not part of the MIC and then another complete copy. The real world failure reported is that `ls` of a directory on NFS may sometimes return -EIO, which can be traced back to xdr_buf_read_netobj() failing to find available free space in the tail to copy the MIC. Fix this by checking for the case of the MIC crossing the boundaries of head, pages, and tail. If so, shift the buffer until the MIC is contained completely within the pages or tail. This allows the remainder of the function to create a sub buffer that directly address the complete MIC. Signed-off-by: Benjamin Coddington Cc: stable@vger.kernel.org # v5.1 Reviewed-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xdr.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 7ba0ede6b417..24ec2ab5bfa5 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -1237,16 +1237,29 @@ xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj) EXPORT_SYMBOL_GPL(xdr_encode_word); /* If the netobj starting offset bytes from the start of xdr_buf is contained - * entirely in the head or the tail, set object to point to it; otherwise - * try to find space for it at the end of the tail, copy it there, and - * set obj to point to it. */ + * entirely in the head, pages, or tail, set object to point to it; otherwise + * shift the buffer until it is contained entirely within the pages or tail. + */ int xdr_buf_read_netobj(struct xdr_buf *buf, struct xdr_netobj *obj, unsigned int offset) { struct xdr_buf subbuf; + unsigned int boundary; if (xdr_decode_word(buf, offset, &obj->len)) return -EFAULT; - if (xdr_buf_subsegment(buf, &subbuf, offset + 4, obj->len)) + offset += 4; + + /* Is the obj partially in the head? */ + boundary = buf->head[0].iov_len; + if (offset < boundary && (offset + obj->len) > boundary) + xdr_shift_buf(buf, boundary - offset); + + /* Is the obj partially in the pages? */ + boundary += buf->page_len; + if (offset < boundary && (offset + obj->len) > boundary) + xdr_shrink_pagelen(buf, boundary - offset); + + if (xdr_buf_subsegment(buf, &subbuf, offset, obj->len)) return -EFAULT; /* Is the obj contained entirely in the head? */ @@ -1258,11 +1271,7 @@ int xdr_buf_read_netobj(struct xdr_buf *buf, struct xdr_netobj *obj, unsigned in if (subbuf.tail[0].iov_len == obj->len) return 0; - /* use end of tail as storage for obj: - * (We don't copy to the beginning because then we'd have - * to worry about doing a potentially overlapping copy. - * This assumes the object is at most half the length of the - * tail.) */ + /* Find a contiguous area in @buf to hold all of @obj */ if (obj->len > buf->buflen - buf->len) return -ENOMEM; if (buf->tail[0].iov_len != 0) From f925ab926d1a9c2112d34ecb59fbb050bb58646c Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Mon, 16 Sep 2019 07:59:38 -0400 Subject: [PATCH 463/721] SUNRPC: Rename xdr_buf_read_netobj to xdr_buf_read_mic Let the name reflect the single use. The function now assumes the GSS MIC is the last object in the buffer. Signed-off-by: Benjamin Coddington Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xdr.h | 2 +- net/sunrpc/auth_gss/auth_gss.c | 2 +- net/sunrpc/xdr.c | 52 ++++++++++++++++++++-------------- 3 files changed, 32 insertions(+), 24 deletions(-) diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 8a87d8bcb197..f33e5013bdfb 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -186,7 +186,7 @@ xdr_adjust_iovec(struct kvec *iov, __be32 *p) extern void xdr_shift_buf(struct xdr_buf *, size_t); extern void xdr_buf_from_iov(struct kvec *, struct xdr_buf *); extern int xdr_buf_subsegment(struct xdr_buf *, struct xdr_buf *, unsigned int, unsigned int); -extern int xdr_buf_read_netobj(struct xdr_buf *, struct xdr_netobj *, unsigned int); +extern int xdr_buf_read_mic(struct xdr_buf *, struct xdr_netobj *, unsigned int); extern int read_bytes_from_xdr_buf(struct xdr_buf *, unsigned int, void *, unsigned int); extern int write_bytes_to_xdr_buf(struct xdr_buf *, unsigned int, void *, unsigned int); diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 4ce42c62458e..d75fddca44c9 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -1960,7 +1960,7 @@ gss_unwrap_resp_integ(struct rpc_task *task, struct rpc_cred *cred, if (xdr_buf_subsegment(rcv_buf, &integ_buf, data_offset, integ_len)) goto unwrap_failed; - if (xdr_buf_read_netobj(rcv_buf, &mic, mic_offset)) + if (xdr_buf_read_mic(rcv_buf, &mic, mic_offset)) goto unwrap_failed; maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &integ_buf, &mic); if (maj_stat == GSS_S_CONTEXT_EXPIRED) diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 24ec2ab5bfa5..14ba9e72a204 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -1236,52 +1236,60 @@ xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj) } EXPORT_SYMBOL_GPL(xdr_encode_word); -/* If the netobj starting offset bytes from the start of xdr_buf is contained - * entirely in the head, pages, or tail, set object to point to it; otherwise - * shift the buffer until it is contained entirely within the pages or tail. +/** + * xdr_buf_read_mic() - obtain the address of the GSS mic from xdr buf + * @buf: pointer to buffer containing a mic + * @mic: on success, returns the address of the mic + * @offset: the offset in buf where mic may be found + * + * This function may modify the xdr buf if the mic is found to be straddling + * a boundary between head, pages, and tail. On success the mic can be read + * from the address returned. There is no need to free the mic. + * + * Return: Success returns 0, otherwise an integer error. */ -int xdr_buf_read_netobj(struct xdr_buf *buf, struct xdr_netobj *obj, unsigned int offset) +int xdr_buf_read_mic(struct xdr_buf *buf, struct xdr_netobj *mic, unsigned int offset) { struct xdr_buf subbuf; unsigned int boundary; - if (xdr_decode_word(buf, offset, &obj->len)) + if (xdr_decode_word(buf, offset, &mic->len)) return -EFAULT; offset += 4; - /* Is the obj partially in the head? */ + /* Is the mic partially in the head? */ boundary = buf->head[0].iov_len; - if (offset < boundary && (offset + obj->len) > boundary) + if (offset < boundary && (offset + mic->len) > boundary) xdr_shift_buf(buf, boundary - offset); - /* Is the obj partially in the pages? */ + /* Is the mic partially in the pages? */ boundary += buf->page_len; - if (offset < boundary && (offset + obj->len) > boundary) + if (offset < boundary && (offset + mic->len) > boundary) xdr_shrink_pagelen(buf, boundary - offset); - if (xdr_buf_subsegment(buf, &subbuf, offset, obj->len)) + if (xdr_buf_subsegment(buf, &subbuf, offset, mic->len)) return -EFAULT; - /* Is the obj contained entirely in the head? */ - obj->data = subbuf.head[0].iov_base; - if (subbuf.head[0].iov_len == obj->len) + /* Is the mic contained entirely in the head? */ + mic->data = subbuf.head[0].iov_base; + if (subbuf.head[0].iov_len == mic->len) return 0; - /* ..or is the obj contained entirely in the tail? */ - obj->data = subbuf.tail[0].iov_base; - if (subbuf.tail[0].iov_len == obj->len) + /* ..or is the mic contained entirely in the tail? */ + mic->data = subbuf.tail[0].iov_base; + if (subbuf.tail[0].iov_len == mic->len) return 0; - /* Find a contiguous area in @buf to hold all of @obj */ - if (obj->len > buf->buflen - buf->len) + /* Find a contiguous area in @buf to hold all of @mic */ + if (mic->len > buf->buflen - buf->len) return -ENOMEM; if (buf->tail[0].iov_len != 0) - obj->data = buf->tail[0].iov_base + buf->tail[0].iov_len; + mic->data = buf->tail[0].iov_base + buf->tail[0].iov_len; else - obj->data = buf->head[0].iov_base + buf->head[0].iov_len; - __read_bytes_from_xdr_buf(&subbuf, obj->data, obj->len); + mic->data = buf->head[0].iov_base + buf->head[0].iov_len; + __read_bytes_from_xdr_buf(&subbuf, mic->data, mic->len); return 0; } -EXPORT_SYMBOL_GPL(xdr_buf_read_netobj); +EXPORT_SYMBOL_GPL(xdr_buf_read_mic); /* Returns 0 on success, or else a negative error code. */ static int From 9ba828861c56a21d211d5d10f5643774b1ea330d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 16 Sep 2019 09:12:19 -0400 Subject: [PATCH 464/721] SUNRPC: Don't try to parse incomplete RPC messages If the copy of the RPC reply into our buffers did not complete, and we could end up with a truncated message. In that case, just resend the call. Fixes: a0584ee9aed80 ("SUNRPC: Use struct xdr_stream when decoding...") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- net/sunrpc/clnt.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 44d5cf318813..8b622ceb1158 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2465,6 +2465,7 @@ call_decode(struct rpc_task *task) struct rpc_clnt *clnt = task->tk_client; struct rpc_rqst *req = task->tk_rqstp; struct xdr_stream xdr; + int err; dprint_status(task); @@ -2487,6 +2488,15 @@ call_decode(struct rpc_task *task) * before it changed req->rq_reply_bytes_recvd. */ smp_rmb(); + + /* + * Did we ever call xprt_complete_rqst()? If not, we should assume + * the message is incomplete. + */ + err = -EAGAIN; + if (!req->rq_reply_bytes_recvd) + goto out; + req->rq_rcv_buf.len = req->rq_private_buf.len; /* Check that the softirq receive buffer is valid */ @@ -2495,7 +2505,9 @@ call_decode(struct rpc_task *task) xdr_init_decode(&xdr, &req->rq_rcv_buf, req->rq_rcv_buf.head[0].iov_base, req); - switch (rpc_decode_header(task, &xdr)) { + err = rpc_decode_header(task, &xdr); +out: + switch (err) { case 0: task->tk_action = rpc_exit_task; task->tk_status = rpcauth_unwrap_resp(task, &xdr); From 8593e010786181df887b001824ff8f3e52e2098f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 13 Sep 2019 16:01:07 -0400 Subject: [PATCH 465/721] SUNRPC: Fix congestion window race with disconnect If the congestion window closes just as the transport disconnects, a reconnect is never driven because: 1. The XPRT_CONG_WAIT flag prevents tasks from taking the write lock 2. There's no wake-up of the first task on the xprt->sending queue To address this, clear the congestion wait flag as part of completing a disconnect. Fixes: 75891f502f5f ("SUNRPC: Support for congestion control ... ") Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprt.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 02d5b2125c07..83ec4edd2f91 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -456,6 +456,12 @@ void xprt_release_rqst_cong(struct rpc_task *task) } EXPORT_SYMBOL_GPL(xprt_release_rqst_cong); +static void xprt_clear_congestion_window_wait_locked(struct rpc_xprt *xprt) +{ + if (test_and_clear_bit(XPRT_CWND_WAIT, &xprt->state)) + __xprt_lock_write_next_cong(xprt); +} + /* * Clear the congestion window wait flag and wake up the next * entry on xprt->sending @@ -671,6 +677,7 @@ void xprt_disconnect_done(struct rpc_xprt *xprt) spin_lock(&xprt->transport_lock); xprt_clear_connected(xprt); xprt_clear_write_space_locked(xprt); + xprt_clear_congestion_window_wait_locked(xprt); xprt_wake_pending_tasks(xprt, -ENOTCONN); spin_unlock(&xprt->transport_lock); } From 406cd91533dcc5e82ef2373c39e6a531d944131e Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Fri, 13 Sep 2019 08:29:02 -0400 Subject: [PATCH 466/721] NFS: Refactor nfs_instantiate() for dentry referencing callers Since commit b0c6108ecf64 ("nfs_instantiate(): prevent multiple aliases for directory inode"), nfs_instantiate() may succeed without actually instantiating the dentry that was passed in. That can be problematic for some callers in NFSv3, so this patch breaks things up so we can get the actual dentry obtained. Signed-off-by: Benjamin Coddington Signed-off-by: Anna Schumaker --- fs/nfs/dir.c | 41 +++++++++++++++++++++++++++-------------- include/linux/nfs_fs.h | 3 +++ 2 files changed, 30 insertions(+), 14 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 8d501093660f..d1bbf2fb6ac7 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1669,24 +1669,23 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags) #endif /* CONFIG_NFSV4 */ -/* - * Code common to create, mkdir, and mknod. - */ -int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, +struct dentry * +nfs_add_or_obtain(struct dentry *dentry, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label) { struct dentry *parent = dget_parent(dentry); struct inode *dir = d_inode(parent); struct inode *inode; - struct dentry *d; - int error = -EACCES; + struct dentry *d = NULL; + int error; d_drop(dentry); /* We may have been initialized further down */ if (d_really_is_positive(dentry)) goto out; + if (fhandle->size == 0) { error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, NULL); if (error) @@ -1702,18 +1701,32 @@ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, } inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label); d = d_splice_alias(inode, dentry); - if (IS_ERR(d)) { - error = PTR_ERR(d); - goto out_error; - } - dput(d); out: dput(parent); - return 0; + return d; out_error: nfs_mark_for_revalidate(dir); - dput(parent); - return error; + d = ERR_PTR(error); + goto out; +} +EXPORT_SYMBOL_GPL(nfs_add_or_obtain); + +/* + * Code common to create, mkdir, and mknod. + */ +int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, + struct nfs_fattr *fattr, + struct nfs4_label *label) +{ + struct dentry *d; + + d = nfs_add_or_obtain(dentry, fhandle, fattr, label); + if (IS_ERR(d)) + return PTR_ERR(d); + + /* Callers don't care */ + dput(d); + return 0; } EXPORT_SYMBOL_GPL(nfs_instantiate); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 0a11712a80e3..570a60c2f4f4 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -490,6 +490,9 @@ extern const struct file_operations nfs_dir_operations; extern const struct dentry_operations nfs_dentry_operations; extern void nfs_force_lookup_revalidate(struct inode *dir); +extern struct dentry *nfs_add_or_obtain(struct dentry *dentry, + struct nfs_fh *fh, struct nfs_fattr *fattr, + struct nfs4_label *label); extern int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fh, struct nfs_fattr *fattr, struct nfs4_label *label); extern int nfs_may_open(struct inode *inode, const struct cred *cred, int openflags); From 17fd6e457b30e2e025fce4399fe20ae69c7081dd Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Fri, 13 Sep 2019 08:29:03 -0400 Subject: [PATCH 467/721] NFSv3: use nfs_add_or_obtain() to create and reference inodes Signed-off-by: Benjamin Coddington Signed-off-by: Anna Schumaker --- fs/nfs/nfs3proc.c | 45 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index a3ad2d46fd42..9eb2f1a503ab 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -279,15 +279,17 @@ static struct nfs3_createdata *nfs3_alloc_createdata(void) return data; } -static int nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_createdata *data) +static struct dentry * +nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_createdata *data) { int status; status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0); nfs_post_op_update_inode(dir, data->res.dir_attr); - if (status == 0) - status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL); - return status; + if (status != 0) + return ERR_PTR(status); + + return nfs_add_or_obtain(dentry, data->res.fh, data->res.fattr, NULL); } static void nfs3_free_createdata(struct nfs3_createdata *data) @@ -304,6 +306,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, { struct posix_acl *default_acl, *acl; struct nfs3_createdata *data; + struct dentry *d_alias; int status = -ENOMEM; dprintk("NFS call create %pd\n", dentry); @@ -330,7 +333,8 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, goto out; for (;;) { - status = nfs3_do_create(dir, dentry, data); + d_alias = nfs3_do_create(dir, dentry, data); + status = PTR_ERR_OR_ZERO(d_alias); if (status != -ENOTSUPP) break; @@ -355,6 +359,9 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, if (status != 0) goto out_release_acls; + if (d_alias) + dentry = d_alias; + /* When we created the file with exclusive semantics, make * sure we set the attributes afterwards. */ if (data->arg.create.createmode == NFS3_CREATE_EXCLUSIVE) { @@ -372,11 +379,13 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, nfs_post_op_update_inode(d_inode(dentry), data->res.fattr); dprintk("NFS reply setattr (post-create): %d\n", status); if (status != 0) - goto out_release_acls; + goto out_dput; } status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl); +out_dput: + dput(d_alias); out_release_acls: posix_acl_release(acl); posix_acl_release(default_acl); @@ -504,6 +513,7 @@ nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, unsigned int len, struct iattr *sattr) { struct nfs3_createdata *data; + struct dentry *d_alias; int status = -ENOMEM; if (len > NFS3_MAXPATHLEN) @@ -522,7 +532,11 @@ nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, data->arg.symlink.pathlen = len; data->arg.symlink.sattr = sattr; - status = nfs3_do_create(dir, dentry, data); + d_alias = nfs3_do_create(dir, dentry, data); + status = PTR_ERR_OR_ZERO(d_alias); + + if (status == 0) + dput(d_alias); nfs3_free_createdata(data); out: @@ -535,6 +549,7 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) { struct posix_acl *default_acl, *acl; struct nfs3_createdata *data; + struct dentry *d_alias; int status = -ENOMEM; dprintk("NFS call mkdir %pd\n", dentry); @@ -553,12 +568,18 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) data->arg.mkdir.len = dentry->d_name.len; data->arg.mkdir.sattr = sattr; - status = nfs3_do_create(dir, dentry, data); + d_alias = nfs3_do_create(dir, dentry, data); + status = PTR_ERR_OR_ZERO(d_alias); + if (status != 0) goto out_release_acls; + if (d_alias) + dentry = d_alias; + status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl); + dput(d_alias); out_release_acls: posix_acl_release(acl); posix_acl_release(default_acl); @@ -660,6 +681,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, { struct posix_acl *default_acl, *acl; struct nfs3_createdata *data; + struct dentry *d_alias; int status = -ENOMEM; dprintk("NFS call mknod %pd %u:%u\n", dentry, @@ -698,12 +720,17 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, goto out; } - status = nfs3_do_create(dir, dentry, data); + d_alias = nfs3_do_create(dir, dentry, data); + status = PTR_ERR_OR_ZERO(d_alias); if (status != 0) goto out_release_acls; + if (d_alias) + dentry = d_alias; + status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl); + dput(d_alias); out_release_acls: posix_acl_release(acl); posix_acl_release(default_acl); From 581057c8346b9da51f1115768fd4189ed5eab19b Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Fri, 13 Sep 2019 08:29:04 -0400 Subject: [PATCH 468/721] NFS: remove unused check for negative dentry This check has been hanging out since we used to have parallel paths to add dentry in nfs_create(), but that hasn't been the case for some years. Signed-off-by: Benjamin Coddington Signed-off-by: Anna Schumaker --- fs/nfs/dir.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index d1bbf2fb6ac7..dd8b218785be 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1677,15 +1677,11 @@ nfs_add_or_obtain(struct dentry *dentry, struct nfs_fh *fhandle, struct dentry *parent = dget_parent(dentry); struct inode *dir = d_inode(parent); struct inode *inode; - struct dentry *d = NULL; + struct dentry *d; int error; d_drop(dentry); - /* We may have been initialized further down */ - if (d_really_is_positive(dentry)) - goto out; - if (fhandle->size == 0) { error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, NULL); if (error) From 9c47b18cf722184f32148784189fca945a7d0561 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 20 Sep 2019 07:23:40 -0400 Subject: [PATCH 469/721] pNFS: Ensure we do clear the return-on-close layout stateid on fatal errors IF the server rejected our layout return with a state error such as NFS4ERR_BAD_STATEID, or even a stale inode error, then we do want to clear out all the remaining layout segments and mark that stateid as invalid. Fixes: 1c5bd76d17cca ("pNFS: Enable layoutreturn operation for...") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/pnfs.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 4525d5acae38..0418b198edd3 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1449,10 +1449,15 @@ void pnfs_roc_release(struct nfs4_layoutreturn_args *args, const nfs4_stateid *res_stateid = NULL; struct nfs4_xdr_opaque_data *ld_private = args->ld_private; - if (ret == 0) { - arg_stateid = &args->stateid; + switch (ret) { + case -NFS4ERR_NOMATCHING_LAYOUT: + break; + case 0: if (res->lrs_present) res_stateid = &res->stateid; + /* Fallthrough */ + default: + arg_stateid = &args->stateid; } pnfs_layoutreturn_free_lsegs(lo, arg_stateid, &args->range, res_stateid); From f4ff4faf894d36b4aa243e241d4d47b4b8ba3c84 Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Thu, 5 Sep 2019 16:15:50 +0530 Subject: [PATCH 470/721] PCI: tegra: Add support to configure sideband pins Add support to configure sideband signal pins when the information is present in the respective controller device-tree node. Signed-off-by: Vidya Sagar Signed-off-by: Lorenzo Pieralisi [bhelgaas: fold in YueHaibing's fix for build error without CONFIG_PINCTRL; https://lore.kernel.org/r/20190920014807.38288-1-yuehaibing@huawei.com] Signed-off-by: Bjorn Helgaas Reviewed-by: Andrew Murray Acked-by: Thierry Reding --- drivers/pci/controller/dwc/pcie-tegra194.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-tegra194.c b/drivers/pci/controller/dwc/pcie-tegra194.c index 6056414c07d4..0b870287f6b6 100644 --- a/drivers/pci/controller/dwc/pcie-tegra194.c +++ b/drivers/pci/controller/dwc/pcie-tegra194.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -1311,8 +1312,13 @@ static int tegra_pcie_config_rp(struct tegra_pcie_dw *pcie) if (ret < 0) { dev_err(dev, "Failed to get runtime sync for PCIe dev: %d\n", ret); - pm_runtime_disable(dev); - return ret; + goto fail_pm_get_sync; + } + + ret = pinctrl_pm_select_default_state(dev); + if (ret < 0) { + dev_err(dev, "Failed to configure sideband pins: %d\n", ret); + goto fail_pinctrl; } tegra_pcie_init_controller(pcie); @@ -1339,7 +1345,9 @@ static int tegra_pcie_config_rp(struct tegra_pcie_dw *pcie) fail_host_init: tegra_pcie_deinit_controller(pcie); +fail_pinctrl: pm_runtime_put_sync(dev); +fail_pm_get_sync: pm_runtime_disable(dev); return ret; } From 0a901f2130802fdcabcd6f1cfabf4f62fa0d3cde Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Thu, 5 Sep 2019 16:15:51 +0530 Subject: [PATCH 471/721] PCI: tegra: Add support to enable slot regulators Add support to get regulator information of 3.3V and 12V supplies of a PCIe slot from the respective controller's device-tree node and enable those supplies. This is required in platforms like p2972-0000 where the supplies to x16 slot owned by C5 controller need to be enabled before attempting to enumerate the devices. Signed-off-by: Vidya Sagar Signed-off-by: Lorenzo Pieralisi Reviewed-by: Andrew Murray Acked-by: Thierry Reding --- drivers/pci/controller/dwc/pcie-tegra194.c | 83 ++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/drivers/pci/controller/dwc/pcie-tegra194.c b/drivers/pci/controller/dwc/pcie-tegra194.c index 0b870287f6b6..f89f5acee72d 100644 --- a/drivers/pci/controller/dwc/pcie-tegra194.c +++ b/drivers/pci/controller/dwc/pcie-tegra194.c @@ -278,6 +278,8 @@ struct tegra_pcie_dw { u32 aspm_l0s_enter_lat; struct regulator *pex_ctl_supply; + struct regulator *slot_ctl_3v3; + struct regulator *slot_ctl_12v; unsigned int phy_count; struct phy **phys; @@ -1055,6 +1057,73 @@ static void tegra_pcie_downstream_dev_to_D0(struct tegra_pcie_dw *pcie) } } +static int tegra_pcie_get_slot_regulators(struct tegra_pcie_dw *pcie) +{ + pcie->slot_ctl_3v3 = devm_regulator_get_optional(pcie->dev, "vpcie3v3"); + if (IS_ERR(pcie->slot_ctl_3v3)) { + if (PTR_ERR(pcie->slot_ctl_3v3) != -ENODEV) + return PTR_ERR(pcie->slot_ctl_3v3); + + pcie->slot_ctl_3v3 = NULL; + } + + pcie->slot_ctl_12v = devm_regulator_get_optional(pcie->dev, "vpcie12v"); + if (IS_ERR(pcie->slot_ctl_12v)) { + if (PTR_ERR(pcie->slot_ctl_12v) != -ENODEV) + return PTR_ERR(pcie->slot_ctl_12v); + + pcie->slot_ctl_12v = NULL; + } + + return 0; +} + +static int tegra_pcie_enable_slot_regulators(struct tegra_pcie_dw *pcie) +{ + int ret; + + if (pcie->slot_ctl_3v3) { + ret = regulator_enable(pcie->slot_ctl_3v3); + if (ret < 0) { + dev_err(pcie->dev, + "Failed to enable 3.3V slot supply: %d\n", ret); + return ret; + } + } + + if (pcie->slot_ctl_12v) { + ret = regulator_enable(pcie->slot_ctl_12v); + if (ret < 0) { + dev_err(pcie->dev, + "Failed to enable 12V slot supply: %d\n", ret); + goto fail_12v_enable; + } + } + + /* + * According to PCI Express Card Electromechanical Specification + * Revision 1.1, Table-2.4, T_PVPERL (Power stable to PERST# inactive) + * should be a minimum of 100ms. + */ + if (pcie->slot_ctl_3v3 || pcie->slot_ctl_12v) + msleep(100); + + return 0; + +fail_12v_enable: + if (pcie->slot_ctl_3v3) + regulator_disable(pcie->slot_ctl_3v3); + return ret; +} + +static void tegra_pcie_disable_slot_regulators(struct tegra_pcie_dw *pcie) +{ + if (pcie->slot_ctl_12v) + regulator_disable(pcie->slot_ctl_12v); + if (pcie->slot_ctl_3v3) + regulator_disable(pcie->slot_ctl_3v3); +} + static int tegra_pcie_config_controller(struct tegra_pcie_dw *pcie, bool en_hw_hot_rst) { @@ -1068,6 +1137,10 @@ static int tegra_pcie_config_controller(struct tegra_pcie_dw *pcie, return ret; } + ret = tegra_pcie_enable_slot_regulators(pcie); + if (ret < 0) + goto fail_slot_reg_en; + ret = regulator_enable(pcie->pex_ctl_supply); if (ret < 0) { dev_err(pcie->dev, "Failed to enable regulator: %d\n", ret); @@ -1150,6 +1223,8 @@ static int tegra_pcie_config_controller(struct tegra_pcie_dw *pcie, fail_core_clk: regulator_disable(pcie->pex_ctl_supply); fail_reg_en: + tegra_pcie_disable_slot_regulators(pcie); +fail_slot_reg_en: tegra_pcie_bpmp_set_ctrl_state(pcie, false); return ret; @@ -1182,6 +1257,8 @@ static int __deinit_controller(struct tegra_pcie_dw *pcie) return ret; } + tegra_pcie_disable_slot_regulators(pcie); + ret = tegra_pcie_bpmp_set_ctrl_state(pcie, false); if (ret) { dev_err(pcie->dev, "Failed to disable controller %d: %d\n", @@ -1381,6 +1458,12 @@ static int tegra_pcie_dw_probe(struct platform_device *pdev) return ret; } + ret = tegra_pcie_get_slot_regulators(pcie); + if (ret < 0) { + dev_err(dev, "Failed to get slot regulators: %d\n", ret); + return ret; + } + pcie->pex_ctl_supply = devm_regulator_get(dev, "vddio-pex-ctl"); if (IS_ERR(pcie->pex_ctl_supply)) { ret = PTR_ERR(pcie->pex_ctl_supply); From dbb72e2c305b269652eb97fb3544fcb474096a2a Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Thu, 5 Sep 2019 16:15:52 +0530 Subject: [PATCH 472/721] arm64: tegra: Add configuration for PCIe C5 sideband signals Add support to configure PCIe C5's sideband signals PERST# and CLKREQ# as output and bi-directional signals respectively which unlike other PCIe controllers sideband signals are not configured by default. Signed-off-by: Vidya Sagar Signed-off-by: Lorenzo Pieralisi Reviewed-by: Andrew Murray --- arch/arm64/boot/dts/nvidia/tegra194.dtsi | 38 +++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/nvidia/tegra194.dtsi b/arch/arm64/boot/dts/nvidia/tegra194.dtsi index adebbbf36bd0..3c0cf54f0aab 100644 --- a/arch/arm64/boot/dts/nvidia/tegra194.dtsi +++ b/arch/arm64/boot/dts/nvidia/tegra194.dtsi @@ -3,8 +3,9 @@ #include #include #include -#include +#include #include +#include #include / { @@ -130,6 +131,38 @@ }; }; + pinmux: pinmux@2430000 { + compatible = "nvidia,tegra194-pinmux"; + reg = <0x2430000 0x17000 + 0xc300000 0x4000>; + + status = "okay"; + + pex_rst_c5_out_state: pex_rst_c5_out { + pex_rst { + nvidia,pins = "pex_l5_rst_n_pgg1"; + nvidia,schmitt = ; + nvidia,lpdr = ; + nvidia,enable-input = ; + nvidia,io-high-voltage = ; + nvidia,tristate = ; + nvidia,pull = ; + }; + }; + + clkreq_c5_bi_dir_state: clkreq_c5_bi_dir { + clkreq { + nvidia,pins = "pex_l5_clkreq_n_pgg0"; + nvidia,schmitt = ; + nvidia,lpdr = ; + nvidia,enable-input = ; + nvidia,io-high-voltage = ; + nvidia,tristate = ; + nvidia,pull = ; + }; + }; + }; + uarta: serial@3100000 { compatible = "nvidia,tegra194-uart", "nvidia,tegra20-uart"; reg = <0x03100000 0x40>; @@ -1365,6 +1398,9 @@ num-viewport = <8>; linux,pci-domain = <5>; + pinctrl-names = "default"; + pinctrl-0 = <&pex_rst_c5_out_state>, <&clkreq_c5_bi_dir_state>; + clocks = <&bpmp TEGRA194_CLK_PEX1_CORE_5>, <&bpmp TEGRA194_CLK_PEX1_CORE_5M>; clock-names = "core", "core_m"; From 09a0774a183d4d9fdfa3d6e471fe09982cac2702 Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Thu, 5 Sep 2019 16:15:53 +0530 Subject: [PATCH 473/721] arm64: tegra: Add PCIe slot supply information in p2972-0000 platform Add 3.3V and 12V supplies regulators information of x16 PCIe slot in p2972-0000 platform which is owned by C5 controller and also enable C5 controller. Signed-off-by: Vidya Sagar Signed-off-by: Lorenzo Pieralisi Reviewed-by: Andrew Murray --- .../arm64/boot/dts/nvidia/tegra194-p2888.dtsi | 24 +++++++++++++++++++ .../boot/dts/nvidia/tegra194-p2972-0000.dts | 4 +++- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/nvidia/tegra194-p2888.dtsi b/arch/arm64/boot/dts/nvidia/tegra194-p2888.dtsi index 62e07e1197cc..4c38426a6969 100644 --- a/arch/arm64/boot/dts/nvidia/tegra194-p2888.dtsi +++ b/arch/arm64/boot/dts/nvidia/tegra194-p2888.dtsi @@ -289,5 +289,29 @@ gpio = <&gpio TEGRA194_MAIN_GPIO(A, 3) GPIO_ACTIVE_HIGH>; enable-active-high; }; + + vdd_3v3_pcie: regulator@2 { + compatible = "regulator-fixed"; + reg = <2>; + + regulator-name = "PEX_3V3"; + regulator-min-microvolt = <3300000>; + regulator-max-microvolt = <3300000>; + gpio = <&gpio TEGRA194_MAIN_GPIO(Z, 2) GPIO_ACTIVE_HIGH>; + regulator-boot-on; + enable-active-high; + }; + + vdd_12v_pcie: regulator@3 { + compatible = "regulator-fixed"; + reg = <3>; + + regulator-name = "VDD_12V"; + regulator-min-microvolt = <1200000>; + regulator-max-microvolt = <1200000>; + gpio = <&gpio TEGRA194_MAIN_GPIO(A, 1) GPIO_ACTIVE_LOW>; + regulator-boot-on; + enable-active-low; + }; }; }; diff --git a/arch/arm64/boot/dts/nvidia/tegra194-p2972-0000.dts b/arch/arm64/boot/dts/nvidia/tegra194-p2972-0000.dts index 23597d53c9c9..d47cd8c4dd24 100644 --- a/arch/arm64/boot/dts/nvidia/tegra194-p2972-0000.dts +++ b/arch/arm64/boot/dts/nvidia/tegra194-p2972-0000.dts @@ -93,9 +93,11 @@ }; pcie@141a0000 { - status = "disabled"; + status = "okay"; vddio-pex-ctl-supply = <&vdd_1v8ao>; + vpcie3v3-supply = <&vdd_3v3_pcie>; + vpcie12v-supply = <&vdd_12v_pcie>; phys = <&p2u_nvhs_0>, <&p2u_nvhs_1>, <&p2u_nvhs_2>, <&p2u_nvhs_3>, <&p2u_nvhs_4>, <&p2u_nvhs_5>, From 287a9c558b9b825b3af36731bb09b06621f3e744 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 20 Sep 2019 07:23:41 -0400 Subject: [PATCH 474/721] NFSv4: Clean up pNFS return-on-close error handling Both close and delegreturn have identical code to handle pNFS return-on-close. This patch refactors that code and places it in pnfs.c Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 66 +++++++---------------------------------------- fs/nfs/pnfs.c | 27 +++++++++++++++++++ fs/nfs/pnfs.h | 13 ++++++++++ 3 files changed, 50 insertions(+), 56 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 00c7a92e3d6b..a7cecac5a4ec 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3363,32 +3363,11 @@ static void nfs4_close_done(struct rpc_task *task, void *data) trace_nfs4_close(state, &calldata->arg, &calldata->res, task->tk_status); /* Handle Layoutreturn errors */ - if (calldata->arg.lr_args && task->tk_status != 0) { - switch (calldata->res.lr_ret) { - default: - calldata->res.lr_ret = -NFS4ERR_NOMATCHING_LAYOUT; - break; - case 0: - calldata->arg.lr_args = NULL; - calldata->res.lr_res = NULL; - break; - case -NFS4ERR_OLD_STATEID: - if (nfs4_layoutreturn_refresh_stateid(&calldata->arg.lr_args->stateid, - &calldata->arg.lr_args->range, - calldata->inode)) - goto lr_restart; - /* Fallthrough */ - case -NFS4ERR_ADMIN_REVOKED: - case -NFS4ERR_DELEG_REVOKED: - case -NFS4ERR_EXPIRED: - case -NFS4ERR_BAD_STATEID: - case -NFS4ERR_UNKNOWN_LAYOUTTYPE: - case -NFS4ERR_WRONG_CRED: - calldata->arg.lr_args = NULL; - calldata->res.lr_res = NULL; - goto lr_restart; - } - } + if (pnfs_roc_done(task, calldata->inode, + &calldata->arg.lr_args, + &calldata->res.lr_res, + &calldata->res.lr_ret) == -EAGAIN) + goto out_restart; /* hmm. we are done with the inode, and in the process of freeing * the state_owner. we keep this around to process errors @@ -3435,8 +3414,6 @@ static void nfs4_close_done(struct rpc_task *task, void *data) nfs_refresh_inode(calldata->inode, &calldata->fattr); dprintk("%s: done, ret = %d!\n", __func__, task->tk_status); return; -lr_restart: - calldata->res.lr_ret = 0; out_restart: task->tk_status = 0; rpc_restart_call_prepare(task); @@ -6128,32 +6105,11 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) trace_nfs4_delegreturn_exit(&data->args, &data->res, task->tk_status); /* Handle Layoutreturn errors */ - if (data->args.lr_args && task->tk_status != 0) { - switch(data->res.lr_ret) { - default: - data->res.lr_ret = -NFS4ERR_NOMATCHING_LAYOUT; - break; - case 0: - data->args.lr_args = NULL; - data->res.lr_res = NULL; - break; - case -NFS4ERR_OLD_STATEID: - if (nfs4_layoutreturn_refresh_stateid(&data->args.lr_args->stateid, - &data->args.lr_args->range, - data->inode)) - goto lr_restart; - /* Fallthrough */ - case -NFS4ERR_ADMIN_REVOKED: - case -NFS4ERR_DELEG_REVOKED: - case -NFS4ERR_EXPIRED: - case -NFS4ERR_BAD_STATEID: - case -NFS4ERR_UNKNOWN_LAYOUTTYPE: - case -NFS4ERR_WRONG_CRED: - data->args.lr_args = NULL; - data->res.lr_res = NULL; - goto lr_restart; - } - } + if (pnfs_roc_done(task, data->inode, + &data->args.lr_args, + &data->res.lr_res, + &data->res.lr_ret) == -EAGAIN) + goto out_restart; switch (task->tk_status) { case 0: @@ -6191,8 +6147,6 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) } data->rpc_status = task->tk_status; return; -lr_restart: - data->res.lr_ret = 0; out_restart: task->tk_status = 0; rpc_restart_call_prepare(task); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 0418b198edd3..8769422a12f5 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1440,6 +1440,33 @@ bool pnfs_roc(struct inode *ino, return false; } +int pnfs_roc_done(struct rpc_task *task, struct inode *inode, + struct nfs4_layoutreturn_args **argpp, + struct nfs4_layoutreturn_res **respp, + int *ret) +{ + struct nfs4_layoutreturn_args *arg = *argpp; + int retval = -EAGAIN; + + if (!arg) + return 0; + /* Handle Layoutreturn errors */ + switch (*ret) { + case 0: + retval = 0; + break; + case -NFS4ERR_OLD_STATEID: + if (!nfs4_layoutreturn_refresh_stateid(&arg->stateid, + &arg->range, inode)) + break; + *ret = -NFS4ERR_NOMATCHING_LAYOUT; + return -EAGAIN; + } + *argpp = NULL; + *respp = NULL; + return retval; +} + void pnfs_roc_release(struct nfs4_layoutreturn_args *args, struct nfs4_layoutreturn_res *res, int ret) diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index f15609c003d8..3ef3756d437c 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -282,6 +282,10 @@ bool pnfs_roc(struct inode *ino, struct nfs4_layoutreturn_args *args, struct nfs4_layoutreturn_res *res, const struct cred *cred); +int pnfs_roc_done(struct rpc_task *task, struct inode *inode, + struct nfs4_layoutreturn_args **argpp, + struct nfs4_layoutreturn_res **respp, + int *ret); void pnfs_roc_release(struct nfs4_layoutreturn_args *args, struct nfs4_layoutreturn_res *res, int ret); @@ -701,6 +705,15 @@ pnfs_roc(struct inode *ino, return false; } +static inline int +pnfs_roc_done(struct rpc_task *task, struct inode *inode, + struct nfs4_layoutreturn_args **argpp, + struct nfs4_layoutreturn_res **respp, + int *ret) +{ + return 0; +} + static inline void pnfs_roc_release(struct nfs4_layoutreturn_args *args, struct nfs4_layoutreturn_res *res, From 078a432d1c6afc2e70c6c5a0ce8ff29ce2a2e34a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 20 Sep 2019 07:23:42 -0400 Subject: [PATCH 475/721] NFSv4: Handle NFS4ERR_DELAY correctly in return-on-close If the server sends a NFS4ERR_DELAY, then allow the caller to retry. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/pnfs.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 8769422a12f5..6436047dc999 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1455,6 +1455,10 @@ int pnfs_roc_done(struct rpc_task *task, struct inode *inode, case 0: retval = 0; break; + case -NFS4ERR_DELAY: + /* Let the caller handle the retry */ + *ret = -NFS4ERR_NOMATCHING_LAYOUT; + return 0; case -NFS4ERR_OLD_STATEID: if (!nfs4_layoutreturn_refresh_stateid(&arg->stateid, &arg->range, inode)) From 6109bcf7130126d24a9ec9d881c665d5d9ab0eb4 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 20 Sep 2019 07:23:43 -0400 Subject: [PATCH 476/721] NFSv4: Handle RPC level errors in LAYOUTRETURN Handle RPC level errors by assuming that the RPC call was successful. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 9 +++++++++ fs/nfs/pnfs.c | 15 +++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index a7cecac5a4ec..c0bff7ad0c9f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -9051,6 +9051,15 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) if (!nfs41_sequence_process(task, &lrp->res.seq_res)) return; + /* + * Was there an RPC level error? Assume the call succeeded, + * and that we need to release the layout + */ + if (task->tk_rpc_status != 0 && RPC_WAS_SENT(task)) { + lrp->res.lrs_present = 0; + return; + } + server = NFS_SERVER(lrp->args.inode); switch (task->tk_status) { case -NFS4ERR_OLD_STATEID: diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 6436047dc999..abc7188f1853 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1455,6 +1455,21 @@ int pnfs_roc_done(struct rpc_task *task, struct inode *inode, case 0: retval = 0; break; + case -NFS4ERR_NOMATCHING_LAYOUT: + /* Was there an RPC level error? If not, retry */ + if (task->tk_rpc_status == 0) + break; + /* If the call was not sent, let caller handle it */ + if (!RPC_WAS_SENT(task)) + return 0; + /* + * Otherwise, assume the call succeeded and + * that we need to release the layout + */ + *ret = 0; + (*respp)->lrs_present = 0; + retval = 0; + break; case -NFS4ERR_DELAY: /* Let the caller handle the retry */ *ret = -NFS4ERR_NOMATCHING_LAYOUT; From 922839570920c024baf740ef45dfa98009f0192e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 20 Sep 2019 07:23:44 -0400 Subject: [PATCH 477/721] NFSv4: Add a helper to increment stateid seqids Add a helper function to increment stateid seqids according to the rules specified in RFC5661 Section 8.2.2. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/nfs4_fs.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 3564da1ba8a1..e8f74ed98e42 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -574,6 +574,15 @@ static inline bool nfs4_stateid_is_newer(const nfs4_stateid *s1, const nfs4_stat return (s32)(be32_to_cpu(s1->seqid) - be32_to_cpu(s2->seqid)) > 0; } +static inline void nfs4_stateid_seqid_inc(nfs4_stateid *s1) +{ + u32 seqid = be32_to_cpu(s1->seqid); + + if (++seqid == 0) + ++seqid; + s1->seqid = cpu_to_be32(seqid); +} + static inline bool nfs4_valid_open_stateid(const struct nfs4_state *state) { return test_bit(NFS_STATE_RECOVERY_FAILED, &state->flags) == 0; From 30cb3ee299cbf343fe07419e9b0f8dc305979eb6 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 20 Sep 2019 07:23:45 -0400 Subject: [PATCH 478/721] pNFS: Handle NFS4ERR_OLD_STATEID on layoutreturn by bumping the state seqid If a LAYOUTRETURN receives a reply of NFS4ERR_OLD_STATEID then assume we've missed an update, and just bump the stateid. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 2 +- fs/nfs/pnfs.c | 18 ++++++++++++++---- fs/nfs/pnfs.h | 4 ++-- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index c0bff7ad0c9f..27964ea0b3b7 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -9063,7 +9063,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) server = NFS_SERVER(lrp->args.inode); switch (task->tk_status) { case -NFS4ERR_OLD_STATEID: - if (nfs4_layoutreturn_refresh_stateid(&lrp->args.stateid, + if (nfs4_layout_refresh_old_stateid(&lrp->args.stateid, &lrp->args.range, lrp->args.inode)) goto out_restart; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index abc7188f1853..bb80034a7661 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -359,9 +359,10 @@ pnfs_clear_lseg_state(struct pnfs_layout_segment *lseg, } /* - * Update the seqid of a layout stateid + * Update the seqid of a layout stateid after receiving + * NFS4ERR_OLD_STATEID */ -bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst, +bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst, struct pnfs_layout_range *dst_range, struct inode *inode) { @@ -377,7 +378,15 @@ bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst, spin_lock(&inode->i_lock); lo = NFS_I(inode)->layout; - if (lo && nfs4_stateid_match_other(dst, &lo->plh_stateid)) { + if (lo && pnfs_layout_is_valid(lo) && + nfs4_stateid_match_other(dst, &lo->plh_stateid)) { + /* Is our call using the most recent seqid? If so, bump it */ + if (!nfs4_stateid_is_newer(&lo->plh_stateid, dst)) { + nfs4_stateid_seqid_inc(dst); + ret = true; + goto out; + } + /* Try to update the seqid to the most recent */ err = pnfs_mark_matching_lsegs_return(lo, &head, &range, 0); if (err != -EBUSY) { dst->seqid = lo->plh_stateid.seqid; @@ -385,6 +394,7 @@ bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst, ret = true; } } +out: spin_unlock(&inode->i_lock); pnfs_free_lseg_list(&head); return ret; @@ -1475,7 +1485,7 @@ int pnfs_roc_done(struct rpc_task *task, struct inode *inode, *ret = -NFS4ERR_NOMATCHING_LAYOUT; return 0; case -NFS4ERR_OLD_STATEID: - if (!nfs4_layoutreturn_refresh_stateid(&arg->stateid, + if (!nfs4_layout_refresh_old_stateid(&arg->stateid, &arg->range, inode)) break; *ret = -NFS4ERR_NOMATCHING_LAYOUT; diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 3ef3756d437c..f8a38065c7e4 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -261,7 +261,7 @@ int pnfs_destroy_layouts_byfsid(struct nfs_client *clp, bool is_recall); int pnfs_destroy_layouts_byclid(struct nfs_client *clp, bool is_recall); -bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst, +bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst, struct pnfs_layout_range *dst_range, struct inode *inode); void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo); @@ -798,7 +798,7 @@ static inline void nfs4_pnfs_v3_ds_connect_unload(void) { } -static inline bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst, +static inline bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst, struct pnfs_layout_range *dst_range, struct inode *inode) { From e217e825dca8d2d80c371b43da7257adc499404a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 20 Sep 2019 07:23:46 -0400 Subject: [PATCH 479/721] NFSv4: Fix OPEN_DOWNGRADE error handling If OPEN_DOWNGRADE returns a state error, then we want to initiate state recovery in addition to marking the stateid as closed. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 27964ea0b3b7..9e283a8e8e93 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3399,7 +3399,9 @@ static void nfs4_close_done(struct rpc_task *task, void *data) task->tk_msg.rpc_cred); /* Fallthrough */ case -NFS4ERR_BAD_STATEID: - break; + if (calldata->arg.fmode == 0) + break; + /* Fallthrough */ default: task->tk_status = nfs4_async_handle_exception(task, server, task->tk_status, &exception); From 0e0cb35b417f505447694463694aff75fca32889 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 20 Sep 2019 07:23:47 -0400 Subject: [PATCH 480/721] NFSv4: Handle NFS4ERR_OLD_STATEID in CLOSE/OPEN_DOWNGRADE If a CLOSE or OPEN_DOWNGRADE operation receives a NFS4ERR_OLD_STATEID then bump the seqid before resending. Ensure we only bump the seqid by 1. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/nfs4_fs.h | 2 -- fs/nfs/nfs4proc.c | 75 ++++++++++++++++++++++++++++++++++++++++++++-- fs/nfs/nfs4state.c | 16 ---------- 3 files changed, 72 insertions(+), 21 deletions(-) diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index e8f74ed98e42..16b2e5cc3e94 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -491,8 +491,6 @@ extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); extern int nfs4_select_rw_stateid(struct nfs4_state *, fmode_t, const struct nfs_lock_context *, nfs4_stateid *, const struct cred **); -extern bool nfs4_refresh_open_stateid(nfs4_stateid *dst, - struct nfs4_state *state); extern bool nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 9e283a8e8e93..0949bb535d7c 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3313,6 +3313,75 @@ nfs4_wait_on_layoutreturn(struct inode *inode, struct rpc_task *task) return pnfs_wait_on_layoutreturn(inode, task); } +/* + * Update the seqid of an open stateid + */ +static void nfs4_sync_open_stateid(nfs4_stateid *dst, + struct nfs4_state *state) +{ + __be32 seqid_open; + u32 dst_seqid; + int seq; + + for (;;) { + if (!nfs4_valid_open_stateid(state)) + break; + seq = read_seqbegin(&state->seqlock); + if (!nfs4_state_match_open_stateid_other(state, dst)) { + nfs4_stateid_copy(dst, &state->open_stateid); + if (read_seqretry(&state->seqlock, seq)) + continue; + break; + } + seqid_open = state->open_stateid.seqid; + if (read_seqretry(&state->seqlock, seq)) + continue; + + dst_seqid = be32_to_cpu(dst->seqid); + if ((s32)(dst_seqid - be32_to_cpu(seqid_open)) < 0) + dst->seqid = seqid_open; + break; + } +} + +/* + * Update the seqid of an open stateid after receiving + * NFS4ERR_OLD_STATEID + */ +static bool nfs4_refresh_open_old_stateid(nfs4_stateid *dst, + struct nfs4_state *state) +{ + __be32 seqid_open; + u32 dst_seqid; + bool ret; + int seq; + + for (;;) { + ret = false; + if (!nfs4_valid_open_stateid(state)) + break; + seq = read_seqbegin(&state->seqlock); + if (!nfs4_state_match_open_stateid_other(state, dst)) { + if (read_seqretry(&state->seqlock, seq)) + continue; + break; + } + seqid_open = state->open_stateid.seqid; + if (read_seqretry(&state->seqlock, seq)) + continue; + + dst_seqid = be32_to_cpu(dst->seqid); + if ((s32)(dst_seqid - be32_to_cpu(seqid_open)) >= 0) + dst->seqid = cpu_to_be32(dst_seqid + 1); + else + dst->seqid = seqid_open; + ret = true; + break; + } + + return ret; +} + struct nfs4_closedata { struct inode *inode; struct nfs4_state *state; @@ -3387,7 +3456,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data) break; case -NFS4ERR_OLD_STATEID: /* Did we race with OPEN? */ - if (nfs4_refresh_open_stateid(&calldata->arg.stateid, + if (nfs4_refresh_open_old_stateid(&calldata->arg.stateid, state)) goto out_restart; goto out_release; @@ -3456,8 +3525,8 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) } else if (is_rdwr) calldata->arg.fmode |= FMODE_READ|FMODE_WRITE; - if (!nfs4_valid_open_stateid(state) || - !nfs4_refresh_open_stateid(&calldata->arg.stateid, state)) + nfs4_sync_open_stateid(&calldata->arg.stateid, state); + if (!nfs4_valid_open_stateid(state)) call_close = 0; spin_unlock(&state->owner->so_lock); diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index e916aba7a799..0c6d53dc3672 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1015,22 +1015,6 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst, return ret; } -bool nfs4_refresh_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) -{ - bool ret; - int seq; - - do { - ret = false; - seq = read_seqbegin(&state->seqlock); - if (nfs4_state_match_open_stateid_other(state, dst)) { - dst->seqid = state->open_stateid.seqid; - ret = true; - } - } while (read_seqretry(&state->seqlock, seq)); - return ret; -} - bool nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) { bool ret; From 32c6e7eee39999084448b15e96a0e3c7d2f9021e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 20 Sep 2019 07:23:48 -0400 Subject: [PATCH 481/721] NFSv4: Handle NFS4ERR_OLD_STATEID in LOCKU If a LOCKU request receives a NFS4ERR_OLD_STATEID, then bump the seqid before resending. Ensure we only bump the seqid by 1. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 53 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 48 insertions(+), 5 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 0949bb535d7c..11eafcfc490b 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6410,6 +6410,42 @@ static int nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock * return err; } +/* + * Update the seqid of a lock stateid after receiving + * NFS4ERR_OLD_STATEID + */ +static bool nfs4_refresh_lock_old_stateid(nfs4_stateid *dst, + struct nfs4_lock_state *lsp) +{ + struct nfs4_state *state = lsp->ls_state; + bool ret = false; + + spin_lock(&state->state_lock); + if (!nfs4_stateid_match_other(dst, &lsp->ls_stateid)) + goto out; + if (!nfs4_stateid_is_newer(&lsp->ls_stateid, dst)) + nfs4_stateid_seqid_inc(dst); + else + dst->seqid = lsp->ls_stateid.seqid; + ret = true; +out: + spin_unlock(&state->state_lock); + return ret; +} + +static bool nfs4_sync_lock_stateid(nfs4_stateid *dst, + struct nfs4_lock_state *lsp) +{ + struct nfs4_state *state = lsp->ls_state; + bool ret; + + spin_lock(&state->state_lock); + ret = !nfs4_stateid_match_other(dst, &lsp->ls_stateid); + nfs4_stateid_copy(dst, &lsp->ls_stateid); + spin_unlock(&state->state_lock); + return ret; +} + struct nfs4_unlockdata { struct nfs_locku_args arg; struct nfs_locku_res res; @@ -6427,7 +6463,8 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, struct nfs_seqid *seqid) { struct nfs4_unlockdata *p; - struct inode *inode = lsp->ls_state->inode; + struct nfs4_state *state = lsp->ls_state; + struct inode *inode = state->inode; p = kzalloc(sizeof(*p), GFP_NOFS); if (p == NULL) @@ -6443,6 +6480,9 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, locks_init_lock(&p->fl); locks_copy_lock(&p->fl, fl); p->server = NFS_SERVER(inode); + spin_lock(&state->state_lock); + nfs4_stateid_copy(&p->arg.stateid, &lsp->ls_stateid); + spin_unlock(&state->state_lock); return p; } @@ -6481,10 +6521,14 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) task->tk_msg.rpc_cred); /* Fall through */ case -NFS4ERR_BAD_STATEID: - case -NFS4ERR_OLD_STATEID: case -NFS4ERR_STALE_STATEID: - if (!nfs4_stateid_match(&calldata->arg.stateid, - &calldata->lsp->ls_stateid)) + if (nfs4_sync_lock_stateid(&calldata->arg.stateid, + calldata->lsp)) + rpc_restart_call_prepare(task); + break; + case -NFS4ERR_OLD_STATEID: + if (nfs4_refresh_lock_old_stateid(&calldata->arg.stateid, + calldata->lsp)) rpc_restart_call_prepare(task); break; default: @@ -6507,7 +6551,6 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data) if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) goto out_wait; - nfs4_stateid_copy(&calldata->arg.stateid, &calldata->lsp->ls_stateid); if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) { /* Note: exit _without_ running nfs4_locku_done */ goto out_no_action; From 0d8006ddbe89cbaedef06a8789ddefa1164a3a77 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 4 Sep 2019 22:26:00 +1000 Subject: [PATCH 482/721] PCI: Add pci_irq_vector() and other stubs when !CONFIG_PCI Add stub functions pci_alloc_irq_vectors_affinity() and pci_irq_vector() when CONFIG_PCI is off so drivers can use them without resorting to ifdefs. Also move the PCI_IRQ_* macros outside of the ifdefs so they are always available. Fixes: 625f269a5a7a ("crypto: inside-secure - add support for PCI based FPGA development board") Link: https://lore.kernel.org/r/20190904122600.GA28660@gondor.apana.org.au Reported-by: kbuild test robot Reported-by: YueHaibing Signed-off-by: Herbert Xu Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/include/linux/pci.h b/include/linux/pci.h index f5bab312995d..23e4a6bcd08d 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -925,6 +925,11 @@ enum { PCI_SCAN_ALL_PCIE_DEVS = 0x00000040, /* Scan all, not just dev 0 */ }; +#define PCI_IRQ_LEGACY (1 << 0) /* Allow legacy interrupts */ +#define PCI_IRQ_MSI (1 << 1) /* Allow MSI interrupts */ +#define PCI_IRQ_MSIX (1 << 2) /* Allow MSI-X interrupts */ +#define PCI_IRQ_AFFINITY (1 << 3) /* Auto-assign affinity */ + /* These external functions are only available when PCI support is enabled */ #ifdef CONFIG_PCI @@ -1408,11 +1413,6 @@ resource_size_t pcibios_window_alignment(struct pci_bus *bus, int pci_set_vga_state(struct pci_dev *pdev, bool decode, unsigned int command_bits, u32 flags); -#define PCI_IRQ_LEGACY (1 << 0) /* Allow legacy interrupts */ -#define PCI_IRQ_MSI (1 << 1) /* Allow MSI interrupts */ -#define PCI_IRQ_MSIX (1 << 2) /* Allow MSI-X interrupts */ -#define PCI_IRQ_AFFINITY (1 << 3) /* Auto-assign affinity */ - /* * Virtual interrupts allow for more interrupts to be allocated * than the device has interrupts for. These are not programmed @@ -1517,14 +1517,6 @@ static inline int pci_irq_get_node(struct pci_dev *pdev, int vec) } #endif -static inline int -pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs, - unsigned int max_vecs, unsigned int flags) -{ - return pci_alloc_irq_vectors_affinity(dev, min_vecs, max_vecs, flags, - NULL); -} - /** * pci_irqd_intx_xlate() - Translate PCI INTx value to an IRQ domain hwirq * @d: the INTx IRQ domain @@ -1780,8 +1772,29 @@ static inline const struct pci_device_id *pci_match_id(const struct pci_device_i struct pci_dev *dev) { return NULL; } static inline bool pci_ats_disabled(void) { return true; } + +static inline int pci_irq_vector(struct pci_dev *dev, unsigned int nr) +{ + return -EINVAL; +} + +static inline int +pci_alloc_irq_vectors_affinity(struct pci_dev *dev, unsigned int min_vecs, + unsigned int max_vecs, unsigned int flags, + struct irq_affinity *aff_desc) +{ + return -ENOSPC; +} #endif /* CONFIG_PCI */ +static inline int +pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs, + unsigned int max_vecs, unsigned int flags) +{ + return pci_alloc_irq_vectors_affinity(dev, min_vecs, max_vecs, flags, + NULL); +} + #ifdef CONFIG_PCI_ATS /* Address Translation Service */ void pci_ats_init(struct pci_dev *dev); From eb09b3cc464d2c3bbde9a6648603c8d599ea8582 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 22 Sep 2019 10:01:05 -0600 Subject: [PATCH 483/721] pktcdvd: remove warning on attempting to register non-passthrough dev Anatoly reports that he gets the below warning when booting -git on a sparc64 box on debian unstable: ... [ 13.352975] aes_sparc64: Using sparc64 aes opcodes optimized AES implementation [ 13.428002] ------------[ cut here ]------------ [ 13.428081] WARNING: CPU: 21 PID: 586 at drivers/block/pktcdvd.c:2597 pkt_setup_dev+0x2e4/0x5a0 [pktcdvd] [ 13.428147] Attempt to register a non-SCSI queue [ 13.428184] Modules linked in: pktcdvd libdes cdrom aes_sparc64 n2_rng md5_sparc64 sha512_sparc64 rng_core sha256_sparc64 flash sha1_sparc64 ip_tables x_tables ipv6 crc_ccitt nf_defrag_ipv6 autofs4 ext4 crc16 mbcache jbd2 raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor xor async_tx raid6_pq raid1 raid0 multipath linear md_mod crc32c_sparc64 [ 13.428452] CPU: 21 PID: 586 Comm: pktsetup Not tainted 5.3.0-10169-g574cc4539762 #1234 [ 13.428507] Call Trace: [ 13.428542] [00000000004635c0] __warn+0xc0/0x100 [ 13.428582] [0000000000463634] warn_slowpath_fmt+0x34/0x60 [ 13.428626] [000000001045b244] pkt_setup_dev+0x2e4/0x5a0 [pktcdvd] [ 13.428674] [000000001045ccf4] pkt_ctl_ioctl+0x94/0x220 [pktcdvd] [ 13.428724] [00000000006b95c8] do_vfs_ioctl+0x628/0x6e0 [ 13.428764] [00000000006b96c8] ksys_ioctl+0x48/0x80 [ 13.428803] [00000000006b9714] sys_ioctl+0x14/0x40 [ 13.428847] [0000000000406294] linux_sparc_syscall+0x34/0x44 [ 13.428890] irq event stamp: 4181 [ 13.428924] hardirqs last enabled at (4189): [<00000000004e0a74>] console_unlock+0x634/0x6c0 [ 13.428984] hardirqs last disabled at (4196): [<00000000004e0540>] console_unlock+0x100/0x6c0 [ 13.429048] softirqs last enabled at (3978): [<0000000000b2e2d8>] __do_softirq+0x498/0x520 [ 13.429110] softirqs last disabled at (3967): [<000000000042cfb4>] do_softirq_own_stack+0x34/0x60 [ 13.429172] ---[ end trace 2220ca468f32967d ]--- [ 13.430018] pktcdvd: setup of pktcdvd device failed [ 13.455589] des_sparc64: Using sparc64 des opcodes optimized DES implementation [ 13.515334] camellia_sparc64: Using sparc64 camellia opcodes optimized CAMELLIA implementation [ 13.522856] pktcdvd: setup of pktcdvd device failed [ 13.529327] pktcdvd: setup of pktcdvd device failed [ 13.532932] pktcdvd: setup of pktcdvd device failed [ 13.536165] pktcdvd: setup of pktcdvd device failed [ 13.539372] pktcdvd: setup of pktcdvd device failed [ 13.542834] pktcdvd: setup of pktcdvd device failed [ 13.546536] pktcdvd: setup of pktcdvd device failed [ 15.431071] XFS (dm-0): Mounting V5 Filesystem ... Apparently debian auto-attaches any cdrom like device to pktcdvd, which can lead to the above warning. There's really no reason to warn for this situation, kill it. Reported-by: Anatoly Pugachev Signed-off-by: Jens Axboe --- drivers/block/pktcdvd.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 024060165afa..76457003f140 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2594,7 +2594,6 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) if (ret) return ret; if (!blk_queue_scsi_passthrough(bdev_get_queue(bdev))) { - WARN_ONCE(true, "Attempt to register a non-SCSI queue\n"); blkdev_put(bdev, FMODE_READ | FMODE_NDELAY); return -EINVAL; } From be21683e48f26032199504fb60b9a27eeff05fc3 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Sun, 22 Sep 2019 12:46:55 +0300 Subject: [PATCH 484/721] block: t10-pi: fix -Wswitch warning Changing the switch() statement to symbolic constants made the compiler (at least clang-9, did not check gcc) notice that there is one enum value that is not handled here: block/t10-pi.c:62:11: error: enumeration value 'T10_PI_TYPE0_PROTECTION' not handled in switch [-Werror,-Wswitch] Add a BUG_ON statement if we ever get to t10_pi_verify function with TYPE0 and replace the switch() statement with if/else clause for the valid types. Fixes: 9b2061b1a262 ("block: use symbolic constants for t10_pi type") Cc: Arnd Bergmann Signed-off-by: Max Gurtovoy Signed-off-by: Jens Axboe --- block/t10-pi.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/block/t10-pi.c b/block/t10-pi.c index 0c0120a672f9..9803c7e0376e 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -55,13 +55,14 @@ static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, { unsigned int i; + BUG_ON(type == T10_PI_TYPE0_PROTECTION); + for (i = 0 ; i < iter->data_size ; i += iter->interval) { struct t10_pi_tuple *pi = iter->prot_buf; __be16 csum; - switch (type) { - case T10_PI_TYPE1_PROTECTION: - case T10_PI_TYPE2_PROTECTION: + if (type == T10_PI_TYPE1_PROTECTION || + type == T10_PI_TYPE2_PROTECTION) { if (pi->app_tag == T10_PI_APP_ESCAPE) goto next; @@ -73,12 +74,10 @@ static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, iter->seed, be32_to_cpu(pi->ref_tag)); return BLK_STS_PROTECTION; } - break; - case T10_PI_TYPE3_PROTECTION: + } else if (type == T10_PI_TYPE3_PROTECTION) { if (pi->app_tag == T10_PI_APP_ESCAPE && pi->ref_tag == T10_PI_REF_ESCAPE) goto next; - break; } csum = fn(iter->data_buf, iter->interval); From 32960613b7c3352ddf38c42596e28a16ae36335e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 23 Sep 2019 11:05:34 -0600 Subject: [PATCH 485/721] io_uring: correctly handle non ->{read,write}_iter() file_operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently we just -EINVAL a read or write to an fd that isn't backed by ->read_iter() or ->write_iter(). But we can handle them just fine, as long as we punt fo async context first. Implement a simple loop function for doing ->read() or ->write() instead, and ensure we call it appropriately. Reported-by: 李通洲 Signed-off-by: Jens Axboe --- fs/io_uring.c | 60 +++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 54 insertions(+), 6 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 9d8e703bc851..ca7570aca430 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1298,6 +1298,51 @@ static void io_async_list_note(int rw, struct io_kiocb *req, size_t len) } } +/* + * For files that don't have ->read_iter() and ->write_iter(), handle them + * by looping over ->read() or ->write() manually. + */ +static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb, + struct iov_iter *iter) +{ + ssize_t ret = 0; + + /* + * Don't support polled IO through this interface, and we can't + * support non-blocking either. For the latter, this just causes + * the kiocb to be handled from an async context. + */ + if (kiocb->ki_flags & IOCB_HIPRI) + return -EOPNOTSUPP; + if (kiocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + + while (iov_iter_count(iter)) { + struct iovec iovec = iov_iter_iovec(iter); + ssize_t nr; + + if (rw == READ) { + nr = file->f_op->read(file, iovec.iov_base, + iovec.iov_len, &kiocb->ki_pos); + } else { + nr = file->f_op->write(file, iovec.iov_base, + iovec.iov_len, &kiocb->ki_pos); + } + + if (nr < 0) { + if (!ret) + ret = nr; + break; + } + ret += nr; + if (nr != iovec.iov_len) + break; + iov_iter_advance(iter, nr); + } + + return ret; +} + static int io_read(struct io_kiocb *req, const struct sqe_submit *s, bool force_nonblock) { @@ -1315,8 +1360,6 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s, if (unlikely(!(file->f_mode & FMODE_READ))) return -EBADF; - if (unlikely(!file->f_op->read_iter)) - return -EINVAL; ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter); if (ret < 0) @@ -1331,7 +1374,11 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s, if (!ret) { ssize_t ret2; - ret2 = call_read_iter(file, kiocb, &iter); + if (file->f_op->read_iter) + ret2 = call_read_iter(file, kiocb, &iter); + else + ret2 = loop_rw_iter(READ, file, kiocb, &iter); + /* * In case of a short read, punt to async. This can happen * if we have data partially cached. Alternatively we can @@ -1376,8 +1423,6 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s, file = kiocb->ki_filp; if (unlikely(!(file->f_mode & FMODE_WRITE))) return -EBADF; - if (unlikely(!file->f_op->write_iter)) - return -EINVAL; ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter); if (ret < 0) @@ -1415,7 +1460,10 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s, } kiocb->ki_flags |= IOCB_WRITE; - ret2 = call_write_iter(file, kiocb, &iter); + if (file->f_op->write_iter) + ret2 = call_write_iter(file, kiocb, &iter); + else + ret2 = loop_rw_iter(WRITE, file, kiocb, &iter); if (!force_nonblock || ret2 != -EAGAIN) { io_rw_done(kiocb, ret2); } else { From d46fe2cb2dce7f5038473b5859e03f5e16b7428e Mon Sep 17 00:00:00 2001 From: Martin Wilck Date: Mon, 23 Sep 2019 14:02:02 +0000 Subject: [PATCH 486/721] block: drop device references in bsg_queue_rq() Make sure that bsg_queue_rq() calls put_device() if an error is encountered after get_device() was successful. Fixes: cd2f076f1d7a ("bsg: convert to use blk-mq") Signed-off-by: Martin Wilck Signed-off-by: Jens Axboe --- block/bsg-lib.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/block/bsg-lib.c b/block/bsg-lib.c index 785dd58947f1..347dda16c2f4 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -266,6 +266,7 @@ static blk_status_t bsg_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req = bd->rq; struct bsg_set *bset = container_of(q->tag_set, struct bsg_set, tag_set); + int sts = BLK_STS_IOERR; int ret; blk_mq_start_request(req); @@ -274,14 +275,15 @@ static blk_status_t bsg_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_STS_IOERR; if (!bsg_prepare_job(dev, req)) - return BLK_STS_IOERR; + goto out; ret = bset->job_fn(blk_mq_rq_to_pdu(req)); - if (ret) - return BLK_STS_IOERR; + if (!ret) + sts = BLK_STS_OK; +out: put_device(dev); - return BLK_STS_OK; + return sts; } /* called right after the request is allocated for the request_queue */ From e20e174ca1bd98241b42d5ccfa228d8c6522e4e7 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 18 Sep 2019 17:50:45 -0700 Subject: [PATCH 487/721] xfs: convert inode to extent format after extent merge due to shift The collapse range operation can merge extents if two newly adjacent extents are physically contiguous. If the extent count is reduced on a btree format inode, a change to extent format might be necessary. This format change currently occurs as a side effect of the file size update after extents have been shifted for the collapse. This codepath ultimately calls xfs_bunmapi(), which happens to check for and execute the format conversion even if there were no blocks removed from the mapping. While this ultimately puts the inode into the correct state, the fact the format conversion occurs in a separate transaction from the change that called for it is a problem. If an extent shift transaction commits and the filesystem happens to crash before the format conversion, the inode fork is left in a corrupted state after log recovery. The inode fork verifier fails and xfs_repair ultimately nukes the inode. This problem was originally reproduced by generic/388. Similar to how the insert range extent split code handles extent to btree conversion, update the collapse range extent merge code to handle btree to extent format conversion in the same transaction that merges the extents. This ensures that the inode fork format remains consistent if the filesystem happens to crash in the middle of a collapse range operation that changes the inode fork format. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 054b4ce30033..eaf2d4250a26 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -5621,6 +5621,11 @@ xfs_bmse_merge( if (error) return error; + /* change to extent format if required after extent removal */ + error = xfs_bmap_btree_to_extents(tp, ip, cur, logflags, whichfork); + if (error) + return error; + done: xfs_iext_remove(ip, icur, 0); xfs_iext_prev(XFS_IFORK_PTR(ip, whichfork), icur); From 583e4eff98fab8d4d3a44114b44408b6a4ad0737 Mon Sep 17 00:00:00 2001 From: Aliasgar Surti Date: Mon, 23 Sep 2019 13:00:56 -0700 Subject: [PATCH 488/721] xfs: removed unneeded variable Returned value directly instead of using variable as it wasn't updated. Signed-off-by: Aliasgar Surti Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/scrub/alloc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index a43d1813c4ff..5533e48e605d 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -97,7 +97,6 @@ xchk_allocbt_rec( xfs_agnumber_t agno = bs->cur->bc_private.a.agno; xfs_agblock_t bno; xfs_extlen_t len; - int error = 0; bno = be32_to_cpu(rec->alloc.ar_startblock); len = be32_to_cpu(rec->alloc.ar_blockcount); @@ -109,7 +108,7 @@ xchk_allocbt_rec( xchk_allocbt_xref(bs->sc, bno, len); - return error; + return 0; } /* Scrub the freespace btrees for some AG. */ From ce840429260a98bcfe4aaf487bb07fa346d86c41 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 23 Sep 2019 13:02:41 -0700 Subject: [PATCH 489/721] xfs: revert 1baa2800e62d ("xfs: remove the unused XFS_ALLOC_USERDATA flag") Revert this commit, as it caused periodic regressions in xfs/173 w/ 1k blocks. [1] https://lore.kernel.org/lkml/20190919014602.GN15734@shao2-debian/ Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/libxfs/xfs_alloc.h | 7 ++++--- fs/xfs/libxfs/xfs_bmap.c | 8 ++++++-- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 58fa85cec325..d6ed5d2c07c2 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -81,9 +81,10 @@ typedef struct xfs_alloc_arg { /* * Defines for datatype */ -#define XFS_ALLOC_INITIAL_USER_DATA (1 << 0)/* special case start of file */ -#define XFS_ALLOC_USERDATA_ZERO (1 << 1)/* zero extent on allocation */ -#define XFS_ALLOC_NOBUSY (1 << 2)/* Busy extents not allowed */ +#define XFS_ALLOC_USERDATA (1 << 0)/* allocation is for user data*/ +#define XFS_ALLOC_INITIAL_USER_DATA (1 << 1)/* special case start of file */ +#define XFS_ALLOC_USERDATA_ZERO (1 << 2)/* zero extent on allocation */ +#define XFS_ALLOC_NOBUSY (1 << 3)/* Busy extents not allowed */ static inline bool xfs_alloc_is_userdata(int datatype) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index eaf2d4250a26..4edc25a2ba80 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4042,8 +4042,12 @@ xfs_bmapi_allocate( */ if (!(bma->flags & XFS_BMAPI_METADATA)) { bma->datatype = XFS_ALLOC_NOBUSY; - if (whichfork == XFS_DATA_FORK && bma->offset == 0) - bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA; + if (whichfork == XFS_DATA_FORK) { + if (bma->offset == 0) + bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA; + else + bma->datatype |= XFS_ALLOC_USERDATA; + } if (bma->flags & XFS_BMAPI_ZERO) bma->datatype |= XFS_ALLOC_USERDATA_ZERO; } From f3122a79a1b0a113d3aea748e0ec26f2cb2889de Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Tue, 17 Sep 2019 22:59:03 +0200 Subject: [PATCH 490/721] s390/topology: avoid firing events before kobjs are created arch_update_cpu_topology is first called from: kernel_init_freeable->sched_init_smp->sched_init_domains even before cpus has been registered in: kernel_init_freeable->do_one_initcall->s390_smp_init Do not trigger kobject_uevent change events until cpu devices are actually created. Fixes the following kasan findings: BUG: KASAN: global-out-of-bounds in kobject_uevent_env+0xb40/0xee0 Read of size 8 at addr 0000000000000020 by task swapper/0/1 BUG: KASAN: global-out-of-bounds in kobject_uevent_env+0xb36/0xee0 Read of size 8 at addr 0000000000000018 by task swapper/0/1 CPU: 0 PID: 1 Comm: swapper/0 Tainted: G B Hardware name: IBM 3906 M04 704 (LPAR) Call Trace: ([<0000000143c6db7e>] show_stack+0x14e/0x1a8) [<0000000145956498>] dump_stack+0x1d0/0x218 [<000000014429fb4c>] print_address_description+0x64/0x380 [<000000014429f630>] __kasan_report+0x138/0x168 [<0000000145960b96>] kobject_uevent_env+0xb36/0xee0 [<0000000143c7c47c>] arch_update_cpu_topology+0x104/0x108 [<0000000143df9e22>] sched_init_domains+0x62/0xe8 [<000000014644c94a>] sched_init_smp+0x3a/0xc0 [<0000000146433a20>] kernel_init_freeable+0x558/0x958 [<000000014599002a>] kernel_init+0x22/0x160 [<00000001459a71d4>] ret_from_fork+0x28/0x30 [<00000001459a71dc>] kernel_thread_starter+0x0/0x10 Cc: stable@vger.kernel.org Reviewed-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/topology.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 2db6fb405a9a..3627953007ed 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -311,7 +311,8 @@ int arch_update_cpu_topology(void) on_each_cpu(__arch_update_dedicated_flag, NULL, 0); for_each_online_cpu(cpu) { dev = get_cpu_device(cpu); - kobject_uevent(&dev->kobj, KOBJ_CHANGE); + if (dev) + kobject_uevent(&dev->kobj, KOBJ_CHANGE); } return rc; } From ea298e6ee8b34b3ed4366be7eb799d0650ebe555 Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Tue, 17 Sep 2019 20:04:04 +0200 Subject: [PATCH 491/721] s390/cio: avoid calling strlen on null pointer Fix the following kasan finding: BUG: KASAN: global-out-of-bounds in ccwgroup_create_dev+0x850/0x1140 Read of size 1 at addr 0000000000000000 by task systemd-udevd.r/561 CPU: 30 PID: 561 Comm: systemd-udevd.r Tainted: G B Hardware name: IBM 3906 M04 704 (LPAR) Call Trace: ([<0000000231b3db7e>] show_stack+0x14e/0x1a8) [<0000000233826410>] dump_stack+0x1d0/0x218 [<000000023216fac4>] print_address_description+0x64/0x380 [<000000023216f5a8>] __kasan_report+0x138/0x168 [<00000002331b8378>] ccwgroup_create_dev+0x850/0x1140 [<00000002332b618a>] group_store+0x3a/0x50 [<00000002323ac706>] kernfs_fop_write+0x246/0x3b8 [<00000002321d409a>] vfs_write+0x132/0x450 [<00000002321d47da>] ksys_write+0x122/0x208 [<0000000233877102>] system_call+0x2a6/0x2c8 Triggered by: openat(AT_FDCWD, "/sys/bus/ccwgroup/drivers/qeth/group", O_WRONLY|O_CREAT|O_TRUNC|O_CLOEXEC, 0666) = 16 write(16, "0.0.bd00,0.0.bd01,0.0.bd02", 26) = 26 The problem is that __get_next_id in ccwgroup_create_dev might set "buf" buffer pointer to NULL and explicit check for that is required. Cc: stable@vger.kernel.org Reviewed-by: Sebastian Ott Signed-off-by: Vasily Gorbik --- drivers/s390/cio/ccwgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/s390/cio/ccwgroup.c b/drivers/s390/cio/ccwgroup.c index c522e9313c50..ae66875a934d 100644 --- a/drivers/s390/cio/ccwgroup.c +++ b/drivers/s390/cio/ccwgroup.c @@ -372,7 +372,7 @@ int ccwgroup_create_dev(struct device *parent, struct ccwgroup_driver *gdrv, goto error; } /* Check for trailing stuff. */ - if (i == num_devices && strlen(buf) > 0) { + if (i == num_devices && buf && strlen(buf) > 0) { rc = -EINVAL; goto error; } From ab5758848039de9a4b249d46e4ab591197eebaf2 Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Thu, 19 Sep 2019 15:55:17 +0200 Subject: [PATCH 492/721] s390/cio: exclude subchannels with no parent from pseudo check ccw console is created early in start_kernel and used before css is initialized or ccw console subchannel is registered. Until then console subchannel does not have a parent. For that reason assume subchannels with no parent are not pseudo subchannels. This fixes the following kasan finding: BUG: KASAN: global-out-of-bounds in sch_is_pseudo_sch+0x8e/0x98 Read of size 8 at addr 00000000000005e8 by task swapper/0/0 CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.3.0-rc8-07370-g6ac43dd12538 #2 Hardware name: IBM 2964 NC9 702 (z/VM 6.4.0) Call Trace: ([<000000000012cd76>] show_stack+0x14e/0x1e0) [<0000000001f7fb44>] dump_stack+0x1a4/0x1f8 [<00000000007d7afc>] print_address_description+0x64/0x3c8 [<00000000007d75f6>] __kasan_report+0x14e/0x180 [<00000000018a2986>] sch_is_pseudo_sch+0x8e/0x98 [<000000000189b950>] cio_enable_subchannel+0x1d0/0x510 [<00000000018cac7c>] ccw_device_recognition+0x12c/0x188 [<0000000002ceb1a8>] ccw_device_enable_console+0x138/0x340 [<0000000002cf1cbe>] con3215_init+0x25e/0x300 [<0000000002c8770a>] console_init+0x68a/0x9b8 [<0000000002c6a3d6>] start_kernel+0x4fe/0x728 [<0000000000100070>] startup_continue+0x70/0xd0 Cc: stable@vger.kernel.org Reviewed-by: Sebastian Ott Signed-off-by: Vasily Gorbik --- drivers/s390/cio/css.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c index 22c55816100b..1fbfb0a93f5f 100644 --- a/drivers/s390/cio/css.c +++ b/drivers/s390/cio/css.c @@ -1388,6 +1388,8 @@ device_initcall(cio_settle_init); int sch_is_pseudo_sch(struct subchannel *sch) { + if (!sch->dev.parent) + return 0; return sch == to_css(sch->dev.parent)->pseudo_subchannel; } From f41f900568d9ffd896cc941db7021eb14bd55910 Mon Sep 17 00:00:00 2001 From: Jussi Laako Date: Tue, 24 Sep 2019 10:11:43 +0300 Subject: [PATCH 493/721] ALSA: usb-audio: Add DSD support for EVGA NU Audio EVGA NU Audio is actually a USB audio device on a PCIexpress card, with it's own USB controller. It supports both PCM and DSD. Signed-off-by: Jussi Laako Cc: Link: https://lore.kernel.org/r/20190924071143.30911-1-jussi@sonarnerd.net Signed-off-by: Takashi Iwai --- sound/usb/quirks.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index 5fd4ccce452d..fbfde996fee7 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -1658,6 +1658,7 @@ u64 snd_usb_interface_dsd_format_quirks(struct snd_usb_audio *chip, case 0x25ce: /* Mytek devices */ case 0x278b: /* Rotel? */ case 0x2ab6: /* T+A devices */ + case 0x3842: /* EVGA */ case 0xc502: /* HiBy devices */ if (fp->dsd_raw) return SNDRV_PCM_FMTBIT_DSD_U32_BE; From d5880c7a8620290a6c90ced7a0e8bd0ad9419601 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Fri, 13 Sep 2019 18:17:11 +0300 Subject: [PATCH 494/721] fuse: fix missing unlock_page in fuse_writepage() unlock_page() was missing in case of an already in-flight write against the same page. Signed-off-by: Vasily Averin Fixes: ff17be086477 ("fuse: writepage: skip already in flight") Cc: # v3.13 Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index a2ea347c4d2c..8c7578b95d2c 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1861,6 +1861,7 @@ static int fuse_writepage(struct page *page, struct writeback_control *wbc) WARN_ON(wbc->sync_mode == WB_SYNC_ALL); redirty_page_for_writepage(wbc, page); + unlock_page(page); return 0; } From 30c6a23d34cbe19162240e9f9c2c122ba807e58c Mon Sep 17 00:00:00 2001 From: Khazhismel Kumykov Date: Mon, 16 Sep 2019 16:56:41 -0700 Subject: [PATCH 495/721] fuse: on 64-bit store time in d_fsdata directly Implements the optimization noted in commit f75fdf22b0a8 ("fuse: don't use ->d_time"), as the additional memory can be significant. (In particular, on SLAB configurations this 8-byte alloc becomes 32 bytes). Per-dentry, this can consume significant memory. Reviewed-by: Shakeel Butt Signed-off-by: Khazhismel Kumykov Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index ba0a175d7578..58557d4817e9 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -24,11 +24,34 @@ static void fuse_advise_use_readdirplus(struct inode *dir) set_bit(FUSE_I_ADVISE_RDPLUS, &fi->state); } +#if BITS_PER_LONG >= 64 +static inline void __fuse_dentry_settime(struct dentry *entry, u64 time) +{ + entry->d_fsdata = (void *) time; +} + +static inline u64 fuse_dentry_time(const struct dentry *entry) +{ + return (u64)entry->d_fsdata; +} + +#else union fuse_dentry { u64 time; struct rcu_head rcu; }; +static inline void __fuse_dentry_settime(struct dentry *dentry, u64 time) +{ + ((union fuse_dentry *) dentry->d_fsdata)->time = time; +} + +static inline u64 fuse_dentry_time(const struct dentry *entry) +{ + return ((union fuse_dentry *) entry->d_fsdata)->time; +} +#endif + static void fuse_dentry_settime(struct dentry *dentry, u64 time) { struct fuse_conn *fc = get_fuse_conn_super(dentry->d_sb); @@ -47,12 +70,7 @@ static void fuse_dentry_settime(struct dentry *dentry, u64 time) spin_unlock(&dentry->d_lock); } - ((union fuse_dentry *) dentry->d_fsdata)->time = time; -} - -static inline u64 fuse_dentry_time(const struct dentry *entry) -{ - return ((union fuse_dentry *) entry->d_fsdata)->time; + __fuse_dentry_settime(dentry, time); } /* @@ -258,6 +276,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) goto out; } +#if BITS_PER_LONG < 64 static int fuse_dentry_init(struct dentry *dentry) { dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), GFP_KERNEL); @@ -270,6 +289,7 @@ static void fuse_dentry_release(struct dentry *dentry) kfree_rcu(fd, rcu); } +#endif static int fuse_dentry_delete(const struct dentry *dentry) { @@ -279,13 +299,17 @@ static int fuse_dentry_delete(const struct dentry *dentry) const struct dentry_operations fuse_dentry_operations = { .d_revalidate = fuse_dentry_revalidate, .d_delete = fuse_dentry_delete, +#if BITS_PER_LONG < 64 .d_init = fuse_dentry_init, .d_release = fuse_dentry_release, +#endif }; const struct dentry_operations fuse_root_dentry_operations = { +#if BITS_PER_LONG < 64 .d_init = fuse_dentry_init, .d_release = fuse_dentry_release, +#endif }; int fuse_valid_type(int m) From dc69e98c241e1456e37d73b862f7b8b8900ba50f Mon Sep 17 00:00:00 2001 From: Khazhismel Kumykov Date: Tue, 17 Sep 2019 12:35:33 -0700 Subject: [PATCH 496/721] fuse: kmemcg account fs data account per-file, dentry, and inode data blockdev/superblock and temporary per-request data was left alone, as this usually isn't accounted Reviewed-by: Shakeel Butt Signed-off-by: Khazhismel Kumykov Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 3 ++- fs/fuse/file.c | 5 +++-- fs/fuse/inode.c | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 58557d4817e9..d572c900bb0f 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -279,7 +279,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) #if BITS_PER_LONG < 64 static int fuse_dentry_init(struct dentry *dentry) { - dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), GFP_KERNEL); + dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), + GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE); return dentry->d_fsdata ? 0 : -ENOMEM; } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 8c7578b95d2c..0f0225686aee 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -63,12 +63,13 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) { struct fuse_file *ff; - ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL); + ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL_ACCOUNT); if (unlikely(!ff)) return NULL; ff->fc = fc; - ff->release_args = kzalloc(sizeof(*ff->release_args), GFP_KERNEL); + ff->release_args = kzalloc(sizeof(*ff->release_args), + GFP_KERNEL_ACCOUNT); if (!ff->release_args) { kfree(ff); return NULL; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 10d193b24fb8..51cb471f4dc3 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -66,7 +66,7 @@ static struct file_system_type fuseblk_fs_type; struct fuse_forget_link *fuse_alloc_forget(void) { - return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL); + return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL_ACCOUNT); } static struct inode *fuse_alloc_inode(struct super_block *sb) From 0ed4059302a72ad0d70c874b3f4fd5489da711c6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 18 Sep 2019 21:58:16 +0200 Subject: [PATCH 497/721] fuse: unexport fuse_put_request This function has been made static, which now causes a compile-time warning: WARNING: "fuse_put_request" [vmlinux] is a static EXPORT_SYMBOL_GPL Remove the unneeded export. Fixes: 66abc3599c3c ("fuse: unexport request ops") Signed-off-by: Arnd Bergmann Reviewed-by: Stefan Hajnoczi Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index b2e18861c59c..04fe6788e405 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -175,7 +175,6 @@ static void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) fuse_request_free(req); } } -EXPORT_SYMBOL_GPL(fuse_put_request); unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args) { From e5854b1cdf6cb48a20e01e3bdad0476a4c60a077 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 22 Sep 2019 06:19:36 -0700 Subject: [PATCH 498/721] fuse: fix beyond-end-of-page access in fuse_parse_cache() With DEBUG_PAGEALLOC on, the following triggers. BUG: unable to handle page fault for address: ffff88859367c000 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 3001067 P4D 3001067 PUD 406d3a8067 PMD 406d30c067 PTE 800ffffa6c983060 Oops: 0000 [#1] SMP DEBUG_PAGEALLOC CPU: 38 PID: 3110657 Comm: python2.7 RIP: 0010:fuse_readdir+0x88f/0xe7a [fuse] Code: 49 8b 4d 08 49 39 4e 60 0f 84 44 04 00 00 48 8b 43 08 43 8d 1c 3c 4d 01 7e 68 49 89 dc 48 03 5c 24 38 49 89 46 60 8b 44 24 30 <8b> 4b 10 44 29 e0 48 89 ca 48 83 c1 1f 48 83 e1 f8 83 f8 17 49 89 RSP: 0018:ffffc90035edbde0 EFLAGS: 00010286 RAX: 0000000000001000 RBX: ffff88859367bff0 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffff88859367bfed RDI: 0000000000920907 RBP: ffffc90035edbe90 R08: 000000000000014b R09: 0000000000000004 R10: ffff88859367b000 R11: 0000000000000000 R12: 0000000000000ff0 R13: ffffc90035edbee0 R14: ffff889fb8546180 R15: 0000000000000020 FS: 00007f80b5f4a740(0000) GS:ffff889fffa00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffff88859367c000 CR3: 0000001c170c2001 CR4: 00000000003606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: iterate_dir+0x122/0x180 __x64_sys_getdents+0xa6/0x140 do_syscall_64+0x42/0x100 entry_SYSCALL_64_after_hwframe+0x44/0xa9 It's in fuse_parse_cache(). %rbx (ffff88859367bff0) is fuse_dirent pointer - addr + offset. FUSE_DIRENT_SIZE() is trying to dereference namelen off of it but that derefs into the next page which is disabled by pagealloc debug causing a PF. This is caused by dirent->namelen being accessed before ensuring that there's enough bytes in the page for the dirent. Fix it by pushing down reclen calculation. Signed-off-by: Tejun Heo Fixes: 5d7bc7e8680c ("fuse: allow using readdir cache") Cc: stable@vger.kernel.org # v4.20+ Signed-off-by: Miklos Szeredi --- fs/fuse/readdir.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index fae025db06f4..5c38b9d84c6e 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -386,11 +386,13 @@ static enum fuse_parse_result fuse_parse_cache(struct fuse_file *ff, for (;;) { struct fuse_dirent *dirent = addr + offset; unsigned int nbytes = size - offset; - size_t reclen = FUSE_DIRENT_SIZE(dirent); + size_t reclen; if (nbytes < FUSE_NAME_OFFSET || !dirent->namelen) break; + reclen = FUSE_DIRENT_SIZE(dirent); /* derefs ->namelen */ + if (WARN_ON(dirent->namelen > FUSE_NAME_MAX)) return FOUND_ERR; if (WARN_ON(reclen > nbytes)) From 9ad09b1976c562061636ff1e01bfc3a57aebe56b Mon Sep 17 00:00:00 2001 From: zhengbin Date: Wed, 14 Aug 2019 15:59:09 +0800 Subject: [PATCH 499/721] fuse: fix memleak in cuse_channel_open If cuse_send_init fails, need to fuse_conn_put cc->fc. cuse_channel_open->fuse_conn_init->refcount_set(&fc->count, 1) ->fuse_dev_alloc->fuse_conn_get ->fuse_dev_free->fuse_conn_put Fixes: cc080e9e9be1 ("fuse: introduce per-instance fuse_dev structure") Reported-by: Hulk Robot Signed-off-by: zhengbin Signed-off-by: Miklos Szeredi --- fs/fuse/cuse.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 45762bb7a934..00015d851382 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -521,6 +521,7 @@ static int cuse_channel_open(struct inode *inode, struct file *file) rc = cuse_send_init(cc); if (rc) { fuse_dev_free(fud); + fuse_conn_put(&cc->fc); return rc; } file->private_data = fud; From 5addcd5dbd8c2d2bcf719a2eb41a9b43bf9a7935 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Mon, 23 Sep 2019 13:52:31 +0800 Subject: [PATCH 500/721] fuse: Make fuse_args_to_req static Fix sparse warning: fs/fuse/dev.c:468:6: warning: symbol 'fuse_args_to_req' was not declared. Should it be static? Reported-by: Hulk Robot Signed-off-by: YueHaibing Fixes: 68583165f962 ("fuse: add pages to fuse_args") Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 04fe6788e405..dadd617d826c 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -464,7 +464,7 @@ static void fuse_force_creds(struct fuse_conn *fc, struct fuse_req *req) req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns); } -void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args) +static void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args) { req->in.h.opcode = args->opcode; req->in.h.nodeid = args->nodeid; From 6f4ff81a4602dcfba436c6e2307d61ce9e9f652c Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 23 Sep 2019 16:53:37 -0700 Subject: [PATCH 501/721] xfs: log proper length of superblock xfs_trans_log_buf takes first byte, last byte as args. In this case, it should be from 0 to sizeof() - 1. Signed-off-by: Eric Sandeen Reviewed-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_sb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index a08dd8f40346..ac6cdca63e15 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -928,7 +928,7 @@ xfs_log_sb( xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb); xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF); - xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb)); + xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb) - 1); } /* From 88d32d3983e72f2a7de72a49b701e2529c48e9c1 Mon Sep 17 00:00:00 2001 From: Austin Kim Date: Tue, 24 Sep 2019 08:00:50 -0700 Subject: [PATCH 502/721] xfs: avoid unused to_mp() function warning to_mp() was first introduced with the following commit: 'commit 801cc4e17a34c ("xfs: debug mode forced buffered write failure")' But the user of to_mp() was removed by below commit: 'commit f8c47250ba46e ("xfs: convert drop_writes to use the errortag mechanism")' So kernel build with clang throws below warning message: fs/xfs/xfs_sysfs.c:72:1: warning: unused function 'to_mp' [-Wunused-function] to_mp(struct kobject *kobject) Hence to_mp() might be removed safely to get rid of warning message. Signed-off-by: Austin Kim Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_sysfs.c | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index ddd0bf7a4740..f1bc88f4367c 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -63,19 +63,6 @@ static const struct sysfs_ops xfs_sysfs_ops = { .store = xfs_sysfs_object_store, }; -/* - * xfs_mount kobject. The mp kobject also serves as the per-mount parent object - * that is identified by the fsname under sysfs. - */ - -static inline struct xfs_mount * -to_mp(struct kobject *kobject) -{ - struct xfs_kobj *kobj = to_kobj(kobject); - - return container_of(kobj, struct xfs_mount, m_kobj); -} - static struct attribute *xfs_mp_attrs[] = { NULL, }; From 722e6f500ac72d0d43955710738a24fd957607a4 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 24 Sep 2019 11:45:34 -0700 Subject: [PATCH 503/721] ia64: Fix some warnings introduced in merge window Fix arch/ia64/kernel/irq_ia64.c:586:1: warning: no return statement in function returning non-void [-Wreturn-type] arch/ia64/mm/contig.c:111:6: warning: unused variable 'rc' [-Wunused-variable] arch/ia64/mm/discontig.c:189:39: warning: unused variable 'rc' [-Wunused-variable] Signed-off-by: Tony Luck Signed-off-by: Linus Torvalds --- arch/ia64/kernel/irq_ia64.c | 1 + arch/ia64/mm/contig.c | 1 - arch/ia64/mm/discontig.c | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c index f10208478131..8e91c86e8072 100644 --- a/arch/ia64/kernel/irq_ia64.c +++ b/arch/ia64/kernel/irq_ia64.c @@ -583,6 +583,7 @@ void ia64_process_pending_intr(void) static irqreturn_t dummy_handler (int irq, void *dev_id) { BUG(); + return IRQ_NONE; } static struct irqaction ipi_irqaction = { diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c index db09a693f094..5b00dc3898e1 100644 --- a/arch/ia64/mm/contig.c +++ b/arch/ia64/mm/contig.c @@ -108,7 +108,6 @@ setup_per_cpu_areas(void) struct pcpu_group_info *gi; unsigned int cpu; ssize_t static_size, reserved_size, dyn_size; - int rc; ai = pcpu_alloc_alloc_info(1, num_possible_cpus()); if (!ai) diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index 219fc640414b..4f33f6e7e206 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c @@ -186,7 +186,7 @@ void __init setup_per_cpu_areas(void) unsigned long base_offset; unsigned int cpu; ssize_t static_size, reserved_size, dyn_size; - int node, prev_node, unit, nr_units, rc; + int node, prev_node, unit, nr_units; ai = pcpu_alloc_alloc_info(MAX_NUMNODES, nr_cpu_ids); if (!ai) From c128e575514ce93dced349417d136304a33b6f99 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 22 Sep 2019 15:07:49 -0400 Subject: [PATCH 504/721] NFS: Optimise the default readahead size In the years since the max readahead size was fixed in NFS, a number of things have happened: - Users can now set the value directly using /sys/class/bdi - NFS max supported block sizes have increased by several orders of magnitude from 64K to 1MB. - Disk access latencies are orders of magnitude faster due to SSD + NVME. In particular note that if the server is advertising 1MB as the optimal read size, as that will set the readahead size to 15MB. Let's therefore adjust down, and try to default to VM_READAHEAD_PAGES. However let's inform the VM about our preferred block size so that it can choose to round up in cases where that makes sense. Reported-by: Alkis Georgopoulos Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/internal.h | 8 -------- fs/nfs/super.c | 9 ++++++++- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index a2346a2f8361..4b946e6a052f 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -16,14 +16,6 @@ extern const struct export_operations nfs_export_ops; struct nfs_string; -/* Maximum number of readahead requests - * FIXME: this should really be a sysctl so that users may tune it to suit - * their needs. People that do NFS over a slow network, might for - * instance want to reduce it to something closer to 1 for improved - * interactive response. - */ -#define NFS_MAX_READAHEAD (RPC_DEF_SLOT_TABLE - 1) - static inline void nfs_attr_check_mountpoint(struct super_block *parent, struct nfs_fattr *fattr) { if (!nfs_fsid_equal(&NFS_SB(parent)->fsid, &fattr->fsid)) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 703f595dce90..c96194e28692 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2627,6 +2627,13 @@ int nfs_clone_sb_security(struct super_block *s, struct dentry *mntroot, } EXPORT_SYMBOL_GPL(nfs_clone_sb_security); +static void nfs_set_readahead(struct backing_dev_info *bdi, + unsigned long iomax_pages) +{ + bdi->ra_pages = VM_READAHEAD_PAGES; + bdi->io_pages = iomax_pages; +} + struct dentry *nfs_fs_mount_common(struct nfs_server *server, int flags, const char *dev_name, struct nfs_mount_info *mount_info, @@ -2669,7 +2676,7 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server, mntroot = ERR_PTR(error); goto error_splat_super; } - s->s_bdi->ra_pages = server->rpages * NFS_MAX_READAHEAD; + nfs_set_readahead(s->s_bdi, server->rpages); server->super = s; } From a8fd0feeca35cb8f9ddd950191f4aeb777f52f89 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Tue, 10 Sep 2019 17:14:30 -0400 Subject: [PATCH 505/721] pNFS/filelayout: enable LAYOUTGET on OPEN Add the flag to the filelayout driver to add LAYOUTGET to the OPEN compound. Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- fs/nfs/filelayout/filelayout.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 3cb073c50fa6..c9b605f6c9cb 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -1164,6 +1164,7 @@ static struct pnfs_layoutdriver_type filelayout_type = { .id = LAYOUT_NFSV4_1_FILES, .name = "LAYOUT_NFSV4_1_FILES", .owner = THIS_MODULE, + .flags = PNFS_LAYOUTGET_ON_OPEN, .max_layoutget_response = 4096, /* 1 page or so... */ .alloc_layout_hdr = filelayout_alloc_layout_hdr, .free_layout_hdr = filelayout_free_layout_hdr, From 07bfa4415ab607e459b69bd86aa7e7602ce10b4f Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Mon, 23 Sep 2019 15:32:53 -0700 Subject: [PATCH 506/721] fat: work around race with userspace's read via blockdev while mounting If userspace reads the buffer via blockdev while mounting, sb_getblk()+modify can race with buffer read via blockdev. For example, FS userspace bh = sb_getblk() modify bh->b_data read ll_rw_block(bh) fill bh->b_data by on-disk data /* lost modified data by FS */ set_buffer_uptodate(bh) set_buffer_uptodate(bh) Userspace should not use the blockdev while mounting though, the udev seems to be already doing this. Although I think the udev should try to avoid this, workaround the race by small overhead. Link: http://lkml.kernel.org/r/87pnk7l3sw.fsf_-_@mail.parknet.co.jp Signed-off-by: OGAWA Hirofumi Reported-by: Jan Stancek Tested-by: Jan Stancek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/dir.c | 13 +++++++++++-- fs/fat/fatent.c | 3 +++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 1bda2ab6745b..814ad2c2ba80 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -1100,8 +1100,11 @@ static int fat_zeroed_cluster(struct inode *dir, sector_t blknr, int nr_used, err = -ENOMEM; goto error; } + /* Avoid race with userspace read via bdev */ + lock_buffer(bhs[n]); memset(bhs[n]->b_data, 0, sb->s_blocksize); set_buffer_uptodate(bhs[n]); + unlock_buffer(bhs[n]); mark_buffer_dirty_inode(bhs[n], dir); n++; @@ -1158,6 +1161,8 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec64 *ts) fat_time_unix2fat(sbi, ts, &time, &date, &time_cs); de = (struct msdos_dir_entry *)bhs[0]->b_data; + /* Avoid race with userspace read via bdev */ + lock_buffer(bhs[0]); /* filling the new directory slots ("." and ".." entries) */ memcpy(de[0].name, MSDOS_DOT, MSDOS_NAME); memcpy(de[1].name, MSDOS_DOTDOT, MSDOS_NAME); @@ -1180,6 +1185,7 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec64 *ts) de[0].size = de[1].size = 0; memset(de + 2, 0, sb->s_blocksize - 2 * sizeof(*de)); set_buffer_uptodate(bhs[0]); + unlock_buffer(bhs[0]); mark_buffer_dirty_inode(bhs[0], dir); err = fat_zeroed_cluster(dir, blknr, 1, bhs, MAX_BUF_PER_PAGE); @@ -1237,11 +1243,14 @@ static int fat_add_new_entries(struct inode *dir, void *slots, int nr_slots, /* fill the directory entry */ copy = min(size, sb->s_blocksize); + /* Avoid race with userspace read via bdev */ + lock_buffer(bhs[n]); memcpy(bhs[n]->b_data, slots, copy); + set_buffer_uptodate(bhs[n]); + unlock_buffer(bhs[n]); + mark_buffer_dirty_inode(bhs[n], dir); slots += copy; size -= copy; - set_buffer_uptodate(bhs[n]); - mark_buffer_dirty_inode(bhs[n], dir); if (!size) break; n++; diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index 265983635f2b..3647c65a0f48 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -388,8 +388,11 @@ static int fat_mirror_bhs(struct super_block *sb, struct buffer_head **bhs, err = -ENOMEM; goto error; } + /* Avoid race with userspace read via bdev */ + lock_buffer(c_bh); memcpy(c_bh->b_data, bhs[n]->b_data, sb->s_blocksize); set_buffer_uptodate(c_bh); + unlock_buffer(c_bh); mark_buffer_dirty_inode(c_bh, sbi->fat_inode); if (sb->s_flags & SB_SYNCHRONOUS) err = sync_dirty_buffer(c_bh); From 6e73fd25e2c7510b7dfadbaf701f31d4bff9c75b Mon Sep 17 00:00:00 2001 From: Vitaly Wool Date: Mon, 23 Sep 2019 15:32:56 -0700 Subject: [PATCH 507/721] Revert "mm/z3fold.c: fix race between migration and destruction" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With the original commit applied, z3fold_zpool_destroy() may get blocked on wait_event() for indefinite time. Revert this commit for the time being to get rid of this problem since the issue the original commit addresses is less severe. Link: http://lkml.kernel.org/r/20190910123142.7a9c8d2de4d0acbc0977c602@gmail.com Fixes: d776aaa9895eb6eb77 ("mm/z3fold.c: fix race between migration and destruction") Reported-by: Agustín Dall'Alba Signed-off-by: Vitaly Wool Cc: Vlastimil Babka Cc: Vitaly Wool Cc: Shakeel Butt Cc: Jonathan Adams Cc: Henry Burns Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/z3fold.c | 90 ----------------------------------------------------- 1 file changed, 90 deletions(-) diff --git a/mm/z3fold.c b/mm/z3fold.c index 75b7962439ff..ed19d98c9dcd 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -41,7 +41,6 @@ #include #include #include -#include #include #include @@ -146,8 +145,6 @@ struct z3fold_header { * @release_wq: workqueue for safe page release * @work: work_struct for safe page release * @inode: inode for z3fold pseudo filesystem - * @destroying: bool to stop migration once we start destruction - * @isolated: int to count the number of pages currently in isolation * * This structure is allocated at pool creation time and maintains metadata * pertaining to a particular z3fold pool. @@ -166,11 +163,8 @@ struct z3fold_pool { const struct zpool_ops *zpool_ops; struct workqueue_struct *compact_wq; struct workqueue_struct *release_wq; - struct wait_queue_head isolate_wait; struct work_struct work; struct inode *inode; - bool destroying; - int isolated; }; /* @@ -775,7 +769,6 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp, goto out_c; spin_lock_init(&pool->lock); spin_lock_init(&pool->stale_lock); - init_waitqueue_head(&pool->isolate_wait); pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2); if (!pool->unbuddied) goto out_pool; @@ -815,15 +808,6 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp, return NULL; } -static bool pool_isolated_are_drained(struct z3fold_pool *pool) -{ - bool ret; - - spin_lock(&pool->lock); - ret = pool->isolated == 0; - spin_unlock(&pool->lock); - return ret; -} /** * z3fold_destroy_pool() - destroys an existing z3fold pool * @pool: the z3fold pool to be destroyed @@ -833,22 +817,6 @@ static bool pool_isolated_are_drained(struct z3fold_pool *pool) static void z3fold_destroy_pool(struct z3fold_pool *pool) { kmem_cache_destroy(pool->c_handle); - /* - * We set pool-> destroying under lock to ensure that - * z3fold_page_isolate() sees any changes to destroying. This way we - * avoid the need for any memory barriers. - */ - - spin_lock(&pool->lock); - pool->destroying = true; - spin_unlock(&pool->lock); - - /* - * We need to ensure that no pages are being migrated while we destroy - * these workqueues, as migration can queue work on either of the - * workqueues. - */ - wait_event(pool->isolate_wait, !pool_isolated_are_drained(pool)); /* * We need to destroy pool->compact_wq before pool->release_wq, @@ -1339,28 +1307,6 @@ static u64 z3fold_get_pool_size(struct z3fold_pool *pool) return atomic64_read(&pool->pages_nr); } -/* - * z3fold_dec_isolated() expects to be called while pool->lock is held. - */ -static void z3fold_dec_isolated(struct z3fold_pool *pool) -{ - assert_spin_locked(&pool->lock); - VM_BUG_ON(pool->isolated <= 0); - pool->isolated--; - - /* - * If we have no more isolated pages, we have to see if - * z3fold_destroy_pool() is waiting for a signal. - */ - if (pool->isolated == 0 && waitqueue_active(&pool->isolate_wait)) - wake_up_all(&pool->isolate_wait); -} - -static void z3fold_inc_isolated(struct z3fold_pool *pool) -{ - pool->isolated++; -} - static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode) { struct z3fold_header *zhdr; @@ -1387,34 +1333,6 @@ static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode) spin_lock(&pool->lock); if (!list_empty(&page->lru)) list_del(&page->lru); - /* - * We need to check for destruction while holding pool->lock, as - * otherwise destruction could see 0 isolated pages, and - * proceed. - */ - if (unlikely(pool->destroying)) { - spin_unlock(&pool->lock); - /* - * If this page isn't stale, somebody else holds a - * reference to it. Let't drop our refcount so that they - * can call the release logic. - */ - if (unlikely(kref_put(&zhdr->refcount, - release_z3fold_page_locked))) { - /* - * If we get here we have kref problems, so we - * should freak out. - */ - WARN(1, "Z3fold is experiencing kref problems\n"); - z3fold_page_unlock(zhdr); - return false; - } - z3fold_page_unlock(zhdr); - return false; - } - - - z3fold_inc_isolated(pool); spin_unlock(&pool->lock); z3fold_page_unlock(zhdr); return true; @@ -1483,10 +1401,6 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work); - spin_lock(&pool->lock); - z3fold_dec_isolated(pool); - spin_unlock(&pool->lock); - page_mapcount_reset(page); put_page(page); return 0; @@ -1506,14 +1420,10 @@ static void z3fold_page_putback(struct page *page) INIT_LIST_HEAD(&page->lru); if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) { atomic64_dec(&pool->pages_nr); - spin_lock(&pool->lock); - z3fold_dec_isolated(pool); - spin_unlock(&pool->lock); return; } spin_lock(&pool->lock); list_add(&page->lru, &pool->lru); - z3fold_dec_isolated(pool); spin_unlock(&pool->lock); z3fold_page_unlock(zhdr); } From 710ec38b0f633ab3e2581f07a73442d809e28ab0 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 23 Sep 2019 15:32:59 -0700 Subject: [PATCH 508/721] mm: add dummy can_do_mlock() helper On kernels without CONFIG_MMU, we get a link error for the siw driver: drivers/infiniband/sw/siw/siw_mem.o: In function `siw_umem_get': siw_mem.c:(.text+0x4c8): undefined reference to `can_do_mlock' This is probably not the only driver that needs the function and could otherwise build correctly without CONFIG_MMU, so add a dummy variant that always returns false. Link: http://lkml.kernel.org/r/20190909204201.931830-1-arnd@arndb.de Fixes: 2251334dcac9 ("rdma/siw: application buffer management") Signed-off-by: Arnd Bergmann Suggested-by: Jason Gunthorpe Acked-by: Michal Hocko Cc: Bernard Metzler Cc: "Matthew Wilcox (Oracle)" Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 7cf955feb823..6e79b3df1582 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1405,7 +1405,11 @@ extern void pagefault_out_of_memory(void); extern void show_free_areas(unsigned int flags, nodemask_t *nodemask); +#ifdef CONFIG_MMU extern bool can_do_mlock(void); +#else +static inline bool can_do_mlock(void) { return false; } +#endif extern int user_shm_lock(size_t, struct user_struct *); extern void user_shm_unlock(size_t, struct user_struct *); From 3f9d2b5766aea06042630ac60b7316fd0cebf06f Mon Sep 17 00:00:00 2001 From: Vitaly Wool Date: Mon, 23 Sep 2019 15:33:02 -0700 Subject: [PATCH 509/721] z3fold: fix retry mechanism in page reclaim z3fold_page_reclaim()'s retry mechanism is broken: on a second iteration it will have zhdr from the first one so that zhdr is no longer in line with struct page. That leads to crashes when the system is stressed. Fix that by moving zhdr assignment up. While at it, protect against using already freed handles by using own local slots structure in z3fold_page_reclaim(). Link: http://lkml.kernel.org/r/20190908162919.830388dc7404d1e2c80f4095@gmail.com Signed-off-by: Vitaly Wool Reported-by: Markus Linnala Reported-by: Chris Murphy Reported-by: Agustin Dall'Alba Cc: "Maciej S. Szmigiero" Cc: Shakeel Butt Cc: Henry Burns Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/z3fold.c | 49 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 15 deletions(-) diff --git a/mm/z3fold.c b/mm/z3fold.c index ed19d98c9dcd..d637d9ee005c 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -366,9 +366,10 @@ static inline int __idx(struct z3fold_header *zhdr, enum buddy bud) * Encodes the handle of a particular buddy within a z3fold page * Pool lock should be held as this function accesses first_num */ -static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud) +static unsigned long __encode_handle(struct z3fold_header *zhdr, + struct z3fold_buddy_slots *slots, + enum buddy bud) { - struct z3fold_buddy_slots *slots; unsigned long h = (unsigned long)zhdr; int idx = 0; @@ -385,11 +386,15 @@ static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud) if (bud == LAST) h |= (zhdr->last_chunks << BUDDY_SHIFT); - slots = zhdr->slots; slots->slot[idx] = h; return (unsigned long)&slots->slot[idx]; } +static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud) +{ + return __encode_handle(zhdr, zhdr->slots, bud); +} + /* Returns the z3fold page where a given handle is stored */ static inline struct z3fold_header *handle_to_z3fold_header(unsigned long h) { @@ -624,6 +629,7 @@ static void do_compact_page(struct z3fold_header *zhdr, bool locked) } if (unlikely(PageIsolated(page) || + test_bit(PAGE_CLAIMED, &page->private) || test_bit(PAGE_STALE, &page->private))) { z3fold_page_unlock(zhdr); return; @@ -1100,6 +1106,7 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) struct z3fold_header *zhdr = NULL; struct page *page = NULL; struct list_head *pos; + struct z3fold_buddy_slots slots; unsigned long first_handle = 0, middle_handle = 0, last_handle = 0; spin_lock(&pool->lock); @@ -1118,16 +1125,22 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) /* this bit could have been set by free, in which case * we pass over to the next page in the pool. */ - if (test_and_set_bit(PAGE_CLAIMED, &page->private)) + if (test_and_set_bit(PAGE_CLAIMED, &page->private)) { + page = NULL; continue; + } - if (unlikely(PageIsolated(page))) + if (unlikely(PageIsolated(page))) { + clear_bit(PAGE_CLAIMED, &page->private); + page = NULL; continue; + } + zhdr = page_address(page); if (test_bit(PAGE_HEADLESS, &page->private)) break; - zhdr = page_address(page); if (!z3fold_page_trylock(zhdr)) { + clear_bit(PAGE_CLAIMED, &page->private); zhdr = NULL; continue; /* can't evict at this point */ } @@ -1145,26 +1158,30 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) if (!test_bit(PAGE_HEADLESS, &page->private)) { /* - * We need encode the handles before unlocking, since - * we can race with free that will set - * (first|last)_chunks to 0 + * We need encode the handles before unlocking, and + * use our local slots structure because z3fold_free + * can zero out zhdr->slots and we can't do much + * about that */ first_handle = 0; last_handle = 0; middle_handle = 0; if (zhdr->first_chunks) - first_handle = encode_handle(zhdr, FIRST); + first_handle = __encode_handle(zhdr, &slots, + FIRST); if (zhdr->middle_chunks) - middle_handle = encode_handle(zhdr, MIDDLE); + middle_handle = __encode_handle(zhdr, &slots, + MIDDLE); if (zhdr->last_chunks) - last_handle = encode_handle(zhdr, LAST); + last_handle = __encode_handle(zhdr, &slots, + LAST); /* * it's safe to unlock here because we hold a * reference to this page */ z3fold_page_unlock(zhdr); } else { - first_handle = encode_handle(zhdr, HEADLESS); + first_handle = __encode_handle(zhdr, &slots, HEADLESS); last_handle = middle_handle = 0; } @@ -1194,9 +1211,9 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) spin_lock(&pool->lock); list_add(&page->lru, &pool->lru); spin_unlock(&pool->lock); + clear_bit(PAGE_CLAIMED, &page->private); } else { z3fold_page_lock(zhdr); - clear_bit(PAGE_CLAIMED, &page->private); if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) { atomic64_dec(&pool->pages_nr); @@ -1211,6 +1228,7 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) list_add(&page->lru, &pool->lru); spin_unlock(&pool->lock); z3fold_page_unlock(zhdr); + clear_bit(PAGE_CLAIMED, &page->private); } /* We started off locked to we need to lock the pool back */ @@ -1315,7 +1333,8 @@ static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode) VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(PageIsolated(page), page); - if (test_bit(PAGE_HEADLESS, &page->private)) + if (test_bit(PAGE_HEADLESS, &page->private) || + test_bit(PAGE_CLAIMED, &page->private)) return false; zhdr = page_address(page); From 6279eb3dd7946c69346a3b98473ed13d3a44adb5 Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Mon, 23 Sep 2019 15:33:05 -0700 Subject: [PATCH 510/721] kbuild: clean compressed initramfs image Since 9e3596b0c653 ("kbuild: initramfs cleanup, set target from Kconfig") "make clean" leaves behind compressed initramfs images. Example: $ make defconfig $ sed -i 's|CONFIG_INITRAMFS_SOURCE=""|CONFIG_INITRAMFS_SOURCE="/tmp/ir.cpio"|' .config $ make olddefconfig $ make -s $ make -s clean $ git clean -ndxf | grep initramfs Would remove usr/initramfs_data.cpio.gz clean rules do not have CONFIG_* context so they do not know which compression format was used. Thus they don't know which files to delete. Tell clean to delete all possible compression formats. Once patched usr/initramfs_data.cpio.gz and friends are deleted by "make clean". Link: http://lkml.kernel.org/r/20190722063251.55541-1-gthelen@google.com Fixes: 9e3596b0c653 ("kbuild: initramfs cleanup, set target from Kconfig") Signed-off-by: Greg Thelen Cc: Nicholas Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- usr/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/usr/Makefile b/usr/Makefile index 6a89eb019275..e6f7cb2f81db 100644 --- a/usr/Makefile +++ b/usr/Makefile @@ -11,6 +11,9 @@ datafile_y = initramfs_data.cpio$(suffix_y) datafile_d_y = .$(datafile_y).d AFLAGS_initramfs_data.o += -DINITRAMFS_IMAGE="usr/$(datafile_y)" +# clean rules do not have CONFIG_INITRAMFS_COMPRESSION. So clean up after all +# possible compression formats. +clean-files += initramfs_data.cpio* # Generate builtin.o based on initramfs_data.o obj-$(CONFIG_BLK_DEV_INITRD) := initramfs_data.o From bbd0f32721e28cc7097fa50afa96178896f9e33b Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Mon, 23 Sep 2019 15:33:08 -0700 Subject: [PATCH 511/721] ocfs2: use jbd2_inode dirty range scoping 6ba0e7dc64a5 ("jbd2: introduce jbd2_inode dirty range scoping") allow us scoping each of the inode dirty ranges associated with a given transaction, and ext4 already does this way. Now let's also use the newly introduced jbd2_inode dirty range scoping to prevent us from waiting forever when trying to complete a journal transaction in ocfs2. Link: http://lkml.kernel.org/r/1562977611-8412-1-git-send-email-joseph.qi@linux.alibaba.com Signed-off-by: Joseph Qi Reviewed-by: Ross Zwisler Reviewed-by: Changwei Ge Cc: "Theodore Ts'o" Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 5 ++++- fs/ocfs2/aops.c | 13 ++++++++++--- fs/ocfs2/file.c | 10 +++++++--- fs/ocfs2/journal.h | 11 +++++++---- 4 files changed, 28 insertions(+), 11 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 0c335b51043d..f5d2bd15e0ca 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6792,6 +6792,8 @@ void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, struct page *page, int zero, u64 *phys) { int ret, partial = 0; + loff_t start_byte = ((loff_t)page->index << PAGE_SHIFT) + from; + loff_t length = to - from; ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0); if (ret) @@ -6811,7 +6813,8 @@ void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, if (ret < 0) mlog_errno(ret); else if (ocfs2_should_order_data(inode)) { - ret = ocfs2_jbd2_file_inode(handle, inode); + ret = ocfs2_jbd2_inode_add_write(handle, inode, + start_byte, length); if (ret < 0) mlog_errno(ret); } diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index a4c905d6b575..8de1c9d644f6 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -942,7 +942,8 @@ static void ocfs2_write_failure(struct inode *inode, if (tmppage && page_has_buffers(tmppage)) { if (ocfs2_should_order_data(inode)) - ocfs2_jbd2_file_inode(wc->w_handle, inode); + ocfs2_jbd2_inode_add_write(wc->w_handle, inode, + user_pos, user_len); block_commit_write(tmppage, from, to); } @@ -2023,8 +2024,14 @@ int ocfs2_write_end_nolock(struct address_space *mapping, } if (page_has_buffers(tmppage)) { - if (handle && ocfs2_should_order_data(inode)) - ocfs2_jbd2_file_inode(handle, inode); + if (handle && ocfs2_should_order_data(inode)) { + loff_t start_byte = + ((loff_t)tmppage->index << PAGE_SHIFT) + + from; + loff_t length = to - from; + ocfs2_jbd2_inode_add_write(handle, inode, + start_byte, length); + } block_commit_write(tmppage, from, to); } } diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 4435df3e5adb..efe9988a5be4 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -706,7 +706,9 @@ static int ocfs2_extend_allocation(struct inode *inode, u32 logical_start, * Thus, we need to explicitly order the zeroed pages. */ static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode, - struct buffer_head *di_bh) + struct buffer_head *di_bh, + loff_t start_byte, + loff_t length) { struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); handle_t *handle = NULL; @@ -722,7 +724,7 @@ static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode, goto out; } - ret = ocfs2_jbd2_file_inode(handle, inode); + ret = ocfs2_jbd2_inode_add_write(handle, inode, start_byte, length); if (ret < 0) { mlog_errno(ret); goto out; @@ -761,7 +763,9 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, BUG_ON(abs_to > (((u64)index + 1) << PAGE_SHIFT)); BUG_ON(abs_from & (inode->i_blkbits - 1)); - handle = ocfs2_zero_start_ordered_transaction(inode, di_bh); + handle = ocfs2_zero_start_ordered_transaction(inode, di_bh, + abs_from, + abs_to - abs_from); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index c0fe6ed08ab1..f37473c20a7b 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -232,8 +232,8 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode) * ocfs2_journal_access_*() unless you intend to * manage the checksum by hand. * ocfs2_journal_dirty - Mark a journalled buffer as having dirty data. - * ocfs2_jbd2_file_inode - Mark an inode so that its data goes out before - * the current handle commits. + * ocfs2_jbd2_inode_add_write - Mark an inode with range so that its data goes + * out before the current handle commits. */ /* You must always start_trans with a number of buffs > 0, but it's @@ -603,9 +603,12 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb, return credits; } -static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode) +static inline int ocfs2_jbd2_inode_add_write(handle_t *handle, struct inode *inode, + loff_t start_byte, loff_t length) { - return jbd2_journal_inode_add_write(handle, &OCFS2_I(inode)->ip_jinode); + return jbd2_journal_inode_ranged_write(handle, + &OCFS2_I(inode)->ip_jinode, + start_byte, length); } static inline int ocfs2_begin_ordered_truncate(struct inode *inode, From 963abb9aebcdacf5c709a36da9ac0727a33671eb Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Mon, 23 Sep 2019 15:33:11 -0700 Subject: [PATCH 512/721] jbd2: remove jbd2_journal_inode_add_[write|wait] Since ext4/ocfs2 are using jbd2_inode dirty range scoping APIs now, jbd2_journal_inode_add_[write|wait] are not used any more, remove them. Link: http://lkml.kernel.org/r/1562977611-8412-2-git-send-email-joseph.qi@linux.alibaba.com Signed-off-by: Joseph Qi Reviewed-by: Ross Zwisler Acked-by: Changwei Ge Cc: Gang He Cc: Joel Becker Cc: Joseph Qi Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jbd2/journal.c | 2 -- fs/jbd2/transaction.c | 12 ------------ include/linux/jbd2.h | 2 -- 3 files changed, 16 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 953990eb70a9..1c58859aa592 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -89,8 +89,6 @@ EXPORT_SYMBOL(jbd2_journal_blocks_per_page); EXPORT_SYMBOL(jbd2_journal_invalidatepage); EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); EXPORT_SYMBOL(jbd2_journal_force_commit); -EXPORT_SYMBOL(jbd2_journal_inode_add_write); -EXPORT_SYMBOL(jbd2_journal_inode_add_wait); EXPORT_SYMBOL(jbd2_journal_inode_ranged_write); EXPORT_SYMBOL(jbd2_journal_inode_ranged_wait); EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index afc06daee5bb..bee8498d7792 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -2622,18 +2622,6 @@ static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode, return 0; } -int jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode *jinode) -{ - return jbd2_journal_file_inode(handle, jinode, - JI_WRITE_DATA | JI_WAIT_DATA, 0, LLONG_MAX); -} - -int jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *jinode) -{ - return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA, 0, - LLONG_MAX); -} - int jbd2_journal_inode_ranged_write(handle_t *handle, struct jbd2_inode *jinode, loff_t start_byte, loff_t length) { diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index df03825ad1a1..603fbc4e2f70 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1410,8 +1410,6 @@ extern int jbd2_journal_clear_err (journal_t *); extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *); extern int jbd2_journal_force_commit(journal_t *); extern int jbd2_journal_force_commit_nested(journal_t *); -extern int jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode *inode); -extern int jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *inode); extern int jbd2_journal_inode_ranged_write(handle_t *handle, struct jbd2_inode *inode, loff_t start_byte, loff_t length); From 5e7a3ed9f1a60f17c165e1b73df6d6aebb211266 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 23 Sep 2019 15:33:15 -0700 Subject: [PATCH 513/721] ocfs2: further debugfs cleanups There is no need to check return value of debugfs_create functions, but the last sweep through ocfs missed a number of places where this was happening. There is also no need to save the individual dentries for the debugfs files, as everything is can just be removed at once when the directory is removed. By getting rid of the file dentries for the debugfs entries, a bit of local memory can be saved as well. [colin.king@canonical.com: ensure ret is set to zero before returning] Link: http://lkml.kernel.org/r/20190807121929.28918-1-colin.king@canonical.com Link: http://lkml.kernel.org/r/20190731132119.GA12603@kroah.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Colin Ian King Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Jia Guo Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/blockcheck.c | 26 ++++----- fs/ocfs2/cluster/heartbeat.c | 103 +++++++++-------------------------- fs/ocfs2/dlm/dlmcommon.h | 1 - fs/ocfs2/dlm/dlmdebug.c | 55 ++++--------------- fs/ocfs2/dlm/dlmdebug.h | 16 +----- fs/ocfs2/dlm/dlmdomain.c | 7 +-- fs/ocfs2/dlmglue.c | 20 ++----- fs/ocfs2/ocfs2.h | 3 - fs/ocfs2/super.c | 10 +--- 9 files changed, 61 insertions(+), 180 deletions(-) diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c index 429e6a8359a5..eaf042feaf5e 100644 --- a/fs/ocfs2/blockcheck.c +++ b/fs/ocfs2/blockcheck.c @@ -231,14 +231,6 @@ static int blockcheck_u64_get(void *data, u64 *val) } DEFINE_SIMPLE_ATTRIBUTE(blockcheck_fops, blockcheck_u64_get, NULL, "%llu\n"); -static struct dentry *blockcheck_debugfs_create(const char *name, - struct dentry *parent, - u64 *value) -{ - return debugfs_create_file(name, S_IFREG | S_IRUSR, parent, value, - &blockcheck_fops); -} - static void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats) { if (stats) { @@ -250,16 +242,20 @@ static void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats) static void ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats, struct dentry *parent) { - stats->b_debug_dir = debugfs_create_dir("blockcheck", parent); + struct dentry *dir; - blockcheck_debugfs_create("blocks_checked", stats->b_debug_dir, - &stats->b_check_count); + dir = debugfs_create_dir("blockcheck", parent); + stats->b_debug_dir = dir; - blockcheck_debugfs_create("checksums_failed", stats->b_debug_dir, - &stats->b_failure_count); + debugfs_create_file("blocks_checked", S_IFREG | S_IRUSR, dir, + &stats->b_check_count, &blockcheck_fops); + + debugfs_create_file("checksums_failed", S_IFREG | S_IRUSR, dir, + &stats->b_failure_count, &blockcheck_fops); + + debugfs_create_file("ecc_recoveries", S_IFREG | S_IRUSR, dir, + &stats->b_recover_count, &blockcheck_fops); - blockcheck_debugfs_create("ecc_recoveries", stats->b_debug_dir, - &stats->b_recover_count); } #else static inline void ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats, diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index f1b613327ac8..a368350d4c27 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -225,10 +225,6 @@ struct o2hb_region { unsigned int hr_region_num; struct dentry *hr_debug_dir; - struct dentry *hr_debug_livenodes; - struct dentry *hr_debug_regnum; - struct dentry *hr_debug_elapsed_time; - struct dentry *hr_debug_pinned; struct o2hb_debug_buf *hr_db_livenodes; struct o2hb_debug_buf *hr_db_regnum; struct o2hb_debug_buf *hr_db_elapsed_time; @@ -1394,21 +1390,20 @@ void o2hb_exit(void) kfree(o2hb_db_failedregions); } -static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir, - struct o2hb_debug_buf **db, int db_len, - int type, int size, int len, void *data) +static void o2hb_debug_create(const char *name, struct dentry *dir, + struct o2hb_debug_buf **db, int db_len, int type, + int size, int len, void *data) { *db = kmalloc(db_len, GFP_KERNEL); if (!*db) - return NULL; + return; (*db)->db_type = type; (*db)->db_size = size; (*db)->db_len = len; (*db)->db_data = data; - return debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db, - &o2hb_debug_fops); + debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db, &o2hb_debug_fops); } static void o2hb_debug_init(void) @@ -1525,11 +1520,7 @@ static void o2hb_region_release(struct config_item *item) kfree(reg->hr_slots); - debugfs_remove(reg->hr_debug_livenodes); - debugfs_remove(reg->hr_debug_regnum); - debugfs_remove(reg->hr_debug_elapsed_time); - debugfs_remove(reg->hr_debug_pinned); - debugfs_remove(reg->hr_debug_dir); + debugfs_remove_recursive(reg->hr_debug_dir); kfree(reg->hr_db_livenodes); kfree(reg->hr_db_regnum); kfree(reg->hr_db_elapsed_time); @@ -1988,69 +1979,33 @@ static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group : NULL; } -static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir) +static void o2hb_debug_region_init(struct o2hb_region *reg, + struct dentry *parent) { - int ret = -ENOMEM; + struct dentry *dir; - reg->hr_debug_dir = - debugfs_create_dir(config_item_name(®->hr_item), dir); - if (!reg->hr_debug_dir) { - mlog_errno(ret); - goto bail; - } + dir = debugfs_create_dir(config_item_name(®->hr_item), parent); + reg->hr_debug_dir = dir; - reg->hr_debug_livenodes = - o2hb_debug_create(O2HB_DEBUG_LIVENODES, - reg->hr_debug_dir, - &(reg->hr_db_livenodes), - sizeof(*(reg->hr_db_livenodes)), - O2HB_DB_TYPE_REGION_LIVENODES, - sizeof(reg->hr_live_node_bitmap), - O2NM_MAX_NODES, reg); - if (!reg->hr_debug_livenodes) { - mlog_errno(ret); - goto bail; - } + o2hb_debug_create(O2HB_DEBUG_LIVENODES, dir, &(reg->hr_db_livenodes), + sizeof(*(reg->hr_db_livenodes)), + O2HB_DB_TYPE_REGION_LIVENODES, + sizeof(reg->hr_live_node_bitmap), O2NM_MAX_NODES, + reg); - reg->hr_debug_regnum = - o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER, - reg->hr_debug_dir, - &(reg->hr_db_regnum), - sizeof(*(reg->hr_db_regnum)), - O2HB_DB_TYPE_REGION_NUMBER, - 0, O2NM_MAX_NODES, reg); - if (!reg->hr_debug_regnum) { - mlog_errno(ret); - goto bail; - } + o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER, dir, &(reg->hr_db_regnum), + sizeof(*(reg->hr_db_regnum)), + O2HB_DB_TYPE_REGION_NUMBER, 0, O2NM_MAX_NODES, reg); - reg->hr_debug_elapsed_time = - o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME, - reg->hr_debug_dir, - &(reg->hr_db_elapsed_time), - sizeof(*(reg->hr_db_elapsed_time)), - O2HB_DB_TYPE_REGION_ELAPSED_TIME, - 0, 0, reg); - if (!reg->hr_debug_elapsed_time) { - mlog_errno(ret); - goto bail; - } + o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME, dir, + &(reg->hr_db_elapsed_time), + sizeof(*(reg->hr_db_elapsed_time)), + O2HB_DB_TYPE_REGION_ELAPSED_TIME, 0, 0, reg); - reg->hr_debug_pinned = - o2hb_debug_create(O2HB_DEBUG_REGION_PINNED, - reg->hr_debug_dir, - &(reg->hr_db_pinned), - sizeof(*(reg->hr_db_pinned)), - O2HB_DB_TYPE_REGION_PINNED, - 0, 0, reg); - if (!reg->hr_debug_pinned) { - mlog_errno(ret); - goto bail; - } + o2hb_debug_create(O2HB_DEBUG_REGION_PINNED, dir, &(reg->hr_db_pinned), + sizeof(*(reg->hr_db_pinned)), + O2HB_DB_TYPE_REGION_PINNED, 0, 0, reg); - ret = 0; -bail: - return ret; } static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group, @@ -2106,11 +2061,7 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g if (ret) goto unregister_handler; - ret = o2hb_debug_region_init(reg, o2hb_debug_dir); - if (ret) { - config_item_put(®->hr_item); - goto unregister_handler; - } + o2hb_debug_region_init(reg, o2hb_debug_dir); return ®->hr_item; diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index 69a429b625cc..aaf24548b02a 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -142,7 +142,6 @@ struct dlm_ctxt atomic_t res_tot_count; atomic_t res_cur_count; - struct dlm_debug_ctxt *dlm_debug_ctxt; struct dentry *dlm_debugfs_subroot; /* NOTE: Next three are protected by dlm_domain_lock */ diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index a4b58ba99927..4d0b452012b2 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -853,67 +853,34 @@ static const struct file_operations debug_state_fops = { /* files in subroot */ void dlm_debug_init(struct dlm_ctxt *dlm) { - struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt; - /* for dumping dlm_ctxt */ - dc->debug_state_dentry = debugfs_create_file(DLM_DEBUGFS_DLM_STATE, - S_IFREG|S_IRUSR, - dlm->dlm_debugfs_subroot, - dlm, &debug_state_fops); + debugfs_create_file(DLM_DEBUGFS_DLM_STATE, S_IFREG|S_IRUSR, + dlm->dlm_debugfs_subroot, dlm, &debug_state_fops); /* for dumping lockres */ - dc->debug_lockres_dentry = - debugfs_create_file(DLM_DEBUGFS_LOCKING_STATE, - S_IFREG|S_IRUSR, - dlm->dlm_debugfs_subroot, - dlm, &debug_lockres_fops); + debugfs_create_file(DLM_DEBUGFS_LOCKING_STATE, S_IFREG|S_IRUSR, + dlm->dlm_debugfs_subroot, dlm, &debug_lockres_fops); /* for dumping mles */ - dc->debug_mle_dentry = debugfs_create_file(DLM_DEBUGFS_MLE_STATE, - S_IFREG|S_IRUSR, - dlm->dlm_debugfs_subroot, - dlm, &debug_mle_fops); + debugfs_create_file(DLM_DEBUGFS_MLE_STATE, S_IFREG|S_IRUSR, + dlm->dlm_debugfs_subroot, dlm, &debug_mle_fops); /* for dumping lockres on the purge list */ - dc->debug_purgelist_dentry = - debugfs_create_file(DLM_DEBUGFS_PURGE_LIST, - S_IFREG|S_IRUSR, - dlm->dlm_debugfs_subroot, - dlm, &debug_purgelist_fops); -} - -void dlm_debug_shutdown(struct dlm_ctxt *dlm) -{ - struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt; - - if (dc) { - debugfs_remove(dc->debug_purgelist_dentry); - debugfs_remove(dc->debug_mle_dentry); - debugfs_remove(dc->debug_lockres_dentry); - debugfs_remove(dc->debug_state_dentry); - kfree(dc); - dc = NULL; - } + debugfs_create_file(DLM_DEBUGFS_PURGE_LIST, S_IFREG|S_IRUSR, + dlm->dlm_debugfs_subroot, dlm, + &debug_purgelist_fops); } /* subroot - domain dir */ -int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) +void dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) { - dlm->dlm_debug_ctxt = kzalloc(sizeof(struct dlm_debug_ctxt), - GFP_KERNEL); - if (!dlm->dlm_debug_ctxt) { - mlog_errno(-ENOMEM); - return -ENOMEM; - } - dlm->dlm_debugfs_subroot = debugfs_create_dir(dlm->name, dlm_debugfs_root); - return 0; } void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) { - debugfs_remove(dlm->dlm_debugfs_subroot); + debugfs_remove_recursive(dlm->dlm_debugfs_subroot); } /* debugfs root */ diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h index 7d0c7c9013ce..f8fd8680a4b6 100644 --- a/fs/ocfs2/dlm/dlmdebug.h +++ b/fs/ocfs2/dlm/dlmdebug.h @@ -14,13 +14,6 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle); #ifdef CONFIG_DEBUG_FS -struct dlm_debug_ctxt { - struct dentry *debug_state_dentry; - struct dentry *debug_lockres_dentry; - struct dentry *debug_mle_dentry; - struct dentry *debug_purgelist_dentry; -}; - struct debug_lockres { int dl_len; char *dl_buf; @@ -29,9 +22,8 @@ struct debug_lockres { }; void dlm_debug_init(struct dlm_ctxt *dlm); -void dlm_debug_shutdown(struct dlm_ctxt *dlm); -int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm); +void dlm_create_debugfs_subroot(struct dlm_ctxt *dlm); void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm); void dlm_create_debugfs_root(void); @@ -42,13 +34,9 @@ void dlm_destroy_debugfs_root(void); static inline void dlm_debug_init(struct dlm_ctxt *dlm) { } -static inline void dlm_debug_shutdown(struct dlm_ctxt *dlm) +static inline void dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) { } -static inline int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) -{ - return 0; -} static inline void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) { } diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 7338b5d4647c..ee6f459f9770 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -387,7 +387,6 @@ static void dlm_destroy_dlm_worker(struct dlm_ctxt *dlm) static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm) { dlm_unregister_domain_handlers(dlm); - dlm_debug_shutdown(dlm); dlm_complete_thread(dlm); dlm_complete_recovery_thread(dlm); dlm_destroy_dlm_worker(dlm); @@ -1938,7 +1937,6 @@ static int dlm_join_domain(struct dlm_ctxt *dlm) if (status) { dlm_unregister_domain_handlers(dlm); - dlm_debug_shutdown(dlm); dlm_complete_thread(dlm); dlm_complete_recovery_thread(dlm); dlm_destroy_dlm_worker(dlm); @@ -1992,9 +1990,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, dlm->key = key; dlm->node_num = o2nm_this_node(); - ret = dlm_create_debugfs_subroot(dlm); - if (ret < 0) - goto leave; + dlm_create_debugfs_subroot(dlm); spin_lock_init(&dlm->spinlock); spin_lock_init(&dlm->master_lock); @@ -2056,6 +2052,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, mlog(0, "context init: refcount %u\n", kref_read(&dlm->dlm_refs)); + ret = 0; leave: if (ret < 0 && dlm) { if (dlm->master_hash) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 14207234fa3d..ad594fef2ab0 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -3012,8 +3012,6 @@ struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void) kref_init(&dlm_debug->d_refcnt); INIT_LIST_HEAD(&dlm_debug->d_lockres_tracking); - dlm_debug->d_locking_state = NULL; - dlm_debug->d_locking_filter = NULL; dlm_debug->d_filter_secs = 0; out: return dlm_debug; @@ -3282,27 +3280,19 @@ static void ocfs2_dlm_init_debug(struct ocfs2_super *osb) { struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug; - dlm_debug->d_locking_state = debugfs_create_file("locking_state", - S_IFREG|S_IRUSR, - osb->osb_debug_root, - osb, - &ocfs2_dlm_debug_fops); + debugfs_create_file("locking_state", S_IFREG|S_IRUSR, + osb->osb_debug_root, osb, &ocfs2_dlm_debug_fops); - dlm_debug->d_locking_filter = debugfs_create_u32("locking_filter", - 0600, - osb->osb_debug_root, - &dlm_debug->d_filter_secs); + debugfs_create_u32("locking_filter", 0600, osb->osb_debug_root, + &dlm_debug->d_filter_secs); } static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb) { struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug; - if (dlm_debug) { - debugfs_remove(dlm_debug->d_locking_state); - debugfs_remove(dlm_debug->d_locking_filter); + if (dlm_debug) ocfs2_put_dlm_debug(dlm_debug); - } } int ocfs2_dlm_init(struct ocfs2_super *osb) diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index fddbbd60f434..9150cfa4df7d 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -223,8 +223,6 @@ struct ocfs2_orphan_scan { struct ocfs2_dlm_debug { struct kref d_refcnt; - struct dentry *d_locking_state; - struct dentry *d_locking_filter; u32 d_filter_secs; struct list_head d_lockres_tracking; }; @@ -401,7 +399,6 @@ struct ocfs2_super struct ocfs2_dlm_debug *osb_dlm_debug; struct dentry *osb_debug_root; - struct dentry *osb_ctxt; wait_queue_head_t recovery_event; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 8b2f39506648..c81e86c62380 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1080,10 +1080,8 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) osb->osb_debug_root = debugfs_create_dir(osb->uuid_str, ocfs2_debugfs_root); - osb->osb_ctxt = debugfs_create_file("fs_state", S_IFREG|S_IRUSR, - osb->osb_debug_root, - osb, - &ocfs2_osb_debug_fops); + debugfs_create_file("fs_state", S_IFREG|S_IRUSR, osb->osb_debug_root, + osb, &ocfs2_osb_debug_fops); if (ocfs2_meta_ecc(osb)) ocfs2_blockcheck_stats_debugfs_install( &osb->osb_ecc_stats, @@ -1861,8 +1859,6 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) kset_unregister(osb->osb_dev_kset); - debugfs_remove(osb->osb_ctxt); - /* Orphan scan should be stopped as early as possible */ ocfs2_orphan_scan_stop(osb); @@ -1918,7 +1914,7 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) ocfs2_dlm_shutdown(osb, hangup_needed); ocfs2_blockcheck_stats_debugfs_remove(&osb->osb_ecc_stats); - debugfs_remove(osb->osb_debug_root); + debugfs_remove_recursive(osb->osb_debug_root); if (hangup_needed) ocfs2_cluster_hangup(osb->uuid_str, strlen(osb->uuid_str)); From 3dd21cdbefa97cf3ec5559f9ab214af4d8ae2096 Mon Sep 17 00:00:00 2001 From: Guozhonghua Date: Mon, 23 Sep 2019 15:33:18 -0700 Subject: [PATCH 514/721] ocfs2: remove unused ocfs2_calc_tree_trunc_credits() ocfs2_calc_tree_trunc_credits() is not called anywhere. Link: http://lkml.kernel.org/r/71604351584F6A4EBAE558C676F37CA4014FC2050F@H3CMLB12-EX.srv.huawei-3com.com Signed-off-by: guozhonghua Reviewed-by: Andrew Morton Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/journal.h | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index f37473c20a7b..6d8dd3308aaf 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -575,34 +575,6 @@ static inline int ocfs2_calc_bg_discontig_credits(struct super_block *sb) return ocfs2_extent_recs_per_gd(sb); } -static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb, - unsigned int clusters_to_del, - struct ocfs2_dinode *fe, - struct ocfs2_extent_list *last_el) -{ - /* for dinode + all headers in this pass + update to next leaf */ - u16 next_free = le16_to_cpu(last_el->l_next_free_rec); - u16 tree_depth = le16_to_cpu(fe->id2.i_list.l_tree_depth); - int credits = 1 + tree_depth + 1; - int i; - - i = next_free - 1; - BUG_ON(i < 0); - - /* We may be deleting metadata blocks, so metadata alloc dinode + - one desc. block for each possible delete. */ - if (tree_depth && next_free == 1 && - ocfs2_rec_clusters(last_el, &last_el->l_recs[i]) == clusters_to_del) - credits += 1 + tree_depth; - - /* update to the truncate log. */ - credits += OCFS2_TRUNCATE_LOG_UPDATE; - - credits += ocfs2_quota_trans_credits(sb); - - return credits; -} - static inline int ocfs2_jbd2_inode_add_write(handle_t *handle, struct inode *inode, loff_t start_byte, loff_t length) { From bf5a52647963cd08b2e999f77b16739d6300ebce Mon Sep 17 00:00:00 2001 From: Guozhonghua Date: Mon, 23 Sep 2019 15:33:21 -0700 Subject: [PATCH 515/721] ocfs2: remove unused ocfs2_orphan_scan_exit() declaration ocfs2_orphan_scan_exit() is declared but not implemented. Also perform a minor cleanup in ocfs2_link_credits() Link: http://lkml.kernel.org/r/71604351584F6A4EBAE558C676F37CA4014FC208AC@H3CMLB12-EX.srv.huawei-3com.com Signed-off-by: guozhonghua Reviewed-by: Andrew Morton Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/journal.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 6d8dd3308aaf..3103ba7f97a2 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -144,7 +144,6 @@ static inline void ocfs2_ci_set_new(struct ocfs2_super *osb, void ocfs2_orphan_scan_init(struct ocfs2_super *osb); void ocfs2_orphan_scan_start(struct ocfs2_super *osb); void ocfs2_orphan_scan_stop(struct ocfs2_super *osb); -void ocfs2_orphan_scan_exit(struct ocfs2_super *osb); void ocfs2_complete_recovery(struct work_struct *work); void ocfs2_wait_for_recovery(struct ocfs2_super *osb); @@ -441,7 +440,7 @@ static inline int ocfs2_mknod_credits(struct super_block *sb, int is_dir, * previous dirblock update in the free list */ static inline int ocfs2_link_credits(struct super_block *sb) { - return 2*OCFS2_INODE_UPDATE_CREDITS + 4 + + return 2 * OCFS2_INODE_UPDATE_CREDITS + 4 + ocfs2_quota_trans_credits(sb); } From 225dcadf8ee869d4429373ef04fdce49ac3fc266 Mon Sep 17 00:00:00 2001 From: zhengbin Date: Mon, 23 Sep 2019 15:33:24 -0700 Subject: [PATCH 516/721] fs/ocfs2/namei.c: remove set but not used variables Fixes gcc '-Wunused-but-set-variable' warning: fs/ocfs2/namei.c: In function ocfs2_create_inode_in_orphan: fs/ocfs2/namei.c:2503:23: warning: variable di set but not used [-Wunused-but-set-variable] Link: http://lkml.kernel.org/r/1566522588-63786-2-git-send-email-joseph.qi@linux.alibaba.com Signed-off-by: zhengbin Signed-off-by: Joseph Qi Reported-by: Hulk Robot Reviewed-by: Joseph Qi Reviewed-by: Changwei Ge Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/namei.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 6f8e1c4fdb9c..8ea51cf27b97 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -2486,7 +2486,6 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, struct inode *inode = NULL; struct inode *orphan_dir = NULL; struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); - struct ocfs2_dinode *di = NULL; handle_t *handle = NULL; char orphan_name[OCFS2_ORPHAN_NAMELEN + 1]; struct buffer_head *parent_di_bh = NULL; @@ -2552,7 +2551,6 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, goto leave; } - di = (struct ocfs2_dinode *)new_di_bh->b_data; status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name, &orphan_insert, orphan_dir, false); if (status < 0) { From 236dcc2ae4945e00201dbab9b8f76905849b7515 Mon Sep 17 00:00:00 2001 From: zhengbin Date: Mon, 23 Sep 2019 15:33:27 -0700 Subject: [PATCH 517/721] fs/ocfs2/file.c: remove set but not used variables Fixes gcc '-Wunused-but-set-variable' warning: fs/ocfs2/file.c: In function ocfs2_prepare_inode_for_write: fs/ocfs2/file.c:2143:9: warning: variable end set but not used [-Wunused-but-set-variable] Link: http://lkml.kernel.org/r/1566522588-63786-3-git-send-email-joseph.qi@linux.alibaba.com Signed-off-by: zhengbin Signed-off-by: Joseph Qi Reported-by: Hulk Robot Reviewed-by: Joseph Qi Reviewed-by: Changwei Ge Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/file.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index efe9988a5be4..2e982db3e1ae 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2130,7 +2130,6 @@ static int ocfs2_prepare_inode_for_write(struct file *file, struct dentry *dentry = file->f_path.dentry; struct inode *inode = d_inode(dentry); struct buffer_head *di_bh = NULL; - loff_t end; /* * We start with a read level meta lock and only jump to an ex @@ -2194,8 +2193,6 @@ static int ocfs2_prepare_inode_for_write(struct file *file, } } - end = pos + count; - ret = ocfs2_check_range_for_refcount(inode, pos, count); if (ret == 1) { ocfs2_inode_unlock(inode, meta_level); From 77461ba1d17673da47eeb5cccb928e10cf97a01e Mon Sep 17 00:00:00 2001 From: zhengbin Date: Mon, 23 Sep 2019 15:33:30 -0700 Subject: [PATCH 518/721] fs/ocfs2/dir.c: remove set but not used variables Fixes gcc '-Wunused-but-set-variable' warning: fs/ocfs2/dir.c: In function ocfs2_dx_dir_transfer_leaf: fs/ocfs2/dir.c:3653:42: warning: variable new_list set but not used [-Wunused-but-set-variable] Link: http://lkml.kernel.org/r/1566522588-63786-4-git-send-email-joseph.qi@linux.alibaba.com Signed-off-by: zhengbin Signed-off-by: Joseph Qi Reported-by: Hulk Robot Reviewed-by: Joseph Qi Reviewed-by: Changwei Ge Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dir.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 784426dee56c..bdef72c0f099 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -3636,7 +3636,7 @@ static void ocfs2_dx_dir_transfer_leaf(struct inode *dir, u32 split_hash, int i, j, num_used; u32 major_hash; struct ocfs2_dx_leaf *orig_dx_leaf, *new_dx_leaf; - struct ocfs2_dx_entry_list *orig_list, *new_list, *tmp_list; + struct ocfs2_dx_entry_list *orig_list, *tmp_list; struct ocfs2_dx_entry *dx_entry; tmp_list = &tmp_dx_leaf->dl_list; @@ -3645,7 +3645,6 @@ static void ocfs2_dx_dir_transfer_leaf(struct inode *dir, u32 split_hash, orig_dx_leaf = (struct ocfs2_dx_leaf *) orig_dx_leaves[i]->b_data; orig_list = &orig_dx_leaf->dl_list; new_dx_leaf = (struct ocfs2_dx_leaf *) new_dx_leaves[i]->b_data; - new_list = &new_dx_leaf->dl_list; num_used = le16_to_cpu(orig_list->de_num_used); From a89bd89fae638965ca5a79a3467d79f926260882 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Mon, 23 Sep 2019 15:33:34 -0700 Subject: [PATCH 519/721] ocfs2: delete unnecessary checks before brelse() brelse() tests whether its argument is NULL and then returns immediately. Thus the tests around the shown calls are not needed. This issue was detected by using the Coccinelle software. Link: http://lkml.kernel.org/r/55cde320-394b-f985-56ce-1a2abea782aa@web.de Signed-off-by: Markus Elfring Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlmglue.c | 7 ++----- fs/ocfs2/extent_map.c | 3 +-- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index ad594fef2ab0..6e774c5ea13b 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2508,9 +2508,7 @@ int ocfs2_inode_lock_full_nested(struct inode *inode, ocfs2_inode_unlock(inode, ex); } - if (local_bh) - brelse(local_bh); - + brelse(local_bh); return status; } @@ -2593,8 +2591,7 @@ int ocfs2_inode_lock_atime(struct inode *inode, *level = 1; if (ocfs2_should_update_atime(inode, vfsmnt)) ocfs2_update_inode_atime(inode, bh); - if (bh) - brelse(bh); + brelse(bh); } else *level = 0; diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index e66a249fe07c..e3e2d1b2af51 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -590,8 +590,7 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, *extent_flags = rec->e_flags; } out: - if (eb_bh) - brelse(eb_bh); + brelse(eb_bh); return ret; } From 0a3775e4f883912944481cf2ef36eb6383a9cc74 Mon Sep 17 00:00:00 2001 From: Changwei Ge Date: Mon, 23 Sep 2019 15:33:37 -0700 Subject: [PATCH 520/721] ocfs2: wait for recovering done after direct unlock request There is a scenario causing ocfs2 umount hang when multiple hosts are rebooting at the same time. NODE1 NODE2 NODE3 send unlock requset to NODE2 dies become recovery master recover NODE2 find NODE2 dead mark resource RECOVERING directly remove lock from grant list calculate usage but RECOVERING marked **miss the window of purging clear RECOVERING To reproduce this issue, crash a host and then umount ocfs2 from another node. To solve this, just let unlock progress wait for recovery done. Link: http://lkml.kernel.org/r/1550124866-20367-1-git-send-email-gechangwei@live.cn Signed-off-by: Changwei Ge Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmunlock.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index e78657742bd8..3883633e82eb 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -90,7 +90,8 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, enum dlm_status status; int actions = 0; int in_use; - u8 owner; + u8 owner; + int recovery_wait = 0; mlog(0, "master_node = %d, valblk = %d\n", master_node, flags & LKM_VALBLK); @@ -193,9 +194,12 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, } if (flags & LKM_CANCEL) lock->cancel_pending = 0; - else - lock->unlock_pending = 0; - + else { + if (!lock->unlock_pending) + recovery_wait = 1; + else + lock->unlock_pending = 0; + } } /* get an extra ref on lock. if we are just switching @@ -229,6 +233,17 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, spin_unlock(&res->spinlock); wake_up(&res->wq); + if (recovery_wait) { + spin_lock(&res->spinlock); + /* Unlock request will directly succeed after owner dies, + * and the lock is already removed from grant list. We have to + * wait for RECOVERING done or we miss the chance to purge it + * since the removement is much faster than RECOVERING proc. + */ + __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_RECOVERING); + spin_unlock(&res->spinlock); + } + /* let the caller's final dlm_lock_put handle the actual kfree */ if (actions & DLM_UNLOCK_FREE_LOCK) { /* this should always be coupled with list removal */ From d7283b39dbf3de4fe54870f07d7975effa3188da Mon Sep 17 00:00:00 2001 From: Changwei Ge Date: Mon, 23 Sep 2019 15:33:40 -0700 Subject: [PATCH 521/721] ocfs2: checkpoint appending truncate log transaction before flushing Appending truncate log(TA) and and flushing truncate log(TF) are two separated transactions. They can be both committed but not checkpointed. If crash occurs then, both transaction will be replayed with several already released to global bitmap clusters. Then truncate log will be replayed resulting in cluster double free. To reproduce this issue, just crash the host while punching hole to files. Signed-off-by: Changwei Ge Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index f5d2bd15e0ca..f9baefc76cf9 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5993,6 +5993,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) struct buffer_head *data_alloc_bh = NULL; struct ocfs2_dinode *di; struct ocfs2_truncate_log *tl; + struct ocfs2_journal *journal = osb->journal; BUG_ON(inode_trylock(tl_inode)); @@ -6013,6 +6014,20 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) goto out; } + /* Appending truncate log(TA) and and flushing truncate log(TF) are + * two separated transactions. They can be both committed but not + * checkpointed. If crash occurs then, both two transaction will be + * replayed with several already released to global bitmap clusters. + * Then truncate log will be replayed resulting in cluster double free. + */ + jbd2_journal_lock_updates(journal->j_journal); + status = jbd2_journal_flush(journal->j_journal); + jbd2_journal_unlock_updates(journal->j_journal); + if (status < 0) { + mlog_errno(status); + goto out; + } + data_alloc_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, OCFS2_INVALID_SLOT); From 1c3ce5417b338ba3c697bac6485581ba117d8e52 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 23 Sep 2019 15:33:43 -0700 Subject: [PATCH 522/721] ocfs2: fix spelling mistake "ambigous" -> "ambiguous" There is a spelling mistake in a mlog_bug_on_msg message. Fix it. Link: http://lkml.kernel.org/r/831bdff4-064e-038b-f45d-c4d265cbff1e@linux.alibaba.com Signed-off-by: Colin Ian King Acked-by: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 7ad9d6590818..7c9dfd50c1c1 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -534,7 +534,7 @@ static int ocfs2_read_locked_inode(struct inode *inode, */ mlog_bug_on_msg(!!(fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) != !!(args->fi_flags & OCFS2_FI_FLAG_SYSFILE), - "Inode %llu: system file state is ambigous\n", + "Inode %llu: system file state is ambiguous\n", (unsigned long long)args->fi_blkno); if (S_ISCHR(le16_to_cpu(fe->i_mode)) || From 04f768a39d55967246c002aa66b407b3bfdd8269 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 23 Sep 2019 15:33:46 -0700 Subject: [PATCH 523/721] mm, slab: extend slab/shrink to shrink all memcg caches Currently, a value of '1" is written to /sys/kernel/slab//shrink file to shrink the slab by flushing out all the per-cpu slabs and free slabs in partial lists. This can be useful to squeeze out a bit more memory under extreme condition as well as making the active object counts in /proc/slabinfo more accurate. This usually applies only to the root caches, as the SLUB_MEMCG_SYSFS_ON option is usually not enabled and "slub_memcg_sysfs=1" not set. Even if memcg sysfs is turned on, it is too cumbersome and impractical to manage all those per-memcg sysfs files in a real production system. So there is no practical way to shrink memcg caches. Fix this by enabling a proper write to the shrink sysfs file of the root cache to scan all the available memcg caches and shrink them as well. For a non-root memcg cache (when SLUB_MEMCG_SYSFS_ON or slub_memcg_sysfs is on), only that cache will be shrunk when written. On a 2-socket 64-core 256-thread arm64 system with 64k page after a parallel kernel build, the the amount of memory occupied by slabs before shrinking slabs were: # grep task_struct /proc/slabinfo task_struct 53137 53192 4288 61 4 : tunables 0 0 0 : slabdata 872 872 0 # grep "^S[lRU]" /proc/meminfo Slab: 3936832 kB SReclaimable: 399104 kB SUnreclaim: 3537728 kB After shrinking slabs (by echoing "1" to all shrink files): # grep "^S[lRU]" /proc/meminfo Slab: 1356288 kB SReclaimable: 263296 kB SUnreclaim: 1092992 kB # grep task_struct /proc/slabinfo task_struct 2764 6832 4288 61 4 : tunables 0 0 0 : slabdata 112 112 0 Link: http://lkml.kernel.org/r/20190723151445.7385-1-longman@redhat.com Signed-off-by: Waiman Long Acked-by: Roman Gushchin Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Michal Hocko Cc: Johannes Weiner Cc: Shakeel Butt Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/ABI/testing/sysfs-kernel-slab | 13 +++++--- mm/slab.h | 1 + mm/slab_common.c | 37 +++++++++++++++++++++ mm/slub.c | 2 +- 4 files changed, 48 insertions(+), 5 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-kernel-slab b/Documentation/ABI/testing/sysfs-kernel-slab index 29601d93a1c2..ed35833ad7f0 100644 --- a/Documentation/ABI/testing/sysfs-kernel-slab +++ b/Documentation/ABI/testing/sysfs-kernel-slab @@ -429,10 +429,15 @@ KernelVersion: 2.6.22 Contact: Pekka Enberg , Christoph Lameter Description: - The shrink file is written when memory should be reclaimed from - a cache. Empty partial slabs are freed and the partial list is - sorted so the slabs with the fewest available objects are used - first. + The shrink file is used to reclaim unused slab cache + memory from a cache. Empty per-cpu or partial slabs + are freed and the partial list is sorted so the slabs + with the fewest available objects are used first. + It only accepts a value of "1" on write for shrinking + the cache. Other input values are considered invalid. + Shrinking slab caches might be expensive and can + adversely impact other running applications. So it + should be used with care. What: /sys/kernel/slab/cache/slab_size Date: May 2007 diff --git a/mm/slab.h b/mm/slab.h index 9057b8056b07..5bf615cb3f99 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -174,6 +174,7 @@ int __kmem_cache_shrink(struct kmem_cache *); void __kmemcg_cache_deactivate(struct kmem_cache *s); void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s); void slab_kmem_cache_release(struct kmem_cache *); +void kmem_cache_shrink_all(struct kmem_cache *s); struct seq_file; struct file; diff --git a/mm/slab_common.c b/mm/slab_common.c index 807490fe217a..6491c3a41805 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -981,6 +981,43 @@ int kmem_cache_shrink(struct kmem_cache *cachep) } EXPORT_SYMBOL(kmem_cache_shrink); +/** + * kmem_cache_shrink_all - shrink a cache and all memcg caches for root cache + * @s: The cache pointer + */ +void kmem_cache_shrink_all(struct kmem_cache *s) +{ + struct kmem_cache *c; + + if (!IS_ENABLED(CONFIG_MEMCG_KMEM) || !is_root_cache(s)) { + kmem_cache_shrink(s); + return; + } + + get_online_cpus(); + get_online_mems(); + kasan_cache_shrink(s); + __kmem_cache_shrink(s); + + /* + * We have to take the slab_mutex to protect from the memcg list + * modification. + */ + mutex_lock(&slab_mutex); + for_each_memcg_cache(c, s) { + /* + * Don't need to shrink deactivated memcg caches. + */ + if (s->flags & SLAB_DEACTIVATED) + continue; + kasan_cache_shrink(c); + __kmem_cache_shrink(c); + } + mutex_unlock(&slab_mutex); + put_online_mems(); + put_online_cpus(); +} + bool slab_is_available(void) { return slab_state >= UP; diff --git a/mm/slub.c b/mm/slub.c index 8834563cdb4b..66808d3e97d2 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5298,7 +5298,7 @@ static ssize_t shrink_store(struct kmem_cache *s, const char *buf, size_t length) { if (buf[0] == '1') - kmem_cache_shrink(s); + kmem_cache_shrink_all(s); else return -EINVAL; return length; From 9adeaa226988b97bc15928e12f40a9863134467c Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 23 Sep 2019 15:33:49 -0700 Subject: [PATCH 524/721] mm, slab: move memcg_cache_params structure to mm/slab.h The memcg_cache_params structure is only embedded into the kmem_cache of slab and slub allocators as defined in slab_def.h and slub_def.h and used internally by mm code. There is no needed to expose it in a public header. So move it from include/linux/slab.h to mm/slab.h. It is just a refactoring patch with no code change. In fact both the slub_def.h and slab_def.h should be moved into the mm directory as well, but that will probably cause many merge conflicts. Link: http://lkml.kernel.org/r/20190718180827.18758-1-longman@redhat.com Signed-off-by: Waiman Long Acked-by: David Rientjes Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Cc: Roman Gushchin Cc: Johannes Weiner Cc: Shakeel Butt Cc: Vladimir Davydov Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 62 ------------------------------------------- mm/slab.h | 63 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+), 62 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index 56c9c7eed34e..ab2b98ad76e1 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -595,68 +595,6 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) return __kmalloc_node(size, flags, node); } -struct memcg_cache_array { - struct rcu_head rcu; - struct kmem_cache *entries[0]; -}; - -/* - * This is the main placeholder for memcg-related information in kmem caches. - * Both the root cache and the child caches will have it. For the root cache, - * this will hold a dynamically allocated array large enough to hold - * information about the currently limited memcgs in the system. To allow the - * array to be accessed without taking any locks, on relocation we free the old - * version only after a grace period. - * - * Root and child caches hold different metadata. - * - * @root_cache: Common to root and child caches. NULL for root, pointer to - * the root cache for children. - * - * The following fields are specific to root caches. - * - * @memcg_caches: kmemcg ID indexed table of child caches. This table is - * used to index child cachces during allocation and cleared - * early during shutdown. - * - * @root_caches_node: List node for slab_root_caches list. - * - * @children: List of all child caches. While the child caches are also - * reachable through @memcg_caches, a child cache remains on - * this list until it is actually destroyed. - * - * The following fields are specific to child caches. - * - * @memcg: Pointer to the memcg this cache belongs to. - * - * @children_node: List node for @root_cache->children list. - * - * @kmem_caches_node: List node for @memcg->kmem_caches list. - */ -struct memcg_cache_params { - struct kmem_cache *root_cache; - union { - struct { - struct memcg_cache_array __rcu *memcg_caches; - struct list_head __root_caches_node; - struct list_head children; - bool dying; - }; - struct { - struct mem_cgroup *memcg; - struct list_head children_node; - struct list_head kmem_caches_node; - struct percpu_ref refcnt; - - void (*work_fn)(struct kmem_cache *); - union { - struct rcu_head rcu_head; - struct work_struct work; - }; - }; - }; -}; - int memcg_update_all_caches(int num_memcgs); /** diff --git a/mm/slab.h b/mm/slab.h index 5bf615cb3f99..68e455f2b698 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -30,6 +30,69 @@ struct kmem_cache { struct list_head list; /* List of all slab caches on the system */ }; +#else /* !CONFIG_SLOB */ + +struct memcg_cache_array { + struct rcu_head rcu; + struct kmem_cache *entries[0]; +}; + +/* + * This is the main placeholder for memcg-related information in kmem caches. + * Both the root cache and the child caches will have it. For the root cache, + * this will hold a dynamically allocated array large enough to hold + * information about the currently limited memcgs in the system. To allow the + * array to be accessed without taking any locks, on relocation we free the old + * version only after a grace period. + * + * Root and child caches hold different metadata. + * + * @root_cache: Common to root and child caches. NULL for root, pointer to + * the root cache for children. + * + * The following fields are specific to root caches. + * + * @memcg_caches: kmemcg ID indexed table of child caches. This table is + * used to index child cachces during allocation and cleared + * early during shutdown. + * + * @root_caches_node: List node for slab_root_caches list. + * + * @children: List of all child caches. While the child caches are also + * reachable through @memcg_caches, a child cache remains on + * this list until it is actually destroyed. + * + * The following fields are specific to child caches. + * + * @memcg: Pointer to the memcg this cache belongs to. + * + * @children_node: List node for @root_cache->children list. + * + * @kmem_caches_node: List node for @memcg->kmem_caches list. + */ +struct memcg_cache_params { + struct kmem_cache *root_cache; + union { + struct { + struct memcg_cache_array __rcu *memcg_caches; + struct list_head __root_caches_node; + struct list_head children; + bool dying; + }; + struct { + struct mem_cgroup *memcg; + struct list_head children_node; + struct list_head kmem_caches_node; + struct percpu_ref refcnt; + + void (*work_fn)(struct kmem_cache *); + union { + struct rcu_head rcu_head; + struct work_struct work; + }; + }; + }; +}; #endif /* CONFIG_SLOB */ #ifdef CONFIG_SLAB From 9d5f0be0f7556873cd00125dc579a9d9d186b0d0 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Mon, 23 Sep 2019 15:33:52 -0700 Subject: [PATCH 525/721] mm/slub.c: fix -Wunused-function compiler warnings tid_to_cpu() and tid_to_event() are only used in note_cmpxchg_failure() when SLUB_DEBUG_CMPXCHG=y, so when SLUB_DEBUG_CMPXCHG=n by default, Clang will complain that those unused functions. Link: http://lkml.kernel.org/r/1568752232-5094-1-git-send-email-cai@lca.pw Signed-off-by: Qian Cai Acked-by: David Rientjes Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/slub.c b/mm/slub.c index 66808d3e97d2..17fe1cac11fb 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2004,6 +2004,7 @@ static inline unsigned long next_tid(unsigned long tid) return tid + TID_STEP; } +#ifdef SLUB_DEBUG_CMPXCHG static inline unsigned int tid_to_cpu(unsigned long tid) { return tid % TID_STEP; @@ -2013,6 +2014,7 @@ static inline unsigned long tid_to_event(unsigned long tid) { return tid / TID_STEP; } +#endif static inline unsigned int init_tid(int cpu) { From b751c52bb587ae66f773b15204ef7a147467f4c7 Mon Sep 17 00:00:00 2001 From: Nicolas Boichat Date: Mon, 23 Sep 2019 15:33:55 -0700 Subject: [PATCH 526/721] kmemleak: increase DEBUG_KMEMLEAK_EARLY_LOG_SIZE default to 16K The current default value (400) is too low on many systems (e.g. some ARM64 platform takes up 1000+ entries). syzbot uses 16000 as default value, and has proved to be enough on beefy configurations, so let's pick that value. This consumes more RAM on boot (each entry is 160 bytes, so in total ~2.5MB of RAM), but the memory would later be freed (early_log is __initdata). Link: http://lkml.kernel.org/r/20190730154027.101525-1-drinkcat@chromium.org Signed-off-by: Nicolas Boichat Suggested-by: Dmitry Vyukov Acked-by: Catalin Marinas Acked-by: Dmitry Vyukov Cc: Masahiro Yamada Cc: Kees Cook Cc: Petr Mladek Cc: Thomas Gleixner Cc: Tetsuo Handa Cc: Joe Lawrence Cc: Uladzislau Rezki Cc: Andy Shevchenko Cc: Stephen Rothwell Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index e0e14780a13d..3c88e54da86c 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -580,7 +580,7 @@ config DEBUG_KMEMLEAK_EARLY_LOG_SIZE int "Maximum kmemleak early log entries" depends on DEBUG_KMEMLEAK range 200 40000 - default 400 + default 16000 help Kmemleak must track all the memory allocations to avoid reporting false positives. Since memory may be allocated or From dba82d9431770e68c45b03f0ffa2daa8abfb9429 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 23 Sep 2019 15:33:59 -0700 Subject: [PATCH 527/721] mm: kmemleak: make the tool tolerant to struct scan_area allocation failures Patch series "mm: kmemleak: Use a memory pool for kmemleak object allocations", v3. Following the discussions on v2 of this patch(set) [1], this series takes slightly different approach: - it implements its own simple memory pool that does not rely on the slab allocator - drops the early log buffer logic entirely since it can now allocate metadata from the memory pool directly before kmemleak is fully initialised - CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE option is renamed to CONFIG_DEBUG_KMEMLEAK_MEM_POOL_SIZE - moves the kmemleak_init() call earlier (mm_init()) - to avoid a separate memory pool for struct scan_area, it makes the tool robust when such allocations fail as scan areas are rather an optimisation [1] http://lkml.kernel.org/r/20190727132334.9184-1-catalin.marinas@arm.com This patch (of 3): Object scan areas are an optimisation aimed to decrease the false positives and slightly improve the scanning time of large objects known to only have a few specific pointers. If a struct scan_area fails to allocate, kmemleak can still function normally by scanning the full object. Introduce an OBJECT_FULL_SCAN flag and mark objects as such when scan_area allocation fails. Link: http://lkml.kernel.org/r/20190812160642.52134-2-catalin.marinas@arm.com Signed-off-by: Catalin Marinas Cc: Michal Hocko Cc: Matthew Wilcox Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index f6e602918dac..5ba7fad00fda 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -168,6 +168,8 @@ struct kmemleak_object { #define OBJECT_REPORTED (1 << 1) /* flag set to not scan the object */ #define OBJECT_NO_SCAN (1 << 2) +/* flag set to fully scan the object when scan_area allocation failed */ +#define OBJECT_FULL_SCAN (1 << 3) #define HEX_PREFIX " " /* number of bytes to print per line; must be 16 or 32 */ @@ -773,12 +775,14 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) } area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp)); - if (!area) { - pr_warn("Cannot allocate a scan area\n"); - goto out; - } spin_lock_irqsave(&object->lock, flags); + if (!area) { + pr_warn_once("Cannot allocate a scan area, scanning the full object\n"); + /* mark the object for full scan to avoid false positives */ + object->flags |= OBJECT_FULL_SCAN; + goto out_unlock; + } if (size == SIZE_MAX) { size = object->pointer + object->size - ptr; } else if (ptr + size > object->pointer + object->size) { @@ -795,7 +799,6 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) hlist_add_head(&area->node, &object->area_list); out_unlock: spin_unlock_irqrestore(&object->lock, flags); -out: put_object(object); } @@ -1408,7 +1411,8 @@ static void scan_object(struct kmemleak_object *object) if (!(object->flags & OBJECT_ALLOCATED)) /* already freed object */ goto out; - if (hlist_empty(&object->area_list)) { + if (hlist_empty(&object->area_list) || + object->flags & OBJECT_FULL_SCAN) { void *start = (void *)object->pointer; void *end = (void *)(object->pointer + object->size); void *next; From 0647398a8c7bd55e0b7565c5076e86b7c3c204c5 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 23 Sep 2019 15:34:02 -0700 Subject: [PATCH 528/721] mm: kmemleak: simple memory allocation pool for kmemleak objects Add a memory pool for struct kmemleak_object in case the normal kmem_cache_alloc() fails under the gfp constraints passed by the caller. The mem_pool[] array size is currently fixed at 16000. We are not using the existing mempool kernel API since this requires the slab allocator to be available (for pool->elements allocation). A subsequent kmemleak patch will replace the static early log buffer with the pool allocation introduced here and this functionality is required to be available before the slab was initialised. Link: http://lkml.kernel.org/r/20190812160642.52134-3-catalin.marinas@arm.com Signed-off-by: Catalin Marinas Cc: Matthew Wilcox Cc: Michal Hocko Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 52 insertions(+), 2 deletions(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 5ba7fad00fda..2fb86524d70b 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -180,11 +180,17 @@ struct kmemleak_object { #define HEX_ASCII 1 /* max number of lines to be printed */ #define HEX_MAX_LINES 2 +/* memory pool size */ +#define MEM_POOL_SIZE 16000 /* the list of all allocated objects */ static LIST_HEAD(object_list); /* the list of gray-colored objects (see color_gray comment below) */ static LIST_HEAD(gray_list); +/* memory pool allocation */ +static struct kmemleak_object mem_pool[MEM_POOL_SIZE]; +static int mem_pool_free_count = ARRAY_SIZE(mem_pool); +static LIST_HEAD(mem_pool_free_list); /* search tree for object boundaries */ static struct rb_root object_tree_root = RB_ROOT; /* rw_lock protecting the access to object_list and object_tree_root */ @@ -451,6 +457,50 @@ static int get_object(struct kmemleak_object *object) return atomic_inc_not_zero(&object->use_count); } +/* + * Memory pool allocation and freeing. kmemleak_lock must not be held. + */ +static struct kmemleak_object *mem_pool_alloc(gfp_t gfp) +{ + unsigned long flags; + struct kmemleak_object *object; + + /* try the slab allocator first */ + object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); + if (object) + return object; + + /* slab allocation failed, try the memory pool */ + write_lock_irqsave(&kmemleak_lock, flags); + object = list_first_entry_or_null(&mem_pool_free_list, + typeof(*object), object_list); + if (object) + list_del(&object->object_list); + else if (mem_pool_free_count) + object = &mem_pool[--mem_pool_free_count]; + write_unlock_irqrestore(&kmemleak_lock, flags); + + return object; +} + +/* + * Return the object to either the slab allocator or the memory pool. + */ +static void mem_pool_free(struct kmemleak_object *object) +{ + unsigned long flags; + + if (object < mem_pool || object >= mem_pool + ARRAY_SIZE(mem_pool)) { + kmem_cache_free(object_cache, object); + return; + } + + /* add the object to the memory pool free list */ + write_lock_irqsave(&kmemleak_lock, flags); + list_add(&object->object_list, &mem_pool_free_list); + write_unlock_irqrestore(&kmemleak_lock, flags); +} + /* * RCU callback to free a kmemleak_object. */ @@ -469,7 +519,7 @@ static void free_object_rcu(struct rcu_head *rcu) hlist_del(&area->node); kmem_cache_free(scan_area_cache, area); } - kmem_cache_free(object_cache, object); + mem_pool_free(object); } /* @@ -552,7 +602,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, struct rb_node **link, *rb_parent; unsigned long untagged_ptr; - object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); + object = mem_pool_alloc(gfp); if (!object) { pr_warn("Cannot allocate a kmemleak_object structure\n"); kmemleak_disable(); From c5665868183fec689dbab9fb8505188b2c4f0757 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 23 Sep 2019 15:34:05 -0700 Subject: [PATCH 529/721] mm: kmemleak: use the memory pool for early allocations Currently kmemleak uses a static early_log buffer to trace all memory allocation/freeing before the slab allocator is initialised. Such early log is replayed during kmemleak_init() to properly initialise the kmemleak metadata for objects allocated up that point. With a memory pool that does not rely on the slab allocator, it is possible to skip this early log entirely. In order to remove the early logging, consider kmemleak_enabled == 1 by default while the kmem_cache availability is checked directly on the object_cache and scan_area_cache variables. The RCU callback is only invoked after object_cache has been initialised as we wouldn't have any concurrent list traversal before this. In order to reduce the number of callbacks before kmemleak is fully initialised, move the kmemleak_init() call to mm_init(). [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: remove WARN_ON(), per Catalin] Link: http://lkml.kernel.org/r/20190812160642.52134-4-catalin.marinas@arm.com Signed-off-by: Catalin Marinas Cc: Matthew Wilcox Cc: Michal Hocko Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/main.c | 2 +- lib/Kconfig.debug | 11 +- mm/kmemleak.c | 265 +++++----------------------------------------- 3 files changed, 33 insertions(+), 245 deletions(-) diff --git a/init/main.c b/init/main.c index 653693da8da6..3ca67e8b92fd 100644 --- a/init/main.c +++ b/init/main.c @@ -556,6 +556,7 @@ static void __init mm_init(void) report_meminit(); mem_init(); kmem_cache_init(); + kmemleak_init(); pgtable_init(); debug_objects_mem_init(); vmalloc_init(); @@ -740,7 +741,6 @@ asmlinkage __visible void __init start_kernel(void) initrd_start = 0; } #endif - kmemleak_init(); setup_per_cpu_pageset(); numa_policy_init(); acpi_early_init(); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 3c88e54da86c..c6975cded461 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -576,17 +576,18 @@ config DEBUG_KMEMLEAK In order to access the kmemleak file, debugfs needs to be mounted (usually at /sys/kernel/debug). -config DEBUG_KMEMLEAK_EARLY_LOG_SIZE - int "Maximum kmemleak early log entries" +config DEBUG_KMEMLEAK_MEM_POOL_SIZE + int "Kmemleak memory pool size" depends on DEBUG_KMEMLEAK range 200 40000 default 16000 help Kmemleak must track all the memory allocations to avoid reporting false positives. Since memory may be allocated or - freed before kmemleak is initialised, an early log buffer is - used to store these actions. If kmemleak reports "early log - buffer exceeded", please increase this value. + freed before kmemleak is fully initialised, use a static pool + of metadata objects to track such callbacks. After kmemleak is + fully initialised, this memory pool acts as an emergency one + if slab allocations fail. config DEBUG_KMEMLEAK_TEST tristate "Simple test for the kernel memory leak detector" diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 2fb86524d70b..b8bbe9ac5472 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -180,15 +180,13 @@ struct kmemleak_object { #define HEX_ASCII 1 /* max number of lines to be printed */ #define HEX_MAX_LINES 2 -/* memory pool size */ -#define MEM_POOL_SIZE 16000 /* the list of all allocated objects */ static LIST_HEAD(object_list); /* the list of gray-colored objects (see color_gray comment below) */ static LIST_HEAD(gray_list); /* memory pool allocation */ -static struct kmemleak_object mem_pool[MEM_POOL_SIZE]; +static struct kmemleak_object mem_pool[CONFIG_DEBUG_KMEMLEAK_MEM_POOL_SIZE]; static int mem_pool_free_count = ARRAY_SIZE(mem_pool); static LIST_HEAD(mem_pool_free_list); /* search tree for object boundaries */ @@ -201,13 +199,11 @@ static struct kmem_cache *object_cache; static struct kmem_cache *scan_area_cache; /* set if tracing memory operations is enabled */ -static int kmemleak_enabled; +static int kmemleak_enabled = 1; /* same as above but only for the kmemleak_free() callback */ -static int kmemleak_free_enabled; +static int kmemleak_free_enabled = 1; /* set in the late_initcall if there were no errors */ static int kmemleak_initialized; -/* enables or disables early logging of the memory operations */ -static int kmemleak_early_log = 1; /* set if a kmemleak warning was issued */ static int kmemleak_warning; /* set if a fatal kmemleak error has occurred */ @@ -235,49 +231,6 @@ static bool kmemleak_found_leaks; static bool kmemleak_verbose; module_param_named(verbose, kmemleak_verbose, bool, 0600); -/* - * Early object allocation/freeing logging. Kmemleak is initialized after the - * kernel allocator. However, both the kernel allocator and kmemleak may - * allocate memory blocks which need to be tracked. Kmemleak defines an - * arbitrary buffer to hold the allocation/freeing information before it is - * fully initialized. - */ - -/* kmemleak operation type for early logging */ -enum { - KMEMLEAK_ALLOC, - KMEMLEAK_ALLOC_PERCPU, - KMEMLEAK_FREE, - KMEMLEAK_FREE_PART, - KMEMLEAK_FREE_PERCPU, - KMEMLEAK_NOT_LEAK, - KMEMLEAK_IGNORE, - KMEMLEAK_SCAN_AREA, - KMEMLEAK_NO_SCAN, - KMEMLEAK_SET_EXCESS_REF -}; - -/* - * Structure holding the information passed to kmemleak callbacks during the - * early logging. - */ -struct early_log { - int op_type; /* kmemleak operation type */ - int min_count; /* minimum reference count */ - const void *ptr; /* allocated/freed memory block */ - union { - size_t size; /* memory block size */ - unsigned long excess_ref; /* surplus reference passing */ - }; - unsigned long trace[MAX_TRACE]; /* stack trace */ - unsigned int trace_len; /* stack trace length */ -}; - -/* early logging buffer and current position */ -static struct early_log - early_log[CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE] __initdata; -static int crt_early_log __initdata; - static void kmemleak_disable(void); /* @@ -466,9 +419,11 @@ static struct kmemleak_object *mem_pool_alloc(gfp_t gfp) struct kmemleak_object *object; /* try the slab allocator first */ - object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); - if (object) - return object; + if (object_cache) { + object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); + if (object) + return object; + } /* slab allocation failed, try the memory pool */ write_lock_irqsave(&kmemleak_lock, flags); @@ -478,6 +433,8 @@ static struct kmemleak_object *mem_pool_alloc(gfp_t gfp) list_del(&object->object_list); else if (mem_pool_free_count) object = &mem_pool[--mem_pool_free_count]; + else + pr_warn_once("Memory pool empty, consider increasing CONFIG_DEBUG_KMEMLEAK_MEM_POOL_SIZE\n"); write_unlock_irqrestore(&kmemleak_lock, flags); return object; @@ -537,7 +494,15 @@ static void put_object(struct kmemleak_object *object) /* should only get here after delete_object was called */ WARN_ON(object->flags & OBJECT_ALLOCATED); - call_rcu(&object->rcu, free_object_rcu); + /* + * It may be too early for the RCU callbacks, however, there is no + * concurrent object_list traversal when !object_cache and all objects + * came from the memory pool. Free the object directly. + */ + if (object_cache) + call_rcu(&object->rcu, free_object_rcu); + else + free_object_rcu(&object->rcu); } /* @@ -741,9 +706,7 @@ static void delete_object_part(unsigned long ptr, size_t size) /* * Create one or two objects that may result from the memory block * split. Note that partial freeing is only done by free_bootmem() and - * this happens before kmemleak_init() is called. The path below is - * only executed during early log recording in kmemleak_init(), so - * GFP_KERNEL is enough. + * this happens before kmemleak_init() is called. */ start = object->pointer; end = object->pointer + object->size; @@ -815,7 +778,7 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) { unsigned long flags; struct kmemleak_object *object; - struct kmemleak_scan_area *area; + struct kmemleak_scan_area *area = NULL; object = find_and_get_object(ptr, 1); if (!object) { @@ -824,7 +787,8 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) return; } - area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp)); + if (scan_area_cache) + area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp)); spin_lock_irqsave(&object->lock, flags); if (!area) { @@ -898,86 +862,6 @@ static void object_no_scan(unsigned long ptr) put_object(object); } -/* - * Log an early kmemleak_* call to the early_log buffer. These calls will be - * processed later once kmemleak is fully initialized. - */ -static void __init log_early(int op_type, const void *ptr, size_t size, - int min_count) -{ - unsigned long flags; - struct early_log *log; - - if (kmemleak_error) { - /* kmemleak stopped recording, just count the requests */ - crt_early_log++; - return; - } - - if (crt_early_log >= ARRAY_SIZE(early_log)) { - crt_early_log++; - kmemleak_disable(); - return; - } - - /* - * There is no need for locking since the kernel is still in UP mode - * at this stage. Disabling the IRQs is enough. - */ - local_irq_save(flags); - log = &early_log[crt_early_log]; - log->op_type = op_type; - log->ptr = ptr; - log->size = size; - log->min_count = min_count; - log->trace_len = __save_stack_trace(log->trace); - crt_early_log++; - local_irq_restore(flags); -} - -/* - * Log an early allocated block and populate the stack trace. - */ -static void early_alloc(struct early_log *log) -{ - struct kmemleak_object *object; - unsigned long flags; - int i; - - if (!kmemleak_enabled || !log->ptr || IS_ERR(log->ptr)) - return; - - /* - * RCU locking needed to ensure object is not freed via put_object(). - */ - rcu_read_lock(); - object = create_object((unsigned long)log->ptr, log->size, - log->min_count, GFP_ATOMIC); - if (!object) - goto out; - spin_lock_irqsave(&object->lock, flags); - for (i = 0; i < log->trace_len; i++) - object->trace[i] = log->trace[i]; - object->trace_len = log->trace_len; - spin_unlock_irqrestore(&object->lock, flags); -out: - rcu_read_unlock(); -} - -/* - * Log an early allocated block and populate the stack trace. - */ -static void early_alloc_percpu(struct early_log *log) -{ - unsigned int cpu; - const void __percpu *ptr = log->ptr; - - for_each_possible_cpu(cpu) { - log->ptr = per_cpu_ptr(ptr, cpu); - early_alloc(log); - } -} - /** * kmemleak_alloc - register a newly allocated object * @ptr: pointer to beginning of the object @@ -999,8 +883,6 @@ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count, if (kmemleak_enabled && ptr && !IS_ERR(ptr)) create_object((unsigned long)ptr, size, min_count, gfp); - else if (kmemleak_early_log) - log_early(KMEMLEAK_ALLOC, ptr, size, min_count); } EXPORT_SYMBOL_GPL(kmemleak_alloc); @@ -1028,8 +910,6 @@ void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, for_each_possible_cpu(cpu) create_object((unsigned long)per_cpu_ptr(ptr, cpu), size, 0, gfp); - else if (kmemleak_early_log) - log_early(KMEMLEAK_ALLOC_PERCPU, ptr, size, 0); } EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu); @@ -1054,11 +934,6 @@ void __ref kmemleak_vmalloc(const struct vm_struct *area, size_t size, gfp_t gfp create_object((unsigned long)area->addr, size, 2, gfp); object_set_excess_ref((unsigned long)area, (unsigned long)area->addr); - } else if (kmemleak_early_log) { - log_early(KMEMLEAK_ALLOC, area->addr, size, 2); - /* reusing early_log.size for storing area->addr */ - log_early(KMEMLEAK_SET_EXCESS_REF, - area, (unsigned long)area->addr, 0); } } EXPORT_SYMBOL_GPL(kmemleak_vmalloc); @@ -1076,8 +951,6 @@ void __ref kmemleak_free(const void *ptr) if (kmemleak_free_enabled && ptr && !IS_ERR(ptr)) delete_object_full((unsigned long)ptr); - else if (kmemleak_early_log) - log_early(KMEMLEAK_FREE, ptr, 0, 0); } EXPORT_SYMBOL_GPL(kmemleak_free); @@ -1096,8 +969,6 @@ void __ref kmemleak_free_part(const void *ptr, size_t size) if (kmemleak_enabled && ptr && !IS_ERR(ptr)) delete_object_part((unsigned long)ptr, size); - else if (kmemleak_early_log) - log_early(KMEMLEAK_FREE_PART, ptr, size, 0); } EXPORT_SYMBOL_GPL(kmemleak_free_part); @@ -1118,8 +989,6 @@ void __ref kmemleak_free_percpu(const void __percpu *ptr) for_each_possible_cpu(cpu) delete_object_full((unsigned long)per_cpu_ptr(ptr, cpu)); - else if (kmemleak_early_log) - log_early(KMEMLEAK_FREE_PERCPU, ptr, 0, 0); } EXPORT_SYMBOL_GPL(kmemleak_free_percpu); @@ -1170,8 +1039,6 @@ void __ref kmemleak_not_leak(const void *ptr) if (kmemleak_enabled && ptr && !IS_ERR(ptr)) make_gray_object((unsigned long)ptr); - else if (kmemleak_early_log) - log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0); } EXPORT_SYMBOL(kmemleak_not_leak); @@ -1190,8 +1057,6 @@ void __ref kmemleak_ignore(const void *ptr) if (kmemleak_enabled && ptr && !IS_ERR(ptr)) make_black_object((unsigned long)ptr); - else if (kmemleak_early_log) - log_early(KMEMLEAK_IGNORE, ptr, 0, 0); } EXPORT_SYMBOL(kmemleak_ignore); @@ -1212,8 +1077,6 @@ void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) if (kmemleak_enabled && ptr && size && !IS_ERR(ptr)) add_scan_area((unsigned long)ptr, size, gfp); - else if (kmemleak_early_log) - log_early(KMEMLEAK_SCAN_AREA, ptr, size, 0); } EXPORT_SYMBOL(kmemleak_scan_area); @@ -1232,8 +1095,6 @@ void __ref kmemleak_no_scan(const void *ptr) if (kmemleak_enabled && ptr && !IS_ERR(ptr)) object_no_scan((unsigned long)ptr); - else if (kmemleak_early_log) - log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0); } EXPORT_SYMBOL(kmemleak_no_scan); @@ -2020,7 +1881,6 @@ static void kmemleak_disable(void) /* stop any memory operation tracing */ kmemleak_enabled = 0; - kmemleak_early_log = 0; /* check whether it is too early for a kernel thread */ if (kmemleak_initialized) @@ -2048,20 +1908,11 @@ static int __init kmemleak_boot_config(char *str) } early_param("kmemleak", kmemleak_boot_config); -static void __init print_log_trace(struct early_log *log) -{ - pr_notice("Early log backtrace:\n"); - stack_trace_print(log->trace, log->trace_len, 2); -} - /* * Kmemleak initialization. */ void __init kmemleak_init(void) { - int i; - unsigned long flags; - #ifdef CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF if (!kmemleak_skip_disable) { kmemleak_disable(); @@ -2069,28 +1920,15 @@ void __init kmemleak_init(void) } #endif + if (kmemleak_error) + return; + jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE); jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000); object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE); scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE); - if (crt_early_log > ARRAY_SIZE(early_log)) - pr_warn("Early log buffer exceeded (%d), please increase DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n", - crt_early_log); - - /* the kernel is still in UP mode, so disabling the IRQs is enough */ - local_irq_save(flags); - kmemleak_early_log = 0; - if (kmemleak_error) { - local_irq_restore(flags); - return; - } else { - kmemleak_enabled = 1; - kmemleak_free_enabled = 1; - } - local_irq_restore(flags); - /* register the data/bss sections */ create_object((unsigned long)_sdata, _edata - _sdata, KMEMLEAK_GREY, GFP_ATOMIC); @@ -2101,57 +1939,6 @@ void __init kmemleak_init(void) create_object((unsigned long)__start_ro_after_init, __end_ro_after_init - __start_ro_after_init, KMEMLEAK_GREY, GFP_ATOMIC); - - /* - * This is the point where tracking allocations is safe. Automatic - * scanning is started during the late initcall. Add the early logged - * callbacks to the kmemleak infrastructure. - */ - for (i = 0; i < crt_early_log; i++) { - struct early_log *log = &early_log[i]; - - switch (log->op_type) { - case KMEMLEAK_ALLOC: - early_alloc(log); - break; - case KMEMLEAK_ALLOC_PERCPU: - early_alloc_percpu(log); - break; - case KMEMLEAK_FREE: - kmemleak_free(log->ptr); - break; - case KMEMLEAK_FREE_PART: - kmemleak_free_part(log->ptr, log->size); - break; - case KMEMLEAK_FREE_PERCPU: - kmemleak_free_percpu(log->ptr); - break; - case KMEMLEAK_NOT_LEAK: - kmemleak_not_leak(log->ptr); - break; - case KMEMLEAK_IGNORE: - kmemleak_ignore(log->ptr); - break; - case KMEMLEAK_SCAN_AREA: - kmemleak_scan_area(log->ptr, log->size, GFP_KERNEL); - break; - case KMEMLEAK_NO_SCAN: - kmemleak_no_scan(log->ptr); - break; - case KMEMLEAK_SET_EXCESS_REF: - object_set_excess_ref((unsigned long)log->ptr, - log->excess_ref); - break; - default: - kmemleak_warn("Unknown early log operation: %d\n", - log->op_type); - } - - if (kmemleak_warning) { - print_log_trace(log); - kmemleak_warning = 0; - } - } } /* From 0e965a6bda80f3227dfb74af6ae644e396beaacb Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Mon, 23 Sep 2019 15:34:07 -0700 Subject: [PATCH 530/721] mm/kmemleak.c: record the current memory pool size The only way to obtain the current memory pool size for a running kernel is to check the kernel config file which is inconvenient. Record it in the kernel messages. [akpm@linux-foundation.org: s/memory pool size/memory pool/available/, per Catalin] Link: http://lkml.kernel.org/r/1565809631-28933-1-git-send-email-cai@lca.pw Signed-off-by: Qian Cai Acked-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index b8bbe9ac5472..03a8d84badad 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1967,7 +1967,8 @@ static int __init kmemleak_late_init(void) mutex_unlock(&scan_mutex); } - pr_info("Kernel memory leak detector initialized\n"); + pr_info("Kernel memory leak detector initialized (mem pool available: %d)\n", + mem_pool_free_count); return 0; } From c59180ae3e5b43d3534748b19a57905712ea5fff Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Mon, 23 Sep 2019 15:34:10 -0700 Subject: [PATCH 531/721] mm/kmemleak: increase the max mem pool to 1M There are some machines with slow disk and fast CPUs. When they are under memory pressure, it could take a long time to swap before the OOM kicks in to free up some memory. As the results, it needs a large mem pool for kmemleak or suffering from higher chance of a kmemleak metadata allocation failure. 524288 proves to be the good number for all architectures here. Increase the upper bound to 1M to leave some room for the future. Link: http://lkml.kernel.org/r/1565807572-26041-1-git-send-email-cai@lca.pw Signed-off-by: Qian Cai Acked-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index c6975cded461..6b1b1703a646 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -579,7 +579,7 @@ config DEBUG_KMEMLEAK config DEBUG_KMEMLEAK_MEM_POOL_SIZE int "Kmemleak memory pool size" depends on DEBUG_KMEMLEAK - range 200 40000 + range 200 1000000 default 16000 help Kmemleak must track all the memory allocations to avoid From ae8f06b31a83e54777514308a63f669a1fed519e Mon Sep 17 00:00:00 2001 From: Walter Wu Date: Mon, 23 Sep 2019 15:34:13 -0700 Subject: [PATCH 532/721] kasan: add memory corruption identification for software tag-based mode Add memory corruption identification at bug report for software tag-based mode. The report shows whether it is "use-after-free" or "out-of-bound" error instead of "invalid-access" error. This will make it easier for programmers to see the memory corruption problem. We extend the slab to store five old free pointer tag and free backtrace, we can check if the tagged address is in the slab record and make a good guess if the object is more like "use-after-free" or "out-of-bound". therefore every slab memory corruption can be identified whether it's "use-after-free" or "out-of-bound". [aryabinin@virtuozzo.com: simplify & clenup code] Link: https://lkml.kernel.org/r/3318f9d7-a760-3cc8-b700-f06108ae745f@virtuozzo.com] Link: http://lkml.kernel.org/r/20190821180332.11450-1-aryabinin@virtuozzo.com Signed-off-by: Walter Wu Signed-off-by: Andrey Ryabinin Acked-by: Andrey Konovalov Cc: Dmitry Vyukov Cc: Alexander Potapenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.kasan | 8 ++++++++ mm/kasan/common.c | 22 +++++++++++++++++++-- mm/kasan/kasan.h | 14 +++++++++++++- mm/kasan/report.c | 44 ++++++++++++++++++++++++++++++++---------- mm/kasan/tags_report.c | 24 +++++++++++++++++++++++ 5 files changed, 99 insertions(+), 13 deletions(-) diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index 7fa97a8b5717..6c9682ce0254 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -134,6 +134,14 @@ config KASAN_S390_4_LEVEL_PAGING to 3TB of RAM with KASan enabled). This options allows to force 4-level paging instead. +config KASAN_SW_TAGS_IDENTIFY + bool "Enable memory corruption identification" + depends on KASAN_SW_TAGS + help + This option enables best-effort identification of bug type + (use-after-free or out-of-bounds) at the cost of increased + memory consumption. + config TEST_KASAN tristate "Module for testing KASAN for bug detection" depends on m && KASAN diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 95d16a42db6b..6b6f1198c72b 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -304,7 +304,6 @@ size_t kasan_metadata_size(struct kmem_cache *cache) struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache, const void *object) { - BUILD_BUG_ON(sizeof(struct kasan_alloc_meta) > 32); return (void *)object + cache->kasan_info.alloc_meta_offset; } @@ -315,6 +314,24 @@ struct kasan_free_meta *get_free_info(struct kmem_cache *cache, return (void *)object + cache->kasan_info.free_meta_offset; } + +static void kasan_set_free_info(struct kmem_cache *cache, + void *object, u8 tag) +{ + struct kasan_alloc_meta *alloc_meta; + u8 idx = 0; + + alloc_meta = get_alloc_info(cache, object); + +#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY + idx = alloc_meta->free_track_idx; + alloc_meta->free_pointer_tag[idx] = tag; + alloc_meta->free_track_idx = (idx + 1) % KASAN_NR_FREE_STACKS; +#endif + + set_track(&alloc_meta->free_track[idx], GFP_NOWAIT); +} + void kasan_poison_slab(struct page *page) { unsigned long i; @@ -452,7 +469,8 @@ static bool __kasan_slab_free(struct kmem_cache *cache, void *object, unlikely(!(cache->flags & SLAB_KASAN))) return false; - set_track(&get_alloc_info(cache, object)->free_track, GFP_NOWAIT); + kasan_set_free_info(cache, object, tag); + quarantine_put(get_free_info(cache, object), cache); return IS_ENABLED(CONFIG_KASAN_GENERIC); diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 014f19e76247..35cff6bbb716 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -95,9 +95,19 @@ struct kasan_track { depot_stack_handle_t stack; }; +#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY +#define KASAN_NR_FREE_STACKS 5 +#else +#define KASAN_NR_FREE_STACKS 1 +#endif + struct kasan_alloc_meta { struct kasan_track alloc_track; - struct kasan_track free_track; + struct kasan_track free_track[KASAN_NR_FREE_STACKS]; +#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY + u8 free_pointer_tag[KASAN_NR_FREE_STACKS]; + u8 free_track_idx; +#endif }; struct qlist_node { @@ -146,6 +156,8 @@ void kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); void kasan_report_invalid_free(void *object, unsigned long ip); +struct page *kasan_addr_to_page(const void *addr); + #if defined(CONFIG_KASAN_GENERIC) && \ (defined(CONFIG_SLAB) || defined(CONFIG_SLUB)) void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache); diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 0e5f965f1882..621782100eaa 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -111,7 +111,7 @@ static void print_track(struct kasan_track *track, const char *prefix) } } -static struct page *addr_to_page(const void *addr) +struct page *kasan_addr_to_page(const void *addr) { if ((addr >= (void *)PAGE_OFFSET) && (addr < high_memory)) @@ -151,15 +151,38 @@ static void describe_object_addr(struct kmem_cache *cache, void *object, (void *)(object_addr + cache->object_size)); } +static struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, + void *object, u8 tag) +{ + struct kasan_alloc_meta *alloc_meta; + int i = 0; + + alloc_meta = get_alloc_info(cache, object); + +#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY + for (i = 0; i < KASAN_NR_FREE_STACKS; i++) { + if (alloc_meta->free_pointer_tag[i] == tag) + break; + } + if (i == KASAN_NR_FREE_STACKS) + i = alloc_meta->free_track_idx; +#endif + + return &alloc_meta->free_track[i]; +} + static void describe_object(struct kmem_cache *cache, void *object, - const void *addr) + const void *addr, u8 tag) { struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); if (cache->flags & SLAB_KASAN) { + struct kasan_track *free_track; + print_track(&alloc_info->alloc_track, "Allocated"); pr_err("\n"); - print_track(&alloc_info->free_track, "Freed"); + free_track = kasan_get_free_track(cache, object, tag); + print_track(free_track, "Freed"); pr_err("\n"); } @@ -344,9 +367,9 @@ static void print_address_stack_frame(const void *addr) print_decoded_frame_descr(frame_descr); } -static void print_address_description(void *addr) +static void print_address_description(void *addr, u8 tag) { - struct page *page = addr_to_page(addr); + struct page *page = kasan_addr_to_page(addr); dump_stack(); pr_err("\n"); @@ -355,7 +378,7 @@ static void print_address_description(void *addr) struct kmem_cache *cache = page->slab_cache; void *object = nearest_obj(cache, page, addr); - describe_object(cache, object, addr); + describe_object(cache, object, addr, tag); } if (kernel_or_module_addr(addr) && !init_task_stack_addr(addr)) { @@ -435,13 +458,14 @@ static bool report_enabled(void) void kasan_report_invalid_free(void *object, unsigned long ip) { unsigned long flags; + u8 tag = get_tag(object); + object = reset_tag(object); start_report(&flags); pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", (void *)ip); - print_tags(get_tag(object), reset_tag(object)); - object = reset_tag(object); + print_tags(tag, object); pr_err("\n"); - print_address_description(object); + print_address_description(object, tag); pr_err("\n"); print_shadow_for_address(object); end_report(&flags); @@ -479,7 +503,7 @@ void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned lon pr_err("\n"); if (addr_has_shadow(untagged_addr)) { - print_address_description(untagged_addr); + print_address_description(untagged_addr, get_tag(tagged_addr)); pr_err("\n"); print_shadow_for_address(info.first_bad_addr); } else { diff --git a/mm/kasan/tags_report.c b/mm/kasan/tags_report.c index 8eaf5f722271..969ae08f59d7 100644 --- a/mm/kasan/tags_report.c +++ b/mm/kasan/tags_report.c @@ -36,6 +36,30 @@ const char *get_bug_type(struct kasan_access_info *info) { +#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY + struct kasan_alloc_meta *alloc_meta; + struct kmem_cache *cache; + struct page *page; + const void *addr; + void *object; + u8 tag; + int i; + + tag = get_tag(info->access_addr); + addr = reset_tag(info->access_addr); + page = kasan_addr_to_page(addr); + if (page && PageSlab(page)) { + cache = page->slab_cache; + object = nearest_obj(cache, page, (void *)addr); + alloc_meta = get_alloc_info(cache, object); + + for (i = 0; i < KASAN_NR_FREE_STACKS; i++) + if (alloc_meta->free_pointer_tag[i] == tag) + return "use-after-free"; + return "out-of-bounds"; + } + +#endif return "invalid-access"; } From b92a953cb7f727c42a15ac2ea59bf3cf9c39370d Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 23 Sep 2019 15:34:16 -0700 Subject: [PATCH 533/721] lib/test_kasan.c: add roundtrip tests In several places we need to be able to operate on pointers which have gone via a roundtrip: virt -> {phys,page} -> virt With KASAN_SW_TAGS, we can't preserve the tag for SLUB objects, and the {phys,page} -> virt conversion will use KASAN_TAG_KERNEL. This patch adds tests to ensure that this works as expected, without false positives which have recently been spotted [1,2] in testing. [1] https://lore.kernel.org/linux-arm-kernel/20190819114420.2535-1-walter-zh.wu@mediatek.com/ [2] https://lore.kernel.org/linux-arm-kernel/20190819132347.GB9927@lakrids.cambridge.arm.com/ [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/20190821153927.28630-1-mark.rutland@arm.com Signed-off-by: Mark Rutland Reviewed-by: Andrey Konovalov Tested-by: Andrey Konovalov Acked-by: Andrey Ryabinin Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_kasan.c | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index b63b367a94e8..49cc4d570a40 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -18,6 +18,9 @@ #include #include #include +#include + +#include /* * Note: test functions are marked noinline so that their names appear in @@ -337,6 +340,42 @@ static noinline void __init kmalloc_uaf2(void) kfree(ptr2); } +static noinline void __init kfree_via_page(void) +{ + char *ptr; + size_t size = 8; + struct page *page; + unsigned long offset; + + pr_info("invalid-free false positive (via page)\n"); + ptr = kmalloc(size, GFP_KERNEL); + if (!ptr) { + pr_err("Allocation failed\n"); + return; + } + + page = virt_to_page(ptr); + offset = offset_in_page(ptr); + kfree(page_address(page) + offset); +} + +static noinline void __init kfree_via_phys(void) +{ + char *ptr; + size_t size = 8; + phys_addr_t phys; + + pr_info("invalid-free false positive (via phys)\n"); + ptr = kmalloc(size, GFP_KERNEL); + if (!ptr) { + pr_err("Allocation failed\n"); + return; + } + + phys = virt_to_phys(ptr); + kfree(phys_to_virt(phys)); +} + static noinline void __init kmem_cache_oob(void) { char *p; @@ -737,6 +776,8 @@ static int __init kmalloc_tests_init(void) kmalloc_uaf(); kmalloc_uaf_memset(); kmalloc_uaf2(); + kfree_via_page(); + kfree_via_phys(); kmem_cache_oob(); memcg_accounted_kmem_cache(); kasan_stack_oob(); From dbf7684e29d171180346b25d2a124c2a0adf563e Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 23 Sep 2019 15:34:19 -0700 Subject: [PATCH 534/721] mm/page_poison.c: fix a typo in a comment s/posioned/poisoned/ Link: http://lkml.kernel.org/r/20190721180908.6534-1-christophe.jaillet@wanadoo.fr Signed-off-by: Christophe JAILLET Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_poison.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_poison.c b/mm/page_poison.c index 21d4f97cb49b..34b9181ee5d1 100644 --- a/mm/page_poison.c +++ b/mm/page_poison.c @@ -101,7 +101,7 @@ static void unpoison_page(struct page *page) /* * Page poisoning when enabled poisons each and every page * that is freed to buddy. Thus no extra check is done to - * see if a page was posioned. + * see if a page was poisoned. */ check_poison_mem(addr, PAGE_SIZE); kunmap_atomic(addr); From 1f18b296699c83d858ca8ebb8b77dbc641d87cae Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Mon, 23 Sep 2019 15:34:22 -0700 Subject: [PATCH 535/721] mm/rmap.c: remove set but not used variable 'cstart' Fixes gcc '-Wunused-but-set-variable' warning: mm/rmap.c: In function page_mkclean_one: mm/rmap.c:906:17: warning: variable cstart set but not used [-Wunused-but-set-variable] It is not used any more since commit cdb07bdea28e ("mm/rmap.c: remove redundant variable cend") Link: http://lkml.kernel.org/r/20190724141453.38536-1-yuehaibing@huawei.com Signed-off-by: YueHaibing Reported-by: Hulk Robot Reviewed-by: Mike Kravetz Reviewed-by: Kirill Tkhai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 003377e24232..31352bba197d 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -903,10 +903,9 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, mmu_notifier_invalidate_range_start(&range); while (page_vma_mapped_walk(&pvmw)) { - unsigned long cstart; int ret = 0; - cstart = address = pvmw.address; + address = pvmw.address; if (pvmw.pte) { pte_t entry; pte_t *pte = pvmw.pte; @@ -933,7 +932,6 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, entry = pmd_wrprotect(entry); entry = pmd_mkclean(entry); set_pmd_at(vma->vm_mm, address, pmd, entry); - cstart &= PMD_MASK; ret = 1; #else /* unexpected pmd-mapped page? */ From a50b854e073cd3335bbbada8dcff83a857297dd7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 23 Sep 2019 15:34:25 -0700 Subject: [PATCH 536/721] mm: introduce page_size() Patch series "Make working with compound pages easier", v2. These three patches add three helpers and convert the appropriate places to use them. This patch (of 3): It's unnecessarily hard to find out the size of a potentially huge page. Replace 'PAGE_SIZE << compound_order(page)' with page_size(page). Link: http://lkml.kernel.org/r/20190721104612.19120-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Michal Hocko Reviewed-by: Andrew Morton Reviewed-by: Ira Weiny Acked-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/flush.c | 3 +-- arch/arm64/mm/flush.c | 3 +-- arch/ia64/mm/init.c | 2 +- drivers/crypto/chelsio/chtls/chtls_io.c | 5 ++--- drivers/staging/android/ion/ion_system_heap.c | 4 ++-- drivers/target/tcm_fc/tfc_io.c | 3 +-- fs/io_uring.c | 2 +- include/linux/hugetlb.h | 2 +- include/linux/mm.h | 6 ++++++ lib/iov_iter.c | 2 +- mm/kasan/common.c | 8 +++----- mm/nommu.c | 2 +- mm/page_vma_mapped.c | 3 +-- mm/rmap.c | 6 ++---- mm/slob.c | 2 +- mm/slub.c | 18 +++++++++--------- net/xdp/xsk.c | 2 +- 17 files changed, 35 insertions(+), 38 deletions(-) diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c index 6ecbda87ee46..4c7ebe094a83 100644 --- a/arch/arm/mm/flush.c +++ b/arch/arm/mm/flush.c @@ -204,8 +204,7 @@ void __flush_dcache_page(struct address_space *mapping, struct page *page) * coherent with the kernels mapping. */ if (!PageHighMem(page)) { - size_t page_size = PAGE_SIZE << compound_order(page); - __cpuc_flush_dcache_area(page_address(page), page_size); + __cpuc_flush_dcache_area(page_address(page), page_size(page)); } else { unsigned long i; if (cache_is_vipt_nonaliasing()) { diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index dc19300309d2..ac485163a4a7 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -56,8 +56,7 @@ void __sync_icache_dcache(pte_t pte) struct page *page = pte_page(pte); if (!test_and_set_bit(PG_dcache_clean, &page->flags)) - sync_icache_aliases(page_address(page), - PAGE_SIZE << compound_order(page)); + sync_icache_aliases(page_address(page), page_size(page)); } EXPORT_SYMBOL_GPL(__sync_icache_dcache); diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 678b98a09c85..bf9df2625bc8 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -64,7 +64,7 @@ __ia64_sync_icache_dcache (pte_t pte) if (test_bit(PG_arch_1, &page->flags)) return; /* i-cache is already coherent with d-cache */ - flush_icache_range(addr, addr + (PAGE_SIZE << compound_order(page))); + flush_icache_range(addr, addr + page_size(page)); set_bit(PG_arch_1, &page->flags); /* mark page as clean */ } diff --git a/drivers/crypto/chelsio/chtls/chtls_io.c b/drivers/crypto/chelsio/chtls/chtls_io.c index c70cb5f272cf..0891ab829b1b 100644 --- a/drivers/crypto/chelsio/chtls/chtls_io.c +++ b/drivers/crypto/chelsio/chtls/chtls_io.c @@ -1078,7 +1078,7 @@ int chtls_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) bool merge; if (page) - pg_size <<= compound_order(page); + pg_size = page_size(page); if (off < pg_size && skb_can_coalesce(skb, i, page, off)) { merge = 1; @@ -1105,8 +1105,7 @@ int chtls_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) __GFP_NORETRY, order); if (page) - pg_size <<= - compound_order(page); + pg_size <<= order; } if (!page) { page = alloc_page(gfp); diff --git a/drivers/staging/android/ion/ion_system_heap.c b/drivers/staging/android/ion/ion_system_heap.c index aa8d8425be25..b83a1d16bd89 100644 --- a/drivers/staging/android/ion/ion_system_heap.c +++ b/drivers/staging/android/ion/ion_system_heap.c @@ -120,7 +120,7 @@ static int ion_system_heap_allocate(struct ion_heap *heap, if (!page) goto free_pages; list_add_tail(&page->lru, &pages); - size_remaining -= PAGE_SIZE << compound_order(page); + size_remaining -= page_size(page); max_order = compound_order(page); i++; } @@ -133,7 +133,7 @@ static int ion_system_heap_allocate(struct ion_heap *heap, sg = table->sgl; list_for_each_entry_safe(page, tmp_page, &pages, lru) { - sg_set_page(sg, page, PAGE_SIZE << compound_order(page), 0); + sg_set_page(sg, page, page_size(page), 0); sg = sg_next(sg); list_del(&page->lru); } diff --git a/drivers/target/tcm_fc/tfc_io.c b/drivers/target/tcm_fc/tfc_io.c index a254792d882c..1354a157e9af 100644 --- a/drivers/target/tcm_fc/tfc_io.c +++ b/drivers/target/tcm_fc/tfc_io.c @@ -136,8 +136,7 @@ int ft_queue_data_in(struct se_cmd *se_cmd) page, off_in_page, tlen); fr_len(fp) += tlen; fp_skb(fp)->data_len += tlen; - fp_skb(fp)->truesize += - PAGE_SIZE << compound_order(page); + fp_skb(fp)->truesize += page_size(page); } else { BUG_ON(!page); from = kmap_atomic(page + (mem_off >> PAGE_SHIFT)); diff --git a/fs/io_uring.c b/fs/io_uring.c index 0dadbdbead0f..f83de4c6a826 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3319,7 +3319,7 @@ static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) } page = virt_to_head_page(ptr); - if (sz > (PAGE_SIZE << compound_order(page))) + if (sz > page_size(page)) return -EINVAL; pfn = virt_to_phys(ptr) >> PAGE_SHIFT; diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index edfca4278319..53fc34f930d0 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -454,7 +454,7 @@ static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, static inline struct hstate *page_hstate(struct page *page) { VM_BUG_ON_PAGE(!PageHuge(page), page); - return size_to_hstate(PAGE_SIZE << compound_order(page)); + return size_to_hstate(page_size(page)); } static inline unsigned hstate_index_to_shift(unsigned index) diff --git a/include/linux/mm.h b/include/linux/mm.h index 6e79b3df1582..d46d5585e2a2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -805,6 +805,12 @@ static inline void set_compound_order(struct page *page, unsigned int order) page[1].compound_order = order; } +/* Returns the number of bytes in this potentially compound page. */ +static inline unsigned long page_size(struct page *page) +{ + return PAGE_SIZE << compound_order(page); +} + void free_compound_page(struct page *page); #ifdef CONFIG_MMU diff --git a/lib/iov_iter.c b/lib/iov_iter.c index f1e0569b4539..639d5e7014c1 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -878,7 +878,7 @@ static inline bool page_copy_sane(struct page *page, size_t offset, size_t n) head = compound_head(page); v += (page - head) << PAGE_SHIFT; - if (likely(n <= v && v <= (PAGE_SIZE << compound_order(head)))) + if (likely(n <= v && v <= (page_size(head)))) return true; WARN_ON(1); return false; diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 6b6f1198c72b..307631d9c62b 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -338,8 +338,7 @@ void kasan_poison_slab(struct page *page) for (i = 0; i < (1 << compound_order(page)); i++) page_kasan_tag_reset(page + i); - kasan_poison_shadow(page_address(page), - PAGE_SIZE << compound_order(page), + kasan_poison_shadow(page_address(page), page_size(page), KASAN_KMALLOC_REDZONE); } @@ -542,7 +541,7 @@ void * __must_check kasan_kmalloc_large(const void *ptr, size_t size, page = virt_to_page(ptr); redzone_start = round_up((unsigned long)(ptr + size), KASAN_SHADOW_SCALE_SIZE); - redzone_end = (unsigned long)ptr + (PAGE_SIZE << compound_order(page)); + redzone_end = (unsigned long)ptr + page_size(page); kasan_unpoison_shadow(ptr, size); kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, @@ -578,8 +577,7 @@ void kasan_poison_kfree(void *ptr, unsigned long ip) kasan_report_invalid_free(ptr, ip); return; } - kasan_poison_shadow(ptr, PAGE_SIZE << compound_order(page), - KASAN_FREE_PAGE); + kasan_poison_shadow(ptr, page_size(page), KASAN_FREE_PAGE); } else { __kasan_slab_free(page->slab_cache, ptr, ip, false); } diff --git a/mm/nommu.c b/mm/nommu.c index fed1b6e9c89b..99b7ec318824 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -108,7 +108,7 @@ unsigned int kobjsize(const void *objp) * The ksize() function is only guaranteed to work for pointers * returned by kmalloc(). So handle arbitrary pointers here. */ - return PAGE_SIZE << compound_order(page); + return page_size(page); } /** diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 11df03e71288..eff4b4520c8d 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -153,8 +153,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) if (unlikely(PageHuge(pvmw->page))) { /* when pud is not present, pte will be NULL */ - pvmw->pte = huge_pte_offset(mm, pvmw->address, - PAGE_SIZE << compound_order(page)); + pvmw->pte = huge_pte_offset(mm, pvmw->address, page_size(page)); if (!pvmw->pte) return false; diff --git a/mm/rmap.c b/mm/rmap.c index 31352bba197d..f401732b20e8 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -898,8 +898,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, */ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0, vma, vma->vm_mm, address, - min(vma->vm_end, address + - (PAGE_SIZE << compound_order(page)))); + min(vma->vm_end, address + page_size(page))); mmu_notifier_invalidate_range_start(&range); while (page_vma_mapped_walk(&pvmw)) { @@ -1372,8 +1371,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, */ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, address, - min(vma->vm_end, address + - (PAGE_SIZE << compound_order(page)))); + min(vma->vm_end, address + page_size(page))); if (PageHuge(page)) { /* * If sharing is possible, start and end will be adjusted diff --git a/mm/slob.c b/mm/slob.c index 7f421d0ca9ab..cf377beab962 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -539,7 +539,7 @@ size_t __ksize(const void *block) sp = virt_to_page(block); if (unlikely(!PageSlab(sp))) - return PAGE_SIZE << compound_order(sp); + return page_size(sp); align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); m = (unsigned int *)(block - align); diff --git a/mm/slub.c b/mm/slub.c index 17fe1cac11fb..42c1b3af3c98 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -829,7 +829,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) return 1; start = page_address(page); - length = PAGE_SIZE << compound_order(page); + length = page_size(page); end = start + length; remainder = length % s->size; if (!remainder) @@ -1074,13 +1074,14 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page, init_tracking(s, object); } -static void setup_page_debug(struct kmem_cache *s, void *addr, int order) +static +void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr) { if (!(s->flags & SLAB_POISON)) return; metadata_access_enable(); - memset(addr, POISON_INUSE, PAGE_SIZE << order); + memset(addr, POISON_INUSE, page_size(page)); metadata_access_disable(); } @@ -1340,8 +1341,8 @@ slab_flags_t kmem_cache_flags(unsigned int object_size, #else /* !CONFIG_SLUB_DEBUG */ static inline void setup_object_debug(struct kmem_cache *s, struct page *page, void *object) {} -static inline void setup_page_debug(struct kmem_cache *s, - void *addr, int order) {} +static inline +void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr) {} static inline int alloc_debug_processing(struct kmem_cache *s, struct page *page, void *object, unsigned long addr) { return 0; } @@ -1639,7 +1640,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) struct kmem_cache_order_objects oo = s->oo; gfp_t alloc_gfp; void *start, *p, *next; - int idx, order; + int idx; bool shuffle; flags &= gfp_allowed_mask; @@ -1673,7 +1674,6 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) page->objects = oo_objects(oo); - order = compound_order(page); page->slab_cache = s; __SetPageSlab(page); if (page_is_pfmemalloc(page)) @@ -1683,7 +1683,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) start = page_address(page); - setup_page_debug(s, start, order); + setup_page_debug(s, page, start); shuffle = shuffle_freelist(s, page); @@ -3932,7 +3932,7 @@ size_t __ksize(const void *object) if (unlikely(!PageSlab(page))) { WARN_ON(!PageCompound(page)); - return PAGE_SIZE << compound_order(page); + return page_size(page); } return slab_ksize(page->slab_cache); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index c2f1af3b6a7c..fa8fbb8fa3c8 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -977,7 +977,7 @@ static int xsk_mmap(struct file *file, struct socket *sock, /* Matches the smp_wmb() in xsk_init_queue */ smp_rmb(); qpg = virt_to_head_page(q->ring); - if (size > (PAGE_SIZE << compound_order(qpg))) + if (size > page_size(qpg)) return -EINVAL; pfn = virt_to_phys(q->ring) >> PAGE_SHIFT; From 94ad9338109fe9d0b8a4a16828719dd6dcaee4c2 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 23 Sep 2019 15:34:28 -0700 Subject: [PATCH 537/721] mm: introduce page_shift() Replace PAGE_SHIFT + compound_order(page) with the new page_shift() function. Minor improvements in readability. [akpm@linux-foundation.org: fix build in tce_page_is_contained()] Link: http://lkml.kernel.org/r/201907241853.yNQTrJWd%25lkp@intel.com Link: http://lkml.kernel.org/r/20190721104612.19120-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Andrew Morton Reviewed-by: Ira Weiny Acked-by: Kirill A. Shutemov Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/book3s64/iommu_api.c | 7 ++----- drivers/vfio/vfio_iommu_spapr_tce.c | 8 ++++---- include/linux/mm.h | 6 ++++++ 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c index b056cae3388b..56cc84520577 100644 --- a/arch/powerpc/mm/book3s64/iommu_api.c +++ b/arch/powerpc/mm/book3s64/iommu_api.c @@ -129,11 +129,8 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, * Allow to use larger than 64k IOMMU pages. Only do that * if we are backed by hugetlb. */ - if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page)) { - struct page *head = compound_head(page); - - pageshift = compound_order(head) + PAGE_SHIFT; - } + if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page)) + pageshift = page_shift(compound_head(page)); mem->pageshift = min(mem->pageshift, pageshift); /* * We don't need struct page reference any more, switch diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 3b18fa4d090a..26cef65b41e7 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -176,13 +176,13 @@ static long tce_iommu_register_pages(struct tce_container *container, } static bool tce_page_is_contained(struct mm_struct *mm, unsigned long hpa, - unsigned int page_shift) + unsigned int it_page_shift) { struct page *page; unsigned long size = 0; - if (mm_iommu_is_devmem(mm, hpa, page_shift, &size)) - return size == (1UL << page_shift); + if (mm_iommu_is_devmem(mm, hpa, it_page_shift, &size)) + return size == (1UL << it_page_shift); page = pfn_to_page(hpa >> PAGE_SHIFT); /* @@ -190,7 +190,7 @@ static bool tce_page_is_contained(struct mm_struct *mm, unsigned long hpa, * a page we just found. Otherwise the hardware can get access to * a bigger memory chunk that it should. */ - return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift; + return page_shift(compound_head(page)) >= it_page_shift; } static inline bool tce_groups_attached(struct tce_container *container) diff --git a/include/linux/mm.h b/include/linux/mm.h index d46d5585e2a2..9238548bdec5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -811,6 +811,12 @@ static inline unsigned long page_size(struct page *page) return PAGE_SIZE << compound_order(page); } +/* Returns the number of bits needed for the number of bytes in a page */ +static inline unsigned int page_shift(struct page *page) +{ + return PAGE_SHIFT + compound_order(page); +} + void free_compound_page(struct page *page); #ifdef CONFIG_MMU From d8c6546b1aea843fbeb4d54a1202f1adda6504be Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 23 Sep 2019 15:34:30 -0700 Subject: [PATCH 538/721] mm: introduce compound_nr() Replace 1 << compound_order(page) with compound_nr(page). Minor improvements in readability. Link: http://lkml.kernel.org/r/20190721104612.19120-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Andrew Morton Reviewed-by: Ira Weiny Acked-by: Kirill A. Shutemov Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/flush.c | 4 ++-- arch/powerpc/mm/hugetlbpage.c | 2 +- fs/proc/task_mmu.c | 2 +- include/linux/mm.h | 6 ++++++ mm/compaction.c | 2 +- mm/filemap.c | 2 +- mm/gup.c | 2 +- mm/hugetlb_cgroup.c | 2 +- mm/kasan/common.c | 2 +- mm/memcontrol.c | 4 ++-- mm/memory_hotplug.c | 4 ++-- mm/migrate.c | 2 +- mm/page_alloc.c | 2 +- mm/rmap.c | 3 +-- mm/shmem.c | 8 ++++---- mm/swap_state.c | 2 +- mm/util.c | 2 +- mm/vmscan.c | 4 ++-- 18 files changed, 30 insertions(+), 25 deletions(-) diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c index 4c7ebe094a83..6d89db7895d1 100644 --- a/arch/arm/mm/flush.c +++ b/arch/arm/mm/flush.c @@ -208,13 +208,13 @@ void __flush_dcache_page(struct address_space *mapping, struct page *page) } else { unsigned long i; if (cache_is_vipt_nonaliasing()) { - for (i = 0; i < (1 << compound_order(page)); i++) { + for (i = 0; i < compound_nr(page); i++) { void *addr = kmap_atomic(page + i); __cpuc_flush_dcache_area(addr, PAGE_SIZE); kunmap_atomic(addr); } } else { - for (i = 0; i < (1 << compound_order(page)); i++) { + for (i = 0; i < compound_nr(page); i++) { void *addr = kmap_high_get(page + i); if (addr) { __cpuc_flush_dcache_area(addr, PAGE_SIZE); diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index a8953f108808..73d4873fc7f8 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -667,7 +667,7 @@ void flush_dcache_icache_hugepage(struct page *page) BUG_ON(!PageCompound(page)); - for (i = 0; i < (1UL << compound_order(page)); i++) { + for (i = 0; i < compound_nr(page); i++) { if (!PageHighMem(page)) { __flush_dcache_icache(page_address(page+i)); } else { diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index bf43d1d60059..ea1630465474 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -461,7 +461,7 @@ static void smaps_page_accumulate(struct mem_size_stats *mss, static void smaps_account(struct mem_size_stats *mss, struct page *page, bool compound, bool young, bool dirty, bool locked) { - int i, nr = compound ? 1 << compound_order(page) : 1; + int i, nr = compound ? compound_nr(page) : 1; unsigned long size = nr * PAGE_SIZE; /* diff --git a/include/linux/mm.h b/include/linux/mm.h index 9238548bdec5..69b7314c8d24 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -805,6 +805,12 @@ static inline void set_compound_order(struct page *page, unsigned int order) page[1].compound_order = order; } +/* Returns the number of pages in this potentially compound page. */ +static inline unsigned long compound_nr(struct page *page) +{ + return 1UL << compound_order(page); +} + /* Returns the number of bytes in this potentially compound page. */ static inline unsigned long page_size(struct page *page) { diff --git a/mm/compaction.c b/mm/compaction.c index 952dc2fb24e5..777c088e9113 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -969,7 +969,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * is safe to read and it's 0 for tail pages. */ if (unlikely(PageCompound(page))) { - low_pfn += (1UL << compound_order(page)) - 1; + low_pfn += compound_nr(page) - 1; goto isolate_fail; } } diff --git a/mm/filemap.c b/mm/filemap.c index 40667c2f3383..5f30aedd7363 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -126,7 +126,7 @@ static void page_cache_delete(struct address_space *mapping, /* hugetlb pages are represented by a single entry in the xarray */ if (!PageHuge(page)) { xas_set_order(&xas, page->index, compound_order(page)); - nr = 1U << compound_order(page); + nr = compound_nr(page); } VM_BUG_ON_PAGE(!PageLocked(page), page); diff --git a/mm/gup.c b/mm/gup.c index 98f13ab37bac..84a36d80dd2e 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1460,7 +1460,7 @@ static long check_and_migrate_cma_pages(struct task_struct *tsk, * gup may start from a tail page. Advance step by the left * part. */ - step = (1 << compound_order(head)) - (pages[i] - head); + step = compound_nr(head) - (pages[i] - head); /* * If we get a page from the CMA zone, since we are going to * be pinning these entries, we might as well move them out diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 68c2f2f3c05b..f1930fa0b445 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -139,7 +139,7 @@ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, if (!page_hcg || page_hcg != h_cg) goto out; - nr_pages = 1 << compound_order(page); + nr_pages = compound_nr(page); if (!parent) { parent = root_h_cgroup; /* root has no limit */ diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 307631d9c62b..6814d6d6a023 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -336,7 +336,7 @@ void kasan_poison_slab(struct page *page) { unsigned long i; - for (i = 0; i < (1 << compound_order(page)); i++) + for (i = 0; i < compound_nr(page); i++) page_kasan_tag_reset(page + i); kasan_poison_shadow(page_address(page), page_size(page), KASAN_KMALLOC_REDZONE); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f3c15bb07cce..6c6032c03d1d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6511,7 +6511,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) unsigned int nr_pages = 1; if (PageTransHuge(page)) { - nr_pages <<= compound_order(page); + nr_pages = compound_nr(page); ug->nr_huge += nr_pages; } if (PageAnon(page)) @@ -6523,7 +6523,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) } ug->pgpgout++; } else { - ug->nr_kmem += 1 << compound_order(page); + ug->nr_kmem += compound_nr(page); __ClearPageKmemcg(page); } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c73f09913165..5f2c83ce9fde 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1309,7 +1309,7 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end) head = compound_head(page); if (page_huge_active(head)) return pfn; - skip = (1 << compound_order(head)) - (page - head); + skip = compound_nr(head) - (page - head); pfn += skip - 1; } return 0; @@ -1347,7 +1347,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (PageHuge(page)) { struct page *head = compound_head(page); - pfn = page_to_pfn(head) + (1<i_pages, index, compound_order(page)); unsigned long i = 0; - unsigned long nr = 1UL << compound_order(page); + unsigned long nr = compound_nr(page); VM_BUG_ON_PAGE(PageTail(page), page); VM_BUG_ON_PAGE(index != round_down(index, nr), page); @@ -1884,7 +1884,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, lru_cache_add_anon(page); spin_lock_irq(&info->lock); - info->alloced += 1 << compound_order(page); + info->alloced += compound_nr(page); inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page); shmem_recalc_inode(inode); spin_unlock_irq(&info->lock); @@ -1925,7 +1925,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct page *head = compound_head(page); int i; - for (i = 0; i < (1 << compound_order(head)); i++) { + for (i = 0; i < compound_nr(head); i++) { clear_highpage(head + i); flush_dcache_page(head + i); } @@ -1952,7 +1952,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, * Error recovery. */ unacct: - shmem_inode_unacct_blocks(inode, 1 << compound_order(page)); + shmem_inode_unacct_blocks(inode, compound_nr(page)); if (PageTransHuge(page)) { unlock_page(page); diff --git a/mm/swap_state.c b/mm/swap_state.c index 8368621a0fc7..f844af5f09ba 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -116,7 +116,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp) struct address_space *address_space = swap_address_space(entry); pgoff_t idx = swp_offset(entry); XA_STATE_ORDER(xas, &address_space->i_pages, idx, compound_order(page)); - unsigned long i, nr = 1UL << compound_order(page); + unsigned long i, nr = compound_nr(page); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageSwapCache(page), page); diff --git a/mm/util.c b/mm/util.c index e6351a80f248..bab284d69c8c 100644 --- a/mm/util.c +++ b/mm/util.c @@ -521,7 +521,7 @@ bool page_mapped(struct page *page) return true; if (PageHuge(page)) return false; - for (i = 0; i < (1 << compound_order(page)); i++) { + for (i = 0; i < compound_nr(page); i++) { if (atomic_read(&page[i]._mapcount) >= 0) return true; } diff --git a/mm/vmscan.c b/mm/vmscan.c index a6c5d0b28321..8e03427cb64f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1149,7 +1149,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, VM_BUG_ON_PAGE(PageActive(page), page); - nr_pages = 1 << compound_order(page); + nr_pages = compound_nr(page); /* Account the number of base pages even though THP */ sc->nr_scanned += nr_pages; @@ -1705,7 +1705,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, VM_BUG_ON_PAGE(!PageLRU(page), page); - nr_pages = 1 << compound_order(page); + nr_pages = compound_nr(page); total_scan += nr_pages; if (page_zonenum(page) > sc->reclaim_idx) { From e7a1aaf28770c1f7a06c50cbd02ca0f27ce61ec5 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Mon, 23 Sep 2019 15:34:33 -0700 Subject: [PATCH 539/721] mm: replace list_move_tail() with add_page_to_lru_list_tail() This is a cleanup patch that replaces two historical uses of list_move_tail() with relatively recent add_page_to_lru_list_tail(). Link: http://lkml.kernel.org/r/20190716212436.7137-1-yuzhao@google.com Signed-off-by: Yu Zhao Cc: Vlastimil Babka Cc: Michal Hocko Cc: Jason Gunthorpe Cc: Dan Williams Cc: Matthew Wilcox Cc: Ira Weiny Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index ae300397dfda..0226c5346560 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -515,7 +515,6 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, del_page_from_lru_list(page, lruvec, lru + active); ClearPageActive(page); ClearPageReferenced(page); - add_page_to_lru_list(page, lruvec, lru); if (PageWriteback(page) || PageDirty(page)) { /* @@ -523,13 +522,14 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, * It can make readahead confusing. But race window * is _really_ small and it's non-critical problem. */ + add_page_to_lru_list(page, lruvec, lru); SetPageReclaim(page); } else { /* * The page's writeback ends up during pagevec * We moves tha page into tail of inactive. */ - list_move_tail(&page->lru, &lruvec->lists[lru]); + add_page_to_lru_list_tail(page, lruvec, lru); __count_vm_event(PGROTATED); } @@ -844,17 +844,15 @@ void lru_add_page_tail(struct page *page, struct page *page_tail, get_page(page_tail); list_add_tail(&page_tail->lru, list); } else { - struct list_head *list_head; /* * Head page has not yet been counted, as an hpage, * so we must account for each subpage individually. * - * Use the standard add function to put page_tail on the list, - * but then correct its position so they all end up in order. + * Put page_tail on the list at the correct position + * so they all end up in order. */ - add_page_to_lru_list(page_tail, lruvec, page_lru(page_tail)); - list_head = page_tail->lru.prev; - list_move_tail(&page_tail->lru, list_head); + add_page_to_lru_list_tail(page_tail, lruvec, + page_lru(page_tail)); } if (!PageUnevictable(page)) From 7e2f2a0cd17cfc42acb4b6a293d5cb6c7eda9862 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 23 Sep 2019 15:34:36 -0700 Subject: [PATCH 540/721] mm, page_owner: record page owner for each subpage Patch series "debug_pagealloc improvements through page_owner", v2. The debug_pagealloc functionality serves a similar purpose on the page allocator level that slub_debug does on the kmalloc level, which is to detect bad users. One notable feature that slub_debug has is storing stack traces of who last allocated and freed the object. On page level we track allocations via page_owner, but that info is discarded when freeing, and we don't track freeing at all. This series improves those aspects. With both debug_pagealloc and page_owner enabled, we can then get bug reports such as the example in Patch 4. SLUB debug tracking additionally stores cpu, pid and timestamp. This could be added later, if deemed useful enough to justify the additional page_ext structure size. This patch (of 3): Currently, page owner info is only recorded for the first page of a high-order allocation, and copied to tail pages in the event of a split page. With the plan to keep previous owner info after freeing the page, it would be benefical to record page owner for each subpage upon allocation. This increases the overhead for high orders, but that should be acceptable for a debugging option. The order stored for each subpage is the order of the whole allocation. This makes it possible to calculate the "head" pfn and to recognize "tail" pages (quoted because not all high-order allocations are compound pages with true head and tail pages). When reading the page_owner debugfs file, keep skipping the "tail" pages so that stats gathered by existing scripts don't get inflated. Link: http://lkml.kernel.org/r/20190820131828.22684-3-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Kirill A. Shutemov Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_owner.c | 40 ++++++++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index addcbb2ae4e4..813fcb70547b 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -154,18 +154,23 @@ static noinline depot_stack_handle_t save_stack(gfp_t flags) return handle; } -static inline void __set_page_owner_handle(struct page_ext *page_ext, - depot_stack_handle_t handle, unsigned int order, gfp_t gfp_mask) +static inline void __set_page_owner_handle(struct page *page, + struct page_ext *page_ext, depot_stack_handle_t handle, + unsigned int order, gfp_t gfp_mask) { struct page_owner *page_owner; + int i; - page_owner = get_page_owner(page_ext); - page_owner->handle = handle; - page_owner->order = order; - page_owner->gfp_mask = gfp_mask; - page_owner->last_migrate_reason = -1; + for (i = 0; i < (1 << order); i++) { + page_owner = get_page_owner(page_ext); + page_owner->handle = handle; + page_owner->order = order; + page_owner->gfp_mask = gfp_mask; + page_owner->last_migrate_reason = -1; + __set_bit(PAGE_EXT_OWNER, &page_ext->flags); - __set_bit(PAGE_EXT_OWNER, &page_ext->flags); + page_ext = lookup_page_ext(page + i); + } } noinline void __set_page_owner(struct page *page, unsigned int order, @@ -178,7 +183,7 @@ noinline void __set_page_owner(struct page *page, unsigned int order, return; handle = save_stack(gfp_mask); - __set_page_owner_handle(page_ext, handle, order, gfp_mask); + __set_page_owner_handle(page, page_ext, handle, order, gfp_mask); } void __set_page_owner_migrate_reason(struct page *page, int reason) @@ -204,8 +209,11 @@ void __split_page_owner(struct page *page, unsigned int order) page_owner = get_page_owner(page_ext); page_owner->order = 0; - for (i = 1; i < (1 << order); i++) - __copy_page_owner(page, page + i); + for (i = 1; i < (1 << order); i++) { + page_ext = lookup_page_ext(page + i); + page_owner = get_page_owner(page_ext); + page_owner->order = 0; + } } void __copy_page_owner(struct page *oldpage, struct page *newpage) @@ -483,6 +491,13 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) page_owner = get_page_owner(page_ext); + /* + * Don't print "tail" pages of high-order allocations as that + * would inflate the stats. + */ + if (!IS_ALIGNED(pfn, 1 << page_owner->order)) + continue; + /* * Access to page_ext->handle isn't synchronous so we should * be careful to access it. @@ -562,7 +577,8 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) continue; /* Found early allocated page */ - __set_page_owner_handle(page_ext, early_handle, 0, 0); + __set_page_owner_handle(page, page_ext, early_handle, + 0, 0); count++; } cond_resched(); From 37389167a281f3ccb6bc958c32b2e088c7269fe0 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 23 Sep 2019 15:34:39 -0700 Subject: [PATCH 541/721] mm, page_owner: keep owner info when freeing the page For debugging purposes it might be useful to keep the owner info even after page has been freed, and include it in e.g. dump_page() when detecting a bad page state. For that, change the PAGE_EXT_OWNER flag meaning to "page owner info has been set at least once" and add new PAGE_EXT_OWNER_ACTIVE for tracking whether page is supposed to be currently tracked allocated or free. Adjust dump_page() accordingly, distinguishing free and allocated pages. In the page_owner debugfs file, keep printing only allocated pages so that existing scripts are not confused, and also because free pages are irrelevant for the memory statistics or leak detection that's the typical use case of the file, anyway. Link: http://lkml.kernel.org/r/20190820131828.22684-4-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Kirill A. Shutemov Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_ext.h | 1 + mm/page_owner.c | 34 ++++++++++++++++++++++++---------- 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index 09592951725c..682fd465df06 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -18,6 +18,7 @@ struct page_ext_operations { enum page_ext_flags { PAGE_EXT_OWNER, + PAGE_EXT_OWNER_ACTIVE, #if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT) PAGE_EXT_YOUNG, PAGE_EXT_IDLE, diff --git a/mm/page_owner.c b/mm/page_owner.c index 813fcb70547b..4a48e018dbdf 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -111,7 +111,7 @@ void __reset_page_owner(struct page *page, unsigned int order) page_ext = lookup_page_ext(page + i); if (unlikely(!page_ext)) continue; - __clear_bit(PAGE_EXT_OWNER, &page_ext->flags); + __clear_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags); } } @@ -168,6 +168,7 @@ static inline void __set_page_owner_handle(struct page *page, page_owner->gfp_mask = gfp_mask; page_owner->last_migrate_reason = -1; __set_bit(PAGE_EXT_OWNER, &page_ext->flags); + __set_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags); page_ext = lookup_page_ext(page + i); } @@ -243,6 +244,7 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage) * the new page, which will be freed. */ __set_bit(PAGE_EXT_OWNER, &new_ext->flags); + __set_bit(PAGE_EXT_OWNER_ACTIVE, &new_ext->flags); } void pagetypeinfo_showmixedcount_print(struct seq_file *m, @@ -302,7 +304,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, if (unlikely(!page_ext)) continue; - if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) + if (!test_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags)) continue; page_owner = get_page_owner(page_ext); @@ -413,21 +415,26 @@ void __dump_page_owner(struct page *page) mt = gfpflags_to_migratetype(gfp_mask); if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) { - pr_alert("page_owner info is not active (free page?)\n"); + pr_alert("page_owner info is not present (never set?)\n"); return; } + if (test_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags)) + pr_alert("page_owner tracks the page as allocated\n"); + else + pr_alert("page_owner tracks the page as freed\n"); + + pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n", + page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask); + handle = READ_ONCE(page_owner->handle); if (!handle) { - pr_alert("page_owner info is not active (free page?)\n"); - return; + pr_alert("page_owner allocation stack trace missing\n"); + } else { + nr_entries = stack_depot_fetch(handle, &entries); + stack_trace_print(entries, nr_entries, 0); } - nr_entries = stack_depot_fetch(handle, &entries); - pr_alert("page allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n", - page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask); - stack_trace_print(entries, nr_entries, 0); - if (page_owner->last_migrate_reason != -1) pr_alert("page has been migrated, last migrate reason: %s\n", migrate_reason_names[page_owner->last_migrate_reason]); @@ -489,6 +496,13 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) continue; + /* + * Although we do have the info about past allocation of free + * pages, it's not relevant for current memory usage. + */ + if (!test_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags)) + continue; + page_owner = get_page_owner(page_ext); /* From 8974558f49a6a41b4a74db672e13bca616eff6d8 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 23 Sep 2019 15:34:42 -0700 Subject: [PATCH 542/721] mm, page_owner, debug_pagealloc: save and dump freeing stack trace The debug_pagealloc functionality is useful to catch buggy page allocator users that cause e.g. use after free or double free. When page inconsistency is detected, debugging is often simpler by knowing the call stack of process that last allocated and freed the page. When page_owner is also enabled, we record the allocation stack trace, but not freeing. This patch therefore adds recording of freeing process stack trace to page owner info, if both page_owner and debug_pagealloc are configured and enabled. With only page_owner enabled, this info is not useful for the memory leak debugging use case. dump_page() is adjusted to print the info. An example result of calling __free_pages() twice may look like this (note the page last free stack trace): BUG: Bad page state in process bash pfn:13d8f8 page:ffffc31984f63e00 refcount:-1 mapcount:0 mapping:0000000000000000 index:0x0 flags: 0x1affff800000000() raw: 01affff800000000 dead000000000100 dead000000000122 0000000000000000 raw: 0000000000000000 0000000000000000 ffffffffffffffff 0000000000000000 page dumped because: nonzero _refcount page_owner tracks the page as freed page last allocated via order 0, migratetype Unmovable, gfp_mask 0xcc0(GFP_KERNEL) prep_new_page+0x143/0x150 get_page_from_freelist+0x289/0x380 __alloc_pages_nodemask+0x13c/0x2d0 khugepaged+0x6e/0xc10 kthread+0xf9/0x130 ret_from_fork+0x3a/0x50 page last free stack trace: free_pcp_prepare+0x134/0x1e0 free_unref_page+0x18/0x90 khugepaged+0x7b/0xc10 kthread+0xf9/0x130 ret_from_fork+0x3a/0x50 Modules linked in: CPU: 3 PID: 271 Comm: bash Not tainted 5.3.0-rc4-2.g07a1a73-default+ #57 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.1-0-ga5cab58-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack+0x85/0xc0 bad_page.cold+0xba/0xbf rmqueue_pcplist.isra.0+0x6c5/0x6d0 rmqueue+0x2d/0x810 get_page_from_freelist+0x191/0x380 __alloc_pages_nodemask+0x13c/0x2d0 __get_free_pages+0xd/0x30 __pud_alloc+0x2c/0x110 copy_page_range+0x4f9/0x630 dup_mmap+0x362/0x480 dup_mm+0x68/0x110 copy_process+0x19e1/0x1b40 _do_fork+0x73/0x310 __x64_sys_clone+0x75/0x80 do_syscall_64+0x6e/0x1e0 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x7f10af854a10 ... Link: http://lkml.kernel.org/r/20190820131828.22684-5-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Kirill A. Shutemov Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../admin-guide/kernel-parameters.txt | 2 + mm/Kconfig.debug | 4 +- mm/page_owner.c | 53 ++++++++++++++----- 3 files changed, 45 insertions(+), 14 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index d3814789304f..8a54ed862049 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -809,6 +809,8 @@ enables the feature at boot time. By default, it is disabled and the system will work mostly the same as a kernel built without CONFIG_DEBUG_PAGEALLOC. + Note: to get most of debug_pagealloc error reports, it's + useful to also enable the page_owner functionality. on: enable the feature debugpat [X86] Enable PAT debugging diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 82b6a20898bd..327b3ebf23bf 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -21,7 +21,9 @@ config DEBUG_PAGEALLOC Also, the state of page tracking structures is checked more often as pages are being allocated and freed, as unexpected state changes often happen for same reasons as memory corruption (e.g. double free, - use-after-free). + use-after-free). The error reports for these checks can be augmented + with stack traces of last allocation and freeing of the page, when + PAGE_OWNER is also selected and enabled on boot. For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC, fill the pages with poison patterns after free_pages() and verify diff --git a/mm/page_owner.c b/mm/page_owner.c index 4a48e018dbdf..dee931184788 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -24,6 +24,9 @@ struct page_owner { short last_migrate_reason; gfp_t gfp_mask; depot_stack_handle_t handle; +#ifdef CONFIG_DEBUG_PAGEALLOC + depot_stack_handle_t free_handle; +#endif }; static bool page_owner_disabled = true; @@ -102,19 +105,6 @@ static inline struct page_owner *get_page_owner(struct page_ext *page_ext) return (void *)page_ext + page_owner_ops.offset; } -void __reset_page_owner(struct page *page, unsigned int order) -{ - int i; - struct page_ext *page_ext; - - for (i = 0; i < (1 << order); i++) { - page_ext = lookup_page_ext(page + i); - if (unlikely(!page_ext)) - continue; - __clear_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags); - } -} - static inline bool check_recursive_alloc(unsigned long *entries, unsigned int nr_entries, unsigned long ip) @@ -154,6 +144,32 @@ static noinline depot_stack_handle_t save_stack(gfp_t flags) return handle; } +void __reset_page_owner(struct page *page, unsigned int order) +{ + int i; + struct page_ext *page_ext; +#ifdef CONFIG_DEBUG_PAGEALLOC + depot_stack_handle_t handle = 0; + struct page_owner *page_owner; + + if (debug_pagealloc_enabled()) + handle = save_stack(GFP_NOWAIT | __GFP_NOWARN); +#endif + + for (i = 0; i < (1 << order); i++) { + page_ext = lookup_page_ext(page + i); + if (unlikely(!page_ext)) + continue; + __clear_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags); +#ifdef CONFIG_DEBUG_PAGEALLOC + if (debug_pagealloc_enabled()) { + page_owner = get_page_owner(page_ext); + page_owner->free_handle = handle; + } +#endif + } +} + static inline void __set_page_owner_handle(struct page *page, struct page_ext *page_ext, depot_stack_handle_t handle, unsigned int order, gfp_t gfp_mask) @@ -435,6 +451,17 @@ void __dump_page_owner(struct page *page) stack_trace_print(entries, nr_entries, 0); } +#ifdef CONFIG_DEBUG_PAGEALLOC + handle = READ_ONCE(page_owner->free_handle); + if (!handle) { + pr_alert("page_owner free stack trace missing\n"); + } else { + nr_entries = stack_depot_fetch(handle, &entries); + pr_alert("page last free stack trace:\n"); + stack_trace_print(entries, nr_entries, 0); + } +#endif + if (page_owner->last_migrate_reason != -1) pr_alert("page has been migrated, last migrate reason: %s\n", migrate_reason_names[page_owner->last_migrate_reason]); From c3aab9a0bd91b696a852169479b7db1ece6cbf8c Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 23 Sep 2019 15:34:45 -0700 Subject: [PATCH 543/721] mm/filemap.c: don't initiate writeback if mapping has no dirty pages Functions like filemap_write_and_wait_range() should do nothing if inode has no dirty pages or pages currently under writeback. But they anyway construct struct writeback_control and this does some atomic operations if CONFIG_CGROUP_WRITEBACK=y - on fast path it locks inode->i_lock and updates state of writeback ownership, on slow path might be more work. Current this path is safely avoided only when inode mapping has no pages. For example generic_file_read_iter() calls filemap_write_and_wait_range() at each O_DIRECT read - pretty hot path. This patch skips starting new writeback if mapping has no dirty tags set. If writeback is already in progress filemap_write_and_wait_range() will wait for it. Link: http://lkml.kernel.org/r/156378816804.1087.8607636317907921438.stgit@buzz Signed-off-by: Konstantin Khlebnikov Reviewed-by: Jan Kara Cc: Tejun Heo Cc: Jens Axboe Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index 5f30aedd7363..82f3b9a3a940 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -408,7 +408,8 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, .range_end = end, }; - if (!mapping_cap_writeback_dirty(mapping)) + if (!mapping_cap_writeback_dirty(mapping) || + !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) return 0; wbc_attach_fdatawrite_inode(&wbc, mapping->host); From 875d91b11a201276ac3a9ab79f8b0fa3dc4ee8fd Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 23 Sep 2019 15:34:48 -0700 Subject: [PATCH 544/721] mm/filemap.c: rewrite mapping_needs_writeback in less fancy manner This actually checks that writeback is needed or in progress. Link: http://lkml.kernel.org/r/156378817069.1087.1302816672037672488.stgit@buzz Signed-off-by: Konstantin Khlebnikov Reviewed-by: Andrew Morton Cc: Tejun Heo Cc: Jens Axboe Cc: Johannes Weiner Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 82f3b9a3a940..d5462d706f76 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -618,10 +618,13 @@ int filemap_fdatawait_keep_errors(struct address_space *mapping) } EXPORT_SYMBOL(filemap_fdatawait_keep_errors); +/* Returns true if writeback might be needed or already in progress. */ static bool mapping_needs_writeback(struct address_space *mapping) { - return (!dax_mapping(mapping) && mapping->nrpages) || - (dax_mapping(mapping) && mapping->nrexceptional); + if (dax_mapping(mapping)) + return mapping->nrexceptional; + + return mapping->nrpages; } int filemap_write_and_wait(struct address_space *mapping) From 4101196b19d7f905dca5dcf46cd35eb758cf06c0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 23 Sep 2019 15:34:52 -0700 Subject: [PATCH 545/721] mm: page cache: store only head pages in i_pages Transparent Huge Pages are currently stored in i_pages as pointers to consecutive subpages. This patch changes that to storing consecutive pointers to the head page in preparation for storing huge pages more efficiently in i_pages. Large parts of this are "inspired" by Kirill's patch https://lore.kernel.org/lkml/20170126115819.58875-2-kirill.shutemov@linux.intel.com/ Kirill and Huang Ying contributed several fixes. [willy@infradead.org: use compound_nr, squish uninit-var warning] Link: http://lkml.kernel.org/r/20190731210400.7419-1-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Jan Kara Reviewed-by: Kirill Shutemov Reviewed-by: Song Liu Tested-by: Song Liu Tested-by: William Kucharski Reviewed-by: William Kucharski Tested-by: Qian Cai Tested-by: Mikhail Gavrilov Cc: Hugh Dickins Cc: Chris Wilson Cc: Song Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 10 +++ mm/filemap.c | 147 ++++++++++++++++------------------------ mm/huge_memory.c | 22 +++++- mm/khugepaged.c | 4 +- mm/memfd.c | 2 + mm/migrate.c | 2 +- mm/shmem.c | 2 +- mm/swap_state.c | 4 +- 8 files changed, 96 insertions(+), 97 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index c7552459a15f..37a4d9e32cd3 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -333,6 +333,16 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping, mapping_gfp_mask(mapping)); } +static inline struct page *find_subpage(struct page *page, pgoff_t offset) +{ + if (PageHuge(page)) + return page; + + VM_BUG_ON_PAGE(PageTail(page), page); + + return page + (offset & (compound_nr(page) - 1)); +} + struct page *find_get_entry(struct address_space *mapping, pgoff_t offset); struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset); unsigned find_get_entries(struct address_space *mapping, pgoff_t start, diff --git a/mm/filemap.c b/mm/filemap.c index d5462d706f76..533f271d6839 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -281,11 +281,11 @@ EXPORT_SYMBOL(delete_from_page_cache); * @pvec: pagevec with pages to delete * * The function walks over mapping->i_pages and removes pages passed in @pvec - * from the mapping. The function expects @pvec to be sorted by page index. + * from the mapping. The function expects @pvec to be sorted by page index + * and is optimised for it to be dense. * It tolerates holes in @pvec (mapping entries at those indices are not * modified). The function expects only THP head pages to be present in the - * @pvec and takes care to delete all corresponding tail pages from the - * mapping as well. + * @pvec. * * The function expects the i_pages lock to be held. */ @@ -294,40 +294,43 @@ static void page_cache_delete_batch(struct address_space *mapping, { XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index); int total_pages = 0; - int i = 0, tail_pages = 0; + int i = 0; struct page *page; mapping_set_update(&xas, mapping); xas_for_each(&xas, page, ULONG_MAX) { - if (i >= pagevec_count(pvec) && !tail_pages) + if (i >= pagevec_count(pvec)) break; + + /* A swap/dax/shadow entry got inserted? Skip it. */ if (xa_is_value(page)) continue; - if (!tail_pages) { - /* - * Some page got inserted in our range? Skip it. We - * have our pages locked so they are protected from - * being removed. - */ - if (page != pvec->pages[i]) { - VM_BUG_ON_PAGE(page->index > - pvec->pages[i]->index, page); - continue; - } - WARN_ON_ONCE(!PageLocked(page)); - if (PageTransHuge(page) && !PageHuge(page)) - tail_pages = HPAGE_PMD_NR - 1; - page->mapping = NULL; - /* - * Leave page->index set: truncation lookup relies - * upon it - */ - i++; - } else { - VM_BUG_ON_PAGE(page->index + HPAGE_PMD_NR - tail_pages - != pvec->pages[i]->index, page); - tail_pages--; + /* + * A page got inserted in our range? Skip it. We have our + * pages locked so they are protected from being removed. + * If we see a page whose index is higher than ours, it + * means our page has been removed, which shouldn't be + * possible because we're holding the PageLock. + */ + if (page != pvec->pages[i]) { + VM_BUG_ON_PAGE(page->index > pvec->pages[i]->index, + page); + continue; } + + WARN_ON_ONCE(!PageLocked(page)); + + if (page->index == xas.xa_index) + page->mapping = NULL; + /* Leave page->index set: truncation lookup relies on it */ + + /* + * Move to the next page in the vector if this is a regular + * page or the index is of the last sub-page of this compound + * page. + */ + if (page->index + compound_nr(page) - 1 == xas.xa_index) + i++; xas_store(&xas, NULL); total_pages++; } @@ -1520,7 +1523,7 @@ EXPORT_SYMBOL(page_cache_prev_miss); struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) { XA_STATE(xas, &mapping->i_pages, offset); - struct page *head, *page; + struct page *page; rcu_read_lock(); repeat: @@ -1535,25 +1538,19 @@ struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) if (!page || xa_is_value(page)) goto out; - head = compound_head(page); - if (!page_cache_get_speculative(head)) + if (!page_cache_get_speculative(page)) goto repeat; - /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } - /* - * Has the page moved? + * Has the page moved or been split? * This is part of the lockless pagecache protocol. See * include/linux/pagemap.h for details. */ if (unlikely(page != xas_reload(&xas))) { - put_page(head); + put_page(page); goto repeat; } + page = find_subpage(page, offset); out: rcu_read_unlock(); @@ -1735,7 +1732,6 @@ unsigned find_get_entries(struct address_space *mapping, rcu_read_lock(); xas_for_each(&xas, page, ULONG_MAX) { - struct page *head; if (xas_retry(&xas, page)) continue; /* @@ -1746,17 +1742,13 @@ unsigned find_get_entries(struct address_space *mapping, if (xa_is_value(page)) goto export; - head = compound_head(page); - if (!page_cache_get_speculative(head)) + if (!page_cache_get_speculative(page)) goto retry; - /* The page was split under us? */ - if (compound_head(page) != head) - goto put_page; - - /* Has the page moved? */ + /* Has the page moved or been split? */ if (unlikely(page != xas_reload(&xas))) goto put_page; + page = find_subpage(page, xas.xa_index); export: indices[ret] = xas.xa_index; @@ -1765,7 +1757,7 @@ unsigned find_get_entries(struct address_space *mapping, break; continue; put_page: - put_page(head); + put_page(page); retry: xas_reset(&xas); } @@ -1807,33 +1799,27 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, rcu_read_lock(); xas_for_each(&xas, page, end) { - struct page *head; if (xas_retry(&xas, page)) continue; /* Skip over shadow, swap and DAX entries */ if (xa_is_value(page)) continue; - head = compound_head(page); - if (!page_cache_get_speculative(head)) + if (!page_cache_get_speculative(page)) goto retry; - /* The page was split under us? */ - if (compound_head(page) != head) - goto put_page; - - /* Has the page moved? */ + /* Has the page moved or been split? */ if (unlikely(page != xas_reload(&xas))) goto put_page; - pages[ret] = page; + pages[ret] = find_subpage(page, xas.xa_index); if (++ret == nr_pages) { *start = xas.xa_index + 1; goto out; } continue; put_page: - put_page(head); + put_page(page); retry: xas_reset(&xas); } @@ -1878,7 +1864,6 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, rcu_read_lock(); for (page = xas_load(&xas); page; page = xas_next(&xas)) { - struct page *head; if (xas_retry(&xas, page)) continue; /* @@ -1888,24 +1873,19 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, if (xa_is_value(page)) break; - head = compound_head(page); - if (!page_cache_get_speculative(head)) + if (!page_cache_get_speculative(page)) goto retry; - /* The page was split under us? */ - if (compound_head(page) != head) - goto put_page; - - /* Has the page moved? */ + /* Has the page moved or been split? */ if (unlikely(page != xas_reload(&xas))) goto put_page; - pages[ret] = page; + pages[ret] = find_subpage(page, xas.xa_index); if (++ret == nr_pages) break; continue; put_page: - put_page(head); + put_page(page); retry: xas_reset(&xas); } @@ -1941,7 +1921,6 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, rcu_read_lock(); xas_for_each_marked(&xas, page, end, tag) { - struct page *head; if (xas_retry(&xas, page)) continue; /* @@ -1952,26 +1931,21 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, if (xa_is_value(page)) continue; - head = compound_head(page); - if (!page_cache_get_speculative(head)) + if (!page_cache_get_speculative(page)) goto retry; - /* The page was split under us? */ - if (compound_head(page) != head) - goto put_page; - - /* Has the page moved? */ + /* Has the page moved or been split? */ if (unlikely(page != xas_reload(&xas))) goto put_page; - pages[ret] = page; + pages[ret] = find_subpage(page, xas.xa_index); if (++ret == nr_pages) { *index = xas.xa_index + 1; goto out; } continue; put_page: - put_page(head); + put_page(page); retry: xas_reset(&xas); } @@ -2652,7 +2626,7 @@ void filemap_map_pages(struct vm_fault *vmf, pgoff_t last_pgoff = start_pgoff; unsigned long max_idx; XA_STATE(xas, &mapping->i_pages, start_pgoff); - struct page *head, *page; + struct page *page; rcu_read_lock(); xas_for_each(&xas, page, end_pgoff) { @@ -2661,24 +2635,19 @@ void filemap_map_pages(struct vm_fault *vmf, if (xa_is_value(page)) goto next; - head = compound_head(page); - /* * Check for a locked page first, as a speculative * reference may adversely influence page migration. */ - if (PageLocked(head)) + if (PageLocked(page)) goto next; - if (!page_cache_get_speculative(head)) + if (!page_cache_get_speculative(page)) goto next; - /* The page was split under us? */ - if (compound_head(page) != head) - goto skip; - - /* Has the page moved? */ + /* Has the page moved or been split? */ if (unlikely(page != xas_reload(&xas))) goto skip; + page = find_subpage(page, xas.xa_index); if (!PageUptodate(page) || PageReadahead(page) || diff --git a/mm/huge_memory.c b/mm/huge_memory.c index de1f15969e27..483b07b2d6ae 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2497,6 +2497,8 @@ static void __split_huge_page(struct page *page, struct list_head *list, struct page *head = compound_head(page); pg_data_t *pgdat = page_pgdat(head); struct lruvec *lruvec; + struct address_space *swap_cache = NULL; + unsigned long offset = 0; int i; lruvec = mem_cgroup_page_lruvec(head, pgdat); @@ -2504,6 +2506,14 @@ static void __split_huge_page(struct page *page, struct list_head *list, /* complete memcg works before add pages to LRU */ mem_cgroup_split_huge_fixup(head); + if (PageAnon(head) && PageSwapCache(head)) { + swp_entry_t entry = { .val = page_private(head) }; + + offset = swp_offset(entry); + swap_cache = swap_address_space(entry); + xa_lock(&swap_cache->i_pages); + } + for (i = HPAGE_PMD_NR - 1; i >= 1; i--) { __split_huge_page_tail(head, i, lruvec, list); /* Some pages can be beyond i_size: drop them from page cache */ @@ -2513,6 +2523,12 @@ static void __split_huge_page(struct page *page, struct list_head *list, if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head)) shmem_uncharge(head->mapping->host, 1); put_page(head + i); + } else if (!PageAnon(page)) { + __xa_store(&head->mapping->i_pages, head[i].index, + head + i, 0); + } else if (swap_cache) { + __xa_store(&swap_cache->i_pages, offset + i, + head + i, 0); } } @@ -2523,10 +2539,12 @@ static void __split_huge_page(struct page *page, struct list_head *list, /* See comment in __split_huge_page_tail() */ if (PageAnon(head)) { /* Additional pin to swap cache */ - if (PageSwapCache(head)) + if (PageSwapCache(head)) { page_ref_add(head, 2); - else + xa_unlock(&swap_cache->i_pages); + } else { page_ref_inc(head); + } } else { /* Additional pin to page cache */ page_ref_add(head, 2); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index ccede2425c3f..04a54ff5a8ac 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1378,7 +1378,7 @@ static void collapse_shmem(struct mm_struct *mm, result = SCAN_FAIL; goto xa_locked; } - xas_store(&xas, new_page + (index % HPAGE_PMD_NR)); + xas_store(&xas, new_page); nr_none++; continue; } @@ -1454,7 +1454,7 @@ static void collapse_shmem(struct mm_struct *mm, list_add_tail(&page->lru, &pagelist); /* Finally, replace with the new page. */ - xas_store(&xas, new_page + (index % HPAGE_PMD_NR)); + xas_store(&xas, new_page); continue; out_unlock: unlock_page(page); diff --git a/mm/memfd.c b/mm/memfd.c index 650e65a46b9c..2647c898990c 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -39,6 +39,7 @@ static void memfd_tag_pins(struct xa_state *xas) xas_for_each(xas, page, ULONG_MAX) { if (xa_is_value(page)) continue; + page = find_subpage(page, xas->xa_index); if (page_count(page) - page_mapcount(page) > 1) xas_set_mark(xas, MEMFD_TAG_PINNED); @@ -88,6 +89,7 @@ static int memfd_wait_for_pins(struct address_space *mapping) bool clear = true; if (xa_is_value(page)) continue; + page = find_subpage(page, xas.xa_index); if (page_count(page) - page_mapcount(page) != 1) { /* * On the last scan, we clean up all those tags diff --git a/mm/migrate.c b/mm/migrate.c index aa72b49e0209..374ef2fcb722 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -460,7 +460,7 @@ int migrate_page_move_mapping(struct address_space *mapping, for (i = 1; i < HPAGE_PMD_NR; i++) { xas_next(&xas); - xas_store(&xas, newpage + i); + xas_store(&xas, newpage); } } diff --git a/mm/shmem.c b/mm/shmem.c index 15d26c86e5ef..57a6aedf6649 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -631,7 +631,7 @@ static int shmem_add_to_page_cache(struct page *page, if (xas_error(&xas)) goto unlock; next: - xas_store(&xas, page + i); + xas_store(&xas, page); if (++i < nr) { xas_next(&xas); goto next; diff --git a/mm/swap_state.c b/mm/swap_state.c index f844af5f09ba..8e7ce9a9bc5e 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -133,7 +133,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp) for (i = 0; i < nr; i++) { VM_BUG_ON_PAGE(xas.xa_index != idx + i, page); set_page_private(page + i, entry.val + i); - xas_store(&xas, page + i); + xas_store(&xas, page); xas_next(&xas); } address_space->nrpages += nr; @@ -168,7 +168,7 @@ void __delete_from_swap_cache(struct page *page, swp_entry_t entry) for (i = 0; i < nr; i++) { void *entry = xas_store(&xas, NULL); - VM_BUG_ON_PAGE(entry != page + i, entry); + VM_BUG_ON_PAGE(entry != page, entry); set_page_private(page + i, 0); xas_next(&xas); } From 0e4b01df865935007bd712cbc8e7299005b28894 Mon Sep 17 00:00:00 2001 From: Chris Down Date: Mon, 23 Sep 2019 15:34:55 -0700 Subject: [PATCH 546/721] mm, memcg: throttle allocators when failing reclaim over memory.high We're trying to use memory.high to limit workloads, but have found that containment can frequently fail completely and cause OOM situations outside of the cgroup. This happens especially with swap space -- either when none is configured, or swap is full. These failures often also don't have enough warning to allow one to react, whether for a human or for a daemon monitoring PSI. Here is output from a simple program showing how long it takes in usec (column 2) to allocate a megabyte of anonymous memory (column 1) when a cgroup is already beyond its memory high setting, and no swap is available: [root@ktst ~]# systemd-run -p MemoryHigh=100M -p MemorySwapMax=1 \ > --wait -t timeout 300 /root/mdf [...] 95 1035 96 1038 97 1000 98 1036 99 1048 100 1590 101 1968 102 1776 103 1863 104 1757 105 1921 106 1893 107 1760 108 1748 109 1843 110 1716 111 1924 112 1776 113 1831 114 1766 115 1836 116 1588 117 1912 118 1802 119 1857 120 1731 [...] [System OOM in 2-3 seconds] The delay does go up extremely marginally past the 100MB memory.high threshold, as now we spend time scanning before returning to usermode, but it's nowhere near enough to contain growth. It also doesn't get worse the more pages you have, since it only considers nr_pages. The current situation goes against both the expectations of users of memory.high, and our intentions as cgroup v2 developers. In cgroup-v2.txt, we claim that we will throttle and only under "extreme conditions" will memory.high protection be breached. Likewise, cgroup v2 users generally also expect that memory.high should throttle workloads as they exceed their high threshold. However, as seen above, this isn't always how it works in practice -- even on banal setups like those with no swap, or where swap has become exhausted, we can end up with memory.high being breached and us having no weapons left in our arsenal to combat runaway growth with, since reclaim is futile. It's also hard for system monitoring software or users to tell how bad the situation is, as "high" events for the memcg may in some cases be benign, and in others be catastrophic. The current status quo is that we fail containment in a way that doesn't provide any advance warning that things are about to go horribly wrong (for example, we are about to invoke the kernel OOM killer). This patch introduces explicit throttling when reclaim is failing to keep memcg size contained at the memory.high setting. It does so by applying an exponential delay curve derived from the memcg's overage compared to memory.high. In the normal case where the memcg is either below or only marginally over its memory.high setting, no throttling will be performed. This composes well with system health monitoring and remediation, as these allocator delays are factored into PSI's memory pressure calculations. This both creates a mechanism system administrators or applications consuming the PSI interface to trivially see that the memcg in question is struggling and use that to make more reasonable decisions, and permits them enough time to act. Either of these can act with significantly more nuance than that we can provide using the system OOM killer. This is a similar idea to memory.oom_control in cgroup v1 which would put the cgroup to sleep if the threshold was violated, but it's also significantly improved as it results in visible memory pressure, and also doesn't schedule indefinitely, which previously made tracing and other introspection difficult (ie. it's clamped at 2*HZ per allocation through MEMCG_MAX_HIGH_DELAY_JIFFIES). Contrast the previous results with a kernel with this patch: [root@ktst ~]# systemd-run -p MemoryHigh=100M -p MemorySwapMax=1 \ > --wait -t timeout 300 /root/mdf [...] 95 1002 96 1000 97 1002 98 1003 99 1000 100 1043 101 84724 102 330628 103 610511 104 1016265 105 1503969 106 2391692 107 2872061 108 3248003 109 4791904 110 5759832 111 6912509 112 8127818 113 9472203 114 12287622 115 12480079 116 14144008 117 15808029 118 16384500 119 16383242 120 16384979 [...] As you can see, in the normal case, memory allocation takes around 1000 usec. However, as we exceed our memory.high, things start to increase exponentially, but fairly leniently at first. Our first megabyte over memory.high takes us 0.16 seconds, then the next is 0.46 seconds, then the next is almost an entire second. This gets worse until we reach our eventual 2*HZ clamp per batch, resulting in 16 seconds per megabyte. However, this is still making forward progress, so permits tracing or further analysis with programs like GDB. We use an exponential curve for our delay penalty for a few reasons: 1. We run mem_cgroup_handle_over_high to potentially do reclaim after we've already performed allocations, which means that temporarily going over memory.high by a small amount may be perfectly legitimate, even for compliant workloads. We don't want to unduly penalise such cases. 2. An exponential curve (as opposed to a static or linear delay) allows ramping up memory pressure stats more gradually, which can be useful to work out that you have set memory.high too low, without destroying application performance entirely. This patch expands on earlier work by Johannes Weiner. Thanks! [akpm@linux-foundation.org: fix max() warning] [akpm@linux-foundation.org: fix __udivdi3 ref on 32-bit] [akpm@linux-foundation.org: fix it even more] [chris@chrisdown.name: fix 64-bit divide even more] Link: http://lkml.kernel.org/r/20190723180700.GA29459@chrisdown.name Signed-off-by: Chris Down Acked-by: Johannes Weiner Cc: Tejun Heo Cc: Roman Gushchin Cc: Michal Hocko Cc: Nathan Chancellor Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 126 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 125 insertions(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6c6032c03d1d..1b5d0bb008e7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -57,6 +57,7 @@ #include #include #include +#include #include #include "internal.h" #include @@ -2358,12 +2359,68 @@ static void high_work_func(struct work_struct *work) reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL); } +/* + * Clamp the maximum sleep time per allocation batch to 2 seconds. This is + * enough to still cause a significant slowdown in most cases, while still + * allowing diagnostics and tracing to proceed without becoming stuck. + */ +#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ) + +/* + * When calculating the delay, we use these either side of the exponentiation to + * maintain precision and scale to a reasonable number of jiffies (see the table + * below. + * + * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the + * overage ratio to a delay. + * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down down the + * proposed penalty in order to reduce to a reasonable number of jiffies, and + * to produce a reasonable delay curve. + * + * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a + * reasonable delay curve compared to precision-adjusted overage, not + * penalising heavily at first, but still making sure that growth beyond the + * limit penalises misbehaviour cgroups by slowing them down exponentially. For + * example, with a high of 100 megabytes: + * + * +-------+------------------------+ + * | usage | time to allocate in ms | + * +-------+------------------------+ + * | 100M | 0 | + * | 101M | 6 | + * | 102M | 25 | + * | 103M | 57 | + * | 104M | 102 | + * | 105M | 159 | + * | 106M | 230 | + * | 107M | 313 | + * | 108M | 409 | + * | 109M | 518 | + * | 110M | 639 | + * | 111M | 774 | + * | 112M | 921 | + * | 113M | 1081 | + * | 114M | 1254 | + * | 115M | 1439 | + * | 116M | 1638 | + * | 117M | 1849 | + * | 118M | 2000 | + * | 119M | 2000 | + * | 120M | 2000 | + * +-------+------------------------+ + */ + #define MEMCG_DELAY_PRECISION_SHIFT 20 + #define MEMCG_DELAY_SCALING_SHIFT 14 + /* * Scheduled by try_charge() to be executed from the userland return path * and reclaims memory over the high limit. */ void mem_cgroup_handle_over_high(void) { + unsigned long usage, high, clamped_high; + unsigned long pflags; + unsigned long penalty_jiffies, overage; unsigned int nr_pages = current->memcg_nr_pages_over_high; struct mem_cgroup *memcg; @@ -2372,8 +2429,75 @@ void mem_cgroup_handle_over_high(void) memcg = get_mem_cgroup_from_mm(current->mm); reclaim_high(memcg, nr_pages, GFP_KERNEL); - css_put(&memcg->css); current->memcg_nr_pages_over_high = 0; + + /* + * memory.high is breached and reclaim is unable to keep up. Throttle + * allocators proactively to slow down excessive growth. + * + * We use overage compared to memory.high to calculate the number of + * jiffies to sleep (penalty_jiffies). Ideally this value should be + * fairly lenient on small overages, and increasingly harsh when the + * memcg in question makes it clear that it has no intention of stopping + * its crazy behaviour, so we exponentially increase the delay based on + * overage amount. + */ + + usage = page_counter_read(&memcg->memory); + high = READ_ONCE(memcg->high); + + if (usage <= high) + goto out; + + /* + * Prevent division by 0 in overage calculation by acting as if it was a + * threshold of 1 page + */ + clamped_high = max(high, 1UL); + + overage = div_u64((u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT, + clamped_high); + + penalty_jiffies = ((u64)overage * overage * HZ) + >> (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT); + + /* + * Factor in the task's own contribution to the overage, such that four + * N-sized allocations are throttled approximately the same as one + * 4N-sized allocation. + * + * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or + * larger the current charge patch is than that. + */ + penalty_jiffies = penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH; + + /* + * Clamp the max delay per usermode return so as to still keep the + * application moving forwards and also permit diagnostics, albeit + * extremely slowly. + */ + penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES); + + /* + * Don't sleep if the amount of jiffies this memcg owes us is so low + * that it's not even worth doing, in an attempt to be nice to those who + * go only a small amount over their memory.high value and maybe haven't + * been aggressively reclaimed enough yet. + */ + if (penalty_jiffies <= HZ / 100) + goto out; + + /* + * If we exit early, we're guaranteed to die (since + * schedule_timeout_killable sets TASK_KILLABLE). This means we don't + * need to account for any ill-begotten jiffies to pay them off later. + */ + psi_memstall_enter(&pflags); + schedule_timeout_killable(penalty_jiffies); + psi_memstall_leave(&pflags); + +out: + css_put(&memcg->css); } static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, From e1a366be5cb4f849ec4de170d50eebc08bb0af20 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 23 Sep 2019 15:34:58 -0700 Subject: [PATCH 547/721] mm: memcontrol: switch to rcu protection in drain_all_stock() Commit 72f0184c8a00 ("mm, memcg: remove hotplug locking from try_charge") introduced css_tryget()/css_put() calls in drain_all_stock(), which are supposed to protect the target memory cgroup from being released during the mem_cgroup_is_descendant() call. However, it's not completely safe. In theory, memcg can go away between reading stock->cached pointer and calling css_tryget(). This can happen if drain_all_stock() races with drain_local_stock() performed on the remote cpu as a result of a work, scheduled by the previous invocation of drain_all_stock(). The race is a bit theoretical and there are few chances to trigger it, but the current code looks a bit confusing, so it makes sense to fix it anyway. The code looks like as if css_tryget() and css_put() are used to protect stocks drainage. It's not necessary because stocked pages are holding references to the cached cgroup. And it obviously won't work for works, scheduled on other cpus. So, let's read the stock->cached pointer and evaluate the memory cgroup inside a rcu read section, and get rid of css_tryget()/css_put() calls. Link: http://lkml.kernel.org/r/20190802192241.3253165-1-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Michal Hocko Cc: Hillf Danton Cc: Johannes Weiner Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1b5d0bb008e7..80131025f2da 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2271,21 +2271,22 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) for_each_online_cpu(cpu) { struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); struct mem_cgroup *memcg; + bool flush = false; + rcu_read_lock(); memcg = stock->cached; - if (!memcg || !stock->nr_pages || !css_tryget(&memcg->css)) - continue; - if (!mem_cgroup_is_descendant(memcg, root_memcg)) { - css_put(&memcg->css); - continue; - } - if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { + if (memcg && stock->nr_pages && + mem_cgroup_is_descendant(memcg, root_memcg)) + flush = true; + rcu_read_unlock(); + + if (flush && + !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { if (cpu == curcpu) drain_local_stock(&stock->work); else schedule_work_on(cpu, &stock->work); } - css_put(&memcg->css); } put_cpu(); mutex_unlock(&percpu_charge_mutex); From 1ba6fc9af35bf97c84567d9b3eeb26629d1e3af0 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 23 Sep 2019 15:35:01 -0700 Subject: [PATCH 548/721] mm: vmscan: do not share cgroup iteration between reclaimers One of our services observed a high rate of cgroup OOM kills in the presence of large amounts of clean cache. Debugging showed that the culprit is the shared cgroup iteration in page reclaim. Under high allocation concurrency, multiple threads enter reclaim at the same time. Fearing overreclaim when we first switched from the single global LRU to cgrouped LRU lists, we introduced a shared iteration state for reclaim invocations - whether 1 or 20 reclaimers are active concurrently, we only walk the cgroup tree once: the 1st reclaimer reclaims the first cgroup, the second the second one etc. With more reclaimers than cgroups, we start another walk from the top. This sounded reasonable at the time, but the problem is that reclaim concurrency doesn't scale with allocation concurrency. As reclaim concurrency increases, the amount of memory individual reclaimers get to scan gets smaller and smaller. Individual reclaimers may only see one cgroup per cycle, and that may not have much reclaimable memory. We see individual reclaimers declare OOM when there is plenty of reclaimable memory available in cgroups they didn't visit. This patch does away with the shared iterator, and every reclaimer is allowed to scan the full cgroup tree and see all of reclaimable memory, just like it would on a non-cgrouped system. This way, when OOM is declared, we know that the reclaimer actually had a chance. To still maintain fairness in reclaim pressure, disallow cgroup reclaim from bailing out of the tree walk early. Kswapd and regular direct reclaim already don't bail, so it's not clear why limit reclaim would have to, especially since it only walks subtrees to begin with. This change completely eliminates the OOM kills on our service, while showing no signs of overreclaim - no increased scan rates, %sys time, or abrupt free memory spikes. I tested across 100 machines that have 64G of RAM and host about 300 cgroups each. [ It's possible overreclaim never was a *practical* issue to begin with - it was simply a concern we had on the mailing lists at the time, with no real data to back it up. But we have also added more bail-out conditions deeper inside reclaim (e.g. the proportional exit in shrink_node_memcg) since. Regardless, now we have data that suggests full walks are more reliable and scale just fine. ] Link: http://lkml.kernel.org/r/20190812192316.13615-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Roman Gushchin Acked-by: Michal Hocko Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 8e03427cb64f..2ee4d9283738 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2664,10 +2664,6 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) do { struct mem_cgroup *root = sc->target_mem_cgroup; - struct mem_cgroup_reclaim_cookie reclaim = { - .pgdat = pgdat, - .priority = sc->priority, - }; unsigned long node_lru_pages = 0; struct mem_cgroup *memcg; @@ -2676,7 +2672,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) nr_reclaimed = sc->nr_reclaimed; nr_scanned = sc->nr_scanned; - memcg = mem_cgroup_iter(root, NULL, &reclaim); + memcg = mem_cgroup_iter(root, NULL, NULL); do { unsigned long lru_pages; unsigned long reclaimed; @@ -2719,21 +2715,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) sc->nr_scanned - scanned, sc->nr_reclaimed - reclaimed); - /* - * Kswapd have to scan all memory cgroups to fulfill - * the overall scan target for the node. - * - * Limit reclaim, on the other hand, only cares about - * nr_to_reclaim pages to be reclaimed and it will - * retry with decreasing priority if one round over the - * whole hierarchy is not sufficient. - */ - if (!current_is_kswapd() && - sc->nr_reclaimed >= sc->nr_to_reclaim) { - mem_cgroup_iter_break(root, memcg); - break; - } - } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim))); + } while ((memcg = mem_cgroup_iter(root, memcg, NULL))); if (reclaim_state) { sc->nr_reclaimed += reclaim_state->reclaimed_slab; From 2d15eb31b50a1b93927bdb7cc5d9960b90f7c1e4 Mon Sep 17 00:00:00 2001 From: "akpm@linux-foundation.org" Date: Mon, 23 Sep 2019 15:35:04 -0700 Subject: [PATCH 549/721] mm/gup: add make_dirty arg to put_user_pages_dirty_lock() [11~From: John Hubbard Subject: mm/gup: add make_dirty arg to put_user_pages_dirty_lock() Patch series "mm/gup: add make_dirty arg to put_user_pages_dirty_lock()", v3. There are about 50+ patches in my tree [2], and I'll be sending out the remaining ones in a few more groups: * The block/bio related changes (Jerome mostly wrote those, but I've had to move stuff around extensively, and add a little code) * mm/ changes * other subsystem patches * an RFC that shows the current state of the tracking patch set. That can only be applied after all call sites are converted, but it's good to get an early look at it. This is part a tree-wide conversion, as described in fc1d8e7cca2d ("mm: introduce put_user_page*(), placeholder versions"). This patch (of 3): Provide more capable variation of put_user_pages_dirty_lock(), and delete put_user_pages_dirty(). This is based on the following: 1. Lots of call sites become simpler if a bool is passed into put_user_page*(), instead of making the call site choose which put_user_page*() variant to call. 2. Christoph Hellwig's observation that set_page_dirty_lock() is usually correct, and set_page_dirty() is usually a bug, or at least questionable, within a put_user_page*() calling chain. This leads to the following API choices: * put_user_pages_dirty_lock(page, npages, make_dirty) * There is no put_user_pages_dirty(). You have to hand code that, in the rare case that it's required. [jhubbard@nvidia.com: remove unused variable in siw_free_plist()] Link: http://lkml.kernel.org/r/20190729074306.10368-1-jhubbard@nvidia.com Link: http://lkml.kernel.org/r/20190724044537.10458-2-jhubbard@nvidia.com Signed-off-by: John Hubbard Cc: Matthew Wilcox Cc: Jan Kara Cc: Christoph Hellwig Cc: Ira Weiny Cc: Jason Gunthorpe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/infiniband/core/umem.c | 5 +- drivers/infiniband/hw/hfi1/user_pages.c | 5 +- drivers/infiniband/hw/qib/qib_user_pages.c | 5 +- drivers/infiniband/hw/usnic/usnic_uiom.c | 5 +- drivers/infiniband/sw/siw/siw_mem.c | 10 +- include/linux/mm.h | 5 +- mm/gup.c | 115 +++++++++------------ 7 files changed, 58 insertions(+), 92 deletions(-) diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 41f9e268e3fb..24244a2f68cc 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -54,10 +54,7 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) { page = sg_page_iter_page(&sg_iter); - if (umem->writable && dirty) - put_user_pages_dirty_lock(&page, 1); - else - put_user_page(page); + put_user_pages_dirty_lock(&page, 1, umem->writable && dirty); } sg_free_table(&umem->sg_head); diff --git a/drivers/infiniband/hw/hfi1/user_pages.c b/drivers/infiniband/hw/hfi1/user_pages.c index b89a9b9aef7a..469acb961fbd 100644 --- a/drivers/infiniband/hw/hfi1/user_pages.c +++ b/drivers/infiniband/hw/hfi1/user_pages.c @@ -118,10 +118,7 @@ int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned long vaddr, size_t np void hfi1_release_user_pages(struct mm_struct *mm, struct page **p, size_t npages, bool dirty) { - if (dirty) - put_user_pages_dirty_lock(p, npages); - else - put_user_pages(p, npages); + put_user_pages_dirty_lock(p, npages, dirty); if (mm) { /* during close after signal, mm can be NULL */ atomic64_sub(npages, &mm->pinned_vm); diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c index bfbfbb7e0ff4..6bf764e41891 100644 --- a/drivers/infiniband/hw/qib/qib_user_pages.c +++ b/drivers/infiniband/hw/qib/qib_user_pages.c @@ -40,10 +40,7 @@ static void __qib_release_user_pages(struct page **p, size_t num_pages, int dirty) { - if (dirty) - put_user_pages_dirty_lock(p, num_pages); - else - put_user_pages(p, num_pages); + put_user_pages_dirty_lock(p, num_pages, dirty); } /** diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c index 0b0237d41613..62e6ffa9ad78 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom.c +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -75,10 +75,7 @@ static void usnic_uiom_put_pages(struct list_head *chunk_list, int dirty) for_each_sg(chunk->page_list, sg, chunk->nents, i) { page = sg_page(sg); pa = sg_phys(sg); - if (dirty) - put_user_pages_dirty_lock(&page, 1); - else - put_user_page(page); + put_user_pages_dirty_lock(&page, 1, dirty); usnic_dbg("pa: %pa\n", &pa); } kfree(chunk); diff --git a/drivers/infiniband/sw/siw/siw_mem.c b/drivers/infiniband/sw/siw/siw_mem.c index 87a56039f0ef..e99983f07663 100644 --- a/drivers/infiniband/sw/siw/siw_mem.c +++ b/drivers/infiniband/sw/siw/siw_mem.c @@ -63,15 +63,7 @@ struct siw_mem *siw_mem_id2obj(struct siw_device *sdev, int stag_index) static void siw_free_plist(struct siw_page_chunk *chunk, int num_pages, bool dirty) { - struct page **p = chunk->plist; - - while (num_pages--) { - if (!PageDirty(*p) && dirty) - put_user_pages_dirty_lock(p, 1); - else - put_user_page(*p); - p++; - } + put_user_pages_dirty_lock(chunk->plist, num_pages, dirty); } void siw_umem_release(struct siw_umem *umem, bool dirty) diff --git a/include/linux/mm.h b/include/linux/mm.h index 69b7314c8d24..57a9fa34f159 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1075,8 +1075,9 @@ static inline void put_user_page(struct page *page) put_page(page); } -void put_user_pages_dirty(struct page **pages, unsigned long npages); -void put_user_pages_dirty_lock(struct page **pages, unsigned long npages); +void put_user_pages_dirty_lock(struct page **pages, unsigned long npages, + bool make_dirty); + void put_user_pages(struct page **pages, unsigned long npages); #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) diff --git a/mm/gup.c b/mm/gup.c index 84a36d80dd2e..012060efddf1 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -29,85 +29,70 @@ struct follow_page_context { unsigned int page_mask; }; -typedef int (*set_dirty_func_t)(struct page *page); - -static void __put_user_pages_dirty(struct page **pages, - unsigned long npages, - set_dirty_func_t sdf) -{ - unsigned long index; - - for (index = 0; index < npages; index++) { - struct page *page = compound_head(pages[index]); - - /* - * Checking PageDirty at this point may race with - * clear_page_dirty_for_io(), but that's OK. Two key cases: - * - * 1) This code sees the page as already dirty, so it skips - * the call to sdf(). That could happen because - * clear_page_dirty_for_io() called page_mkclean(), - * followed by set_page_dirty(). However, now the page is - * going to get written back, which meets the original - * intention of setting it dirty, so all is well: - * clear_page_dirty_for_io() goes on to call - * TestClearPageDirty(), and write the page back. - * - * 2) This code sees the page as clean, so it calls sdf(). - * The page stays dirty, despite being written back, so it - * gets written back again in the next writeback cycle. - * This is harmless. - */ - if (!PageDirty(page)) - sdf(page); - - put_user_page(page); - } -} - /** - * put_user_pages_dirty() - release and dirty an array of gup-pinned pages - * @pages: array of pages to be marked dirty and released. + * put_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages + * @pages: array of pages to be maybe marked dirty, and definitely released. * @npages: number of pages in the @pages array. + * @make_dirty: whether to mark the pages dirty * * "gup-pinned page" refers to a page that has had one of the get_user_pages() * variants called on that page. * * For each page in the @pages array, make that page (or its head page, if a - * compound page) dirty, if it was previously listed as clean. Then, release - * the page using put_user_page(). + * compound page) dirty, if @make_dirty is true, and if the page was previously + * listed as clean. In any case, releases all pages using put_user_page(), + * possibly via put_user_pages(), for the non-dirty case. * * Please see the put_user_page() documentation for details. * - * set_page_dirty(), which does not lock the page, is used here. - * Therefore, it is the caller's responsibility to ensure that this is - * safe. If not, then put_user_pages_dirty_lock() should be called instead. + * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is + * required, then the caller should a) verify that this is really correct, + * because _lock() is usually required, and b) hand code it: + * set_page_dirty_lock(), put_user_page(). * */ -void put_user_pages_dirty(struct page **pages, unsigned long npages) +void put_user_pages_dirty_lock(struct page **pages, unsigned long npages, + bool make_dirty) { - __put_user_pages_dirty(pages, npages, set_page_dirty); -} -EXPORT_SYMBOL(put_user_pages_dirty); + unsigned long index; -/** - * put_user_pages_dirty_lock() - release and dirty an array of gup-pinned pages - * @pages: array of pages to be marked dirty and released. - * @npages: number of pages in the @pages array. - * - * For each page in the @pages array, make that page (or its head page, if a - * compound page) dirty, if it was previously listed as clean. Then, release - * the page using put_user_page(). - * - * Please see the put_user_page() documentation for details. - * - * This is just like put_user_pages_dirty(), except that it invokes - * set_page_dirty_lock(), instead of set_page_dirty(). - * - */ -void put_user_pages_dirty_lock(struct page **pages, unsigned long npages) -{ - __put_user_pages_dirty(pages, npages, set_page_dirty_lock); + /* + * TODO: this can be optimized for huge pages: if a series of pages is + * physically contiguous and part of the same compound page, then a + * single operation to the head page should suffice. + */ + + if (!make_dirty) { + put_user_pages(pages, npages); + return; + } + + for (index = 0; index < npages; index++) { + struct page *page = compound_head(pages[index]); + /* + * Checking PageDirty at this point may race with + * clear_page_dirty_for_io(), but that's OK. Two key + * cases: + * + * 1) This code sees the page as already dirty, so it + * skips the call to set_page_dirty(). That could happen + * because clear_page_dirty_for_io() called + * page_mkclean(), followed by set_page_dirty(). + * However, now the page is going to get written back, + * which meets the original intention of setting it + * dirty, so all is well: clear_page_dirty_for_io() goes + * on to call TestClearPageDirty(), and write the page + * back. + * + * 2) This code sees the page as clean, so it calls + * set_page_dirty(). The page stays dirty, despite being + * written back, so it gets written back again in the + * next writeback cycle. This is harmless. + */ + if (!PageDirty(page)) + set_page_dirty_lock(page); + put_user_page(page); + } } EXPORT_SYMBOL(put_user_pages_dirty_lock); From 6f553ce498a72d91c1c9361fd49bcf07e17c9703 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Mon, 23 Sep 2019 15:35:07 -0700 Subject: [PATCH 550/721] drivers/gpu/drm/via: convert put_page() to put_user_page*() For pages that were retained via get_user_pages*(), release those pages via the new put_user_page*() routines, instead of via put_page() or release_pages(). This is part a tree-wide conversion, as described in fc1d8e7cca2d ("mm: introduce put_user_page*(), placeholder versions"). Also reverse the order of a comparison, in order to placate checkpatch.pl. Link: http://lkml.kernel.org/r/20190724044537.10458-3-jhubbard@nvidia.com Signed-off-by: John Hubbard Cc: David Airlie Cc: Daniel Vetter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpu/drm/via/via_dmablit.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/via/via_dmablit.c b/drivers/gpu/drm/via/via_dmablit.c index feaa538026a0..3db000aacd26 100644 --- a/drivers/gpu/drm/via/via_dmablit.c +++ b/drivers/gpu/drm/via/via_dmablit.c @@ -174,7 +174,6 @@ via_map_blit_for_device(struct pci_dev *pdev, static void via_free_sg_info(struct pci_dev *pdev, drm_via_sg_info_t *vsg) { - struct page *page; int i; switch (vsg->state) { @@ -189,13 +188,8 @@ via_free_sg_info(struct pci_dev *pdev, drm_via_sg_info_t *vsg) kfree(vsg->desc_pages); /* fall through */ case dr_via_pages_locked: - for (i = 0; i < vsg->num_pages; ++i) { - if (NULL != (page = vsg->pages[i])) { - if (!PageReserved(page) && (DMA_FROM_DEVICE == vsg->direction)) - SetPageDirty(page); - put_page(page); - } - } + put_user_pages_dirty_lock(vsg->pages, vsg->num_pages, + (vsg->direction == DMA_FROM_DEVICE)); /* fall through */ case dr_via_pages_alloc: vfree(vsg->pages); From 1edc97694d0fa95d143e3457be892544e41f794a Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Mon, 23 Sep 2019 15:35:10 -0700 Subject: [PATCH 551/721] net/xdp: convert put_page() to put_user_page*() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For pages that were retained via get_user_pages*(), release those pages via the new put_user_page*() routines, instead of via put_page() or release_pages(). This is part a tree-wide conversion, as described in fc1d8e7cca2d ("mm: introduce put_user_page*(), placeholder versions"). Link: http://lkml.kernel.org/r/20190724044537.10458-4-jhubbard@nvidia.com Signed-off-by: John Hubbard Acked-by: Björn Töpel Cc: Björn Töpel Cc: Magnus Karlsson Cc: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/xdp/xdp_umem.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 947b8ff0227e..bba3104f128f 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -206,14 +206,7 @@ static int xdp_umem_map_pages(struct xdp_umem *umem) static void xdp_umem_unpin_pages(struct xdp_umem *umem) { - unsigned int i; - - for (i = 0; i < umem->npgs; i++) { - struct page *page = umem->pgs[i]; - - set_page_dirty_lock(page); - put_page(page); - } + put_user_pages_dirty_lock(umem->pgs, umem->npgs, true); kfree(umem->pgs); umem->pgs = NULL; From 9da99f20ecf8f81a5446a47fd8de62036c20ae61 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 23 Sep 2019 15:35:13 -0700 Subject: [PATCH 552/721] mm: remove redundant assignment of entry Since ptent will not be changed after previous assignment of entry, it is not necessary to do the assignment again. Link: http://lkml.kernel.org/r/20190708082740.21111-1-richardw.yang@linux.intel.com Signed-off-by: Wei Yang Acked-by: Matthew Wilcox (Oracle) Cc: Will Deacon Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index b1dff75640b7..676020552b32 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1093,7 +1093,6 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, if (unlikely(details)) continue; - entry = pte_to_swp_entry(ptent); if (!non_swap_entry(entry)) rss[MM_SWAPENTS]--; else if (is_migration_entry(entry)) { From 7b167b681013f5715b6e5c4f458e346501464259 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 24 Sep 2019 00:02:24 +0000 Subject: [PATCH 553/721] mm: release the spinlock on zap_pte_range In our testing (camera recording), Miguel and Wei found unmap_page_range() takes above 6ms with preemption disabled easily. When I see that, the reason is it holds page table spinlock during entire 512 page operation in a PMD. 6.2ms is never trivial for user experince if RT task couldn't run in the time because it could make frame drop or glitch audio problem. I had a time to benchmark it via adding some trace_printk hooks between pte_offset_map_lock and pte_unmap_unlock in zap_pte_range. The testing device is 2018 premium mobile device. I can get 2ms delay rather easily to release 2M(ie, 512 pages) when the task runs on little core even though it doesn't have any IPI and LRU lock contention. It's already too heavy. If I remove activate_page, 35-40% overhead of zap_pte_range is gone so most of overhead(about 0.7ms) comes from activate_page via mark_page_accessed. Thus, if there are LRU contention, that 0.7ms could accumulate up to several ms. So this patch adds a check for need_resched() in the loop, and a preemption point if necessary. Link: http://lkml.kernel.org/r/20190731061440.GC155569@google.com Signed-off-by: Minchan Kim Reported-by: Miguel de Dios Reported-by: Wei Wang Reviewed-by: Andrew Morton Cc: Michal Hocko Cc: Johannes Weiner Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 676020552b32..9e6ac954c70b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1026,6 +1026,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, if (pte_none(ptent)) continue; + if (need_resched()) + break; + if (pte_present(ptent)) { struct page *page; @@ -1123,8 +1126,11 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, if (force_flush) { force_flush = 0; tlb_flush_mmu(tlb); - if (addr != end) - goto again; + } + + if (addr != end) { + cond_resched(); + goto again; } return addr; From 13224794cb0832caa403ad583d8605202cabc6bc Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 23 Sep 2019 15:35:19 -0700 Subject: [PATCH 554/721] mm: remove quicklist page table caches Patch series "mm: remove quicklist page table caches". A while ago Nicholas proposed to remove quicklist page table caches [1]. I've rebased his patch on the curren upstream and switched ia64 and sh to use generic versions of PTE allocation. [1] https://lore.kernel.org/linux-mm/20190711030339.20892-1-npiggin@gmail.com This patch (of 3): Remove page table allocator "quicklists". These have been around for a long time, but have not got much traction in the last decade and are only used on ia64 and sh architectures. The numbers in the initial commit look interesting but probably don't apply anymore. If anybody wants to resurrect this it's in the git history, but it's unhelpful to have this code and divergent allocator behaviour for minor archs. Also it might be better to instead make more general improvements to page allocator if this is still so slow. Link: http://lkml.kernel.org/r/1565250728-21721-2-git-send-email-rppt@linux.ibm.com Signed-off-by: Nicholas Piggin Signed-off-by: Mike Rapoport Cc: Tony Luck Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/pgalloc.h | 2 - arch/arc/include/asm/pgalloc.h | 1 - arch/arm/include/asm/pgalloc.h | 2 - arch/arm64/include/asm/pgalloc.h | 2 - arch/csky/include/asm/pgalloc.h | 2 - arch/hexagon/include/asm/pgalloc.h | 2 - arch/ia64/Kconfig | 4 - arch/ia64/include/asm/pgalloc.h | 32 +++----- arch/m68k/include/asm/pgtable_mm.h | 2 - arch/m68k/include/asm/pgtable_no.h | 2 - arch/microblaze/include/asm/pgalloc.h | 89 ++-------------------- arch/microblaze/mm/pgtable.c | 4 - arch/mips/include/asm/pgalloc.h | 2 - arch/nds32/include/asm/pgalloc.h | 2 - arch/nios2/include/asm/pgalloc.h | 2 - arch/openrisc/include/asm/pgalloc.h | 2 - arch/parisc/include/asm/pgalloc.h | 2 - arch/powerpc/include/asm/pgalloc.h | 2 - arch/riscv/include/asm/pgalloc.h | 4 - arch/s390/include/asm/pgtable.h | 1 - arch/sh/include/asm/pgalloc.h | 22 ++---- arch/sh/mm/Kconfig | 3 - arch/sparc/include/asm/pgalloc_32.h | 2 - arch/sparc/include/asm/pgalloc_64.h | 2 - arch/sparc/mm/init_32.c | 1 - arch/um/include/asm/pgalloc.h | 2 - arch/unicore32/include/asm/pgalloc.h | 2 - arch/x86/include/asm/pgtable_32.h | 1 - arch/x86/include/asm/pgtable_64.h | 1 - arch/xtensa/include/asm/tlbflush.h | 3 - fs/proc/meminfo.c | 4 - include/asm-generic/pgalloc.h | 5 -- include/linux/quicklist.h | 94 ----------------------- kernel/sched/idle.c | 1 - lib/show_mem.c | 5 -- mm/Kconfig | 5 -- mm/Makefile | 1 - mm/mmu_gather.c | 2 - mm/quicklist.c | 103 -------------------------- 39 files changed, 25 insertions(+), 395 deletions(-) delete mode 100644 include/linux/quicklist.h delete mode 100644 mm/quicklist.c diff --git a/arch/alpha/include/asm/pgalloc.h b/arch/alpha/include/asm/pgalloc.h index 71ded3b7d82d..eb91f1e85629 100644 --- a/arch/alpha/include/asm/pgalloc.h +++ b/arch/alpha/include/asm/pgalloc.h @@ -53,6 +53,4 @@ pmd_free(struct mm_struct *mm, pmd_t *pmd) free_page((unsigned long)pmd); } -#define check_pgt_cache() do { } while (0) - #endif /* _ALPHA_PGALLOC_H */ diff --git a/arch/arc/include/asm/pgalloc.h b/arch/arc/include/asm/pgalloc.h index 9bdb8ed5b0db..4751f2251cd9 100644 --- a/arch/arc/include/asm/pgalloc.h +++ b/arch/arc/include/asm/pgalloc.h @@ -129,7 +129,6 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t ptep) #define __pte_free_tlb(tlb, pte, addr) pte_free((tlb)->mm, pte) -#define check_pgt_cache() do { } while (0) #define pmd_pgtable(pmd) ((pgtable_t) pmd_page_vaddr(pmd)) #endif /* _ASM_ARC_PGALLOC_H */ diff --git a/arch/arm/include/asm/pgalloc.h b/arch/arm/include/asm/pgalloc.h index a2a68b751971..069da393110c 100644 --- a/arch/arm/include/asm/pgalloc.h +++ b/arch/arm/include/asm/pgalloc.h @@ -15,8 +15,6 @@ #include #include -#define check_pgt_cache() do { } while (0) - #ifdef CONFIG_MMU #define _PAGE_USER_TABLE (PMD_TYPE_TABLE | PMD_BIT4 | PMD_DOMAIN(DOMAIN_USER)) diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h index 14d0bc44d451..172d76fa0245 100644 --- a/arch/arm64/include/asm/pgalloc.h +++ b/arch/arm64/include/asm/pgalloc.h @@ -15,8 +15,6 @@ #include /* for pte_{alloc,free}_one */ -#define check_pgt_cache() do { } while (0) - #define PGD_SIZE (PTRS_PER_PGD * sizeof(pgd_t)) #if CONFIG_PGTABLE_LEVELS > 2 diff --git a/arch/csky/include/asm/pgalloc.h b/arch/csky/include/asm/pgalloc.h index 98c5716708d6..d089113fe41f 100644 --- a/arch/csky/include/asm/pgalloc.h +++ b/arch/csky/include/asm/pgalloc.h @@ -75,8 +75,6 @@ do { \ tlb_remove_page(tlb, pte); \ } while (0) -#define check_pgt_cache() do {} while (0) - extern void pagetable_init(void); extern void pre_mmu_init(void); extern void pre_trap_init(void); diff --git a/arch/hexagon/include/asm/pgalloc.h b/arch/hexagon/include/asm/pgalloc.h index d6544dc71258..5a6e79e7926d 100644 --- a/arch/hexagon/include/asm/pgalloc.h +++ b/arch/hexagon/include/asm/pgalloc.h @@ -13,8 +13,6 @@ #include /* for pte_{alloc,free}_one */ -#define check_pgt_cache() do {} while (0) - extern unsigned long long kmap_generation; /* diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 685a3df126ca..16714477eef4 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -72,10 +72,6 @@ config 64BIT config ZONE_DMA32 def_bool y -config QUICKLIST - bool - default y - config MMU bool default y diff --git a/arch/ia64/include/asm/pgalloc.h b/arch/ia64/include/asm/pgalloc.h index c9e481023c25..b03d993f5d7e 100644 --- a/arch/ia64/include/asm/pgalloc.h +++ b/arch/ia64/include/asm/pgalloc.h @@ -19,18 +19,17 @@ #include #include #include -#include #include static inline pgd_t *pgd_alloc(struct mm_struct *mm) { - return quicklist_alloc(0, GFP_KERNEL, NULL); + return (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); } static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { - quicklist_free(0, NULL, pgd); + free_page((unsigned long)pgd); } #if CONFIG_PGTABLE_LEVELS == 4 @@ -42,12 +41,12 @@ pgd_populate(struct mm_struct *mm, pgd_t * pgd_entry, pud_t * pud) static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) { - return quicklist_alloc(0, GFP_KERNEL, NULL); + return (pud_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); } static inline void pud_free(struct mm_struct *mm, pud_t *pud) { - quicklist_free(0, NULL, pud); + free_page((unsigned long)pud); } #define __pud_free_tlb(tlb, pud, address) pud_free((tlb)->mm, pud) #endif /* CONFIG_PGTABLE_LEVELS == 4 */ @@ -60,12 +59,12 @@ pud_populate(struct mm_struct *mm, pud_t * pud_entry, pmd_t * pmd) static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { - return quicklist_alloc(0, GFP_KERNEL, NULL); + return (pmd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); } static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) { - quicklist_free(0, NULL, pmd); + free_page((unsigned long)pmd); } #define __pmd_free_tlb(tlb, pmd, address) pmd_free((tlb)->mm, pmd) @@ -86,14 +85,12 @@ pmd_populate_kernel(struct mm_struct *mm, pmd_t * pmd_entry, pte_t * pte) static inline pgtable_t pte_alloc_one(struct mm_struct *mm) { struct page *page; - void *pg; - pg = quicklist_alloc(0, GFP_KERNEL, NULL); - if (!pg) + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) return NULL; - page = virt_to_page(pg); if (!pgtable_page_ctor(page)) { - quicklist_free(0, NULL, pg); + __free_page(page); return NULL; } return page; @@ -101,23 +98,18 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm) static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { - return quicklist_alloc(0, GFP_KERNEL, NULL); + return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); } static inline void pte_free(struct mm_struct *mm, pgtable_t pte) { pgtable_page_dtor(pte); - quicklist_free_page(0, NULL, pte); + __free_page(pte); } static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { - quicklist_free(0, NULL, pte); -} - -static inline void check_pgt_cache(void) -{ - quicklist_trim(0, NULL, 25, 16); + free_page((unsigned long)pte); } #define __pte_free_tlb(tlb, pte, address) pte_free((tlb)->mm, pte) diff --git a/arch/m68k/include/asm/pgtable_mm.h b/arch/m68k/include/asm/pgtable_mm.h index fde4534b974f..cc476c1d72e5 100644 --- a/arch/m68k/include/asm/pgtable_mm.h +++ b/arch/m68k/include/asm/pgtable_mm.h @@ -181,6 +181,4 @@ pgprot_t pgprot_dmacoherent(pgprot_t prot); */ #define pgtable_cache_init() do { } while (0) -#define check_pgt_cache() do { } while (0) - #endif /* _M68K_PGTABLE_H */ diff --git a/arch/m68k/include/asm/pgtable_no.h b/arch/m68k/include/asm/pgtable_no.h index fc3a96c77bd8..69e271101223 100644 --- a/arch/m68k/include/asm/pgtable_no.h +++ b/arch/m68k/include/asm/pgtable_no.h @@ -60,6 +60,4 @@ extern void paging_init(void); #include -#define check_pgt_cache() do { } while (0) - #endif /* _M68KNOMMU_PGTABLE_H */ diff --git a/arch/microblaze/include/asm/pgalloc.h b/arch/microblaze/include/asm/pgalloc.h index f4cc9ffc449e..ac0731881751 100644 --- a/arch/microblaze/include/asm/pgalloc.h +++ b/arch/microblaze/include/asm/pgalloc.h @@ -21,83 +21,20 @@ #include #include -#define PGDIR_ORDER 0 - -/* - * This is handled very differently on MicroBlaze since out page tables - * are all 0's and I want to be able to use these zero'd pages elsewhere - * as well - it gives us quite a speedup. - * -- Cort - */ -extern struct pgtable_cache_struct { - unsigned long *pgd_cache; - unsigned long *pte_cache; - unsigned long pgtable_cache_sz; -} quicklists; - -#define pgd_quicklist (quicklists.pgd_cache) -#define pmd_quicklist ((unsigned long *)0) -#define pte_quicklist (quicklists.pte_cache) -#define pgtable_cache_size (quicklists.pgtable_cache_sz) - -extern unsigned long *zero_cache; /* head linked list of pre-zero'd pages */ -extern atomic_t zero_sz; /* # currently pre-zero'd pages */ -extern atomic_t zeropage_hits; /* # zero'd pages request that we've done */ -extern atomic_t zeropage_calls; /* # zero'd pages request that've been made */ -extern atomic_t zerototal; /* # pages zero'd over time */ - -#define zero_quicklist (zero_cache) -#define zero_cache_sz (zero_sz) -#define zero_cache_calls (zeropage_calls) -#define zero_cache_hits (zeropage_hits) -#define zero_cache_total (zerototal) - -/* - * return a pre-zero'd page from the list, - * return NULL if none available -- Cort - */ -extern unsigned long get_zero_page_fast(void); - extern void __bad_pte(pmd_t *pmd); -static inline pgd_t *get_pgd_slow(void) +static inline pgd_t *get_pgd(void) { - pgd_t *ret; - - ret = (pgd_t *)__get_free_pages(GFP_KERNEL, PGDIR_ORDER); - if (ret != NULL) - clear_page(ret); - return ret; + return (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, 0); } -static inline pgd_t *get_pgd_fast(void) -{ - unsigned long *ret; - - ret = pgd_quicklist; - if (ret != NULL) { - pgd_quicklist = (unsigned long *)(*ret); - ret[0] = 0; - pgtable_cache_size--; - } else - ret = (unsigned long *)get_pgd_slow(); - return (pgd_t *)ret; -} - -static inline void free_pgd_fast(pgd_t *pgd) -{ - *(unsigned long **)pgd = pgd_quicklist; - pgd_quicklist = (unsigned long *) pgd; - pgtable_cache_size++; -} - -static inline void free_pgd_slow(pgd_t *pgd) +static inline void free_pgd(pgd_t *pgd) { free_page((unsigned long)pgd); } -#define pgd_free(mm, pgd) free_pgd_fast(pgd) -#define pgd_alloc(mm) get_pgd_fast() +#define pgd_free(mm, pgd) free_pgd(pgd) +#define pgd_alloc(mm) get_pgd() #define pmd_pgtable(pmd) pmd_page(pmd) @@ -115,15 +52,14 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm) struct page *ptepage; #ifdef CONFIG_HIGHPTE - int flags = GFP_KERNEL | __GFP_HIGHMEM; + int flags = GFP_KERNEL | __GFP_ZERO | __GFP_HIGHMEM; #else - int flags = GFP_KERNEL; + int flags = GFP_KERNEL | __GFP_ZERO; #endif ptepage = alloc_pages(flags, 0); if (!ptepage) return NULL; - clear_highpage(ptepage); if (!pgtable_page_ctor(ptepage)) { __free_page(ptepage); return NULL; @@ -131,13 +67,6 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm) return ptepage; } -static inline void pte_free_fast(pte_t *pte) -{ - *(unsigned long **)pte = pte_quicklist; - pte_quicklist = (unsigned long *) pte; - pgtable_cache_size++; -} - static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { free_page((unsigned long)pte); @@ -171,10 +100,6 @@ static inline void pte_free(struct mm_struct *mm, struct page *ptepage) #define __pmd_free_tlb(tlb, x, addr) pmd_free((tlb)->mm, x) #define pgd_populate(mm, pmd, pte) BUG() -extern int do_check_pgt_cache(int, int); - #endif /* CONFIG_MMU */ -#define check_pgt_cache() do { } while (0) - #endif /* _ASM_MICROBLAZE_PGALLOC_H */ diff --git a/arch/microblaze/mm/pgtable.c b/arch/microblaze/mm/pgtable.c index 8fe54fda31dc..010bb9cee2e4 100644 --- a/arch/microblaze/mm/pgtable.c +++ b/arch/microblaze/mm/pgtable.c @@ -44,10 +44,6 @@ unsigned long ioremap_base; unsigned long ioremap_bot; EXPORT_SYMBOL(ioremap_bot); -#ifndef CONFIG_SMP -struct pgtable_cache_struct quicklists; -#endif - static void __iomem *__ioremap(phys_addr_t addr, unsigned long size, unsigned long flags) { diff --git a/arch/mips/include/asm/pgalloc.h b/arch/mips/include/asm/pgalloc.h index aa16b85ddffc..aa73cb187a07 100644 --- a/arch/mips/include/asm/pgalloc.h +++ b/arch/mips/include/asm/pgalloc.h @@ -105,8 +105,6 @@ static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) #endif /* __PAGETABLE_PUD_FOLDED */ -#define check_pgt_cache() do { } while (0) - extern void pagetable_init(void); #endif /* _ASM_PGALLOC_H */ diff --git a/arch/nds32/include/asm/pgalloc.h b/arch/nds32/include/asm/pgalloc.h index e78b43d8389f..37125e6884d7 100644 --- a/arch/nds32/include/asm/pgalloc.h +++ b/arch/nds32/include/asm/pgalloc.h @@ -23,8 +23,6 @@ extern pgd_t *pgd_alloc(struct mm_struct *mm); extern void pgd_free(struct mm_struct *mm, pgd_t * pgd); -#define check_pgt_cache() do { } while (0) - static inline pgtable_t pte_alloc_one(struct mm_struct *mm) { pgtable_t pte; diff --git a/arch/nios2/include/asm/pgalloc.h b/arch/nios2/include/asm/pgalloc.h index 4bc8cf72067e..750d18d5980b 100644 --- a/arch/nios2/include/asm/pgalloc.h +++ b/arch/nios2/include/asm/pgalloc.h @@ -45,6 +45,4 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) tlb_remove_page((tlb), (pte)); \ } while (0) -#define check_pgt_cache() do { } while (0) - #endif /* _ASM_NIOS2_PGALLOC_H */ diff --git a/arch/openrisc/include/asm/pgalloc.h b/arch/openrisc/include/asm/pgalloc.h index 3d4b397c2d06..787c1b9d2f6d 100644 --- a/arch/openrisc/include/asm/pgalloc.h +++ b/arch/openrisc/include/asm/pgalloc.h @@ -101,6 +101,4 @@ do { \ #define pmd_pgtable(pmd) pmd_page(pmd) -#define check_pgt_cache() do { } while (0) - #endif diff --git a/arch/parisc/include/asm/pgalloc.h b/arch/parisc/include/asm/pgalloc.h index 4f2059a50fae..d98647c29b74 100644 --- a/arch/parisc/include/asm/pgalloc.h +++ b/arch/parisc/include/asm/pgalloc.h @@ -124,6 +124,4 @@ pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) pmd_populate_kernel(mm, pmd, page_address(pte_page)) #define pmd_pgtable(pmd) pmd_page(pmd) -#define check_pgt_cache() do { } while (0) - #endif diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h index 2b2c60a1a66d..6dd78a2dc03a 100644 --- a/arch/powerpc/include/asm/pgalloc.h +++ b/arch/powerpc/include/asm/pgalloc.h @@ -64,8 +64,6 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage) extern struct kmem_cache *pgtable_cache[]; #define PGT_CACHE(shift) pgtable_cache[shift] -static inline void check_pgt_cache(void) { } - #ifdef CONFIG_PPC_BOOK3S #include #else diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h index 56a67d66f72f..f66a00d8cb19 100644 --- a/arch/riscv/include/asm/pgalloc.h +++ b/arch/riscv/include/asm/pgalloc.h @@ -82,8 +82,4 @@ do { \ tlb_remove_page((tlb), pte); \ } while (0) -static inline void check_pgt_cache(void) -{ -} - #endif /* _ASM_RISCV_PGALLOC_H */ diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 0c4600725fc2..8f59454ac407 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1686,7 +1686,6 @@ extern void s390_reset_cmma(struct mm_struct *mm); * No page table caches to initialise */ static inline void pgtable_cache_init(void) { } -static inline void check_pgt_cache(void) { } #include diff --git a/arch/sh/include/asm/pgalloc.h b/arch/sh/include/asm/pgalloc.h index b56f908b1395..9e15054858b4 100644 --- a/arch/sh/include/asm/pgalloc.h +++ b/arch/sh/include/asm/pgalloc.h @@ -2,11 +2,8 @@ #ifndef __ASM_SH_PGALLOC_H #define __ASM_SH_PGALLOC_H -#include #include -#define QUICK_PT 0 /* Other page table pages that are zero on free */ - extern pgd_t *pgd_alloc(struct mm_struct *); extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); @@ -34,20 +31,18 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, */ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { - return quicklist_alloc(QUICK_PT, GFP_KERNEL, NULL); + return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); } static inline pgtable_t pte_alloc_one(struct mm_struct *mm) { struct page *page; - void *pg; - pg = quicklist_alloc(QUICK_PT, GFP_KERNEL, NULL); - if (!pg) + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) return NULL; - page = virt_to_page(pg); if (!pgtable_page_ctor(page)) { - quicklist_free(QUICK_PT, NULL, pg); + __free_page(page); return NULL; } return page; @@ -55,13 +50,13 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm) static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { - quicklist_free(QUICK_PT, NULL, pte); + free_page((unsigned long)pte); } static inline void pte_free(struct mm_struct *mm, pgtable_t pte) { pgtable_page_dtor(pte); - quicklist_free_page(QUICK_PT, NULL, pte); + __free_page(pte); } #define __pte_free_tlb(tlb,pte,addr) \ @@ -79,9 +74,4 @@ do { \ } while (0); #endif -static inline void check_pgt_cache(void) -{ - quicklist_trim(QUICK_PT, NULL, 25, 16); -} - #endif /* __ASM_SH_PGALLOC_H */ diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig index 02ed2df25a54..5c8a2ebfc720 100644 --- a/arch/sh/mm/Kconfig +++ b/arch/sh/mm/Kconfig @@ -1,9 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 menu "Memory management options" -config QUICKLIST - def_bool y - config MMU bool "Support for memory management hardware" depends on !CPU_SH2 diff --git a/arch/sparc/include/asm/pgalloc_32.h b/arch/sparc/include/asm/pgalloc_32.h index 282be50a4adf..10538a4d1a1e 100644 --- a/arch/sparc/include/asm/pgalloc_32.h +++ b/arch/sparc/include/asm/pgalloc_32.h @@ -17,8 +17,6 @@ void srmmu_free_nocache(void *addr, int size); extern struct resource sparc_iomap; -#define check_pgt_cache() do { } while (0) - pgd_t *get_pgd_fast(void); static inline void free_pgd_fast(pgd_t *pgd) { diff --git a/arch/sparc/include/asm/pgalloc_64.h b/arch/sparc/include/asm/pgalloc_64.h index 48abccba4991..9d3e5cc95bbb 100644 --- a/arch/sparc/include/asm/pgalloc_64.h +++ b/arch/sparc/include/asm/pgalloc_64.h @@ -69,8 +69,6 @@ void pte_free(struct mm_struct *mm, pgtable_t ptepage); #define pmd_populate(MM, PMD, PTE) pmd_set(MM, PMD, PTE) #define pmd_pgtable(PMD) ((pte_t *)__pmd_page(PMD)) -#define check_pgt_cache() do { } while (0) - void pgtable_free(void *table, bool is_page); #ifdef CONFIG_SMP diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c index 046ab116cc8c..906eda1158b4 100644 --- a/arch/sparc/mm/init_32.c +++ b/arch/sparc/mm/init_32.c @@ -31,7 +31,6 @@ #include #include #include -#include /* bug in asm-generic/tlb.h: check_pgt_cache */ #include #include #include diff --git a/arch/um/include/asm/pgalloc.h b/arch/um/include/asm/pgalloc.h index 023599c3fa51..446e0c0f4018 100644 --- a/arch/um/include/asm/pgalloc.h +++ b/arch/um/include/asm/pgalloc.h @@ -43,7 +43,5 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) #define __pmd_free_tlb(tlb,x, address) tlb_remove_page((tlb),virt_to_page(x)) #endif -#define check_pgt_cache() do { } while (0) - #endif diff --git a/arch/unicore32/include/asm/pgalloc.h b/arch/unicore32/include/asm/pgalloc.h index 3f0903bd98e9..ba1c9a79993b 100644 --- a/arch/unicore32/include/asm/pgalloc.h +++ b/arch/unicore32/include/asm/pgalloc.h @@ -18,8 +18,6 @@ #define __HAVE_ARCH_PTE_ALLOC_ONE #include -#define check_pgt_cache() do { } while (0) - #define _PAGE_USER_TABLE (PMD_TYPE_TABLE | PMD_PRESENT) #define _PAGE_KERNEL_TABLE (PMD_TYPE_TABLE | PMD_PRESENT) diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index c78da8eda8f2..b9b9f8aa963e 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -30,7 +30,6 @@ extern pgd_t initial_page_table[1024]; extern pmd_t initial_pg_pmd[]; static inline void pgtable_cache_init(void) { } -static inline void check_pgt_cache(void) { } void paging_init(void); void sync_initial_page_table(void); diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 4990d26dfc73..a26d2d58b9c9 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -242,7 +242,6 @@ extern void cleanup_highmap(void); #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN #define pgtable_cache_init() do { } while (0) -#define check_pgt_cache() do { } while (0) #define PAGE_AGP PAGE_KERNEL_NOCACHE #define HAVE_PAGE_AGP 1 diff --git a/arch/xtensa/include/asm/tlbflush.h b/arch/xtensa/include/asm/tlbflush.h index 06875feb27c2..856e2da2e397 100644 --- a/arch/xtensa/include/asm/tlbflush.h +++ b/arch/xtensa/include/asm/tlbflush.h @@ -160,9 +160,6 @@ static inline void invalidate_dtlb_mapping (unsigned address) invalidate_dtlb_entry(tlb_entry); } -#define check_pgt_cache() do { } while (0) - - /* * DO NOT USE THESE FUNCTIONS. These instructions aren't part of the Xtensa * ISA and exist only for test purposes.. diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 465ea0153b2a..4bd80e68eb03 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -8,7 +8,6 @@ #include #include #include -#include #include #include #include @@ -106,9 +105,6 @@ static int meminfo_proc_show(struct seq_file *m, void *v) global_zone_page_state(NR_KERNEL_STACK_KB)); show_val_kb(m, "PageTables: ", global_zone_page_state(NR_PAGETABLE)); -#ifdef CONFIG_QUICKLIST - show_val_kb(m, "Quicklists: ", quicklist_total_size()); -#endif show_val_kb(m, "NFS_Unstable: ", global_node_page_state(NR_UNSTABLE_NFS)); diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h index 8476175c07e7..6f8cc06ee44e 100644 --- a/include/asm-generic/pgalloc.h +++ b/include/asm-generic/pgalloc.h @@ -102,11 +102,6 @@ static inline void pte_free(struct mm_struct *mm, struct page *pte_page) __free_page(pte_page); } -#else /* CONFIG_MMU */ - -/* This is enough for a nommu architecture */ -#define check_pgt_cache() do { } while (0) - #endif /* CONFIG_MMU */ #endif /* __ASM_GENERIC_PGALLOC_H */ diff --git a/include/linux/quicklist.h b/include/linux/quicklist.h deleted file mode 100644 index 034982c98c8b..000000000000 --- a/include/linux/quicklist.h +++ /dev/null @@ -1,94 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef LINUX_QUICKLIST_H -#define LINUX_QUICKLIST_H -/* - * Fast allocations and disposal of pages. Pages must be in the condition - * as needed after allocation when they are freed. Per cpu lists of pages - * are kept that only contain node local pages. - * - * (C) 2007, SGI. Christoph Lameter - */ -#include -#include -#include - -#ifdef CONFIG_QUICKLIST - -struct quicklist { - void *page; - int nr_pages; -}; - -DECLARE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK]; - -/* - * The two key functions quicklist_alloc and quicklist_free are inline so - * that they may be custom compiled for the platform. - * Specifying a NULL ctor can remove constructor support. Specifying - * a constant quicklist allows the determination of the exact address - * in the per cpu area. - * - * The fast patch in quicklist_alloc touched only a per cpu cacheline and - * the first cacheline of the page itself. There is minmal overhead involved. - */ -static inline void *quicklist_alloc(int nr, gfp_t flags, void (*ctor)(void *)) -{ - struct quicklist *q; - void **p = NULL; - - q =&get_cpu_var(quicklist)[nr]; - p = q->page; - if (likely(p)) { - q->page = p[0]; - p[0] = NULL; - q->nr_pages--; - } - put_cpu_var(quicklist); - if (likely(p)) - return p; - - p = (void *)__get_free_page(flags | __GFP_ZERO); - if (ctor && p) - ctor(p); - return p; -} - -static inline void __quicklist_free(int nr, void (*dtor)(void *), void *p, - struct page *page) -{ - struct quicklist *q; - - q = &get_cpu_var(quicklist)[nr]; - *(void **)p = q->page; - q->page = p; - q->nr_pages++; - put_cpu_var(quicklist); -} - -static inline void quicklist_free(int nr, void (*dtor)(void *), void *pp) -{ - __quicklist_free(nr, dtor, pp, virt_to_page(pp)); -} - -static inline void quicklist_free_page(int nr, void (*dtor)(void *), - struct page *page) -{ - __quicklist_free(nr, dtor, page_address(page), page); -} - -void quicklist_trim(int nr, void (*dtor)(void *), - unsigned long min_pages, unsigned long max_free); - -unsigned long quicklist_total_size(void); - -#else - -static inline unsigned long quicklist_total_size(void) -{ - return 0; -} - -#endif - -#endif /* LINUX_QUICKLIST_H */ - diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index c892c6280c9f..8dad5aa600ea 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -238,7 +238,6 @@ static void do_idle(void) tick_nohz_idle_enter(); while (!need_resched()) { - check_pgt_cache(); rmb(); local_irq_disable(); diff --git a/lib/show_mem.c b/lib/show_mem.c index 5c86ef4c899f..1c26c14ffbb9 100644 --- a/lib/show_mem.c +++ b/lib/show_mem.c @@ -6,7 +6,6 @@ */ #include -#include #include void show_mem(unsigned int filter, nodemask_t *nodemask) @@ -39,10 +38,6 @@ void show_mem(unsigned int filter, nodemask_t *nodemask) #ifdef CONFIG_CMA printk("%lu pages cma reserved\n", totalcma_pages); #endif -#ifdef CONFIG_QUICKLIST - printk("%lu pages in pagetable cache\n", - quicklist_total_size()); -#endif #ifdef CONFIG_MEMORY_FAILURE printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages)); #endif diff --git a/mm/Kconfig b/mm/Kconfig index 2fe4902ad755..f88be1cbcfb2 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -273,11 +273,6 @@ config BOUNCE by default when ZONE_DMA or HIGHMEM is selected, but you may say n to override this. -config NR_QUICK - int - depends on QUICKLIST - default "1" - config VIRT_TO_BUS bool help diff --git a/mm/Makefile b/mm/Makefile index d0b295c3b764..d11de59e9c3c 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -72,7 +72,6 @@ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_MEMTEST) += memtest.o obj-$(CONFIG_MIGRATION) += migrate.o -obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o obj-$(CONFIG_PAGE_COUNTER) += page_counter.o obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 8c943a6e1696..7d70e5c78f97 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -271,8 +271,6 @@ void tlb_finish_mmu(struct mmu_gather *tlb, tlb_flush_mmu(tlb); - /* keep the page table cache within bounds */ - check_pgt_cache(); #ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER tlb_batch_list_free(tlb); #endif diff --git a/mm/quicklist.c b/mm/quicklist.c deleted file mode 100644 index 5e98ac78e410..000000000000 --- a/mm/quicklist.c +++ /dev/null @@ -1,103 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Quicklist support. - * - * Quicklists are light weight lists of pages that have a defined state - * on alloc and free. Pages must be in the quicklist specific defined state - * (zero by default) when the page is freed. It seems that the initial idea - * for such lists first came from Dave Miller and then various other people - * improved on it. - * - * Copyright (C) 2007 SGI, - * Christoph Lameter - * Generalized, added support for multiple lists and - * constructors / destructors. - */ -#include - -#include -#include -#include -#include - -DEFINE_PER_CPU(struct quicklist [CONFIG_NR_QUICK], quicklist); - -#define FRACTION_OF_NODE_MEM 16 - -static unsigned long max_pages(unsigned long min_pages) -{ - unsigned long node_free_pages, max; - int node = numa_node_id(); - struct zone *zones = NODE_DATA(node)->node_zones; - int num_cpus_on_node; - - node_free_pages = -#ifdef CONFIG_ZONE_DMA - zone_page_state(&zones[ZONE_DMA], NR_FREE_PAGES) + -#endif -#ifdef CONFIG_ZONE_DMA32 - zone_page_state(&zones[ZONE_DMA32], NR_FREE_PAGES) + -#endif - zone_page_state(&zones[ZONE_NORMAL], NR_FREE_PAGES); - - max = node_free_pages / FRACTION_OF_NODE_MEM; - - num_cpus_on_node = cpumask_weight(cpumask_of_node(node)); - max /= num_cpus_on_node; - - return max(max, min_pages); -} - -static long min_pages_to_free(struct quicklist *q, - unsigned long min_pages, long max_free) -{ - long pages_to_free; - - pages_to_free = q->nr_pages - max_pages(min_pages); - - return min(pages_to_free, max_free); -} - -/* - * Trim down the number of pages in the quicklist - */ -void quicklist_trim(int nr, void (*dtor)(void *), - unsigned long min_pages, unsigned long max_free) -{ - long pages_to_free; - struct quicklist *q; - - q = &get_cpu_var(quicklist)[nr]; - if (q->nr_pages > min_pages) { - pages_to_free = min_pages_to_free(q, min_pages, max_free); - - while (pages_to_free > 0) { - /* - * We pass a gfp_t of 0 to quicklist_alloc here - * because we will never call into the page allocator. - */ - void *p = quicklist_alloc(nr, 0, NULL); - - if (dtor) - dtor(p); - free_page((unsigned long)p); - pages_to_free--; - } - } - put_cpu_var(quicklist); -} - -unsigned long quicklist_total_size(void) -{ - unsigned long count = 0; - int cpu; - struct quicklist *ql, *q; - - for_each_online_cpu(cpu) { - ql = per_cpu(quicklist, cpu); - for (q = ql; q < ql + CONFIG_NR_QUICK; q++) - count += q->nr_pages; - } - return count; -} - From 013199211c8bfe9cad9312f5083f0d5a576cf74a Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 23 Sep 2019 15:35:22 -0700 Subject: [PATCH 555/721] ia64: switch to generic version of pte allocation The ia64 implementation pte_alloc_one(), pte_alloc_one_kernel(), pte_free_kernel() and pte_free() is identical to the generic except of lack of __GFP_ACCOUNT for the user PTEs allocation. Switch ia64 to use generic version of these functions. Link: http://lkml.kernel.org/r/1565250728-21721-3-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/include/asm/pgalloc.h | 32 ++------------------------------ 1 file changed, 2 insertions(+), 30 deletions(-) diff --git a/arch/ia64/include/asm/pgalloc.h b/arch/ia64/include/asm/pgalloc.h index b03d993f5d7e..f4c491044882 100644 --- a/arch/ia64/include/asm/pgalloc.h +++ b/arch/ia64/include/asm/pgalloc.h @@ -20,6 +20,8 @@ #include #include +#include + #include static inline pgd_t *pgd_alloc(struct mm_struct *mm) @@ -82,36 +84,6 @@ pmd_populate_kernel(struct mm_struct *mm, pmd_t * pmd_entry, pte_t * pte) pmd_val(*pmd_entry) = __pa(pte); } -static inline pgtable_t pte_alloc_one(struct mm_struct *mm) -{ - struct page *page; - - page = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!page) - return NULL; - if (!pgtable_page_ctor(page)) { - __free_page(page); - return NULL; - } - return page; -} - -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) -{ - return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); -} - -static inline void pte_free(struct mm_struct *mm, pgtable_t pte) -{ - pgtable_page_dtor(pte); - __free_page(pte); -} - -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) -{ - free_page((unsigned long)pte); -} - #define __pte_free_tlb(tlb, pte, address) pte_free((tlb)->mm, pte) #endif /* _ASM_IA64_PGALLOC_H */ From 6fb12766f7fcd7934b09fbfbd32a725cc2febf96 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 23 Sep 2019 15:35:25 -0700 Subject: [PATCH 556/721] sh: switch to generic version of pte allocation The sh implementation pte_alloc_one(), pte_alloc_one_kernel(), pte_free_kernel() and pte_free() is identical to the generic except of lack of __GFP_ACCOUNT for the user PTEs allocation. Switch sh to use generic version of these functions. Link: http://lkml.kernel.org/r/1565250728-21721-4-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/include/asm/pgalloc.h | 34 +--------------------------------- 1 file changed, 1 insertion(+), 33 deletions(-) diff --git a/arch/sh/include/asm/pgalloc.h b/arch/sh/include/asm/pgalloc.h index 9e15054858b4..8c6341a4d807 100644 --- a/arch/sh/include/asm/pgalloc.h +++ b/arch/sh/include/asm/pgalloc.h @@ -3,6 +3,7 @@ #define __ASM_SH_PGALLOC_H #include +#include extern pgd_t *pgd_alloc(struct mm_struct *); extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); @@ -26,39 +27,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, } #define pmd_pgtable(pmd) pmd_page(pmd) -/* - * Allocate and free page tables. - */ -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) -{ - return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); -} - -static inline pgtable_t pte_alloc_one(struct mm_struct *mm) -{ - struct page *page; - - page = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!page) - return NULL; - if (!pgtable_page_ctor(page)) { - __free_page(page); - return NULL; - } - return page; -} - -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) -{ - free_page((unsigned long)pte); -} - -static inline void pte_free(struct mm_struct *mm, pgtable_t pte) -{ - pgtable_page_dtor(pte); - __free_page(pte); -} - #define __pte_free_tlb(tlb,pte,addr) \ do { \ pgtable_page_dtor(pte); \ From 1b9a9d8564cbc49e1e5529460ca70bf7353aa4d1 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 23 Sep 2019 15:35:28 -0700 Subject: [PATCH 557/721] microblaze: switch to generic version of pte allocation The microblaze implementation of pte_alloc_one() has a provision to allocated PTEs from high memory, but neither CONFIG_HIGHPTE nor pte_map*() versions for suitable for HIGHPTE are defined. Except that, microblaze version of pte_alloc_one() is identical to the generic one as well as the implementations of pte_free() and pte_free_kernel(). Switch microblaze to use the generic versions of these functions. Also remove pte_free_slow() that is not referenced anywhere in the code. Link: http://lkml.kernel.org/r/1565690952-32158-1-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Acked-by: Mark Rutland Cc: Michal Simek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/microblaze/include/asm/pgalloc.h | 39 +++------------------------ 1 file changed, 3 insertions(+), 36 deletions(-) diff --git a/arch/microblaze/include/asm/pgalloc.h b/arch/microblaze/include/asm/pgalloc.h index ac0731881751..7ecb05baa601 100644 --- a/arch/microblaze/include/asm/pgalloc.h +++ b/arch/microblaze/include/asm/pgalloc.h @@ -21,6 +21,9 @@ #include #include +#define __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL +#include + extern void __bad_pte(pmd_t *pmd); static inline pgd_t *get_pgd(void) @@ -47,42 +50,6 @@ static inline void free_pgd(pgd_t *pgd) extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm); -static inline struct page *pte_alloc_one(struct mm_struct *mm) -{ - struct page *ptepage; - -#ifdef CONFIG_HIGHPTE - int flags = GFP_KERNEL | __GFP_ZERO | __GFP_HIGHMEM; -#else - int flags = GFP_KERNEL | __GFP_ZERO; -#endif - - ptepage = alloc_pages(flags, 0); - if (!ptepage) - return NULL; - if (!pgtable_page_ctor(ptepage)) { - __free_page(ptepage); - return NULL; - } - return ptepage; -} - -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) -{ - free_page((unsigned long)pte); -} - -static inline void pte_free_slow(struct page *ptepage) -{ - __free_page(ptepage); -} - -static inline void pte_free(struct mm_struct *mm, struct page *ptepage) -{ - pgtable_page_dtor(ptepage); - __free_page(ptepage); -} - #define __pte_free_tlb(tlb, pte, addr) pte_free((tlb)->mm, (pte)) #define pmd_populate(mm, pmd, pte) \ From 782de70c42930baae55234f3df0dc90774924447 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 23 Sep 2019 15:35:31 -0700 Subject: [PATCH 558/721] mm: consolidate pgtable_cache_init() and pgd_cache_init() Both pgtable_cache_init() and pgd_cache_init() are used to initialize kmem cache for page table allocations on several architectures that do not use PAGE_SIZE tables for one or more levels of the page table hierarchy. Most architectures do not implement these functions and use __weak default NOP implementation of pgd_cache_init(). Since there is no such default for pgtable_cache_init(), its empty stub is duplicated among most architectures. Rename the definitions of pgd_cache_init() to pgtable_cache_init() and drop empty stubs of pgtable_cache_init(). Link: http://lkml.kernel.org/r/1566457046-22637-1-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Acked-by: Will Deacon [arm64] Acked-by: Thomas Gleixner [x86] Cc: Catalin Marinas Cc: Ingo Molnar Cc: Borislav Petkov Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/pgtable.h | 5 ----- arch/arc/include/asm/pgtable.h | 5 ----- arch/arm/include/asm/pgtable-nommu.h | 5 ----- arch/arm/include/asm/pgtable.h | 2 -- arch/arm64/include/asm/pgtable.h | 2 -- arch/arm64/mm/pgd.c | 2 +- arch/c6x/include/asm/pgtable.h | 5 ----- arch/csky/include/asm/pgtable.h | 5 ----- arch/h8300/include/asm/pgtable.h | 6 ------ arch/hexagon/include/asm/pgtable.h | 3 --- arch/hexagon/mm/Makefile | 2 +- arch/hexagon/mm/pgalloc.c | 10 ---------- arch/ia64/include/asm/pgtable.h | 5 ----- arch/m68k/include/asm/pgtable_mm.h | 5 ----- arch/m68k/include/asm/pgtable_no.h | 5 ----- arch/microblaze/include/asm/pgtable.h | 7 ------- arch/mips/include/asm/pgtable.h | 5 ----- arch/nds32/include/asm/pgtable.h | 2 -- arch/nios2/include/asm/pgtable.h | 2 -- arch/openrisc/include/asm/pgtable.h | 5 ----- arch/parisc/include/asm/pgtable.h | 2 -- arch/powerpc/include/asm/pgtable.h | 1 - arch/riscv/include/asm/pgtable.h | 5 ----- arch/s390/include/asm/pgtable.h | 5 ----- arch/sh/include/asm/pgtable.h | 5 ----- arch/sh/mm/nommu.c | 4 ---- arch/sparc/include/asm/pgtable_32.h | 5 ----- arch/sparc/include/asm/pgtable_64.h | 1 - arch/um/include/asm/pgtable.h | 2 -- arch/unicore32/include/asm/pgtable.h | 2 -- arch/x86/include/asm/pgtable_32.h | 1 - arch/x86/include/asm/pgtable_64.h | 2 -- arch/x86/mm/pgtable.c | 6 +----- arch/xtensa/include/asm/pgtable.h | 1 - include/asm-generic/pgtable.h | 2 +- init/main.c | 3 +-- 36 files changed, 5 insertions(+), 130 deletions(-) delete mode 100644 arch/hexagon/mm/pgalloc.c diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h index 89c2032f9960..065b57f408c3 100644 --- a/arch/alpha/include/asm/pgtable.h +++ b/arch/alpha/include/asm/pgtable.h @@ -359,11 +359,6 @@ extern void paging_init(void); #include -/* - * No page table caches to initialise - */ -#define pgtable_cache_init() do { } while (0) - /* We have our own get_unmapped_area to cope with ADDR_LIMIT_32BIT. */ #define HAVE_ARCH_UNMAPPED_AREA diff --git a/arch/arc/include/asm/pgtable.h b/arch/arc/include/asm/pgtable.h index 1d87c18a2976..7addd0301c51 100644 --- a/arch/arc/include/asm/pgtable.h +++ b/arch/arc/include/asm/pgtable.h @@ -395,11 +395,6 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, /* to cope with aliasing VIPT cache */ #define HAVE_ARCH_UNMAPPED_AREA -/* - * No page table caches to initialise - */ -#define pgtable_cache_init() do { } while (0) - #endif /* __ASSEMBLY__ */ #endif diff --git a/arch/arm/include/asm/pgtable-nommu.h b/arch/arm/include/asm/pgtable-nommu.h index d0de24f06724..010fa1a35a68 100644 --- a/arch/arm/include/asm/pgtable-nommu.h +++ b/arch/arm/include/asm/pgtable-nommu.h @@ -70,11 +70,6 @@ typedef pte_t *pte_addr_t; */ extern unsigned int kobjsize(const void *objp); -/* - * No page table caches to initialise. - */ -#define pgtable_cache_init() do { } while (0) - /* * All 32bit addresses are effectively valid for vmalloc... * Sort of meaningless for non-VM targets. diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h index f2e990dc27e7..3ae120cd1715 100644 --- a/arch/arm/include/asm/pgtable.h +++ b/arch/arm/include/asm/pgtable.h @@ -368,8 +368,6 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) #define HAVE_ARCH_UNMAPPED_AREA #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN -#define pgtable_cache_init() do { } while (0) - #endif /* !__ASSEMBLY__ */ #endif /* CONFIG_MMU */ diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 57427d17580e..7576df00eb50 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -861,8 +861,6 @@ extern int kern_addr_valid(unsigned long addr); #include -static inline void pgtable_cache_init(void) { } - /* * On AArch64, the cache coherency is handled via the set_pte_at() function. */ diff --git a/arch/arm64/mm/pgd.c b/arch/arm64/mm/pgd.c index 7548f9ca1f11..4a64089e5771 100644 --- a/arch/arm64/mm/pgd.c +++ b/arch/arm64/mm/pgd.c @@ -35,7 +35,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) kmem_cache_free(pgd_cache, pgd); } -void __init pgd_cache_init(void) +void __init pgtable_cache_init(void) { if (PGD_SIZE == PAGE_SIZE) return; diff --git a/arch/c6x/include/asm/pgtable.h b/arch/c6x/include/asm/pgtable.h index 0bd805964ea6..0b6919c00413 100644 --- a/arch/c6x/include/asm/pgtable.h +++ b/arch/c6x/include/asm/pgtable.h @@ -59,11 +59,6 @@ extern unsigned long empty_zero_page; #define swapper_pg_dir ((pgd_t *) 0) -/* - * No page table caches to initialise - */ -#define pgtable_cache_init() do { } while (0) - /* * c6x is !MMU, so define the simpliest implementation */ diff --git a/arch/csky/include/asm/pgtable.h b/arch/csky/include/asm/pgtable.h index c429a6f347de..0040b3a05b61 100644 --- a/arch/csky/include/asm/pgtable.h +++ b/arch/csky/include/asm/pgtable.h @@ -296,11 +296,6 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, /* Needs to be defined here and not in linux/mm.h, as it is arch dependent */ #define kern_addr_valid(addr) (1) -/* - * No page table caches to initialise - */ -#define pgtable_cache_init() do {} while (0) - #define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \ remap_pfn_range(vma, vaddr, pfn, size, prot) diff --git a/arch/h8300/include/asm/pgtable.h b/arch/h8300/include/asm/pgtable.h index a99caa49d265..4d00152fab58 100644 --- a/arch/h8300/include/asm/pgtable.h +++ b/arch/h8300/include/asm/pgtable.h @@ -4,7 +4,6 @@ #define __ARCH_USE_5LEVEL_HACK #include #include -#define pgtable_cache_init() do { } while (0) extern void paging_init(void); #define PAGE_NONE __pgprot(0) /* these mean nothing to NO_MM */ #define PAGE_SHARED __pgprot(0) /* these mean nothing to NO_MM */ @@ -34,11 +33,6 @@ static inline int pte_file(pte_t pte) { return 0; } extern unsigned int kobjsize(const void *objp); extern int is_in_rom(unsigned long); -/* - * No page table caches to initialise - */ -#define pgtable_cache_init() do { } while (0) - /* * All 32bit addresses are effectively valid for vmalloc... * Sort of meaningless for non-VM targets. diff --git a/arch/hexagon/include/asm/pgtable.h b/arch/hexagon/include/asm/pgtable.h index a3ff6d24c09e..2fec20ad939e 100644 --- a/arch/hexagon/include/asm/pgtable.h +++ b/arch/hexagon/include/asm/pgtable.h @@ -431,9 +431,6 @@ static inline int pte_exec(pte_t pte) #define __pte_offset(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) -/* I think this is in case we have page table caches; needed by init/main.c */ -#define pgtable_cache_init() do { } while (0) - /* * Swap/file PTE definitions. If _PAGE_PRESENT is zero, the rest of the PTE is * interpreted as swap information. The remaining free bits are interpreted as diff --git a/arch/hexagon/mm/Makefile b/arch/hexagon/mm/Makefile index 1894263ae5bc..893838499591 100644 --- a/arch/hexagon/mm/Makefile +++ b/arch/hexagon/mm/Makefile @@ -3,5 +3,5 @@ # Makefile for Hexagon memory management subsystem # -obj-y := init.o pgalloc.o ioremap.o uaccess.o vm_fault.o cache.o +obj-y := init.o ioremap.o uaccess.o vm_fault.o cache.o obj-y += copy_to_user.o copy_from_user.o strnlen_user.o vm_tlb.o diff --git a/arch/hexagon/mm/pgalloc.c b/arch/hexagon/mm/pgalloc.c deleted file mode 100644 index 4d4316140237..000000000000 --- a/arch/hexagon/mm/pgalloc.c +++ /dev/null @@ -1,10 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (c) 2010-2011, The Linux Foundation. All rights reserved. - */ - -#include - -void __init pgtable_cache_init(void) -{ -} diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h index b1e7468eb65a..d602e7c622db 100644 --- a/arch/ia64/include/asm/pgtable.h +++ b/arch/ia64/include/asm/pgtable.h @@ -566,11 +566,6 @@ extern struct page *zero_page_memmap_ptr; #define KERNEL_TR_PAGE_SHIFT _PAGE_SIZE_64M #define KERNEL_TR_PAGE_SIZE (1 << KERNEL_TR_PAGE_SHIFT) -/* - * No page table caches to initialise - */ -#define pgtable_cache_init() do { } while (0) - /* These tell get_user_pages() that the first gate page is accessible from user-level. */ #define FIXADDR_USER_START GATE_ADDR #ifdef HAVE_BUGGY_SEGREL diff --git a/arch/m68k/include/asm/pgtable_mm.h b/arch/m68k/include/asm/pgtable_mm.h index cc476c1d72e5..646c174fff99 100644 --- a/arch/m68k/include/asm/pgtable_mm.h +++ b/arch/m68k/include/asm/pgtable_mm.h @@ -176,9 +176,4 @@ pgprot_t pgprot_dmacoherent(pgprot_t prot); #include #endif /* !__ASSEMBLY__ */ -/* - * No page table caches to initialise - */ -#define pgtable_cache_init() do { } while (0) - #endif /* _M68K_PGTABLE_H */ diff --git a/arch/m68k/include/asm/pgtable_no.h b/arch/m68k/include/asm/pgtable_no.h index 69e271101223..c18165b0d904 100644 --- a/arch/m68k/include/asm/pgtable_no.h +++ b/arch/m68k/include/asm/pgtable_no.h @@ -44,11 +44,6 @@ extern void paging_init(void); */ #define ZERO_PAGE(vaddr) (virt_to_page(0)) -/* - * No page table caches to initialise. - */ -#define pgtable_cache_init() do { } while (0) - /* * All 32bit addresses are effectively valid for vmalloc... * Sort of meaningless for non-VM targets. diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h index 142d3f004848..954b69af451f 100644 --- a/arch/microblaze/include/asm/pgtable.h +++ b/arch/microblaze/include/asm/pgtable.h @@ -46,8 +46,6 @@ extern int mem_init_done; #define swapper_pg_dir ((pgd_t *) NULL) -#define pgtable_cache_init() do {} while (0) - #define arch_enter_lazy_cpu_mode() do {} while (0) #define pgprot_noncached_wc(prot) prot @@ -526,11 +524,6 @@ extern unsigned long iopa(unsigned long addr); /* Needs to be defined here and not in linux/mm.h, as it is arch dependent */ #define kern_addr_valid(addr) (1) -/* - * No page table caches to initialise - */ -#define pgtable_cache_init() do { } while (0) - void do_page_fault(struct pt_regs *regs, unsigned long address, unsigned long error_code); diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h index 4dca733d5076..f85bd5b15f51 100644 --- a/arch/mips/include/asm/pgtable.h +++ b/arch/mips/include/asm/pgtable.h @@ -661,9 +661,4 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, #define HAVE_ARCH_UNMAPPED_AREA #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN -/* - * No page table caches to initialise - */ -#define pgtable_cache_init() do { } while (0) - #endif /* _ASM_PGTABLE_H */ diff --git a/arch/nds32/include/asm/pgtable.h b/arch/nds32/include/asm/pgtable.h index c70cc56bec09..0588ec99725c 100644 --- a/arch/nds32/include/asm/pgtable.h +++ b/arch/nds32/include/asm/pgtable.h @@ -403,8 +403,6 @@ extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; * into virtual address `from' */ -#define pgtable_cache_init() do { } while (0) - #endif /* !__ASSEMBLY__ */ #endif /* _ASMNDS32_PGTABLE_H */ diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h index 95237b7f6fc1..99985d8b7166 100644 --- a/arch/nios2/include/asm/pgtable.h +++ b/arch/nios2/include/asm/pgtable.h @@ -291,8 +291,6 @@ static inline void pte_clear(struct mm_struct *mm, #include -#define pgtable_cache_init() do { } while (0) - extern void __init paging_init(void); extern void __init mmu_init(void); diff --git a/arch/openrisc/include/asm/pgtable.h b/arch/openrisc/include/asm/pgtable.h index 2fe9ff5b5d6f..248d22d8faa7 100644 --- a/arch/openrisc/include/asm/pgtable.h +++ b/arch/openrisc/include/asm/pgtable.h @@ -443,11 +443,6 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, #include -/* - * No page table caches to initialise - */ -#define pgtable_cache_init() do { } while (0) - typedef pte_t *pte_addr_t; #endif /* __ASSEMBLY__ */ diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index 6d58c1739b42..4ac374b3a99f 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h @@ -132,8 +132,6 @@ static inline void purge_tlb_entries(struct mm_struct *mm, unsigned long addr) #define PTRS_PER_PTE (1UL << BITS_PER_PTE) /* Definitions for 2nd level */ -#define pgtable_cache_init() do { } while (0) - #define PMD_SHIFT (PLD_SHIFT + BITS_PER_PTE) #define PMD_SIZE (1UL << PMD_SHIFT) #define PMD_MASK (~(PMD_SIZE-1)) diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 8b7865a2d576..4053b2ab427c 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -87,7 +87,6 @@ extern unsigned long ioremap_bot; unsigned long vmalloc_to_phys(void *vmalloc_addr); void pgtable_cache_add(unsigned int shift); -void pgtable_cache_init(void); #if defined(CONFIG_STRICT_KERNEL_RWX) || defined(CONFIG_PPC32) void mark_initmem_nx(void); diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 80905b27ee98..c60123f018f5 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -424,11 +424,6 @@ extern void *dtb_early_va; extern void setup_bootmem(void); extern void paging_init(void); -static inline void pgtable_cache_init(void) -{ - /* No page table caches to initialize */ -} - #define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) #define VMALLOC_END (PAGE_OFFSET - 1) #define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 8f59454ac407..36c578c0ff96 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1682,11 +1682,6 @@ extern void s390_reset_cmma(struct mm_struct *mm); #define HAVE_ARCH_UNMAPPED_AREA #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN -/* - * No page table caches to initialise - */ -static inline void pgtable_cache_init(void) { } - #include #endif /* _S390_PAGE_H */ diff --git a/arch/sh/include/asm/pgtable.h b/arch/sh/include/asm/pgtable.h index 9085d1142fa3..cbd0f3c55a0c 100644 --- a/arch/sh/include/asm/pgtable.h +++ b/arch/sh/include/asm/pgtable.h @@ -123,11 +123,6 @@ typedef pte_t *pte_addr_t; #define pte_pfn(x) ((unsigned long)(((x).pte_low >> PAGE_SHIFT))) -/* - * Initialise the page table caches - */ -extern void pgtable_cache_init(void); - struct vm_area_struct; struct mm_struct; diff --git a/arch/sh/mm/nommu.c b/arch/sh/mm/nommu.c index cc779a90d917..dca946f426c6 100644 --- a/arch/sh/mm/nommu.c +++ b/arch/sh/mm/nommu.c @@ -97,7 +97,3 @@ void __init page_table_range_init(unsigned long start, unsigned long end, void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot) { } - -void pgtable_cache_init(void) -{ -} diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h index 4eebed6c6781..31da44826645 100644 --- a/arch/sparc/include/asm/pgtable_32.h +++ b/arch/sparc/include/asm/pgtable_32.h @@ -445,9 +445,4 @@ static inline int io_remap_pfn_range(struct vm_area_struct *vma, /* We provide our own get_unmapped_area to cope with VA holes for userland */ #define HAVE_ARCH_UNMAPPED_AREA -/* - * No page table caches to initialise - */ -#define pgtable_cache_init() do { } while (0) - #endif /* !(_SPARC_PGTABLE_H) */ diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 1599de730532..b57f9c631eca 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -1135,7 +1135,6 @@ unsigned long get_fb_unmapped_area(struct file *filp, unsigned long, unsigned long); #define HAVE_ARCH_FB_UNMAPPED_AREA -void pgtable_cache_init(void); void sun4v_register_fault_status(void); void sun4v_ktsb_register(void); void __init cheetah_ecache_flush_init(void); diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h index e4d3ed980d82..36a44d58f373 100644 --- a/arch/um/include/asm/pgtable.h +++ b/arch/um/include/asm/pgtable.h @@ -32,8 +32,6 @@ extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; /* zero page used for uninitialized stuff */ extern unsigned long *empty_zero_page; -#define pgtable_cache_init() do ; while (0) - /* Just any arbitrary offset to the start of the vmalloc VM area: the * current 8MB value just means that there will be a 8MB "hole" after the * physical memory until the kernel virtual memory starts. That means that diff --git a/arch/unicore32/include/asm/pgtable.h b/arch/unicore32/include/asm/pgtable.h index 126e961a8cb0..c8f7ba12f309 100644 --- a/arch/unicore32/include/asm/pgtable.h +++ b/arch/unicore32/include/asm/pgtable.h @@ -285,8 +285,6 @@ extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; #include -#define pgtable_cache_init() do { } while (0) - #endif /* !__ASSEMBLY__ */ #endif /* __UNICORE_PGTABLE_H__ */ diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index b9b9f8aa963e..0dca7f7aeff2 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -29,7 +29,6 @@ extern pgd_t swapper_pg_dir[1024]; extern pgd_t initial_page_table[1024]; extern pmd_t initial_pg_pmd[]; -static inline void pgtable_cache_init(void) { } void paging_init(void); void sync_initial_page_table(void); diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index a26d2d58b9c9..0b6c4042942a 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -241,8 +241,6 @@ extern void cleanup_highmap(void); #define HAVE_ARCH_UNMAPPED_AREA #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN -#define pgtable_cache_init() do { } while (0) - #define PAGE_AGP PAGE_KERNEL_NOCACHE #define HAVE_PAGE_AGP 1 diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 44816ff6411f..463940faf52f 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -357,7 +357,7 @@ static void pgd_prepopulate_user_pmd(struct mm_struct *mm, static struct kmem_cache *pgd_cache; -void __init pgd_cache_init(void) +void __init pgtable_cache_init(void) { /* * When PAE kernel is running as a Xen domain, it does not use @@ -402,10 +402,6 @@ static inline void _pgd_free(pgd_t *pgd) } #else -void __init pgd_cache_init(void) -{ -} - static inline pgd_t *_pgd_alloc(void) { return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER, diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h index ce3ff5e591b9..3f7fe5a8c286 100644 --- a/arch/xtensa/include/asm/pgtable.h +++ b/arch/xtensa/include/asm/pgtable.h @@ -238,7 +238,6 @@ extern void paging_init(void); # define swapper_pg_dir NULL static inline void paging_init(void) { } #endif -static inline void pgtable_cache_init(void) { } /* * The pmd contains the kernel virtual address of the pte page. diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 75d9d68a6de7..fae6abb3d586 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -1126,7 +1126,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, static inline void init_espfix_bsp(void) { } #endif -extern void __init pgd_cache_init(void); +extern void __init pgtable_cache_init(void); #ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot) diff --git a/init/main.c b/init/main.c index 3ca67e8b92fd..99a5f55e0d02 100644 --- a/init/main.c +++ b/init/main.c @@ -507,7 +507,7 @@ void __init __weak mem_encrypt_init(void) { } void __init __weak poking_init(void) { } -void __init __weak pgd_cache_init(void) { } +void __init __weak pgtable_cache_init(void) { } bool initcall_debug; core_param(initcall_debug, initcall_debug, bool, 0644); @@ -565,7 +565,6 @@ static void __init mm_init(void) init_espfix_bsp(); /* Should be run after espfix64 is set up. */ pti_init(); - pgd_cache_init(); } void __init __weak arch_call_rest_init(void) From 6aa9b8b2c6355fc3339d6819ec250eadfe79fdb5 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 23 Sep 2019 15:35:34 -0700 Subject: [PATCH 559/721] mm: do not hash address in print_bad_pte() Using %px to show the actual address in print_bad_pte() to help us to debug issue. Link: http://lkml.kernel.org/r/20190831011816.141002-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index 9e6ac954c70b..b1ca51a079f2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -518,7 +518,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, (long long)pte_val(pte), (long long)pmd_val(*pmd)); if (page) dump_page(page, "bad pte"); - pr_alert("addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", + pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n", (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); pr_alert("file:%pD fault:%ps mmap:%ps readpage:%ps\n", vma->vm_file, From 3fccb74cf3a688002c3340329e9e310c7aa8c816 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 23 Sep 2019 15:35:37 -0700 Subject: [PATCH 560/721] mm/memory_hotplug: remove move_pfn_range() Let's remove this indirection. We need the zone in the caller either way, so let's just detect it there. Add some documentation for move_pfn_range_to_zone() instead. [akpm@linux-foundation.org: restore newline, per David] Link: http://lkml.kernel.org/r/20190724142324.3686-1-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Michal Hocko Reviewed-by: Oscar Salvador Cc: David Hildenbrand Cc: Pavel Tatashin Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 5f2c83ce9fde..5b8811945bbb 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -714,8 +714,13 @@ static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned lon pgdat->node_start_pfn = start_pfn; pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn; -} +} +/* + * Associate the pfn range with the given zone, initializing the memmaps + * and resizing the pgdat/zone data to span the added pages. After this + * call, all affected pages are PG_reserved. + */ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, struct vmem_altmap *altmap) { @@ -804,20 +809,6 @@ struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, return default_zone_for_pfn(nid, start_pfn, nr_pages); } -/* - * Associates the given pfn range with the given node and the zone appropriate - * for the given online type. - */ -static struct zone * __meminit move_pfn_range(int online_type, int nid, - unsigned long start_pfn, unsigned long nr_pages) -{ - struct zone *zone; - - zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages); - move_pfn_range_to_zone(zone, start_pfn, nr_pages, NULL); - return zone; -} - int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) { unsigned long flags; @@ -840,7 +831,8 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ put_device(&mem->dev); /* associate pfn range with the zone */ - zone = move_pfn_range(online_type, nid, pfn, nr_pages); + zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages); + move_pfn_range_to_zone(zone, pfn, nr_pages, NULL); arg.start_pfn = pfn; arg.nr_pages = nr_pages; From d84f2f5a755208da3f93e17714631485cb3da11c Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 23 Sep 2019 15:35:40 -0700 Subject: [PATCH 561/721] drivers/base/node.c: simplify unregister_memory_block_under_nodes() We don't allow to offline memory block devices that belong to multiple numa nodes. Therefore, such devices can never get removed. It is sufficient to process a single node when removing the memory block. No need to iterate over each and every PFN. We already have the nid stored for each memory block. Make sure that the nid always has a sane value. Please note that checking for node_online(nid) is not required. If we would have a memory block belonging to a node that is no longer offline, then we would have a BUG in the node offlining code. Link: http://lkml.kernel.org/r/20190719135244.15242-1-david@redhat.com Signed-off-by: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: David Hildenbrand Cc: Stephen Rothwell Cc: Pavel Tatashin Cc: Michal Hocko Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 1 + drivers/base/node.c | 39 +++++++++++++++------------------------ 2 files changed, 16 insertions(+), 24 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 20c39d1bcef8..154d5d4a0779 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -674,6 +674,7 @@ static int init_memory_block(struct memory_block **memory, mem->state = state; start_pfn = section_nr_to_pfn(mem->start_section_nr); mem->phys_device = arch_get_memory_phys_device(start_pfn); + mem->nid = NUMA_NO_NODE; ret = register_memory(mem); diff --git a/drivers/base/node.c b/drivers/base/node.c index 75b7e6f6535b..840c95baa1d8 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -759,8 +759,6 @@ static int register_mem_sect_under_node(struct memory_block *mem_blk, int ret, nid = *(int *)arg; unsigned long pfn, sect_start_pfn, sect_end_pfn; - mem_blk->nid = nid; - sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr); sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr); sect_end_pfn += PAGES_PER_SECTION - 1; @@ -789,6 +787,13 @@ static int register_mem_sect_under_node(struct memory_block *mem_blk, if (page_nid != nid) continue; } + + /* + * If this memory block spans multiple nodes, we only indicate + * the last processed node. + */ + mem_blk->nid = nid; + ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj, &mem_blk->dev.kobj, kobject_name(&mem_blk->dev.kobj)); @@ -804,32 +809,18 @@ static int register_mem_sect_under_node(struct memory_block *mem_blk, } /* - * Unregister memory block device under all nodes that it spans. - * Has to be called with mem_sysfs_mutex held (due to unlinked_nodes). + * Unregister a memory block device under the node it spans. Memory blocks + * with multiple nodes cannot be offlined and therefore also never be removed. */ void unregister_memory_block_under_nodes(struct memory_block *mem_blk) { - unsigned long pfn, sect_start_pfn, sect_end_pfn; - static nodemask_t unlinked_nodes; + if (mem_blk->nid == NUMA_NO_NODE) + return; - nodes_clear(unlinked_nodes); - sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr); - sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr); - for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) { - int nid; - - nid = get_nid_for_pfn(pfn); - if (nid < 0) - continue; - if (!node_online(nid)) - continue; - if (node_test_and_set(nid, unlinked_nodes)) - continue; - sysfs_remove_link(&node_devices[nid]->dev.kobj, - kobject_name(&mem_blk->dev.kobj)); - sysfs_remove_link(&mem_blk->dev.kobj, - kobject_name(&node_devices[nid]->dev.kobj)); - } + sysfs_remove_link(&node_devices[mem_blk->nid]->dev.kobj, + kobject_name(&mem_blk->dev.kobj)); + sysfs_remove_link(&mem_blk->dev.kobj, + kobject_name(&node_devices[mem_blk->nid]->dev.kobj)); } int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn) From f915fb7fb2c1c57be05880012b46d2edd5124797 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 23 Sep 2019 15:35:43 -0700 Subject: [PATCH 562/721] drivers/base/memory.c: fixup documentation of removable/phys_index/block_size_bytes Let's rephrase to memory block terminology and add some further clarifications. Link: http://lkml.kernel.org/r/20190806080826.5963-1-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Andrew Morton Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Michal Hocko Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 154d5d4a0779..63fc5aa51c21 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -116,10 +116,8 @@ static unsigned long get_memory_block_size(void) } /* - * use this as the physical section index that this memsection - * uses. + * Show the first physical section index (number) of this memory block. */ - static ssize_t phys_index_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -131,7 +129,10 @@ static ssize_t phys_index_show(struct device *dev, } /* - * Show whether the section of memory is likely to be hot-removable + * Show whether the memory block is likely to be offlineable (or is already + * offline). Once offline, the memory block could be removed. The return + * value does, however, not indicate that there is a way to remove the + * memory block. */ static ssize_t removable_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -455,7 +456,7 @@ static DEVICE_ATTR_RO(phys_device); static DEVICE_ATTR_RO(removable); /* - * Block size attribute stuff + * Show the memory block size (shared by all memory blocks). */ static ssize_t block_size_bytes_show(struct device *dev, struct device_attribute *attr, char *buf) From 902ce63b337381092ff865f542e854ff3d0ebe2b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 23 Sep 2019 15:35:46 -0700 Subject: [PATCH 563/721] driver/base/memory.c: validate memory block size early Let's validate the memory block size early, when initializing the memory device infrastructure. Fail hard in case the value is not suitable. As nobody checks the return value of memory_dev_init(), turn it into a void function and fail with a panic in all scenarios instead. Otherwise, we'll crash later during boot when core/drivers expect that the memory device infrastructure (including memory_block_size_bytes()) works as expected. I think long term, we should move the whole memory block size configuration (set_memory_block_size_order() and memory_block_size_bytes()) into drivers/base/memory.c. Link: http://lkml.kernel.org/r/20190806090142.22709-1-david@redhat.com Signed-off-by: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Pavel Tatashin Cc: Michal Hocko Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 31 +++++++++---------------------- include/linux/memory.h | 6 +++--- 2 files changed, 12 insertions(+), 25 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 63fc5aa51c21..763154c82eb3 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -100,21 +100,6 @@ unsigned long __weak memory_block_size_bytes(void) } EXPORT_SYMBOL_GPL(memory_block_size_bytes); -static unsigned long get_memory_block_size(void) -{ - unsigned long block_sz; - - block_sz = memory_block_size_bytes(); - - /* Validate blk_sz is a power of 2 and not less than section size */ - if ((block_sz & (block_sz - 1)) || (block_sz < MIN_MEMORY_BLOCK_SIZE)) { - WARN_ON(1); - block_sz = MIN_MEMORY_BLOCK_SIZE; - } - - return block_sz; -} - /* * Show the first physical section index (number) of this memory block. */ @@ -461,7 +446,7 @@ static DEVICE_ATTR_RO(removable); static ssize_t block_size_bytes_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%lx\n", get_memory_block_size()); + return sprintf(buf, "%lx\n", memory_block_size_bytes()); } static DEVICE_ATTR_RO(block_size_bytes); @@ -812,19 +797,22 @@ static const struct attribute_group *memory_root_attr_groups[] = { /* * Initialize the sysfs support for memory devices... */ -int __init memory_dev_init(void) +void __init memory_dev_init(void) { int ret; int err; unsigned long block_sz, nr; + /* Validate the configured memory block size */ + block_sz = memory_block_size_bytes(); + if (!is_power_of_2(block_sz) || block_sz < MIN_MEMORY_BLOCK_SIZE) + panic("Memory block size not suitable: 0x%lx\n", block_sz); + sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE; + ret = subsys_system_register(&memory_subsys, memory_root_attr_groups); if (ret) goto out; - block_sz = get_memory_block_size(); - sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE; - /* * Create entries for memory sections that were found * during boot and have been initialized @@ -840,8 +828,7 @@ int __init memory_dev_init(void) out: if (ret) - printk(KERN_ERR "%s() failed: %d\n", __func__, ret); - return ret; + panic("%s() failed: %d\n", __func__, ret); } /** diff --git a/include/linux/memory.h b/include/linux/memory.h index 02e633f3ede0..b3d9df060626 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -80,9 +80,9 @@ struct mem_section; #define IPC_CALLBACK_PRI 10 #ifndef CONFIG_MEMORY_HOTPLUG_SPARSE -static inline int memory_dev_init(void) +static inline void memory_dev_init(void) { - return 0; + return; } static inline int register_memory_notifier(struct notifier_block *nb) { @@ -113,7 +113,7 @@ extern int register_memory_isolate_notifier(struct notifier_block *nb); extern void unregister_memory_isolate_notifier(struct notifier_block *nb); int create_memory_block_devices(unsigned long start, unsigned long size); void remove_memory_block_devices(unsigned long start, unsigned long size); -extern int memory_dev_init(void); +extern void memory_dev_init(void); extern int memory_notify(unsigned long val, void *v); extern int memory_isolate_notify(unsigned long val, void *v); extern struct memory_block *find_memory_block(struct mem_section *); From b6c88d3b9d38f9448e0fcf44847a075ea81d5ca2 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 23 Sep 2019 15:35:49 -0700 Subject: [PATCH 564/721] drivers/base/memory.c: don't store end_section_nr in memory blocks Each memory block spans the same amount of sections/pages/bytes. The size is determined before the first memory block is created. No need to store what we can easily calculate - and the calculations even look simpler now. Michal brought up the idea of variable-sized memory blocks. However, if we ever implement something like this, we will need an API compatibility switch and reworks at various places (most code assumes a fixed memory block size). So let's cleanup what we have right now. While at it, fix the variable naming in register_mem_sect_under_node() - we no longer talk about a single section. Link: http://lkml.kernel.org/r/20190809110200.2746-1-david@redhat.com Signed-off-by: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Pavel Tatashin Cc: Michal Hocko Cc: Dan Williams Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 1 - drivers/base/node.c | 10 +++++----- include/linux/memory.h | 1 - mm/memory_hotplug.c | 2 +- 4 files changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 763154c82eb3..6bea4f3f8040 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -656,7 +656,6 @@ static int init_memory_block(struct memory_block **memory, return -ENOMEM; mem->start_section_nr = block_id * sections_per_block; - mem->end_section_nr = mem->start_section_nr + sections_per_block - 1; mem->state = state; start_pfn = section_nr_to_pfn(mem->start_section_nr); mem->phys_device = arch_get_memory_phys_device(start_pfn); diff --git a/drivers/base/node.c b/drivers/base/node.c index 840c95baa1d8..257449cf061f 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -756,13 +756,13 @@ static int __ref get_nid_for_pfn(unsigned long pfn) static int register_mem_sect_under_node(struct memory_block *mem_blk, void *arg) { + unsigned long memory_block_pfns = memory_block_size_bytes() / PAGE_SIZE; + unsigned long start_pfn = section_nr_to_pfn(mem_blk->start_section_nr); + unsigned long end_pfn = start_pfn + memory_block_pfns - 1; int ret, nid = *(int *)arg; - unsigned long pfn, sect_start_pfn, sect_end_pfn; + unsigned long pfn; - sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr); - sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr); - sect_end_pfn += PAGES_PER_SECTION - 1; - for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) { + for (pfn = start_pfn; pfn <= end_pfn; pfn++) { int page_nid; /* diff --git a/include/linux/memory.h b/include/linux/memory.h index b3d9df060626..0ebb105eb261 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -25,7 +25,6 @@ struct memory_block { unsigned long start_section_nr; - unsigned long end_section_nr; unsigned long state; /* serialized by the dev->lock */ int section_count; /* serialized by mem_sysfs_mutex */ int online_type; /* for passing data to online routine */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 5b8811945bbb..3706a137d880 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1654,7 +1654,7 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) phys_addr_t beginpa, endpa; beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)); - endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1; + endpa = beginpa + memory_block_size_bytes() - 1; pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n", &beginpa, &endpa); From 33fce0113da2f8a5c1bbce0c46a8b131500f1677 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 23 Sep 2019 15:35:52 -0700 Subject: [PATCH 565/721] mm/memory_hotplug.c: prevent memory leak when reusing pgdat When offlining a node in try_offline_node(), pgdat is not released. So that pgdat could be reused in hotadd_new_pgdat(). While we reallocate pgdat->per_cpu_nodestats if this pgdat is reused. This patch prevents the memory leak by just allocating per_cpu_nodestats when it is a new pgdat. Link: http://lkml.kernel.org/r/20190813020608.10194-1-richardw.yang@linux.intel.com Signed-off-by: Wei Yang Acked-by: Michal Hocko Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 3706a137d880..c28e5dd017ba 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -925,8 +925,11 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) if (!pgdat) return NULL; + pgdat->per_cpu_nodestats = + alloc_percpu(struct per_cpu_nodestat); arch_refresh_nodedata(nid, pgdat); } else { + int cpu; /* * Reset the nr_zones, order and classzone_idx before reuse. * Note that kswapd will init kswapd_classzone_idx properly @@ -935,6 +938,12 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) pgdat->nr_zones = 0; pgdat->kswapd_order = 0; pgdat->kswapd_classzone_idx = 0; + for_each_online_cpu(cpu) { + struct per_cpu_nodestat *p; + + p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu); + memset(p, 0, sizeof(*p)); + } } /* we can use NODE_DATA(nid) from here */ @@ -944,7 +953,6 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) /* init node's zones as empty zones, we don't have any present pages.*/ free_area_init_core_hotplug(nid); - pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat); /* * The node we allocated has no zone fallback lists. For avoiding From 00ff9a91bdb74933648a5b346d9f0edb99bd76d3 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 23 Sep 2019 15:35:55 -0700 Subject: [PATCH 566/721] mm/memory_hotplug.c: use PFN_UP / PFN_DOWN in walk_system_ram_range() Patch series "mm/memory_hotplug: online_pages() cleanups", v2. Some cleanups (+ one fix for a special case) in the context of online_pages(). This patch (of 5): This makes it clearer that we will never call func() with duplicate PFNs in case we have multiple sub-page memory resources. All unaligned parts of PFNs are completely discarded. Link: http://lkml.kernel.org/r/20190814154109.3448-2-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Michal Hocko Reviewed-by: Wei Yang Cc: Dan Williams Cc: Borislav Petkov Cc: Bjorn Helgaas Cc: Ingo Molnar Cc: Dave Hansen Cc: Nadav Amit Cc: Oscar Salvador Cc: Arun KS Cc: Pavel Tatashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/resource.c b/kernel/resource.c index 74877e9d90ca..76036a41143b 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -487,8 +487,8 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, while (start < end && !find_next_iomem_res(start, end, flags, IORES_DESC_NONE, false, &res)) { - pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; - end_pfn = (res.end + 1) >> PAGE_SHIFT; + pfn = PFN_UP(res.start); + end_pfn = PFN_DOWN(res.end + 1); if (end_pfn > pfn) ret = (*func)(pfn, end_pfn - pfn, arg); if (ret) From 5ecae6359e3a37624cd34d02e4b3401cf98bb62f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 23 Sep 2019 15:35:59 -0700 Subject: [PATCH 567/721] mm/memory_hotplug: drop PageReserved() check in online_pages_range() move_pfn_range_to_zone() will set all pages to PG_reserved via memmap_init_zone(). The only way a page could no longer be reserved would be if a MEM_GOING_ONLINE notifier would clear PG_reserved - which is not done (the online_page callback is used for that purpose by e.g., Hyper-V instead). walk_system_ram_range() will never call online_pages_range() with duplicate PFNs, so drop the PageReserved() check. This seems to be a leftover from ancient times where the memmap was initialized when adding memory and we wanted to check for already onlined memory. Link: http://lkml.kernel.org/r/20190814154109.3448-3-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Michal Hocko Cc: Oscar Salvador Cc: Pavel Tatashin Cc: Dan Williams Cc: Arun KS Cc: Bjorn Helgaas Cc: Borislav Petkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Nadav Amit Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c28e5dd017ba..c38050785d12 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -653,9 +653,7 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, { unsigned long onlined_pages = *(unsigned long *)arg; - if (PageReserved(pfn_to_page(start_pfn))) - onlined_pages += online_pages_blocks(start_pfn, nr_pages); - + onlined_pages += online_pages_blocks(start_pfn, nr_pages); online_mem_sections(start_pfn, start_pfn + nr_pages); *(unsigned long *)arg = onlined_pages; From b2c2ab208e4fa12b0ae5692d14565006899a11fd Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 23 Sep 2019 15:36:02 -0700 Subject: [PATCH 568/721] mm/memory_hotplug: simplify online_pages_range() online_pages always corresponds to nr_pages. Simplify the code, getting rid of online_pages_blocks(). Add some comments. Link: http://lkml.kernel.org/r/20190814154109.3448-4-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Michal Hocko Cc: Oscar Salvador Cc: Pavel Tatashin Cc: Dan Williams Cc: Arun KS Cc: Bjorn Helgaas Cc: Borislav Petkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Nadav Amit Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c38050785d12..b5ad646df86b 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -632,31 +632,27 @@ static void generic_online_page(struct page *page, unsigned int order) #endif } -static int online_pages_blocks(unsigned long start, unsigned long nr_pages) -{ - unsigned long end = start + nr_pages; - int order, onlined_pages = 0; - - while (start < end) { - order = min(MAX_ORDER - 1, - get_order(PFN_PHYS(end) - PFN_PHYS(start))); - (*online_page_callback)(pfn_to_page(start), order); - - onlined_pages += (1UL << order); - start += (1UL << order); - } - return onlined_pages; -} - static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, void *arg) { - unsigned long onlined_pages = *(unsigned long *)arg; + const unsigned long end_pfn = start_pfn + nr_pages; + unsigned long pfn; + int order; - onlined_pages += online_pages_blocks(start_pfn, nr_pages); - online_mem_sections(start_pfn, start_pfn + nr_pages); + /* + * Online the pages. The callback might decide to keep some pages + * PG_reserved (to add them to the buddy later), but we still account + * them as being online/belonging to this zone ("present"). + */ + for (pfn = start_pfn; pfn < end_pfn; pfn += 1ul << order) { + order = min(MAX_ORDER - 1, get_order(PFN_PHYS(end_pfn - pfn))); + (*online_page_callback)(pfn_to_page(pfn), order); + } - *(unsigned long *)arg = onlined_pages; + /* mark all involved sections as online */ + online_mem_sections(start_pfn, end_pfn); + + *(unsigned long *)arg += nr_pages; return 0; } From bd02cc01d342b43618c86e25552150f7a7e09080 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 23 Sep 2019 15:36:05 -0700 Subject: [PATCH 569/721] mm/memory_hotplug: make sure the pfn is aligned to the order when onlining Commit a9cd410a3d29 ("mm/page_alloc.c: memory hotplug: free pages as higher order") assumed that any PFN we get via memory resources is aligned to to MAX_ORDER - 1, I am not convinced that is always true. Let's play safe, check the alignment and fallback to single pages. akpm: warn in this situation so we get to find out if and why this ever occurs. [akpm@linux-foundation.org: add WARN_ON_ONCE()] Link: http://lkml.kernel.org/r/20190814154109.3448-5-david@redhat.com Signed-off-by: David Hildenbrand Cc: Arun KS Cc: Oscar Salvador Cc: Michal Hocko Cc: Pavel Tatashin Cc: Dan Williams Cc: Bjorn Helgaas Cc: Borislav Petkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Nadav Amit Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b5ad646df86b..aa54e15ea830 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -646,6 +646,9 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, */ for (pfn = start_pfn; pfn < end_pfn; pfn += 1ul << order) { order = min(MAX_ORDER - 1, get_order(PFN_PHYS(end_pfn - pfn))); + /* __free_pages_core() wants pfns to be aligned to the order */ + if (WARN_ON_ONCE(!IS_ALIGNED(pfn, 1ul << order))) + order = 0; (*online_page_callback)(pfn_to_page(pfn), order); } From ca9a46f8a4f00c83dcc6c787ae659d117433cbd2 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 23 Sep 2019 15:36:08 -0700 Subject: [PATCH 570/721] mm/memory_hotplug: online_pages cannot be 0 in online_pages() walk_system_ram_range() will fail with -EINVAL in case online_pages_range() was never called (== no resource applicable in the range). Otherwise, we will always call online_pages_range() with nr_pages > 0 and, therefore, have online_pages > 0. Remove that special handling. Link: http://lkml.kernel.org/r/20190814154109.3448-6-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Michal Hocko Cc: Oscar Salvador Cc: Michal Hocko Cc: Pavel Tatashin Cc: Dan Williams Cc: Arun KS Cc: Bjorn Helgaas Cc: Borislav Petkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Nadav Amit Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index aa54e15ea830..49f7bf91c25a 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -853,6 +853,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, online_pages_range); if (ret) { + /* not a single memory resource was applicable */ if (need_zonelists_rebuild) zone_pcp_reset(zone); goto failed_addition; @@ -866,27 +867,22 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ shuffle_zone(zone); - if (onlined_pages) { - node_states_set_node(nid, &arg); - if (need_zonelists_rebuild) - build_all_zonelists(NULL); - else - zone_pcp_update(zone); - } + node_states_set_node(nid, &arg); + if (need_zonelists_rebuild) + build_all_zonelists(NULL); + else + zone_pcp_update(zone); init_per_zone_wmark_min(); - if (onlined_pages) { - kswapd_run(nid); - kcompactd_run(nid); - } + kswapd_run(nid); + kcompactd_run(nid); vm_total_pages = nr_free_pagecache_pages(); writeback_set_ratelimit(); - if (onlined_pages) - memory_notify(MEM_ONLINE, &arg); + memory_notify(MEM_ONLINE, &arg); mem_hotplug_done(); return 0; From 29a90db9299575a4bba82158f9d4e81405c54646 Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Mon, 23 Sep 2019 15:36:18 -0700 Subject: [PATCH 571/721] mm/memory_hotplug.c: s/is/if Correct typo in comment. Link: http://lkml.kernel.org/r/1568233954-3913-1-git-send-email-jrdr.linux@gmail.com Signed-off-by: Souptick Joarder Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 49f7bf91c25a..b1be791f772d 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1793,7 +1793,7 @@ void __remove_memory(int nid, u64 start, u64 size) { /* - * trigger BUG() is some memory is not offlined prior to calling this + * trigger BUG() if some memory is not offlined prior to calling this * function */ if (try_remove_memory(nid, start, size)) From ae83189405ea5c693683327fa69ac95a23ec59be Mon Sep 17 00:00:00 2001 From: Lecopzer Chen Date: Mon, 23 Sep 2019 15:36:21 -0700 Subject: [PATCH 572/721] mm/sparse.c: fix memory leak of sparsemap_buf in aligned memory sparse_buffer_alloc(xsize) gets the size of memory from sparsemap_buf after being aligned with the size. However, the size is at least PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION) and usually larger than PAGE_SIZE. Also, sparse_buffer_fini() only frees memory between sparsemap_buf and sparsemap_buf_end, since sparsemap_buf may be changed by PTR_ALIGN() first, the aligned space before sparsemap_buf is wasted and no one will touch it. In our ARM32 platform (without SPARSEMEM_VMEMMAP) Sparse_buffer_init Reserve d359c000 - d3e9c000 (9M) Sparse_buffer_alloc Alloc d3a00000 - d3E80000 (4.5M) Sparse_buffer_fini Free d3e80000 - d3e9c000 (~=100k) The reserved memory between d359c000 - d3a00000 (~=4.4M) is unfreed. In ARM64 platform (with SPARSEMEM_VMEMMAP) sparse_buffer_init Reserve ffffffc07d623000 - ffffffc07f623000 (32M) Sparse_buffer_alloc Alloc ffffffc07d800000 - ffffffc07f600000 (30M) Sparse_buffer_fini Free ffffffc07f600000 - ffffffc07f623000 (140K) The reserved memory between ffffffc07d623000 - ffffffc07d800000 (~=1.9M) is unfreed. Let's explicit free redundant aligned memory. [arnd@arndb.de: mark sparse_buffer_free as __meminit] Link: http://lkml.kernel.org/r/20190709185528.3251709-1-arnd@arndb.de Link: http://lkml.kernel.org/r/20190705114730.28534-1-lecopzer.chen@mediatek.com Signed-off-by: Lecopzer Chen Signed-off-by: Mark-PK Tsai Signed-off-by: Arnd Bergmann Cc: YJ Chiang Cc: Lecopzer Chen Cc: Pavel Tatashin Cc: Oscar Salvador Cc: Michal Hocko Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/mm/sparse.c b/mm/sparse.c index 72f010d9bff5..2bfd078301f8 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -470,6 +470,12 @@ struct page __init *__populate_section_memmap(unsigned long pfn, static void *sparsemap_buf __meminitdata; static void *sparsemap_buf_end __meminitdata; +static inline void __meminit sparse_buffer_free(unsigned long size) +{ + WARN_ON(!sparsemap_buf || size == 0); + memblock_free_early(__pa(sparsemap_buf), size); +} + static void __init sparse_buffer_init(unsigned long size, int nid) { phys_addr_t addr = __pa(MAX_DMA_ADDRESS); @@ -486,7 +492,7 @@ static void __init sparse_buffer_fini(void) unsigned long size = sparsemap_buf_end - sparsemap_buf; if (sparsemap_buf && size > 0) - memblock_free_early(__pa(sparsemap_buf), size); + sparse_buffer_free(size); sparsemap_buf = NULL; } @@ -498,8 +504,12 @@ void * __meminit sparse_buffer_alloc(unsigned long size) ptr = PTR_ALIGN(sparsemap_buf, size); if (ptr + size > sparsemap_buf_end) ptr = NULL; - else + else { + /* Free redundant aligned space */ + if ((unsigned long)(ptr - sparsemap_buf) > 0) + sparse_buffer_free((unsigned long)(ptr - sparsemap_buf)); sparsemap_buf = ptr + size; + } } return ptr; } From db57e98d87908b8837352abe08515e42752270c1 Mon Sep 17 00:00:00 2001 From: Lecopzer Chen Date: Mon, 23 Sep 2019 15:36:24 -0700 Subject: [PATCH 573/721] mm/sparse.c: fix ALIGN() without power of 2 in sparse_buffer_alloc() The size argument passed into sparse_buffer_alloc() has already been aligned with PAGE_SIZE or PMD_SIZE. If the size after aligned is not power of 2 (e.g. 0x480000), the PTR_ALIGN() will return wrong value. Use roundup to round sparsemap_buf up to next multiple of size. Link: http://lkml.kernel.org/r/20190705114826.28586-1-lecopzer.chen@mediatek.com Signed-off-by: Lecopzer Chen Signed-off-by: Mark-PK Tsai Cc: YJ Chiang Cc: Lecopzer Chen Cc: Pavel Tatashin Cc: Oscar Salvador Cc: Michal Hocko Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/sparse.c b/mm/sparse.c index 2bfd078301f8..79355a86064f 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -501,7 +501,7 @@ void * __meminit sparse_buffer_alloc(unsigned long size) void *ptr = NULL; if (sparsemap_buf) { - ptr = PTR_ALIGN(sparsemap_buf, size); + ptr = (void *) roundup((unsigned long)sparsemap_buf, size); if (ptr + size > sparsemap_buf_end) ptr = NULL; else { From c1cbc3eebf7a9ae46473a66710c0859aaafc1607 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 23 Sep 2019 15:36:27 -0700 Subject: [PATCH 574/721] mm/sparse.c: use __nr_to_section(section_nr) to get mem_section __pfn_to_section is defined as __nr_to_section(pfn_to_section_nr(pfn)). Since we already get section_nr, it is not necessary to get mem_section from start_pfn. By doing so, we reduce one redundant operation. Link: http://lkml.kernel.org/r/20190809010242.29797-1-richardw.yang@linux.intel.com Signed-off-by: Wei Yang Reviewed-by: Anshuman Khandual Tested-by: Anshuman Khandual Cc: Oscar Salvador Cc: Pavel Tatashin Cc: Michal Hocko Cc: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/sparse.c b/mm/sparse.c index 79355a86064f..a1679024ab5c 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -877,7 +877,7 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn, */ page_init_poison(pfn_to_page(start_pfn), sizeof(struct page) * nr_pages); - ms = __pfn_to_section(start_pfn); + ms = __nr_to_section(section_nr); set_section_nid(section_nr, nid); section_mark_present(ms); From 9f82883c6d9af516c2a7f9fe85eb09e9c25bbe0a Mon Sep 17 00:00:00 2001 From: Alastair D'Silva Date: Mon, 23 Sep 2019 15:36:30 -0700 Subject: [PATCH 575/721] mm/sparse.c: don't manually decrement num_poisoned_pages Use the function written to do it instead. Link: http://lkml.kernel.org/r/20190827053656.32191-2-alastair@au1.ibm.com Signed-off-by: Alastair D'Silva Acked-by: Michal Hocko Reviewed-by: David Hildenbrand Acked-by: Mike Rapoport Reviewed-by: Wei Yang Reviewed-by: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/sparse.c b/mm/sparse.c index a1679024ab5c..d7af5cfdc810 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -11,6 +11,8 @@ #include #include #include +#include +#include #include "internal.h" #include @@ -908,7 +910,7 @@ static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) for (i = 0; i < nr_pages; i++) { if (PageHWPoison(&memmap[i])) { - atomic_long_sub(1, &num_poisoned_pages); + num_poisoned_pages_dec(); ClearPageHWPoison(&memmap[i]); } } From 5ed867037eb1f15b7e8cc92497671fd4b3864e4a Mon Sep 17 00:00:00 2001 From: Alastair D'Silva Date: Mon, 23 Sep 2019 15:36:33 -0700 Subject: [PATCH 576/721] mm/sparse.c: remove NULL check in clear_hwpoisoned_pages() There is no possibility for memmap to be NULL in the current codebase. This check was added in commit 95a4774d055c ("memory-hotplug: update mce_bad_pages when removing the memory") where memmap was originally inited to NULL, and only conditionally given a value. The code that could have passed a NULL has been removed by commit ba72b4c8cf60 ("mm/sparsemem: support sub-section hotplug"), so there is no longer a possibility that memmap can be NULL. Link: http://lkml.kernel.org/r/20190829035151.20975-1-alastair@d-silva.org Signed-off-by: Alastair D'Silva Acked-by: Michal Hocko Reviewed-by: David Hildenbrand Cc: Mike Rapoport Cc: Wei Yang Cc: Qian Cai Cc: Alexander Duyck Cc: Logan Gunthorpe Cc: Baoquan He Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/mm/sparse.c b/mm/sparse.c index d7af5cfdc810..bf32de9e666b 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -896,9 +896,6 @@ static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) { int i; - if (!memmap) - return; - /* * A further optimization is to have per section refcounted * num_poisoned_pages. But that would need more space per memmap, so From dd3b8353bae79395b12a178de057b183ff0122eb Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 23 Sep 2019 15:36:36 -0700 Subject: [PATCH 577/721] mm/vmalloc: do not keep unpurged areas in the busy tree The busy tree can be quite big, even though the area is freed or unmapped it still stays there until "purge" logic removes it. 1) Optimize and reduce the size of "busy" tree by removing a node from it right away as soon as user triggers free paths. It is possible to do so, because the allocation is done using another augmented tree. The vmalloc test driver shows the difference, for example the "fix_size_alloc_test" is ~11% better comparing with default configuration: sudo ./test_vmalloc.sh performance Summary: fix_size_alloc_test loops: 1000000 avg: 993985 usec Summary: full_fit_alloc_test loops: 1000000 avg: 973554 usec Summary: long_busy_list_alloc_test loops: 1000000 avg: 12617652 usec Summary: fix_size_alloc_test loops: 1000000 avg: 882263 usec Summary: full_fit_alloc_test loops: 1000000 avg: 973407 usec Summary: long_busy_list_alloc_test loops: 1000000 avg: 12593929 usec 2) Since the busy tree now contains allocated areas only and does not interfere with lazily free nodes, introduce the new function show_purge_info() that dumps "unpurged" areas that is propagated through "/proc/vmallocinfo". 3) Eliminate VM_LAZY_FREE flag. Link: http://lkml.kernel.org/r/20190716152656.12255-2-lpf.vector@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Pengfei Li Cc: Roman Gushchin Cc: Uladzislau Rezki Cc: Hillf Danton Cc: Michal Hocko Cc: Matthew Wilcox Cc: Oleksiy Avramchenko Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 52 ++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 44 insertions(+), 8 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index c1246d77cf75..d535ef125bda 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -329,7 +329,6 @@ EXPORT_SYMBOL(vmalloc_to_pfn); #define DEBUG_AUGMENT_PROPAGATE_CHECK 0 #define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0 -#define VM_LAZY_FREE 0x02 #define VM_VM_AREA 0x04 static DEFINE_SPINLOCK(vmap_area_lock); @@ -1282,7 +1281,14 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) llist_for_each_entry_safe(va, n_va, valist, purge_list) { unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT; - __free_vmap_area(va); + /* + * Finally insert or merge lazily-freed area. It is + * detached and there is no need to "unlink" it from + * anything. + */ + merge_or_add_vmap_area(va, + &free_vmap_area_root, &free_vmap_area_list); + atomic_long_sub(nr, &vmap_lazy_nr); if (atomic_long_read(&vmap_lazy_nr) < resched_threshold) @@ -1324,6 +1330,10 @@ static void free_vmap_area_noflush(struct vmap_area *va) { unsigned long nr_lazy; + spin_lock(&vmap_area_lock); + unlink_va(va, &vmap_area_root); + spin_unlock(&vmap_area_lock); + nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); @@ -2143,14 +2153,13 @@ struct vm_struct *remove_vm_area(const void *addr) might_sleep(); - va = find_vmap_area((unsigned long)addr); + spin_lock(&vmap_area_lock); + va = __find_vmap_area((unsigned long)addr); if (va && va->flags & VM_VM_AREA) { struct vm_struct *vm = va->vm; - spin_lock(&vmap_area_lock); va->vm = NULL; va->flags &= ~VM_VM_AREA; - va->flags |= VM_LAZY_FREE; spin_unlock(&vmap_area_lock); kasan_free_shadow(vm); @@ -2158,6 +2167,8 @@ struct vm_struct *remove_vm_area(const void *addr) return vm; } + + spin_unlock(&vmap_area_lock); return NULL; } @@ -3450,6 +3461,22 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v) } } +static void show_purge_info(struct seq_file *m) +{ + struct llist_node *head; + struct vmap_area *va; + + head = READ_ONCE(vmap_purge_list.first); + if (head == NULL) + return; + + llist_for_each_entry(va, head, purge_list) { + seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n", + (void *)va->va_start, (void *)va->va_end, + va->va_end - va->va_start); + } +} + static int s_show(struct seq_file *m, void *p) { struct vmap_area *va; @@ -3462,10 +3489,9 @@ static int s_show(struct seq_file *m, void *p) * behalf of vmap area is being tear down or vm_map_ram allocation. */ if (!(va->flags & VM_VM_AREA)) { - seq_printf(m, "0x%pK-0x%pK %7ld %s\n", + seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", (void *)va->va_start, (void *)va->va_end, - va->va_end - va->va_start, - va->flags & VM_LAZY_FREE ? "unpurged vm_area" : "vm_map_ram"); + va->va_end - va->va_start); return 0; } @@ -3504,6 +3530,16 @@ static int s_show(struct seq_file *m, void *p) show_numa_info(m, v); seq_putc(m, '\n'); + + /* + * As a final step, dump "unpurged" areas. Note, + * that entire "/proc/vmallocinfo" output will not + * be address sorted, because the purge list is not + * sorted. + */ + if (list_is_last(&va->list, &vmap_area_list)) + show_purge_info(m); + return 0; } From 688fcbfc06e4fdfbb7e1d5a942a1460fe6379d2d Mon Sep 17 00:00:00 2001 From: Pengfei Li Date: Mon, 23 Sep 2019 15:36:39 -0700 Subject: [PATCH 578/721] mm/vmalloc: modify struct vmap_area to reduce its size Objective --------- The current implementation of struct vmap_area wasted space. After applying this commit, sizeof(struct vmap_area) has been reduced from 11 words to 8 words. Description ----------- 1) Pack "subtree_max_size", "vm" and "purge_list". This is no problem because A) "subtree_max_size" is only used when vmap_area is in "free" tree B) "vm" is only used when vmap_area is in "busy" tree C) "purge_list" is only used when vmap_area is in vmap_purge_list 2) Eliminate "flags". ;Since only one flag VM_VM_AREA is being used, and the same thing can be done by judging whether "vm" is NULL, then the "flags" can be eliminated. Link: http://lkml.kernel.org/r/20190716152656.12255-3-lpf.vector@gmail.com Signed-off-by: Pengfei Li Suggested-by: Uladzislau Rezki (Sony) Reviewed-by: Uladzislau Rezki (Sony) Cc: Hillf Danton Cc: Matthew Wilcox Cc: Michal Hocko Cc: Oleksiy Avramchenko Cc: Roman Gushchin Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 20 +++++++++++++------- mm/vmalloc.c | 24 ++++++++++-------------- 2 files changed, 23 insertions(+), 21 deletions(-) diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index dfa718ffdd4f..4e7809408073 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -53,15 +53,21 @@ struct vmap_area { unsigned long va_start; unsigned long va_end; - /* - * Largest available free size in subtree. - */ - unsigned long subtree_max_size; - unsigned long flags; struct rb_node rb_node; /* address sorted rbtree */ struct list_head list; /* address sorted list */ - struct llist_node purge_list; /* "lazy purge" list */ - struct vm_struct *vm; + + /* + * The following three variables can be packed, because + * a vmap_area object is always one of the three states: + * 1) in "free" tree (root is vmap_area_root) + * 2) in "busy" tree (root is free_vmap_area_root) + * 3) in purge list (head is vmap_purge_list) + */ + union { + unsigned long subtree_max_size; /* in "free" tree */ + struct vm_struct *vm; /* in "busy" tree */ + struct llist_node purge_list; /* in purge list */ + }; }; /* diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d535ef125bda..f095843fc243 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -329,7 +329,6 @@ EXPORT_SYMBOL(vmalloc_to_pfn); #define DEBUG_AUGMENT_PROPAGATE_CHECK 0 #define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0 -#define VM_VM_AREA 0x04 static DEFINE_SPINLOCK(vmap_area_lock); /* Export for kexec only */ @@ -1115,7 +1114,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, va->va_start = addr; va->va_end = addr + size; - va->flags = 0; + va->vm = NULL; insert_vmap_area(va, &vmap_area_root, &vmap_area_list); spin_unlock(&vmap_area_lock); @@ -1928,7 +1927,6 @@ void __init vmalloc_init(void) if (WARN_ON_ONCE(!va)) continue; - va->flags = VM_VM_AREA; va->va_start = (unsigned long)tmp->addr; va->va_end = va->va_start + tmp->size; va->vm = tmp; @@ -2026,7 +2024,6 @@ static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, vm->size = va->va_end - va->va_start; vm->caller = caller; va->vm = vm; - va->flags |= VM_VM_AREA; spin_unlock(&vmap_area_lock); } @@ -2131,10 +2128,10 @@ struct vm_struct *find_vm_area(const void *addr) struct vmap_area *va; va = find_vmap_area((unsigned long)addr); - if (va && va->flags & VM_VM_AREA) - return va->vm; + if (!va) + return NULL; - return NULL; + return va->vm; } /** @@ -2155,11 +2152,10 @@ struct vm_struct *remove_vm_area(const void *addr) spin_lock(&vmap_area_lock); va = __find_vmap_area((unsigned long)addr); - if (va && va->flags & VM_VM_AREA) { + if (va && va->vm) { struct vm_struct *vm = va->vm; va->vm = NULL; - va->flags &= ~VM_VM_AREA; spin_unlock(&vmap_area_lock); kasan_free_shadow(vm); @@ -2862,7 +2858,7 @@ long vread(char *buf, char *addr, unsigned long count) if (!count) break; - if (!(va->flags & VM_VM_AREA)) + if (!va->vm) continue; vm = va->vm; @@ -2942,7 +2938,7 @@ long vwrite(char *buf, char *addr, unsigned long count) if (!count) break; - if (!(va->flags & VM_VM_AREA)) + if (!va->vm) continue; vm = va->vm; @@ -3485,10 +3481,10 @@ static int s_show(struct seq_file *m, void *p) va = list_entry(p, struct vmap_area, list); /* - * s_show can encounter race with remove_vm_area, !VM_VM_AREA on - * behalf of vmap area is being tear down or vm_map_ram allocation. + * s_show can encounter race with remove_vm_area, !vm on behalf + * of vmap area is being tear down or vm_map_ram allocation. */ - if (!(va->flags & VM_VM_AREA)) { + if (!va->vm) { seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", (void *)va->va_start, (void *)va->va_end, va->va_end - va->va_start); From 7ea362427c170061b8822dd41bafaa72b3bcb9ad Mon Sep 17 00:00:00 2001 From: Austin Kim Date: Mon, 23 Sep 2019 15:36:42 -0700 Subject: [PATCH 579/721] mm/vmalloc.c: move 'area->pages' after if statement If !area->pages statement is true where memory allocation fails, area is freed. In this case 'area->pages = pages' should not executed. So move 'area->pages = pages' after if statement. [akpm@linux-foundation.org: give area->pages the same treatment] Link: http://lkml.kernel.org/r/20190830035716.GA190684@LGEARND20B15 Signed-off-by: Austin Kim Acked-by: Michal Hocko Reviewed-by: Andrew Morton Cc: Uladzislau Rezki (Sony) Cc: Roman Gushchin Cc: Roman Penyaev Cc: Rick Edgecombe Cc: Mike Rapoport Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index f095843fc243..fcadd3e25c0c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2409,7 +2409,6 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; array_size = (nr_pages * sizeof(struct page *)); - area->nr_pages = nr_pages; /* Please note that the recursion is strictly bounded. */ if (array_size > PAGE_SIZE) { pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask, @@ -2417,13 +2416,16 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, } else { pages = kmalloc_node(array_size, nested_gfp, node); } - area->pages = pages; - if (!area->pages) { + + if (!pages) { remove_vm_area(area->addr); kfree(area); return NULL; } + area->pages = pages; + area->nr_pages = nr_pages; + for (i = 0; i < area->nr_pages; i++) { struct page *page; From 2286bf4e4d8f2fda407222b54f685dff3b80e0a0 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 23 Sep 2019 15:36:45 -0700 Subject: [PATCH 580/721] mm: use CPU_BITS_NONE to initialize init_mm.cpu_bitmask Replace open-coded bitmap array initialization of init_mm.cpu_bitmask with neat CPU_BITS_NONE macro. And, since init_mm.cpu_bitmask is statically set to zero, there is no way to clear it again in start_kernel(). Link: http://lkml.kernel.org/r/1565703815-8584-1-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Reviewed-by: Andrew Morton Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/main.c | 1 - mm/init-mm.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/init/main.c b/init/main.c index 99a5f55e0d02..208b8fa1808e 100644 --- a/init/main.c +++ b/init/main.c @@ -594,7 +594,6 @@ asmlinkage __visible void __init start_kernel(void) page_address_init(); pr_notice("%s", linux_banner); setup_arch(&command_line); - mm_init_cpumask(&init_mm); setup_command_line(command_line); setup_nr_cpu_ids(); setup_per_cpu_areas(); diff --git a/mm/init-mm.c b/mm/init-mm.c index a787a319211e..fb1e15028ef0 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -35,6 +35,6 @@ struct mm_struct init_mm = { .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), .user_ns = &init_user_ns, - .cpu_bitmap = { [BITS_TO_LONGS(NR_CPUS)] = 0}, + .cpu_bitmap = CPU_BITS_NONE, INIT_MM_CONTEXT(init_mm) }; From b57a775f5130dd83ba0e7f46cd86741bf19b71c5 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Mon, 23 Sep 2019 15:36:48 -0700 Subject: [PATCH 581/721] mm: silence -Woverride-init/initializer-overrides When compiling a kernel with W=1, there are several of those warnings due to arm64 overriding a field on purpose. Just disable those warnings for both GCC and Clang of this file, so it will help dig "gems" hidden in the W=1 warnings by reducing some noises. mm/init-mm.c:39:2: warning: initializer overrides prior initialization of this subobject [-Winitializer-overrides] INIT_MM_CONTEXT(init_mm) ^~~~~~~~~~~~~~~~~~~~~~~~ ./arch/arm64/include/asm/mmu.h:133:9: note: expanded from macro 'INIT_MM_CONTEXT' .pgd = init_pg_dir, ^~~~~~~~~~~ mm/init-mm.c:30:10: note: previous initialization is here .pgd = swapper_pg_dir, ^~~~~~~~~~~~~~ Note: there is a side project trying to support explicitly allowing specific initializer overrides in Clang, but there is no guarantee it will happen or not. https://github.com/ClangBuiltLinux/linux/issues/639 Link: http://lkml.kernel.org/r/1566920867-27453-1-git-send-email-cai@lca.pw Signed-off-by: Qian Cai Cc: Nick Desaulniers Cc: Masahiro Yamada Cc: Mark Rutland Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/Makefile b/mm/Makefile index d11de59e9c3c..d996846697ef 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -21,6 +21,9 @@ KCOV_INSTRUMENT_memcontrol.o := n KCOV_INSTRUMENT_mmzone.o := n KCOV_INSTRUMENT_vmstat.o := n +CFLAGS_init-mm.o += $(call cc-disable-warning, override-init) +CFLAGS_init-mm.o += $(call cc-disable-warning, initializer-overrides) + mmu-y := nommu.o mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \ mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \ From 63398413c00c7836ea87a1fa205c91d2199b25cf Mon Sep 17 00:00:00 2001 From: Vitaly Wool Date: Mon, 23 Sep 2019 15:36:51 -0700 Subject: [PATCH 582/721] z3fold: fix memory leak in kmem cache Currently there is a leak in init_z3fold_page() -- it allocates handles from kmem cache even for headless pages, but then they are never used and never freed, so eventually kmem cache may get exhausted. This patch provides a fix for that. Link: http://lkml.kernel.org/r/20190917185352.44cf285d3ebd9e64548de5de@gmail.com Signed-off-by: Vitaly Wool Reported-by: Markus Linnala Tested-by: Markus Linnala Cc: Dan Streetman Cc: Henry Burns Cc: Shakeel Butt Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/z3fold.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/mm/z3fold.c b/mm/z3fold.c index d637d9ee005c..05bdf90646e7 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -295,14 +295,11 @@ static void z3fold_unregister_migration(struct z3fold_pool *pool) } /* Initializes the z3fold header of a newly allocated z3fold page */ -static struct z3fold_header *init_z3fold_page(struct page *page, +static struct z3fold_header *init_z3fold_page(struct page *page, bool headless, struct z3fold_pool *pool, gfp_t gfp) { struct z3fold_header *zhdr = page_address(page); - struct z3fold_buddy_slots *slots = alloc_slots(pool, gfp); - - if (!slots) - return NULL; + struct z3fold_buddy_slots *slots; INIT_LIST_HEAD(&page->lru); clear_bit(PAGE_HEADLESS, &page->private); @@ -310,6 +307,12 @@ static struct z3fold_header *init_z3fold_page(struct page *page, clear_bit(NEEDS_COMPACTING, &page->private); clear_bit(PAGE_STALE, &page->private); clear_bit(PAGE_CLAIMED, &page->private); + if (headless) + return zhdr; + + slots = alloc_slots(pool, gfp); + if (!slots) + return NULL; spin_lock_init(&zhdr->page_lock); kref_init(&zhdr->refcount); @@ -930,7 +933,7 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp, if (!page) return -ENOMEM; - zhdr = init_z3fold_page(page, pool, gfp); + zhdr = init_z3fold_page(page, bud == HEADLESS, pool, gfp); if (!zhdr) { __free_page(page); return -ENOMEM; From a94b525241c0fff3598809131d7cfcfe1d572d8c Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Mon, 23 Sep 2019 15:36:54 -0700 Subject: [PATCH 583/721] mm/compaction.c: clear total_{migrate,free}_scanned before scanning a new zone total_{migrate,free}_scanned will be added to COMPACTMIGRATE_SCANNED and COMPACTFREE_SCANNED in compact_zone(). We should clear them before scanning a new zone. In the proc triggered compaction, we forgot clearing them. [laoar.shao@gmail.com: introduce a helper compact_zone_counters_init()] Link: http://lkml.kernel.org/r/1563869295-25748-1-git-send-email-laoar.shao@gmail.com [akpm@linux-foundation.org: expand compact_zone_counters_init() into its single callsite, per mhocko] [vbabka@suse.cz: squash compact_zone() list_head init as well] Link: http://lkml.kernel.org/r/1fb6f7da-f776-9e42-22f8-bbb79b030b98@suse.cz [akpm@linux-foundation.org: kcompactd_do_work(): avoid unnecessary initialization of cc.zone] Link: http://lkml.kernel.org/r/1563789275-9639-1-git-send-email-laoar.shao@gmail.com Fixes: 7f354a548d1c ("mm, compaction: add vmstats for kcompactd work") Signed-off-by: Yafang Shao Signed-off-by: Vlastimil Babka Reviewed-by: Vlastimil Babka Cc: David Rientjes Cc: Yafang Shao Cc: Mel Gorman Cc: Michal Hocko Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 35 +++++++++++++---------------------- 1 file changed, 13 insertions(+), 22 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 777c088e9113..d99d59412c75 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2078,6 +2078,17 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) const bool sync = cc->mode != MIGRATE_ASYNC; bool update_cached; + /* + * These counters track activities during zone compaction. Initialize + * them before compacting a new zone. + */ + cc->total_migrate_scanned = 0; + cc->total_free_scanned = 0; + cc->nr_migratepages = 0; + cc->nr_freepages = 0; + INIT_LIST_HEAD(&cc->freepages); + INIT_LIST_HEAD(&cc->migratepages); + cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask); ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags, cc->classzone_idx); @@ -2281,10 +2292,6 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, { enum compact_result ret; struct compact_control cc = { - .nr_freepages = 0, - .nr_migratepages = 0, - .total_migrate_scanned = 0, - .total_free_scanned = 0, .order = order, .search_order = order, .gfp_mask = gfp_mask, @@ -2305,8 +2312,6 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, if (capture) current->capture_control = &capc; - INIT_LIST_HEAD(&cc.freepages); - INIT_LIST_HEAD(&cc.migratepages); ret = compact_zone(&cc, &capc); @@ -2408,8 +2413,6 @@ static void compact_node(int nid) struct zone *zone; struct compact_control cc = { .order = -1, - .total_migrate_scanned = 0, - .total_free_scanned = 0, .mode = MIGRATE_SYNC, .ignore_skip_hint = true, .whole_zone = true, @@ -2423,11 +2426,7 @@ static void compact_node(int nid) if (!populated_zone(zone)) continue; - cc.nr_freepages = 0; - cc.nr_migratepages = 0; cc.zone = zone; - INIT_LIST_HEAD(&cc.freepages); - INIT_LIST_HEAD(&cc.migratepages); compact_zone(&cc, NULL); @@ -2529,8 +2528,6 @@ static void kcompactd_do_work(pg_data_t *pgdat) struct compact_control cc = { .order = pgdat->kcompactd_max_order, .search_order = pgdat->kcompactd_max_order, - .total_migrate_scanned = 0, - .total_free_scanned = 0, .classzone_idx = pgdat->kcompactd_classzone_idx, .mode = MIGRATE_SYNC_LIGHT, .ignore_skip_hint = false, @@ -2554,16 +2551,10 @@ static void kcompactd_do_work(pg_data_t *pgdat) COMPACT_CONTINUE) continue; - cc.nr_freepages = 0; - cc.nr_migratepages = 0; - cc.total_migrate_scanned = 0; - cc.total_free_scanned = 0; - cc.zone = zone; - INIT_LIST_HEAD(&cc.freepages); - INIT_LIST_HEAD(&cc.migratepages); - if (kthread_should_stop()) return; + + cc.zone = zone; status = compact_zone(&cc, NULL); if (status == COMPACT_SUCCESS) { From 32aaf0553df99cc4314f6e9f43216cd83afc6c20 Mon Sep 17 00:00:00 2001 From: Pengfei Li Date: Mon, 23 Sep 2019 15:36:58 -0700 Subject: [PATCH 584/721] mm/compaction.c: remove unnecessary zone parameter in isolate_migratepages() Like commit 40cacbcb3240 ("mm, compaction: remove unnecessary zone parameter in some instances"), remove unnecessary zone parameter. No functional change. Link: http://lkml.kernel.org/r/20190806151616.21107-1-lpf.vector@gmail.com Signed-off-by: Pengfei Li Reviewed-by: Andrew Morton Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: Qian Cai Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index d99d59412c75..ce08b39d85d4 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1737,8 +1737,7 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc) * starting at the block pointed to by the migrate scanner pfn within * compact_control. */ -static isolate_migrate_t isolate_migratepages(struct zone *zone, - struct compact_control *cc) +static isolate_migrate_t isolate_migratepages(struct compact_control *cc) { unsigned long block_start_pfn; unsigned long block_end_pfn; @@ -1756,8 +1755,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, */ low_pfn = fast_find_migrateblock(cc); block_start_pfn = pageblock_start_pfn(low_pfn); - if (block_start_pfn < zone->zone_start_pfn) - block_start_pfn = zone->zone_start_pfn; + if (block_start_pfn < cc->zone->zone_start_pfn) + block_start_pfn = cc->zone->zone_start_pfn; /* * fast_find_migrateblock marks a pageblock skipped so to avoid @@ -1787,8 +1786,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))) cond_resched(); - page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn, - zone); + page = pageblock_pfn_to_page(block_start_pfn, + block_end_pfn, cc->zone); if (!page) continue; @@ -2169,7 +2168,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) cc->rescan = true; } - switch (isolate_migratepages(cc->zone, cc)) { + switch (isolate_migratepages(cc)) { case ISOLATE_ABORT: ret = COMPACT_CONTENDED; putback_movable_pages(&cc->migratepages); From 4406548ee39c2268876b61a927acad45da6f9aef Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 23 Sep 2019 15:37:01 -0700 Subject: [PATCH 585/721] mm/mempolicy.c: remove unnecessary nodemask check in kernel_migrate_pages() 1) task_nodes = cpuset_mems_allowed(current); -> cpuset_mems_allowed() guaranteed to return some non-empty subset of node_states[N_MEMORY]. 2) nodes_and(*new, *new, task_nodes); -> after nodes_and(), the 'new' should be empty or appropriate nodemask(online node and with memory). After 1) and 2), we could remove unnecessary check whether the 'new' AND node_states[N_MEMORY] is empty. Link: http://lkml.kernel.org/r/20190806023634.55356-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Dan Williams Cc: Michal Hocko Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index f000771558d8..464406e8da91 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1512,10 +1512,6 @@ static int kernel_migrate_pages(pid_t pid, unsigned long maxnode, if (nodes_empty(*new)) goto out_put; - nodes_and(*new, *new, node_states[N_MEMORY]); - if (nodes_empty(*new)) - goto out_put; - err = security_task_movememory(task); if (err) goto out_put; From 8ac3f8fe91a2119522a73fbc41d354057054e6ed Mon Sep 17 00:00:00 2001 From: Joel Savitz Date: Mon, 23 Sep 2019 15:37:04 -0700 Subject: [PATCH 586/721] mm/oom_kill.c: add task UID to info message on an oom kill In the event of an oom kill, useful information about the killed process is printed to dmesg. Users, especially system administrators, will find it useful to immediately see the UID of the process. We already print uid when dumping eligible tasks so it is not overly hard to find that information in the oom report. However this information is unavailable when dumping of eligible tasks is disabled. In the following example, abuse_the_ram is the name of a program that attempts to iteratively allocate all available memory until it is stopped by force. Current message: Out of memory: Killed process 35389 (abuse_the_ram) total-vm:133718232kB, anon-rss:129624980kB, file-rss:0kB, shmem-rss:0kB Patched message: Out of memory: Killed process 2739 (abuse_the_ram), total-vm:133880028kB, anon-rss:129754836kB, file-rss:0kB, shmem-rss:0kB, UID:0 [akpm@linux-foundation.org: s/UID %d/UID:%u/ in printk] Link: http://lkml.kernel.org/r/1560362273-534-1-git-send-email-jsavitz@redhat.com Signed-off-by: Joel Savitz Suggested-by: David Rientjes Acked-by: Rafael Aquini Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index eda2e2a0bdc6..95872bdfec4e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -884,12 +884,13 @@ static void __oom_kill_process(struct task_struct *victim, const char *message) */ do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID); mark_oom_victim(victim); - pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", + pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u\n", message, task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), K(get_mm_counter(victim->mm, MM_ANONPAGES)), K(get_mm_counter(victim->mm, MM_FILEPAGES)), - K(get_mm_counter(victim->mm, MM_SHMEMPAGES))); + K(get_mm_counter(victim->mm, MM_SHMEMPAGES)), + from_kuid(&init_user_ns, task_uid(victim))); task_unlock(victim); /* From f9c645621a28e37813a1de96d9cbd89cde94a1e4 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Mon, 23 Sep 2019 15:37:08 -0700 Subject: [PATCH 587/721] memcg, oom: don't require __GFP_FS when invoking memcg OOM killer Masoud Sharbiani noticed that commit 29ef680ae7c21110 ("memcg, oom: move out_of_memory back to the charge path") broke memcg OOM called from __xfs_filemap_fault() path. It turned out that try_charge() is retrying forever without making forward progress because mem_cgroup_oom(GFP_NOFS) cannot invoke the OOM killer due to commit 3da88fb3bacfaa33 ("mm, oom: move GFP_NOFS check to out_of_memory"). Allowing forced charge due to being unable to invoke memcg OOM killer will lead to global OOM situation. Also, just returning -ENOMEM will be risky because OOM path is lost and some paths (e.g. get_user_pages()) will leak -ENOMEM. Therefore, invoking memcg OOM killer (despite GFP_NOFS) will be the only choice we can choose for now. Until 29ef680ae7c21110, we were able to invoke memcg OOM killer when GFP_KERNEL reclaim failed [1]. But since 29ef680ae7c21110, we need to invoke memcg OOM killer when GFP_NOFS reclaim failed [2]. Although in the past we did invoke memcg OOM killer for GFP_NOFS [3], we might get pre-mature memcg OOM reports due to this patch. [1] leaker invoked oom-killer: gfp_mask=0x6200ca(GFP_HIGHUSER_MOVABLE), nodemask=(null), order=0, oom_score_adj=0 CPU: 0 PID: 2746 Comm: leaker Not tainted 4.18.0+ #19 Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 04/13/2018 Call Trace: dump_stack+0x63/0x88 dump_header+0x67/0x27a ? mem_cgroup_scan_tasks+0x91/0xf0 oom_kill_process+0x210/0x410 out_of_memory+0x10a/0x2c0 mem_cgroup_out_of_memory+0x46/0x80 mem_cgroup_oom_synchronize+0x2e4/0x310 ? high_work_func+0x20/0x20 pagefault_out_of_memory+0x31/0x76 mm_fault_error+0x55/0x115 ? handle_mm_fault+0xfd/0x220 __do_page_fault+0x433/0x4e0 do_page_fault+0x22/0x30 ? page_fault+0x8/0x30 page_fault+0x1e/0x30 RIP: 0033:0x4009f0 Code: 03 00 00 00 e8 71 fd ff ff 48 83 f8 ff 49 89 c6 74 74 48 89 c6 bf c0 0c 40 00 31 c0 e8 69 fd ff ff 45 85 ff 7e 21 31 c9 66 90 <41> 0f be 14 0e 01 d3 f7 c1 ff 0f 00 00 75 05 41 c6 04 0e 2a 48 83 RSP: 002b:00007ffe29ae96f0 EFLAGS: 00010206 RAX: 000000000000001b RBX: 0000000000000000 RCX: 0000000001ce1000 RDX: 0000000000000000 RSI: 000000007fffffe5 RDI: 0000000000000000 RBP: 000000000000000c R08: 0000000000000000 R09: 00007f94be09220d R10: 0000000000000002 R11: 0000000000000246 R12: 00000000000186a0 R13: 0000000000000003 R14: 00007f949d845000 R15: 0000000002800000 Task in /leaker killed as a result of limit of /leaker memory: usage 524288kB, limit 524288kB, failcnt 158965 memory+swap: usage 0kB, limit 9007199254740988kB, failcnt 0 kmem: usage 2016kB, limit 9007199254740988kB, failcnt 0 Memory cgroup stats for /leaker: cache:844KB rss:521136KB rss_huge:0KB shmem:0KB mapped_file:0KB dirty:132KB writeback:0KB inactive_anon:0KB active_anon:521224KB inactive_file:1012KB active_file:8KB unevictable:0KB Memory cgroup out of memory: Kill process 2746 (leaker) score 998 or sacrifice child Killed process 2746 (leaker) total-vm:536704kB, anon-rss:521176kB, file-rss:1208kB, shmem-rss:0kB oom_reaper: reaped process 2746 (leaker), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB [2] leaker invoked oom-killer: gfp_mask=0x600040(GFP_NOFS), nodemask=(null), order=0, oom_score_adj=0 CPU: 1 PID: 2746 Comm: leaker Not tainted 4.18.0+ #20 Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 04/13/2018 Call Trace: dump_stack+0x63/0x88 dump_header+0x67/0x27a ? mem_cgroup_scan_tasks+0x91/0xf0 oom_kill_process+0x210/0x410 out_of_memory+0x109/0x2d0 mem_cgroup_out_of_memory+0x46/0x80 try_charge+0x58d/0x650 ? __radix_tree_replace+0x81/0x100 mem_cgroup_try_charge+0x7a/0x100 __add_to_page_cache_locked+0x92/0x180 add_to_page_cache_lru+0x4d/0xf0 iomap_readpages_actor+0xde/0x1b0 ? iomap_zero_range_actor+0x1d0/0x1d0 iomap_apply+0xaf/0x130 iomap_readpages+0x9f/0x150 ? iomap_zero_range_actor+0x1d0/0x1d0 xfs_vm_readpages+0x18/0x20 [xfs] read_pages+0x60/0x140 __do_page_cache_readahead+0x193/0x1b0 ondemand_readahead+0x16d/0x2c0 page_cache_async_readahead+0x9a/0xd0 filemap_fault+0x403/0x620 ? alloc_set_pte+0x12c/0x540 ? _cond_resched+0x14/0x30 __xfs_filemap_fault+0x66/0x180 [xfs] xfs_filemap_fault+0x27/0x30 [xfs] __do_fault+0x19/0x40 __handle_mm_fault+0x8e8/0xb60 handle_mm_fault+0xfd/0x220 __do_page_fault+0x238/0x4e0 do_page_fault+0x22/0x30 ? page_fault+0x8/0x30 page_fault+0x1e/0x30 RIP: 0033:0x4009f0 Code: 03 00 00 00 e8 71 fd ff ff 48 83 f8 ff 49 89 c6 74 74 48 89 c6 bf c0 0c 40 00 31 c0 e8 69 fd ff ff 45 85 ff 7e 21 31 c9 66 90 <41> 0f be 14 0e 01 d3 f7 c1 ff 0f 00 00 75 05 41 c6 04 0e 2a 48 83 RSP: 002b:00007ffda45c9290 EFLAGS: 00010206 RAX: 000000000000001b RBX: 0000000000000000 RCX: 0000000001a1e000 RDX: 0000000000000000 RSI: 000000007fffffe5 RDI: 0000000000000000 RBP: 000000000000000c R08: 0000000000000000 R09: 00007f6d061ff20d R10: 0000000000000002 R11: 0000000000000246 R12: 00000000000186a0 R13: 0000000000000003 R14: 00007f6ce59b2000 R15: 0000000002800000 Task in /leaker killed as a result of limit of /leaker memory: usage 524288kB, limit 524288kB, failcnt 7221 memory+swap: usage 0kB, limit 9007199254740988kB, failcnt 0 kmem: usage 1944kB, limit 9007199254740988kB, failcnt 0 Memory cgroup stats for /leaker: cache:3632KB rss:518232KB rss_huge:0KB shmem:0KB mapped_file:0KB dirty:0KB writeback:0KB inactive_anon:0KB active_anon:518408KB inactive_file:3908KB active_file:12KB unevictable:0KB Memory cgroup out of memory: Kill process 2746 (leaker) score 992 or sacrifice child Killed process 2746 (leaker) total-vm:536704kB, anon-rss:518264kB, file-rss:1188kB, shmem-rss:0kB oom_reaper: reaped process 2746 (leaker), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB [3] leaker invoked oom-killer: gfp_mask=0x50, order=0, oom_score_adj=0 leaker cpuset=/ mems_allowed=0 CPU: 1 PID: 3206 Comm: leaker Not tainted 3.10.0-957.27.2.el7.x86_64 #1 Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 04/13/2018 Call Trace: [] dump_stack+0x19/0x1b [] dump_header+0x90/0x229 [] ? find_lock_task_mm+0x56/0xc0 [] ? try_get_mem_cgroup_from_mm+0x28/0x60 [] oom_kill_process+0x254/0x3d0 [] mem_cgroup_oom_synchronize+0x546/0x570 [] ? mem_cgroup_charge_common+0xc0/0xc0 [] pagefault_out_of_memory+0x14/0x90 [] mm_fault_error+0x6a/0x157 [] __do_page_fault+0x3c8/0x4f0 [] do_page_fault+0x35/0x90 [] page_fault+0x28/0x30 Task in /leaker killed as a result of limit of /leaker memory: usage 524288kB, limit 524288kB, failcnt 20628 memory+swap: usage 524288kB, limit 9007199254740988kB, failcnt 0 kmem: usage 0kB, limit 9007199254740988kB, failcnt 0 Memory cgroup stats for /leaker: cache:840KB rss:523448KB rss_huge:0KB mapped_file:0KB swap:0KB inactive_anon:0KB active_anon:523448KB inactive_file:464KB active_file:376KB unevictable:0KB Memory cgroup out of memory: Kill process 3206 (leaker) score 970 or sacrifice child Killed process 3206 (leaker) total-vm:536692kB, anon-rss:523304kB, file-rss:412kB, shmem-rss:0kB Bisected by Masoud Sharbiani. Link: http://lkml.kernel.org/r/cbe54ed1-b6ba-a056-8899-2dc42526371d@i-love.sakura.ne.jp Fixes: 3da88fb3bacfaa33 ("mm, oom: move GFP_NOFS check to out_of_memory") [necessary after 29ef680ae7c21110] Signed-off-by: Tetsuo Handa Reported-by: Masoud Sharbiani Tested-by: Masoud Sharbiani Acked-by: Michal Hocko Cc: David Rientjes Cc: [4.19+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 95872bdfec4e..a6b76624f5b8 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -1069,9 +1069,10 @@ bool out_of_memory(struct oom_control *oc) * The OOM killer does not compensate for IO-less reclaim. * pagefault_out_of_memory lost its gfp context so we have to * make sure exclude 0 mask - all other users should have at least - * ___GFP_DIRECT_RECLAIM to get here. + * ___GFP_DIRECT_RECLAIM to get here. But mem_cgroup_oom() has to + * invoke the OOM killer even if it is a GFP_NOFS allocation. */ - if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS)) + if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc)) return true; /* From 70cb6d2677905121bfc7fdf5babfd8444218edd9 Mon Sep 17 00:00:00 2001 From: Edward Chron Date: Mon, 23 Sep 2019 15:37:11 -0700 Subject: [PATCH 588/721] mm/oom: add oom_score_adj and pgtables to Killed process message For an OOM event: print oom_score_adj value for the OOM Killed process to document what the oom score adjust value was at the time the process was OOM Killed. The adjustment value can be set by user code and it affects the resulting oom_score so it is used to influence kill process selection. When eligible tasks are not printed (sysctl oom_dump_tasks = 0) printing this value is the only documentation of the value for the process being killed. Having this value on the Killed process message is useful to document if a miscconfiguration occurred or to confirm that the oom_score_adj configuration applies as expected. An example which illustates both misconfiguration and validation that the oom_score_adj was applied as expected is: Aug 14 23:00:02 testserver kernel: Out of memory: Killed process 2692 (systemd-udevd) total-vm:1056800kB, anon-rss:1052760kB, file-rss:4kB, shmem-rss:0kB pgtables:22kB oom_score_adj:1000 The systemd-udevd is a critical system application that should have an oom_score_adj of -1000. It was miconfigured to have a adjustment of 1000 making it a highly favored OOM kill target process. The output documents both the misconfiguration and the fact that the process was correctly targeted by OOM due to the miconfiguration. This can be quite helpful for triage and problem determination. The addition of the pgtables_bytes shows page table usage by the process and is a useful measure of the memory size of the process. Link: http://lkml.kernel.org/r/20190822173157.1569-1-echron@arista.com Signed-off-by: Edward Chron Acked-by: Michal Hocko Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index a6b76624f5b8..76000feb0794 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -884,13 +884,13 @@ static void __oom_kill_process(struct task_struct *victim, const char *message) */ do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID); mark_oom_victim(victim); - pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u\n", - message, task_pid_nr(victim), victim->comm, - K(victim->mm->total_vm), - K(get_mm_counter(victim->mm, MM_ANONPAGES)), - K(get_mm_counter(victim->mm, MM_FILEPAGES)), - K(get_mm_counter(victim->mm, MM_SHMEMPAGES)), - from_kuid(&init_user_ns, task_uid(victim))); + pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%hd\n", + message, task_pid_nr(victim), victim->comm, K(mm->total_vm), + K(get_mm_counter(mm, MM_ANONPAGES)), + K(get_mm_counter(mm, MM_FILEPAGES)), + K(get_mm_counter(mm, MM_SHMEMPAGES)), + from_kuid(&init_user_ns, task_uid(victim)), + mm_pgtables_bytes(mm), victim->signal->oom_score_adj); task_unlock(victim); /* From f364f06b34b55285df7b132b4e3752d820412ad4 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Mon, 23 Sep 2019 15:37:14 -0700 Subject: [PATCH 589/721] mm/oom_kill.c: fix oom_cpuset_eligible() comment Commit ac311a14c682 ("oom: decouple mems_allowed from oom_unkillable_task") changed has_intersects_mems_allowed() to oom_cpuset_eligible(), but didn't change the comment. Link: http://lkml.kernel.org/r/1566959929-10638-1-git-send-email-wang.yi59@zte.com.cn Signed-off-by: Yi Wang Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 76000feb0794..1530fccdbe4f 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -73,7 +73,7 @@ static inline bool is_memcg_oom(struct oom_control *oc) /** * oom_cpuset_eligible() - check task eligiblity for kill * @start: task struct of which task to consider - * @mask: nodemask passed to page allocator for mempolicy ooms + * @oc: pointer to struct oom_control * * Task eligibility is determined by whether or not a candidate task, @tsk, * shares the same mempolicy nodes as current if it is bound by such a policy From 1eb41bb07e568629805acb8455c44b6709819551 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 23 Sep 2019 15:37:16 -0700 Subject: [PATCH 590/721] mm, oom: consider present pages for the node size constrained_alloc() calculates the size of the oom domain by using node_spanned_pages which is incorrect because this is the full range of the physical memory range that the numa node occupies rather than the memory that backs that range which is represented by node_present_pages. Sparsely populated nodes (e.g. after memory hot remove or simply sparse due to memory layout) can have really a large difference between the two. This shouldn't really cause any real user observable problems because the oom calculates a ratio against totalpages and used memory cannot exceed present pages but it is confusing and wrong from code point of view. Link: http://lkml.kernel.org/r/20190829163443.899-1-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: David Hildenbrand Reviewed-by: David Hildenbrand Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 1530fccdbe4f..c1d9496b4c43 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -287,7 +287,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) { oc->totalpages = total_swap_pages; for_each_node_mask(nid, *oc->nodemask) - oc->totalpages += node_spanned_pages(nid); + oc->totalpages += node_present_pages(nid); return CONSTRAINT_MEMORY_POLICY; } @@ -300,7 +300,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) if (cpuset_limited) { oc->totalpages = total_swap_pages; for_each_node_mask(nid, cpuset_current_mems_allowed) - oc->totalpages += node_spanned_pages(nid); + oc->totalpages += node_present_pages(nid); return CONSTRAINT_CPUSET; } return CONSTRAINT_NONE; From 4d0e3230a56a75bf26d74b3f4b6cb05a0e9fae60 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Mon, 23 Sep 2019 15:37:19 -0700 Subject: [PATCH 591/721] mm/memcontrol.c: fix a -Wunused-function warning mem_cgroup_id_get() was introduced in commit 73f576c04b94 ("mm:memcontrol: fix cgroup creation failure after many small jobs"). Later, it no longer has any user since the commits, 1f47b61fb407 ("mm: memcontrol: fix swap counter leak on swapout from offline cgroup") 58fa2a5512d9 ("mm: memcontrol: add sanity checks for memcg->id.ref on get/put") so safe to remove it. Link: http://lkml.kernel.org/r/1568648453-5482-1-git-send-email-cai@lca.pw Signed-off-by: Qian Cai Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 80131025f2da..701c1ab79d24 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4930,11 +4930,6 @@ static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) } } -static inline void mem_cgroup_id_get(struct mem_cgroup *memcg) -{ - mem_cgroup_id_get_many(memcg, 1); -} - static inline void mem_cgroup_id_put(struct mem_cgroup *memcg) { mem_cgroup_id_put_many(memcg, 1); From 0158115f702b0ba208ab0b5adf44cae99b3ebcc7 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 23 Sep 2019 15:37:22 -0700 Subject: [PATCH 592/721] memcg, kmem: deprecate kmem.limit_in_bytes Cgroup v1 memcg controller has exposed a dedicated kmem limit to users which turned out to be really a bad idea because there are paths which cannot shrink the kernel memory usage enough to get below the limit (e.g. because the accounted memory is not reclaimable). There are cases when the failure is even not allowed (e.g. __GFP_NOFAIL). This means that the kmem limit is in excess to the hard limit without any way to shrink and thus completely useless. OOM killer cannot be invoked to handle the situation because that would lead to a premature oom killing. As a result many places might see ENOMEM returning from kmalloc and result in unexpected errors. E.g. a global OOM killer when there is a lot of free memory because ENOMEM is translated into VM_FAULT_OOM in #PF path and therefore pagefault_out_of_memory would result in OOM killer. Please note that the kernel memory is still accounted to the overall limit along with the user memory so removing the kmem specific limit should still allow to contain kernel memory consumption. Unlike the kmem one, though, it invokes memory reclaim and targeted memcg oom killing if necessary. Start the deprecation process by crying to the kernel log. Let's see whether there are relevant usecases and simply return to EINVAL in the second stage if nobody complains in few releases. [akpm@linux-foundation.org: tweak documentation text] Link: http://lkml.kernel.org/r/20190911151612.GI4023@dhcp22.suse.cz Signed-off-by: Michal Hocko Reviewed-by: Shakeel Butt Cc: Johannes Weiner Cc: Vladimir Davydov Cc: Andrey Ryabinin Cc: Thomas Lindroth Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/cgroup-v1/memory.rst | 4 +++- mm/memcontrol.c | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index 41bdc038dad9..0ae4f564c2d6 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -85,8 +85,10 @@ Brief summary of control files. memory.oom_control set/show oom controls. memory.numa_stat show the number of memory usage per numa node - memory.kmem.limit_in_bytes set/show hard limit for kernel memory + This knob is deprecated and shouldn't be + used. It is planned that this be removed in + the foreseeable future. memory.kmem.usage_in_bytes show current kernel memory allocation memory.kmem.failcnt show the number of kernel memory usage hits limits diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 701c1ab79d24..7bb2971dc75e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3637,6 +3637,9 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, ret = mem_cgroup_resize_max(memcg, nr_pages, true); break; case _KMEM: + pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. " + "Please report your usecase to linux-mm@kvack.org if you " + "depend on this functionality.\n"); ret = memcg_update_kmem_max(memcg, nr_pages); break; case _TCP: From 1c6c15971e4709953f75082a5d44212536b1c2b7 Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Mon, 23 Sep 2019 15:37:26 -0700 Subject: [PATCH 593/721] mm, reclaim: make should_continue_reclaim perform dryrun detection Patch series "address hugetlb page allocation stalls", v2. Allocation of hugetlb pages via sysctl or procfs can stall for minutes or hours. A simple example on a two node system with 8GB of memory is as follows: echo 4096 > /sys/devices/system/node/node1/hugepages/hugepages-2048kB/nr_hugepages echo 4096 > /proc/sys/vm/nr_hugepages Obviously, both allocation attempts will fall short of their 8GB goal. However, one or both of these commands may stall and not be interruptible. The issues were initially discussed in mail thread [1] and RFC code at [2]. This series addresses the issues causing the stalls. There are two distinct fixes, a cleanup, and an optimization. The reclaim patch by Hillf and compaction patch by Vlasitmil address corner cases in their respective areas. hugetlb page allocation could stall due to either of these issues. Vlasitmil added a cleanup patch after Hillf's modifications. The hugetlb patch by Mike is an optimization suggested during the debug and development process. [1] http://lkml.kernel.org/r/d38a095e-dc39-7e82-bb76-2c9247929f07@oracle.com [2] http://lkml.kernel.org/r/20190724175014.9935-1-mike.kravetz@oracle.com This patch (of 4): Address the issue of should_continue_reclaim returning true too often for __GFP_RETRY_MAYFAIL attempts when !nr_reclaimed and nr_scanned. This was observed during hugetlb page allocation causing stalls for minutes or hours. We can stop reclaiming pages if compaction reports it can make a progress. There might be side-effects for other high-order allocations that would potentially benefit from reclaiming more before compaction so that they would be faster and less likely to stall. However, the consequences of premature/over-reclaim are considered worse. We can also bail out of reclaiming pages if we know that there are not enough inactive lru pages left to satisfy the costly allocation. We can give up reclaiming pages too if we see dryrun occur, with the certainty of plenty of inactive pages. IOW with dryrun detected, we are sure we have reclaimed as many pages as we could. Link: http://lkml.kernel.org/r/20190806014744.15446-2-mike.kravetz@oracle.com Signed-off-by: Hillf Danton Signed-off-by: Mike Kravetz Tested-by: Mike Kravetz Acked-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: Michal Hocko Cc: Vlastimil Babka Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 2ee4d9283738..83b5d5280e99 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2620,18 +2620,6 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, return false; } - /* - * If we have not reclaimed enough pages for compaction and the - * inactive lists are large enough, continue reclaiming - */ - pages_for_compaction = compact_gap(sc->order); - inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE); - if (get_nr_swap_pages() > 0) - inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON); - if (sc->nr_reclaimed < pages_for_compaction && - inactive_lru_pages > pages_for_compaction) - return true; - /* If compaction would go ahead or the allocation would succeed, stop */ for (z = 0; z <= sc->reclaim_idx; z++) { struct zone *zone = &pgdat->node_zones[z]; @@ -2647,7 +2635,21 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, ; } } - return true; + + /* + * If we have not reclaimed enough pages for compaction and the + * inactive lists are large enough, continue reclaiming + */ + pages_for_compaction = compact_gap(sc->order); + inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE); + if (get_nr_swap_pages() > 0) + inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON); + + return inactive_lru_pages > pages_for_compaction && + /* + * avoid dryrun with plenty of inactive pages + */ + nr_scanned && nr_reclaimed; } static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg) From 5ee04716c46ce58989b1256a98af1af89f385db8 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 23 Sep 2019 15:37:29 -0700 Subject: [PATCH 594/721] mm, reclaim: cleanup should_continue_reclaim() After commit "mm, reclaim: make should_continue_reclaim perform dryrun detection", closer look at the function shows, that nr_reclaimed == 0 means the function will always return false. And since non-zero nr_reclaimed implies non_zero nr_scanned, testing nr_scanned serves no purpose, and so does the testing for __GFP_RETRY_MAYFAIL. This patch thus cleans up the function to test only !nr_reclaimed upfront, and remove the __GFP_RETRY_MAYFAIL test and nr_scanned parameter completely. Comment is also updated, explaining that approximating "full LRU list has been scanned" with nr_scanned == 0 didn't really work. Link: http://lkml.kernel.org/r/20190806014744.15446-3-mike.kravetz@oracle.com Signed-off-by: Vlastimil Babka Signed-off-by: Mike Kravetz Acked-by: Mike Kravetz Cc: Hillf Danton Cc: Johannes Weiner Cc: Mel Gorman Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 43 ++++++++++++++----------------------------- 1 file changed, 14 insertions(+), 29 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 83b5d5280e99..c27dd62ed594 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2586,7 +2586,6 @@ static bool in_reclaim_compaction(struct scan_control *sc) */ static inline bool should_continue_reclaim(struct pglist_data *pgdat, unsigned long nr_reclaimed, - unsigned long nr_scanned, struct scan_control *sc) { unsigned long pages_for_compaction; @@ -2597,28 +2596,18 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, if (!in_reclaim_compaction(sc)) return false; - /* Consider stopping depending on scan and reclaim activity */ - if (sc->gfp_mask & __GFP_RETRY_MAYFAIL) { - /* - * For __GFP_RETRY_MAYFAIL allocations, stop reclaiming if the - * full LRU list has been scanned and we are still failing - * to reclaim pages. This full LRU scan is potentially - * expensive but a __GFP_RETRY_MAYFAIL caller really wants to succeed - */ - if (!nr_reclaimed && !nr_scanned) - return false; - } else { - /* - * For non-__GFP_RETRY_MAYFAIL allocations which can presumably - * fail without consequence, stop if we failed to reclaim - * any pages from the last SWAP_CLUSTER_MAX number of - * pages that were scanned. This will return to the - * caller faster at the risk reclaim/compaction and - * the resulting allocation attempt fails - */ - if (!nr_reclaimed) - return false; - } + /* + * Stop if we failed to reclaim any pages from the last SWAP_CLUSTER_MAX + * number of pages that were scanned. This will return to the caller + * with the risk reclaim/compaction and the resulting allocation attempt + * fails. In the past we have tried harder for __GFP_RETRY_MAYFAIL + * allocations through requiring that the full LRU list has been scanned + * first, by assuming that zero delta of sc->nr_scanned means full LRU + * scan, but that approximation was wrong, and there were corner cases + * where always a non-zero amount of pages were scanned. + */ + if (!nr_reclaimed) + return false; /* If compaction would go ahead or the allocation would succeed, stop */ for (z = 0; z <= sc->reclaim_idx; z++) { @@ -2645,11 +2634,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, if (get_nr_swap_pages() > 0) inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON); - return inactive_lru_pages > pages_for_compaction && - /* - * avoid dryrun with plenty of inactive pages - */ - nr_scanned && nr_reclaimed; + return inactive_lru_pages > pages_for_compaction; } static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg) @@ -2794,7 +2779,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) wait_iff_congested(BLK_RW_ASYNC, HZ/10); } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, - sc->nr_scanned - nr_scanned, sc)); + sc)); /* * Kswapd gives up on balancing particular nodes after too From 494330855641269c8a49f1580f0d4e2ead693245 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 23 Sep 2019 15:37:32 -0700 Subject: [PATCH 595/721] mm, compaction: raise compaction priority after it withdrawns Mike Kravetz reports that "hugetlb allocations could stall for minutes or hours when should_compact_retry() would return true more often then it should. Specifically, this was in the case where compact_result was COMPACT_DEFERRED and COMPACT_PARTIAL_SKIPPED and no progress was being made." The problem is that the compaction_withdrawn() test in should_compact_retry() includes compaction outcomes that are only possible on low compaction priority, and results in a retry without increasing the priority. This may result in furter reclaim, and more incomplete compaction attempts. With this patch, compaction priority is raised when possible, or should_compact_retry() returns false. The COMPACT_SKIPPED result doesn't really fit together with the other outcomes in compaction_withdrawn(), as that's a result caused by insufficient order-0 pages, not due to low compaction priority. With this patch, it is moved to a new compaction_needs_reclaim() function, and for that outcome we keep the current logic of retrying if it looks like reclaim will be able to help. Link: http://lkml.kernel.org/r/20190806014744.15446-4-mike.kravetz@oracle.com Reported-by: Mike Kravetz Signed-off-by: Vlastimil Babka Signed-off-by: Mike Kravetz Tested-by: Mike Kravetz Cc: Hillf Danton Cc: Johannes Weiner Cc: Mel Gorman Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 22 +++++++++++++++++----- mm/page_alloc.c | 18 +++++++++++++----- 2 files changed, 30 insertions(+), 10 deletions(-) diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 9569e7c786d3..4b898cdbdf05 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -129,11 +129,8 @@ static inline bool compaction_failed(enum compact_result result) return false; } -/* - * Compaction has backed off for some reason. It might be throttling or - * lock contention. Retrying is still worthwhile. - */ -static inline bool compaction_withdrawn(enum compact_result result) +/* Compaction needs reclaim to be performed first, so it can continue. */ +static inline bool compaction_needs_reclaim(enum compact_result result) { /* * Compaction backed off due to watermark checks for order-0 @@ -142,6 +139,16 @@ static inline bool compaction_withdrawn(enum compact_result result) if (result == COMPACT_SKIPPED) return true; + return false; +} + +/* + * Compaction has backed off for some reason after doing some work or none + * at all. It might be throttling or lock contention. Retrying might be still + * worthwhile, but with a higher priority if allowed. + */ +static inline bool compaction_withdrawn(enum compact_result result) +{ /* * If compaction is deferred for high-order allocations, it is * because sync compaction recently failed. If this is the case @@ -207,6 +214,11 @@ static inline bool compaction_failed(enum compact_result result) return false; } +static inline bool compaction_needs_reclaim(enum compact_result result) +{ + return false; +} + static inline bool compaction_withdrawn(enum compact_result result) { return true; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index df566c0f6729..591e026534c2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3955,16 +3955,24 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, goto check_priority; /* - * make sure the compaction wasn't deferred or didn't bail out early - * due to locks contention before we declare that we should give up. - * But do not retry if the given zonelist is not suitable for - * compaction. + * compaction was skipped because there are not enough order-0 pages + * to work with, so we retry only if it looks like reclaim can help. */ - if (compaction_withdrawn(compact_result)) { + if (compaction_needs_reclaim(compact_result)) { ret = compaction_zonelist_suitable(ac, order, alloc_flags); goto out; } + /* + * make sure the compaction wasn't deferred or didn't bail out early + * due to locks contention before we declare that we should give up. + * But the next retry should use a higher priority if allowed, so + * we don't just keep bailing out endlessly. + */ + if (compaction_withdrawn(compact_result)) { + goto check_priority; + } + /* * !costly requests are much more important than __GFP_RETRY_MAYFAIL * costly ones because they are de facto nofail and invoke OOM From f60858f9d327c4dd0c432abe9ec943a83929c229 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Mon, 23 Sep 2019 15:37:35 -0700 Subject: [PATCH 596/721] hugetlbfs: don't retry when pool page allocations start to fail When allocating hugetlbfs pool pages via /proc/sys/vm/nr_hugepages, the pages will be interleaved between all nodes of the system. If nodes are not equal, it is quite possible for one node to fill up before the others. When this happens, the code still attempts to allocate pages from the full node. This results in calls to direct reclaim and compaction which slow things down considerably. When allocating pool pages, note the state of the previous allocation for each node. If previous allocation failed, do not use the aggressive retry algorithm on successive attempts. The allocation will still succeed if there is memory available, but it will not try as hard to free up memory. Link: http://lkml.kernel.org/r/20190806014744.15446-5-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Acked-by: Vlastimil Babka Cc: Hillf Danton Cc: Johannes Weiner Cc: Mel Gorman Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 89 ++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 79 insertions(+), 10 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6d7296dd11b8..ef37c85423a5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1405,12 +1405,25 @@ pgoff_t __basepage_index(struct page *page) } static struct page *alloc_buddy_huge_page(struct hstate *h, - gfp_t gfp_mask, int nid, nodemask_t *nmask) + gfp_t gfp_mask, int nid, nodemask_t *nmask, + nodemask_t *node_alloc_noretry) { int order = huge_page_order(h); struct page *page; + bool alloc_try_hard = true; - gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN; + /* + * By default we always try hard to allocate the page with + * __GFP_RETRY_MAYFAIL flag. However, if we are allocating pages in + * a loop (to adjust global huge page counts) and previous allocation + * failed, do not continue to try hard on the same node. Use the + * node_alloc_noretry bitmap to manage this state information. + */ + if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry)) + alloc_try_hard = false; + gfp_mask |= __GFP_COMP|__GFP_NOWARN; + if (alloc_try_hard) + gfp_mask |= __GFP_RETRY_MAYFAIL; if (nid == NUMA_NO_NODE) nid = numa_mem_id(); page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask); @@ -1419,6 +1432,22 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, else __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); + /* + * If we did not specify __GFP_RETRY_MAYFAIL, but still got a page this + * indicates an overall state change. Clear bit so that we resume + * normal 'try hard' allocations. + */ + if (node_alloc_noretry && page && !alloc_try_hard) + node_clear(nid, *node_alloc_noretry); + + /* + * If we tried hard to get a page but failed, set bit so that + * subsequent attempts will not try as hard until there is an + * overall state change. + */ + if (node_alloc_noretry && !page && alloc_try_hard) + node_set(nid, *node_alloc_noretry); + return page; } @@ -1427,7 +1456,8 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, * should use this function to get new hugetlb pages */ static struct page *alloc_fresh_huge_page(struct hstate *h, - gfp_t gfp_mask, int nid, nodemask_t *nmask) + gfp_t gfp_mask, int nid, nodemask_t *nmask, + nodemask_t *node_alloc_noretry) { struct page *page; @@ -1435,7 +1465,7 @@ static struct page *alloc_fresh_huge_page(struct hstate *h, page = alloc_gigantic_page(h, gfp_mask, nid, nmask); else page = alloc_buddy_huge_page(h, gfp_mask, - nid, nmask); + nid, nmask, node_alloc_noretry); if (!page) return NULL; @@ -1450,14 +1480,16 @@ static struct page *alloc_fresh_huge_page(struct hstate *h, * Allocates a fresh page to the hugetlb allocator pool in the node interleaved * manner. */ -static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed) +static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, + nodemask_t *node_alloc_noretry) { struct page *page; int nr_nodes, node; gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { - page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed); + page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed, + node_alloc_noretry); if (page) break; } @@ -1601,7 +1633,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, goto out_unlock; spin_unlock(&hugetlb_lock); - page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask); + page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL); if (!page) return NULL; @@ -1637,7 +1669,7 @@ struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, if (hstate_is_gigantic(h)) return NULL; - page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask); + page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL); if (!page) return NULL; @@ -2207,13 +2239,33 @@ static void __init gather_bootmem_prealloc(void) static void __init hugetlb_hstate_alloc_pages(struct hstate *h) { unsigned long i; + nodemask_t *node_alloc_noretry; + + if (!hstate_is_gigantic(h)) { + /* + * Bit mask controlling how hard we retry per-node allocations. + * Ignore errors as lower level routines can deal with + * node_alloc_noretry == NULL. If this kmalloc fails at boot + * time, we are likely in bigger trouble. + */ + node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry), + GFP_KERNEL); + } else { + /* allocations done at boot time */ + node_alloc_noretry = NULL; + } + + /* bit mask controlling how hard we retry per-node allocations */ + if (node_alloc_noretry) + nodes_clear(*node_alloc_noretry); for (i = 0; i < h->max_huge_pages; ++i) { if (hstate_is_gigantic(h)) { if (!alloc_bootmem_huge_page(h)) break; } else if (!alloc_pool_huge_page(h, - &node_states[N_MEMORY])) + &node_states[N_MEMORY], + node_alloc_noretry)) break; cond_resched(); } @@ -2225,6 +2277,8 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) h->max_huge_pages, buf, i); h->max_huge_pages = i; } + + kfree(node_alloc_noretry); } static void __init hugetlb_init_hstates(void) @@ -2323,6 +2377,17 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, nodemask_t *nodes_allowed) { unsigned long min_count, ret; + NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL); + + /* + * Bit mask controlling how hard we retry per-node allocations. + * If we can not allocate the bit mask, do not attempt to allocate + * the requested huge pages. + */ + if (node_alloc_noretry) + nodes_clear(*node_alloc_noretry); + else + return -ENOMEM; spin_lock(&hugetlb_lock); @@ -2356,6 +2421,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) { if (count > persistent_huge_pages(h)) { spin_unlock(&hugetlb_lock); + NODEMASK_FREE(node_alloc_noretry); return -EINVAL; } /* Fall through to decrease pool */ @@ -2388,7 +2454,8 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, /* yield cpu to avoid soft lockup */ cond_resched(); - ret = alloc_pool_huge_page(h, nodes_allowed); + ret = alloc_pool_huge_page(h, nodes_allowed, + node_alloc_noretry); spin_lock(&hugetlb_lock); if (!ret) goto out; @@ -2429,6 +2496,8 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, h->max_huge_pages = persistent_huge_pages(h); spin_unlock(&hugetlb_lock); + NODEMASK_FREE(node_alloc_noretry); + return 0; } From 276f756d7002ceb66f8ae3f7bd8028e7ae3deed8 Mon Sep 17 00:00:00 2001 From: Pingfan Liu Date: Mon, 23 Sep 2019 15:37:38 -0700 Subject: [PATCH 597/721] mm/migrate.c: clean up useless code in migrate_vma_collect_pmd() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove unused 'pfn' variable. Link: http://lkml.kernel.org/r/1565167272-21453-1-git-send-email-kernelfans@gmail.com Signed-off-by: Pingfan Liu Reviewed-by: Andrew Morton Reviewed-by: Ralph Campbell Cc: "Jérôme Glisse" Cc: Mel Gorman Cc: Jan Kara Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Mike Kravetz Cc: Andrea Arcangeli Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 374ef2fcb722..73d476d690b1 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2218,17 +2218,15 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, pte_t pte; pte = *ptep; - pfn = pte_pfn(pte); if (pte_none(pte)) { mpfn = MIGRATE_PFN_MIGRATE; migrate->cpages++; - pfn = 0; goto next; } if (!pte_present(pte)) { - mpfn = pfn = 0; + mpfn = 0; /* * Only care about unaddressable device page special @@ -2245,10 +2243,10 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, if (is_write_device_private_entry(entry)) mpfn |= MIGRATE_PFN_WRITE; } else { + pfn = pte_pfn(pte); if (is_zero_pfn(pfn)) { mpfn = MIGRATE_PFN_MIGRATE; migrate->cpages++; - pfn = 0; goto next; } page = vm_normal_page(migrate->vma, addr, pte); @@ -2258,10 +2256,9 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, /* FIXME support THP */ if (!page || !page->mapping || PageTransCompound(page)) { - mpfn = pfn = 0; + mpfn = 0; goto next; } - pfn = page_to_pfn(page); /* * By getting a reference on the page we pin it and that blocks From 9ef258bad7af2f6abee2bdfaaa6c41890d4f4db6 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 23 Sep 2019 15:37:41 -0700 Subject: [PATCH 598/721] thp: update split_huge_page_pmd() comment According to 78ddc5347341 ("thp: rename split_huge_page_pmd() to split_huge_pmd()"), update related comment. Link: http://lkml.kernel.org/r/20190731033406.185285-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: "Kirill A . Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/book3s64/hash_utils.c | 2 +- include/asm-generic/pgtable.h | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index 3410ea9f4de1..6c123760164e 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -1748,7 +1748,7 @@ void flush_hash_hugepage(unsigned long vsid, unsigned long addr, /* * IF we try to do a HUGE PTE update after a withdraw is done. * we will find the below NULL. This happens when we do - * split_huge_page_pmd + * split_huge_pmd */ if (!hpte_slot_array) return; diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index fae6abb3d586..818691846c90 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -1002,9 +1002,8 @@ static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd) * need this). If THP is not enabled, the pmd can't go away under the * code even if MADV_DONTNEED runs, but if THP is enabled we need to * run a pmd_trans_unstable before walking the ptes after - * split_huge_page_pmd returns (because it may have run when the pmd - * become null, but then a page fault can map in a THP and not a - * regular page). + * split_huge_pmd returns (because it may have run when the pmd become + * null, but then a page fault can map in a THP and not a regular page). */ static inline int pmd_trans_unstable(pmd_t *pmd) { From 585e5a7babd91fd85a5cc97b7324c6c2fc29e1ec Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Sep 2019 15:37:44 -0700 Subject: [PATCH 599/721] filemap: check compound_head(page)->mapping in filemap_fault() Patch series "Enable THP for text section of non-shmem files", v10; This patchset follows up discussion at LSF/MM 2019. The motivation is to put text section of an application in THP, and thus reduces iTLB miss rate and improves performance. Both Facebook and Oracle showed strong interests to this feature. To make reviews easier, this set aims a mininal valid product. Current version of the work does not have any changes to file system specific code. This comes with some limitations (discussed later). This set enables an application to "hugify" its text section by simply running something like: madvise(0x600000, 0x80000, MADV_HUGEPAGE); Before this call, the /proc//maps looks like: 00400000-074d0000 r-xp 00000000 00:27 2006927 app After this call, part of the text section is split out and mapped to THP: 00400000-00425000 r-xp 00000000 00:27 2006927 app 00600000-00e00000 r-xp 00200000 00:27 2006927 app <<< on THP 00e00000-074d0000 r-xp 00a00000 00:27 2006927 app Limitations: 1. This only works for text section (vma with VM_DENYWRITE). 2. Original limitation #2 is removed in v3. We gated this feature with an experimental config, READ_ONLY_THP_FOR_FS. Once we get better support on the write path, we can remove the config and enable it by default. Tested cases: 1. Tested with btrfs and ext4. 2. Tested with real work application (memcache like caching service). 3. Tested with "THP aware uprobe": https://patchwork.kernel.org/project/linux-mm/list/?series=131339 This patch (of 7): Currently, filemap_fault() avoids race condition with truncate by checking page->mapping == mapping. This does not work for compound pages. This patch let it check compound_head(page)->mapping instead. Link: http://lkml.kernel.org/r/20190801184244.3169074-2-songliubraving@fb.com Signed-off-by: Song Liu Acked-by: Rik van Riel Acked-by: Kirill A. Shutemov Acked-by: Johannes Weiner Cc: William Kucharski Cc: Hillf Danton Cc: Hugh Dickins Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index 533f271d6839..fc7818b4537e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2540,7 +2540,7 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) goto out_retry; /* Did it get truncated? */ - if (unlikely(page->mapping != mapping)) { + if (unlikely(compound_head(page)->mapping != mapping)) { unlock_page(page); put_page(page); goto retry_find; From 31895438e702f48e25b7aa6d88f9c97c795c79c7 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Sep 2019 15:37:47 -0700 Subject: [PATCH 600/721] filemap: check compound_head(page)->mapping in pagecache_get_page() Similar to previous patch, pagecache_get_page() avoids race condition with truncate by checking page->mapping == mapping. This does not work for compound pages. This patch let it check compound_head(page)->mapping instead. Link: http://lkml.kernel.org/r/20190801184244.3169074-3-songliubraving@fb.com Signed-off-by: Song Liu Suggested-by: Johannes Weiner Acked-by: Johannes Weiner Cc: Hillf Danton Cc: Hugh Dickins Cc: Kirill A. Shutemov Cc: Rik van Riel Cc: William Kucharski Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index fc7818b4537e..0b301103ea80 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1647,7 +1647,7 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, } /* Has the page been truncated? */ - if (unlikely(page->mapping != mapping)) { + if (unlikely(compound_head(page)->mapping != mapping)) { unlock_page(page); put_page(page); goto repeat; From 520e5ba415906373186bcd3c7cffa3535bfdbdde Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Sep 2019 15:37:50 -0700 Subject: [PATCH 601/721] filemap: update offset check in filemap_fault() With THP, current check of offset: VM_BUG_ON_PAGE(page->index != offset, page); is no longer accurate. Update it to: VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page); Link: http://lkml.kernel.org/r/20190801184244.3169074-4-songliubraving@fb.com Signed-off-by: Song Liu Acked-by: Rik van Riel Acked-by: Kirill A. Shutemov Acked-by: Johannes Weiner Cc: Hillf Danton Cc: Hugh Dickins Cc: William Kucharski Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index 0b301103ea80..f4d2971abd7c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2545,7 +2545,7 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) put_page(page); goto retry_find; } - VM_BUG_ON_PAGE(page->index != offset, page); + VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page); /* * We have a locked page in the page cache, now we need to check From 60fbf0ab5da1c360e02b7f7d882bf1c0d8f7e32a Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Sep 2019 15:37:54 -0700 Subject: [PATCH 602/721] mm,thp: stats for file backed THP In preparation for non-shmem THP, this patch adds a few stats and exposes them in /proc/meminfo, /sys/bus/node/devices//meminfo, and /proc//task//smaps. This patch is mostly a rewrite of Kirill A. Shutemov's earlier version: https://lkml.kernel.org/r/20170126115819.58875-5-kirill.shutemov@linux.intel.com/ Link: http://lkml.kernel.org/r/20190801184244.3169074-5-songliubraving@fb.com Signed-off-by: Song Liu Acked-by: Rik van Riel Acked-by: Kirill A. Shutemov Acked-by: Johannes Weiner Cc: Hillf Danton Cc: Hugh Dickins Cc: William Kucharski Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 6 ++++++ fs/proc/meminfo.c | 4 ++++ fs/proc/task_mmu.c | 4 +++- include/linux/mmzone.h | 2 ++ mm/vmstat.c | 2 ++ 5 files changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index 257449cf061f..296546ffed6c 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -427,6 +427,8 @@ static ssize_t node_read_meminfo(struct device *dev, "Node %d AnonHugePages: %8lu kB\n" "Node %d ShmemHugePages: %8lu kB\n" "Node %d ShmemPmdMapped: %8lu kB\n" + "Node %d FileHugePages: %8lu kB\n" + "Node %d FilePmdMapped: %8lu kB\n" #endif , nid, K(node_page_state(pgdat, NR_FILE_DIRTY)), @@ -452,6 +454,10 @@ static ssize_t node_read_meminfo(struct device *dev, nid, K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR), nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) * + HPAGE_PMD_NR), + nid, K(node_page_state(pgdat, NR_FILE_THPS) * + HPAGE_PMD_NR), + nid, K(node_page_state(pgdat, NR_FILE_PMDMAPPED) * HPAGE_PMD_NR) #endif ); diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 4bd80e68eb03..ac9247371871 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -132,6 +132,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v) global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR); show_val_kb(m, "ShmemPmdMapped: ", global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR); + show_val_kb(m, "FileHugePages: ", + global_node_page_state(NR_FILE_THPS) * HPAGE_PMD_NR); + show_val_kb(m, "FilePmdMapped: ", + global_node_page_state(NR_FILE_PMDMAPPED) * HPAGE_PMD_NR); #endif #ifdef CONFIG_CMA diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index ea1630465474..9442631fd4af 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -417,6 +417,7 @@ struct mem_size_stats { unsigned long lazyfree; unsigned long anonymous_thp; unsigned long shmem_thp; + unsigned long file_thp; unsigned long swap; unsigned long shared_hugetlb; unsigned long private_hugetlb; @@ -588,7 +589,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, else if (is_zone_device_page(page)) /* pass */; else - VM_BUG_ON_PAGE(1, page); + mss->file_thp += HPAGE_PMD_SIZE; smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), locked); } #else @@ -809,6 +810,7 @@ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss, SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree); SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp); SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp); + SEQ_PUT_DEC(" kB\nFilePmdMapped: ", mss->file_thp); SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb); seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ", mss->private_hugetlb >> 10, 7); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3f38c30d2f13..aafb7c38c627 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -235,6 +235,8 @@ enum node_stat_item { NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */ NR_SHMEM_THPS, NR_SHMEM_PMDMAPPED, + NR_FILE_THPS, + NR_FILE_PMDMAPPED, NR_ANON_THPS, NR_UNSTABLE_NFS, /* NFS unstable pages */ NR_VMSCAN_WRITE, diff --git a/mm/vmstat.c b/mm/vmstat.c index fd7e16ca6996..6afc892a148a 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1158,6 +1158,8 @@ const char * const vmstat_text[] = { "nr_shmem", "nr_shmem_hugepages", "nr_shmem_pmdmapped", + "nr_file_hugepages", + "nr_file_pmdmapped", "nr_anon_transparent_hugepages", "nr_unstable", "nr_vmscan_write", From 579c571e2efdb8e5b50959ae66b6142e05bd704f Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Sep 2019 15:37:57 -0700 Subject: [PATCH 603/721] khugepaged: rename collapse_shmem() and khugepaged_scan_shmem() Next patch will add khugepaged support of non-shmem files. This patch renames these two functions to reflect the new functionality: collapse_shmem() => collapse_file() khugepaged_scan_shmem() => khugepaged_scan_file() Link: http://lkml.kernel.org/r/20190801184244.3169074-6-songliubraving@fb.com Signed-off-by: Song Liu Acked-by: Rik van Riel Acked-by: Kirill A. Shutemov Acked-by: Johannes Weiner Cc: Hillf Danton Cc: Hugh Dickins Cc: William Kucharski Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/khugepaged.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 04a54ff5a8ac..7e36cc893f25 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1287,7 +1287,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) } /** - * collapse_shmem - collapse small tmpfs/shmem pages into huge one. + * collapse_file - collapse small tmpfs/shmem pages into huge one. * * Basic scheme is simple, details are more complex: * - allocate and lock a new huge page; @@ -1304,10 +1304,11 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * + restore gaps in the page cache; * + unlock and free huge page; */ -static void collapse_shmem(struct mm_struct *mm, - struct address_space *mapping, pgoff_t start, +static void collapse_file(struct mm_struct *mm, + struct file *file, pgoff_t start, struct page **hpage, int node) { + struct address_space *mapping = file->f_mapping; gfp_t gfp; struct page *new_page; struct mem_cgroup *memcg; @@ -1563,11 +1564,11 @@ static void collapse_shmem(struct mm_struct *mm, /* TODO: tracepoints */ } -static void khugepaged_scan_shmem(struct mm_struct *mm, - struct address_space *mapping, - pgoff_t start, struct page **hpage) +static void khugepaged_scan_file(struct mm_struct *mm, + struct file *file, pgoff_t start, struct page **hpage) { struct page *page = NULL; + struct address_space *mapping = file->f_mapping; XA_STATE(xas, &mapping->i_pages, start); int present, swap; int node = NUMA_NO_NODE; @@ -1631,16 +1632,15 @@ static void khugepaged_scan_shmem(struct mm_struct *mm, result = SCAN_EXCEED_NONE_PTE; } else { node = khugepaged_find_target_node(); - collapse_shmem(mm, mapping, start, hpage, node); + collapse_file(mm, file, start, hpage, node); } } /* TODO: tracepoints */ } #else -static void khugepaged_scan_shmem(struct mm_struct *mm, - struct address_space *mapping, - pgoff_t start, struct page **hpage) +static void khugepaged_scan_file(struct mm_struct *mm, + struct file *file, pgoff_t start, struct page **hpage) { BUILD_BUG(); } @@ -1722,8 +1722,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, file = get_file(vma->vm_file); up_read(&mm->mmap_sem); ret = 1; - khugepaged_scan_shmem(mm, file->f_mapping, - pgoff, hpage); + khugepaged_scan_file(mm, file, pgoff, hpage); fput(file); } else { ret = khugepaged_scan_pmd(mm, vma, From 99cb0dbd47a15d395bf3faa78dc122bc5efe3fc0 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Sep 2019 15:38:00 -0700 Subject: [PATCH 604/721] mm,thp: add read-only THP support for (non-shmem) FS This patch is (hopefully) the first step to enable THP for non-shmem filesystems. This patch enables an application to put part of its text sections to THP via madvise, for example: madvise((void *)0x600000, 0x200000, MADV_HUGEPAGE); We tried to reuse the logic for THP on tmpfs. Currently, write is not supported for non-shmem THP. khugepaged will only process vma with VM_DENYWRITE. sys_mmap() ignores VM_DENYWRITE requests (see ksys_mmap_pgoff). The only way to create vma with VM_DENYWRITE is execve(). This requirement limits non-shmem THP to text sections. The next patch will handle writes, which would only happen when the all the vmas with VM_DENYWRITE are unmapped. An EXPERIMENTAL config, READ_ONLY_THP_FOR_FS, is added to gate this feature. [songliubraving@fb.com: fix build without CONFIG_SHMEM] Link: http://lkml.kernel.org/r/F53407FB-96CC-42E8-9862-105C92CC2B98@fb.com [songliubraving@fb.com: fix double unlock in collapse_file()] Link: http://lkml.kernel.org/r/B960CBFA-8EFC-4DA4-ABC5-1977FFF2CA57@fb.com Link: http://lkml.kernel.org/r/20190801184244.3169074-7-songliubraving@fb.com Signed-off-by: Song Liu Acked-by: Rik van Riel Acked-by: Kirill A. Shutemov Acked-by: Johannes Weiner Cc: Stephen Rothwell Cc: Dan Carpenter Cc: Hillf Danton Cc: Hugh Dickins Cc: William Kucharski Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 11 ++++ mm/filemap.c | 4 +- mm/khugepaged.c | 149 ++++++++++++++++++++++++++++++++++-------------- mm/rmap.c | 12 ++-- 4 files changed, 128 insertions(+), 48 deletions(-) diff --git a/mm/Kconfig b/mm/Kconfig index f88be1cbcfb2..a5dae9a7eb51 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -712,6 +712,17 @@ config GUP_BENCHMARK config GUP_GET_PTE_LOW_HIGH bool +config READ_ONLY_THP_FOR_FS + bool "Read-only THP for filesystems (EXPERIMENTAL)" + depends on TRANSPARENT_HUGE_PAGECACHE && SHMEM + + help + Allow khugepaged to put read-only file-backed pages in THP. + + This is marked experimental because it is a new feature. Write + support of file THPs will be developed in the next few release + cycles. + config ARCH_HAS_PTE_SPECIAL bool diff --git a/mm/filemap.c b/mm/filemap.c index f4d2971abd7c..91fe3a08ca4a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -203,8 +203,8 @@ static void unaccount_page_cache_page(struct address_space *mapping, __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr); if (PageTransHuge(page)) __dec_node_page_state(page, NR_SHMEM_THPS); - } else { - VM_BUG_ON_PAGE(PageTransHuge(page), page); + } else if (PageTransHuge(page)) { + __dec_node_page_state(page, NR_FILE_THPS); } /* diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 7e36cc893f25..8607c77431b3 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -48,6 +48,7 @@ enum scan_result { SCAN_CGROUP_CHARGE_FAIL, SCAN_EXCEED_SWAP_PTE, SCAN_TRUNCATED, + SCAN_PAGE_HAS_PRIVATE, }; #define CREATE_TRACE_POINTS @@ -404,7 +405,11 @@ static bool hugepage_vma_check(struct vm_area_struct *vma, (vm_flags & VM_NOHUGEPAGE) || test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) return false; - if (shmem_file(vma->vm_file)) { + + if (shmem_file(vma->vm_file) || + (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && + vma->vm_file && + (vm_flags & VM_DENYWRITE))) { if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) return false; return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, @@ -456,8 +461,9 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma, unsigned long hstart, hend; /* - * khugepaged does not yet work on non-shmem files or special - * mappings. And file-private shmem THP is not supported. + * khugepaged only supports read-only files for non-shmem files. + * khugepaged does not yet work on special mappings. And + * file-private shmem THP is not supported. */ if (!hugepage_vma_check(vma, vm_flags)) return 0; @@ -1287,12 +1293,12 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) } /** - * collapse_file - collapse small tmpfs/shmem pages into huge one. + * collapse_file - collapse filemap/tmpfs/shmem pages into huge one. * * Basic scheme is simple, details are more complex: * - allocate and lock a new huge page; * - scan page cache replacing old pages with the new one - * + swap in pages if necessary; + * + swap/gup in pages if necessary; * + fill in gaps; * + keep old pages around in case rollback is required; * - if replacing succeeds: @@ -1316,7 +1322,9 @@ static void collapse_file(struct mm_struct *mm, LIST_HEAD(pagelist); XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); int nr_none = 0, result = SCAN_SUCCEED; + bool is_shmem = shmem_file(file); + VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem); VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); /* Only allocate from the target node */ @@ -1348,7 +1356,8 @@ static void collapse_file(struct mm_struct *mm, } while (1); __SetPageLocked(new_page); - __SetPageSwapBacked(new_page); + if (is_shmem) + __SetPageSwapBacked(new_page); new_page->index = start; new_page->mapping = mapping; @@ -1363,41 +1372,75 @@ static void collapse_file(struct mm_struct *mm, struct page *page = xas_next(&xas); VM_BUG_ON(index != xas.xa_index); - if (!page) { - /* - * Stop if extent has been truncated or hole-punched, - * and is now completely empty. - */ - if (index == start) { - if (!xas_next_entry(&xas, end - 1)) { - result = SCAN_TRUNCATED; + if (is_shmem) { + if (!page) { + /* + * Stop if extent has been truncated or + * hole-punched, and is now completely + * empty. + */ + if (index == start) { + if (!xas_next_entry(&xas, end - 1)) { + result = SCAN_TRUNCATED; + goto xa_locked; + } + xas_set(&xas, index); + } + if (!shmem_charge(mapping->host, 1)) { + result = SCAN_FAIL; goto xa_locked; } - xas_set(&xas, index); + xas_store(&xas, new_page); + nr_none++; + continue; } - if (!shmem_charge(mapping->host, 1)) { - result = SCAN_FAIL; + + if (xa_is_value(page) || !PageUptodate(page)) { + xas_unlock_irq(&xas); + /* swap in or instantiate fallocated page */ + if (shmem_getpage(mapping->host, index, &page, + SGP_NOHUGE)) { + result = SCAN_FAIL; + goto xa_unlocked; + } + } else if (trylock_page(page)) { + get_page(page); + xas_unlock_irq(&xas); + } else { + result = SCAN_PAGE_LOCK; goto xa_locked; } - xas_store(&xas, new_page); - nr_none++; - continue; - } - - if (xa_is_value(page) || !PageUptodate(page)) { - xas_unlock_irq(&xas); - /* swap in or instantiate fallocated page */ - if (shmem_getpage(mapping->host, index, &page, - SGP_NOHUGE)) { + } else { /* !is_shmem */ + if (!page || xa_is_value(page)) { + xas_unlock_irq(&xas); + page_cache_sync_readahead(mapping, &file->f_ra, + file, index, + PAGE_SIZE); + /* drain pagevecs to help isolate_lru_page() */ + lru_add_drain(); + page = find_lock_page(mapping, index); + if (unlikely(page == NULL)) { + result = SCAN_FAIL; + goto xa_unlocked; + } + } else if (!PageUptodate(page)) { + xas_unlock_irq(&xas); + wait_on_page_locked(page); + if (!trylock_page(page)) { + result = SCAN_PAGE_LOCK; + goto xa_unlocked; + } + get_page(page); + } else if (PageDirty(page)) { result = SCAN_FAIL; - goto xa_unlocked; + goto xa_locked; + } else if (trylock_page(page)) { + get_page(page); + xas_unlock_irq(&xas); + } else { + result = SCAN_PAGE_LOCK; + goto xa_locked; } - } else if (trylock_page(page)) { - get_page(page); - xas_unlock_irq(&xas); - } else { - result = SCAN_PAGE_LOCK; - goto xa_locked; } /* @@ -1426,6 +1469,12 @@ static void collapse_file(struct mm_struct *mm, goto out_unlock; } + if (page_has_private(page) && + !try_to_release_page(page, GFP_KERNEL)) { + result = SCAN_PAGE_HAS_PRIVATE; + goto out_unlock; + } + if (page_mapped(page)) unmap_mapping_pages(mapping, index, 1, false); @@ -1463,12 +1512,18 @@ static void collapse_file(struct mm_struct *mm, goto xa_unlocked; } - __inc_node_page_state(new_page, NR_SHMEM_THPS); + if (is_shmem) + __inc_node_page_state(new_page, NR_SHMEM_THPS); + else + __inc_node_page_state(new_page, NR_FILE_THPS); + if (nr_none) { struct zone *zone = page_zone(new_page); __mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none); - __mod_node_page_state(zone->zone_pgdat, NR_SHMEM, nr_none); + if (is_shmem) + __mod_node_page_state(zone->zone_pgdat, + NR_SHMEM, nr_none); } xa_locked: @@ -1506,10 +1561,15 @@ static void collapse_file(struct mm_struct *mm, SetPageUptodate(new_page); page_ref_add(new_page, HPAGE_PMD_NR - 1); - set_page_dirty(new_page); mem_cgroup_commit_charge(new_page, memcg, false, true); + + if (is_shmem) { + set_page_dirty(new_page); + lru_cache_add_anon(new_page); + } else { + lru_cache_add_file(new_page); + } count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1); - lru_cache_add_anon(new_page); /* * Remove pte page tables, so we can re-fault the page as huge. @@ -1524,7 +1584,9 @@ static void collapse_file(struct mm_struct *mm, /* Something went wrong: roll back page cache changes */ xas_lock_irq(&xas); mapping->nrpages -= nr_none; - shmem_uncharge(mapping->host, nr_none); + + if (is_shmem) + shmem_uncharge(mapping->host, nr_none); xas_set(&xas, start); xas_for_each(&xas, page, end - 1) { @@ -1607,7 +1669,8 @@ static void khugepaged_scan_file(struct mm_struct *mm, break; } - if (page_count(page) != 1 + page_mapcount(page)) { + if (page_count(page) != + 1 + page_mapcount(page) + page_has_private(page)) { result = SCAN_PAGE_COUNT; break; } @@ -1713,11 +1776,13 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, VM_BUG_ON(khugepaged_scan.address < hstart || khugepaged_scan.address + HPAGE_PMD_SIZE > hend); - if (shmem_file(vma->vm_file)) { + if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) { struct file *file; pgoff_t pgoff = linear_page_index(vma, khugepaged_scan.address); - if (!shmem_huge_enabled(vma)) + + if (shmem_file(vma->vm_file) + && !shmem_huge_enabled(vma)) goto skip; file = get_file(vma->vm_file); up_read(&mm->mmap_sem); diff --git a/mm/rmap.c b/mm/rmap.c index 26006445c8b5..d9a23bb773bf 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1189,8 +1189,10 @@ void page_add_file_rmap(struct page *page, bool compound) } if (!atomic_inc_and_test(compound_mapcount_ptr(page))) goto out; - VM_BUG_ON_PAGE(!PageSwapBacked(page), page); - __inc_node_page_state(page, NR_SHMEM_PMDMAPPED); + if (PageSwapBacked(page)) + __inc_node_page_state(page, NR_SHMEM_PMDMAPPED); + else + __inc_node_page_state(page, NR_FILE_PMDMAPPED); } else { if (PageTransCompound(page) && page_mapping(page)) { VM_WARN_ON_ONCE(!PageLocked(page)); @@ -1229,8 +1231,10 @@ static void page_remove_file_rmap(struct page *page, bool compound) } if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) goto out; - VM_BUG_ON_PAGE(!PageSwapBacked(page), page); - __dec_node_page_state(page, NR_SHMEM_PMDMAPPED); + if (PageSwapBacked(page)) + __dec_node_page_state(page, NR_SHMEM_PMDMAPPED); + else + __dec_node_page_state(page, NR_FILE_PMDMAPPED); } else { if (!atomic_add_negative(-1, &page->_mapcount)) goto out; From 09d91cda0e8207c1f14ee0d572f61a53dbcdaf85 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Sep 2019 15:38:03 -0700 Subject: [PATCH 605/721] mm,thp: avoid writes to file with THP in pagecache In previous patch, an application could put part of its text section in THP via madvise(). These THPs will be protected from writes when the application is still running (TXTBSY). However, after the application exits, the file is available for writes. This patch avoids writes to file THP by dropping page cache for the file when the file is open for write. A new counter nr_thps is added to struct address_space. In do_dentry_open(), if the file is open for write and nr_thps is non-zero, we drop page cache for the whole file. Link: http://lkml.kernel.org/r/20190801184244.3169074-8-songliubraving@fb.com Signed-off-by: Song Liu Reported-by: kbuild test robot Acked-by: Rik van Riel Acked-by: Kirill A. Shutemov Acked-by: Johannes Weiner Cc: Hillf Danton Cc: Hugh Dickins Cc: William Kucharski Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/inode.c | 3 +++ fs/open.c | 8 ++++++++ include/linux/fs.h | 32 ++++++++++++++++++++++++++++++++ mm/filemap.c | 1 + mm/khugepaged.c | 4 +++- 5 files changed, 47 insertions(+), 1 deletion(-) diff --git a/fs/inode.c b/fs/inode.c index 64bf28cf05cd..fef457a42882 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -181,6 +181,9 @@ int inode_init_always(struct super_block *sb, struct inode *inode) mapping->flags = 0; mapping->wb_err = 0; atomic_set(&mapping->i_mmap_writable, 0); +#ifdef CONFIG_READ_ONLY_THP_FOR_FS + atomic_set(&mapping->nr_thps, 0); +#endif mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); mapping->private_data = NULL; mapping->writeback_index = 0; diff --git a/fs/open.c b/fs/open.c index a59abe3c669a..c60cd22cc052 100644 --- a/fs/open.c +++ b/fs/open.c @@ -818,6 +818,14 @@ static int do_dentry_open(struct file *f, if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO) return -EINVAL; } + + /* + * XXX: Huge page cache doesn't support writing yet. Drop all page + * cache for this file before processing writes. + */ + if ((f->f_mode & FMODE_WRITE) && filemap_nr_thps(inode->i_mapping)) + truncate_pagecache(inode, 0); + return 0; cleanup_all: diff --git a/include/linux/fs.h b/include/linux/fs.h index 866268c2c6e3..b0c6b0d34d02 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -429,6 +429,7 @@ int pagecache_write_end(struct file *, struct address_space *mapping, * @i_pages: Cached pages. * @gfp_mask: Memory allocation flags to use for allocating pages. * @i_mmap_writable: Number of VM_SHARED mappings. + * @nr_thps: Number of THPs in the pagecache (non-shmem only). * @i_mmap: Tree of private and shared mappings. * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable. * @nrpages: Number of page entries, protected by the i_pages lock. @@ -446,6 +447,10 @@ struct address_space { struct xarray i_pages; gfp_t gfp_mask; atomic_t i_mmap_writable; +#ifdef CONFIG_READ_ONLY_THP_FOR_FS + /* number of thp, only for non-shmem files */ + atomic_t nr_thps; +#endif struct rb_root_cached i_mmap; struct rw_semaphore i_mmap_rwsem; unsigned long nrpages; @@ -2798,6 +2803,33 @@ static inline errseq_t filemap_sample_wb_err(struct address_space *mapping) return errseq_sample(&mapping->wb_err); } +static inline int filemap_nr_thps(struct address_space *mapping) +{ +#ifdef CONFIG_READ_ONLY_THP_FOR_FS + return atomic_read(&mapping->nr_thps); +#else + return 0; +#endif +} + +static inline void filemap_nr_thps_inc(struct address_space *mapping) +{ +#ifdef CONFIG_READ_ONLY_THP_FOR_FS + atomic_inc(&mapping->nr_thps); +#else + WARN_ON_ONCE(1); +#endif +} + +static inline void filemap_nr_thps_dec(struct address_space *mapping) +{ +#ifdef CONFIG_READ_ONLY_THP_FOR_FS + atomic_dec(&mapping->nr_thps); +#else + WARN_ON_ONCE(1); +#endif +} + extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync); extern int vfs_fsync(struct file *file, int datasync); diff --git a/mm/filemap.c b/mm/filemap.c index 91fe3a08ca4a..1146fcfa3215 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -205,6 +205,7 @@ static void unaccount_page_cache_page(struct address_space *mapping, __dec_node_page_state(page, NR_SHMEM_THPS); } else if (PageTransHuge(page)) { __dec_node_page_state(page, NR_FILE_THPS); + filemap_nr_thps_dec(mapping); } /* diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 8607c77431b3..e89430ec5267 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1514,8 +1514,10 @@ static void collapse_file(struct mm_struct *mm, if (is_shmem) __inc_node_page_state(new_page, NR_SHMEM_THPS); - else + else { __inc_node_page_state(new_page, NR_FILE_THPS); + filemap_nr_thps_inc(mapping); + } if (nr_none) { struct zone *zone = page_zone(new_page); From 364c1eebe453f06f0c1e837eb155a5725c9cd272 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Mon, 23 Sep 2019 15:38:06 -0700 Subject: [PATCH 606/721] mm: thp: extract split_queue_* into a struct Patch series "Make deferred split shrinker memcg aware", v6. Currently THP deferred split shrinker is not memcg aware, this may cause premature OOM with some configuration. For example the below test would run into premature OOM easily: $ cgcreate -g memory:thp $ echo 4G > /sys/fs/cgroup/memory/thp/memory/limit_in_bytes $ cgexec -g memory:thp transhuge-stress 4000 transhuge-stress comes from kernel selftest. It is easy to hit OOM, but there are still a lot THP on the deferred split queue, memcg direct reclaim can't touch them since the deferred split shrinker is not memcg aware. Convert deferred split shrinker memcg aware by introducing per memcg deferred split queue. The THP should be on either per node or per memcg deferred split queue if it belongs to a memcg. When the page is immigrated to the other memcg, it will be immigrated to the target memcg's deferred split queue too. Reuse the second tail page's deferred_list for per memcg list since the same THP can't be on multiple deferred split queues. Make deferred split shrinker not depend on memcg kmem since it is not slab. It doesn't make sense to not shrink THP even though memcg kmem is disabled. With the above change the test demonstrated above doesn't trigger OOM even though with cgroup.memory=nokmem. This patch (of 4): Put split_queue, split_queue_lock and split_queue_len into a struct in order to reduce code duplication when we convert deferred_split to memcg aware in the later patches. Link: http://lkml.kernel.org/r/1565144277-36240-2-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Suggested-by: "Kirill A . Shutemov" Acked-by: Kirill A. Shutemov Reviewed-by: Kirill Tkhai Cc: Johannes Weiner Cc: Michal Hocko Cc: Hugh Dickins Cc: Shakeel Butt Cc: David Rientjes Cc: Qian Cai Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 12 ++++++++--- mm/huge_memory.c | 45 +++++++++++++++++++++++------------------- mm/page_alloc.c | 8 +++++--- 3 files changed, 39 insertions(+), 26 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index aafb7c38c627..bda20282746b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -679,6 +679,14 @@ struct zonelist { extern struct page *mem_map; #endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +struct deferred_split { + spinlock_t split_queue_lock; + struct list_head split_queue; + unsigned long split_queue_len; +}; +#endif + /* * On NUMA machines, each NUMA node would have a pg_data_t to describe * it's memory layout. On UMA machines there is a single pglist_data which @@ -758,9 +766,7 @@ typedef struct pglist_data { #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE - spinlock_t split_queue_lock; - struct list_head split_queue; - unsigned long split_queue_len; + struct deferred_split deferred_split_queue; #endif /* Fields commonly accessed by the page reclaim scanner */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 483b07b2d6ae..c642c0394690 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2691,6 +2691,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) { struct page *head = compound_head(page); struct pglist_data *pgdata = NODE_DATA(page_to_nid(head)); + struct deferred_split *ds_queue = &pgdata->deferred_split_queue; struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; int count, mapcount, extra_pins, ret; @@ -2777,17 +2778,17 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) } /* Prevent deferred_split_scan() touching ->_refcount */ - spin_lock(&pgdata->split_queue_lock); + spin_lock(&ds_queue->split_queue_lock); count = page_count(head); mapcount = total_mapcount(head); if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) { if (!list_empty(page_deferred_list(head))) { - pgdata->split_queue_len--; + ds_queue->split_queue_len--; list_del(page_deferred_list(head)); } if (mapping) __dec_node_page_state(page, NR_SHMEM_THPS); - spin_unlock(&pgdata->split_queue_lock); + spin_unlock(&ds_queue->split_queue_lock); __split_huge_page(page, list, end, flags); if (PageSwapCache(head)) { swp_entry_t entry = { .val = page_private(head) }; @@ -2804,7 +2805,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) dump_page(page, "total_mapcount(head) > 0"); BUG(); } - spin_unlock(&pgdata->split_queue_lock); + spin_unlock(&ds_queue->split_queue_lock); fail: if (mapping) xa_unlock(&mapping->i_pages); spin_unlock_irqrestore(&pgdata->lru_lock, flags); @@ -2827,52 +2828,56 @@ fail: if (mapping) void free_transhuge_page(struct page *page) { struct pglist_data *pgdata = NODE_DATA(page_to_nid(page)); + struct deferred_split *ds_queue = &pgdata->deferred_split_queue; unsigned long flags; - spin_lock_irqsave(&pgdata->split_queue_lock, flags); + spin_lock_irqsave(&ds_queue->split_queue_lock, flags); if (!list_empty(page_deferred_list(page))) { - pgdata->split_queue_len--; + ds_queue->split_queue_len--; list_del(page_deferred_list(page)); } - spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); + spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); free_compound_page(page); } void deferred_split_huge_page(struct page *page) { struct pglist_data *pgdata = NODE_DATA(page_to_nid(page)); + struct deferred_split *ds_queue = &pgdata->deferred_split_queue; unsigned long flags; VM_BUG_ON_PAGE(!PageTransHuge(page), page); - spin_lock_irqsave(&pgdata->split_queue_lock, flags); + spin_lock_irqsave(&ds_queue->split_queue_lock, flags); if (list_empty(page_deferred_list(page))) { count_vm_event(THP_DEFERRED_SPLIT_PAGE); - list_add_tail(page_deferred_list(page), &pgdata->split_queue); - pgdata->split_queue_len++; + list_add_tail(page_deferred_list(page), &ds_queue->split_queue); + ds_queue->split_queue_len++; } - spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); + spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); } static unsigned long deferred_split_count(struct shrinker *shrink, struct shrink_control *sc) { struct pglist_data *pgdata = NODE_DATA(sc->nid); - return READ_ONCE(pgdata->split_queue_len); + struct deferred_split *ds_queue = &pgdata->deferred_split_queue; + return READ_ONCE(ds_queue->split_queue_len); } static unsigned long deferred_split_scan(struct shrinker *shrink, struct shrink_control *sc) { struct pglist_data *pgdata = NODE_DATA(sc->nid); + struct deferred_split *ds_queue = &pgdata->deferred_split_queue; unsigned long flags; LIST_HEAD(list), *pos, *next; struct page *page; int split = 0; - spin_lock_irqsave(&pgdata->split_queue_lock, flags); + spin_lock_irqsave(&ds_queue->split_queue_lock, flags); /* Take pin on all head pages to avoid freeing them under us */ - list_for_each_safe(pos, next, &pgdata->split_queue) { + list_for_each_safe(pos, next, &ds_queue->split_queue) { page = list_entry((void *)pos, struct page, mapping); page = compound_head(page); if (get_page_unless_zero(page)) { @@ -2880,12 +2885,12 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, } else { /* We lost race with put_compound_page() */ list_del_init(page_deferred_list(page)); - pgdata->split_queue_len--; + ds_queue->split_queue_len--; } if (!--sc->nr_to_scan) break; } - spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); + spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); list_for_each_safe(pos, next, &list) { page = list_entry((void *)pos, struct page, mapping); @@ -2899,15 +2904,15 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, put_page(page); } - spin_lock_irqsave(&pgdata->split_queue_lock, flags); - list_splice_tail(&list, &pgdata->split_queue); - spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); + spin_lock_irqsave(&ds_queue->split_queue_lock, flags); + list_splice_tail(&list, &ds_queue->split_queue); + spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); /* * Stop shrinker if we didn't split any page, but the queue is empty. * This can happen if pages were freed under us. */ - if (!split && list_empty(&pgdata->split_queue)) + if (!split && list_empty(&ds_queue->split_queue)) return SHRINK_STOP; return split; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 591e026534c2..a41436cca563 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6646,9 +6646,11 @@ static unsigned long __init calc_memmap_size(unsigned long spanned_pages, #ifdef CONFIG_TRANSPARENT_HUGEPAGE static void pgdat_init_split_queue(struct pglist_data *pgdat) { - spin_lock_init(&pgdat->split_queue_lock); - INIT_LIST_HEAD(&pgdat->split_queue); - pgdat->split_queue_len = 0; + struct deferred_split *ds_queue = &pgdat->deferred_split_queue; + + spin_lock_init(&ds_queue->split_queue_lock); + INIT_LIST_HEAD(&ds_queue->split_queue); + ds_queue->split_queue_len = 0; } #else static void pgdat_init_split_queue(struct pglist_data *pgdat) {} From 7ae88534cdd96235cd775c03b32a75009355740b Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Mon, 23 Sep 2019 15:38:09 -0700 Subject: [PATCH 607/721] mm: move mem_cgroup_uncharge out of __page_cache_release() A later patch makes THP deferred split shrinker memcg aware, but it needs page->mem_cgroup information in THP destructor, which is called after mem_cgroup_uncharge() now. So move mem_cgroup_uncharge() from __page_cache_release() to compound page destructor, which is called by both THP and other compound pages except HugeTLB. And call it in __put_single_page() for single order page. Link: http://lkml.kernel.org/r/1565144277-36240-3-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Suggested-by: "Kirill A . Shutemov" Acked-by: Kirill A. Shutemov Reviewed-by: Kirill Tkhai Cc: Johannes Weiner Cc: Michal Hocko Cc: Hugh Dickins Cc: Shakeel Butt Cc: David Rientjes Cc: Qian Cai Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 1 + mm/swap.c | 2 +- mm/vmscan.c | 6 ++---- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a41436cca563..3334a769eb91 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -670,6 +670,7 @@ static void bad_page(struct page *page, const char *reason, void free_compound_page(struct page *page) { + mem_cgroup_uncharge(page); __free_pages_ok(page, compound_order(page)); } diff --git a/mm/swap.c b/mm/swap.c index 0226c5346560..784dc1620620 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -71,12 +71,12 @@ static void __page_cache_release(struct page *page) spin_unlock_irqrestore(&pgdat->lru_lock, flags); } __ClearPageWaiters(page); - mem_cgroup_uncharge(page); } static void __put_single_page(struct page *page) { __page_cache_release(page); + mem_cgroup_uncharge(page); free_unref_page(page); } diff --git a/mm/vmscan.c b/mm/vmscan.c index c27dd62ed594..c4ef8681637b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1487,10 +1487,9 @@ static unsigned long shrink_page_list(struct list_head *page_list, * Is there need to periodically free_page_list? It would * appear not as the counts should be low */ - if (unlikely(PageTransHuge(page))) { - mem_cgroup_uncharge(page); + if (unlikely(PageTransHuge(page))) (*get_compound_page_dtor(page))(page); - } else + else list_add(&page->lru, &free_pages); continue; @@ -1911,7 +1910,6 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, if (unlikely(PageCompound(page))) { spin_unlock_irq(&pgdat->lru_lock); - mem_cgroup_uncharge(page); (*get_compound_page_dtor(page))(page); spin_lock_irq(&pgdat->lru_lock); } else From 0a432dcbeb32edcd211a5d8f7847d0da7642a8b4 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Mon, 23 Sep 2019 15:38:12 -0700 Subject: [PATCH 608/721] mm: shrinker: make shrinker not depend on memcg kmem Currently shrinker is just allocated and can work when memcg kmem is enabled. But, THP deferred split shrinker is not slab shrinker, it doesn't make too much sense to have such shrinker depend on memcg kmem. It should be able to reclaim THP even though memcg kmem is disabled. Introduce a new shrinker flag, SHRINKER_NONSLAB, for non-slab shrinker. When memcg kmem is disabled, just such shrinkers can be called in shrinking memcg slab. [yang.shi@linux.alibaba.com: add comment] Link: http://lkml.kernel.org/r/1566496227-84952-4-git-send-email-yang.shi@linux.alibaba.com Link: http://lkml.kernel.org/r/1565144277-36240-4-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Acked-by: Kirill A. Shutemov Reviewed-by: Kirill Tkhai Cc: Johannes Weiner Cc: Michal Hocko Cc: "Kirill A . Shutemov" Cc: Hugh Dickins Cc: Shakeel Butt Cc: David Rientjes Cc: Qian Cai Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 19 +++++++----- include/linux/shrinker.h | 7 ++++- mm/memcontrol.c | 9 +----- mm/vmscan.c | 60 ++++++++++++++++++++------------------ 4 files changed, 49 insertions(+), 46 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ad8f1a397ae4..a3c0a639c824 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -128,9 +128,8 @@ struct mem_cgroup_per_node { struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1]; -#ifdef CONFIG_MEMCG_KMEM struct memcg_shrinker_map __rcu *shrinker_map; -#endif + struct rb_node tree_node; /* RB tree node */ unsigned long usage_in_excess;/* Set to the value by which */ /* the soft limit is exceeded*/ @@ -1311,6 +1310,11 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) } while ((memcg = parent_mem_cgroup(memcg))); return false; } + +extern int memcg_expand_shrinker_maps(int new_id); + +extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg, + int nid, int shrinker_id); #else #define mem_cgroup_sockets_enabled 0 static inline void mem_cgroup_sk_alloc(struct sock *sk) { }; @@ -1319,6 +1323,11 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) { return false; } + +static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg, + int nid, int shrinker_id) +{ +} #endif struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep); @@ -1390,10 +1399,6 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg) return memcg ? memcg->kmemcg_id : -1; } -extern int memcg_expand_shrinker_maps(int new_id); - -extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg, - int nid, int shrinker_id); #else static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) @@ -1435,8 +1440,6 @@ static inline void memcg_put_cache_ids(void) { } -static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg, - int nid, int shrinker_id) { } #endif /* CONFIG_MEMCG_KMEM */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 9443cafd1969..0f80123650e2 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -69,7 +69,7 @@ struct shrinker { /* These are for internal use */ struct list_head list; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG /* ID in shrinker_idr */ int id; #endif @@ -81,6 +81,11 @@ struct shrinker { /* Flags */ #define SHRINKER_NUMA_AWARE (1 << 0) #define SHRINKER_MEMCG_AWARE (1 << 1) +/* + * It just makes sense when the shrinker is also MEMCG_AWARE for now, + * non-MEMCG_AWARE shrinker should not have this flag set. + */ +#define SHRINKER_NONSLAB (1 << 2) extern int prealloc_shrinker(struct shrinker *shrinker); extern void register_shrinker_prepared(struct shrinker *shrinker); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7bb2971dc75e..a385a7cb7d9f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -318,6 +318,7 @@ DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key); EXPORT_SYMBOL(memcg_kmem_enabled_key); struct workqueue_struct *memcg_kmem_cache_wq; +#endif static int memcg_shrinker_map_size; static DEFINE_MUTEX(memcg_shrinker_map_mutex); @@ -441,14 +442,6 @@ void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) } } -#else /* CONFIG_MEMCG_KMEM */ -static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg) -{ - return 0; -} -static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) { } -#endif /* CONFIG_MEMCG_KMEM */ - /** * mem_cgroup_css_from_page - css of the memcg associated with a page * @page: page of interest diff --git a/mm/vmscan.c b/mm/vmscan.c index c4ef8681637b..4911754c93b7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -171,11 +171,22 @@ int vm_swappiness = 60; */ unsigned long vm_total_pages; +static void set_task_reclaim_state(struct task_struct *task, + struct reclaim_state *rs) +{ + /* Check for an overwrite */ + WARN_ON_ONCE(rs && task->reclaim_state); + + /* Check for the nulling of an already-nulled member */ + WARN_ON_ONCE(!rs && !task->reclaim_state); + + task->reclaim_state = rs; +} + static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem); -#ifdef CONFIG_MEMCG_KMEM - +#ifdef CONFIG_MEMCG /* * We allow subsystems to populate their shrinker-related * LRU lists before register_shrinker_prepared() is called @@ -227,30 +238,7 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker) idr_remove(&shrinker_idr, id); up_write(&shrinker_rwsem); } -#else /* CONFIG_MEMCG_KMEM */ -static int prealloc_memcg_shrinker(struct shrinker *shrinker) -{ - return 0; -} -static void unregister_memcg_shrinker(struct shrinker *shrinker) -{ -} -#endif /* CONFIG_MEMCG_KMEM */ - -static void set_task_reclaim_state(struct task_struct *task, - struct reclaim_state *rs) -{ - /* Check for an overwrite */ - WARN_ON_ONCE(rs && task->reclaim_state); - - /* Check for the nulling of an already-nulled member */ - WARN_ON_ONCE(!rs && !task->reclaim_state); - - task->reclaim_state = rs; -} - -#ifdef CONFIG_MEMCG static bool global_reclaim(struct scan_control *sc) { return !sc->target_mem_cgroup; @@ -305,6 +293,15 @@ static bool memcg_congested(pg_data_t *pgdat, } #else +static int prealloc_memcg_shrinker(struct shrinker *shrinker) +{ + return 0; +} + +static void unregister_memcg_shrinker(struct shrinker *shrinker) +{ +} + static bool global_reclaim(struct scan_control *sc) { return true; @@ -591,7 +588,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, return freed; } -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority) { @@ -599,7 +596,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, unsigned long ret, freed = 0; int i; - if (!memcg_kmem_enabled() || !mem_cgroup_online(memcg)) + if (!mem_cgroup_online(memcg)) return 0; if (!down_read_trylock(&shrinker_rwsem)) @@ -625,6 +622,11 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, continue; } + /* Call non-slab shrinkers even though kmem is disabled */ + if (!memcg_kmem_enabled() && + !(shrinker->flags & SHRINKER_NONSLAB)) + continue; + ret = do_shrink_slab(&sc, shrinker, priority); if (ret == SHRINK_EMPTY) { clear_bit(i, map->map); @@ -661,13 +663,13 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, up_read(&shrinker_rwsem); return freed; } -#else /* CONFIG_MEMCG_KMEM */ +#else /* CONFIG_MEMCG */ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority) { return 0; } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG */ /** * shrink_slab - shrink slab caches From 87eaceb3faa59b9b4d940ec9554ce251325d83fe Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Mon, 23 Sep 2019 15:38:15 -0700 Subject: [PATCH 609/721] mm: thp: make deferred split shrinker memcg aware Currently THP deferred split shrinker is not memcg aware, this may cause premature OOM with some configuration. For example the below test would run into premature OOM easily: $ cgcreate -g memory:thp $ echo 4G > /sys/fs/cgroup/memory/thp/memory/limit_in_bytes $ cgexec -g memory:thp transhuge-stress 4000 transhuge-stress comes from kernel selftest. It is easy to hit OOM, but there are still a lot THP on the deferred split queue, memcg direct reclaim can't touch them since the deferred split shrinker is not memcg aware. Convert deferred split shrinker memcg aware by introducing per memcg deferred split queue. The THP should be on either per node or per memcg deferred split queue if it belongs to a memcg. When the page is immigrated to the other memcg, it will be immigrated to the target memcg's deferred split queue too. Reuse the second tail page's deferred_list for per memcg list since the same THP can't be on multiple deferred split queues. [yang.shi@linux.alibaba.com: simplify deferred split queue dereference per Kirill Tkhai] Link: http://lkml.kernel.org/r/1566496227-84952-5-git-send-email-yang.shi@linux.alibaba.com Link: http://lkml.kernel.org/r/1565144277-36240-5-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Acked-by: Kirill A. Shutemov Reviewed-by: Kirill Tkhai Cc: Johannes Weiner Cc: Michal Hocko Cc: "Kirill A . Shutemov" Cc: Hugh Dickins Cc: Shakeel Butt Cc: David Rientjes Cc: Qian Cai Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 9 ++++++ include/linux/memcontrol.h | 4 +++ include/linux/mm_types.h | 1 + mm/huge_memory.c | 62 ++++++++++++++++++++++++++++++++------ mm/memcontrol.c | 24 +++++++++++++++ 5 files changed, 91 insertions(+), 9 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 45ede62aa85b..61c9ffd89b05 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -267,6 +267,15 @@ static inline bool thp_migration_supported(void) return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION); } +static inline struct list_head *page_deferred_list(struct page *page) +{ + /* + * Global or memcg deferred list in the second tail pages is + * occupied by compound_head. + */ + return &page[2].deferred_list; +} + #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index a3c0a639c824..9b60863429cc 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -330,6 +330,10 @@ struct mem_cgroup { struct list_head event_list; spinlock_t event_list_lock; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + struct deferred_split deferred_split_queue; +#endif + struct mem_cgroup_per_node *nodeinfo[0]; /* WARNING: nodeinfo must be the last member here */ }; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 0b739f360cec..5183e0d77dfa 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -138,6 +138,7 @@ struct page { struct { /* Second tail page of compound page */ unsigned long _compound_pad_1; /* compound_head */ unsigned long _compound_pad_2; + /* For both global and memcg */ struct list_head deferred_list; }; struct { /* Page table pages */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c642c0394690..73fc517c08d2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -496,11 +496,25 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) return pmd; } -static inline struct list_head *page_deferred_list(struct page *page) +#ifdef CONFIG_MEMCG +static inline struct deferred_split *get_deferred_split_queue(struct page *page) { - /* ->lru in the tail pages is occupied by compound_head. */ - return &page[2].deferred_list; + struct mem_cgroup *memcg = compound_head(page)->mem_cgroup; + struct pglist_data *pgdat = NODE_DATA(page_to_nid(page)); + + if (memcg) + return &memcg->deferred_split_queue; + else + return &pgdat->deferred_split_queue; } +#else +static inline struct deferred_split *get_deferred_split_queue(struct page *page) +{ + struct pglist_data *pgdat = NODE_DATA(page_to_nid(page)); + + return &pgdat->deferred_split_queue; +} +#endif void prep_transhuge_page(struct page *page) { @@ -2691,7 +2705,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) { struct page *head = compound_head(page); struct pglist_data *pgdata = NODE_DATA(page_to_nid(head)); - struct deferred_split *ds_queue = &pgdata->deferred_split_queue; + struct deferred_split *ds_queue = get_deferred_split_queue(page); struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; int count, mapcount, extra_pins, ret; @@ -2827,8 +2841,7 @@ fail: if (mapping) void free_transhuge_page(struct page *page) { - struct pglist_data *pgdata = NODE_DATA(page_to_nid(page)); - struct deferred_split *ds_queue = &pgdata->deferred_split_queue; + struct deferred_split *ds_queue = get_deferred_split_queue(page); unsigned long flags; spin_lock_irqsave(&ds_queue->split_queue_lock, flags); @@ -2842,17 +2855,37 @@ void free_transhuge_page(struct page *page) void deferred_split_huge_page(struct page *page) { - struct pglist_data *pgdata = NODE_DATA(page_to_nid(page)); - struct deferred_split *ds_queue = &pgdata->deferred_split_queue; + struct deferred_split *ds_queue = get_deferred_split_queue(page); +#ifdef CONFIG_MEMCG + struct mem_cgroup *memcg = compound_head(page)->mem_cgroup; +#endif unsigned long flags; VM_BUG_ON_PAGE(!PageTransHuge(page), page); + /* + * The try_to_unmap() in page reclaim path might reach here too, + * this may cause a race condition to corrupt deferred split queue. + * And, if page reclaim is already handling the same page, it is + * unnecessary to handle it again in shrinker. + * + * Check PageSwapCache to determine if the page is being + * handled by page reclaim since THP swap would add the page into + * swap cache before calling try_to_unmap(). + */ + if (PageSwapCache(page)) + return; + spin_lock_irqsave(&ds_queue->split_queue_lock, flags); if (list_empty(page_deferred_list(page))) { count_vm_event(THP_DEFERRED_SPLIT_PAGE); list_add_tail(page_deferred_list(page), &ds_queue->split_queue); ds_queue->split_queue_len++; +#ifdef CONFIG_MEMCG + if (memcg) + memcg_set_shrinker_bit(memcg, page_to_nid(page), + deferred_split_shrinker.id); +#endif } spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); } @@ -2862,6 +2895,11 @@ static unsigned long deferred_split_count(struct shrinker *shrink, { struct pglist_data *pgdata = NODE_DATA(sc->nid); struct deferred_split *ds_queue = &pgdata->deferred_split_queue; + +#ifdef CONFIG_MEMCG + if (sc->memcg) + ds_queue = &sc->memcg->deferred_split_queue; +#endif return READ_ONCE(ds_queue->split_queue_len); } @@ -2875,6 +2913,11 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, struct page *page; int split = 0; +#ifdef CONFIG_MEMCG + if (sc->memcg) + ds_queue = &sc->memcg->deferred_split_queue; +#endif + spin_lock_irqsave(&ds_queue->split_queue_lock, flags); /* Take pin on all head pages to avoid freeing them under us */ list_for_each_safe(pos, next, &ds_queue->split_queue) { @@ -2921,7 +2964,8 @@ static struct shrinker deferred_split_shrinker = { .count_objects = deferred_split_count, .scan_objects = deferred_split_scan, .seeks = DEFAULT_SEEKS, - .flags = SHRINKER_NUMA_AWARE, + .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE | + SHRINKER_NONSLAB, }; #ifdef CONFIG_DEBUG_FS diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a385a7cb7d9f..2156ef775d04 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5070,6 +5070,11 @@ static struct mem_cgroup *mem_cgroup_alloc(void) for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) memcg->cgwb_frn[i].done = __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq); +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + spin_lock_init(&memcg->deferred_split_queue.split_queue_lock); + INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue); + memcg->deferred_split_queue.split_queue_len = 0; #endif idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); return memcg; @@ -5449,6 +5454,14 @@ static int mem_cgroup_move_account(struct page *page, __mod_memcg_state(to, NR_WRITEBACK, nr_pages); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (compound && !list_empty(page_deferred_list(page))) { + spin_lock(&from->deferred_split_queue.split_queue_lock); + list_del_init(page_deferred_list(page)); + from->deferred_split_queue.split_queue_len--; + spin_unlock(&from->deferred_split_queue.split_queue_lock); + } +#endif /* * It is safe to change page->mem_cgroup here because the page * is referenced, charged, and isolated - we can't race with @@ -5457,6 +5470,17 @@ static int mem_cgroup_move_account(struct page *page, /* caller should have done css_get */ page->mem_cgroup = to; + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (compound && list_empty(page_deferred_list(page))) { + spin_lock(&to->deferred_split_queue.split_queue_lock); + list_add_tail(page_deferred_list(page), + &to->deferred_split_queue.split_queue); + to->deferred_split_queue.split_queue_len++; + spin_unlock(&to->deferred_split_queue.split_queue_lock); + } +#endif + spin_unlock_irqrestore(&from->move_lock, flags); ret = 0; From 010c164a5fa7e169deab0a4d8211611f1930c1cd Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Sep 2019 15:38:19 -0700 Subject: [PATCH 610/721] mm: move memcmp_pages() and pages_identical() Patch series "THP aware uprobe", v13. This patchset makes uprobe aware of THPs. Currently, when uprobe is attached to text on THP, the page is split by FOLL_SPLIT. As a result, uprobe eliminates the performance benefit of THP. This set makes uprobe THP-aware. Instead of FOLL_SPLIT, we introduces FOLL_SPLIT_PMD, which only split PMD for uprobe. After all uprobes within the THP are removed, the PTE-mapped pages are regrouped as huge PMD. This set (plus a few THP patches) is also available at https://github.com/liu-song-6/linux/tree/uprobe-thp This patch (of 6): Move memcmp_pages() to mm/util.c and pages_identical() to mm.h, so that we can use them in other files. Link: http://lkml.kernel.org/r/20190815164525.1848545-2-songliubraving@fb.com Signed-off-by: Song Liu Acked-by: Kirill A. Shutemov Reviewed-by: Oleg Nesterov Cc: Johannes Weiner Cc: Matthew Wilcox Cc: William Kucharski Cc: Srikar Dronamraju Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 7 +++++++ mm/ksm.c | 18 ------------------ mm/util.c | 13 +++++++++++++ 3 files changed, 20 insertions(+), 18 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 57a9fa34f159..0ac87d9ee9d8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2868,5 +2868,12 @@ void __init setup_nr_node_ids(void); static inline void setup_nr_node_ids(void) {} #endif +extern int memcmp_pages(struct page *page1, struct page *page2); + +static inline int pages_identical(struct page *page1, struct page *page2) +{ + return !memcmp_pages(page1, page2); +} + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/mm/ksm.c b/mm/ksm.c index 3dc4346411e4..dbee2eb4dd05 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1029,24 +1029,6 @@ static u32 calc_checksum(struct page *page) return checksum; } -static int memcmp_pages(struct page *page1, struct page *page2) -{ - char *addr1, *addr2; - int ret; - - addr1 = kmap_atomic(page1); - addr2 = kmap_atomic(page2); - ret = memcmp(addr1, addr2, PAGE_SIZE); - kunmap_atomic(addr2); - kunmap_atomic(addr1); - return ret; -} - -static inline int pages_identical(struct page *page1, struct page *page2) -{ - return !memcmp_pages(page1, page2); -} - static int write_protect_page(struct vm_area_struct *vma, struct page *page, pte_t *orig_pte) { diff --git a/mm/util.c b/mm/util.c index bab284d69c8c..37f7b6711514 100644 --- a/mm/util.c +++ b/mm/util.c @@ -783,3 +783,16 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen) out: return res; } + +int memcmp_pages(struct page *page1, struct page *page2) +{ + char *addr1, *addr2; + int ret; + + addr1 = kmap_atomic(page1); + addr2 = kmap_atomic(page2); + ret = memcmp(addr1, addr2, PAGE_SIZE); + kunmap_atomic(addr2); + kunmap_atomic(addr1); + return ret; +} From fb4fb04ff4dd377b3132e9b31259263ec37b830a Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Sep 2019 15:38:22 -0700 Subject: [PATCH 611/721] uprobe: use original page when all uprobes are removed Currently, uprobe swaps the target page with a anonymous page in both install_breakpoint() and remove_breakpoint(). When all uprobes on a page are removed, the given mm is still using an anonymous page (not the original page). This patch allows uprobe to use original page when possible (all uprobes on the page are already removed, and the original page is in page cache and uptodate). As suggested by Oleg, we unmap the old_page and let the original page fault in. Link: http://lkml.kernel.org/r/20190815164525.1848545-3-songliubraving@fb.com Signed-off-by: Song Liu Suggested-by: Oleg Nesterov Reviewed-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 66 +++++++++++++++++++++++++++++++---------- 1 file changed, 51 insertions(+), 15 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 84fa00497c49..648f47553bff 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -143,10 +143,12 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr) * * @vma: vma that holds the pte pointing to page * @addr: address the old @page is mapped at - * @page: the cowed page we are replacing by kpage - * @kpage: the modified page we replace page by + * @old_page: the page we are replacing by new_page + * @new_page: the modified page we replace page by * - * Returns 0 on success, -EFAULT on failure. + * If @new_page is NULL, only unmap @old_page. + * + * Returns 0 on success, negative error code otherwise. */ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct page *old_page, struct page *new_page) @@ -166,10 +168,12 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page); - err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, &memcg, - false); - if (err) - return err; + if (new_page) { + err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, + &memcg, false); + if (err) + return err; + } /* For try_to_free_swap() and munlock_vma_page() below */ lock_page(old_page); @@ -177,15 +181,20 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, mmu_notifier_invalidate_range_start(&range); err = -EAGAIN; if (!page_vma_mapped_walk(&pvmw)) { - mem_cgroup_cancel_charge(new_page, memcg, false); + if (new_page) + mem_cgroup_cancel_charge(new_page, memcg, false); goto unlock; } VM_BUG_ON_PAGE(addr != pvmw.address, old_page); - get_page(new_page); - page_add_new_anon_rmap(new_page, vma, addr, false); - mem_cgroup_commit_charge(new_page, memcg, false, false); - lru_cache_add_active_or_unevictable(new_page, vma); + if (new_page) { + get_page(new_page); + page_add_new_anon_rmap(new_page, vma, addr, false); + mem_cgroup_commit_charge(new_page, memcg, false, false); + lru_cache_add_active_or_unevictable(new_page, vma); + } else + /* no new page, just dec_mm_counter for old_page */ + dec_mm_counter(mm, MM_ANONPAGES); if (!PageAnon(old_page)) { dec_mm_counter(mm, mm_counter_file(old_page)); @@ -194,8 +203,9 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, flush_cache_page(vma, addr, pte_pfn(*pvmw.pte)); ptep_clear_flush_notify(vma, addr, pvmw.pte); - set_pte_at_notify(mm, addr, pvmw.pte, - mk_pte(new_page, vma->vm_page_prot)); + if (new_page) + set_pte_at_notify(mm, addr, pvmw.pte, + mk_pte(new_page, vma->vm_page_prot)); page_remove_rmap(old_page, false); if (!page_mapped(old_page)) @@ -488,6 +498,10 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, ref_ctr_updated = 1; } + ret = 0; + if (!is_register && !PageAnon(old_page)) + goto put_old; + ret = anon_vma_prepare(vma); if (ret) goto put_old; @@ -501,8 +515,30 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, copy_highpage(new_page, old_page); copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); + if (!is_register) { + struct page *orig_page; + pgoff_t index; + + VM_BUG_ON_PAGE(!PageAnon(old_page), old_page); + + index = vaddr_to_offset(vma, vaddr & PAGE_MASK) >> PAGE_SHIFT; + orig_page = find_get_page(vma->vm_file->f_inode->i_mapping, + index); + + if (orig_page) { + if (PageUptodate(orig_page) && + pages_identical(new_page, orig_page)) { + /* let go new_page */ + put_page(new_page); + new_page = NULL; + } + put_page(orig_page); + } + } + ret = __replace_page(vma, vaddr, old_page, new_page); - put_page(new_page); + if (new_page) + put_page(new_page); put_old: put_page(old_page); From bfe7b00de6d1e25fee08484c4fbf1c1ed175be78 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Sep 2019 15:38:25 -0700 Subject: [PATCH 612/721] mm, thp: introduce FOLL_SPLIT_PMD Introduce a new foll_flag: FOLL_SPLIT_PMD. As the name says FOLL_SPLIT_PMD splits huge pmd for given mm_struct, the underlining huge page stays as-is. FOLL_SPLIT_PMD is useful for cases where we need to use regular pages, but would switch back to huge page and huge pmd on. One of such example is uprobe. The following patches use FOLL_SPLIT_PMD in uprobe. Link: http://lkml.kernel.org/r/20190815164525.1848545-4-songliubraving@fb.com Signed-off-by: Song Liu Reviewed-by: Oleg Nesterov Acked-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + mm/gup.c | 8 ++++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 0ac87d9ee9d8..233ad11938db 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2591,6 +2591,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, #define FOLL_COW 0x4000 /* internal GUP flag */ #define FOLL_ANON 0x8000 /* don't do file mappings */ #define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */ +#define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */ /* * NOTE on FOLL_LONGTERM: diff --git a/mm/gup.c b/mm/gup.c index 012060efddf1..60c3915c8ee6 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -384,7 +384,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, spin_unlock(ptl); return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); } - if (flags & FOLL_SPLIT) { + if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) { int ret; page = pmd_page(*pmd); if (is_huge_zero_page(page)) { @@ -393,7 +393,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, split_huge_pmd(vma, pmd, address); if (pmd_trans_unstable(pmd)) ret = -EBUSY; - } else { + } else if (flags & FOLL_SPLIT) { if (unlikely(!try_get_page(page))) { spin_unlock(ptl); return ERR_PTR(-ENOMEM); @@ -405,6 +405,10 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, put_page(page); if (pmd_none(*pmd)) return no_page_table(vma, flags); + } else { /* flags & FOLL_SPLIT_PMD */ + spin_unlock(ptl); + split_huge_pmd(vma, pmd, address); + ret = pte_alloc(mm, pmd) ? -ENOMEM : 0; } return ret ? ERR_PTR(ret) : From 5a52c9df62b422087d0c88fc79641028e4472a3b Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Sep 2019 15:38:27 -0700 Subject: [PATCH 613/721] uprobe: use FOLL_SPLIT_PMD instead of FOLL_SPLIT Use the newly added FOLL_SPLIT_PMD in uprobe. This preserves the huge page when the uprobe is enabled. When the uprobe is disabled, newer instances of the same application could still benefit from huge page. For the next step, we will enable khugepaged to regroup the pmd, so that existing instances of the application could also benefit from huge page after the uprobe is disabled. Link: http://lkml.kernel.org/r/20190815164525.1848545-5-songliubraving@fb.com Signed-off-by: Song Liu Acked-by: Kirill A. Shutemov Reviewed-by: Srikar Dronamraju Reviewed-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 648f47553bff..27b596f14463 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -155,7 +155,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, { struct mm_struct *mm = vma->vm_mm; struct page_vma_mapped_walk pvmw = { - .page = old_page, + .page = compound_head(old_page), .vma = vma, .address = addr, }; @@ -166,8 +166,6 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, addr + PAGE_SIZE); - VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page); - if (new_page) { err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, &memcg, false); @@ -481,7 +479,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, retry: /* Read the page with vaddr into memory */ ret = get_user_pages_remote(NULL, mm, vaddr, 1, - FOLL_FORCE | FOLL_SPLIT, &old_page, &vma, NULL); + FOLL_FORCE | FOLL_SPLIT_PMD, &old_page, &vma, NULL); if (ret <= 0) return ret; From 27e1f8273113adec0e98bf513e4091636b27cc2a Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Sep 2019 15:38:30 -0700 Subject: [PATCH 614/721] khugepaged: enable collapse pmd for pte-mapped THP khugepaged needs exclusive mmap_sem to access page table. When it fails to lock mmap_sem, the page will fault in as pte-mapped THP. As the page is already a THP, khugepaged will not handle this pmd again. This patch enables the khugepaged to retry collapse the page table. struct mm_slot (in khugepaged.c) is extended with an array, containing addresses of pte-mapped THPs. We use array here for simplicity. We can easily replace it with more advanced data structures when needed. In khugepaged_scan_mm_slot(), if the mm contains pte-mapped THP, we try to collapse the page table. Since collapse may happen at an later time, some pages may already fault in. collapse_pte_mapped_thp() is added to properly handle these pages. collapse_pte_mapped_thp() also double checks whether all ptes in this pmd are mapping to the same THP. This is necessary because some subpage of the THP may be replaced, for example by uprobe. In such cases, it is not possible to collapse the pmd. [kirill.shutemov@linux.intel.com: add comments for retract_page_tables()] Link: http://lkml.kernel.org/r/20190816145443.6ard3iilytc6jlgv@box Link: http://lkml.kernel.org/r/20190815164525.1848545-6-songliubraving@fb.com Signed-off-by: Song Liu Signed-off-by: Kirill A. Shutemov Acked-by: Kirill A. Shutemov Suggested-by: Johannes Weiner Reviewed-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/khugepaged.h | 12 +++ mm/khugepaged.c | 192 ++++++++++++++++++++++++++++++++++++- 2 files changed, 200 insertions(+), 4 deletions(-) diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index 082d1d2a5216..bc45ea1efbf7 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -15,6 +15,14 @@ extern int __khugepaged_enter(struct mm_struct *mm); extern void __khugepaged_exit(struct mm_struct *mm); extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma, unsigned long vm_flags); +#ifdef CONFIG_SHMEM +extern void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr); +#else +static inline void collapse_pte_mapped_thp(struct mm_struct *mm, + unsigned long addr) +{ +} +#endif #define khugepaged_enabled() \ (transparent_hugepage_flags & \ @@ -73,6 +81,10 @@ static inline int khugepaged_enter_vma_merge(struct vm_area_struct *vma, { return 0; } +static inline void collapse_pte_mapped_thp(struct mm_struct *mm, + unsigned long addr) +{ +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* _LINUX_KHUGEPAGED_H */ diff --git a/mm/khugepaged.c b/mm/khugepaged.c index e89430ec5267..0a1b4b484ac5 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -77,6 +77,8 @@ static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); static struct kmem_cache *mm_slot_cache __read_mostly; +#define MAX_PTE_MAPPED_THP 8 + /** * struct mm_slot - hash lookup from mm to mm_slot * @hash: hash collision list @@ -87,6 +89,10 @@ struct mm_slot { struct hlist_node hash; struct list_head mm_node; struct mm_struct *mm; + + /* pte-mapped THP in this mm */ + int nr_pte_mapped_thp; + unsigned long pte_mapped_thp[MAX_PTE_MAPPED_THP]; }; /** @@ -1254,6 +1260,159 @@ static void collect_mm_slot(struct mm_slot *mm_slot) } #if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) +/* + * Notify khugepaged that given addr of the mm is pte-mapped THP. Then + * khugepaged should try to collapse the page table. + */ +static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm, + unsigned long addr) +{ + struct mm_slot *mm_slot; + + VM_BUG_ON(addr & ~HPAGE_PMD_MASK); + + spin_lock(&khugepaged_mm_lock); + mm_slot = get_mm_slot(mm); + if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP)) + mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr; + spin_unlock(&khugepaged_mm_lock); + return 0; +} + +/** + * Try to collapse a pte-mapped THP for mm at address haddr. + * + * This function checks whether all the PTEs in the PMD are pointing to the + * right THP. If so, retract the page table so the THP can refault in with + * as pmd-mapped. + */ +void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) +{ + unsigned long haddr = addr & HPAGE_PMD_MASK; + struct vm_area_struct *vma = find_vma(mm, haddr); + struct page *hpage = NULL; + pte_t *start_pte, *pte; + pmd_t *pmd, _pmd; + spinlock_t *ptl; + int count = 0; + int i; + + if (!vma || !vma->vm_file || + vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE) + return; + + /* + * This vm_flags may not have VM_HUGEPAGE if the page was not + * collapsed by this mm. But we can still collapse if the page is + * the valid THP. Add extra VM_HUGEPAGE so hugepage_vma_check() + * will not fail the vma for missing VM_HUGEPAGE + */ + if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE)) + return; + + pmd = mm_find_pmd(mm, haddr); + if (!pmd) + return; + + start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl); + + /* step 1: check all mapped PTEs are to the right huge page */ + for (i = 0, addr = haddr, pte = start_pte; + i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { + struct page *page; + + /* empty pte, skip */ + if (pte_none(*pte)) + continue; + + /* page swapped out, abort */ + if (!pte_present(*pte)) + goto abort; + + page = vm_normal_page(vma, addr, *pte); + + if (!page || !PageCompound(page)) + goto abort; + + if (!hpage) { + hpage = compound_head(page); + /* + * The mapping of the THP should not change. + * + * Note that uprobe, debugger, or MAP_PRIVATE may + * change the page table, but the new page will + * not pass PageCompound() check. + */ + if (WARN_ON(hpage->mapping != vma->vm_file->f_mapping)) + goto abort; + } + + /* + * Confirm the page maps to the correct subpage. + * + * Note that uprobe, debugger, or MAP_PRIVATE may change + * the page table, but the new page will not pass + * PageCompound() check. + */ + if (WARN_ON(hpage + i != page)) + goto abort; + count++; + } + + /* step 2: adjust rmap */ + for (i = 0, addr = haddr, pte = start_pte; + i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { + struct page *page; + + if (pte_none(*pte)) + continue; + page = vm_normal_page(vma, addr, *pte); + page_remove_rmap(page, false); + } + + pte_unmap_unlock(start_pte, ptl); + + /* step 3: set proper refcount and mm_counters. */ + if (hpage) { + page_ref_sub(hpage, count); + add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count); + } + + /* step 4: collapse pmd */ + ptl = pmd_lock(vma->vm_mm, pmd); + _pmd = pmdp_collapse_flush(vma, addr, pmd); + spin_unlock(ptl); + mm_dec_nr_ptes(mm); + pte_free(mm, pmd_pgtable(_pmd)); + return; + +abort: + pte_unmap_unlock(start_pte, ptl); +} + +static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) +{ + struct mm_struct *mm = mm_slot->mm; + int i; + + if (likely(mm_slot->nr_pte_mapped_thp == 0)) + return 0; + + if (!down_write_trylock(&mm->mmap_sem)) + return -EBUSY; + + if (unlikely(khugepaged_test_exit(mm))) + goto out; + + for (i = 0; i < mm_slot->nr_pte_mapped_thp; i++) + collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i]); + +out: + mm_slot->nr_pte_mapped_thp = 0; + up_write(&mm->mmap_sem); + return 0; +} + static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) { struct vm_area_struct *vma; @@ -1262,7 +1421,22 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) i_mmap_lock_write(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { - /* probably overkill */ + /* + * Check vma->anon_vma to exclude MAP_PRIVATE mappings that + * got written to. These VMAs are likely not worth investing + * down_write(mmap_sem) as PMD-mapping is likely to be split + * later. + * + * Not that vma->anon_vma check is racy: it can be set up after + * the check but before we took mmap_sem by the fault path. + * But page lock would prevent establishing any new ptes of the + * page, so we are safe. + * + * An alternative would be drop the check, but check that page + * table is clear before calling pmdp_collapse_flush() under + * ptl. It has higher chance to recover THP for the VMA, but + * has higher cost too. + */ if (vma->anon_vma) continue; addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); @@ -1275,9 +1449,10 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) continue; /* * We need exclusive mmap_sem to retract page table. - * If trylock fails we would end up with pte-mapped THP after - * re-fault. Not ideal, but it's more important to not disturb - * the system too much. + * + * We use trylock due to lock inversion: we need to acquire + * mmap_sem while holding page lock. Fault path does it in + * reverse order. Trylock is a way to avoid deadlock. */ if (down_write_trylock(&vma->vm_mm->mmap_sem)) { spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd); @@ -1287,6 +1462,9 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) up_write(&vma->vm_mm->mmap_sem); mm_dec_nr_ptes(vma->vm_mm); pte_free(vma->vm_mm, pmd_pgtable(_pmd)); + } else { + /* Try again later */ + khugepaged_add_pte_mapped_thp(vma->vm_mm, addr); } } i_mmap_unlock_write(mapping); @@ -1709,6 +1887,11 @@ static void khugepaged_scan_file(struct mm_struct *mm, { BUILD_BUG(); } + +static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) +{ + return 0; +} #endif static unsigned int khugepaged_scan_mm_slot(unsigned int pages, @@ -1733,6 +1916,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, khugepaged_scan.mm_slot = mm_slot; } spin_unlock(&khugepaged_mm_lock); + khugepaged_collapse_pte_mapped_thps(mm_slot); mm = mm_slot->mm; /* From f385cb85a42fc4ba92464c2bfd2e2049d65353d3 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Sep 2019 15:38:33 -0700 Subject: [PATCH 615/721] uprobe: collapse THP pmd after removing all uprobes After all uprobes are removed from the huge page (with PTE pgtable), it is possible to collapse the pmd and benefit from THP again. This patch does the collapse by calling collapse_pte_mapped_thp(). Link: http://lkml.kernel.org/r/20190815164525.1848545-7-songliubraving@fb.com Signed-off-by: Song Liu Acked-by: Kirill A. Shutemov Reported-by: kbuild test robot Reviewed-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 27b596f14463..94d38a39d72e 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -26,6 +26,7 @@ #include #include #include +#include #include @@ -472,6 +473,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, struct page *old_page, *new_page; struct vm_area_struct *vma; int ret, is_register, ref_ctr_updated = 0; + bool orig_page_huge = false; is_register = is_swbp_insn(&opcode); uprobe = container_of(auprobe, struct uprobe, arch); @@ -529,6 +531,9 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, /* let go new_page */ put_page(new_page); new_page = NULL; + + if (PageCompound(orig_page)) + orig_page_huge = true; } put_page(orig_page); } @@ -547,6 +552,10 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, if (ret && is_register && ref_ctr_updated) update_ref_ctr(uprobe, mm, -1); + /* try collapse pmd for compound page */ + if (!ret && orig_page_huge) + collapse_pte_mapped_thp(mm, vaddr); + return ret; } From 649775be63c8b2e0b56ecc5bbc96d38205ec5259 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 23 Sep 2019 15:38:37 -0700 Subject: [PATCH 616/721] mm, fs: move randomize_stack_top from fs to mm Patch series "Provide generic top-down mmap layout functions", v6. This series introduces generic functions to make top-down mmap layout easily accessible to architectures, in particular riscv which was the initial goal of this series. The generic implementation was taken from arm64 and used successively by arm, mips and finally riscv. Note that in addition the series fixes 2 issues: - stack randomization was taken into account even if not necessary. - [1] fixed an issue with mmap base which did not take into account randomization but did not report it to arm and mips, so by moving arm64 into a generic library, this problem is now fixed for both architectures. This work is an effort to factorize architecture functions to avoid code duplication and oversights as in [1]. [1]: https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg1429066.html This patch (of 14): This preparatory commit moves this function so that further introduction of generic topdown mmap layout is contained only in mm/util.c. Link: http://lkml.kernel.org/r/20190730055113.23635-2-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Acked-by: Kees Cook Reviewed-by: Christoph Hellwig Reviewed-by: Luis Chamberlain Cc: Russell King Cc: Catalin Marinas Cc: Will Deacon Cc: Ralf Baechle Cc: Paul Burton Cc: James Hogan Cc: Palmer Dabbelt Cc: Albert Ou Cc: Alexander Viro Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 20 -------------------- include/linux/mm.h | 2 ++ mm/util.c | 22 ++++++++++++++++++++++ 3 files changed, 24 insertions(+), 20 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index d4e11b2e04f6..cec3b4146440 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -670,26 +670,6 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, * libraries. There is no binary dependent code anywhere else. */ -#ifndef STACK_RND_MASK -#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */ -#endif - -static unsigned long randomize_stack_top(unsigned long stack_top) -{ - unsigned long random_variable = 0; - - if (current->flags & PF_RANDOMIZE) { - random_variable = get_random_long(); - random_variable &= STACK_RND_MASK; - random_variable <<= PAGE_SHIFT; - } -#ifdef CONFIG_STACK_GROWSUP - return PAGE_ALIGN(stack_top) + random_variable; -#else - return PAGE_ALIGN(stack_top) - random_variable; -#endif -} - static int load_elf_binary(struct linux_binprm *bprm) { struct file *interpreter = NULL; /* to shut gcc up */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 233ad11938db..294a67b94147 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2328,6 +2328,8 @@ extern int install_special_mapping(struct mm_struct *mm, unsigned long addr, unsigned long len, unsigned long flags, struct page **pages); +unsigned long randomize_stack_top(unsigned long stack_top); + extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); extern unsigned long mmap_region(struct file *file, unsigned long addr, diff --git a/mm/util.c b/mm/util.c index 37f7b6711514..bf8af5e07c4a 100644 --- a/mm/util.c +++ b/mm/util.c @@ -16,6 +16,8 @@ #include #include #include +#include +#include #include @@ -293,6 +295,26 @@ int vma_is_stack_for_current(struct vm_area_struct *vma) return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t)); } +#ifndef STACK_RND_MASK +#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */ +#endif + +unsigned long randomize_stack_top(unsigned long stack_top) +{ + unsigned long random_variable = 0; + + if (current->flags & PF_RANDOMIZE) { + random_variable = get_random_long(); + random_variable &= STACK_RND_MASK; + random_variable <<= PAGE_SHIFT; + } +#ifdef CONFIG_STACK_GROWSUP + return PAGE_ALIGN(stack_top) + random_variable; +#else + return PAGE_ALIGN(stack_top) - random_variable; +#endif +} + #if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { From 28058ed61fc869d7e67916725a3f7e9de50e606b Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 23 Sep 2019 15:38:40 -0700 Subject: [PATCH 617/721] arm64: make use of is_compat_task instead of hardcoding this test Each architecture has its own way to determine if a task is a compat task, by using is_compat_task in arch_mmap_rnd, it allows more genericity and then it prepares its moving to mm/. Link: http://lkml.kernel.org/r/20190730055113.23635-3-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Acked-by: Catalin Marinas Acked-by: Kees Cook Reviewed-by: Christoph Hellwig Reviewed-by: Luis Chamberlain Cc: Albert Ou Cc: Alexander Viro Cc: Christoph Hellwig Cc: James Hogan Cc: Palmer Dabbelt Cc: Paul Burton Cc: Ralf Baechle Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/mm/mmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c index b050641b5139..bb0140afed66 100644 --- a/arch/arm64/mm/mmap.c +++ b/arch/arm64/mm/mmap.c @@ -43,7 +43,7 @@ unsigned long arch_mmap_rnd(void) unsigned long rnd; #ifdef CONFIG_COMPAT - if (test_thread_flag(TIF_32BIT)) + if (is_compat_task()) rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1); else #endif From e8d54b62c55ab6201de6d195fc2c276294c1f6ae Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 23 Sep 2019 15:38:43 -0700 Subject: [PATCH 618/721] arm64: consider stack randomization for mmap base only when necessary Do not offset mmap base address because of stack randomization if current task does not want randomization. Note that x86 already implements this behaviour. Link: http://lkml.kernel.org/r/20190730055113.23635-4-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Acked-by: Catalin Marinas Acked-by: Kees Cook Reviewed-by: Christoph Hellwig Reviewed-by: Luis Chamberlain Cc: Albert Ou Cc: Alexander Viro Cc: Christoph Hellwig Cc: James Hogan Cc: Palmer Dabbelt Cc: Paul Burton Cc: Ralf Baechle Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/mm/mmap.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c index bb0140afed66..e4acaead67de 100644 --- a/arch/arm64/mm/mmap.c +++ b/arch/arm64/mm/mmap.c @@ -54,7 +54,11 @@ unsigned long arch_mmap_rnd(void) static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) { unsigned long gap = rlim_stack->rlim_cur; - unsigned long pad = (STACK_RND_MASK << PAGE_SHIFT) + stack_guard_gap; + unsigned long pad = stack_guard_gap; + + /* Account for stack randomization if necessary */ + if (current->flags & PF_RANDOMIZE) + pad += (STACK_RND_MASK << PAGE_SHIFT); /* Values close to RLIM_INFINITY can overflow. */ if (gap + pad > gap) From 67f3977f805b34cf0e41090679800d2091d41d49 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 23 Sep 2019 15:38:47 -0700 Subject: [PATCH 619/721] arm64, mm: move generic mmap layout functions to mm arm64 handles top-down mmap layout in a way that can be easily reused by other architectures, so make it available in mm. It then introduces a new config ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT that can be set by other architectures to benefit from those functions. Note that this new config depends on MMU being enabled, if selected without MMU support, a warning will be thrown. Link: http://lkml.kernel.org/r/20190730055113.23635-5-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Suggested-by: Christoph Hellwig Acked-by: Catalin Marinas Acked-by: Kees Cook Reviewed-by: Christoph Hellwig Reviewed-by: Luis Chamberlain Cc: Albert Ou Cc: Alexander Viro Cc: James Hogan Cc: Palmer Dabbelt Cc: Paul Burton Cc: Ralf Baechle Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/Kconfig | 10 ++++ arch/arm64/Kconfig | 1 + arch/arm64/include/asm/processor.h | 2 - arch/arm64/mm/mmap.c | 76 ----------------------------- kernel/sysctl.c | 6 ++- mm/util.c | 78 +++++++++++++++++++++++++++++- 6 files changed, 92 insertions(+), 81 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 0fcf8ec1e098..dfce421b8e8a 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -706,6 +706,16 @@ config HAVE_ARCH_COMPAT_MMAP_BASES and vice-versa 32-bit applications to call 64-bit mmap(). Required for applications doing different bitness syscalls. +# This allows to use a set of generic functions to determine mmap base +# address by giving priority to top-down scheme only if the process +# is not in legacy mode (compat task, unlimited stack size or +# sysctl_legacy_va_layout). +# Architecture that selects this option can provide its own version of: +# - STACK_RND_MASK +config ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT + bool + depends on MMU + config HAVE_COPY_THREAD_TLS bool help diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 37c610963eee..2d4b7044063c 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -71,6 +71,7 @@ config ARM64 select ARCH_SUPPORTS_INT128 if GCC_VERSION >= 50000 || CC_IS_CLANG select ARCH_SUPPORTS_NUMA_BALANCING select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT + select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT select ARCH_WANT_FRAME_POINTERS select ARCH_WANT_HUGE_PMD_SHARE if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36) select ARCH_HAS_UBSAN_SANITIZE_ALL diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index c67848c55009..5623685c7d13 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -280,8 +280,6 @@ static inline void spin_lock_prefetch(const void *ptr) "nop") : : "p" (ptr)); } -#define HAVE_ARCH_PICK_MMAP_LAYOUT - extern unsigned long __ro_after_init signal_minsigstksz; /* sigframe size */ extern void __init minsigstksz_setup(void); diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c index e4acaead67de..3028bacbc4e9 100644 --- a/arch/arm64/mm/mmap.c +++ b/arch/arm64/mm/mmap.c @@ -20,82 +20,6 @@ #include -/* - * Leave enough space between the mmap area and the stack to honour ulimit in - * the face of randomisation. - */ -#define MIN_GAP (SZ_128M) -#define MAX_GAP (STACK_TOP/6*5) - -static int mmap_is_legacy(struct rlimit *rlim_stack) -{ - if (current->personality & ADDR_COMPAT_LAYOUT) - return 1; - - if (rlim_stack->rlim_cur == RLIM_INFINITY) - return 1; - - return sysctl_legacy_va_layout; -} - -unsigned long arch_mmap_rnd(void) -{ - unsigned long rnd; - -#ifdef CONFIG_COMPAT - if (is_compat_task()) - rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1); - else -#endif - rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1); - return rnd << PAGE_SHIFT; -} - -static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) -{ - unsigned long gap = rlim_stack->rlim_cur; - unsigned long pad = stack_guard_gap; - - /* Account for stack randomization if necessary */ - if (current->flags & PF_RANDOMIZE) - pad += (STACK_RND_MASK << PAGE_SHIFT); - - /* Values close to RLIM_INFINITY can overflow. */ - if (gap + pad > gap) - gap += pad; - - if (gap < MIN_GAP) - gap = MIN_GAP; - else if (gap > MAX_GAP) - gap = MAX_GAP; - - return PAGE_ALIGN(STACK_TOP - gap - rnd); -} - -/* - * This function, called very early during the creation of a new process VM - * image, sets up which VM layout function to use: - */ -void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) -{ - unsigned long random_factor = 0UL; - - if (current->flags & PF_RANDOMIZE) - random_factor = arch_mmap_rnd(); - - /* - * Fall back to the standard layout if the personality bit is set, or - * if the expected stack growth is unlimited: - */ - if (mmap_is_legacy(rlim_stack)) { - mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; - mm->get_unmapped_area = arch_get_unmapped_area; - } else { - mm->mmap_base = mmap_base(random_factor, rlim_stack); - mm->get_unmapped_area = arch_get_unmapped_area_topdown; - } -} - /* * You really shouldn't be using read() or write() on /dev/mem. This might go * away in the future. diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 078950d9605b..00fcea236eba 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -264,7 +264,8 @@ extern struct ctl_table epoll_table[]; extern struct ctl_table firmware_config_table[]; #endif -#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT +#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ + defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) int sysctl_legacy_va_layout; #endif @@ -1573,7 +1574,8 @@ static struct ctl_table vm_table[] = { .proc_handler = proc_dointvec, .extra1 = SYSCTL_ZERO, }, -#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT +#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ + defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) { .procname = "legacy_va_layout", .data = &sysctl_legacy_va_layout, diff --git a/mm/util.c b/mm/util.c index bf8af5e07c4a..7922726f0a8f 100644 --- a/mm/util.c +++ b/mm/util.c @@ -17,7 +17,12 @@ #include #include #include +#include +#include #include +#include +#include +#include #include @@ -315,7 +320,78 @@ unsigned long randomize_stack_top(unsigned long stack_top) #endif } -#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) +#ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT +#ifdef CONFIG_ARCH_HAS_ELF_RANDOMIZE +unsigned long arch_mmap_rnd(void) +{ + unsigned long rnd; + +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS + if (is_compat_task()) + rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1); + else +#endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */ + rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1); + + return rnd << PAGE_SHIFT; +} +#endif /* CONFIG_ARCH_HAS_ELF_RANDOMIZE */ + +static int mmap_is_legacy(struct rlimit *rlim_stack) +{ + if (current->personality & ADDR_COMPAT_LAYOUT) + return 1; + + if (rlim_stack->rlim_cur == RLIM_INFINITY) + return 1; + + return sysctl_legacy_va_layout; +} + +/* + * Leave enough space between the mmap area and the stack to honour ulimit in + * the face of randomisation. + */ +#define MIN_GAP (SZ_128M) +#define MAX_GAP (STACK_TOP / 6 * 5) + +static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) +{ + unsigned long gap = rlim_stack->rlim_cur; + unsigned long pad = stack_guard_gap; + + /* Account for stack randomization if necessary */ + if (current->flags & PF_RANDOMIZE) + pad += (STACK_RND_MASK << PAGE_SHIFT); + + /* Values close to RLIM_INFINITY can overflow. */ + if (gap + pad > gap) + gap += pad; + + if (gap < MIN_GAP) + gap = MIN_GAP; + else if (gap > MAX_GAP) + gap = MAX_GAP; + + return PAGE_ALIGN(STACK_TOP - gap - rnd); +} + +void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) +{ + unsigned long random_factor = 0UL; + + if (current->flags & PF_RANDOMIZE) + random_factor = arch_mmap_rnd(); + + if (mmap_is_legacy(rlim_stack)) { + mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; + mm->get_unmapped_area = arch_get_unmapped_area; + } else { + mm->mmap_base = mmap_base(random_factor, rlim_stack); + mm->get_unmapped_area = arch_get_unmapped_area_topdown; + } +} +#elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { mm->mmap_base = TASK_UNMAPPED_BASE; From e7142bf5d231f3ccdf6ea6764d5080999b8e299d Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 23 Sep 2019 15:38:50 -0700 Subject: [PATCH 620/721] arm64, mm: make randomization selected by generic topdown mmap layout This commits selects ARCH_HAS_ELF_RANDOMIZE when an arch uses the generic topdown mmap layout functions so that this security feature is on by default. Note that this commit also removes the possibility for arm64 to have elf randomization and no MMU: without MMU, the security added by randomization is worth nothing. Link: http://lkml.kernel.org/r/20190730055113.23635-6-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Acked-by: Catalin Marinas Acked-by: Kees Cook Reviewed-by: Christoph Hellwig Reviewed-by: Luis Chamberlain Cc: Albert Ou Cc: Alexander Viro Cc: Christoph Hellwig Cc: James Hogan Cc: Palmer Dabbelt Cc: Paul Burton Cc: Ralf Baechle Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/Kconfig | 1 + arch/arm64/Kconfig | 1 - arch/arm64/kernel/process.c | 8 -------- mm/util.c | 11 +++++++++-- 4 files changed, 10 insertions(+), 11 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index dfce421b8e8a..5f8a5d84dbbe 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -715,6 +715,7 @@ config HAVE_ARCH_COMPAT_MMAP_BASES config ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT bool depends on MMU + select ARCH_HAS_ELF_RANDOMIZE config HAVE_COPY_THREAD_TLS bool diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 2d4b7044063c..866e05882799 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -15,7 +15,6 @@ config ARM64 select ARCH_HAS_DMA_COHERENT_TO_PFN select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI - select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 03689c0beb34..a47462def04b 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -557,14 +557,6 @@ unsigned long arch_align_stack(unsigned long sp) return sp & ~0xf; } -unsigned long arch_randomize_brk(struct mm_struct *mm) -{ - if (is_compat_task()) - return randomize_page(mm->brk, SZ_32M); - else - return randomize_page(mm->brk, SZ_1G); -} - /* * Called from setup_new_exec() after (COMPAT_)SET_PERSONALITY. */ diff --git a/mm/util.c b/mm/util.c index 7922726f0a8f..3ad6db9a722e 100644 --- a/mm/util.c +++ b/mm/util.c @@ -321,7 +321,15 @@ unsigned long randomize_stack_top(unsigned long stack_top) } #ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT -#ifdef CONFIG_ARCH_HAS_ELF_RANDOMIZE +unsigned long arch_randomize_brk(struct mm_struct *mm) +{ + /* Is the current task 32bit ? */ + if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task()) + return randomize_page(mm->brk, SZ_32M); + + return randomize_page(mm->brk, SZ_1G); +} + unsigned long arch_mmap_rnd(void) { unsigned long rnd; @@ -335,7 +343,6 @@ unsigned long arch_mmap_rnd(void) return rnd << PAGE_SHIFT; } -#endif /* CONFIG_ARCH_HAS_ELF_RANDOMIZE */ static int mmap_is_legacy(struct rlimit *rlim_stack) { From af0f4297286f13a75edf93677b1fb2fc16c412a7 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 23 Sep 2019 15:38:54 -0700 Subject: [PATCH 621/721] arm: properly account for stack randomization and stack guard gap This commit takes care of stack randomization and stack guard gap when computing mmap base address and checks if the task asked for randomization. This fixes the problem uncovered and not fixed for arm here: https://lkml.kernel.org/r/20170622200033.25714-1-riel@redhat.com Link: http://lkml.kernel.org/r/20190730055113.23635-7-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Acked-by: Kees Cook Reviewed-by: Luis Chamberlain Cc: Albert Ou Cc: Alexander Viro Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Christoph Hellwig Cc: James Hogan Cc: Palmer Dabbelt Cc: Paul Burton Cc: Ralf Baechle Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/mmap.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c index f866870db749..bff3d00bda5b 100644 --- a/arch/arm/mm/mmap.c +++ b/arch/arm/mm/mmap.c @@ -18,8 +18,9 @@ (((pgoff)<> (PAGE_SHIFT - 12)) static int mmap_is_legacy(struct rlimit *rlim_stack) { @@ -35,6 +36,15 @@ static int mmap_is_legacy(struct rlimit *rlim_stack) static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) { unsigned long gap = rlim_stack->rlim_cur; + unsigned long pad = stack_guard_gap; + + /* Account for stack randomization if necessary */ + if (current->flags & PF_RANDOMIZE) + pad += (STACK_RND_MASK << PAGE_SHIFT); + + /* Values close to RLIM_INFINITY can overflow. */ + if (gap + pad > gap) + gap += pad; if (gap < MIN_GAP) gap = MIN_GAP; From 86e568e9c0525fc40e76d827212d5e9721cf7504 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 23 Sep 2019 15:38:57 -0700 Subject: [PATCH 622/721] arm: use STACK_TOP when computing mmap base address mmap base address must be computed wrt stack top address, using TASK_SIZE is wrong since STACK_TOP and TASK_SIZE are not equivalent. Link: http://lkml.kernel.org/r/20190730055113.23635-8-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Acked-by: Kees Cook Reviewed-by: Luis Chamberlain Cc: Albert Ou Cc: Alexander Viro Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Christoph Hellwig Cc: James Hogan Cc: Palmer Dabbelt Cc: Paul Burton Cc: Ralf Baechle Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/mmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c index bff3d00bda5b..0b94b674aa91 100644 --- a/arch/arm/mm/mmap.c +++ b/arch/arm/mm/mmap.c @@ -19,7 +19,7 @@ /* gap between mmap and stack */ #define MIN_GAP (128*1024*1024UL) -#define MAX_GAP ((TASK_SIZE)/6*5) +#define MAX_GAP ((STACK_TOP)/6*5) #define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) static int mmap_is_legacy(struct rlimit *rlim_stack) @@ -51,7 +51,7 @@ static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) else if (gap > MAX_GAP) gap = MAX_GAP; - return PAGE_ALIGN(TASK_SIZE - gap - rnd); + return PAGE_ALIGN(STACK_TOP - gap - rnd); } /* From dba79c3df4a2275132759b0bc04c64b7a510af4a Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 23 Sep 2019 15:39:01 -0700 Subject: [PATCH 623/721] arm: use generic mmap top-down layout and brk randomization arm uses a top-down mmap layout by default that exactly fits the generic functions, so get rid of arch specific code and use the generic version by selecting ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT. As ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT selects ARCH_HAS_ELF_RANDOMIZE, use the generic version of arch_randomize_brk since it also fits. Note that this commit also removes the possibility for arm to have elf randomization and no MMU: without MMU, the security added by randomization is worth nothing. Note that it is safe to remove STACK_RND_MASK since it matches the default value. Link: http://lkml.kernel.org/r/20190730055113.23635-9-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Acked-by: Kees Cook Reviewed-by: Luis Chamberlain Cc: Albert Ou Cc: Alexander Viro Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Christoph Hellwig Cc: James Hogan Cc: Palmer Dabbelt Cc: Paul Burton Cc: Ralf Baechle Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/Kconfig | 1 + arch/arm/include/asm/processor.h | 2 -- arch/arm/kernel/process.c | 5 --- arch/arm/mm/mmap.c | 62 -------------------------------- 4 files changed, 1 insertion(+), 69 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 229f2cdd81ca..8a50efb559f3 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -34,6 +34,7 @@ config ARM select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF + select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU select ARCH_WANT_IPC_PARSE_VERSION select BINFMT_FLAT_ARGVP_ENVP_ON_STACK select BUILDTIME_EXTABLE_SORT if MMU diff --git a/arch/arm/include/asm/processor.h b/arch/arm/include/asm/processor.h index 20c2f42454b8..614bf829e454 100644 --- a/arch/arm/include/asm/processor.h +++ b/arch/arm/include/asm/processor.h @@ -140,8 +140,6 @@ static inline void prefetchw(const void *ptr) #endif #endif -#define HAVE_ARCH_PICK_MMAP_LAYOUT - #endif #endif /* __ASM_ARM_PROCESSOR_H */ diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index f934a6739fc0..9485acc520a4 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -319,11 +319,6 @@ unsigned long get_wchan(struct task_struct *p) return 0; } -unsigned long arch_randomize_brk(struct mm_struct *mm) -{ - return randomize_page(mm->brk, 0x02000000); -} - #ifdef CONFIG_MMU #ifdef CONFIG_KUSER_HELPERS /* diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c index 0b94b674aa91..b8d912ac9e61 100644 --- a/arch/arm/mm/mmap.c +++ b/arch/arm/mm/mmap.c @@ -17,43 +17,6 @@ ((((addr)+SHMLBA-1)&~(SHMLBA-1)) + \ (((pgoff)<> (PAGE_SHIFT - 12)) - -static int mmap_is_legacy(struct rlimit *rlim_stack) -{ - if (current->personality & ADDR_COMPAT_LAYOUT) - return 1; - - if (rlim_stack->rlim_cur == RLIM_INFINITY) - return 1; - - return sysctl_legacy_va_layout; -} - -static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) -{ - unsigned long gap = rlim_stack->rlim_cur; - unsigned long pad = stack_guard_gap; - - /* Account for stack randomization if necessary */ - if (current->flags & PF_RANDOMIZE) - pad += (STACK_RND_MASK << PAGE_SHIFT); - - /* Values close to RLIM_INFINITY can overflow. */ - if (gap + pad > gap) - gap += pad; - - if (gap < MIN_GAP) - gap = MIN_GAP; - else if (gap > MAX_GAP) - gap = MAX_GAP; - - return PAGE_ALIGN(STACK_TOP - gap - rnd); -} - /* * We need to ensure that shared mappings are correctly aligned to * avoid aliasing issues with VIPT caches. We need to ensure that @@ -181,31 +144,6 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, return addr; } -unsigned long arch_mmap_rnd(void) -{ - unsigned long rnd; - - rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1); - - return rnd << PAGE_SHIFT; -} - -void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) -{ - unsigned long random_factor = 0UL; - - if (current->flags & PF_RANDOMIZE) - random_factor = arch_mmap_rnd(); - - if (mmap_is_legacy(rlim_stack)) { - mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; - mm->get_unmapped_area = arch_get_unmapped_area; - } else { - mm->mmap_base = mmap_base(random_factor, rlim_stack); - mm->get_unmapped_area = arch_get_unmapped_area_topdown; - } -} - /* * You really shouldn't be using read() or write() on /dev/mem. This * might go away in the future. From b1f61b5bde3a1f50392c97b4c8513d1b8efb1cf2 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 23 Sep 2019 15:39:04 -0700 Subject: [PATCH 624/721] mips: properly account for stack randomization and stack guard gap This commit takes care of stack randomization and stack guard gap when computing mmap base address and checks if the task asked for randomization. This fixes the problem uncovered and not fixed for arm here: https://lkml.kernel.org/r/20170622200033.25714-1-riel@redhat.com Link: http://lkml.kernel.org/r/20190730055113.23635-10-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Acked-by: Kees Cook Acked-by: Paul Burton Reviewed-by: Luis Chamberlain Cc: Albert Ou Cc: Alexander Viro Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Christoph Hellwig Cc: James Hogan Cc: Palmer Dabbelt Cc: Ralf Baechle Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/mm/mmap.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c index d79f2b432318..f5c778113384 100644 --- a/arch/mips/mm/mmap.c +++ b/arch/mips/mm/mmap.c @@ -21,8 +21,9 @@ unsigned long shm_align_mask = PAGE_SIZE - 1; /* Sane caches */ EXPORT_SYMBOL(shm_align_mask); /* gap between mmap and stack */ -#define MIN_GAP (128*1024*1024UL) -#define MAX_GAP ((TASK_SIZE)/6*5) +#define MIN_GAP (128*1024*1024UL) +#define MAX_GAP ((TASK_SIZE)/6*5) +#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) static int mmap_is_legacy(struct rlimit *rlim_stack) { @@ -38,6 +39,15 @@ static int mmap_is_legacy(struct rlimit *rlim_stack) static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) { unsigned long gap = rlim_stack->rlim_cur; + unsigned long pad = stack_guard_gap; + + /* Account for stack randomization if necessary */ + if (current->flags & PF_RANDOMIZE) + pad += (STACK_RND_MASK << PAGE_SHIFT); + + /* Values close to RLIM_INFINITY can overflow. */ + if (gap + pad > gap) + gap += pad; if (gap < MIN_GAP) gap = MIN_GAP; From b5fb861790bf54486b68644fc27d6969bf772dd8 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 23 Sep 2019 15:39:07 -0700 Subject: [PATCH 625/721] mips: use STACK_TOP when computing mmap base address mmap base address must be computed wrt stack top address, using TASK_SIZE is wrong since STACK_TOP and TASK_SIZE are not equivalent. Link: http://lkml.kernel.org/r/20190730055113.23635-11-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Acked-by: Kees Cook Acked-by: Paul Burton Reviewed-by: Luis Chamberlain Cc: Albert Ou Cc: Alexander Viro Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Christoph Hellwig Cc: James Hogan Cc: Palmer Dabbelt Cc: Ralf Baechle Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/mm/mmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c index f5c778113384..a7e84b2e71d7 100644 --- a/arch/mips/mm/mmap.c +++ b/arch/mips/mm/mmap.c @@ -22,7 +22,7 @@ EXPORT_SYMBOL(shm_align_mask); /* gap between mmap and stack */ #define MIN_GAP (128*1024*1024UL) -#define MAX_GAP ((TASK_SIZE)/6*5) +#define MAX_GAP ((STACK_TOP)/6*5) #define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) static int mmap_is_legacy(struct rlimit *rlim_stack) @@ -54,7 +54,7 @@ static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) else if (gap > MAX_GAP) gap = MAX_GAP; - return PAGE_ALIGN(TASK_SIZE - gap - rnd); + return PAGE_ALIGN(STACK_TOP - gap - rnd); } #define COLOUR_ALIGN(addr, pgoff) \ From e548599fbe310754aa8f687d53c24d9cb5338ac4 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 23 Sep 2019 15:39:11 -0700 Subject: [PATCH 626/721] mips: adjust brk randomization offset to fit generic version This commit simply bumps up to 32MB and 1GB the random offset of brk, compared to 8MB and 256MB, for 32bit and 64bit respectively. Link: http://lkml.kernel.org/r/20190730055113.23635-12-alex@ghiti.fr Suggested-by: Kees Cook Signed-off-by: Alexandre Ghiti Acked-by: Paul Burton Reviewed-by: Kees Cook Reviewed-by: Luis Chamberlain Cc: Albert Ou Cc: Alexander Viro Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Christoph Hellwig Cc: James Hogan Cc: Palmer Dabbelt Cc: Ralf Baechle Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/mm/mmap.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c index a7e84b2e71d7..ff6ab87e9c56 100644 --- a/arch/mips/mm/mmap.c +++ b/arch/mips/mm/mmap.c @@ -16,6 +16,7 @@ #include #include #include +#include unsigned long shm_align_mask = PAGE_SIZE - 1; /* Sane caches */ EXPORT_SYMBOL(shm_align_mask); @@ -189,11 +190,11 @@ static inline unsigned long brk_rnd(void) unsigned long rnd = get_random_long(); rnd = rnd << PAGE_SHIFT; - /* 8MB for 32bit, 256MB for 64bit */ + /* 32MB for 32bit, 1GB for 64bit */ if (TASK_IS_32BIT_ADDR) - rnd = rnd & 0x7ffffful; + rnd = rnd & (SZ_32M - 1); else - rnd = rnd & 0xffffffful; + rnd = rnd & (SZ_1G - 1); return rnd; } From 09036468c8d074b730a840657a896f81c1c92017 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 23 Sep 2019 15:39:14 -0700 Subject: [PATCH 627/721] mips: replace arch specific way to determine 32bit task with generic version Mips uses TASK_IS_32BIT_ADDR to determine if a task is 32bit, but this define is mips specific and other arches do not have it: instead, use !IS_ENABLED(CONFIG_64BIT) || is_compat_task() condition. Link: http://lkml.kernel.org/r/20190730055113.23635-13-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Acked-by: Paul Burton Reviewed-by: Kees Cook Reviewed-by: Luis Chamberlain Cc: Albert Ou Cc: Alexander Viro Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Christoph Hellwig Cc: James Hogan Cc: Palmer Dabbelt Cc: Ralf Baechle Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/mm/mmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c index ff6ab87e9c56..d5106c26ac6a 100644 --- a/arch/mips/mm/mmap.c +++ b/arch/mips/mm/mmap.c @@ -17,6 +17,7 @@ #include #include #include +#include unsigned long shm_align_mask = PAGE_SIZE - 1; /* Sane caches */ EXPORT_SYMBOL(shm_align_mask); @@ -191,7 +192,7 @@ static inline unsigned long brk_rnd(void) rnd = rnd << PAGE_SHIFT; /* 32MB for 32bit, 1GB for 64bit */ - if (TASK_IS_32BIT_ADDR) + if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task()) rnd = rnd & (SZ_32M - 1); else rnd = rnd & (SZ_1G - 1); From 9035bd29427921cd32d268a830aff78dcafb945b Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 23 Sep 2019 15:39:18 -0700 Subject: [PATCH 628/721] mips: use generic mmap top-down layout and brk randomization mips uses a top-down layout by default that exactly fits the generic functions, so get rid of arch specific code and use the generic version by selecting ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT. As ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT selects ARCH_HAS_ELF_RANDOMIZE, use the generic version of arch_randomize_brk since it also fits. Note that this commit also removes the possibility for mips to have elf randomization and no MMU: without MMU, the security added by randomization is worth nothing. Link: http://lkml.kernel.org/r/20190730055113.23635-14-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Acked-by: Paul Burton Reviewed-by: Kees Cook Reviewed-by: Luis Chamberlain Cc: Albert Ou Cc: Alexander Viro Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Christoph Hellwig Cc: James Hogan Cc: Palmer Dabbelt Cc: Ralf Baechle Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/Kconfig | 2 +- arch/mips/include/asm/processor.h | 5 -- arch/mips/mm/mmap.c | 96 ------------------------------- 3 files changed, 1 insertion(+), 102 deletions(-) diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index cc8e2b1032a5..a0bd9bdb5f83 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -5,7 +5,6 @@ config MIPS select ARCH_32BIT_OFF_T if !64BIT select ARCH_BINFMT_ELF_STATE if MIPS_FP_SUPPORT select ARCH_CLOCKSOURCE_DATA - select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UBSAN_SANITIZE_ALL select ARCH_SUPPORTS_UPROBES @@ -13,6 +12,7 @@ config MIPS select ARCH_USE_CMPXCHG_LOCKREF if 64BIT select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS + select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU select ARCH_WANT_IPC_PARSE_VERSION select BUILDTIME_EXTABLE_SORT select CLONE_BACKWARDS diff --git a/arch/mips/include/asm/processor.h b/arch/mips/include/asm/processor.h index aca909bd7841..fba18d4a9190 100644 --- a/arch/mips/include/asm/processor.h +++ b/arch/mips/include/asm/processor.h @@ -29,11 +29,6 @@ extern unsigned int vced_count, vcei_count; -/* - * MIPS does have an arch_pick_mmap_layout() - */ -#define HAVE_ARCH_PICK_MMAP_LAYOUT 1 - #ifdef CONFIG_32BIT #ifdef CONFIG_KVM_GUEST /* User space process size is limited to 1GB in KVM Guest Mode */ diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c index d5106c26ac6a..00fe90c6db3e 100644 --- a/arch/mips/mm/mmap.c +++ b/arch/mips/mm/mmap.c @@ -16,49 +16,10 @@ #include #include #include -#include -#include unsigned long shm_align_mask = PAGE_SIZE - 1; /* Sane caches */ EXPORT_SYMBOL(shm_align_mask); -/* gap between mmap and stack */ -#define MIN_GAP (128*1024*1024UL) -#define MAX_GAP ((STACK_TOP)/6*5) -#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) - -static int mmap_is_legacy(struct rlimit *rlim_stack) -{ - if (current->personality & ADDR_COMPAT_LAYOUT) - return 1; - - if (rlim_stack->rlim_cur == RLIM_INFINITY) - return 1; - - return sysctl_legacy_va_layout; -} - -static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) -{ - unsigned long gap = rlim_stack->rlim_cur; - unsigned long pad = stack_guard_gap; - - /* Account for stack randomization if necessary */ - if (current->flags & PF_RANDOMIZE) - pad += (STACK_RND_MASK << PAGE_SHIFT); - - /* Values close to RLIM_INFINITY can overflow. */ - if (gap + pad > gap) - gap += pad; - - if (gap < MIN_GAP) - gap = MIN_GAP; - else if (gap > MAX_GAP) - gap = MAX_GAP; - - return PAGE_ALIGN(STACK_TOP - gap - rnd); -} - #define COLOUR_ALIGN(addr, pgoff) \ ((((addr) + shm_align_mask) & ~shm_align_mask) + \ (((pgoff) << PAGE_SHIFT) & shm_align_mask)) @@ -156,63 +117,6 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp, addr0, len, pgoff, flags, DOWN); } -unsigned long arch_mmap_rnd(void) -{ - unsigned long rnd; - -#ifdef CONFIG_COMPAT - if (TASK_IS_32BIT_ADDR) - rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1); - else -#endif /* CONFIG_COMPAT */ - rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1); - - return rnd << PAGE_SHIFT; -} - -void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) -{ - unsigned long random_factor = 0UL; - - if (current->flags & PF_RANDOMIZE) - random_factor = arch_mmap_rnd(); - - if (mmap_is_legacy(rlim_stack)) { - mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; - mm->get_unmapped_area = arch_get_unmapped_area; - } else { - mm->mmap_base = mmap_base(random_factor, rlim_stack); - mm->get_unmapped_area = arch_get_unmapped_area_topdown; - } -} - -static inline unsigned long brk_rnd(void) -{ - unsigned long rnd = get_random_long(); - - rnd = rnd << PAGE_SHIFT; - /* 32MB for 32bit, 1GB for 64bit */ - if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task()) - rnd = rnd & (SZ_32M - 1); - else - rnd = rnd & (SZ_1G - 1); - - return rnd; -} - -unsigned long arch_randomize_brk(struct mm_struct *mm) -{ - unsigned long base = mm->brk; - unsigned long ret; - - ret = PAGE_ALIGN(base + brk_rnd()); - - if (ret < mm->brk) - return mm->brk; - - return ret; -} - bool __virt_addr_valid(const volatile void *kaddr) { unsigned long vaddr = (unsigned long)kaddr; From 54c95a11cc1b5e1d578209e027adf5775395dafd Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 23 Sep 2019 15:39:21 -0700 Subject: [PATCH 629/721] riscv: make mmap allocation top-down by default In order to avoid wasting user address space by using bottom-up mmap allocation scheme, prefer top-down scheme when possible. Before: root@qemuriscv64:~# cat /proc/self/maps 00010000-00016000 r-xp 00000000 fe:00 6389 /bin/cat.coreutils 00016000-00017000 r--p 00005000 fe:00 6389 /bin/cat.coreutils 00017000-00018000 rw-p 00006000 fe:00 6389 /bin/cat.coreutils 00018000-00039000 rw-p 00000000 00:00 0 [heap] 1555556000-155556d000 r-xp 00000000 fe:00 7193 /lib/ld-2.28.so 155556d000-155556e000 r--p 00016000 fe:00 7193 /lib/ld-2.28.so 155556e000-155556f000 rw-p 00017000 fe:00 7193 /lib/ld-2.28.so 155556f000-1555570000 rw-p 00000000 00:00 0 1555570000-1555572000 r-xp 00000000 00:00 0 [vdso] 1555574000-1555576000 rw-p 00000000 00:00 0 1555576000-1555674000 r-xp 00000000 fe:00 7187 /lib/libc-2.28.so 1555674000-1555678000 r--p 000fd000 fe:00 7187 /lib/libc-2.28.so 1555678000-155567a000 rw-p 00101000 fe:00 7187 /lib/libc-2.28.so 155567a000-15556a0000 rw-p 00000000 00:00 0 3fffb90000-3fffbb1000 rw-p 00000000 00:00 0 [stack] After: root@qemuriscv64:~# cat /proc/self/maps 00010000-00016000 r-xp 00000000 fe:00 6389 /bin/cat.coreutils 00016000-00017000 r--p 00005000 fe:00 6389 /bin/cat.coreutils 00017000-00018000 rw-p 00006000 fe:00 6389 /bin/cat.coreutils 2de81000-2dea2000 rw-p 00000000 00:00 0 [heap] 3ff7eb6000-3ff7ed8000 rw-p 00000000 00:00 0 3ff7ed8000-3ff7fd6000 r-xp 00000000 fe:00 7187 /lib/libc-2.28.so 3ff7fd6000-3ff7fda000 r--p 000fd000 fe:00 7187 /lib/libc-2.28.so 3ff7fda000-3ff7fdc000 rw-p 00101000 fe:00 7187 /lib/libc-2.28.so 3ff7fdc000-3ff7fe2000 rw-p 00000000 00:00 0 3ff7fe4000-3ff7fe6000 r-xp 00000000 00:00 0 [vdso] 3ff7fe6000-3ff7ffd000 r-xp 00000000 fe:00 7193 /lib/ld-2.28.so 3ff7ffd000-3ff7ffe000 r--p 00016000 fe:00 7193 /lib/ld-2.28.so 3ff7ffe000-3ff7fff000 rw-p 00017000 fe:00 7193 /lib/ld-2.28.so 3ff7fff000-3ff8000000 rw-p 00000000 00:00 0 3fff888000-3fff8a9000 rw-p 00000000 00:00 0 [stack] [alex@ghiti.fr: v6] Link: http://lkml.kernel.org/r/20190808061756.19712-15-alex@ghiti.fr Link: http://lkml.kernel.org/r/20190730055113.23635-15-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Reviewed-by: Christoph Hellwig Reviewed-by: Kees Cook Reviewed-by: Luis Chamberlain Acked-by: Paul Walmsley [arch/riscv] Cc: Albert Ou Cc: Alexander Viro Cc: Catalin Marinas Cc: Christoph Hellwig Cc: James Hogan Cc: Palmer Dabbelt Cc: Paul Burton Cc: Ralf Baechle Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/riscv/Kconfig | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 71d29fb4008a..8eebbc8860bb 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -59,6 +59,18 @@ config RISCV select ARCH_HAS_GIGANTIC_PAGE select ARCH_WANT_HUGE_PMD_SHARE if 64BIT select SPARSEMEM_STATIC if 32BIT + select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU + select HAVE_ARCH_MMAP_RND_BITS + +config ARCH_MMAP_RND_BITS_MIN + default 18 if 64BIT + default 8 + +# max bits determined by the following formula: +# VA_BITS - PAGE_SHIFT - 3 +config ARCH_MMAP_RND_BITS_MAX + default 24 if 64BIT # SV39 based + default 17 config MMU def_bool y From 73848a9711105836346462e1d3c6b5765b452de1 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 23 Sep 2019 15:39:25 -0700 Subject: [PATCH 630/721] mm/mmap.c: refine find_vma_prev() with rb_last() When addr is out of range of the whole rb_tree, pprev will point to the right-most node. rb_tree facility already provides a helper function, rb_last(), to do this task. We can leverage this instead of reimplementing it. This patch refines find_vma_prev() with rb_last() to make it a little nicer to read. [akpm@linux-foundation.org: little cleanup, per Vlastimil] Link: http://lkml.kernel.org/r/20190809001928.4950-1-richardw.yang@linux.intel.com Signed-off-by: Wei Yang Acked-by: Vlastimil Babka Cc: Michal Hocko Cc: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 6bc21fca20bc..e4a8f67aad62 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2274,12 +2274,9 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr, if (vma) { *pprev = vma->vm_prev; } else { - struct rb_node *rb_node = mm->mm_rb.rb_node; - *pprev = NULL; - while (rb_node) { - *pprev = rb_entry(rb_node, struct vm_area_struct, vm_rb); - rb_node = rb_node->rb_right; - } + struct rb_node *rb_node = rb_last(&mm->mm_rb); + + *pprev = rb_node ? rb_entry(rb_node, struct vm_area_struct, vm_rb) : NULL; } return vma; } From 76f34950779f3d7847c94615232dff2cdc2f9844 Mon Sep 17 00:00:00 2001 From: Ivan Khoronzhuk Date: Mon, 23 Sep 2019 15:39:28 -0700 Subject: [PATCH 631/721] mm: mmap: increase sockets maximum memory size pgoff for 32bits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The AF_XDP sockets umem mapping interface uses XDP_UMEM_PGOFF_FILL_RING and XDP_UMEM_PGOFF_COMPLETION_RING offsets. These offsets are established already and are part of the configuration interface. But for 32-bit systems, using AF_XDP socket configuration, these values are too large to pass the maximum allowed file size verification. The offsets can be tuned off, but instead of changing the existing interface, let's extend the max allowed file size for sockets. No one has been using this until this patch with 32 bits as without this fix af_xdp sockets can't be used at all, so it unblocks af_xdp socket usage for 32bit systems. All list of mmap cbs for sockets was verified for side effects and all of them contain dummy cb - sock_no_mmap() at this moment, except the following: xsk_mmap() - it's what this fix is needed for. tcp_mmap() - doesn't have obvious issues with pgoff - no any references on it. packet_mmap() - return -EINVAL if it's even set. Link: http://lkml.kernel.org/r/20190812124326.32146-1-ivan.khoronzhuk@linaro.org Signed-off-by: Ivan Khoronzhuk Reviewed-by: Andrew Morton Cc: Björn Töpel Cc: Alexei Starovoitov Cc: Magnus Karlsson Cc: Daniel Borkmann Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/mmap.c b/mm/mmap.c index e4a8f67aad62..f1e8c7f93e04 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1358,6 +1358,9 @@ static inline u64 file_mmap_size_max(struct file *file, struct inode *inode) if (S_ISBLK(inode->i_mode)) return MAX_LFS_FILESIZE; + if (S_ISSOCK(inode->i_mode)) + return MAX_LFS_FILESIZE; + /* Special "we do even unsigned file positions" case */ if (file->f_mode & FMODE_UNSIGNED_OFFSET) return 0; From f3bc0dba3154fa7e4a728e0be8f37bd5fb9100fe Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 23 Sep 2019 15:39:31 -0700 Subject: [PATCH 632/721] mm/madvise: reduce code duplication in error handling paths madvise_behavior() converts -ENOMEM to -EAGAIN in several places using identical code. Move that code to a common error handling path. No functional changes. Link: http://lkml.kernel.org/r/1564640896-1210-1-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Acked-by: Pankaj Gupta Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/madvise.c | 52 ++++++++++++++++------------------------------------ 1 file changed, 16 insertions(+), 36 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 88babcc384b9..68ab988ad433 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -107,28 +107,14 @@ static long madvise_behavior(struct vm_area_struct *vma, case MADV_MERGEABLE: case MADV_UNMERGEABLE: error = ksm_madvise(vma, start, end, behavior, &new_flags); - if (error) { - /* - * madvise() returns EAGAIN if kernel resources, such as - * slab, are temporarily unavailable. - */ - if (error == -ENOMEM) - error = -EAGAIN; - goto out; - } + if (error) + goto out_convert_errno; break; case MADV_HUGEPAGE: case MADV_NOHUGEPAGE: error = hugepage_madvise(vma, &new_flags, behavior); - if (error) { - /* - * madvise() returns EAGAIN if kernel resources, such as - * slab, are temporarily unavailable. - */ - if (error == -ENOMEM) - error = -EAGAIN; - goto out; - } + if (error) + goto out_convert_errno; break; } @@ -154,15 +140,8 @@ static long madvise_behavior(struct vm_area_struct *vma, goto out; } error = __split_vma(mm, vma, start, 1); - if (error) { - /* - * madvise() returns EAGAIN if kernel resources, such as - * slab, are temporarily unavailable. - */ - if (error == -ENOMEM) - error = -EAGAIN; - goto out; - } + if (error) + goto out_convert_errno; } if (end != vma->vm_end) { @@ -171,15 +150,8 @@ static long madvise_behavior(struct vm_area_struct *vma, goto out; } error = __split_vma(mm, vma, end, 0); - if (error) { - /* - * madvise() returns EAGAIN if kernel resources, such as - * slab, are temporarily unavailable. - */ - if (error == -ENOMEM) - error = -EAGAIN; - goto out; - } + if (error) + goto out_convert_errno; } success: @@ -187,6 +159,14 @@ static long madvise_behavior(struct vm_area_struct *vma, * vm_flags is protected by the mmap_sem held in write mode. */ vma->vm_flags = new_flags; + +out_convert_errno: + /* + * madvise() returns EAGAIN if kernel resources, such as + * slab, are temporarily unavailable. + */ + if (error == -ENOMEM) + error = -EAGAIN; out: return error; } From 28eb3c80871951cae59b9c1b7385262e5d045ac3 Mon Sep 17 00:00:00 2001 From: Miles Chen Date: Mon, 23 Sep 2019 15:39:34 -0700 Subject: [PATCH 633/721] shmem: fix obsolete comment in shmem_getpage_gfp() Replace "fault_mm" with "vmf" in code comment because commit cfda05267f7b ("userfaultfd: shmem: add userfaultfd hook for shared memory faults") has changed the prototpye of shmem_getpage_gfp() - pass vmf instead of fault_mm to the function. Before: static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp, gfp_t gfp, struct mm_struct *fault_mm, int *fault_type); After: static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp, gfp_t gfp, struct vm_area_struct *vma, struct vm_fault *vmf, vm_fault_t *fault_type); Link: http://lkml.kernel.org/r/20190816100204.9781-1-miles.chen@mediatek.com Signed-off-by: Miles Chen Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/shmem.c b/mm/shmem.c index 57a6aedf6649..30ce722c23fa 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1734,7 +1734,7 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index, * vm. If we swap it in we mark it dirty since we also free the swap * entry since a page cannot live in both the swap and page cache. * - * fault_mm and fault_type are only supplied by shmem_fault: + * vmf and fault_type are only supplied by shmem_fault: * otherwise they are NULL. */ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, From c165f25d23ecb2f9f121ced20435415b931219e2 Mon Sep 17 00:00:00 2001 From: Hui Zhu Date: Mon, 23 Sep 2019 15:39:37 -0700 Subject: [PATCH 634/721] zpool: add malloc_support_movable to zpool_driver As a zpool_driver, zsmalloc can allocate movable memory because it support migate pages. But zbud and z3fold cannot allocate movable memory. Add malloc_support_movable to zpool_driver. If a zpool_driver support allocate movable memory, set it to true. And add zpool_malloc_support_movable check malloc_support_movable to make sure if a zpool support allocate movable memory. Link: http://lkml.kernel.org/r/20190605100630.13293-1-teawaterz@linux.alibaba.com Signed-off-by: Hui Zhu Reviewed-by: Shakeel Butt Cc: Dan Streetman Cc: Minchan Kim Cc: Nitin Gupta Cc: Sergey Senozhatsky Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/zpool.h | 3 +++ mm/zpool.c | 16 ++++++++++++++++ mm/zsmalloc.c | 19 ++++++++++--------- 3 files changed, 29 insertions(+), 9 deletions(-) diff --git a/include/linux/zpool.h b/include/linux/zpool.h index 7238865e75b0..51bf43076165 100644 --- a/include/linux/zpool.h +++ b/include/linux/zpool.h @@ -46,6 +46,8 @@ const char *zpool_get_type(struct zpool *pool); void zpool_destroy_pool(struct zpool *pool); +bool zpool_malloc_support_movable(struct zpool *pool); + int zpool_malloc(struct zpool *pool, size_t size, gfp_t gfp, unsigned long *handle); @@ -90,6 +92,7 @@ struct zpool_driver { struct zpool *zpool); void (*destroy)(void *pool); + bool malloc_support_movable; int (*malloc)(void *pool, size_t size, gfp_t gfp, unsigned long *handle); void (*free)(void *pool, unsigned long handle); diff --git a/mm/zpool.c b/mm/zpool.c index a2dd9107857d..863669212070 100644 --- a/mm/zpool.c +++ b/mm/zpool.c @@ -238,6 +238,22 @@ const char *zpool_get_type(struct zpool *zpool) return zpool->driver->type; } +/** + * zpool_malloc_support_movable() - Check if the zpool support + * allocate movable memory + * @zpool: The zpool to check + * + * This returns if the zpool support allocate movable memory. + * + * Implementations must guarantee this to be thread-safe. + * + * Returns: true if if the zpool support allocate movable memory, false if not + */ +bool zpool_malloc_support_movable(struct zpool *zpool) +{ + return zpool->driver->malloc_support_movable; +} + /** * zpool_malloc() - Allocate memory * @zpool: The zpool to allocate from. diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index e98bb6ab4f7e..646858efd468 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -443,15 +443,16 @@ static u64 zs_zpool_total_size(void *pool) } static struct zpool_driver zs_zpool_driver = { - .type = "zsmalloc", - .owner = THIS_MODULE, - .create = zs_zpool_create, - .destroy = zs_zpool_destroy, - .malloc = zs_zpool_malloc, - .free = zs_zpool_free, - .map = zs_zpool_map, - .unmap = zs_zpool_unmap, - .total_size = zs_zpool_total_size, + .type = "zsmalloc", + .owner = THIS_MODULE, + .create = zs_zpool_create, + .destroy = zs_zpool_destroy, + .malloc_support_movable = true, + .malloc = zs_zpool_malloc, + .free = zs_zpool_free, + .map = zs_zpool_map, + .unmap = zs_zpool_unmap, + .total_size = zs_zpool_total_size, }; MODULE_ALIAS("zpool-zsmalloc"); From d2fcd82bb83aab47c6d63aa8c960cd5edb578065 Mon Sep 17 00:00:00 2001 From: Hui Zhu Date: Mon, 23 Sep 2019 15:39:40 -0700 Subject: [PATCH 635/721] zswap: use movable memory if zpool support allocate movable memory This is the third version that was updated according to the comments from Sergey Senozhatsky https://lkml.org/lkml/2019/5/29/73 and Shakeel Butt https://lkml.org/lkml/2019/6/4/973 zswap compresses swap pages into a dynamically allocated RAM-based memory pool. The memory pool should be zbud, z3fold or zsmalloc. All of them will allocate unmovable pages. It will increase the number of unmovable page blocks that will bad for anti-fragment. zsmalloc support page migration if request movable page: handle = zs_malloc(zram->mem_pool, comp_len, GFP_NOIO | __GFP_HIGHMEM | __GFP_MOVABLE); And commit "zpool: Add malloc_support_movable to zpool_driver" add zpool_malloc_support_movable check malloc_support_movable to make sure if a zpool support allocate movable memory. This commit let zswap allocate block with gfp __GFP_HIGHMEM | __GFP_MOVABLE if zpool support allocate movable memory. Following part is test log in a pc that has 8G memory and 2G swap. Without this commit: ~# echo lz4 > /sys/module/zswap/parameters/compressor ~# echo zsmalloc > /sys/module/zswap/parameters/zpool ~# echo 1 > /sys/module/zswap/parameters/enabled ~# swapon /swapfile ~# cd /home/teawater/kernel/vm-scalability/ /home/teawater/kernel/vm-scalability# export unit_size=$((9 * 1024 * 1024 * 1024)) /home/teawater/kernel/vm-scalability# ./case-anon-w-seq 2717908992 bytes / 4826062 usecs = 549973 KB/s 2717908992 bytes / 4864201 usecs = 545661 KB/s 2717908992 bytes / 4867015 usecs = 545346 KB/s 2717908992 bytes / 4915485 usecs = 539968 KB/s 397853 usecs to free memory 357820 usecs to free memory 421333 usecs to free memory 420454 usecs to free memory /home/teawater/kernel/vm-scalability# cat /proc/pagetypeinfo Page block order: 9 Pages per block: 512 Free pages count per migrate type at order 0 1 2 3 4 5 6 7 8 9 10 Node 0, zone DMA, type Unmovable 1 1 1 0 2 1 1 0 1 0 0 Node 0, zone DMA, type Movable 0 0 0 0 0 0 0 0 0 1 3 Node 0, zone DMA, type Reclaimable 0 0 0 0 0 0 0 0 0 0 0 Node 0, zone DMA, type HighAtomic 0 0 0 0 0 0 0 0 0 0 0 Node 0, zone DMA, type CMA 0 0 0 0 0 0 0 0 0 0 0 Node 0, zone DMA, type Isolate 0 0 0 0 0 0 0 0 0 0 0 Node 0, zone DMA32, type Unmovable 6 5 8 6 6 5 4 1 1 1 0 Node 0, zone DMA32, type Movable 25 20 20 19 22 15 14 11 11 5 767 Node 0, zone DMA32, type Reclaimable 0 0 0 0 0 0 0 0 0 0 0 Node 0, zone DMA32, type HighAtomic 0 0 0 0 0 0 0 0 0 0 0 Node 0, zone DMA32, type CMA 0 0 0 0 0 0 0 0 0 0 0 Node 0, zone DMA32, type Isolate 0 0 0 0 0 0 0 0 0 0 0 Node 0, zone Normal, type Unmovable 4753 5588 5159 4613 3712 2520 1448 594 188 11 0 Node 0, zone Normal, type Movable 16 3 457 2648 2143 1435 860 459 223 224 296 Node 0, zone Normal, type Reclaimable 0 0 44 38 11 2 0 0 0 0 0 Node 0, zone Normal, type HighAtomic 0 0 0 0 0 0 0 0 0 0 0 Node 0, zone Normal, type CMA 0 0 0 0 0 0 0 0 0 0 0 Node 0, zone Normal, type Isolate 0 0 0 0 0 0 0 0 0 0 0 Number of blocks type Unmovable Movable Reclaimable HighAtomic CMA Isolate Node 0, zone DMA 1 7 0 0 0 0 Node 0, zone DMA32 4 1652 0 0 0 0 Node 0, zone Normal 931 1485 15 0 0 0 With this commit: ~# echo lz4 > /sys/module/zswap/parameters/compressor ~# echo zsmalloc > /sys/module/zswap/parameters/zpool ~# echo 1 > /sys/module/zswap/parameters/enabled ~# swapon /swapfile ~# cd /home/teawater/kernel/vm-scalability/ /home/teawater/kernel/vm-scalability# export unit_size=$((9 * 1024 * 1024 * 1024)) /home/teawater/kernel/vm-scalability# ./case-anon-w-seq 2717908992 bytes / 4689240 usecs = 566020 KB/s 2717908992 bytes / 4760605 usecs = 557535 KB/s 2717908992 bytes / 4803621 usecs = 552543 KB/s 2717908992 bytes / 5069828 usecs = 523530 KB/s 431546 usecs to free memory 383397 usecs to free memory 456454 usecs to free memory 224487 usecs to free memory /home/teawater/kernel/vm-scalability# cat /proc/pagetypeinfo Page block order: 9 Pages per block: 512 Free pages count per migrate type at order 0 1 2 3 4 5 6 7 8 9 10 Node 0, zone DMA, type Unmovable 1 1 1 0 2 1 1 0 1 0 0 Node 0, zone DMA, type Movable 0 0 0 0 0 0 0 0 0 1 3 Node 0, zone DMA, type Reclaimable 0 0 0 0 0 0 0 0 0 0 0 Node 0, zone DMA, type HighAtomic 0 0 0 0 0 0 0 0 0 0 0 Node 0, zone DMA, type CMA 0 0 0 0 0 0 0 0 0 0 0 Node 0, zone DMA, type Isolate 0 0 0 0 0 0 0 0 0 0 0 Node 0, zone DMA32, type Unmovable 10 8 10 9 10 4 3 2 3 0 0 Node 0, zone DMA32, type Movable 18 12 14 16 16 11 9 5 5 6 775 Node 0, zone DMA32, type Reclaimable 0 0 0 0 0 0 0 0 0 0 1 Node 0, zone DMA32, type HighAtomic 0 0 0 0 0 0 0 0 0 0 0 Node 0, zone DMA32, type CMA 0 0 0 0 0 0 0 0 0 0 0 Node 0, zone DMA32, type Isolate 0 0 0 0 0 0 0 0 0 0 0 Node 0, zone Normal, type Unmovable 2669 1236 452 118 37 14 4 1 2 3 0 Node 0, zone Normal, type Movable 3850 6086 5274 4327 3510 2494 1520 934 438 220 470 Node 0, zone Normal, type Reclaimable 56 93 155 124 47 31 17 7 3 0 0 Node 0, zone Normal, type HighAtomic 0 0 0 0 0 0 0 0 0 0 0 Node 0, zone Normal, type CMA 0 0 0 0 0 0 0 0 0 0 0 Node 0, zone Normal, type Isolate 0 0 0 0 0 0 0 0 0 0 0 Number of blocks type Unmovable Movable Reclaimable HighAtomic CMA Isolate Node 0, zone DMA 1 7 0 0 0 0 Node 0, zone DMA32 4 1650 2 0 0 0 Node 0, zone Normal 79 2326 26 0 0 0 You can see that the number of unmovable page blocks is decreased when the kernel has this commit. Link: http://lkml.kernel.org/r/20190605100630.13293-2-teawaterz@linux.alibaba.com Signed-off-by: Hui Zhu Reviewed-by: Shakeel Butt Cc: Dan Streetman Cc: Minchan Kim Cc: Nitin Gupta Cc: Sergey Senozhatsky Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zswap.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 0e22744a76cb..08b6cefae5d8 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -997,6 +997,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, char *buf; u8 *src, *dst; struct zswap_header zhdr = { .swpentry = swp_entry(type, offset) }; + gfp_t gfp; /* THP isn't supported */ if (PageTransHuge(page)) { @@ -1070,9 +1071,10 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, /* store */ hlen = zpool_evictable(entry->pool->zpool) ? sizeof(zhdr) : 0; - ret = zpool_malloc(entry->pool->zpool, hlen + dlen, - __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM, - &handle); + gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; + if (zpool_malloc_support_movable(entry->pool->zpool)) + gfp |= __GFP_HIGHMEM | __GFP_MOVABLE; + ret = zpool_malloc(entry->pool->zpool, hlen + dlen, gfp, &handle); if (ret == -ENOSPC) { zswap_reject_compress_poor++; goto put_dstmem; From 068619e32ff6229a09407d267e36ea7710b96ea1 Mon Sep 17 00:00:00 2001 From: Vitaly Wool Date: Mon, 23 Sep 2019 15:39:43 -0700 Subject: [PATCH 636/721] zswap: do not map same object twice zswap_writeback_entry() maps a handle to read swpentry first, and then in the most common case it would map the same handle again. This is ok when zbud is the backend since its mapping callback is plain and simple, but it slows things down for z3fold. Since there's hardly a point in unmapping a handle _that_ fast as zswap_writeback_entry() does when it reads swpentry, the suggestion is to keep the handle mapped till the end. Link: http://lkml.kernel.org/r/20190916004640.b453167d3556c4093af4cf7d@gmail.com Signed-off-by: Vitaly Wool Reviewed-by: Dan Streetman Cc: Shakeel Butt Cc: Minchan Kim Cc: Sergey Senozhatsky Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zswap.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 08b6cefae5d8..46a322316e52 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -856,7 +856,6 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) /* extract swpentry from data */ zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO); swpentry = zhdr->swpentry; /* here */ - zpool_unmap_handle(pool, handle); tree = zswap_trees[swp_type(swpentry)]; offset = swp_offset(swpentry); @@ -866,6 +865,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) if (!entry) { /* entry was invalidated */ spin_unlock(&tree->lock); + zpool_unmap_handle(pool, handle); return 0; } spin_unlock(&tree->lock); @@ -886,15 +886,13 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) case ZSWAP_SWAPCACHE_NEW: /* page is locked */ /* decompress */ dlen = PAGE_SIZE; - src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle, - ZPOOL_MM_RO) + sizeof(struct zswap_header); + src = (u8 *)zhdr + sizeof(struct zswap_header); dst = kmap_atomic(page); tfm = *get_cpu_ptr(entry->pool->tfm); ret = crypto_comp_decompress(tfm, src, entry->length, dst, &dlen); put_cpu_ptr(entry->pool->tfm); kunmap_atomic(dst); - zpool_unmap_handle(entry->pool->zpool, entry->handle); BUG_ON(ret); BUG_ON(dlen != PAGE_SIZE); @@ -940,6 +938,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) spin_unlock(&tree->lock); end: + zpool_unmap_handle(pool, handle); return ret; } From 2b38d01b4de8b1bbda7f5f7e91252609557635fc Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Mon, 23 Sep 2019 15:39:46 -0700 Subject: [PATCH 637/721] mm/zsmalloc.c: fix a -Wunused-function warning set_zspage_inuse() was introduced in the commit 4f42047bbde0 ("zsmalloc: use accessor") but all the users of it were removed later by the commits, bdb0af7ca8f0 ("zsmalloc: factor page chain functionality out") 3783689a1aa8 ("zsmalloc: introduce zspage structure") so the function can be safely removed now. Link: http://lkml.kernel.org/r/1568658408-19374-1-git-send-email-cai@lca.pw Signed-off-by: Qian Cai Reviewed-by: Andrew Morton Cc: Minchan Kim Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 646858efd468..2b2b9aae8a3c 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -477,10 +477,6 @@ static inline int get_zspage_inuse(struct zspage *zspage) return zspage->inuse; } -static inline void set_zspage_inuse(struct zspage *zspage, int val) -{ - zspage->inuse = val; -} static inline void mod_zspage_inuse(struct zspage *zspage, int val) { From 981c107cbb420ee028f8ecd155352cfd6351c246 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Tue, 10 Sep 2019 21:11:37 +0100 Subject: [PATCH 638/721] selftests/tpm2: Add the missing TEST_FILES assignment The Python files required by the selftests are not packaged because of the missing assignment to TEST_FILES. Add the assignment. Cc: stable@vger.kernel.org Fixes: 6ea3dfe1e073 ("selftests: add TPM 2.0 tests") Signed-off-by: Jarkko Sakkinen Reviewed-by: Petr Vorel --- tools/testing/selftests/tpm2/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/tpm2/Makefile b/tools/testing/selftests/tpm2/Makefile index 9dd848427a7b..bf401f725eef 100644 --- a/tools/testing/selftests/tpm2/Makefile +++ b/tools/testing/selftests/tpm2/Makefile @@ -2,3 +2,4 @@ include ../lib.mk TEST_PROGS := test_smoke.sh test_space.sh +TEST_FILES := tpm2.py tpm2_tests.py From 34cd83bb8a468b24d9a13adbfb5ad645c552b8e2 Mon Sep 17 00:00:00 2001 From: Petr Vorel Date: Wed, 11 Sep 2019 11:34:42 +0200 Subject: [PATCH 639/721] selftests/tpm2: Add log and *.pyc to .gitignore Fixes: 6ea3dfe1e073 ("selftests: add TPM 2.0 tests") Signed-off-by: Petr Vorel Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- tools/testing/selftests/.gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/.gitignore b/tools/testing/selftests/.gitignore index 8059ce834247..61df01cdf0b2 100644 --- a/tools/testing/selftests/.gitignore +++ b/tools/testing/selftests/.gitignore @@ -2,3 +2,5 @@ gpiogpio-event-mon gpiogpio-hammer gpioinclude/ gpiolsgpio +tpm2/SpaceTest.log +tpm2/*.pyc From 9f75c82246313d4c2a6bc77e947b45655b3b5ad5 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Fri, 13 Sep 2019 20:51:36 +0200 Subject: [PATCH 640/721] KEYS: trusted: correctly initialize digests and fix locking issue Commit 0b6cf6b97b7e ("tpm: pass an array of tpm_extend_digest structures to tpm_pcr_extend()") modifies tpm_pcr_extend() to accept a digest for each PCR bank. After modification, tpm_pcr_extend() expects that digests are passed in the same order as the algorithms set in chip->allocated_banks. This patch fixes two issues introduced in the last iterations of the patch set: missing initialization of the TPM algorithm ID in the tpm_digest structures passed to tpm_pcr_extend() by the trusted key module, and unreleased locks in the TPM driver due to returning from tpm_pcr_extend() without calling tpm_put_ops(). Cc: stable@vger.kernel.org Fixes: 0b6cf6b97b7e ("tpm: pass an array of tpm_extend_digest structures to tpm_pcr_extend()") Signed-off-by: Roberto Sassu Suggested-by: Jarkko Sakkinen Reviewed-by: Jerry Snitselaar Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm-interface.c | 14 +++++++++----- security/keys/trusted.c | 5 +++++ 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/char/tpm/tpm-interface.c b/drivers/char/tpm/tpm-interface.c index 1b4f95c13e00..d9ace5480665 100644 --- a/drivers/char/tpm/tpm-interface.c +++ b/drivers/char/tpm/tpm-interface.c @@ -320,18 +320,22 @@ int tpm_pcr_extend(struct tpm_chip *chip, u32 pcr_idx, if (!chip) return -ENODEV; - for (i = 0; i < chip->nr_allocated_banks; i++) - if (digests[i].alg_id != chip->allocated_banks[i].alg_id) - return -EINVAL; + for (i = 0; i < chip->nr_allocated_banks; i++) { + if (digests[i].alg_id != chip->allocated_banks[i].alg_id) { + rc = EINVAL; + goto out; + } + } if (chip->flags & TPM_CHIP_FLAG_TPM2) { rc = tpm2_pcr_extend(chip, pcr_idx, digests); - tpm_put_ops(chip); - return rc; + goto out; } rc = tpm1_pcr_extend(chip, pcr_idx, digests[0].digest, "attempting extend a PCR value"); + +out: tpm_put_ops(chip); return rc; } diff --git a/security/keys/trusted.c b/security/keys/trusted.c index ade699131065..1fbd77816610 100644 --- a/security/keys/trusted.c +++ b/security/keys/trusted.c @@ -1228,11 +1228,16 @@ static int __init trusted_shash_alloc(void) static int __init init_digests(void) { + int i; + digests = kcalloc(chip->nr_allocated_banks, sizeof(*digests), GFP_KERNEL); if (!digests) return -ENOMEM; + for (i = 0; i < chip->nr_allocated_banks; i++) + digests[i].alg_id = chip->allocated_banks[i].alg_id; + return 0; } From c980ecff4761f3c446f817430547781298815aa9 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Fri, 16 Aug 2019 01:12:00 +0300 Subject: [PATCH 641/721] MAINTAINERS: keys: Update path to trusted.h Update MAINTAINERS record to reflect that trusted.h was moved to a different directory in commit 22447981fc05 ("KEYS: Move trusted.h to include/keys [ver #2]"). Cc: Denis Kenzior Cc: James Bottomley Cc: Jarkko Sakkinen Cc: Mimi Zohar Cc: linux-integrity@vger.kernel.org Signed-off-by: Denis Efremov Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 54f1286087e9..ec807d446854 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9050,7 +9050,7 @@ S: Supported F: Documentation/security/keys/trusted-encrypted.rst F: include/keys/trusted-type.h F: security/keys/trusted.c -F: security/keys/trusted.h +F: include/keys/trusted.h KEYS/KEYRINGS: M: David Howells From e13cd21ffd50a07b55dcc4d8c38cedf27f28eaa1 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Mon, 16 Sep 2019 11:38:34 +0300 Subject: [PATCH 642/721] tpm: Wrap the buffer from the caller to tpm_buf in tpm_send() tpm_send() does not give anymore the result back to the caller. This would require another memcpy(), which kind of tells that the whole approach is somewhat broken. Instead, as Mimi suggested, this commit just wraps the data to the tpm_buf, and thus the result will not go to the garbage. Obviously this assumes from the caller that it passes large enough buffer, which makes the whole API somewhat broken because it could be different size than @buflen but since trusted keys is the only module using this API right now I think that this fix is sufficient for the moment. In the near future the plan is to replace the parameters with a tpm_buf created by the caller. Reported-by: Mimi Zohar Suggested-by: Mimi Zohar Cc: stable@vger.kernel.org Fixes: 412eb585587a ("use tpm_buf in tpm_transmit_cmd() as the IO parameter") Signed-off-by: Jarkko Sakkinen Reviewed-by: Jerry Snitselaar --- drivers/char/tpm/tpm-interface.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/char/tpm/tpm-interface.c b/drivers/char/tpm/tpm-interface.c index d9ace5480665..d7a3888ad80f 100644 --- a/drivers/char/tpm/tpm-interface.c +++ b/drivers/char/tpm/tpm-interface.c @@ -358,14 +358,9 @@ int tpm_send(struct tpm_chip *chip, void *cmd, size_t buflen) if (!chip) return -ENODEV; - rc = tpm_buf_init(&buf, 0, 0); - if (rc) - goto out; - - memcpy(buf.data, cmd, buflen); + buf.data = cmd; rc = tpm_transmit_cmd(chip, &buf, 0, "attempting to a send a command"); - tpm_buf_destroy(&buf); -out: + tpm_put_ops(chip); return rc; } From e55d9d9bfb69405bd7615c0f8d229d8fafb3e9b8 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 25 Sep 2019 16:45:53 -0700 Subject: [PATCH 643/721] memcg, kmem: do not fail __GFP_NOFAIL charges Thomas has noticed the following NULL ptr dereference when using cgroup v1 kmem limit: BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 3 PID: 16923 Comm: gtk-update-icon Not tainted 4.19.51 #42 Hardware name: Gigabyte Technology Co., Ltd. Z97X-Gaming G1/Z97X-Gaming G1, BIOS F9 07/31/2015 RIP: 0010:create_empty_buffers+0x24/0x100 Code: cd 0f 1f 44 00 00 0f 1f 44 00 00 41 54 49 89 d4 ba 01 00 00 00 55 53 48 89 fb e8 97 fe ff ff 48 89 c5 48 89 c2 eb 03 48 89 ca <48> 8b 4a 08 4c 09 22 48 85 c9 75 f1 48 89 6a 08 48 8b 43 18 48 8d RSP: 0018:ffff927ac1b37bf8 EFLAGS: 00010286 RAX: 0000000000000000 RBX: fffff2d4429fd740 RCX: 0000000100097149 RDX: 0000000000000000 RSI: 0000000000000082 RDI: ffff9075a99fbe00 RBP: 0000000000000000 R08: fffff2d440949cc8 R09: 00000000000960c0 R10: 0000000000000002 R11: 0000000000000000 R12: 0000000000000000 R13: ffff907601f18360 R14: 0000000000002000 R15: 0000000000001000 FS: 00007fb55b288bc0(0000) GS:ffff90761f8c0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000008 CR3: 000000007aebc002 CR4: 00000000001606e0 Call Trace: create_page_buffers+0x4d/0x60 __block_write_begin_int+0x8e/0x5a0 ? ext4_inode_attach_jinode.part.82+0xb0/0xb0 ? jbd2__journal_start+0xd7/0x1f0 ext4_da_write_begin+0x112/0x3d0 generic_perform_write+0xf1/0x1b0 ? file_update_time+0x70/0x140 __generic_file_write_iter+0x141/0x1a0 ext4_file_write_iter+0xef/0x3b0 __vfs_write+0x17e/0x1e0 vfs_write+0xa5/0x1a0 ksys_write+0x57/0xd0 do_syscall_64+0x55/0x160 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Tetsuo then noticed that this is because the __memcg_kmem_charge_memcg fails __GFP_NOFAIL charge when the kmem limit is reached. This is a wrong behavior because nofail allocations are not allowed to fail. Normal charge path simply forces the charge even if that means to cross the limit. Kmem accounting should be doing the same. Link: http://lkml.kernel.org/r/20190906125608.32129-1-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Thomas Lindroth Debugged-by: Tetsuo Handa Cc: Johannes Weiner Cc: Vladimir Davydov Cc: Andrey Ryabinin Cc: Thomas Lindroth Cc: Shakeel Butt Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2156ef775d04..c313c49074ca 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2943,6 +2943,16 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) { + + /* + * Enforce __GFP_NOFAIL allocation because callers are not + * prepared to see failures and likely do not have any failure + * handling code. + */ + if (gfp & __GFP_NOFAIL) { + page_counter_charge(&memcg->kmem, nr_pages); + return 0; + } cancel_charge(memcg, nr_pages); return -ENOMEM; } From 541be05095437a7e5e08e7d13a13e03ec0994ae7 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 25 Sep 2019 16:45:56 -0700 Subject: [PATCH 644/721] linux/coff.h: add include guard Add a header include guard just in case. My motivation is to allow Kbuild to detect missing include guard: https://patchwork.kernel.org/patch/11063011/ Before I enable this checker I want to fix as many headers as possible. Link: http://lkml.kernel.org/r/20190728154728.11126-1-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/coff.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/uapi/linux/coff.h b/include/uapi/linux/coff.h index e4a79f80b9a0..ab5c7e847eed 100644 --- a/include/uapi/linux/coff.h +++ b/include/uapi/linux/coff.h @@ -11,6 +11,9 @@ more information about COFF, then O'Reilly has a very excellent book. */ +#ifndef _UAPI_LINUX_COFF_H +#define _UAPI_LINUX_COFF_H + #define E_SYMNMLEN 8 /* Number of characters in a symbol name */ #define E_FILNMLEN 14 /* Number of characters in a file name */ #define E_DIMNUM 4 /* Number of array dimensions in auxiliary entry */ @@ -350,3 +353,5 @@ struct COFF_reloc { /* For new sections we haven't heard of before */ #define COFF_DEF_SECTION_ALIGNMENT 4 + +#endif /* _UAPI_LINUX_COFF_H */ From 0f74914071ab7e7b78731ed62bf350e3a344e0a5 Mon Sep 17 00:00:00 2001 From: Valdis Kletnieks Date: Wed, 25 Sep 2019 16:45:59 -0700 Subject: [PATCH 645/721] kernel/elfcore.c: include proper prototypes When building with W=1, gcc properly complains that there's no prototypes: CC kernel/elfcore.o kernel/elfcore.c:7:17: warning: no previous prototype for 'elf_core_extra_phdrs' [-Wmissing-prototypes] 7 | Elf_Half __weak elf_core_extra_phdrs(void) | ^~~~~~~~~~~~~~~~~~~~ kernel/elfcore.c:12:12: warning: no previous prototype for 'elf_core_write_extra_phdrs' [-Wmissing-prototypes] 12 | int __weak elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset) | ^~~~~~~~~~~~~~~~~~~~~~~~~~ kernel/elfcore.c:17:12: warning: no previous prototype for 'elf_core_write_extra_data' [-Wmissing-prototypes] 17 | int __weak elf_core_write_extra_data(struct coredump_params *cprm) | ^~~~~~~~~~~~~~~~~~~~~~~~~ kernel/elfcore.c:22:15: warning: no previous prototype for 'elf_core_extra_data_size' [-Wmissing-prototypes] 22 | size_t __weak elf_core_extra_data_size(void) | ^~~~~~~~~~~~~~~~~~~~~~~~ Provide the include file so gcc is happy, and we don't have potential code drift Link: http://lkml.kernel.org/r/29875.1565224705@turing-police Signed-off-by: Valdis Kletnieks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/elfcore.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/elfcore.c b/kernel/elfcore.c index fc482c8e0bd8..57fb4dcff434 100644 --- a/kernel/elfcore.c +++ b/kernel/elfcore.c @@ -3,6 +3,7 @@ #include #include #include +#include Elf_Half __weak elf_core_extra_phdrs(void) { From c7d4f7eeb6da9408e9ba7475fe2624bdb4d837d0 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Wed, 25 Sep 2019 16:46:02 -0700 Subject: [PATCH 646/721] rbtree: avoid generating code twice for the cached versions (tools copy) As was already noted in rbtree.h, the logic to cache rb_first (or rb_last) can easily be implemented externally to the core rbtree api. This commit takes the changes applied to the include/linux/ and lib/ rbtree files in 9f973cb38088 ("lib/rbtree: avoid generating code twice for the cached versions"), and applies these to the tools/include/linux/ and tools/lib/ files as well to keep them synchronized. Link: http://lkml.kernel.org/r/20190703034812.53002-1-walken@google.com Signed-off-by: Michel Lespinasse Cc: David Howells Cc: Davidlohr Bueso Cc: Peter Zijlstra (Intel) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/include/linux/rbtree.h | 71 +++++++++++++++++--------- tools/include/linux/rbtree_augmented.h | 31 +++++------ tools/lib/rbtree.c | 37 ++------------ 3 files changed, 62 insertions(+), 77 deletions(-) diff --git a/tools/include/linux/rbtree.h b/tools/include/linux/rbtree.h index d83763a5327c..e03b1ea23e0e 100644 --- a/tools/include/linux/rbtree.h +++ b/tools/include/linux/rbtree.h @@ -31,25 +31,9 @@ struct rb_root { struct rb_node *rb_node; }; -/* - * Leftmost-cached rbtrees. - * - * We do not cache the rightmost node based on footprint - * size vs number of potential users that could benefit - * from O(1) rb_last(). Just not worth it, users that want - * this feature can always implement the logic explicitly. - * Furthermore, users that want to cache both pointers may - * find it a bit asymmetric, but that's ok. - */ -struct rb_root_cached { - struct rb_root rb_root; - struct rb_node *rb_leftmost; -}; - #define rb_parent(r) ((struct rb_node *)((r)->__rb_parent_color & ~3)) #define RB_ROOT (struct rb_root) { NULL, } -#define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL } #define rb_entry(ptr, type, member) container_of(ptr, type, member) #define RB_EMPTY_ROOT(root) (READ_ONCE((root)->rb_node) == NULL) @@ -71,12 +55,6 @@ extern struct rb_node *rb_prev(const struct rb_node *); extern struct rb_node *rb_first(const struct rb_root *); extern struct rb_node *rb_last(const struct rb_root *); -extern void rb_insert_color_cached(struct rb_node *, - struct rb_root_cached *, bool); -extern void rb_erase_cached(struct rb_node *node, struct rb_root_cached *); -/* Same as rb_first(), but O(1) */ -#define rb_first_cached(root) (root)->rb_leftmost - /* Postorder iteration - always visit the parent after its children */ extern struct rb_node *rb_first_postorder(const struct rb_root *); extern struct rb_node *rb_next_postorder(const struct rb_node *); @@ -84,8 +62,6 @@ extern struct rb_node *rb_next_postorder(const struct rb_node *); /* Fast replacement of a single node without remove/rebalance/add/rebalance */ extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root); -extern void rb_replace_node_cached(struct rb_node *victim, struct rb_node *new, - struct rb_root_cached *root); static inline void rb_link_node(struct rb_node *node, struct rb_node *parent, struct rb_node **rb_link) @@ -129,4 +105,51 @@ static inline void rb_erase_init(struct rb_node *n, struct rb_root *root) rb_erase(n, root); RB_CLEAR_NODE(n); } + +/* + * Leftmost-cached rbtrees. + * + * We do not cache the rightmost node based on footprint + * size vs number of potential users that could benefit + * from O(1) rb_last(). Just not worth it, users that want + * this feature can always implement the logic explicitly. + * Furthermore, users that want to cache both pointers may + * find it a bit asymmetric, but that's ok. + */ +struct rb_root_cached { + struct rb_root rb_root; + struct rb_node *rb_leftmost; +}; + +#define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL } + +/* Same as rb_first(), but O(1) */ +#define rb_first_cached(root) (root)->rb_leftmost + +static inline void rb_insert_color_cached(struct rb_node *node, + struct rb_root_cached *root, + bool leftmost) +{ + if (leftmost) + root->rb_leftmost = node; + rb_insert_color(node, &root->rb_root); +} + +static inline void rb_erase_cached(struct rb_node *node, + struct rb_root_cached *root) +{ + if (root->rb_leftmost == node) + root->rb_leftmost = rb_next(node); + rb_erase(node, &root->rb_root); +} + +static inline void rb_replace_node_cached(struct rb_node *victim, + struct rb_node *new, + struct rb_root_cached *root) +{ + if (root->rb_leftmost == victim) + root->rb_leftmost = new; + rb_replace_node(victim, new, &root->rb_root); +} + #endif /* __TOOLS_LINUX_PERF_RBTREE_H */ diff --git a/tools/include/linux/rbtree_augmented.h b/tools/include/linux/rbtree_augmented.h index ddd01006ece5..467a3eefe1d2 100644 --- a/tools/include/linux/rbtree_augmented.h +++ b/tools/include/linux/rbtree_augmented.h @@ -32,17 +32,16 @@ struct rb_augment_callbacks { void (*rotate)(struct rb_node *old, struct rb_node *new); }; -extern void __rb_insert_augmented(struct rb_node *node, - struct rb_root *root, - bool newleft, struct rb_node **leftmost, +extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)); + /* * Fixup the rbtree and update the augmented information when rebalancing. * * On insertion, the user must update the augmented information on the path * leading to the inserted node, then call rb_link_node() as usual and - * rb_augment_inserted() instead of the usual rb_insert_color() call. - * If rb_augment_inserted() rebalances the rbtree, it will callback into + * rb_insert_augmented() instead of the usual rb_insert_color() call. + * If rb_insert_augmented() rebalances the rbtree, it will callback into * a user provided function to update the augmented information on the * affected subtrees. */ @@ -50,7 +49,7 @@ static inline void rb_insert_augmented(struct rb_node *node, struct rb_root *root, const struct rb_augment_callbacks *augment) { - __rb_insert_augmented(node, root, false, NULL, augment->rotate); + __rb_insert_augmented(node, root, augment->rotate); } static inline void @@ -58,8 +57,9 @@ rb_insert_augmented_cached(struct rb_node *node, struct rb_root_cached *root, bool newleft, const struct rb_augment_callbacks *augment) { - __rb_insert_augmented(node, &root->rb_root, - newleft, &root->rb_leftmost, augment->rotate); + if (newleft) + root->rb_leftmost = node; + rb_insert_augmented(node, &root->rb_root, augment); } #define RB_DECLARE_CALLBACKS(rbstatic, rbname, rbstruct, rbfield, \ @@ -139,7 +139,6 @@ extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root, static __always_inline struct rb_node * __rb_erase_augmented(struct rb_node *node, struct rb_root *root, - struct rb_node **leftmost, const struct rb_augment_callbacks *augment) { struct rb_node *child = node->rb_right; @@ -147,9 +146,6 @@ __rb_erase_augmented(struct rb_node *node, struct rb_root *root, struct rb_node *parent, *rebalance; unsigned long pc; - if (leftmost && node == *leftmost) - *leftmost = rb_next(node); - if (!tmp) { /* * Case 1: node to erase has no more than 1 child (easy!) @@ -249,8 +245,7 @@ static __always_inline void rb_erase_augmented(struct rb_node *node, struct rb_root *root, const struct rb_augment_callbacks *augment) { - struct rb_node *rebalance = __rb_erase_augmented(node, root, - NULL, augment); + struct rb_node *rebalance = __rb_erase_augmented(node, root, augment); if (rebalance) __rb_erase_color(rebalance, root, augment->rotate); } @@ -259,11 +254,9 @@ static __always_inline void rb_erase_augmented_cached(struct rb_node *node, struct rb_root_cached *root, const struct rb_augment_callbacks *augment) { - struct rb_node *rebalance = __rb_erase_augmented(node, &root->rb_root, - &root->rb_leftmost, - augment); - if (rebalance) - __rb_erase_color(rebalance, &root->rb_root, augment->rotate); + if (root->rb_leftmost == node) + root->rb_leftmost = rb_next(node); + rb_erase_augmented(node, &root->rb_root, augment); } #endif /* _TOOLS_LINUX_RBTREE_AUGMENTED_H */ diff --git a/tools/lib/rbtree.c b/tools/lib/rbtree.c index 804f145e3113..2548ff8c4d9c 100644 --- a/tools/lib/rbtree.c +++ b/tools/lib/rbtree.c @@ -83,14 +83,10 @@ __rb_rotate_set_parents(struct rb_node *old, struct rb_node *new, static __always_inline void __rb_insert(struct rb_node *node, struct rb_root *root, - bool newleft, struct rb_node **leftmost, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) { struct rb_node *parent = rb_red_parent(node), *gparent, *tmp; - if (newleft) - *leftmost = node; - while (true) { /* * Loop invariant: node is red. @@ -436,34 +432,17 @@ static const struct rb_augment_callbacks dummy_callbacks = { void rb_insert_color(struct rb_node *node, struct rb_root *root) { - __rb_insert(node, root, false, NULL, dummy_rotate); + __rb_insert(node, root, dummy_rotate); } void rb_erase(struct rb_node *node, struct rb_root *root) { struct rb_node *rebalance; - rebalance = __rb_erase_augmented(node, root, - NULL, &dummy_callbacks); + rebalance = __rb_erase_augmented(node, root, &dummy_callbacks); if (rebalance) ____rb_erase_color(rebalance, root, dummy_rotate); } -void rb_insert_color_cached(struct rb_node *node, - struct rb_root_cached *root, bool leftmost) -{ - __rb_insert(node, &root->rb_root, leftmost, - &root->rb_leftmost, dummy_rotate); -} - -void rb_erase_cached(struct rb_node *node, struct rb_root_cached *root) -{ - struct rb_node *rebalance; - rebalance = __rb_erase_augmented(node, &root->rb_root, - &root->rb_leftmost, &dummy_callbacks); - if (rebalance) - ____rb_erase_color(rebalance, &root->rb_root, dummy_rotate); -} - /* * Augmented rbtree manipulation functions. * @@ -472,10 +451,9 @@ void rb_erase_cached(struct rb_node *node, struct rb_root_cached *root) */ void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, - bool newleft, struct rb_node **leftmost, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) { - __rb_insert(node, root, newleft, leftmost, augment_rotate); + __rb_insert(node, root, augment_rotate); } /* @@ -580,15 +558,6 @@ void rb_replace_node(struct rb_node *victim, struct rb_node *new, __rb_change_child(victim, new, parent, root); } -void rb_replace_node_cached(struct rb_node *victim, struct rb_node *new, - struct rb_root_cached *root) -{ - rb_replace_node(victim, new, &root->rb_root); - - if (root->rb_leftmost == victim) - root->rb_leftmost = new; -} - static struct rb_node *rb_left_deepest_node(const struct rb_node *node) { for (;;) { From 444b8a83f1e01584ff2d53f5951d8e836c0070b5 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Wed, 25 Sep 2019 16:46:04 -0700 Subject: [PATCH 647/721] augmented rbtree: add comments for RB_DECLARE_CALLBACKS macro Patch series "make RB_DECLARE_CALLBACKS more generic", v3. These changes are intended to make the RB_DECLARE_CALLBACKS macro more generic (allowing the aubmented subtree information to be a struct instead of a scalar). I have verified the compiled lib/interval_tree.o and mm/mmap.o files to check that they didn't change. This held as expected for interval_tree.o; mmap.o did have some changes which could be reverted by marking __vma_link_rb as noinline. I did not add such a change to the patchset; I felt it was reasonable enough to leave the inlining decision up to the compiler. This patch (of 3): Add a short comment summarizing the arguments to RB_DECLARE_CALLBACKS. The arguments are also now capitalized. This copies the style of the INTERVAL_TREE_DEFINE macro. No functional changes in this commit, only comments and capitalization. Link: http://lkml.kernel.org/r/20190703040156.56953-2-walken@google.com Signed-off-by: Michel Lespinasse Acked-by: Davidlohr Bueso Acked-by: Peter Zijlstra (Intel) Cc: David Howells Cc: Uladzislau Rezki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rbtree_augmented.h | 54 ++++++++++++++++---------- tools/include/linux/rbtree_augmented.h | 54 ++++++++++++++++---------- 2 files changed, 66 insertions(+), 42 deletions(-) diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h index 179faab29f52..979941600082 100644 --- a/include/linux/rbtree_augmented.h +++ b/include/linux/rbtree_augmented.h @@ -60,39 +60,51 @@ rb_insert_augmented_cached(struct rb_node *node, rb_insert_augmented(node, &root->rb_root, augment); } -#define RB_DECLARE_CALLBACKS(rbstatic, rbname, rbstruct, rbfield, \ - rbtype, rbaugmented, rbcompute) \ +/* + * Template for declaring augmented rbtree callbacks + * + * RBSTATIC: 'static' or empty + * RBNAME: name of the rb_augment_callbacks structure + * RBSTRUCT: struct type of the tree nodes + * RBFIELD: name of struct rb_node field within RBSTRUCT + * RBTYPE: type of the RBAUGMENTED field + * RBAUGMENTED: name of RBTYPE field within RBSTRUCT holding data for subtree + * RBCOMPUTE: name of function that recomputes the RBAUGMENTED data + */ + +#define RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \ + RBTYPE, RBAUGMENTED, RBCOMPUTE) \ static inline void \ -rbname ## _propagate(struct rb_node *rb, struct rb_node *stop) \ +RBNAME ## _propagate(struct rb_node *rb, struct rb_node *stop) \ { \ while (rb != stop) { \ - rbstruct *node = rb_entry(rb, rbstruct, rbfield); \ - rbtype augmented = rbcompute(node); \ - if (node->rbaugmented == augmented) \ + RBSTRUCT *node = rb_entry(rb, RBSTRUCT, RBFIELD); \ + RBTYPE augmented = RBCOMPUTE(node); \ + if (node->RBAUGMENTED == augmented) \ break; \ - node->rbaugmented = augmented; \ - rb = rb_parent(&node->rbfield); \ + node->RBAUGMENTED = augmented; \ + rb = rb_parent(&node->RBFIELD); \ } \ } \ static inline void \ -rbname ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \ +RBNAME ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \ { \ - rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \ - rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \ - new->rbaugmented = old->rbaugmented; \ + RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \ + RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \ + new->RBAUGMENTED = old->RBAUGMENTED; \ } \ static void \ -rbname ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \ +RBNAME ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \ { \ - rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \ - rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \ - new->rbaugmented = old->rbaugmented; \ - old->rbaugmented = rbcompute(old); \ + RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \ + RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \ + new->RBAUGMENTED = old->RBAUGMENTED; \ + old->RBAUGMENTED = RBCOMPUTE(old); \ } \ -rbstatic const struct rb_augment_callbacks rbname = { \ - .propagate = rbname ## _propagate, \ - .copy = rbname ## _copy, \ - .rotate = rbname ## _rotate \ +RBSTATIC const struct rb_augment_callbacks RBNAME = { \ + .propagate = RBNAME ## _propagate, \ + .copy = RBNAME ## _copy, \ + .rotate = RBNAME ## _rotate \ }; diff --git a/tools/include/linux/rbtree_augmented.h b/tools/include/linux/rbtree_augmented.h index 467a3eefe1d2..de3a480204ba 100644 --- a/tools/include/linux/rbtree_augmented.h +++ b/tools/include/linux/rbtree_augmented.h @@ -62,39 +62,51 @@ rb_insert_augmented_cached(struct rb_node *node, rb_insert_augmented(node, &root->rb_root, augment); } -#define RB_DECLARE_CALLBACKS(rbstatic, rbname, rbstruct, rbfield, \ - rbtype, rbaugmented, rbcompute) \ +/* + * Template for declaring augmented rbtree callbacks + * + * RBSTATIC: 'static' or empty + * RBNAME: name of the rb_augment_callbacks structure + * RBSTRUCT: struct type of the tree nodes + * RBFIELD: name of struct rb_node field within RBSTRUCT + * RBTYPE: type of the RBAUGMENTED field + * RBAUGMENTED: name of RBTYPE field within RBSTRUCT holding data for subtree + * RBCOMPUTE: name of function that recomputes the RBAUGMENTED data + */ + +#define RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \ + RBTYPE, RBAUGMENTED, RBCOMPUTE) \ static inline void \ -rbname ## _propagate(struct rb_node *rb, struct rb_node *stop) \ +RBNAME ## _propagate(struct rb_node *rb, struct rb_node *stop) \ { \ while (rb != stop) { \ - rbstruct *node = rb_entry(rb, rbstruct, rbfield); \ - rbtype augmented = rbcompute(node); \ - if (node->rbaugmented == augmented) \ + RBSTRUCT *node = rb_entry(rb, RBSTRUCT, RBFIELD); \ + RBTYPE augmented = RBCOMPUTE(node); \ + if (node->RBAUGMENTED == augmented) \ break; \ - node->rbaugmented = augmented; \ - rb = rb_parent(&node->rbfield); \ + node->RBAUGMENTED = augmented; \ + rb = rb_parent(&node->RBFIELD); \ } \ } \ static inline void \ -rbname ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \ +RBNAME ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \ { \ - rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \ - rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \ - new->rbaugmented = old->rbaugmented; \ + RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \ + RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \ + new->RBAUGMENTED = old->RBAUGMENTED; \ } \ static void \ -rbname ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \ +RBNAME ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \ { \ - rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \ - rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \ - new->rbaugmented = old->rbaugmented; \ - old->rbaugmented = rbcompute(old); \ + RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \ + RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \ + new->RBAUGMENTED = old->RBAUGMENTED; \ + old->RBAUGMENTED = RBCOMPUTE(old); \ } \ -rbstatic const struct rb_augment_callbacks rbname = { \ - .propagate = rbname ## _propagate, \ - .copy = rbname ## _copy, \ - .rotate = rbname ## _rotate \ +RBSTATIC const struct rb_augment_callbacks RBNAME = { \ + .propagate = RBNAME ## _propagate, \ + .copy = RBNAME ## _copy, \ + .rotate = RBNAME ## _rotate \ }; From 315cc066b8ae8349a27887ad7a34e1916e9797fe Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Wed, 25 Sep 2019 16:46:07 -0700 Subject: [PATCH 648/721] augmented rbtree: add new RB_DECLARE_CALLBACKS_MAX macro Add RB_DECLARE_CALLBACKS_MAX, which generates augmented rbtree callbacks for the case where the augmented value is a scalar whose definition follows a max(f(node)) pattern. This actually covers all present uses of RB_DECLARE_CALLBACKS, and saves some (source) code duplication in the various RBCOMPUTE function definitions. [walken@google.com: fix mm/vmalloc.c] Link: http://lkml.kernel.org/r/CANN689FXgK13wDYNh1zKxdipeTuALG4eKvKpsdZqKFJ-rvtGiQ@mail.gmail.com [walken@google.com: re-add check to check_augmented()] Link: http://lkml.kernel.org/r/20190727022027.GA86863@google.com Link: http://lkml.kernel.org/r/20190703040156.56953-3-walken@google.com Signed-off-by: Michel Lespinasse Acked-by: Peter Zijlstra (Intel) Cc: David Howells Cc: Davidlohr Bueso Cc: Uladzislau Rezki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/pat_rbtree.c | 19 +++---------- drivers/block/drbd/drbd_interval.c | 29 +++----------------- include/linux/interval_tree_generic.h | 22 ++------------- include/linux/rbtree_augmented.h | 36 ++++++++++++++++++++++++- lib/rbtree_test.c | 37 ++++++++++++-------------- mm/mmap.c | 29 ++++++++++++-------- mm/vmalloc.c | 5 ++-- tools/include/linux/rbtree_augmented.h | 36 ++++++++++++++++++++++++- 8 files changed, 115 insertions(+), 98 deletions(-) diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c index fa16036fa592..65ebe4b88f7c 100644 --- a/arch/x86/mm/pat_rbtree.c +++ b/arch/x86/mm/pat_rbtree.c @@ -54,23 +54,10 @@ static u64 get_subtree_max_end(struct rb_node *node) return ret; } -static u64 compute_subtree_max_end(struct memtype *data) -{ - u64 max_end = data->end, child_max_end; +#define NODE_END(node) ((node)->end) - child_max_end = get_subtree_max_end(data->rb.rb_right); - if (child_max_end > max_end) - max_end = child_max_end; - - child_max_end = get_subtree_max_end(data->rb.rb_left); - if (child_max_end > max_end) - max_end = child_max_end; - - return max_end; -} - -RB_DECLARE_CALLBACKS(static, memtype_rb_augment_cb, struct memtype, rb, - u64, subtree_max_end, compute_subtree_max_end) +RB_DECLARE_CALLBACKS_MAX(static, memtype_rb_augment_cb, + struct memtype, rb, u64, subtree_max_end, NODE_END) /* Find the first (lowest start addr) overlapping range from rb tree */ static struct memtype *memtype_rb_lowest_match(struct rb_root *root, diff --git a/drivers/block/drbd/drbd_interval.c b/drivers/block/drbd/drbd_interval.c index c58986556161..651bd0236a99 100644 --- a/drivers/block/drbd/drbd_interval.c +++ b/drivers/block/drbd/drbd_interval.c @@ -13,33 +13,10 @@ sector_t interval_end(struct rb_node *node) return this->end; } -/** - * compute_subtree_last - compute end of @node - * - * The end of an interval is the highest (start + (size >> 9)) value of this - * node and of its children. Called for @node and its parents whenever the end - * may have changed. - */ -static inline sector_t -compute_subtree_last(struct drbd_interval *node) -{ - sector_t max = node->sector + (node->size >> 9); +#define NODE_END(node) ((node)->sector + ((node)->size >> 9)) - if (node->rb.rb_left) { - sector_t left = interval_end(node->rb.rb_left); - if (left > max) - max = left; - } - if (node->rb.rb_right) { - sector_t right = interval_end(node->rb.rb_right); - if (right > max) - max = right; - } - return max; -} - -RB_DECLARE_CALLBACKS(static, augment_callbacks, struct drbd_interval, rb, - sector_t, end, compute_subtree_last); +RB_DECLARE_CALLBACKS_MAX(static, augment_callbacks, + struct drbd_interval, rb, sector_t, end, NODE_END); /** * drbd_insert_interval - insert a new interval into a tree diff --git a/include/linux/interval_tree_generic.h b/include/linux/interval_tree_generic.h index 855476145fe1..aaa8a0767aa3 100644 --- a/include/linux/interval_tree_generic.h +++ b/include/linux/interval_tree_generic.h @@ -30,26 +30,8 @@ \ /* Callbacks for augmented rbtree insert and remove */ \ \ -static inline ITTYPE ITPREFIX ## _compute_subtree_last(ITSTRUCT *node) \ -{ \ - ITTYPE max = ITLAST(node), subtree_last; \ - if (node->ITRB.rb_left) { \ - subtree_last = rb_entry(node->ITRB.rb_left, \ - ITSTRUCT, ITRB)->ITSUBTREE; \ - if (max < subtree_last) \ - max = subtree_last; \ - } \ - if (node->ITRB.rb_right) { \ - subtree_last = rb_entry(node->ITRB.rb_right, \ - ITSTRUCT, ITRB)->ITSUBTREE; \ - if (max < subtree_last) \ - max = subtree_last; \ - } \ - return max; \ -} \ - \ -RB_DECLARE_CALLBACKS(static, ITPREFIX ## _augment, ITSTRUCT, ITRB, \ - ITTYPE, ITSUBTREE, ITPREFIX ## _compute_subtree_last) \ +RB_DECLARE_CALLBACKS_MAX(static, ITPREFIX ## _augment, \ + ITSTRUCT, ITRB, ITTYPE, ITSUBTREE, ITLAST) \ \ /* Insert / remove interval nodes from the tree */ \ \ diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h index 979941600082..e5937e387e02 100644 --- a/include/linux/rbtree_augmented.h +++ b/include/linux/rbtree_augmented.h @@ -61,7 +61,7 @@ rb_insert_augmented_cached(struct rb_node *node, } /* - * Template for declaring augmented rbtree callbacks + * Template for declaring augmented rbtree callbacks (generic case) * * RBSTATIC: 'static' or empty * RBNAME: name of the rb_augment_callbacks structure @@ -107,6 +107,40 @@ RBSTATIC const struct rb_augment_callbacks RBNAME = { \ .rotate = RBNAME ## _rotate \ }; +/* + * Template for declaring augmented rbtree callbacks, + * computing RBAUGMENTED scalar as max(RBCOMPUTE(node)) for all subtree nodes. + * + * RBSTATIC: 'static' or empty + * RBNAME: name of the rb_augment_callbacks structure + * RBSTRUCT: struct type of the tree nodes + * RBFIELD: name of struct rb_node field within RBSTRUCT + * RBTYPE: type of the RBAUGMENTED field + * RBAUGMENTED: name of RBTYPE field within RBSTRUCT holding data for subtree + * RBCOMPUTE: name of function that returns the per-node RBTYPE scalar + */ + +#define RB_DECLARE_CALLBACKS_MAX(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \ + RBTYPE, RBAUGMENTED, RBCOMPUTE) \ +static inline RBTYPE RBNAME ## _compute_max(RBSTRUCT *node) \ +{ \ + RBSTRUCT *child; \ + RBTYPE max = RBCOMPUTE(node); \ + if (node->RBFIELD.rb_left) { \ + child = rb_entry(node->RBFIELD.rb_left, RBSTRUCT, RBFIELD); \ + if (child->RBAUGMENTED > max) \ + max = child->RBAUGMENTED; \ + } \ + if (node->RBFIELD.rb_right) { \ + child = rb_entry(node->RBFIELD.rb_right, RBSTRUCT, RBFIELD); \ + if (child->RBAUGMENTED > max) \ + max = child->RBAUGMENTED; \ + } \ + return max; \ +} \ +RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \ + RBTYPE, RBAUGMENTED, RBNAME ## _compute_max) + #define RB_RED 0 #define RB_BLACK 1 diff --git a/lib/rbtree_test.c b/lib/rbtree_test.c index 62b8ee92643d..41ae3c7570d3 100644 --- a/lib/rbtree_test.c +++ b/lib/rbtree_test.c @@ -77,26 +77,10 @@ static inline void erase_cached(struct test_node *node, struct rb_root_cached *r } -static inline u32 augment_recompute(struct test_node *node) -{ - u32 max = node->val, child_augmented; - if (node->rb.rb_left) { - child_augmented = rb_entry(node->rb.rb_left, struct test_node, - rb)->augmented; - if (max < child_augmented) - max = child_augmented; - } - if (node->rb.rb_right) { - child_augmented = rb_entry(node->rb.rb_right, struct test_node, - rb)->augmented; - if (max < child_augmented) - max = child_augmented; - } - return max; -} +#define NODE_VAL(node) ((node)->val) -RB_DECLARE_CALLBACKS(static, augment_callbacks, struct test_node, rb, - u32, augmented, augment_recompute) +RB_DECLARE_CALLBACKS_MAX(static, augment_callbacks, + struct test_node, rb, u32, augmented, NODE_VAL) static void insert_augmented(struct test_node *node, struct rb_root_cached *root) @@ -238,7 +222,20 @@ static void check_augmented(int nr_nodes) check(nr_nodes); for (rb = rb_first(&root.rb_root); rb; rb = rb_next(rb)) { struct test_node *node = rb_entry(rb, struct test_node, rb); - WARN_ON_ONCE(node->augmented != augment_recompute(node)); + u32 subtree, max = node->val; + if (node->rb.rb_left) { + subtree = rb_entry(node->rb.rb_left, struct test_node, + rb)->augmented; + if (max < subtree) + max = subtree; + } + if (node->rb.rb_right) { + subtree = rb_entry(node->rb.rb_right, struct test_node, + rb)->augmented; + if (max < subtree) + max = subtree; + } + WARN_ON_ONCE(node->augmented != max); } } diff --git a/mm/mmap.c b/mm/mmap.c index f1e8c7f93e04..14b7da317ec0 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -289,9 +289,9 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) return retval; } -static long vma_compute_subtree_gap(struct vm_area_struct *vma) +static inline unsigned long vma_compute_gap(struct vm_area_struct *vma) { - unsigned long max, prev_end, subtree_gap; + unsigned long gap, prev_end; /* * Note: in the rare case of a VM_GROWSDOWN above a VM_GROWSUP, we @@ -299,14 +299,21 @@ static long vma_compute_subtree_gap(struct vm_area_struct *vma) * an unmapped area; whereas when expanding we only require one. * That's a little inconsistent, but keeps the code here simpler. */ - max = vm_start_gap(vma); + gap = vm_start_gap(vma); if (vma->vm_prev) { prev_end = vm_end_gap(vma->vm_prev); - if (max > prev_end) - max -= prev_end; + if (gap > prev_end) + gap -= prev_end; else - max = 0; + gap = 0; } + return gap; +} + +#ifdef CONFIG_DEBUG_VM_RB +static unsigned long vma_compute_subtree_gap(struct vm_area_struct *vma) +{ + unsigned long max = vma_compute_gap(vma), subtree_gap; if (vma->vm_rb.rb_left) { subtree_gap = rb_entry(vma->vm_rb.rb_left, struct vm_area_struct, vm_rb)->rb_subtree_gap; @@ -322,7 +329,6 @@ static long vma_compute_subtree_gap(struct vm_area_struct *vma) return max; } -#ifdef CONFIG_DEBUG_VM_RB static int browse_rb(struct mm_struct *mm) { struct rb_root *root = &mm->mm_rb; @@ -428,8 +434,9 @@ static void validate_mm(struct mm_struct *mm) #define validate_mm(mm) do { } while (0) #endif -RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb, - unsigned long, rb_subtree_gap, vma_compute_subtree_gap) +RB_DECLARE_CALLBACKS_MAX(static, vma_gap_callbacks, + struct vm_area_struct, vm_rb, + unsigned long, rb_subtree_gap, vma_compute_gap) /* * Update augmented rbtree rb_subtree_gap values after vma->vm_start or @@ -439,8 +446,8 @@ RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb, static void vma_gap_update(struct vm_area_struct *vma) { /* - * As it turns out, RB_DECLARE_CALLBACKS() already created a callback - * function that does exactly what we want. + * As it turns out, RB_DECLARE_CALLBACKS_MAX() already created + * a callback function that does exactly what we want. */ vma_gap_callbacks_propagate(&vma->vm_rb, NULL); } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index fcadd3e25c0c..a3c70e275f4e 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -396,9 +396,8 @@ compute_subtree_max_size(struct vmap_area *va) get_subtree_max_size(va->rb_node.rb_right)); } -RB_DECLARE_CALLBACKS(static, free_vmap_area_rb_augment_cb, - struct vmap_area, rb_node, unsigned long, subtree_max_size, - compute_subtree_max_size) +RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb, + struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size) static void purge_vmap_area_lazy(void); static BLOCKING_NOTIFIER_HEAD(vmap_notify_list); diff --git a/tools/include/linux/rbtree_augmented.h b/tools/include/linux/rbtree_augmented.h index de3a480204ba..4e8c4c76e9a2 100644 --- a/tools/include/linux/rbtree_augmented.h +++ b/tools/include/linux/rbtree_augmented.h @@ -63,7 +63,7 @@ rb_insert_augmented_cached(struct rb_node *node, } /* - * Template for declaring augmented rbtree callbacks + * Template for declaring augmented rbtree callbacks (generic case) * * RBSTATIC: 'static' or empty * RBNAME: name of the rb_augment_callbacks structure @@ -109,6 +109,40 @@ RBSTATIC const struct rb_augment_callbacks RBNAME = { \ .rotate = RBNAME ## _rotate \ }; +/* + * Template for declaring augmented rbtree callbacks, + * computing RBAUGMENTED scalar as max(RBCOMPUTE(node)) for all subtree nodes. + * + * RBSTATIC: 'static' or empty + * RBNAME: name of the rb_augment_callbacks structure + * RBSTRUCT: struct type of the tree nodes + * RBFIELD: name of struct rb_node field within RBSTRUCT + * RBTYPE: type of the RBAUGMENTED field + * RBAUGMENTED: name of RBTYPE field within RBSTRUCT holding data for subtree + * RBCOMPUTE: name of function that returns the per-node RBTYPE scalar + */ + +#define RB_DECLARE_CALLBACKS_MAX(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \ + RBTYPE, RBAUGMENTED, RBCOMPUTE) \ +static inline RBTYPE RBNAME ## _compute_max(RBSTRUCT *node) \ +{ \ + RBSTRUCT *child; \ + RBTYPE max = RBCOMPUTE(node); \ + if (node->RBFIELD.rb_left) { \ + child = rb_entry(node->RBFIELD.rb_left, RBSTRUCT, RBFIELD); \ + if (child->RBAUGMENTED > max) \ + max = child->RBAUGMENTED; \ + } \ + if (node->RBFIELD.rb_right) { \ + child = rb_entry(node->RBFIELD.rb_right, RBSTRUCT, RBFIELD); \ + if (child->RBAUGMENTED > max) \ + max = child->RBAUGMENTED; \ + } \ + return max; \ +} \ +RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \ + RBTYPE, RBAUGMENTED, RBNAME ## _compute_max) + #define RB_RED 0 #define RB_BLACK 1 From 6d2052d188d962ffb7ad3d413e6ffd5f276aec94 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Wed, 25 Sep 2019 16:46:10 -0700 Subject: [PATCH 649/721] augmented rbtree: rework the RB_DECLARE_CALLBACKS macro definition Change the definition of the RBCOMPUTE function. The propagate callback repeatedly calls RBCOMPUTE as it moves from leaf to root. it wants to stop recomputing once the augmented subtree information doesn't change. This was previously checked using the == operator, but that only works when the augmented subtree information is a scalar field. This commit modifies the RBCOMPUTE function so that it now sets the augmented subtree information instead of returning it, and returns a boolean value indicating if the propagate callback should stop. The motivation for this change is that I want to introduce augmented rbtree uses where the augmented data for the subtree is a struct instead of a scalar. Link: http://lkml.kernel.org/r/20190703040156.56953-4-walken@google.com Signed-off-by: Michel Lespinasse Acked-by: Peter Zijlstra (Intel) Cc: David Howells Cc: Davidlohr Bueso Cc: Uladzislau Rezki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rbtree_augmented.h | 24 ++++++++++++------------ tools/include/linux/rbtree_augmented.h | 24 ++++++++++++------------ 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h index e5937e387e02..fdd421b8d9ae 100644 --- a/include/linux/rbtree_augmented.h +++ b/include/linux/rbtree_augmented.h @@ -67,22 +67,19 @@ rb_insert_augmented_cached(struct rb_node *node, * RBNAME: name of the rb_augment_callbacks structure * RBSTRUCT: struct type of the tree nodes * RBFIELD: name of struct rb_node field within RBSTRUCT - * RBTYPE: type of the RBAUGMENTED field - * RBAUGMENTED: name of RBTYPE field within RBSTRUCT holding data for subtree + * RBAUGMENTED: name of field within RBSTRUCT holding data for subtree * RBCOMPUTE: name of function that recomputes the RBAUGMENTED data */ -#define RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \ - RBTYPE, RBAUGMENTED, RBCOMPUTE) \ +#define RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \ + RBSTRUCT, RBFIELD, RBAUGMENTED, RBCOMPUTE) \ static inline void \ RBNAME ## _propagate(struct rb_node *rb, struct rb_node *stop) \ { \ while (rb != stop) { \ RBSTRUCT *node = rb_entry(rb, RBSTRUCT, RBFIELD); \ - RBTYPE augmented = RBCOMPUTE(node); \ - if (node->RBAUGMENTED == augmented) \ + if (RBCOMPUTE(node, true)) \ break; \ - node->RBAUGMENTED = augmented; \ rb = rb_parent(&node->RBFIELD); \ } \ } \ @@ -99,7 +96,7 @@ RBNAME ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \ RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \ RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \ new->RBAUGMENTED = old->RBAUGMENTED; \ - old->RBAUGMENTED = RBCOMPUTE(old); \ + RBCOMPUTE(old, false); \ } \ RBSTATIC const struct rb_augment_callbacks RBNAME = { \ .propagate = RBNAME ## _propagate, \ @@ -122,7 +119,7 @@ RBSTATIC const struct rb_augment_callbacks RBNAME = { \ #define RB_DECLARE_CALLBACKS_MAX(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \ RBTYPE, RBAUGMENTED, RBCOMPUTE) \ -static inline RBTYPE RBNAME ## _compute_max(RBSTRUCT *node) \ +static inline bool RBNAME ## _compute_max(RBSTRUCT *node, bool exit) \ { \ RBSTRUCT *child; \ RBTYPE max = RBCOMPUTE(node); \ @@ -136,10 +133,13 @@ static inline RBTYPE RBNAME ## _compute_max(RBSTRUCT *node) \ if (child->RBAUGMENTED > max) \ max = child->RBAUGMENTED; \ } \ - return max; \ + if (exit && node->RBAUGMENTED == max) \ + return true; \ + node->RBAUGMENTED = max; \ + return false; \ } \ -RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \ - RBTYPE, RBAUGMENTED, RBNAME ## _compute_max) +RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \ + RBSTRUCT, RBFIELD, RBAUGMENTED, RBNAME ## _compute_max) #define RB_RED 0 diff --git a/tools/include/linux/rbtree_augmented.h b/tools/include/linux/rbtree_augmented.h index 4e8c4c76e9a2..381aa948610d 100644 --- a/tools/include/linux/rbtree_augmented.h +++ b/tools/include/linux/rbtree_augmented.h @@ -69,22 +69,19 @@ rb_insert_augmented_cached(struct rb_node *node, * RBNAME: name of the rb_augment_callbacks structure * RBSTRUCT: struct type of the tree nodes * RBFIELD: name of struct rb_node field within RBSTRUCT - * RBTYPE: type of the RBAUGMENTED field - * RBAUGMENTED: name of RBTYPE field within RBSTRUCT holding data for subtree + * RBAUGMENTED: name of field within RBSTRUCT holding data for subtree * RBCOMPUTE: name of function that recomputes the RBAUGMENTED data */ -#define RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \ - RBTYPE, RBAUGMENTED, RBCOMPUTE) \ +#define RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \ + RBSTRUCT, RBFIELD, RBAUGMENTED, RBCOMPUTE) \ static inline void \ RBNAME ## _propagate(struct rb_node *rb, struct rb_node *stop) \ { \ while (rb != stop) { \ RBSTRUCT *node = rb_entry(rb, RBSTRUCT, RBFIELD); \ - RBTYPE augmented = RBCOMPUTE(node); \ - if (node->RBAUGMENTED == augmented) \ + if (RBCOMPUTE(node, true)) \ break; \ - node->RBAUGMENTED = augmented; \ rb = rb_parent(&node->RBFIELD); \ } \ } \ @@ -101,7 +98,7 @@ RBNAME ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \ RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \ RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \ new->RBAUGMENTED = old->RBAUGMENTED; \ - old->RBAUGMENTED = RBCOMPUTE(old); \ + RBCOMPUTE(old, false); \ } \ RBSTATIC const struct rb_augment_callbacks RBNAME = { \ .propagate = RBNAME ## _propagate, \ @@ -124,7 +121,7 @@ RBSTATIC const struct rb_augment_callbacks RBNAME = { \ #define RB_DECLARE_CALLBACKS_MAX(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \ RBTYPE, RBAUGMENTED, RBCOMPUTE) \ -static inline RBTYPE RBNAME ## _compute_max(RBSTRUCT *node) \ +static inline bool RBNAME ## _compute_max(RBSTRUCT *node, bool exit) \ { \ RBSTRUCT *child; \ RBTYPE max = RBCOMPUTE(node); \ @@ -138,10 +135,13 @@ static inline RBTYPE RBNAME ## _compute_max(RBSTRUCT *node) \ if (child->RBAUGMENTED > max) \ max = child->RBAUGMENTED; \ } \ - return max; \ + if (exit && node->RBAUGMENTED == max) \ + return true; \ + node->RBAUGMENTED = max; \ + return false; \ } \ -RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \ - RBTYPE, RBAUGMENTED, RBNAME ## _compute_max) +RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \ + RBSTRUCT, RBFIELD, RBAUGMENTED, RBNAME ## _compute_max) #define RB_RED 0 From 917cda2790c4bd624c5191b8d9edd12121749e86 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 25 Sep 2019 16:46:13 -0700 Subject: [PATCH 650/721] kernel-doc: core-api: include string.h into core-api core-api should show all the various string functions including the newly added stracpy and stracpy_pad. Miscellanea: o Update the Returns: value for strscpy o fix a defect with %NUL) [joe@perches.com: correct return of -E2BIG descriptions] Link: http://lkml.kernel.org/r/29f998b4c1a9d69fbeae70500ba0daa4b340c546.1563889130.git.joe@perches.com Link: http://lkml.kernel.org/r/224a6ebf39955f4107c0c376d66155d970e46733.1563841972.git.joe@perches.com Signed-off-by: Joe Perches Reviewed-by: Kees Cook Cc: Jonathan Corbet Cc: Stephen Kitt Cc: Nitin Gote Cc: Rasmus Villemoes Cc: Jann Horn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/core-api/kernel-api.rst | 3 +++ include/linux/string.h | 5 +++-- lib/string.c | 10 ++++++---- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/Documentation/core-api/kernel-api.rst b/Documentation/core-api/kernel-api.rst index 08af5caf036d..f77de49b1d51 100644 --- a/Documentation/core-api/kernel-api.rst +++ b/Documentation/core-api/kernel-api.rst @@ -42,6 +42,9 @@ String Manipulation .. kernel-doc:: lib/string.c :export: +.. kernel-doc:: include/linux/string.h + :internal: + .. kernel-doc:: mm/util.c :functions: kstrdup kstrdup_const kstrndup kmemdup kmemdup_nul memdup_user vmemdup_user strndup_user memdup_user_nul diff --git a/include/linux/string.h b/include/linux/string.h index 4deb11f7976b..b2f9df7f0761 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -474,8 +474,9 @@ static inline void memcpy_and_pad(void *dest, size_t dest_len, * But this can lead to bugs due to typos, or if prefix is a pointer * and not a constant. Instead use str_has_prefix(). * - * Returns: 0 if @str does not start with @prefix - strlen(@prefix) if @str does start with @prefix + * Returns: + * * strlen(@prefix) if @str starts with @prefix + * * 0 if @str does not start with @prefix */ static __always_inline size_t str_has_prefix(const char *str, const char *prefix) { diff --git a/lib/string.c b/lib/string.c index 461fb620f85f..f7bc10da4259 100644 --- a/lib/string.c +++ b/lib/string.c @@ -173,8 +173,9 @@ EXPORT_SYMBOL(strlcpy); * doesn't unnecessarily force the tail of the destination buffer to be * zeroed. If zeroing is desired please use strscpy_pad(). * - * Return: The number of characters copied (not including the trailing - * %NUL) or -E2BIG if the destination buffer wasn't big enough. + * Returns: + * * The number of characters copied (not including the trailing %NUL) + * * -E2BIG if count is 0 or @src was truncated. */ ssize_t strscpy(char *dest, const char *src, size_t count) { @@ -253,8 +254,9 @@ EXPORT_SYMBOL(strscpy); * For full explanation of why you may want to consider using the * 'strscpy' functions please see the function docstring for strscpy(). * - * Return: The number of characters copied (not including the trailing - * %NUL) or -E2BIG if the destination buffer wasn't big enough. + * Returns: + * * The number of characters copied (not including the trailing %NUL) + * * -E2BIG if count is 0 or @src was truncated. */ ssize_t strscpy_pad(char *dest, const char *src, size_t count) { From d1a445d3b86c9341ce7a0954c23be0edb5c9bec5 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Wed, 25 Sep 2019 16:46:16 -0700 Subject: [PATCH 651/721] include/trace/events/writeback.h: fix -Wstringop-truncation warnings There are many of those warnings. In file included from ./arch/powerpc/include/asm/paca.h:15, from ./arch/powerpc/include/asm/current.h:13, from ./include/linux/thread_info.h:21, from ./include/asm-generic/preempt.h:5, from ./arch/powerpc/include/generated/asm/preempt.h:1, from ./include/linux/preempt.h:78, from ./include/linux/spinlock.h:51, from fs/fs-writeback.c:19: In function 'strncpy', inlined from 'perf_trace_writeback_page_template' at ./include/trace/events/writeback.h:56:1: ./include/linux/string.h:260:9: warning: '__builtin_strncpy' specified bound 32 equals destination size [-Wstringop-truncation] return __builtin_strncpy(p, q, size); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Fix it by using the new strscpy_pad() which was introduced in "lib/string: Add strscpy_pad() function" and will always be NUL-terminated instead of strncpy(). Also, change strlcpy() to use strscpy_pad() in this file for consistency. Link: http://lkml.kernel.org/r/1564075099-27750-1-git-send-email-cai@lca.pw Fixes: 455b2864686d ("writeback: Initial tracing support") Fixes: 028c2dd184c0 ("writeback: Add tracing to balance_dirty_pages") Fixes: e84d0a4f8e39 ("writeback: trace event writeback_queue_io") Fixes: b48c104d2211 ("writeback: trace event bdi_dirty_ratelimit") Fixes: cc1676d917f3 ("writeback: Move requeueing when I_SYNC set to writeback_sb_inodes()") Fixes: 9fb0a7da0c52 ("writeback: add more tracepoints") Signed-off-by: Qian Cai Reviewed-by: Jan Kara Cc: Tobin C. Harding Cc: Steven Rostedt (VMware) Cc: Ingo Molnar Cc: Tejun Heo Cc: Dave Chinner Cc: Fengguang Wu Cc: Jens Axboe Cc: Joe Perches Cc: Kees Cook Cc: Jann Horn Cc: Jonathan Corbet Cc: Nitin Gote Cc: Rasmus Villemoes Cc: Stephen Kitt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/writeback.h | 38 +++++++++++++++++--------------- 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 3a27335fce2c..c2ce6480b4b1 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -66,8 +66,9 @@ DECLARE_EVENT_CLASS(writeback_page_template, ), TP_fast_assign( - strncpy(__entry->name, - mapping ? dev_name(inode_to_bdi(mapping->host)->dev) : "(unknown)", 32); + strscpy_pad(__entry->name, + mapping ? dev_name(inode_to_bdi(mapping->host)->dev) : "(unknown)", + 32); __entry->ino = mapping ? mapping->host->i_ino : 0; __entry->index = page->index; ), @@ -110,8 +111,8 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template, struct backing_dev_info *bdi = inode_to_bdi(inode); /* may be called for files on pseudo FSes w/ unregistered bdi */ - strncpy(__entry->name, - bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32); + strscpy_pad(__entry->name, + bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32); __entry->ino = inode->i_ino; __entry->state = inode->i_state; __entry->flags = flags; @@ -316,8 +317,8 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template, ), TP_fast_assign( - strncpy(__entry->name, - dev_name(inode_to_bdi(inode)->dev), 32); + strscpy_pad(__entry->name, + dev_name(inode_to_bdi(inode)->dev), 32); __entry->ino = inode->i_ino; __entry->sync_mode = wbc->sync_mode; __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); @@ -360,8 +361,9 @@ DECLARE_EVENT_CLASS(writeback_work_class, __field(unsigned int, cgroup_ino) ), TP_fast_assign( - strncpy(__entry->name, - wb->bdi->dev ? dev_name(wb->bdi->dev) : "(unknown)", 32); + strscpy_pad(__entry->name, + wb->bdi->dev ? dev_name(wb->bdi->dev) : + "(unknown)", 32); __entry->nr_pages = work->nr_pages; __entry->sb_dev = work->sb ? work->sb->s_dev : 0; __entry->sync_mode = work->sync_mode; @@ -414,7 +416,7 @@ DECLARE_EVENT_CLASS(writeback_class, __field(unsigned int, cgroup_ino) ), TP_fast_assign( - strncpy(__entry->name, dev_name(wb->bdi->dev), 32); + strscpy_pad(__entry->name, dev_name(wb->bdi->dev), 32); __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: cgroup_ino=%u", @@ -436,7 +438,7 @@ TRACE_EVENT(writeback_bdi_register, __array(char, name, 32) ), TP_fast_assign( - strncpy(__entry->name, dev_name(bdi->dev), 32); + strscpy_pad(__entry->name, dev_name(bdi->dev), 32); ), TP_printk("bdi %s", __entry->name @@ -461,7 +463,7 @@ DECLARE_EVENT_CLASS(wbc_class, ), TP_fast_assign( - strncpy(__entry->name, dev_name(bdi->dev), 32); + strscpy_pad(__entry->name, dev_name(bdi->dev), 32); __entry->nr_to_write = wbc->nr_to_write; __entry->pages_skipped = wbc->pages_skipped; __entry->sync_mode = wbc->sync_mode; @@ -512,7 +514,7 @@ TRACE_EVENT(writeback_queue_io, ), TP_fast_assign( unsigned long *older_than_this = work->older_than_this; - strncpy(__entry->name, dev_name(wb->bdi->dev), 32); + strscpy_pad(__entry->name, dev_name(wb->bdi->dev), 32); __entry->older = older_than_this ? *older_than_this : 0; __entry->age = older_than_this ? (jiffies - *older_than_this) * 1000 / HZ : -1; @@ -598,7 +600,7 @@ TRACE_EVENT(bdi_dirty_ratelimit, ), TP_fast_assign( - strlcpy(__entry->bdi, dev_name(wb->bdi->dev), 32); + strscpy_pad(__entry->bdi, dev_name(wb->bdi->dev), 32); __entry->write_bw = KBps(wb->write_bandwidth); __entry->avg_write_bw = KBps(wb->avg_write_bandwidth); __entry->dirty_rate = KBps(dirty_rate); @@ -663,7 +665,7 @@ TRACE_EVENT(balance_dirty_pages, TP_fast_assign( unsigned long freerun = (thresh + bg_thresh) / 2; - strlcpy(__entry->bdi, dev_name(wb->bdi->dev), 32); + strscpy_pad(__entry->bdi, dev_name(wb->bdi->dev), 32); __entry->limit = global_wb_domain.dirty_limit; __entry->setpoint = (global_wb_domain.dirty_limit + @@ -723,8 +725,8 @@ TRACE_EVENT(writeback_sb_inodes_requeue, ), TP_fast_assign( - strncpy(__entry->name, - dev_name(inode_to_bdi(inode)->dev), 32); + strscpy_pad(__entry->name, + dev_name(inode_to_bdi(inode)->dev), 32); __entry->ino = inode->i_ino; __entry->state = inode->i_state; __entry->dirtied_when = inode->dirtied_when; @@ -797,8 +799,8 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template, ), TP_fast_assign( - strncpy(__entry->name, - dev_name(inode_to_bdi(inode)->dev), 32); + strscpy_pad(__entry->name, + dev_name(inode_to_bdi(inode)->dev), 32); __entry->ino = inode->i_ino; __entry->state = inode->i_state; __entry->dirtied_when = inode->dirtied_when; From 9a156466147b61504f4cbe97ea503e67c21e117a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 25 Sep 2019 16:46:20 -0700 Subject: [PATCH 652/721] strscpy: reject buffer sizes larger than INT_MAX As already done for snprintf(), add a check in strscpy() for giant (i.e. likely negative and/or miscalculated) copy sizes, WARN, and error out. Link: http://lkml.kernel.org/r/201907260928.23DE35406@keescook Signed-off-by: Kees Cook Cc: Joe Perches Cc: Rasmus Villemoes Cc: Yann Droneaud Cc: David Laight Cc: Jonathan Corbet Cc: Stephen Kitt Cc: Jann Horn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/string.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/string.c b/lib/string.c index f7bc10da4259..cd7a10c19210 100644 --- a/lib/string.c +++ b/lib/string.c @@ -183,7 +183,7 @@ ssize_t strscpy(char *dest, const char *src, size_t count) size_t max = count; long res = 0; - if (count == 0) + if (count == 0 || WARN_ON_ONCE(count > INT_MAX)) return -E2BIG; #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS From e3f4faa42095cacceeb33c68fda647a8e6d48a90 Mon Sep 17 00:00:00 2001 From: Valdis Kletnieks Date: Wed, 25 Sep 2019 16:46:23 -0700 Subject: [PATCH 653/721] lib/generic-radix-tree.c: make 2 functions static inline When building with W=1, we get some warnings: l CC lib/generic-radix-tree.o lib/generic-radix-tree.c:39:10: warning: no previous prototype for 'genradix_root_to_depth' [-Wmissing-prototypes] 39 | unsigned genradix_root_to_depth(struct genradix_root *r) | ^~~~~~~~~~~~~~~~~~~~~~ lib/generic-radix-tree.c:44:23: warning: no previous prototype for 'genradix_root_to_node' [-Wmissing-prototypes] 44 | struct genradix_node *genradix_root_to_node(struct genradix_root *r) | ^~~~~~~~~~~~~~~~~~~~~ They're not used anywhere else, so make them static inline. Link: http://lkml.kernel.org/r/46923.1565236485@turing-police Signed-off-by: Valdis Kletnieks Cc: Kent Overstreet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/generic-radix-tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/generic-radix-tree.c b/lib/generic-radix-tree.c index a7bafc413730..ae25e2fa2187 100644 --- a/lib/generic-radix-tree.c +++ b/lib/generic-radix-tree.c @@ -36,12 +36,12 @@ static inline size_t genradix_depth_size(unsigned depth) #define GENRADIX_DEPTH_MASK \ ((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1)) -unsigned genradix_root_to_depth(struct genradix_root *r) +static inline unsigned genradix_root_to_depth(struct genradix_root *r) { return (unsigned long) r & GENRADIX_DEPTH_MASK; } -struct genradix_node *genradix_root_to_node(struct genradix_root *r) +static inline struct genradix_node *genradix_root_to_node(struct genradix_root *r) { return (void *) ((unsigned long) r & ~GENRADIX_DEPTH_MASK); } From 8e72a7a44df5534ae7664240c1fa75e71e11c64a Mon Sep 17 00:00:00 2001 From: Valdis Kletnieks Date: Wed, 25 Sep 2019 16:46:26 -0700 Subject: [PATCH 654/721] lib/extable.c: add missing prototypes When building with W=1, a number of warnings are issued: CC lib/extable.o lib/extable.c:63:6: warning: no previous prototype for 'sort_extable' [-Wmissing-prototypes] 63 | void sort_extable(struct exception_table_entry *start, | ^~~~~~~~~~~~ lib/extable.c:75:6: warning: no previous prototype for 'trim_init_extable' [-Wmissing-prototypes] 75 | void trim_init_extable(struct module *m) | ^~~~~~~~~~~~~~~~~ lib/extable.c:115:1: warning: no previous prototype for 'search_extable' [-Wmissing-prototypes] 115 | search_extable(const struct exception_table_entry *base, | ^~~~~~~~~~~~~~ Add the missing #include for the prototypes. Link: http://lkml.kernel.org/r/45574.1565235784@turing-police Signed-off-by: Valdis Kletnieks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/extable.c | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/extable.c b/lib/extable.c index 25da4071122a..c3e59caf7ffa 100644 --- a/lib/extable.c +++ b/lib/extable.c @@ -10,6 +10,7 @@ #include #include #include +#include #ifndef ARCH_HAS_RELATIVE_EXTABLE #define ex_to_insn(x) ((x)->insn) From 091cb0994edd20d67521094ac9c6ec9804058d9a Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Wed, 25 Sep 2019 16:46:29 -0700 Subject: [PATCH 655/721] lib/hexdump: make print_hex_dump_bytes() a nop on !DEBUG builds I'm seeing a bunch of debug prints from a user of print_hex_dump_bytes() in my kernel logs, but I don't have CONFIG_DYNAMIC_DEBUG enabled nor do I have DEBUG defined in my build. The problem is that print_hex_dump_bytes() calls a wrapper function in lib/hexdump.c that calls print_hex_dump() with KERN_DEBUG level. There are three cases to consider here 1. CONFIG_DYNAMIC_DEBUG=y --> call dynamic_hex_dum() 2. CONFIG_DYNAMIC_DEBUG=n && DEBUG --> call print_hex_dump() 3. CONFIG_DYNAMIC_DEBUG=n && !DEBUG --> stub it out Right now, that last case isn't detected and we still call print_hex_dump() from the stub wrapper. Let's make print_hex_dump_bytes() only call print_hex_dump_debug() so that it works properly in all cases. Case #1, print_hex_dump_debug() calls dynamic_hex_dump() and we get same behavior. Case #2, print_hex_dump_debug() calls print_hex_dump() with KERN_DEBUG and we get the same behavior. Case #3, print_hex_dump_debug() is a nop, changing behavior to what we want, i.e. print nothing. Link: http://lkml.kernel.org/r/20190816235624.115280-1-swboyd@chromium.org Signed-off-by: Stephen Boyd Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/printk.h | 22 +++++++++++++++------- lib/hexdump.c | 21 --------------------- 2 files changed, 15 insertions(+), 28 deletions(-) diff --git a/include/linux/printk.h b/include/linux/printk.h index cefd374c47b1..c09d67edda3a 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -488,13 +488,6 @@ extern int hex_dump_to_buffer(const void *buf, size_t len, int rowsize, extern void print_hex_dump(const char *level, const char *prefix_str, int prefix_type, int rowsize, int groupsize, const void *buf, size_t len, bool ascii); -#if defined(CONFIG_DYNAMIC_DEBUG) -#define print_hex_dump_bytes(prefix_str, prefix_type, buf, len) \ - dynamic_hex_dump(prefix_str, prefix_type, 16, 1, buf, len, true) -#else -extern void print_hex_dump_bytes(const char *prefix_str, int prefix_type, - const void *buf, size_t len); -#endif /* defined(CONFIG_DYNAMIC_DEBUG) */ #else static inline void print_hex_dump(const char *level, const char *prefix_str, int prefix_type, int rowsize, int groupsize, @@ -526,4 +519,19 @@ static inline void print_hex_dump_debug(const char *prefix_str, int prefix_type, } #endif +/** + * print_hex_dump_bytes - shorthand form of print_hex_dump() with default params + * @prefix_str: string to prefix each line with; + * caller supplies trailing spaces for alignment if desired + * @prefix_type: controls whether prefix of an offset, address, or none + * is printed (%DUMP_PREFIX_OFFSET, %DUMP_PREFIX_ADDRESS, %DUMP_PREFIX_NONE) + * @buf: data blob to dump + * @len: number of bytes in the @buf + * + * Calls print_hex_dump(), with log level of KERN_DEBUG, + * rowsize of 16, groupsize of 1, and ASCII output included. + */ +#define print_hex_dump_bytes(prefix_str, prefix_type, buf, len) \ + print_hex_dump_debug(prefix_str, prefix_type, 16, 1, buf, len, true) + #endif diff --git a/lib/hexdump.c b/lib/hexdump.c index b1d55b669ae2..147133f8eb2f 100644 --- a/lib/hexdump.c +++ b/lib/hexdump.c @@ -270,25 +270,4 @@ void print_hex_dump(const char *level, const char *prefix_str, int prefix_type, } EXPORT_SYMBOL(print_hex_dump); -#if !defined(CONFIG_DYNAMIC_DEBUG) -/** - * print_hex_dump_bytes - shorthand form of print_hex_dump() with default params - * @prefix_str: string to prefix each line with; - * caller supplies trailing spaces for alignment if desired - * @prefix_type: controls whether prefix of an offset, address, or none - * is printed (%DUMP_PREFIX_OFFSET, %DUMP_PREFIX_ADDRESS, %DUMP_PREFIX_NONE) - * @buf: data blob to dump - * @len: number of bytes in the @buf - * - * Calls print_hex_dump(), with log level of KERN_DEBUG, - * rowsize of 16, groupsize of 1, and ASCII output included. - */ -void print_hex_dump_bytes(const char *prefix_str, int prefix_type, - const void *buf, size_t len) -{ - print_hex_dump(KERN_DEBUG, prefix_str, prefix_type, 16, 1, - buf, len, true); -} -EXPORT_SYMBOL(print_hex_dump_bytes); -#endif /* !defined(CONFIG_DYNAMIC_DEBUG) */ #endif /* defined(CONFIG_PRINTK) */ From 634cffcc9478e954d121c3e27e53de4f0d917ac8 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 25 Sep 2019 16:46:32 -0700 Subject: [PATCH 656/721] checkpatch: don't interpret stack dumps as commit IDs Add more types of lines that appear to be stack dumps that also include hex lines that might otherwise be interpreted as commit IDs. Link: http://lkml.kernel.org/r/ff00208289224f0ca4eaf4ff7c9c6e087dad0a63.camel@perches.com Link: http://lkml.kernel.org/r/f7dc9727795db3802809a24162abe0b67e14123b.1563575364.git.joe@perches.com Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 93a7edfe0f05..3c0ee0dde850 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2725,8 +2725,10 @@ sub process { ($line =~ /^\s*(?:WARNING:|BUG:)/ || $line =~ /^\s*\[\s*\d+\.\d{6,6}\s*\]/ || # timestamp - $line =~ /^\s*\[\<[0-9a-fA-F]{8,}\>\]/)) { - # stack dump address + $line =~ /^\s*\[\<[0-9a-fA-F]{8,}\>\]/) || + $line =~ /^(?:\s+\w+:\s+[0-9a-fA-F]+){3,3}/ || + $line =~ /^\s*\#\d+\s*\[[0-9a-fA-F]+\]\s*\w+ at [0-9a-fA-F]+/) { + # stack dump address styles $commit_log_possible_stack_dump = 1; } From ffbce8974d90efb5ced95fabc61283467c80cb0d Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 25 Sep 2019 16:46:35 -0700 Subject: [PATCH 657/721] checkpatch: improve SPDX license checking Use perl's m@@ match and not // comparisons to avoid an error using c90's // comment style. Miscellanea: o Use normal tab indentation and alignment Link: http://lkml.kernel.org/r/5e4a8fa7901148fbcd77ab391e6dd0e6bf95777f.camel@perches.com Link: http://lkml.kernel.org/r/f08eb62458407a145cfedf959d1091af151cd665.1563575364.git.joe@perches.com Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 3c0ee0dde850..6cb99ec62000 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3071,21 +3071,21 @@ sub process { # check SPDX comment style for .[chsS] files if ($realfile =~ /\.[chsS]$/ && $rawline =~ /SPDX-License-Identifier:/ && - $rawline !~ /^\+\s*\Q$comment\E\s*/) { + $rawline !~ m@^\+\s*\Q$comment\E\s*@) { WARN("SPDX_LICENSE_TAG", "Improper SPDX comment style for '$realfile', please use '$comment' instead\n" . $herecurr); } if ($comment !~ /^$/ && - $rawline !~ /^\+\Q$comment\E SPDX-License-Identifier: /) { - WARN("SPDX_LICENSE_TAG", - "Missing or malformed SPDX-License-Identifier tag in line $checklicenseline\n" . $herecurr); + $rawline !~ m@^\+\Q$comment\E SPDX-License-Identifier: @) { + WARN("SPDX_LICENSE_TAG", + "Missing or malformed SPDX-License-Identifier tag in line $checklicenseline\n" . $herecurr); } elsif ($rawline =~ /(SPDX-License-Identifier: .*)/) { - my $spdx_license = $1; - if (!is_SPDX_License_valid($spdx_license)) { - WARN("SPDX_LICENSE_TAG", - "'$spdx_license' is not supported in LICENSES/...\n" . $herecurr); - } + my $spdx_license = $1; + if (!is_SPDX_License_valid($spdx_license)) { + WARN("SPDX_LICENSE_TAG", + "'$spdx_license' is not supported in LICENSES/...\n" . $herecurr); + } } } } From a8dd86bf746256fbf68f82bc13356244c5ad8efa Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Wed, 25 Sep 2019 16:46:38 -0700 Subject: [PATCH 658/721] checkpatch.pl: warn on invalid commit id It can happen that a commit message refers to an invalid commit id, because the referenced hash changed following a rebase, or simply by mistake. Add a check in checkpatch.pl which checks that an hash referenced by a Fixes tag, or just cited in the commit message, is a valid commit id. $ scripts/checkpatch.pl <<'EOF' Subject: [PATCH] test commit Sample test commit to test checkpatch.pl Commit 1da177e4c3f4 ("Linux-2.6.12-rc2") really exists, commit 0bba044c4ce7 ("tree") is valid but not a commit, while commit b4cc0b1c0cca ("unknown") is invalid. Fixes: f0cacc14cade ("unknown") Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") EOF WARNING: Unknown commit id '0bba044c4ce7', maybe rebased or not pulled? #8: commit 0bba044c4ce7 ("tree") is valid but not a commit, WARNING: Unknown commit id 'b4cc0b1c0cca', maybe rebased or not pulled? #9: while commit b4cc0b1c0cca ("unknown") is invalid. WARNING: Unknown commit id 'f0cacc14cade', maybe rebased or not pulled? #11: Fixes: f0cacc14cade ("unknown") total: 0 errors, 3 warnings, 4 lines checked Link: http://lkml.kernel.org/r/20190711001640.13398-1-mcroce@redhat.com Signed-off-by: Matteo Croce Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 6cb99ec62000..0b1388078b69 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2900,6 +2900,17 @@ sub process { } } +# check for invalid commit id + if ($in_commit_log && $line =~ /(^fixes:|\bcommit)\s+([0-9a-f]{6,40})\b/i) { + my $id; + my $description; + ($id, $description) = git_commit_info($2, undef, undef); + if (!defined($id)) { + WARN("UNKNOWN_COMMIT_ID", + "Unknown commit id '$2', maybe rebased or not pulled?\n" . $herecurr); + } + } + # ignore non-hunk lines and lines being removed next if (!$hunk_line || $line =~ /^-/); From 6dba824e9ef7155f58f11b268b0f98ecec31b723 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Wed, 25 Sep 2019 16:46:41 -0700 Subject: [PATCH 659/721] checkpatch: exclude sizeof sub-expressions from MACRO_ARG_REUSE The arguments of sizeof are not evaluated so arguments are safe to re-use in that context. Excluding sizeof subexpressions means macros like ARRAY_SIZE can pass checkpatch. Link: http://lkml.kernel.org/r/20190806070833.24423-1-brendan.jackman@bluwireless.co.uk Signed-off-by: Brendan Jackman Acked-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 0b1388078b69..d7466879d505 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -5204,7 +5204,7 @@ sub process { next if ($arg =~ /\.\.\./); next if ($arg =~ /^type$/i); my $tmp_stmt = $define_stmt; - $tmp_stmt =~ s/\b(typeof|__typeof__|__builtin\w+|typecheck\s*\(\s*$Type\s*,|\#+)\s*\(*\s*$arg\s*\)*\b//g; + $tmp_stmt =~ s/\b(sizeof|typeof|__typeof__|__builtin\w+|typecheck\s*\(\s*$Type\s*,|\#+)\s*\(*\s*$arg\s*\)*\b//g; $tmp_stmt =~ s/\#+\s*$arg\b//g; $tmp_stmt =~ s/\b$arg\s*\#\#//g; my $use_cnt = () = $tmp_stmt =~ /\b$arg\b/g; From 462811d9d4007abe229c4cede0c124ed631da0cc Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 25 Sep 2019 16:46:44 -0700 Subject: [PATCH 660/721] checkpatch: prefer __section over __attribute__((section(...))) Add another test for __attribute__((section("foo"))) uses that should be __section(foo) Link: http://lkml.kernel.org/r/2f374c3c27054b7f978115270d587c624d9962fc.camel@perches.com Suggested-by: Nick Desaulniers Signed-off-by: Joe Perches Tested-by: Nick Desaulniers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index d7466879d505..1f85a3abbd17 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -5886,6 +5886,18 @@ sub process { "__aligned(size) is preferred over __attribute__((aligned(size)))\n" . $herecurr); } +# Check for __attribute__ section, prefer __section + if ($realfile !~ m@\binclude/uapi/@ && + $line =~ /\b__attribute__\s*\(\s*\(.*_*section_*\s*\(\s*("[^"]*")/) { + my $old = substr($rawline, $-[1], $+[1] - $-[1]); + my $new = substr($old, 1, -1); + if (WARN("PREFER_SECTION", + "__section($new) is preferred over __attribute__((section($old)))\n" . $herecurr) && + $fix) { + $fixed[$fixlinenr] =~ s/\b__attribute__\s*\(\s*\(\s*_*section_*\s*\(\s*\Q$old\E\s*\)\s*\)\s*\)/__section($new)/; + } + } + # Check for __attribute__ format(printf, prefer __printf if ($realfile !~ m@\binclude/uapi/@ && $line =~ /\b__attribute__\s*\(\s*\(\s*format\s*\(\s*printf/) { From 94fb98450456da82a16a378816390d99b85edb55 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 25 Sep 2019 16:46:47 -0700 Subject: [PATCH 661/721] checkpatch: allow consecutive close braces checkpatch allows consecutive open braces, so it should also allow consecutive close braces. Link: http://lkml.kernel.org/r/bfdb49ae2c3fa7b52fa168769e38b48f959880e2.camel@perches.com Signed-off-by: Joe Perches Acked-by: Jeff Kirsher Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 1f85a3abbd17..7603711503ea 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -4673,7 +4673,7 @@ sub process { # closing brace should have a space following it when it has anything # on the line - if ($line =~ /}(?!(?:,|;|\)))\S/) { + if ($line =~ /}(?!(?:,|;|\)|\}))\S/) { if (ERROR("SPACING", "space required after that close brace '}'\n" . $herecurr) && $fix) { From 5a7f4455ad321400e1361ab94fd6858c5b2fe0cf Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 25 Sep 2019 16:46:49 -0700 Subject: [PATCH 662/721] checkpatch: remove obsolete period from "ambiguous SHA1" query Git dropped the period from its "ambiguous SHA1" error message in commit 0c99171ad2 ("get_short_sha1: mark ambiguity error for translation"), circa 2016. Drop the period from checkpatch's associated query so as to match both the old and new error messages. Link: http://lkml.kernel.org/r/20190830163103.15914-1-sean.j.christopherson@intel.com Signed-off-by: Sean Christopherson Acked-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 7603711503ea..4b25a94cfc36 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -962,7 +962,7 @@ sub git_commit_info { return ($id, $desc) if ($#lines < 0); - if ($lines[0] =~ /^error: short SHA1 $commit is ambiguous\./) { + if ($lines[0] =~ /^error: short SHA1 $commit is ambiguous/) { # Maybe one day convert this block of bash into something that returns # all matching commit ids, but it's very slow... # From dbbf869da3adeba4a6beae4ecc184e47a16d078d Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 25 Sep 2019 16:46:52 -0700 Subject: [PATCH 663/721] checkpatch: make git output use LANGUAGE=en_US.utf8 git output parsing depends on the language being en_US english. Make the backtick execution of all `git ` commands set the LANGUAGE of the process to en_US.utf8 before executing the actual command using `export LANGUAGE=en_US.utf8; git `. Because the command is executed in a child process, the parent LANGUAGE is unchanged. Link: http://lkml.kernel.org/r/bb9f29988f3258281956680ff39c3e19e37dc0b8.camel@perches.com Signed-off-by: Joe Perches Reported-by: Sean Christopherson Reviewed-by: Sean Christopherson Tested-by: Sean Christopherson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 4b25a94cfc36..4eb355d8ae73 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -62,6 +62,8 @@ my $conststructsfile = "$D/const_structs.checkpatch"; my $typedefsfile = ""; my $color = "auto"; my $allow_c99_comments = 1; # Can be overridden by --ignore C99_COMMENT_TOLERANCE +# git output parsing needs US English output, so first set backtick child process LANGUAGE +my $git_command ='export LANGUAGE=en_US.UTF-8; git'; sub help { my ($exitcode) = @_; @@ -904,7 +906,7 @@ sub seed_camelcase_includes { $camelcase_seeded = 1; if (-e ".git") { - my $git_last_include_commit = `git log --no-merges --pretty=format:"%h%n" -1 -- include`; + my $git_last_include_commit = `${git_command} log --no-merges --pretty=format:"%h%n" -1 -- include`; chomp $git_last_include_commit; $camelcase_cache = ".checkpatch-camelcase.git.$git_last_include_commit"; } else { @@ -932,7 +934,7 @@ sub seed_camelcase_includes { } if (-e ".git") { - $files = `git ls-files "include/*.h"`; + $files = `${git_command} ls-files "include/*.h"`; @include_files = split('\n', $files); } @@ -956,7 +958,7 @@ sub git_commit_info { return ($id, $desc) if ((which("git") eq "") || !(-e ".git")); - my $output = `git log --no-color --format='%H %s' -1 $commit 2>&1`; + my $output = `${git_command} log --no-color --format='%H %s' -1 $commit 2>&1`; $output =~ s/^\s*//gm; my @lines = split("\n", $output); @@ -1006,7 +1008,7 @@ if ($git) { } else { $git_range = "-1 $commit_expr"; } - my $lines = `git log --no-color --no-merges --pretty=format:'%H %s' $git_range`; + my $lines = `${git_command} log --no-color --no-merges --pretty=format:'%H %s' $git_range`; foreach my $line (split(/\n/, $lines)) { $line =~ /^([0-9a-fA-F]{40,40}) (.*)$/; next if (!defined($1) || !defined($2)); From d256085be12dc3b24e7b19c357e975a37dbff509 Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Wed, 25 Sep 2019 16:46:55 -0700 Subject: [PATCH 664/721] fs: reiserfs: remove unnecessary check of bh in remove_from_transaction() On lines 3430-3434, bh has been assured to be non-null: cn = get_journal_hash_dev(sb, journal->j_hash_table, blocknr); if (!cn || !cn->bh) { return ret; } bh = cn->bh; Thus, the check of bh on line 3447 is unnecessary and can be removed. Thank Andrew Morton for good advice. Link: http://lkml.kernel.org/r/20190727084019.11307-1-baijiaju1990@gmail.com Signed-off-by: Jia-Ju Bai Reviewed-by: Jan Kara Cc: Arnd Bergmann Cc: Hariprasad Kelam Cc: Bharath Vedartham Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/journal.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 4517a1394c6f..11155b8513db 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -3444,9 +3444,8 @@ static int remove_from_transaction(struct super_block *sb, if (cn == journal->j_last) { journal->j_last = cn->prev; } - if (bh) - remove_journal_hash(sb, journal->j_hash_table, NULL, - bh->b_blocknr, 0); + remove_journal_hash(sb, journal->j_hash_table, NULL, + bh->b_blocknr, 0); clear_buffer_journaled(bh); /* don't log this one */ if (!already_cleaned) { From 6e9ca45f77bc944d44face28059ab4db02e280fa Mon Sep 17 00:00:00 2001 From: zhengbin Date: Wed, 25 Sep 2019 16:46:58 -0700 Subject: [PATCH 665/721] fs/reiserfs/journal.c: remove set but not used variables Fixes gcc '-Wunused-but-set-variable' warning: fs/reiserfs/journal.c: In function flush_older_commits: fs/reiserfs/journal.c:894:15: warning: variable first_trans_id set but not used [-Wunused-but-set-variable] fs/reiserfs/journal.c: In function flush_journal_list: fs/reiserfs/journal.c:1354:38: warning: variable last set but not used [-Wunused-but-set-variable] fs/reiserfs/journal.c: In function do_journal_release: fs/reiserfs/journal.c:1916:6: warning: variable flushed set but not used [-Wunused-but-set-variable] fs/reiserfs/journal.c: In function do_journal_end: fs/reiserfs/journal.c:3993:6: warning: variable old_start set but not used [-Wunused-but-set-variable] Link: http://lkml.kernel.org/r/1566379929-118398-2-git-send-email-zhengbin13@huawei.com Signed-off-by: zhengbin Reported-by: Hulk Robot Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/journal.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 11155b8513db..75d9d52d489f 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -891,7 +891,6 @@ static int flush_older_commits(struct super_block *s, struct list_head *entry; unsigned int trans_id = jl->j_trans_id; unsigned int other_trans_id; - unsigned int first_trans_id; find_first: /* @@ -914,8 +913,6 @@ static int flush_older_commits(struct super_block *s, return 0; } - first_trans_id = first_jl->j_trans_id; - entry = &first_jl->j_list; while (1) { other_jl = JOURNAL_LIST_ENTRY(entry); @@ -1351,7 +1348,7 @@ static int flush_journal_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) { struct reiserfs_journal_list *pjl; - struct reiserfs_journal_cnode *cn, *last; + struct reiserfs_journal_cnode *cn; int count; int was_jwait = 0; int was_dirty = 0; @@ -1509,7 +1506,6 @@ static int flush_journal_list(struct super_block *s, b_blocknr, __func__); } free_cnode: - last = cn; cn = cn->next; if (saved_bh) { /* @@ -1911,7 +1907,6 @@ static int do_journal_release(struct reiserfs_transaction_handle *th, struct super_block *sb, int error) { struct reiserfs_transaction_handle myth; - int flushed = 0; struct reiserfs_journal *journal = SB_JOURNAL(sb); /* @@ -1933,7 +1928,6 @@ static int do_journal_release(struct reiserfs_transaction_handle *th, 1); journal_mark_dirty(&myth, SB_BUFFER_WITH_SB(sb)); do_journal_end(&myth, FLUSH_ALL); - flushed = 1; } } @@ -3987,7 +3981,6 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, int flags) struct buffer_head *c_bh; /* commit bh */ struct buffer_head *d_bh; /* desc bh */ int cur_write_start = 0; /* start index of current log write */ - int old_start; int i; int flush; int wait_on_commit; @@ -4244,7 +4237,6 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, int flags) journal->j_num_work_lists++; /* reset journal values for the next transaction */ - old_start = journal->j_start; journal->j_start = (journal->j_start + journal->j_len + 2) % SB_ONDISK_JOURNAL_SIZE(sb); From 66985cb9ee107f2596e9d721252b679249c41858 Mon Sep 17 00:00:00 2001 From: zhengbin Date: Wed, 25 Sep 2019 16:47:01 -0700 Subject: [PATCH 666/721] fs/reiserfs/stree.c: remove set but not used variables Fixes gcc '-Wunused-but-set-variable' warning: fs/reiserfs/stree.c: In function search_by_key: fs/reiserfs/stree.c:596:6: warning: variable right_neighbor_of_leaf_node set but not used [-Wunused-but-set-variable] Link: http://lkml.kernel.org/r/1566379929-118398-3-git-send-email-zhengbin13@huawei.com Signed-off-by: zhengbin Reported-by: Hulk Robot Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/stree.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index 0037aea97d39..da9ebe33882b 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -593,7 +593,6 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, struct buffer_head *bh; struct path_element *last_element; int node_level, retval; - int right_neighbor_of_leaf_node; int fs_gen; struct buffer_head *reada_bh[SEARCH_BY_KEY_READA]; b_blocknr_t reada_blocks[SEARCH_BY_KEY_READA]; @@ -614,8 +613,6 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, pathrelse(search_path); - right_neighbor_of_leaf_node = 0; - /* * With each iteration of this loop we search through the items in the * current node, and calculate the next current node(next path element) @@ -701,7 +698,6 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, */ block_number = SB_ROOT_BLOCK(sb); expected_level = -1; - right_neighbor_of_leaf_node = 0; /* repeat search from the root */ continue; From d4a1a857e31abe7acc9586450119d9c4bc6cc2cd Mon Sep 17 00:00:00 2001 From: zhengbin Date: Wed, 25 Sep 2019 16:47:04 -0700 Subject: [PATCH 667/721] fs/reiserfs/lbalance.c: remove set but not used variables Fixes gcc '-Wunused-but-set-variable' warning: fs/reiserfs/lbalance.c: In function leaf_paste_entries: fs/reiserfs/lbalance.c:1325:9: warning: variable old_entry_num set but not used [-Wunused-but-set-variable] Link: http://lkml.kernel.org/r/1566379929-118398-4-git-send-email-zhengbin13@huawei.com Signed-off-by: zhengbin Reported-by: Hulk Robot Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/lbalance.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c index f5cebd70d903..7f868569d4d0 100644 --- a/fs/reiserfs/lbalance.c +++ b/fs/reiserfs/lbalance.c @@ -1322,7 +1322,7 @@ void leaf_paste_entries(struct buffer_info *bi, char *item; struct reiserfs_de_head *deh; char *insert_point; - int i, old_entry_num; + int i; struct buffer_head *bh = bi->bi_bh; if (new_entry_count == 0) @@ -1362,7 +1362,6 @@ void leaf_paste_entries(struct buffer_info *bi, put_deh_location(&deh[i], deh_location(&deh[i]) + paste_size); - old_entry_num = ih_entry_count(ih); put_ih_entry_count(ih, ih_entry_count(ih) + new_entry_count); /* prepare space for pasted records */ From 4a70aebb12689b2e7b51c4b10157c4c4e99c59e4 Mon Sep 17 00:00:00 2001 From: zhengbin Date: Wed, 25 Sep 2019 16:47:07 -0700 Subject: [PATCH 668/721] fs/reiserfs/objectid.c: remove set but not used variables Fixes gcc '-Wunused-but-set-variable' warning: fs/reiserfs/objectid.c: In function reiserfs_convert_objectid_map_v1: fs/reiserfs/objectid.c:186:25: warning: variable new_objectid_map set but not used [-Wunused-but-set-variable] Link: http://lkml.kernel.org/r/1566379929-118398-5-git-send-email-zhengbin13@huawei.com Signed-off-by: zhengbin Reported-by: Hulk Robot Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/objectid.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c index 415d66ca87d1..34baf5c0f265 100644 --- a/fs/reiserfs/objectid.c +++ b/fs/reiserfs/objectid.c @@ -183,13 +183,12 @@ int reiserfs_convert_objectid_map_v1(struct super_block *s) int new_size = (s->s_blocksize - SB_SIZE) / sizeof(__u32) / 2 * 2; int old_max = sb_oid_maxsize(disk_sb); struct reiserfs_super_block_v1 *disk_sb_v1; - __le32 *objectid_map, *new_objectid_map; + __le32 *objectid_map; int i; disk_sb_v1 = (struct reiserfs_super_block_v1 *)(SB_BUFFER_WITH_SB(s)->b_data); objectid_map = (__le32 *) (disk_sb_v1 + 1); - new_objectid_map = (__le32 *) (disk_sb + 1); if (cur_size > new_size) { /* From 73fbff5eea3c0684519c0d8b60177557b5ac8f9b Mon Sep 17 00:00:00 2001 From: zhengbin Date: Wed, 25 Sep 2019 16:47:10 -0700 Subject: [PATCH 669/721] fs/reiserfs/prints.c: remove set but not used variables Fixes gcc '-Wunused-but-set-variable' warning: fs/reiserfs/prints.c: In function check_internal_block_head: fs/reiserfs/prints.c:749:21: warning: variable blkh set but not used [-Wunused-but-set-variable] Link: http://lkml.kernel.org/r/1566379929-118398-6-git-send-email-zhengbin13@huawei.com Signed-off-by: zhengbin Reported-by: Hulk Robot Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/prints.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c index 9fed1c05f1f4..500f2000eb41 100644 --- a/fs/reiserfs/prints.c +++ b/fs/reiserfs/prints.c @@ -746,9 +746,6 @@ static void check_leaf_block_head(struct buffer_head *bh) static void check_internal_block_head(struct buffer_head *bh) { - struct block_head *blkh; - - blkh = B_BLK_HEAD(bh); if (!(B_LEVEL(bh) > DISK_LEAF_NODE_LEVEL && B_LEVEL(bh) <= MAX_HEIGHT)) reiserfs_panic(NULL, "vs-6025", "invalid level %z", bh); From 4fadcd1c14d810ec6a695039cfc71e03ae742deb Mon Sep 17 00:00:00 2001 From: zhengbin Date: Wed, 25 Sep 2019 16:47:13 -0700 Subject: [PATCH 670/721] fs/reiserfs/fix_node.c: remove set but not used variables fs/reiserfs/fix_node.c: In function get_num_ver: fs/reiserfs/fix_node.c:379:6: warning: variable cur_free set but not used [-Wunused-but-set-variable] fs/reiserfs/fix_node.c: In function dc_check_balance_internal: fs/reiserfs/fix_node.c:1737:6: warning: variable maxsize set but not used [-Wunused-but-set-variable] Link: http://lkml.kernel.org/r/1566379929-118398-7-git-send-email-zhengbin13@huawei.com Signed-off-by: zhengbin Reported-by: Hulk Robot Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/fix_node.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c index 6b0ddb2a9091..117092224111 100644 --- a/fs/reiserfs/fix_node.c +++ b/fs/reiserfs/fix_node.c @@ -376,7 +376,6 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h, int to, int to_bytes, short *snum012, int flow) { int i; - int cur_free; int units; struct virtual_node *vn = tb->tb_vn; int total_node_size, max_node_size, current_item_size; @@ -438,7 +437,6 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h, /* leaf level */ needed_nodes = 1; total_node_size = 0; - cur_free = max_node_size; /* start from 'from'-th item */ start_item = from; @@ -1734,14 +1732,12 @@ static int dc_check_balance_internal(struct tree_balance *tb, int h) * and Fh is its father. */ struct buffer_head *Sh, *Fh; - int maxsize, ret; + int ret; int lfree, rfree /* free space in L and R */ ; Sh = PATH_H_PBUFFER(tb->tb_path, h); Fh = PATH_H_PPARENT(tb->tb_path, h); - maxsize = MAX_CHILD_SIZE(Sh); - /* * using tb->insert_size[h], which is negative in this case, * create_virtual_node calculates: From da5184c2ab10b57bf9b58f818405aa0054a2f829 Mon Sep 17 00:00:00 2001 From: zhengbin Date: Wed, 25 Sep 2019 16:47:16 -0700 Subject: [PATCH 671/721] fs/reiserfs/do_balan.c: remove set but not used variables fs/reiserfs/do_balan.c: In function balance_leaf_when_delete: fs/reiserfs/do_balan.c:245:20: warning: variable ih set but not used [-Wunused-but-set-variable] fs/reiserfs/do_balan.c: In function balance_leaf_insert_left: fs/reiserfs/do_balan.c:301:7: warning: variable version set but not used [-Wunused-but-set-variable] fs/reiserfs/do_balan.c: In function balance_leaf_insert_right: fs/reiserfs/do_balan.c:649:7: warning: variable version set but not used [-Wunused-but-set-variable] fs/reiserfs/do_balan.c: In function balance_leaf_new_nodes_insert: fs/reiserfs/do_balan.c:953:7: warning: variable version set but not used [-Wunused-but-set-variable] Link: http://lkml.kernel.org/r/1566379929-118398-8-git-send-email-zhengbin13@huawei.com Signed-off-by: zhengbin Reported-by: Hulk Robot Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/do_balan.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c index 9c02d96d3a42..ffb6d7f0da94 100644 --- a/fs/reiserfs/do_balan.c +++ b/fs/reiserfs/do_balan.c @@ -239,10 +239,8 @@ static int balance_leaf_when_delete_left(struct tree_balance *tb) static int balance_leaf_when_delete(struct tree_balance *tb, int flag) { struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); - int item_pos = PATH_LAST_POSITION(tb->tb_path); struct buffer_info bi; int n; - struct item_head *ih; RFALSE(tb->FR[0] && B_LEVEL(tb->FR[0]) != DISK_LEAF_NODE_LEVEL + 1, "vs- 12000: level: wrong FR %z", tb->FR[0]); @@ -251,7 +249,6 @@ static int balance_leaf_when_delete(struct tree_balance *tb, int flag) RFALSE(!tb->blknum[0] && !PATH_H_PPARENT(tb->tb_path, 0), "PAP-12010: tree can not be empty"); - ih = item_head(tbS0, item_pos); buffer_info_init_tbS0(tb, &bi); /* Delete or truncate the item */ @@ -298,7 +295,6 @@ static unsigned int balance_leaf_insert_left(struct tree_balance *tb, if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) { /* part of new item falls into L[0] */ int new_item_len, shift; - int version; ret = leaf_shift_left(tb, tb->lnum[0] - 1, -1); @@ -317,8 +313,6 @@ static unsigned int balance_leaf_insert_left(struct tree_balance *tb, leaf_insert_into_buf(&bi, n + tb->item_pos - ret, ih, body, min_t(int, tb->zeroes_num, ih_item_len(ih))); - version = ih_version(ih); - /* * Calculate key component, item length and body to * insert into S[0] @@ -646,13 +640,11 @@ static void balance_leaf_insert_right(struct tree_balance *tb, if (tb->item_pos == n - tb->rnum[0] + 1 && tb->rbytes != -1) { loff_t old_key_comp, old_len, r_zeroes_number; const char *r_body; - int version, shift; + int shift; loff_t offset; leaf_shift_right(tb, tb->rnum[0] - 1, -1); - version = ih_version(ih); - /* Remember key component and item length */ old_key_comp = le_ih_k_offset(ih); old_len = ih_item_len(ih); @@ -950,14 +942,12 @@ static void balance_leaf_new_nodes_insert(struct tree_balance *tb, if (tb->item_pos == n - tb->snum[i] + 1 && tb->sbytes[i] != -1) { int old_key_comp, old_len, r_zeroes_number; const char *r_body; - int version; /* Move snum[i]-1 items from S[0] to S_new[i] */ leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i] - 1, -1, tb->S_new[i]); /* Remember key component and item length */ - version = ih_version(ih); old_key_comp = le_ih_k_offset(ih); old_len = ih_item_len(ih); From 3e9fd5a48cb7b0ef93be097c2c1066738d37f5b7 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Wed, 25 Sep 2019 16:47:19 -0700 Subject: [PATCH 672/721] fs/reiserfs/journal.c: remove set but not used variable Fix the following gcc warning: fs/reiserfs/journal.c: In function flush_used_journal_lists: fs/reiserfs/journal.c:1791:6: warning: variable ret set but not used [-Wunused-but-set-variable] Link: http://lkml.kernel.org/r/20190827032932.46622-1-yanaijie@huawei.com Signed-off-by: Jason Yan Cc: zhengbin Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/journal.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 75d9d52d489f..4b3e3e73b512 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -1788,7 +1788,6 @@ static int flush_used_journal_lists(struct super_block *s, { unsigned long len = 0; unsigned long cur_len; - int ret; int i; int limit = 256; struct reiserfs_journal_list *tjl; @@ -1825,9 +1824,9 @@ static int flush_used_journal_lists(struct super_block *s, * transactions, but only bother if we've actually spanned * across multiple lists */ - if (flush_jl != jl) { - ret = kupdate_transactions(s, jl, &tjl, &trans_id, len, i); - } + if (flush_jl != jl) + kupdate_transactions(s, jl, &tjl, &trans_id, len, i); + flush_journal_list(s, flush_jl, 1); put_journal_list(s, flush_jl); put_journal_list(s, jl); From b25bab17221ba366ffdff9dd62945932aac7dc98 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Wed, 25 Sep 2019 16:47:22 -0700 Subject: [PATCH 673/721] fs/reiserfs/do_balan.c: remove set but not used variable Fix the following gcc warning: fs/reiserfs/do_balan.c: In function balance_leaf_insert_right: fs/reiserfs/do_balan.c:629:6: warning: variable ret set but not used [-Wunused-but-set-variable] Link: http://lkml.kernel.org/r/20190827032932.46622-2-yanaijie@huawei.com Signed-off-by: Jason Yan Cc: zhengbin Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/do_balan.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c index ffb6d7f0da94..4075e41408b4 100644 --- a/fs/reiserfs/do_balan.c +++ b/fs/reiserfs/do_balan.c @@ -626,7 +626,6 @@ static void balance_leaf_insert_right(struct tree_balance *tb, struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); int n = B_NR_ITEMS(tbS0); struct buffer_info bi; - int ret; /* new item or part of it doesn't fall into R[0] */ if (n - tb->rnum[0] >= tb->item_pos) { @@ -690,7 +689,7 @@ static void balance_leaf_insert_right(struct tree_balance *tb, /* whole new item falls into R[0] */ /* Shift rnum[0]-1 items to R[0] */ - ret = leaf_shift_right(tb, tb->rnum[0] - 1, tb->rbytes); + leaf_shift_right(tb, tb->rnum[0] - 1, tb->rbytes); /* Insert new item into R[0] */ buffer_info_init_right(tb, &bi); From aadc4e01dbaaccd38abde03bc84d0332a6bd9eab Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Wed, 25 Sep 2019 16:47:24 -0700 Subject: [PATCH 674/721] fat: delete an unnecessary check before brelse() brelse() tests whether its argument is NULL and then returns immediately. Thus the test around the call is not needed. This issue was detected by using the Coccinelle software. Link: http://lkml.kernel.org/r/cfff3b81-fb5d-af26-7b5e-724266509045@web.de Signed-off-by: Markus Elfring Acked-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/dir.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 814ad2c2ba80..054acd9fd033 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -88,9 +88,7 @@ static int fat__get_entry(struct inode *dir, loff_t *pos, int err, offset; next: - if (*bh) - brelse(*bh); - + brelse(*bh); *bh = NULL; iblock = *pos >> sb->s_blocksize_bits; err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0, false); From 8495f7e6732ed248b648d36439795b42ec650b9e Mon Sep 17 00:00:00 2001 From: Sai Praneeth Prakhya Date: Wed, 25 Sep 2019 16:47:27 -0700 Subject: [PATCH 675/721] fork: improve error message for corrupted page tables When a user process exits, the kernel cleans up the mm_struct of the user process and during cleanup, check_mm() checks the page tables of the user process for corruption (E.g: unexpected page flags set/cleared). For corrupted page tables, the error message printed by check_mm() isn't very clear as it prints the loop index instead of page table type (E.g: Resident file mapping pages vs Resident shared memory pages). The loop index in check_mm() is used to index rss_stat[] which represents individual memory type stats. Hence, instead of printing index, print memory type, thereby improving error message. Without patch: -------------- [ 204.836425] mm/pgtable-generic.c:29: bad p4d 0000000089eb4e92(800000025f941467) [ 204.836544] BUG: Bad rss-counter state mm:00000000f75895ea idx:0 val:2 [ 204.836615] BUG: Bad rss-counter state mm:00000000f75895ea idx:1 val:5 [ 204.836685] BUG: non-zero pgtables_bytes on freeing mm: 20480 With patch: ----------- [ 69.815453] mm/pgtable-generic.c:29: bad p4d 0000000084653642(800000025ca37467) [ 69.815872] BUG: Bad rss-counter state mm:00000000014a6c03 type:MM_FILEPAGES val:2 [ 69.815962] BUG: Bad rss-counter state mm:00000000014a6c03 type:MM_ANONPAGES val:5 [ 69.816050] BUG: non-zero pgtables_bytes on freeing mm: 20480 Also, change print function (from printk(KERN_ALERT, ..) to pr_alert()) so that it matches the other print statement. Link: http://lkml.kernel.org/r/da75b5153f617f4c5739c08ee6ebeb3d19db0fbc.1565123758.git.sai.praneeth.prakhya@intel.com Signed-off-by: Sai Praneeth Prakhya Reviewed-by: Anshuman Khandual Suggested-by: Dave Hansen Acked-by: Michal Hocko Acked-by: Vlastimil Babka Acked-by: Dave Hansen Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types_task.h | 4 ++++ kernel/fork.c | 16 ++++++++++++++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h index d7016dcb245e..c1bc6731125c 100644 --- a/include/linux/mm_types_task.h +++ b/include/linux/mm_types_task.h @@ -36,6 +36,10 @@ struct vmacache { struct vm_area_struct *vmas[VMACACHE_SIZE]; }; +/* + * When updating this, please also update struct resident_page_types[] in + * kernel/fork.c + */ enum { MM_FILEPAGES, /* Resident file mapping pages */ MM_ANONPAGES, /* Resident anonymous pages */ diff --git a/kernel/fork.c b/kernel/fork.c index 5a0fd518e04e..60763c043aa3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -125,6 +125,15 @@ int nr_threads; /* The idle threads do not count.. */ static int max_threads; /* tunable limit on nr_threads */ +#define NAMED_ARRAY_INDEX(x) [x] = __stringify(x) + +static const char * const resident_page_types[] = { + NAMED_ARRAY_INDEX(MM_FILEPAGES), + NAMED_ARRAY_INDEX(MM_ANONPAGES), + NAMED_ARRAY_INDEX(MM_SWAPENTS), + NAMED_ARRAY_INDEX(MM_SHMEMPAGES), +}; + DEFINE_PER_CPU(unsigned long, process_counts) = 0; __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ @@ -645,12 +654,15 @@ static void check_mm(struct mm_struct *mm) { int i; + BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS, + "Please make sure 'struct resident_page_types[]' is updated as well"); + for (i = 0; i < NR_MM_COUNTERS; i++) { long x = atomic_long_read(&mm->rss_stat.count[i]); if (unlikely(x)) - printk(KERN_ALERT "BUG: Bad rss-counter state " - "mm:%p idx:%d val:%ld\n", mm, i, x); + pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n", + mm, resident_page_types[i], x); } if (mm_pgtables_bytes(mm)) From 2a4a4082cd4438333b5ecffdd15d1a484e5a83c7 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 25 Sep 2019 16:47:30 -0700 Subject: [PATCH 676/721] cpumask: nicer for_each_cpumask_and() signature Mask arguments can be swapped without changing anything. Make arguments names reflect that: #define for_each_cpu_and(cpu, mask1, mask2) Link: http://lkml.kernel.org/r/20190724183350.GA15041@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpumask.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index b5a5a1ed9efd..78a73eba64dd 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -200,8 +200,8 @@ static inline unsigned int cpumask_local_spread(unsigned int i, int node) for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask) #define for_each_cpu_wrap(cpu, mask, start) \ for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask, (void)(start)) -#define for_each_cpu_and(cpu, mask, and) \ - for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask, (void)and) +#define for_each_cpu_and(cpu, mask1, mask2) \ + for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask1, (void)mask2) #else /** * cpumask_first - get the first cpu in a cpumask @@ -290,20 +290,20 @@ extern int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool /** * for_each_cpu_and - iterate over every cpu in both masks * @cpu: the (optionally unsigned) integer iterator - * @mask: the first cpumask pointer - * @and: the second cpumask pointer + * @mask1: the first cpumask pointer + * @mask2: the second cpumask pointer * * This saves a temporary CPU mask in many places. It is equivalent to: * struct cpumask tmp; - * cpumask_and(&tmp, &mask, &and); + * cpumask_and(&tmp, &mask1, &mask2); * for_each_cpu(cpu, &tmp) * ... * * After the loop, cpu is >= nr_cpu_ids. */ -#define for_each_cpu_and(cpu, mask, and) \ +#define for_each_cpu_and(cpu, mask1, mask2) \ for ((cpu) = -1; \ - (cpu) = cpumask_next_and((cpu), (mask), (and)), \ + (cpu) = cpumask_next_and((cpu), (mask1), (mask2)), \ (cpu) < nr_cpu_ids;) #endif /* SMP */ From 7c3a6aedcd6aae0a32a527e68669f7dd667492d1 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 25 Sep 2019 16:47:33 -0700 Subject: [PATCH 677/721] kexec: bail out upon SIGKILL when allocating memory. syzbot found that a thread can stall for minutes inside kexec_load() after that thread was killed by SIGKILL [1]. It turned out that the reproducer was trying to allocate 2408MB of memory using kimage_alloc_page() from kimage_load_normal_segment(). Let's check for SIGKILL before doing memory allocation. [1] https://syzkaller.appspot.com/bug?id=a0e3436829698d5824231251fad9d8e998f94f5e Link: http://lkml.kernel.org/r/993c9185-d324-2640-d061-bed2dd18b1f7@I-love.SAKURA.ne.jp Signed-off-by: Tetsuo Handa Reported-by: syzbot Cc: Eric Biederman Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index d5870723b8ad..15d70a90b50d 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -300,6 +300,8 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) { struct page *pages; + if (fatal_signal_pending(current)) + return NULL; pages = alloc_pages(gfp_mask & ~__GFP_ZERO, order); if (pages) { unsigned int count, i; From d5372c39132958679c480d0295dd328c741c7a41 Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Wed, 25 Sep 2019 16:47:36 -0700 Subject: [PATCH 678/721] kexec: restore arch_kexec_kernel_image_probe declaration arch_kexec_kernel_image_probe function declaration has been removed by commit 9ec4ecef0af7 ("kexec_file,x86,powerpc: factor out kexec_file_ops functions"). Still this function is overridden by couple of architectures and proper prototype declaration is therefore important, so bring it back. This fixes the following sparse warning on s390: arch/s390/kernel/machine_kexec_file.c:333:5: warning: symbol 'arch_kexec_kernel_image_probe' was not declared. Should it be static? Link: http://lkml.kernel.org/r/patch.git-ff1c9045ebdc.your-ad-here.call-01564402297-ext-5690@work.hours Signed-off-by: Vasily Gorbik Acked-by: Dave Young Reviewed-by: Bhupesh Sharma Cc: Eric Biederman Cc: AKASHI Takahiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kexec.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/kexec.h b/include/linux/kexec.h index f0b809258ed3..cc162f3e6461 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -183,6 +183,8 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, bool get_value); void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name); +int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf, + unsigned long buf_len); void * __weak arch_kexec_kernel_image_load(struct kimage *image); int __weak arch_kexec_apply_relocations_add(struct purgatory_info *pi, Elf_Shdr *section, From 9dd819a15162f8f82a6001b090caa38c18297b39 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 25 Sep 2019 16:47:39 -0700 Subject: [PATCH 679/721] uaccess: add missing __must_check attributes The usercopy implementation comments describe that callers of the copy_*_user() family of functions must always have their return values checked. This can be enforced at compile time with __must_check, so add it where needed. Link: http://lkml.kernel.org/r/201908251609.ADAD5CAAC1@keescook Signed-off-by: Kees Cook Cc: Alexander Viro Cc: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/thread_info.h | 2 +- include/linux/uaccess.h | 21 +++++++++++---------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 8d8821b3689a..659a4400517b 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -134,7 +134,7 @@ static inline void copy_overflow(int size, unsigned long count) WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count); } -static __always_inline bool +static __always_inline __must_check bool check_copy_size(const void *addr, size_t bytes, bool is_source) { int sz = __compiletime_object_size(addr); diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 34a038563d97..70bbdc38dc37 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -55,7 +55,7 @@ * as usual) and both source and destination can trigger faults. */ -static __always_inline unsigned long +static __always_inline __must_check unsigned long __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) { kasan_check_write(to, n); @@ -63,7 +63,7 @@ __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) return raw_copy_from_user(to, from, n); } -static __always_inline unsigned long +static __always_inline __must_check unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n) { might_fault(); @@ -85,7 +85,7 @@ __copy_from_user(void *to, const void __user *from, unsigned long n) * The caller should also make sure he pins the user space address * so that we don't result in page fault and sleep. */ -static __always_inline unsigned long +static __always_inline __must_check unsigned long __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) { kasan_check_read(from, n); @@ -93,7 +93,7 @@ __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) return raw_copy_to_user(to, from, n); } -static __always_inline unsigned long +static __always_inline __must_check unsigned long __copy_to_user(void __user *to, const void *from, unsigned long n) { might_fault(); @@ -103,7 +103,7 @@ __copy_to_user(void __user *to, const void *from, unsigned long n) } #ifdef INLINE_COPY_FROM_USER -static inline unsigned long +static inline __must_check unsigned long _copy_from_user(void *to, const void __user *from, unsigned long n) { unsigned long res = n; @@ -117,12 +117,12 @@ _copy_from_user(void *to, const void __user *from, unsigned long n) return res; } #else -extern unsigned long +extern __must_check unsigned long _copy_from_user(void *, const void __user *, unsigned long); #endif #ifdef INLINE_COPY_TO_USER -static inline unsigned long +static inline __must_check unsigned long _copy_to_user(void __user *to, const void *from, unsigned long n) { might_fault(); @@ -133,7 +133,7 @@ _copy_to_user(void __user *to, const void *from, unsigned long n) return n; } #else -extern unsigned long +extern __must_check unsigned long _copy_to_user(void __user *, const void *, unsigned long); #endif @@ -222,8 +222,9 @@ static inline bool pagefault_disabled(void) #ifndef ARCH_HAS_NOCACHE_UACCESS -static inline unsigned long __copy_from_user_inatomic_nocache(void *to, - const void __user *from, unsigned long n) +static inline __must_check unsigned long +__copy_from_user_inatomic_nocache(void *to, const void __user *from, + unsigned long n) { return __copy_from_user_inatomic(to, from, n); } From ac7c3e4ff401b304489a031938dbeaab585bfe0a Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 25 Sep 2019 16:47:42 -0700 Subject: [PATCH 680/721] compiler: enable CONFIG_OPTIMIZE_INLINING forcibly Commit 9012d011660e ("compiler: allow all arches to enable CONFIG_OPTIMIZE_INLINING") allowed all architectures to enable this option. A couple of build errors were reported by randconfig, but all of them have been ironed out. Towards the goal of removing CONFIG_OPTIMIZE_INLINING entirely (and it will simplify the 'inline' macro in compiler_types.h), this commit changes it to always-on option. Going forward, the compiler will always be allowed to not inline functions marked 'inline'. This is not a problem for x86 since it has been long used by arch/x86/configs/{x86_64,i386}_defconfig. I am keeping the config option just in case any problem crops up for other architectures. The code clean-up will be done after confirming this is solid. Link: http://lkml.kernel.org/r/20190830034304.24259-1-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Acked-by: Nick Desaulniers Cc: Ingo Molnar Cc: Borislav Petkov Cc: Miguel Ojeda Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 6b1b1703a646..93d97f9b0157 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -311,7 +311,7 @@ config HEADERS_CHECK relevant for userspace, say 'Y'. config OPTIMIZE_INLINING - bool "Allow compiler to uninline functions marked 'inline'" + def_bool y help This option determines if the kernel forces gcc to inline the functions developers have marked 'inline'. Doing so takes away freedom from gcc to @@ -322,8 +322,6 @@ config OPTIMIZE_INLINING decision will become the default in the future. Until then this option is there to test gcc for this. - If unsure, say N. - config DEBUG_SECTION_MISMATCH bool "Enable full Section mismatch analysis" help From 7d92bda271ddcbb2d1be2f82733dcb9bf8378010 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 25 Sep 2019 16:47:45 -0700 Subject: [PATCH 681/721] kgdb: don't use a notifier to enter kgdb at panic; call directly Right now kgdb/kdb hooks up to debug panics by registering for the panic notifier. This works OK except that it means that kgdb/kdb gets called _after_ the CPUs in the system are taken offline. That means that if anything important was happening on those CPUs (like something that might have contributed to the panic) you can't debug them. Specifically I ran into a case where I got a panic because a task was "blocked for more than 120 seconds" which was detected on CPU 2. I nicely got shown stack traces in the kernel log for all CPUs including CPU 0, which was running 'PID: 111 Comm: kworker/0:1H' and was in the middle of __mmc_switch(). I then ended up at the kdb prompt where switched over to kgdb to try to look at local variables of the process on CPU 0. I found that I couldn't. Digging more, I found that I had no info on any tasks running on CPUs other than CPU 2 and that asking kdb for help showed me "Error: no saved data for this cpu". This was because all the CPUs were offline. Let's move the entry of kdb/kgdb to a direct call from panic() and stop using the generic notifier. Putting a direct call in allows us to order things more properly and it also doesn't seem like we're breaking any abstractions by calling into the debugger from the panic function. Daniel said: : This patch changes the way kdump and kgdb interact with each other. : However it would seem rather odd to have both tools simultaneously armed : and, even if they were, the user still has the option to use panic_timeout : to force a kdump to happen. Thus I think the change of order is : acceptable. Link: http://lkml.kernel.org/r/20190703170354.217312-1-dianders@chromium.org Signed-off-by: Douglas Anderson Reviewed-by: Daniel Thompson Cc: Jason Wessel Cc: Kees Cook Cc: Borislav Petkov Cc: Thomas Gleixner Cc: Feng Tang Cc: YueHaibing Cc: Sergey Senozhatsky Cc: "Steven Rostedt (VMware)" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kgdb.h | 2 ++ kernel/debug/debug_core.c | 33 ++++++++++++--------------------- kernel/panic.c | 8 ++++++++ 3 files changed, 22 insertions(+), 21 deletions(-) diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h index fbf144aaa749..b072aeb1fd78 100644 --- a/include/linux/kgdb.h +++ b/include/linux/kgdb.h @@ -326,8 +326,10 @@ extern atomic_t kgdb_active; (raw_smp_processor_id() == atomic_read(&kgdb_active)) extern bool dbg_is_early; extern void __init dbg_late_init(void); +extern void kgdb_panic(const char *msg); #else /* ! CONFIG_KGDB */ #define in_dbg_master() (0) #define dbg_late_init() +static inline void kgdb_panic(const char *msg) {} #endif /* ! CONFIG_KGDB */ #endif /* _KGDB_H_ */ diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 10f1187b3907..f76d6f77dd5e 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -893,29 +893,24 @@ static struct sysrq_key_op sysrq_dbg_op = { }; #endif -static int kgdb_panic_event(struct notifier_block *self, - unsigned long val, - void *data) +void kgdb_panic(const char *msg) { + if (!kgdb_io_module_registered) + return; + /* - * Avoid entering the debugger if we were triggered due to a panic - * We don't want to get stuck waiting for input from user in such case. - * panic_timeout indicates the system should automatically + * We don't want to get stuck waiting for input from user if + * "panic_timeout" indicates the system should automatically * reboot on panic. */ if (panic_timeout) - return NOTIFY_DONE; + return; if (dbg_kdb_mode) - kdb_printf("PANIC: %s\n", (char *)data); - kgdb_breakpoint(); - return NOTIFY_DONE; -} + kdb_printf("PANIC: %s\n", msg); -static struct notifier_block kgdb_panic_event_nb = { - .notifier_call = kgdb_panic_event, - .priority = INT_MAX, -}; + kgdb_breakpoint(); +} void __weak kgdb_arch_late(void) { @@ -965,8 +960,6 @@ static void kgdb_register_callbacks(void) kgdb_arch_late(); register_module_notifier(&dbg_module_load_nb); register_reboot_notifier(&dbg_reboot_notifier); - atomic_notifier_chain_register(&panic_notifier_list, - &kgdb_panic_event_nb); #ifdef CONFIG_MAGIC_SYSRQ register_sysrq_key('g', &sysrq_dbg_op); #endif @@ -980,16 +973,14 @@ static void kgdb_register_callbacks(void) static void kgdb_unregister_callbacks(void) { /* - * When this routine is called KGDB should unregister from the - * panic handler and clean up, making sure it is not handling any + * When this routine is called KGDB should unregister from + * handlers and clean up, making sure it is not handling any * break exceptions at the time. */ if (kgdb_io_module_registered) { kgdb_io_module_registered = 0; unregister_reboot_notifier(&dbg_reboot_notifier); unregister_module_notifier(&dbg_module_load_nb); - atomic_notifier_chain_unregister(&panic_notifier_list, - &kgdb_panic_event_nb); kgdb_arch_exit(); #ifdef CONFIG_MAGIC_SYSRQ unregister_sysrq_key('g', &sysrq_dbg_op); diff --git a/kernel/panic.c b/kernel/panic.c index 057540b6eee9..d1ece4c363b9 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -219,6 +220,13 @@ void panic(const char *fmt, ...) dump_stack(); #endif + /* + * If kgdb is enabled, give it a chance to run before we stop all + * the other CPUs or else we won't be able to debug processes left + * running on them. + */ + kgdb_panic(buf); + /* * If we have crashed and we have a crash kernel loaded let it handle * everything else. From da036ae147624b70f7d3784ff3a53bd4fda20d2a Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 25 Sep 2019 16:47:48 -0700 Subject: [PATCH 682/721] scripts/gdb: handle split debug Some systems (like Chrome OS) may use "split debug" for kernel modules. That means that the debug symbols are in a different file than the main elf file. Let's handle that by also searching for debug symbols that end in ".ko.debug". This is a packaging topic. You can take a normal elf file and split the debug out of it using objcopy. Try "man objcopy" and then take a look at the "--only-keep-debug" option. It'll give you a whole recipe for doing splitdebug. The suffix used for the debug symbols is arbitrary. If people have other another suffix besides ".ko.debug" then we could presumably support that too... For portage (which is the packaging system used by Chrome OS) split debug is supported by default (and the suffix is .ko.debug). ...and so in Chrome OS we always get the installed elf files stripped and then the symbols stashed away. At the moment we don't actually use the normal portage magic to do this for the kernel though since it affects our ability to get good stack dumps in the kernel. We instead pass a script as "strip" [1]. [1] https://chromium.googlesource.com/chromiumos/overlays/chromiumos-overlay/+/refs/heads/master/eclass/cros-kernel/strip_splitdebug Link: http://lkml.kernel.org/r/20190730234052.148744-1-dianders@chromium.org Signed-off-by: Douglas Anderson Reviewed-by: Stephen Boyd Reviewed-by: Jan Kiszka Cc: Kieran Bingham Cc: Jason Wessel Cc: Daniel Thompson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/gdb/linux/symbols.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/gdb/linux/symbols.py b/scripts/gdb/linux/symbols.py index 2f5b95f09fa0..34e40e96dee2 100644 --- a/scripts/gdb/linux/symbols.py +++ b/scripts/gdb/linux/symbols.py @@ -77,12 +77,12 @@ lx-symbols command.""" gdb.write("scanning for modules in {0}\n".format(path)) for root, dirs, files in os.walk(path): for name in files: - if name.endswith(".ko"): + if name.endswith(".ko") or name.endswith(".ko.debug"): self.module_files.append(root + "/" + name) self.module_files_updated = True def _get_module_file(self, module_name): - module_pattern = ".*/{0}\.ko$".format( + module_pattern = ".*/{0}\.ko(?:.debug)?$".format( module_name.replace("_", r"[_\-]")) for name in self.module_files: if re.match(module_pattern, name) and os.path.exists(name): From ee8711336c51708382627ebcaee5f2122b77dfef Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 25 Sep 2019 16:47:52 -0700 Subject: [PATCH 683/721] bug: refactor away warn_slowpath_fmt_taint() Patch series "Clean up WARN() "cut here" handling", v2. Christophe Leroy noticed that the fix for missing "cut here" in the WARN() case was adding explicit printk() calls instead of teaching the exception handler to add it. This refactors the bug/warn infrastructure to pass this information as a new BUGFLAG. Longer details repeated from the last patch in the series: bug: move WARN_ON() "cut here" into exception handler The original cleanup of "cut here" missed the WARN_ON() case (that does not have a printk message), which was fixed recently by adding an explicit printk of "cut here". This had the downside of adding a printk() to every WARN_ON() caller, which reduces the utility of using an instruction exception to streamline the resulting code. By making this a new BUGFLAG, all of these can be removed and "cut here" can be handled by the exception handler. This was very pronounced on PowerPC, but the effect can be seen on x86 as well. The resulting text size of a defconfig build shows some small savings from this patch: text data bss dec hex filename 19691167 5134320 1646664 26472151 193eed7 vmlinux.before 19676362 5134260 1663048 26473670 193f4c6 vmlinux.after This change also opens the door for creating something like BUG_MSG(), where a custom printk() before issuing BUG(), without confusing the "cut here" line. This patch (of 7): There's no reason to have specialized helpers for passing the warn taint down to __warn(). Consolidate and refactor helper macros, removing __WARN_printf() and warn_slowpath_fmt_taint(). Link: http://lkml.kernel.org/r/20190819234111.9019-2-keescook@chromium.org Signed-off-by: Kees Cook Cc: Christophe Leroy Cc: Peter Zijlstra Cc: Christophe Leroy Cc: Drew Davenport Cc: Arnd Bergmann Cc: "Steven Rostedt (VMware)" Cc: Feng Tang Cc: Petr Mladek Cc: Mauro Carvalho Chehab Cc: Borislav Petkov Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/bug.h | 13 ++++--------- kernel/panic.c | 18 +++--------------- 2 files changed, 7 insertions(+), 24 deletions(-) diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 7357a3c942a0..c3a9c16a2b69 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -90,24 +90,19 @@ struct bug_entry { * Use the versions with printk format strings to provide better diagnostics. */ #ifndef __WARN_TAINT -extern __printf(3, 4) -void warn_slowpath_fmt(const char *file, const int line, - const char *fmt, ...); extern __printf(4, 5) -void warn_slowpath_fmt_taint(const char *file, const int line, unsigned taint, - const char *fmt, ...); +void warn_slowpath_fmt(const char *file, const int line, unsigned taint, + const char *fmt, ...); extern void warn_slowpath_null(const char *file, const int line); #define WANT_WARN_ON_SLOWPATH #define __WARN() warn_slowpath_null(__FILE__, __LINE__) -#define __WARN_printf(arg...) warn_slowpath_fmt(__FILE__, __LINE__, arg) #define __WARN_printf_taint(taint, arg...) \ - warn_slowpath_fmt_taint(__FILE__, __LINE__, taint, arg) + warn_slowpath_fmt(__FILE__, __LINE__, taint, arg) #else extern __printf(1, 2) void __warn_printk(const char *fmt, ...); #define __WARN() do { \ printk(KERN_WARNING CUT_HERE); __WARN_TAINT(TAINT_WARN); \ } while (0) -#define __WARN_printf(arg...) __WARN_printf_taint(TAINT_WARN, arg) #define __WARN_printf_taint(taint, arg...) \ do { __warn_printk(arg); __WARN_TAINT(taint); } while (0) #endif @@ -132,7 +127,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, #define WARN(condition, format...) ({ \ int __ret_warn_on = !!(condition); \ if (unlikely(__ret_warn_on)) \ - __WARN_printf(format); \ + __WARN_printf_taint(TAINT_WARN, format); \ unlikely(__ret_warn_on); \ }) #endif diff --git a/kernel/panic.c b/kernel/panic.c index d1ece4c363b9..1d89f5423426 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -600,20 +600,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint, } #ifdef WANT_WARN_ON_SLOWPATH -void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) -{ - struct warn_args args; - - args.fmt = fmt; - va_start(args.args, fmt); - __warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL, - &args); - va_end(args.args); -} -EXPORT_SYMBOL(warn_slowpath_fmt); - -void warn_slowpath_fmt_taint(const char *file, int line, - unsigned taint, const char *fmt, ...) +void warn_slowpath_fmt(const char *file, int line, unsigned taint, + const char *fmt, ...) { struct warn_args args; @@ -622,7 +610,7 @@ void warn_slowpath_fmt_taint(const char *file, int line, __warn(file, line, __builtin_return_address(0), taint, NULL, &args); va_end(args.args); } -EXPORT_SYMBOL(warn_slowpath_fmt_taint); +EXPORT_SYMBOL(warn_slowpath_fmt); void warn_slowpath_null(const char *file, int line) { From 89348fc31441f0270b040fbeb68b6d7d13504f36 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 25 Sep 2019 16:47:55 -0700 Subject: [PATCH 684/721] bug: rename __WARN_printf_taint() to __WARN_printf() This just renames the helper to improve readability. Link: http://lkml.kernel.org/r/20190819234111.9019-3-keescook@chromium.org Signed-off-by: Kees Cook Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christophe Leroy Cc: Drew Davenport Cc: Feng Tang Cc: Mauro Carvalho Chehab Cc: Peter Zijlstra Cc: Petr Mladek Cc: "Steven Rostedt (VMware)" Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/bug.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index c3a9c16a2b69..2d35bbf687d0 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -96,14 +96,14 @@ void warn_slowpath_fmt(const char *file, const int line, unsigned taint, extern void warn_slowpath_null(const char *file, const int line); #define WANT_WARN_ON_SLOWPATH #define __WARN() warn_slowpath_null(__FILE__, __LINE__) -#define __WARN_printf_taint(taint, arg...) \ +#define __WARN_printf(taint, arg...) \ warn_slowpath_fmt(__FILE__, __LINE__, taint, arg) #else extern __printf(1, 2) void __warn_printk(const char *fmt, ...); #define __WARN() do { \ printk(KERN_WARNING CUT_HERE); __WARN_TAINT(TAINT_WARN); \ } while (0) -#define __WARN_printf_taint(taint, arg...) \ +#define __WARN_printf(taint, arg...) \ do { __warn_printk(arg); __WARN_TAINT(taint); } while (0) #endif @@ -127,7 +127,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, #define WARN(condition, format...) ({ \ int __ret_warn_on = !!(condition); \ if (unlikely(__ret_warn_on)) \ - __WARN_printf_taint(TAINT_WARN, format); \ + __WARN_printf(TAINT_WARN, format); \ unlikely(__ret_warn_on); \ }) #endif @@ -135,7 +135,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, #define WARN_TAINT(condition, taint, format...) ({ \ int __ret_warn_on = !!(condition); \ if (unlikely(__ret_warn_on)) \ - __WARN_printf_taint(taint, format); \ + __WARN_printf(taint, format); \ unlikely(__ret_warn_on); \ }) From f2f84b05e02b7710a201f0017b3272ad7ef703d1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 25 Sep 2019 16:47:58 -0700 Subject: [PATCH 685/721] bug: consolidate warn_slowpath_fmt() usage Instead of having a separate helper for no printk output, just consolidate the logic into warn_slowpath_fmt(). Link: http://lkml.kernel.org/r/20190819234111.9019-4-keescook@chromium.org Signed-off-by: Kees Cook Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christophe Leroy Cc: Drew Davenport Cc: Feng Tang Cc: Mauro Carvalho Chehab Cc: Peter Zijlstra Cc: Petr Mladek Cc: "Steven Rostedt (VMware)" Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/bug.h | 3 +-- kernel/panic.c | 14 +++++++------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 2d35bbf687d0..598d7072602f 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -93,9 +93,8 @@ struct bug_entry { extern __printf(4, 5) void warn_slowpath_fmt(const char *file, const int line, unsigned taint, const char *fmt, ...); -extern void warn_slowpath_null(const char *file, const int line); #define WANT_WARN_ON_SLOWPATH -#define __WARN() warn_slowpath_null(__FILE__, __LINE__) +#define __WARN() __WARN_printf(TAINT_WARN, NULL) #define __WARN_printf(taint, arg...) \ warn_slowpath_fmt(__FILE__, __LINE__, taint, arg) #else diff --git a/kernel/panic.c b/kernel/panic.c index 1d89f5423426..79c153951b59 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -605,19 +605,19 @@ void warn_slowpath_fmt(const char *file, int line, unsigned taint, { struct warn_args args; + if (!fmt) { + pr_warn(CUT_HERE); + __warn(file, line, __builtin_return_address(0), taint, + NULL, NULL); + return; + } + args.fmt = fmt; va_start(args.args, fmt); __warn(file, line, __builtin_return_address(0), taint, NULL, &args); va_end(args.args); } EXPORT_SYMBOL(warn_slowpath_fmt); - -void warn_slowpath_null(const char *file, int line) -{ - pr_warn(CUT_HERE); - __warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL, NULL); -} -EXPORT_SYMBOL(warn_slowpath_null); #else void __warn_printk(const char *fmt, ...) { From d38aba49a9f72b862f1220739ca837c886fdc319 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 25 Sep 2019 16:48:01 -0700 Subject: [PATCH 686/721] bug: lift "cut here" out of __warn() In preparation for cleaning up "cut here", move the "cut here" logic up out of __warn() and into callers that pass non-NULL args. For anyone looking closely, there are two callers that pass NULL args: one already explicitly prints "cut here". The remaining case is covered by how a WARN is built, which will be cleaned up in the next patch. Link: http://lkml.kernel.org/r/20190819234111.9019-5-keescook@chromium.org Signed-off-by: Kees Cook Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christophe Leroy Cc: Drew Davenport Cc: Feng Tang Cc: Mauro Carvalho Chehab Cc: Peter Zijlstra Cc: Petr Mladek Cc: "Steven Rostedt (VMware)" Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/panic.c b/kernel/panic.c index 79c153951b59..a643e5464296 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -559,9 +559,6 @@ void __warn(const char *file, int line, void *caller, unsigned taint, { disable_trace_on_warning(); - if (args) - pr_warn(CUT_HERE); - if (file) pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n", raw_smp_processor_id(), current->pid, file, line, @@ -605,8 +602,9 @@ void warn_slowpath_fmt(const char *file, int line, unsigned taint, { struct warn_args args; + pr_warn(CUT_HERE); + if (!fmt) { - pr_warn(CUT_HERE); __warn(file, line, __builtin_return_address(0), taint, NULL, NULL); return; From d4bce140b4e739bceb4e239d4842cf8f346c1e0f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 25 Sep 2019 16:48:04 -0700 Subject: [PATCH 687/721] bug: clean up helper macros to remove __WARN_TAINT() In preparation for cleaning up "cut here" even more, this removes the __WARN_*TAINT() helpers, as they limit the ability to add new BUGFLAG_* flags to call sites. They are removed by expanding them into full __WARN_FLAGS() calls. Link: http://lkml.kernel.org/r/20190819234111.9019-6-keescook@chromium.org Signed-off-by: Kees Cook Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christophe Leroy Cc: Drew Davenport Cc: Feng Tang Cc: Mauro Carvalho Chehab Cc: Peter Zijlstra Cc: Petr Mladek Cc: "Steven Rostedt (VMware)" Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/bug.h | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 598d7072602f..4b18e09094cf 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -62,13 +62,11 @@ struct bug_entry { #endif #ifdef __WARN_FLAGS -#define __WARN_TAINT(taint) __WARN_FLAGS(BUGFLAG_TAINT(taint)) -#define __WARN_ONCE_TAINT(taint) __WARN_FLAGS(BUGFLAG_ONCE|BUGFLAG_TAINT(taint)) - #define WARN_ON_ONCE(condition) ({ \ int __ret_warn_on = !!(condition); \ if (unlikely(__ret_warn_on)) \ - __WARN_ONCE_TAINT(TAINT_WARN); \ + __WARN_FLAGS(BUGFLAG_ONCE | \ + BUGFLAG_TAINT(TAINT_WARN)); \ unlikely(__ret_warn_on); \ }) #endif @@ -89,7 +87,7 @@ struct bug_entry { * * Use the versions with printk format strings to provide better diagnostics. */ -#ifndef __WARN_TAINT +#ifndef __WARN_FLAGS extern __printf(4, 5) void warn_slowpath_fmt(const char *file, const int line, unsigned taint, const char *fmt, ...); @@ -99,11 +97,14 @@ void warn_slowpath_fmt(const char *file, const int line, unsigned taint, warn_slowpath_fmt(__FILE__, __LINE__, taint, arg) #else extern __printf(1, 2) void __warn_printk(const char *fmt, ...); -#define __WARN() do { \ - printk(KERN_WARNING CUT_HERE); __WARN_TAINT(TAINT_WARN); \ -} while (0) -#define __WARN_printf(taint, arg...) \ - do { __warn_printk(arg); __WARN_TAINT(taint); } while (0) +#define __WARN() do { \ + printk(KERN_WARNING CUT_HERE); \ + __WARN_FLAGS(BUGFLAG_TAINT(TAINT_WARN)); \ + } while (0) +#define __WARN_printf(taint, arg...) do { \ + __warn_printk(arg); \ + __WARN_FLAGS(BUGFLAG_TAINT(taint)); \ + } while (0) #endif /* used internally by panic.c */ From 2da1ead4d5f7fa5f61e5805655de1e245d03a763 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 25 Sep 2019 16:48:08 -0700 Subject: [PATCH 688/721] bug: consolidate __WARN_FLAGS usage Instead of having separate tests for __WARN_FLAGS, merge the two #ifdef blocks and replace the synonym WANT_WARN_ON_SLOWPATH macro. Link: http://lkml.kernel.org/r/20190819234111.9019-7-keescook@chromium.org Signed-off-by: Kees Cook Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christophe Leroy Cc: Drew Davenport Cc: Feng Tang Cc: Mauro Carvalho Chehab Cc: Peter Zijlstra Cc: Petr Mladek Cc: "Steven Rostedt (VMware)" Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/bug.h | 18 +++++++----------- kernel/panic.c | 2 +- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 4b18e09094cf..b4a2639130a0 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -61,16 +61,6 @@ struct bug_entry { #define BUG_ON(condition) do { if (unlikely(condition)) BUG(); } while (0) #endif -#ifdef __WARN_FLAGS -#define WARN_ON_ONCE(condition) ({ \ - int __ret_warn_on = !!(condition); \ - if (unlikely(__ret_warn_on)) \ - __WARN_FLAGS(BUGFLAG_ONCE | \ - BUGFLAG_TAINT(TAINT_WARN)); \ - unlikely(__ret_warn_on); \ -}) -#endif - /* * WARN(), WARN_ON(), WARN_ON_ONCE, and so on can be used to report * significant kernel issues that need prompt attention if they should ever @@ -91,7 +81,6 @@ struct bug_entry { extern __printf(4, 5) void warn_slowpath_fmt(const char *file, const int line, unsigned taint, const char *fmt, ...); -#define WANT_WARN_ON_SLOWPATH #define __WARN() __WARN_printf(TAINT_WARN, NULL) #define __WARN_printf(taint, arg...) \ warn_slowpath_fmt(__FILE__, __LINE__, taint, arg) @@ -105,6 +94,13 @@ extern __printf(1, 2) void __warn_printk(const char *fmt, ...); __warn_printk(arg); \ __WARN_FLAGS(BUGFLAG_TAINT(taint)); \ } while (0) +#define WARN_ON_ONCE(condition) ({ \ + int __ret_warn_on = !!(condition); \ + if (unlikely(__ret_warn_on)) \ + __WARN_FLAGS(BUGFLAG_ONCE | \ + BUGFLAG_TAINT(TAINT_WARN)); \ + unlikely(__ret_warn_on); \ +}) #endif /* used internally by panic.c */ diff --git a/kernel/panic.c b/kernel/panic.c index a643e5464296..47e8ebccc22b 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -596,7 +596,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, add_taint(taint, LOCKDEP_STILL_OK); } -#ifdef WANT_WARN_ON_SLOWPATH +#ifndef __WARN_FLAGS void warn_slowpath_fmt(const char *file, int line, unsigned taint, const char *fmt, ...) { From a44f71a9ab99b509fec9d5a9f5c222debd89934f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 25 Sep 2019 16:48:11 -0700 Subject: [PATCH 689/721] bug: move WARN_ON() "cut here" into exception handler The original clean up of "cut here" missed the WARN_ON() case (that does not have a printk message), which was fixed recently by adding an explicit printk of "cut here". This had the downside of adding a printk() to every WARN_ON() caller, which reduces the utility of using an instruction exception to streamline the resulting code. By making this a new BUGFLAG, all of these can be removed and "cut here" can be handled by the exception handler. This was very pronounced on PowerPC, but the effect can be seen on x86 as well. The resulting text size of a defconfig build shows some small savings from this patch: text data bss dec hex filename 19691167 5134320 1646664 26472151 193eed7 vmlinux.before 19676362 5134260 1663048 26473670 193f4c6 vmlinux.after This change also opens the door for creating something like BUG_MSG(), where a custom printk() before issuing BUG(), without confusing the "cut here" line. Link: http://lkml.kernel.org/r/201908200943.601DD59DCE@keescook Fixes: 6b15f678fb7d ("include/asm-generic/bug.h: fix "cut here" for WARN_ON for __WARN_TAINT architectures") Signed-off-by: Kees Cook Reported-by: Christophe Leroy Cc: Peter Zijlstra Cc: Christophe Leroy Cc: Drew Davenport Cc: Arnd Bergmann Cc: "Steven Rostedt (VMware)" Cc: Feng Tang Cc: Petr Mladek Cc: Mauro Carvalho Chehab Cc: Borislav Petkov Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/bug.h | 8 +++----- lib/bug.c | 11 +++++++++-- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index b4a2639130a0..384b5c835ced 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -10,6 +10,7 @@ #define BUGFLAG_WARNING (1 << 0) #define BUGFLAG_ONCE (1 << 1) #define BUGFLAG_DONE (1 << 2) +#define BUGFLAG_NO_CUT_HERE (1 << 3) /* CUT_HERE already sent */ #define BUGFLAG_TAINT(taint) ((taint) << 8) #define BUG_GET_TAINT(bug) ((bug)->flags >> 8) #endif @@ -86,13 +87,10 @@ void warn_slowpath_fmt(const char *file, const int line, unsigned taint, warn_slowpath_fmt(__FILE__, __LINE__, taint, arg) #else extern __printf(1, 2) void __warn_printk(const char *fmt, ...); -#define __WARN() do { \ - printk(KERN_WARNING CUT_HERE); \ - __WARN_FLAGS(BUGFLAG_TAINT(TAINT_WARN)); \ - } while (0) +#define __WARN() __WARN_FLAGS(BUGFLAG_TAINT(TAINT_WARN)) #define __WARN_printf(taint, arg...) do { \ __warn_printk(arg); \ - __WARN_FLAGS(BUGFLAG_TAINT(taint)); \ + __WARN_FLAGS(BUGFLAG_NO_CUT_HERE | BUGFLAG_TAINT(taint));\ } while (0) #define WARN_ON_ONCE(condition) ({ \ int __ret_warn_on = !!(condition); \ diff --git a/lib/bug.c b/lib/bug.c index 1077366f496b..8c98af0bf585 100644 --- a/lib/bug.c +++ b/lib/bug.c @@ -181,6 +181,15 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) } } + /* + * BUG() and WARN_ON() families don't print a custom debug message + * before triggering the exception handler, so we must add the + * "cut here" line now. WARN() issues its own "cut here" before the + * extra debugging message it writes before triggering the handler. + */ + if ((bug->flags & BUGFLAG_NO_CUT_HERE) == 0) + printk(KERN_DEFAULT CUT_HERE); + if (warning) { /* this is a WARN_ON rather than BUG/BUG_ON */ __warn(file, line, (void *)bugaddr, BUG_GET_TAINT(bug), regs, @@ -188,8 +197,6 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) return BUG_TRAP_TYPE_WARN; } - printk(KERN_DEFAULT CUT_HERE); - if (file) pr_crit("kernel BUG at %s:%u!\n", file, line); else From 97b0b1ad58fab9f71b1a6bc056a09af6065ec3bc Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Wed, 25 Sep 2019 16:48:14 -0700 Subject: [PATCH 690/721] ipc/mqueue.c: delete an unnecessary check before the macro call dev_kfree_skb() dev_kfree_skb() input parameter validation, thus the test around the call is not needed. This issue was detected by using the Coccinelle software. Link: http://lkml.kernel.org/r/07477187-63e5-cc80-34c1-32dd16b38e12@web.de Signed-off-by: Markus Elfring Cc: Davidlohr Bueso Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/mqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 7c15729d9d25..b02eb842b42e 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -1333,7 +1333,7 @@ static int do_mq_notify(mqd_t mqdes, const struct sigevent *notification) out: if (sock) netlink_detachskb(sock, nc); - else if (nc) + else dev_kfree_skb(nc); return ret; From c231740dd95e854de5034cff8f49737d942bc098 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Wed, 25 Sep 2019 16:48:17 -0700 Subject: [PATCH 691/721] ipc/mqueue: improve exception handling in do_mq_notify() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Null pointers were assigned to local variables in a few cases as exception handling. The jump target “out” was used where no meaningful data processing actions should eventually be performed by branches of an if statement then. Use an additional jump target for calling dev_kfree_skb() directly. Return also directly after error conditions were detected when no extra clean-up is needed by this function implementation. Link: http://lkml.kernel.org/r/592ef10e-0b69-72d0-9789-fc48f638fdfd@web.de Signed-off-by: Markus Elfring Cc: Davidlohr Bueso Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/mqueue.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/ipc/mqueue.c b/ipc/mqueue.c index b02eb842b42e..3d920ff15c80 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -1240,15 +1240,14 @@ static int do_mq_notify(mqd_t mqdes, const struct sigevent *notification) /* create the notify skb */ nc = alloc_skb(NOTIFY_COOKIE_LEN, GFP_KERNEL); - if (!nc) { - ret = -ENOMEM; - goto out; - } + if (!nc) + return -ENOMEM; + if (copy_from_user(nc->data, notification->sigev_value.sival_ptr, NOTIFY_COOKIE_LEN)) { ret = -EFAULT; - goto out; + goto free_skb; } /* TODO: add a header? */ @@ -1264,8 +1263,7 @@ static int do_mq_notify(mqd_t mqdes, const struct sigevent *notification) fdput(f); if (IS_ERR(sock)) { ret = PTR_ERR(sock); - sock = NULL; - goto out; + goto free_skb; } timeo = MAX_SCHEDULE_TIMEOUT; @@ -1274,11 +1272,8 @@ static int do_mq_notify(mqd_t mqdes, const struct sigevent *notification) sock = NULL; goto retry; } - if (ret) { - sock = NULL; - nc = NULL; - goto out; - } + if (ret) + return ret; } } @@ -1334,6 +1329,7 @@ static int do_mq_notify(mqd_t mqdes, const struct sigevent *notification) if (sock) netlink_detachskb(sock, nc); else +free_skb: dev_kfree_skb(nc); return ret; From 984035ad7b247ccc62b06e113eea3fc673f114cc Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Wed, 25 Sep 2019 16:48:20 -0700 Subject: [PATCH 692/721] ipc/sem.c: convert to use built-in RCU list checking CONFIG_PROVE_RCU_LIST requires list_for_each_entry_rcu() to pass a lockdep expression if using srcu or locking for protection. It can only check regular RCU protection, all other protection needs to be passed as lockdep expression. Link: http://lkml.kernel.org/r/20190830231817.76862-2-joel@joelfernandes.org Signed-off-by: Joel Fernandes (Google) Cc: Arnd Bergmann Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: "Gustavo A. R. Silva" Cc: Jonathan Derrick Cc: Keith Busch Cc: Lorenzo Pieralisi Cc: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/sem.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ipc/sem.c b/ipc/sem.c index 7da4504bcc7c..ec97a7072413 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -1852,7 +1852,8 @@ static struct sem_undo *__lookup_undo(struct sem_undo_list *ulp, int semid) { struct sem_undo *un; - list_for_each_entry_rcu(un, &ulp->list_proc, list_proc) { + list_for_each_entry_rcu(un, &ulp->list_proc, list_proc, + spin_is_locked(&ulp->lock)) { if (un->semid == semid) return un; } From 09b35b4192f6682dff96a093ab1930998cdb73b4 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Wed, 25 Sep 2019 16:48:24 -0700 Subject: [PATCH 693/721] lib/lzo/lzo1x_compress.c: fix alignment bug in lzo-rle Fix an unaligned access which breaks on platforms where this is not permitted (e.g., Sparc). Link: http://lkml.kernel.org/r/20190912145502.35229-1-dave.rodgman@arm.com Signed-off-by: Dave Rodgman Cc: Dave Rodgman Cc: Markus F.X.J. Oberhumer Cc: Minchan Kim Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/lzo/lzo1x_compress.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/lib/lzo/lzo1x_compress.c b/lib/lzo/lzo1x_compress.c index ba16c08e8cb9..717c940112f9 100644 --- a/lib/lzo/lzo1x_compress.c +++ b/lib/lzo/lzo1x_compress.c @@ -83,17 +83,19 @@ lzo1x_1_do_compress(const unsigned char *in, size_t in_len, ALIGN((uintptr_t)ir, 4)) && (ir < limit) && (*ir == 0)) ir++; - for (; (ir + 4) <= limit; ir += 4) { - dv = *((u32 *)ir); - if (dv) { + if (IS_ALIGNED((uintptr_t)ir, 4)) { + for (; (ir + 4) <= limit; ir += 4) { + dv = *((u32 *)ir); + if (dv) { # if defined(__LITTLE_ENDIAN) - ir += __builtin_ctz(dv) >> 3; + ir += __builtin_ctz(dv) >> 3; # elif defined(__BIG_ENDIAN) - ir += __builtin_clz(dv) >> 3; + ir += __builtin_clz(dv) >> 3; # else # error "missing endian definition" # endif - break; + break; + } } } #endif From 903f433f8f7a33e292a319259483adece8cc6674 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 25 Sep 2019 16:48:27 -0700 Subject: [PATCH 694/721] lib: untag user pointers in strn*_user Patch series "arm64: untag user pointers passed to the kernel", v19. === Overview arm64 has a feature called Top Byte Ignore, which allows to embed pointer tags into the top byte of each pointer. Userspace programs (such as HWASan, a memory debugging tool [1]) might use this feature and pass tagged user pointers to the kernel through syscalls or other interfaces. Right now the kernel is already able to handle user faults with tagged pointers, due to these patches: 1. 81cddd65 ("arm64: traps: fix userspace cache maintenance emulation on a tagged pointer") 2. 7dcd9dd8 ("arm64: hw_breakpoint: fix watchpoint matching for tagged pointers") 3. 276e9327 ("arm64: entry: improve data abort handling of tagged pointers") This patchset extends tagged pointer support to syscall arguments. As per the proposed ABI change [3], tagged pointers are only allowed to be passed to syscalls when they point to memory ranges obtained by anonymous mmap() or sbrk() (see the patchset [3] for more details). For non-memory syscalls this is done by untaging user pointers when the kernel performs pointer checking to find out whether the pointer comes from userspace (most notably in access_ok). The untagging is done only when the pointer is being checked, the tag is preserved as the pointer makes its way through the kernel and stays tagged when the kernel dereferences the pointer when perfoming user memory accesses. The mmap and mremap (only new_addr) syscalls do not currently accept tagged addresses. Architectures may interpret the tag as a background colour for the corresponding vma. Other memory syscalls (mprotect, etc.) don't do user memory accesses but rather deal with memory ranges, and untagged pointers are better suited to describe memory ranges internally. Thus for memory syscalls we untag pointers completely when they enter the kernel. === Other approaches One of the alternative approaches to untagging that was considered is to completely strip the pointer tag as the pointer enters the kernel with some kind of a syscall wrapper, but that won't work with the countless number of different ioctl calls. With this approach we would need a custom wrapper for each ioctl variation, which doesn't seem practical. An alternative approach to untagging pointers in memory syscalls prologues is to inspead allow tagged pointers to be passed to find_vma() (and other vma related functions) and untag them there. Unfortunately, a lot of find_vma() callers then compare or subtract the returned vma start and end fields against the pointer that was being searched. Thus this approach would still require changing all find_vma() callers. === Testing The following testing approaches has been taken to find potential issues with user pointer untagging: 1. Static testing (with sparse [2] and separately with a custom static analyzer based on Clang) to track casts of __user pointers to integer types to find places where untagging needs to be done. 2. Static testing with grep to find parts of the kernel that call find_vma() (and other similar functions) or directly compare against vm_start/vm_end fields of vma. 3. Static testing with grep to find parts of the kernel that compare user pointers with TASK_SIZE or other similar consts and macros. 4. Dynamic testing: adding BUG_ON(has_tag(addr)) to find_vma() and running a modified syzkaller version that passes tagged pointers to the kernel. Based on the results of the testing the requried patches have been added to the patchset. === Notes This patchset is meant to be merged together with "arm64 relaxed ABI" [3]. This patchset is a prerequisite for ARM's memory tagging hardware feature support [4]. This patchset has been merged into the Pixel 2 & 3 kernel trees and is now being used to enable testing of Pixel phones with HWASan. Thanks! [1] http://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html [2] https://github.com/lucvoo/sparse-dev/commit/5f960cb10f56ec2017c128ef9d16060e0145f292 [3] https://lkml.org/lkml/2019/6/12/745 [4] https://community.arm.com/processors/b/blog/posts/arm-a-profile-architecture-2018-developments-armv85a This patch (of 11) This patch is a part of a series that extends kernel ABI to allow to pass tagged user pointers (with the top byte set to something else other than 0x00) as syscall arguments. strncpy_from_user and strnlen_user accept user addresses as arguments, and do not go through the same path as copy_from_user and others, so here we need to handle the case of tagged user addresses separately. Untag user pointers passed to these functions. Note, that this patch only temporarily untags the pointers to perform validity checks, but then uses them as is to perform user memory accesses. [andreyknvl@google.com: fix sparc4 build] Link: http://lkml.kernel.org/r/CAAeHK+yx4a-P0sDrXTUxMvO2H0CJZUFPffBrg_cU7oJOZyC7ew@mail.gmail.com Link: http://lkml.kernel.org/r/c5a78bcad3e94d6cda71fcaa60a423231ae71e4c.1563904656.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Vincenzo Frascino Reviewed-by: Khalid Aziz Acked-by: Kees Cook Reviewed-by: Catalin Marinas Cc: Al Viro Cc: Dave Hansen Cc: Eric Auger Cc: Felix Kuehling Cc: Jens Wiklander Cc: Mauro Carvalho Chehab Cc: Mike Rapoport Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc/include/asm/pgtable_64.h | 5 +++-- lib/strncpy_from_user.c | 3 ++- lib/strnlen_user.c | 3 ++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index b57f9c631eca..6ec514fe3bef 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -1078,7 +1078,7 @@ static inline int io_remap_pfn_range(struct vm_area_struct *vma, } #define io_remap_pfn_range io_remap_pfn_range -static inline unsigned long untagged_addr(unsigned long start) +static inline unsigned long __untagged_addr(unsigned long start) { if (adi_capable()) { long addr = start; @@ -1098,7 +1098,8 @@ static inline unsigned long untagged_addr(unsigned long start) return start; } -#define untagged_addr untagged_addr +#define untagged_addr(addr) \ + ((__typeof__(addr))(__untagged_addr((unsigned long)(addr))) static inline bool pte_access_permitted(pte_t pte, bool write) { diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c index 023ba9f3b99f..dccb95af6003 100644 --- a/lib/strncpy_from_user.c +++ b/lib/strncpy_from_user.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -108,7 +109,7 @@ long strncpy_from_user(char *dst, const char __user *src, long count) return 0; max_addr = user_addr_max(); - src_addr = (unsigned long)src; + src_addr = (unsigned long)untagged_addr(src); if (likely(src_addr < max_addr)) { unsigned long max = max_addr - src_addr; long retval; diff --git a/lib/strnlen_user.c b/lib/strnlen_user.c index 7f2db3fe311f..28ff554a1be8 100644 --- a/lib/strnlen_user.c +++ b/lib/strnlen_user.c @@ -2,6 +2,7 @@ #include #include #include +#include #include @@ -109,7 +110,7 @@ long strnlen_user(const char __user *str, long count) return 0; max_addr = user_addr_max(); - src_addr = (unsigned long)str; + src_addr = (unsigned long)untagged_addr(str); if (likely(src_addr < max_addr)) { unsigned long max = max_addr - src_addr; long retval; From 057d3389108eda8a20c7f496f011846932680d88 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 25 Sep 2019 16:48:30 -0700 Subject: [PATCH 695/721] mm: untag user pointers passed to memory syscalls This patch is a part of a series that extends kernel ABI to allow to pass tagged user pointers (with the top byte set to something else other than 0x00) as syscall arguments. This patch allows tagged pointers to be passed to the following memory syscalls: get_mempolicy, madvise, mbind, mincore, mlock, mlock2, mprotect, mremap, msync, munlock, move_pages. The mmap and mremap syscalls do not currently accept tagged addresses. Architectures may interpret the tag as a background colour for the corresponding vma. Link: http://lkml.kernel.org/r/aaf0c0969d46b2feb9017f3e1b3ef3970b633d91.1563904656.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Khalid Aziz Reviewed-by: Vincenzo Frascino Reviewed-by: Catalin Marinas Reviewed-by: Kees Cook Cc: Al Viro Cc: Dave Hansen Cc: Eric Auger Cc: Felix Kuehling Cc: Jens Wiklander Cc: Mauro Carvalho Chehab Cc: Mike Rapoport Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/madvise.c | 2 ++ mm/mempolicy.c | 3 +++ mm/migrate.c | 2 +- mm/mincore.c | 2 ++ mm/mlock.c | 4 ++++ mm/mprotect.c | 2 ++ mm/mremap.c | 7 +++++++ mm/msync.c | 2 ++ 8 files changed, 23 insertions(+), 1 deletion(-) diff --git a/mm/madvise.c b/mm/madvise.c index 68ab988ad433..1f8a6fdc6878 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -784,6 +784,8 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) size_t len; struct blk_plug plug; + start = untagged_addr(start); + if (!madvise_behavior_valid(behavior)) return error; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 464406e8da91..de27d08b1ff8 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1405,6 +1405,7 @@ static long kernel_mbind(unsigned long start, unsigned long len, int err; unsigned short mode_flags; + start = untagged_addr(start); mode_flags = mode & MPOL_MODE_FLAGS; mode &= ~MPOL_MODE_FLAGS; if (mode >= MPOL_MAX) @@ -1558,6 +1559,8 @@ static int kernel_get_mempolicy(int __user *policy, int uninitialized_var(pval); nodemask_t nodes; + addr = untagged_addr(addr); + if (nmask != NULL && maxnode < nr_node_ids) return -EINVAL; diff --git a/mm/migrate.c b/mm/migrate.c index 73d476d690b1..4fe45d1428c8 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1612,7 +1612,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, goto out_flush; if (get_user(node, nodes + i)) goto out_flush; - addr = (unsigned long)p; + addr = (unsigned long)untagged_addr(p); err = -ENODEV; if (node < 0 || node >= MAX_NUMNODES) diff --git a/mm/mincore.c b/mm/mincore.c index f9a9dbe8cd33..49b6fa2f6aa1 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -256,6 +256,8 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len, unsigned long pages; unsigned char *tmp; + start = untagged_addr(start); + /* Check the start address: needs to be page-aligned.. */ if (start & ~PAGE_MASK) return -EINVAL; diff --git a/mm/mlock.c b/mm/mlock.c index a90099da4fb4..a72c1eeded77 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -674,6 +674,8 @@ static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t fla unsigned long lock_limit; int error = -ENOMEM; + start = untagged_addr(start); + if (!can_do_mlock()) return -EPERM; @@ -735,6 +737,8 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) { int ret; + start = untagged_addr(start); + len = PAGE_ALIGN(len + (offset_in_page(start))); start &= PAGE_MASK; diff --git a/mm/mprotect.c b/mm/mprotect.c index 675e5d34a507..7967825f6d33 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -459,6 +459,8 @@ static int do_mprotect_pkey(unsigned long start, size_t len, const bool rier = (current->personality & READ_IMPLIES_EXEC) && (prot & PROT_READ); + start = untagged_addr(start); + prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP); if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */ return -EINVAL; diff --git a/mm/mremap.c b/mm/mremap.c index fc241d23cd97..64c9a3b8be0a 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -606,6 +606,13 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, LIST_HEAD(uf_unmap_early); LIST_HEAD(uf_unmap); + /* + * Architectures may interpret the tag passed to mmap as a background + * colour for the corresponding vma. For mremap we don't allow tagged + * new_addr to preserve similar behaviour to mmap. + */ + addr = untagged_addr(addr); + if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) return ret; diff --git a/mm/msync.c b/mm/msync.c index ef30a429623a..c3bd3e75f687 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -37,6 +37,8 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) int unmapped_error = 0; int error = -EINVAL; + start = untagged_addr(start); + if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC)) goto out; if (offset_in_page(start)) From f9652594195fca8c3d8b8ee392ad0ff9f701bb20 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 25 Sep 2019 16:48:34 -0700 Subject: [PATCH 696/721] mm: untag user pointers in mm/gup.c This patch is a part of a series that extends kernel ABI to allow to pass tagged user pointers (with the top byte set to something else other than 0x00) as syscall arguments. mm/gup.c provides a kernel interface that accepts user addresses and manipulates user pages directly (for example get_user_pages, that is used by the futex syscall). Since a user can provided tagged addresses, we need to handle this case. Add untagging to gup.c functions that use user addresses for vma lookups. Link: http://lkml.kernel.org/r/4731bddba3c938658c10ff4ed55cc01c60f4c8f8.1563904656.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Khalid Aziz Reviewed-by: Vincenzo Frascino Reviewed-by: Kees Cook Reviewed-by: Catalin Marinas Cc: Al Viro Cc: Dave Hansen Cc: Eric Auger Cc: Felix Kuehling Cc: Jens Wiklander Cc: Mauro Carvalho Chehab Cc: Mike Rapoport Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/gup.c b/mm/gup.c index 60c3915c8ee6..23a9f9c9d377 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -788,6 +788,8 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (!nr_pages) return 0; + start = untagged_addr(start); + VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); /* @@ -950,6 +952,8 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, struct vm_area_struct *vma; vm_fault_t ret, major = 0; + address = untagged_addr(address); + if (unlocked) fault_flags |= FAULT_FLAG_ALLOW_RETRY; From 5d65e7a7d8cd5c77baa1acf129a11b8b45ffee75 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 25 Sep 2019 16:48:37 -0700 Subject: [PATCH 697/721] mm: untag user pointers in get_vaddr_frames This patch is a part of a series that extends kernel ABI to allow to pass tagged user pointers (with the top byte set to something else other than 0x00) as syscall arguments. get_vaddr_frames uses provided user pointers for vma lookups, which can only by done with untagged pointers. Instead of locating and changing all callers of this function, perform untagging in it. Link: http://lkml.kernel.org/r/28f05e49c92b2a69c4703323d6c12208f3d881fe.1563904656.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Khalid Aziz Reviewed-by: Vincenzo Frascino Acked-by: Catalin Marinas Reviewed-by: Kees Cook Cc: Al Viro Cc: Dave Hansen Cc: Eric Auger Cc: Felix Kuehling Cc: Jens Wiklander Cc: Mauro Carvalho Chehab Cc: Mike Rapoport Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/frame_vector.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/frame_vector.c b/mm/frame_vector.c index c64dca6e27c2..c431ca81dad5 100644 --- a/mm/frame_vector.c +++ b/mm/frame_vector.c @@ -46,6 +46,8 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames, if (WARN_ON_ONCE(nr_frames > vec->nr_allocated)) nr_frames = vec->nr_allocated; + start = untagged_addr(start); + down_read(&mm->mmap_sem); locked = 1; vma = find_vma_intersection(mm, start, start + 1); From ed8a66b83269c27f7181c95b477da5d33fecfbc4 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 25 Sep 2019 16:48:40 -0700 Subject: [PATCH 698/721] fs/namespace: untag user pointers in copy_mount_options This patch is a part of a series that extends kernel ABI to allow to pass tagged user pointers (with the top byte set to something else other than 0x00) as syscall arguments. In copy_mount_options a user address is being subtracted from TASK_SIZE. If the address is lower than TASK_SIZE, the size is calculated to not allow the exact_copy_from_user() call to cross TASK_SIZE boundary. However if the address is tagged, then the size will be calculated incorrectly. Untag the address before subtracting. Link: http://lkml.kernel.org/r/1de225e4a54204bfd7f25dac2635e31aa4aa1d90.1563904656.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Khalid Aziz Reviewed-by: Vincenzo Frascino Reviewed-by: Kees Cook Reviewed-by: Catalin Marinas Cc: Al Viro Cc: Dave Hansen Cc: Eric Auger Cc: Felix Kuehling Cc: Jens Wiklander Cc: Mauro Carvalho Chehab Cc: Mike Rapoport Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/namespace.c b/fs/namespace.c index 93c043245c46..abcdc5f44865 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3028,7 +3028,7 @@ void *copy_mount_options(const void __user * data) * the remainder of the page. */ /* copy_from_user cannot cross TASK_SIZE ! */ - size = TASK_SIZE - (unsigned long)data; + size = TASK_SIZE - (unsigned long)untagged_addr(data); if (size > PAGE_SIZE) size = PAGE_SIZE; From 7d0325749a6c77b075424ab9de76bcb73a118430 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 25 Sep 2019 16:48:44 -0700 Subject: [PATCH 699/721] userfaultfd: untag user pointers This patch is a part of a series that extends kernel ABI to allow to pass tagged user pointers (with the top byte set to something else other than 0x00) as syscall arguments. userfaultfd code use provided user pointers for vma lookups, which can only by done with untagged pointers. Untag user pointers in validate_range(). Link: http://lkml.kernel.org/r/cdc59ddd7011012ca2e689bc88c3b65b1ea7e413.1563904656.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Mike Rapoport Reviewed-by: Vincenzo Frascino Reviewed-by: Catalin Marinas Reviewed-by: Kees Cook Cc: Al Viro Cc: Dave Hansen Cc: Eric Auger Cc: Felix Kuehling Cc: Jens Wiklander Cc: Khalid Aziz Cc: Mauro Carvalho Chehab Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index fe6d804a38dc..f9fd18670e22 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1272,21 +1272,23 @@ static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx, } static __always_inline int validate_range(struct mm_struct *mm, - __u64 start, __u64 len) + __u64 *start, __u64 len) { __u64 task_size = mm->task_size; - if (start & ~PAGE_MASK) + *start = untagged_addr(*start); + + if (*start & ~PAGE_MASK) return -EINVAL; if (len & ~PAGE_MASK) return -EINVAL; if (!len) return -EINVAL; - if (start < mmap_min_addr) + if (*start < mmap_min_addr) return -EINVAL; - if (start >= task_size) + if (*start >= task_size) return -EINVAL; - if (len > task_size - start) + if (len > task_size - *start) return -EINVAL; return 0; } @@ -1336,7 +1338,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, goto out; } - ret = validate_range(mm, uffdio_register.range.start, + ret = validate_range(mm, &uffdio_register.range.start, uffdio_register.range.len); if (ret) goto out; @@ -1525,7 +1527,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister))) goto out; - ret = validate_range(mm, uffdio_unregister.start, + ret = validate_range(mm, &uffdio_unregister.start, uffdio_unregister.len); if (ret) goto out; @@ -1676,7 +1678,7 @@ static int userfaultfd_wake(struct userfaultfd_ctx *ctx, if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake))) goto out; - ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len); + ret = validate_range(ctx->mm, &uffdio_wake.start, uffdio_wake.len); if (ret) goto out; @@ -1716,7 +1718,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, sizeof(uffdio_copy)-sizeof(__s64))) goto out; - ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len); + ret = validate_range(ctx->mm, &uffdio_copy.dst, uffdio_copy.len); if (ret) goto out; /* @@ -1772,7 +1774,7 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, sizeof(uffdio_zeropage)-sizeof(__s64))) goto out; - ret = validate_range(ctx->mm, uffdio_zeropage.range.start, + ret = validate_range(ctx->mm, &uffdio_zeropage.range.start, uffdio_zeropage.range.len); if (ret) goto out; From 35f3fc87bebfb2fffc1a9cdaf661ee3f95c2a5f1 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 25 Sep 2019 16:48:47 -0700 Subject: [PATCH 700/721] drm/amdgpu: untag user pointers This patch is a part of a series that extends kernel ABI to allow to pass tagged user pointers (with the top byte set to something else other than 0x00) as syscall arguments. In amdgpu_gem_userptr_ioctl() and amdgpu_amdkfd_gpuvm.c/init_user_pages() an MMU notifier is set up with a (tagged) userspace pointer. The untagged address should be used so that MMU notifiers for the untagged address get correctly matched up with the right BO. This patch untag user pointers in amdgpu_gem_userptr_ioctl() for the GEM case and in amdgpu_amdkfd_gpuvm_ alloc_memory_of_gpu() for the KFD case. This also makes sure that an untagged pointer is passed to amdgpu_ttm_tt_get_user_pages(), which uses it for vma lookups. Link: http://lkml.kernel.org/r/d684e1df08f2ecb6bc292e222b64fa9efbc26e69.1563904656.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Kees Cook Suggested-by: Felix Kuehling Acked-by: Felix Kuehling Cc: Al Viro Cc: Catalin Marinas Cc: Dave Hansen Cc: Eric Auger Cc: Jens Wiklander Cc: Khalid Aziz Cc: Mauro Carvalho Chehab Cc: Mike Rapoport Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 42b936b6bbf1..6d021ecc8d59 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1103,7 +1103,7 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( alloc_flags = 0; if (!offset || !*offset) return -EINVAL; - user_addr = *offset; + user_addr = untagged_addr(*offset); } else if (flags & (ALLOC_MEM_FLAGS_DOORBELL | ALLOC_MEM_FLAGS_MMIO_REMAP)) { domain = AMDGPU_GEM_DOMAIN_GTT; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c index b174bd5eb38e..8ceb44925947 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c @@ -291,6 +291,8 @@ int amdgpu_gem_userptr_ioctl(struct drm_device *dev, void *data, uint32_t handle; int r; + args->addr = untagged_addr(args->addr); + if (offset_in_page(args->addr | args->size)) return -EINVAL; From 4fdfae8d8f855d79b7d83fcd590b6ac7ed0099cf Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 25 Sep 2019 16:48:51 -0700 Subject: [PATCH 701/721] drm/radeon: untag user pointers in radeon_gem_userptr_ioctl This patch is a part of a series that extends kernel ABI to allow to pass tagged user pointers (with the top byte set to something else other than 0x00) as syscall arguments. In radeon_gem_userptr_ioctl() an MMU notifier is set up with a (tagged) userspace pointer. The untagged address should be used so that MMU notifiers for the untagged address get correctly matched up with the right BO. This funcation also calls radeon_ttm_tt_pin_userptr(), which uses provided user pointers for vma lookups, which can only by done with untagged pointers. This patch untags user pointers in radeon_gem_userptr_ioctl(). Link: http://lkml.kernel.org/r/c856babeb67195b35603b8d5ba386a2819cec5ff.1563904656.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Khalid Aziz Reviewed-by: Kees Cook Suggested-by: Felix Kuehling Acked-by: Felix Kuehling Cc: Al Viro Cc: Catalin Marinas Cc: Dave Hansen Cc: Eric Auger Cc: Jens Wiklander Cc: Mauro Carvalho Chehab Cc: Mike Rapoport Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpu/drm/radeon/radeon_gem.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/radeon/radeon_gem.c b/drivers/gpu/drm/radeon/radeon_gem.c index 4cf58dbbe439..b2b076606f54 100644 --- a/drivers/gpu/drm/radeon/radeon_gem.c +++ b/drivers/gpu/drm/radeon/radeon_gem.c @@ -296,6 +296,8 @@ int radeon_gem_userptr_ioctl(struct drm_device *dev, void *data, uint32_t handle; int r; + args->addr = untagged_addr(args->addr); + if (offset_in_page(args->addr | args->size)) return -EINVAL; From e275faf367e3a3b9db06a71924b199f429d3d508 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 25 Sep 2019 16:48:54 -0700 Subject: [PATCH 702/721] media/v4l2-core: untag user pointers in videobuf_dma_contig_user_get This patch is a part of a series that extends kernel ABI to allow to pass tagged user pointers (with the top byte set to something else other than 0x00) as syscall arguments. videobuf_dma_contig_user_get() uses provided user pointers for vma lookups, which can only by done with untagged pointers. Untag the pointers in this function. Link: http://lkml.kernel.org/r/100436d5f8e4349a78f27b0bbb27e4801fcb946b.1563904656.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Khalid Aziz Reviewed-by: Kees Cook Acked-by: Mauro Carvalho Chehab Cc: Al Viro Cc: Catalin Marinas Cc: Dave Hansen Cc: Eric Auger Cc: Felix Kuehling Cc: Jens Wiklander Cc: Mike Rapoport Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/media/v4l2-core/videobuf-dma-contig.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/media/v4l2-core/videobuf-dma-contig.c b/drivers/media/v4l2-core/videobuf-dma-contig.c index 76b4ac7b1678..aeb2f497c683 100644 --- a/drivers/media/v4l2-core/videobuf-dma-contig.c +++ b/drivers/media/v4l2-core/videobuf-dma-contig.c @@ -157,6 +157,7 @@ static void videobuf_dma_contig_user_put(struct videobuf_dma_contig_memory *mem) static int videobuf_dma_contig_user_get(struct videobuf_dma_contig_memory *mem, struct videobuf_buffer *vb) { + unsigned long untagged_baddr = untagged_addr(vb->baddr); struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long prev_pfn, this_pfn; @@ -164,22 +165,22 @@ static int videobuf_dma_contig_user_get(struct videobuf_dma_contig_memory *mem, unsigned int offset; int ret; - offset = vb->baddr & ~PAGE_MASK; + offset = untagged_baddr & ~PAGE_MASK; mem->size = PAGE_ALIGN(vb->size + offset); ret = -EINVAL; down_read(&mm->mmap_sem); - vma = find_vma(mm, vb->baddr); + vma = find_vma(mm, untagged_baddr); if (!vma) goto out_up; - if ((vb->baddr + mem->size) > vma->vm_end) + if ((untagged_baddr + mem->size) > vma->vm_end) goto out_up; pages_done = 0; prev_pfn = 0; /* kill warning */ - user_address = vb->baddr; + user_address = untagged_baddr; while (pages_done < (mem->size >> PAGE_SHIFT)) { ret = follow_pfn(vma, user_address, &this_pfn); From 78063a9dd9637c0450cf6eacc03f42eb1295917f Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 25 Sep 2019 16:48:58 -0700 Subject: [PATCH 703/721] tee/shm: untag user pointers in tee_shm_register This patch is a part of a series that extends kernel ABI to allow to pass tagged user pointers (with the top byte set to something else other than 0x00) as syscall arguments. tee_shm_register()->optee_shm_unregister()->check_mem_type() uses provided user pointers for vma lookups (via __check_mem_type()), which can only by done with untagged pointers. Untag user pointers in this function. Link: http://lkml.kernel.org/r/4b993f33196b3566ac81285ff8453219e2079b45.1563904656.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Kees Cook Acked-by: Jens Wiklander Cc: Al Viro Cc: Catalin Marinas Cc: Dave Hansen Cc: Eric Auger Cc: Felix Kuehling Cc: Khalid Aziz Cc: Mauro Carvalho Chehab Cc: Mike Rapoport Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/tee/tee_shm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/tee/tee_shm.c b/drivers/tee/tee_shm.c index 2da026fd12c9..09ddcd06c715 100644 --- a/drivers/tee/tee_shm.c +++ b/drivers/tee/tee_shm.c @@ -254,6 +254,7 @@ struct tee_shm *tee_shm_register(struct tee_context *ctx, unsigned long addr, shm->teedev = teedev; shm->ctx = ctx; shm->id = -1; + addr = untagged_addr(addr); start = rounddown(addr, PAGE_SIZE); shm->offset = addr - start; shm->size = length; From 6cf5354c1c4b74fd2e5527db084f163e9d4dae4e Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 25 Sep 2019 16:49:01 -0700 Subject: [PATCH 704/721] vfio/type1: untag user pointers in vaddr_get_pfn This patch is a part of a series that extends kernel ABI to allow to pass tagged user pointers (with the top byte set to something else other than 0x00) as syscall arguments. vaddr_get_pfn() uses provided user pointers for vma lookups, which can only by done with untagged pointers. Untag user pointers in this function. Link: http://lkml.kernel.org/r/87422b4d72116a975896f2b19b00f38acbd28f33.1563904656.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Eric Auger Reviewed-by: Vincenzo Frascino Reviewed-by: Catalin Marinas Reviewed-by: Kees Cook Cc: Dave Hansen Cc: Will Deacon Cc: Al Viro Cc: Felix Kuehling Cc: Jens Wiklander Cc: Khalid Aziz Cc: Mauro Carvalho Chehab Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/vfio/vfio_iommu_type1.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 9a50b0558fa9..96fddc1dafc3 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -375,6 +375,8 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, down_read(&mm->mmap_sem); + vaddr = untagged_addr(vaddr); + vma = find_vma_intersection(mm, vaddr, vaddr + 1); if (vma && vma->vm_flags & VM_PFNMAP) { From ce18d171cb7368557e6498a3ce111d7d3dc03e4d Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 25 Sep 2019 16:49:04 -0700 Subject: [PATCH 705/721] mm: untag user pointers in mmap/munmap/mremap/brk There isn't a good reason to differentiate between the user address space layout modification syscalls and the other memory permission/attributes ones (e.g. mprotect, madvise) w.r.t. the tagged address ABI. Untag the user addresses on entry to these functions. Link: http://lkml.kernel.org/r/20190821164730.47450-2-catalin.marinas@arm.com Signed-off-by: Catalin Marinas Acked-by: Will Deacon Acked-by: Andrey Konovalov Cc: Vincenzo Frascino Cc: Szabolcs Nagy Cc: Kevin Brodsky Cc: Dave P Martin Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 5 +++++ mm/mremap.c | 6 +----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 14b7da317ec0..a7d8c84d19b7 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -201,6 +201,8 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) bool downgraded = false; LIST_HEAD(uf); + brk = untagged_addr(brk); + if (down_write_killable(&mm->mmap_sem)) return -EINTR; @@ -1587,6 +1589,8 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, struct file *file = NULL; unsigned long retval; + addr = untagged_addr(addr); + if (!(flags & MAP_ANONYMOUS)) { audit_mmap_fd(fd, flags); file = fget(fd); @@ -2885,6 +2889,7 @@ EXPORT_SYMBOL(vm_munmap); SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) { + addr = untagged_addr(addr); profile_munmap(addr); return __vm_munmap(addr, len, true); } diff --git a/mm/mremap.c b/mm/mremap.c index 64c9a3b8be0a..1fc8a29fbe3f 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -606,12 +606,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, LIST_HEAD(uf_unmap_early); LIST_HEAD(uf_unmap); - /* - * Architectures may interpret the tag passed to mmap as a background - * colour for the corresponding vma. For mremap we don't allow tagged - * new_addr to preserve similar behaviour to mmap. - */ addr = untagged_addr(addr); + new_addr = untagged_addr(new_addr); if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) return ret; From 9c276cc65a58faf98be8e56962745ec99ab87636 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 25 Sep 2019 16:49:08 -0700 Subject: [PATCH 706/721] mm: introduce MADV_COLD MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "Introduce MADV_COLD and MADV_PAGEOUT", v7. - Background The Android terminology used for forking a new process and starting an app from scratch is a cold start, while resuming an existing app is a hot start. While we continually try to improve the performance of cold starts, hot starts will always be significantly less power hungry as well as faster so we are trying to make hot start more likely than cold start. To increase hot start, Android userspace manages the order that apps should be killed in a process called ActivityManagerService. ActivityManagerService tracks every Android app or service that the user could be interacting with at any time and translates that into a ranked list for lmkd(low memory killer daemon). They are likely to be killed by lmkd if the system has to reclaim memory. In that sense they are similar to entries in any other cache. Those apps are kept alive for opportunistic performance improvements but those performance improvements will vary based on the memory requirements of individual workloads. - Problem Naturally, cached apps were dominant consumers of memory on the system. However, they were not significant consumers of swap even though they are good candidate for swap. Under investigation, swapping out only begins once the low zone watermark is hit and kswapd wakes up, but the overall allocation rate in the system might trip lmkd thresholds and cause a cached process to be killed(we measured performance swapping out vs. zapping the memory by killing a process. Unsurprisingly, zapping is 10x times faster even though we use zram which is much faster than real storage) so kill from lmkd will often satisfy the high zone watermark, resulting in very few pages actually being moved to swap. - Approach The approach we chose was to use a new interface to allow userspace to proactively reclaim entire processes by leveraging platform information. This allowed us to bypass the inaccuracy of the kernel’s LRUs for pages that are known to be cold from userspace and to avoid races with lmkd by reclaiming apps as soon as they entered the cached state. Additionally, it could provide many chances for platform to use much information to optimize memory efficiency. To achieve the goal, the patchset introduce two new options for madvise. One is MADV_COLD which will deactivate activated pages and the other is MADV_PAGEOUT which will reclaim private pages instantly. These new options complement MADV_DONTNEED and MADV_FREE by adding non-destructive ways to gain some free memory space. MADV_PAGEOUT is similar to MADV_DONTNEED in a way that it hints the kernel that memory region is not currently needed and should be reclaimed immediately; MADV_COLD is similar to MADV_FREE in a way that it hints the kernel that memory region is not currently needed and should be reclaimed when memory pressure rises. This patch (of 5): When a process expects no accesses to a certain memory range, it could give a hint to kernel that the pages can be reclaimed when memory pressure happens but data should be preserved for future use. This could reduce workingset eviction so it ends up increasing performance. This patch introduces the new MADV_COLD hint to madvise(2) syscall. MADV_COLD can be used by a process to mark a memory range as not expected to be used in the near future. The hint can help kernel in deciding which pages to evict early during memory pressure. It works for every LRU pages like MADV_[DONTNEED|FREE]. IOW, It moves active file page -> inactive file LRU active anon page -> inacdtive anon LRU Unlike MADV_FREE, it doesn't move active anonymous pages to inactive file LRU's head because MADV_COLD is a little bit different symantic. MADV_FREE means it's okay to discard when the memory pressure because the content of the page is *garbage* so freeing such pages is almost zero overhead since we don't need to swap out and access afterward causes just minor fault. Thus, it would make sense to put those freeable pages in inactive file LRU to compete other used-once pages. It makes sense for implmentaion point of view, too because it's not swapbacked memory any longer until it would be re-dirtied. Even, it could give a bonus to make them be reclaimed on swapless system. However, MADV_COLD doesn't mean garbage so reclaiming them requires swap-out/in in the end so it's bigger cost. Since we have designed VM LRU aging based on cost-model, anonymous cold pages would be better to position inactive anon's LRU list, not file LRU. Furthermore, it would help to avoid unnecessary scanning if system doesn't have a swap device. Let's start simpler way without adding complexity at this moment. However, keep in mind, too that it's a caveat that workloads with a lot of pages cache are likely to ignore MADV_COLD on anonymous memory because we rarely age anonymous LRU lists. * man-page material MADV_COLD (since Linux x.x) Pages in the specified regions will be treated as less-recently-accessed compared to pages in the system with similar access frequencies. In contrast to MADV_FREE, the contents of the region are preserved regardless of subsequent writes to pages. MADV_COLD cannot be applied to locked pages, Huge TLB pages, or VM_PFNMAP pages. [akpm@linux-foundation.org: resolve conflicts with hmm.git] Link: http://lkml.kernel.org/r/20190726023435.214162-2-minchan@kernel.org Signed-off-by: Minchan Kim Reported-by: kbuild test robot Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: James E.J. Bottomley Cc: Richard Henderson Cc: Ralf Baechle Cc: Chris Zankel Cc: Johannes Weiner Cc: Daniel Colascione Cc: Dave Hansen Cc: Hillf Danton Cc: Joel Fernandes (Google) Cc: Kirill A. Shutemov Cc: Oleksandr Natalenko Cc: Shakeel Butt Cc: Sonny Rao Cc: Suren Baghdasaryan Cc: Tim Murray Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/uapi/asm/mman.h | 2 + arch/mips/include/uapi/asm/mman.h | 2 + arch/parisc/include/uapi/asm/mman.h | 2 + arch/xtensa/include/uapi/asm/mman.h | 2 + include/linux/swap.h | 1 + include/uapi/asm-generic/mman-common.h | 2 + mm/internal.h | 2 +- mm/madvise.c | 179 ++++++++++++++++++++++++- mm/oom_kill.c | 2 +- mm/swap.c | 42 ++++++ 10 files changed, 232 insertions(+), 4 deletions(-) diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h index ac23379b7a87..f3258fbf03d0 100644 --- a/arch/alpha/include/uapi/asm/mman.h +++ b/arch/alpha/include/uapi/asm/mman.h @@ -68,6 +68,8 @@ #define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ #define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ +#define MADV_COLD 20 /* deactivate these pages */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h index c2b40969eb1f..00ad09fc5eb1 100644 --- a/arch/mips/include/uapi/asm/mman.h +++ b/arch/mips/include/uapi/asm/mman.h @@ -95,6 +95,8 @@ #define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ #define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ +#define MADV_COLD 20 /* deactivate these pages */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h index c98162f494db..eb14e3a7b8f3 100644 --- a/arch/parisc/include/uapi/asm/mman.h +++ b/arch/parisc/include/uapi/asm/mman.h @@ -48,6 +48,8 @@ #define MADV_DONTFORK 10 /* don't inherit across fork */ #define MADV_DOFORK 11 /* do inherit across fork */ +#define MADV_COLD 20 /* deactivate these pages */ + #define MADV_MERGEABLE 65 /* KSM may merge identical pages */ #define MADV_UNMERGEABLE 66 /* KSM may not merge identical pages */ diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h index ebbb48842190..f926b00ff11f 100644 --- a/arch/xtensa/include/uapi/asm/mman.h +++ b/arch/xtensa/include/uapi/asm/mman.h @@ -103,6 +103,8 @@ #define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ #define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ +#define MADV_COLD 20 /* deactivate these pages */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/include/linux/swap.h b/include/linux/swap.h index de2c67a33b7e..0ce997edb8bb 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -340,6 +340,7 @@ extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_all(void); extern void rotate_reclaimable_page(struct page *page); extern void deactivate_file_page(struct page *page); +extern void deactivate_page(struct page *page); extern void mark_page_lazyfree(struct page *page); extern void swap_setup(void); diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index 63b1f506ea67..23431faf0eb6 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -67,6 +67,8 @@ #define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ #define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ +#define MADV_COLD 20 /* deactivate these pages */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/mm/internal.h b/mm/internal.h index e32390802fd3..0d5f720c75ab 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -39,7 +39,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf); void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); -static inline bool can_madv_dontneed_vma(struct vm_area_struct *vma) +static inline bool can_madv_lru_vma(struct vm_area_struct *vma) { return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)); } diff --git a/mm/madvise.c b/mm/madvise.c index 1f8a6fdc6878..e1aee62967c3 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -42,6 +43,7 @@ static int madvise_need_mmap_write(int behavior) case MADV_REMOVE: case MADV_WILLNEED: case MADV_DONTNEED: + case MADV_COLD: case MADV_FREE: return 0; default: @@ -289,6 +291,176 @@ static long madvise_willneed(struct vm_area_struct *vma, return 0; } +static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct mmu_gather *tlb = walk->private; + struct mm_struct *mm = tlb->mm; + struct vm_area_struct *vma = walk->vma; + pte_t *orig_pte, *pte, ptent; + spinlock_t *ptl; + struct page *page; + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (pmd_trans_huge(*pmd)) { + pmd_t orig_pmd; + unsigned long next = pmd_addr_end(addr, end); + + tlb_change_page_size(tlb, HPAGE_PMD_SIZE); + ptl = pmd_trans_huge_lock(pmd, vma); + if (!ptl) + return 0; + + orig_pmd = *pmd; + if (is_huge_zero_pmd(orig_pmd)) + goto huge_unlock; + + if (unlikely(!pmd_present(orig_pmd))) { + VM_BUG_ON(thp_migration_supported() && + !is_pmd_migration_entry(orig_pmd)); + goto huge_unlock; + } + + page = pmd_page(orig_pmd); + if (next - addr != HPAGE_PMD_SIZE) { + int err; + + if (page_mapcount(page) != 1) + goto huge_unlock; + + get_page(page); + spin_unlock(ptl); + lock_page(page); + err = split_huge_page(page); + unlock_page(page); + put_page(page); + if (!err) + goto regular_page; + return 0; + } + + if (pmd_young(orig_pmd)) { + pmdp_invalidate(vma, addr, pmd); + orig_pmd = pmd_mkold(orig_pmd); + + set_pmd_at(mm, addr, pmd, orig_pmd); + tlb_remove_pmd_tlb_entry(tlb, pmd, addr); + } + + test_and_clear_page_young(page); + deactivate_page(page); +huge_unlock: + spin_unlock(ptl); + return 0; + } + + if (pmd_trans_unstable(pmd)) + return 0; +regular_page: +#endif + tlb_change_page_size(tlb, PAGE_SIZE); + orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + flush_tlb_batched_pending(mm); + arch_enter_lazy_mmu_mode(); + for (; addr < end; pte++, addr += PAGE_SIZE) { + ptent = *pte; + + if (pte_none(ptent)) + continue; + + if (!pte_present(ptent)) + continue; + + page = vm_normal_page(vma, addr, ptent); + if (!page) + continue; + + /* + * Creating a THP page is expensive so split it only if we + * are sure it's worth. Split it if we are only owner. + */ + if (PageTransCompound(page)) { + if (page_mapcount(page) != 1) + break; + get_page(page); + if (!trylock_page(page)) { + put_page(page); + break; + } + pte_unmap_unlock(orig_pte, ptl); + if (split_huge_page(page)) { + unlock_page(page); + put_page(page); + pte_offset_map_lock(mm, pmd, addr, &ptl); + break; + } + unlock_page(page); + put_page(page); + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + pte--; + addr -= PAGE_SIZE; + continue; + } + + VM_BUG_ON_PAGE(PageTransCompound(page), page); + + if (pte_young(ptent)) { + ptent = ptep_get_and_clear_full(mm, addr, pte, + tlb->fullmm); + ptent = pte_mkold(ptent); + set_pte_at(mm, addr, pte, ptent); + tlb_remove_tlb_entry(tlb, pte, addr); + } + + /* + * We are deactivating a page for accelerating reclaiming. + * VM couldn't reclaim the page unless we clear PG_young. + * As a side effect, it makes confuse idle-page tracking + * because they will miss recent referenced history. + */ + test_and_clear_page_young(page); + deactivate_page(page); + } + + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(orig_pte, ptl); + cond_resched(); + + return 0; +} + +static const struct mm_walk_ops cold_walk_ops = { + .pmd_entry = madvise_cold_pte_range, +}; + +static void madvise_cold_page_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + tlb_start_vma(tlb, vma); + walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, NULL); + tlb_end_vma(tlb, vma); +} + +static long madvise_cold(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start_addr, unsigned long end_addr) +{ + struct mm_struct *mm = vma->vm_mm; + struct mmu_gather tlb; + + *prev = vma; + if (!can_madv_lru_vma(vma)) + return -EINVAL; + + lru_add_drain(); + tlb_gather_mmu(&tlb, mm, start_addr, end_addr); + madvise_cold_page_range(&tlb, vma, start_addr, end_addr); + tlb_finish_mmu(&tlb, start_addr, end_addr); + + return 0; +} + static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) @@ -493,7 +665,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, int behavior) { *prev = vma; - if (!can_madv_dontneed_vma(vma)) + if (!can_madv_lru_vma(vma)) return -EINVAL; if (!userfaultfd_remove(vma, start, end)) { @@ -515,7 +687,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, */ return -ENOMEM; } - if (!can_madv_dontneed_vma(vma)) + if (!can_madv_lru_vma(vma)) return -EINVAL; if (end > vma->vm_end) { /* @@ -669,6 +841,8 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, return madvise_remove(vma, prev, start, end); case MADV_WILLNEED: return madvise_willneed(vma, prev, start, end); + case MADV_COLD: + return madvise_cold(vma, prev, start, end); case MADV_FREE: case MADV_DONTNEED: return madvise_dontneed_free(vma, prev, start, end, behavior); @@ -690,6 +864,7 @@ madvise_behavior_valid(int behavior) case MADV_WILLNEED: case MADV_DONTNEED: case MADV_FREE: + case MADV_COLD: #ifdef CONFIG_KSM case MADV_MERGEABLE: case MADV_UNMERGEABLE: diff --git a/mm/oom_kill.c b/mm/oom_kill.c index c1d9496b4c43..71e3acea7817 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -523,7 +523,7 @@ bool __oom_reap_task_mm(struct mm_struct *mm) set_bit(MMF_UNSTABLE, &mm->flags); for (vma = mm->mmap ; vma; vma = vma->vm_next) { - if (!can_madv_dontneed_vma(vma)) + if (!can_madv_lru_vma(vma)) continue; /* diff --git a/mm/swap.c b/mm/swap.c index 784dc1620620..38c3fa4308e2 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -47,6 +47,7 @@ int page_cluster; static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs); +static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs); #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs); @@ -538,6 +539,22 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, update_page_reclaim_stat(lruvec, file, 0); } +static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, + void *arg) +{ + if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { + int file = page_is_file_cache(page); + int lru = page_lru_base_type(page); + + del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE); + ClearPageActive(page); + ClearPageReferenced(page); + add_page_to_lru_list(page, lruvec, lru); + + __count_vm_events(PGDEACTIVATE, hpage_nr_pages(page)); + update_page_reclaim_stat(lruvec, file, 0); + } +} static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec, void *arg) @@ -590,6 +607,10 @@ void lru_add_drain_cpu(int cpu) if (pagevec_count(pvec)) pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); + pvec = &per_cpu(lru_deactivate_pvecs, cpu); + if (pagevec_count(pvec)) + pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); + pvec = &per_cpu(lru_lazyfree_pvecs, cpu); if (pagevec_count(pvec)) pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL); @@ -623,6 +644,26 @@ void deactivate_file_page(struct page *page) } } +/* + * deactivate_page - deactivate a page + * @page: page to deactivate + * + * deactivate_page() moves @page to the inactive list if @page was on the active + * list and was not an unevictable page. This is done to accelerate the reclaim + * of @page. + */ +void deactivate_page(struct page *page) +{ + if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { + struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); + + get_page(page); + if (!pagevec_add(pvec, page) || PageCompound(page)) + pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); + put_cpu_var(lru_deactivate_pvecs); + } +} + /** * mark_page_lazyfree - make an anon page lazyfree * @page: page to deactivate @@ -687,6 +728,7 @@ void lru_add_drain_all(void) if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) || + pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) || need_activate_page_drain(cpu)) { INIT_WORK(work, lru_add_drain_per_cpu); From 8940b34a4e082ae11498ddae8432f2ac07685d1c Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 25 Sep 2019 16:49:11 -0700 Subject: [PATCH 707/721] mm: change PAGEREF_RECLAIM_CLEAN with PAGE_REFRECLAIM The local variable references in shrink_page_list is PAGEREF_RECLAIM_CLEAN as default. It is for preventing to reclaim dirty pages when CMA try to migrate pages. Strictly speaking, we don't need it because CMA didn't allow to write out by .may_writepage = 0 in reclaim_clean_pages_from_list. Moreover, it has a problem to prevent anonymous pages's swap out even though force_reclaim = true in shrink_page_list on upcoming patch. So this patch makes references's default value to PAGEREF_RECLAIM and rename force_reclaim with ignore_references to make it more clear. This is a preparatory work for next patch. Link: http://lkml.kernel.org/r/20190726023435.214162-3-minchan@kernel.org Signed-off-by: Minchan Kim Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Chris Zankel Cc: Daniel Colascione Cc: Dave Hansen Cc: Hillf Danton Cc: James E.J. Bottomley Cc: Joel Fernandes (Google) Cc: kbuild test robot Cc: Kirill A. Shutemov Cc: Oleksandr Natalenko Cc: Ralf Baechle Cc: Richard Henderson Cc: Shakeel Butt Cc: Sonny Rao Cc: Suren Baghdasaryan Cc: Tim Murray Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 4911754c93b7..d8bbaf068c35 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1123,7 +1123,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, struct scan_control *sc, enum ttu_flags ttu_flags, struct reclaim_stat *stat, - bool force_reclaim) + bool ignore_references) { LIST_HEAD(ret_pages); LIST_HEAD(free_pages); @@ -1137,7 +1137,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, struct address_space *mapping; struct page *page; int may_enter_fs; - enum page_references references = PAGEREF_RECLAIM_CLEAN; + enum page_references references = PAGEREF_RECLAIM; bool dirty, writeback; unsigned int nr_pages; @@ -1268,7 +1268,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, } } - if (!force_reclaim) + if (!ignore_references) references = page_check_references(page, sc); switch (references) { From 1a4e58cce84ee88129d5d49c064bd2852b481357 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 25 Sep 2019 16:49:15 -0700 Subject: [PATCH 708/721] mm: introduce MADV_PAGEOUT When a process expects no accesses to a certain memory range for a long time, it could hint kernel that the pages can be reclaimed instantly but data should be preserved for future use. This could reduce workingset eviction so it ends up increasing performance. This patch introduces the new MADV_PAGEOUT hint to madvise(2) syscall. MADV_PAGEOUT can be used by a process to mark a memory range as not expected to be used for a long time so that kernel reclaims *any LRU* pages instantly. The hint can help kernel in deciding which pages to evict proactively. A note: It doesn't apply SWAP_CLUSTER_MAX LRU page isolation limit intentionally because it's automatically bounded by PMD size. If PMD size(e.g., 256) makes some trouble, we could fix it later by limit it to SWAP_CLUSTER_MAX[1]. - man-page material MADV_PAGEOUT (since Linux x.x) Do not expect access in the near future so pages in the specified regions could be reclaimed instantly regardless of memory pressure. Thus, access in the range after successful operation could cause major page fault but never lose the up-to-date contents unlike MADV_DONTNEED. Pages belonging to a shared mapping are only processed if a write access is allowed for the calling process. MADV_PAGEOUT cannot be applied to locked pages, Huge TLB pages, or VM_PFNMAP pages. [1] https://lore.kernel.org/lkml/20190710194719.GS29695@dhcp22.suse.cz/ [minchan@kernel.org: clear PG_active on MADV_PAGEOUT] Link: http://lkml.kernel.org/r/20190802200643.GA181880@google.com [akpm@linux-foundation.org: resolve conflicts with hmm.git] Link: http://lkml.kernel.org/r/20190726023435.214162-5-minchan@kernel.org Signed-off-by: Minchan Kim Reported-by: kbuild test robot Acked-by: Michal Hocko Cc: James E.J. Bottomley Cc: Richard Henderson Cc: Ralf Baechle Cc: Chris Zankel Cc: Daniel Colascione Cc: Dave Hansen Cc: Hillf Danton Cc: Joel Fernandes (Google) Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Oleksandr Natalenko Cc: Shakeel Butt Cc: Sonny Rao Cc: Suren Baghdasaryan Cc: Tim Murray Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/uapi/asm/mman.h | 1 + arch/mips/include/uapi/asm/mman.h | 1 + arch/parisc/include/uapi/asm/mman.h | 1 + arch/xtensa/include/uapi/asm/mman.h | 1 + include/linux/swap.h | 1 + include/uapi/asm-generic/mman-common.h | 1 + mm/madvise.c | 189 +++++++++++++++++++++++++ mm/vmscan.c | 56 ++++++++ 8 files changed, 251 insertions(+) diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h index f3258fbf03d0..a18ec7f63888 100644 --- a/arch/alpha/include/uapi/asm/mman.h +++ b/arch/alpha/include/uapi/asm/mman.h @@ -69,6 +69,7 @@ #define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ #define MADV_COLD 20 /* deactivate these pages */ +#define MADV_PAGEOUT 21 /* reclaim these pages */ /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h index 00ad09fc5eb1..57dc2ac4f8bd 100644 --- a/arch/mips/include/uapi/asm/mman.h +++ b/arch/mips/include/uapi/asm/mman.h @@ -96,6 +96,7 @@ #define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ #define MADV_COLD 20 /* deactivate these pages */ +#define MADV_PAGEOUT 21 /* reclaim these pages */ /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h index eb14e3a7b8f3..6fd8871e4081 100644 --- a/arch/parisc/include/uapi/asm/mman.h +++ b/arch/parisc/include/uapi/asm/mman.h @@ -49,6 +49,7 @@ #define MADV_DOFORK 11 /* do inherit across fork */ #define MADV_COLD 20 /* deactivate these pages */ +#define MADV_PAGEOUT 21 /* reclaim these pages */ #define MADV_MERGEABLE 65 /* KSM may merge identical pages */ #define MADV_UNMERGEABLE 66 /* KSM may not merge identical pages */ diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h index f926b00ff11f..e5e643752947 100644 --- a/arch/xtensa/include/uapi/asm/mman.h +++ b/arch/xtensa/include/uapi/asm/mman.h @@ -104,6 +104,7 @@ #define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ #define MADV_COLD 20 /* deactivate these pages */ +#define MADV_PAGEOUT 21 /* reclaim these pages */ /* compatibility flags */ #define MAP_FILE 0 diff --git a/include/linux/swap.h b/include/linux/swap.h index 0ce997edb8bb..063c0c1e112b 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -365,6 +365,7 @@ extern int vm_swappiness; extern int remove_mapping(struct address_space *mapping, struct page *page); extern unsigned long vm_total_pages; +extern unsigned long reclaim_pages(struct list_head *page_list); #ifdef CONFIG_NUMA extern int node_reclaim_mode; extern int sysctl_min_unmapped_ratio; diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index 23431faf0eb6..c160a5354eb6 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -68,6 +68,7 @@ #define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ #define MADV_COLD 20 /* deactivate these pages */ +#define MADV_PAGEOUT 21 /* reclaim these pages */ /* compatibility flags */ #define MAP_FILE 0 diff --git a/mm/madvise.c b/mm/madvise.c index e1aee62967c3..54c5639774b6 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -44,6 +44,7 @@ static int madvise_need_mmap_write(int behavior) case MADV_WILLNEED: case MADV_DONTNEED: case MADV_COLD: + case MADV_PAGEOUT: case MADV_FREE: return 0; default: @@ -461,6 +462,191 @@ static long madvise_cold(struct vm_area_struct *vma, return 0; } +static int madvise_pageout_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct mmu_gather *tlb = walk->private; + struct mm_struct *mm = tlb->mm; + struct vm_area_struct *vma = walk->vma; + pte_t *orig_pte, *pte, ptent; + spinlock_t *ptl; + LIST_HEAD(page_list); + struct page *page; + + if (fatal_signal_pending(current)) + return -EINTR; + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (pmd_trans_huge(*pmd)) { + pmd_t orig_pmd; + unsigned long next = pmd_addr_end(addr, end); + + tlb_change_page_size(tlb, HPAGE_PMD_SIZE); + ptl = pmd_trans_huge_lock(pmd, vma); + if (!ptl) + return 0; + + orig_pmd = *pmd; + if (is_huge_zero_pmd(orig_pmd)) + goto huge_unlock; + + if (unlikely(!pmd_present(orig_pmd))) { + VM_BUG_ON(thp_migration_supported() && + !is_pmd_migration_entry(orig_pmd)); + goto huge_unlock; + } + + page = pmd_page(orig_pmd); + if (next - addr != HPAGE_PMD_SIZE) { + int err; + + if (page_mapcount(page) != 1) + goto huge_unlock; + get_page(page); + spin_unlock(ptl); + lock_page(page); + err = split_huge_page(page); + unlock_page(page); + put_page(page); + if (!err) + goto regular_page; + return 0; + } + + if (pmd_young(orig_pmd)) { + pmdp_invalidate(vma, addr, pmd); + orig_pmd = pmd_mkold(orig_pmd); + + set_pmd_at(mm, addr, pmd, orig_pmd); + tlb_remove_tlb_entry(tlb, pmd, addr); + } + + ClearPageReferenced(page); + test_and_clear_page_young(page); + + if (!isolate_lru_page(page)) + list_add(&page->lru, &page_list); +huge_unlock: + spin_unlock(ptl); + reclaim_pages(&page_list); + return 0; + } + + if (pmd_trans_unstable(pmd)) + return 0; +regular_page: +#endif + tlb_change_page_size(tlb, PAGE_SIZE); + orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + flush_tlb_batched_pending(mm); + arch_enter_lazy_mmu_mode(); + for (; addr < end; pte++, addr += PAGE_SIZE) { + ptent = *pte; + if (!pte_present(ptent)) + continue; + + page = vm_normal_page(vma, addr, ptent); + if (!page) + continue; + + /* + * creating a THP page is expensive so split it only if we + * are sure it's worth. Split it if we are only owner. + */ + if (PageTransCompound(page)) { + if (page_mapcount(page) != 1) + break; + get_page(page); + if (!trylock_page(page)) { + put_page(page); + break; + } + pte_unmap_unlock(orig_pte, ptl); + if (split_huge_page(page)) { + unlock_page(page); + put_page(page); + pte_offset_map_lock(mm, pmd, addr, &ptl); + break; + } + unlock_page(page); + put_page(page); + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + pte--; + addr -= PAGE_SIZE; + continue; + } + + VM_BUG_ON_PAGE(PageTransCompound(page), page); + + if (pte_young(ptent)) { + ptent = ptep_get_and_clear_full(mm, addr, pte, + tlb->fullmm); + ptent = pte_mkold(ptent); + set_pte_at(mm, addr, pte, ptent); + tlb_remove_tlb_entry(tlb, pte, addr); + } + ClearPageReferenced(page); + test_and_clear_page_young(page); + + if (!isolate_lru_page(page)) + list_add(&page->lru, &page_list); + } + + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(orig_pte, ptl); + reclaim_pages(&page_list); + cond_resched(); + + return 0; +} + +static void madvise_pageout_page_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + tlb_start_vma(tlb, vma); + walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, NULL); + tlb_end_vma(tlb, vma); +} + +static inline bool can_do_pageout(struct vm_area_struct *vma) +{ + if (vma_is_anonymous(vma)) + return true; + if (!vma->vm_file) + return false; + /* + * paging out pagecache only for non-anonymous mappings that correspond + * to the files the calling process could (if tried) open for writing; + * otherwise we'd be including shared non-exclusive mappings, which + * opens a side channel. + */ + return inode_owner_or_capable(file_inode(vma->vm_file)) || + inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0; +} + +static long madvise_pageout(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start_addr, unsigned long end_addr) +{ + struct mm_struct *mm = vma->vm_mm; + struct mmu_gather tlb; + + *prev = vma; + if (!can_madv_lru_vma(vma)) + return -EINVAL; + + if (!can_do_pageout(vma)) + return 0; + + lru_add_drain(); + tlb_gather_mmu(&tlb, mm, start_addr, end_addr); + madvise_pageout_page_range(&tlb, vma, start_addr, end_addr); + tlb_finish_mmu(&tlb, start_addr, end_addr); + + return 0; +} + static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) @@ -843,6 +1029,8 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, return madvise_willneed(vma, prev, start, end); case MADV_COLD: return madvise_cold(vma, prev, start, end); + case MADV_PAGEOUT: + return madvise_pageout(vma, prev, start, end); case MADV_FREE: case MADV_DONTNEED: return madvise_dontneed_free(vma, prev, start, end, behavior); @@ -865,6 +1053,7 @@ madvise_behavior_valid(int behavior) case MADV_DONTNEED: case MADV_FREE: case MADV_COLD: + case MADV_PAGEOUT: #ifdef CONFIG_KSM case MADV_MERGEABLE: case MADV_UNMERGEABLE: diff --git a/mm/vmscan.c b/mm/vmscan.c index d8bbaf068c35..e5d52d6a24af 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2145,6 +2145,62 @@ static void shrink_active_list(unsigned long nr_to_scan, nr_deactivate, nr_rotated, sc->priority, file); } +unsigned long reclaim_pages(struct list_head *page_list) +{ + int nid = -1; + unsigned long nr_reclaimed = 0; + LIST_HEAD(node_page_list); + struct reclaim_stat dummy_stat; + struct page *page; + struct scan_control sc = { + .gfp_mask = GFP_KERNEL, + .priority = DEF_PRIORITY, + .may_writepage = 1, + .may_unmap = 1, + .may_swap = 1, + }; + + while (!list_empty(page_list)) { + page = lru_to_page(page_list); + if (nid == -1) { + nid = page_to_nid(page); + INIT_LIST_HEAD(&node_page_list); + } + + if (nid == page_to_nid(page)) { + ClearPageActive(page); + list_move(&page->lru, &node_page_list); + continue; + } + + nr_reclaimed += shrink_page_list(&node_page_list, + NODE_DATA(nid), + &sc, 0, + &dummy_stat, false); + while (!list_empty(&node_page_list)) { + page = lru_to_page(&node_page_list); + list_del(&page->lru); + putback_lru_page(page); + } + + nid = -1; + } + + if (!list_empty(&node_page_list)) { + nr_reclaimed += shrink_page_list(&node_page_list, + NODE_DATA(nid), + &sc, 0, + &dummy_stat, false); + while (!list_empty(&node_page_list)) { + page = lru_to_page(&node_page_list); + list_del(&page->lru); + putback_lru_page(page); + } + } + + return nr_reclaimed; +} + /* * The inactive anon list should be small enough that the VM never has * to do too much work. From d616d5126503967bf365db0711ee3c78b356efe9 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 25 Sep 2019 16:49:19 -0700 Subject: [PATCH 709/721] mm: factor out common parts between MADV_COLD and MADV_PAGEOUT There are many common parts between MADV_COLD and MADV_PAGEOUT. This patch factor them out to save code duplication. Link: http://lkml.kernel.org/r/20190726023435.214162-6-minchan@kernel.org Signed-off-by: Minchan Kim Suggested-by: Johannes Weiner Acked-by: Michal Hocko Cc: Chris Zankel Cc: Daniel Colascione Cc: Dave Hansen Cc: Hillf Danton Cc: James E.J. Bottomley Cc: Joel Fernandes (Google) Cc: kbuild test robot Cc: Kirill A. Shutemov Cc: Oleksandr Natalenko Cc: Ralf Baechle Cc: Richard Henderson Cc: Shakeel Butt Cc: Sonny Rao Cc: Suren Baghdasaryan Cc: Tim Murray Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/madvise.c | 192 ++++++++++++--------------------------------------- 1 file changed, 45 insertions(+), 147 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 54c5639774b6..2be9f3fdb05e 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -32,6 +32,11 @@ #include "internal.h" +struct madvise_walk_private { + struct mmu_gather *tlb; + bool pageout; +}; + /* * Any behaviour which results in changes to the vma->vm_flags needs to * take mmap_sem for writing. Others, which simply traverse vmas, need @@ -292,15 +297,22 @@ static long madvise_willneed(struct vm_area_struct *vma, return 0; } -static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr, - unsigned long end, struct mm_walk *walk) +static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, + unsigned long addr, unsigned long end, + struct mm_walk *walk) { - struct mmu_gather *tlb = walk->private; + struct madvise_walk_private *private = walk->private; + struct mmu_gather *tlb = private->tlb; + bool pageout = private->pageout; struct mm_struct *mm = tlb->mm; struct vm_area_struct *vma = walk->vma; pte_t *orig_pte, *pte, ptent; spinlock_t *ptl; - struct page *page; + struct page *page = NULL; + LIST_HEAD(page_list); + + if (fatal_signal_pending(current)) + return -EINTR; #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (pmd_trans_huge(*pmd)) { @@ -348,10 +360,17 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr, tlb_remove_pmd_tlb_entry(tlb, pmd, addr); } + ClearPageReferenced(page); test_and_clear_page_young(page); - deactivate_page(page); + if (pageout) { + if (!isolate_lru_page(page)) + list_add(&page->lru, &page_list); + } else + deactivate_page(page); huge_unlock: spin_unlock(ptl); + if (pageout) + reclaim_pages(&page_list); return 0; } @@ -419,27 +438,39 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr, * As a side effect, it makes confuse idle-page tracking * because they will miss recent referenced history. */ + ClearPageReferenced(page); test_and_clear_page_young(page); - deactivate_page(page); + if (pageout) { + if (!isolate_lru_page(page)) + list_add(&page->lru, &page_list); + } else + deactivate_page(page); } arch_leave_lazy_mmu_mode(); pte_unmap_unlock(orig_pte, ptl); + if (pageout) + reclaim_pages(&page_list); cond_resched(); return 0; } static const struct mm_walk_ops cold_walk_ops = { - .pmd_entry = madvise_cold_pte_range, + .pmd_entry = madvise_cold_or_pageout_pte_range, }; static void madvise_cold_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end) { + struct madvise_walk_private walk_private = { + .pageout = false, + .tlb = tlb, + }; + tlb_start_vma(tlb, vma); - walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, NULL); + walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private); tlb_end_vma(tlb, vma); } @@ -462,150 +493,17 @@ static long madvise_cold(struct vm_area_struct *vma, return 0; } -static int madvise_pageout_pte_range(pmd_t *pmd, unsigned long addr, - unsigned long end, struct mm_walk *walk) -{ - struct mmu_gather *tlb = walk->private; - struct mm_struct *mm = tlb->mm; - struct vm_area_struct *vma = walk->vma; - pte_t *orig_pte, *pte, ptent; - spinlock_t *ptl; - LIST_HEAD(page_list); - struct page *page; - - if (fatal_signal_pending(current)) - return -EINTR; - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (pmd_trans_huge(*pmd)) { - pmd_t orig_pmd; - unsigned long next = pmd_addr_end(addr, end); - - tlb_change_page_size(tlb, HPAGE_PMD_SIZE); - ptl = pmd_trans_huge_lock(pmd, vma); - if (!ptl) - return 0; - - orig_pmd = *pmd; - if (is_huge_zero_pmd(orig_pmd)) - goto huge_unlock; - - if (unlikely(!pmd_present(orig_pmd))) { - VM_BUG_ON(thp_migration_supported() && - !is_pmd_migration_entry(orig_pmd)); - goto huge_unlock; - } - - page = pmd_page(orig_pmd); - if (next - addr != HPAGE_PMD_SIZE) { - int err; - - if (page_mapcount(page) != 1) - goto huge_unlock; - get_page(page); - spin_unlock(ptl); - lock_page(page); - err = split_huge_page(page); - unlock_page(page); - put_page(page); - if (!err) - goto regular_page; - return 0; - } - - if (pmd_young(orig_pmd)) { - pmdp_invalidate(vma, addr, pmd); - orig_pmd = pmd_mkold(orig_pmd); - - set_pmd_at(mm, addr, pmd, orig_pmd); - tlb_remove_tlb_entry(tlb, pmd, addr); - } - - ClearPageReferenced(page); - test_and_clear_page_young(page); - - if (!isolate_lru_page(page)) - list_add(&page->lru, &page_list); -huge_unlock: - spin_unlock(ptl); - reclaim_pages(&page_list); - return 0; - } - - if (pmd_trans_unstable(pmd)) - return 0; -regular_page: -#endif - tlb_change_page_size(tlb, PAGE_SIZE); - orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - flush_tlb_batched_pending(mm); - arch_enter_lazy_mmu_mode(); - for (; addr < end; pte++, addr += PAGE_SIZE) { - ptent = *pte; - if (!pte_present(ptent)) - continue; - - page = vm_normal_page(vma, addr, ptent); - if (!page) - continue; - - /* - * creating a THP page is expensive so split it only if we - * are sure it's worth. Split it if we are only owner. - */ - if (PageTransCompound(page)) { - if (page_mapcount(page) != 1) - break; - get_page(page); - if (!trylock_page(page)) { - put_page(page); - break; - } - pte_unmap_unlock(orig_pte, ptl); - if (split_huge_page(page)) { - unlock_page(page); - put_page(page); - pte_offset_map_lock(mm, pmd, addr, &ptl); - break; - } - unlock_page(page); - put_page(page); - pte = pte_offset_map_lock(mm, pmd, addr, &ptl); - pte--; - addr -= PAGE_SIZE; - continue; - } - - VM_BUG_ON_PAGE(PageTransCompound(page), page); - - if (pte_young(ptent)) { - ptent = ptep_get_and_clear_full(mm, addr, pte, - tlb->fullmm); - ptent = pte_mkold(ptent); - set_pte_at(mm, addr, pte, ptent); - tlb_remove_tlb_entry(tlb, pte, addr); - } - ClearPageReferenced(page); - test_and_clear_page_young(page); - - if (!isolate_lru_page(page)) - list_add(&page->lru, &page_list); - } - - arch_leave_lazy_mmu_mode(); - pte_unmap_unlock(orig_pte, ptl); - reclaim_pages(&page_list); - cond_resched(); - - return 0; -} - static void madvise_pageout_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end) { + struct madvise_walk_private walk_private = { + .pageout = true, + .tlb = tlb, + }; + tlb_start_vma(tlb, vma); - walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, NULL); + walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private); tlb_end_vma(tlb, vma); } From c7cc8d77316b4386622b2dbd29de800df7b05099 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 25 Sep 2019 16:49:22 -0700 Subject: [PATCH 710/721] hexagon: drop empty and unused free_initrd_mem hexagon never reserves or initializes initrd and the only mention of it is the empty free_initrd_mem() function. As we have a generic implementation of free_initrd_mem(), there is no need to define an empty stub for the hexagon implementation and it can be dropped. Link: http://lkml.kernel.org/r/1565858133-25852-1-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Reviewed-by: Christoph Hellwig Cc: Richard Kuo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/hexagon/mm/init.c | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/arch/hexagon/mm/init.c b/arch/hexagon/mm/init.c index f1f6ebd537b7..c961773a6fff 100644 --- a/arch/hexagon/mm/init.c +++ b/arch/hexagon/mm/init.c @@ -71,19 +71,6 @@ void __init mem_init(void) init_mm.context.ptbase = __pa(init_mm.pgd); } -/* - * free_initrd_mem - frees... initrd memory. - * @start - start of init memory - * @end - end of init memory - * - * Apparently has to be passed the address of the initrd memory. - * - * Wrapped by #ifdef CONFIG_BLKDEV_INITRD - */ -void free_initrd_mem(unsigned long start, unsigned long end) -{ -} - void sync_icache_dcache(pte_t pte) { unsigned long addr; From de3f186f87cf15bed8d13fedafb5bcad0167fc6d Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Wed, 25 Sep 2019 16:49:25 -0700 Subject: [PATCH 711/721] checkpatch: check for nested (un)?likely() calls IS_ERR(), IS_ERR_OR_NULL(), IS_ERR_VALUE() and WARN*() already contain unlikely() optimization internally. Thus, there is no point in calling these functions and defines under likely()/unlikely(). This check is based on the coccinelle rule developed by Enrico Weigelt https://lore.kernel.org/lkml/1559767582-11081-1-git-send-email-info@metux.net/ Link: http://lkml.kernel.org/r/20190829165025.15750-1-efremov@linux.com Signed-off-by: Denis Efremov Cc: Joe Perches Cc: Alexander Viro Cc: Anton Altaparmakov Cc: Boris Ostrovsky Cc: Boris Pismenny Cc: Darrick J. Wong Cc: "David S. Miller" Cc: Denis Efremov Cc: Dennis Dalessandro Cc: Inaky Perez-Gonzalez Cc: Juergen Gross Cc: Leon Romanovsky Cc: Mike Marciniszyn Cc: Rob Clark Cc: Saeed Mahameed Cc: Sean Paul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 4eb355d8ae73..6fcc66afb088 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -6507,6 +6507,12 @@ sub process { "Using $1 should generally have parentheses around the comparison\n" . $herecurr); } +# nested likely/unlikely calls + if ($line =~ /\b(?:(?:un)?likely)\s*\(\s*!?\s*(IS_ERR(?:_OR_NULL|_VALUE)?|WARN)/) { + WARN("LIKELY_MISUSE", + "nested (un)?likely() calls, $1 already uses unlikely() internally\n" . $herecurr); + } + # whine mightly about in_atomic if ($line =~ /\bin_atomic\s*\(/) { if ($realfile =~ m@^drivers/@) { From a3bc18a48e2e678efe62f1f9989902f9cd19e0ff Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 26 Sep 2019 15:21:18 +0100 Subject: [PATCH 712/721] jffs2: Fix mounting under new mount API The mounting of jffs2 is broken due to the changes from the new mount API because it specifies a "source" operation, but then doesn't actually process it. But because it specified it, it doesn't return -ENOPARAM and the caller doesn't process it either and the source gets lost. Fix this by simply removing the source parameter from jffs2 and letting the VFS deal with it in the default manner. To test it, enable CONFIG_MTD_MTDRAM and allow the default size and erase block size parameters, then try and mount the /dev/mtdblock file that that creates as jffs2. No need to initialise it. Fixes: ec10a24f10c8 ("vfs: Convert jffs2 to use the new mount API") Reported-by: Al Viro Signed-off-by: David Howells cc: David Woodhouse cc: Richard Weinberger cc: linux-mtd@lists.infradead.org Signed-off-by: Al Viro --- fs/jffs2/super.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index cbe70637c117..0e6406c4f362 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -163,13 +163,11 @@ static const struct export_operations jffs2_export_ops = { * Opt_rp_size: size of reserved pool in KiB */ enum { - Opt_source, Opt_override_compr, Opt_rp_size, }; static const struct fs_parameter_spec jffs2_param_specs[] = { - fsparam_string ("source", Opt_source), fsparam_enum ("compr", Opt_override_compr), fsparam_u32 ("rp_size", Opt_rp_size), {} From 89f403541325181748b491fd96118e68292f47e1 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Wed, 25 Sep 2019 16:49:28 -0700 Subject: [PATCH 713/721] xen/events: remove unlikely() from WARN() condition "unlikely(WARN(x))" is excessive. WARN() already uses unlikely() internally. Link: http://lkml.kernel.org/r/20190829165025.15750-4-efremov@linux.com Signed-off-by: Denis Efremov Cc: Boris Ostrovsky Cc: Joe Perches Reviewed-by: Juergen Gross Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/xen/events/events_base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 2e8570c09789..6c8843968a52 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -247,7 +247,7 @@ static void xen_irq_info_cleanup(struct irq_info *info) */ unsigned int evtchn_from_irq(unsigned irq) { - if (unlikely(WARN(irq >= nr_irqs, "Invalid irq %d!\n", irq))) + if (WARN(irq >= nr_irqs, "Invalid irq %d!\n", irq)) return 0; return info_for_irq(irq)->evtchn; From 7159d54418e0a1a3df91e74501363a1c05379517 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Wed, 25 Sep 2019 16:49:31 -0700 Subject: [PATCH 714/721] fs: remove unlikely() from WARN_ON() condition "unlikely(WARN_ON(x))" is excessive. WARN_ON() already uses unlikely() internally. Link: http://lkml.kernel.org/r/20190829165025.15750-5-efremov@linux.com Signed-off-by: Denis Efremov Cc: Alexander Viro Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/open.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/open.c b/fs/open.c index c60cd22cc052..b62f5c0923a8 100644 --- a/fs/open.c +++ b/fs/open.c @@ -776,7 +776,7 @@ static int do_dentry_open(struct file *f, f->f_mode |= FMODE_ATOMIC_POS; f->f_op = fops_get(inode->i_fop); - if (unlikely(WARN_ON(!f->f_op))) { + if (WARN_ON(!f->f_op)) { error = -ENODEV; goto cleanup_all; } From 77c0e745bd11fc1ccc4690409eca92ea07200141 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Wed, 25 Sep 2019 16:49:34 -0700 Subject: [PATCH 715/721] wimax/i2400m: remove unlikely() from WARN*() condition "unlikely(WARN_ON(x))" is excessive. WARN_ON() already uses unlikely() internally. Link: http://lkml.kernel.org/r/20190829165025.15750-6-efremov@linux.com Signed-off-by: Denis Efremov Cc: Inaky Perez-Gonzalez Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/net/wimax/i2400m/tx.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/wimax/i2400m/tx.c b/drivers/net/wimax/i2400m/tx.c index ebd64e083726..1255302e251e 100644 --- a/drivers/net/wimax/i2400m/tx.c +++ b/drivers/net/wimax/i2400m/tx.c @@ -654,8 +654,7 @@ void i2400m_tx_close(struct i2400m *i2400m) padding = aligned_size - tx_msg_moved->size; if (padding > 0) { pad_buf = i2400m_tx_fifo_push(i2400m, padding, 0, 0); - if (unlikely(WARN_ON(pad_buf == NULL - || pad_buf == TAIL_FULL))) { + if (WARN_ON(pad_buf == NULL || pad_buf == TAIL_FULL)) { /* This should not happen -- append should verify * there is always space left at least to append * tx_block_size */ From 14ed8688074ad7ba62d460ab87da841fa5407285 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Wed, 25 Sep 2019 16:49:37 -0700 Subject: [PATCH 716/721] xfs: remove unlikely() from WARN_ON() condition "unlikely(WARN_ON(x))" is excessive. WARN_ON() already uses unlikely() internally. Link: http://lkml.kernel.org/r/20190829165025.15750-7-efremov@linux.com Signed-off-by: Denis Efremov Reviewed-by: Darrick J. Wong Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/xfs/xfs_buf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 120ef99d09e8..21c243622a79 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -2097,7 +2097,7 @@ xfs_verify_magic( int idx; idx = xfs_sb_version_hascrc(&mp->m_sb); - if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx]))) + if (WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx])) return false; return dmagic == bp->b_ops->magic[idx]; } @@ -2115,7 +2115,7 @@ xfs_verify_magic16( int idx; idx = xfs_sb_version_hascrc(&mp->m_sb); - if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx]))) + if (WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx])) return false; return dmagic == bp->b_ops->magic16[idx]; } From 7b0b69259433fc1758408a899224db4fcc41b865 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Wed, 25 Sep 2019 16:49:40 -0700 Subject: [PATCH 717/721] IB/hfi1: remove unlikely() from IS_ERR*() condition "unlikely(IS_ERR_OR_NULL(x))" is excessive. IS_ERR_OR_NULL() already uses unlikely() internally. Link: http://lkml.kernel.org/r/20190829165025.15750-8-efremov@linux.com Signed-off-by: Denis Efremov Cc: Mike Marciniszyn Cc: Joe Perches Acked-by: Dennis Dalessandro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/infiniband/hw/hfi1/verbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 9f53f63b1453..7bff0a1e713d 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1041,7 +1041,7 @@ int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps, if (cb) iowait_pio_inc(&priv->s_iowait); pbuf = sc_buffer_alloc(sc, plen, cb, qp); - if (unlikely(IS_ERR_OR_NULL(pbuf))) { + if (IS_ERR_OR_NULL(pbuf)) { if (cb) verbs_pio_complete(qp, 0); if (IS_ERR(pbuf)) { From cc22c800e15b03c87f0e97400f75eba998e75c6a Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Wed, 25 Sep 2019 16:49:43 -0700 Subject: [PATCH 718/721] ntfs: remove (un)?likely() from IS_ERR() conditions "likely(!IS_ERR(x))" is excessive. IS_ERR() already uses unlikely() internally. Link: http://lkml.kernel.org/r/20190829165025.15750-11-efremov@linux.com Signed-off-by: Denis Efremov Cc: Anton Altaparmakov Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ntfs/mft.c | 12 ++++++------ fs/ntfs/namei.c | 2 +- fs/ntfs/runlist.c | 2 +- fs/ntfs/super.c | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 20c841a906f2..3aac5c917afe 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -71,7 +71,7 @@ static inline MFT_RECORD *map_mft_record_page(ntfs_inode *ni) } /* Read, map, and pin the page. */ page = ntfs_map_page(mft_vi->i_mapping, index); - if (likely(!IS_ERR(page))) { + if (!IS_ERR(page)) { /* Catch multi sector transfer fixup errors. */ if (likely(ntfs_is_mft_recordp((le32*)(page_address(page) + ofs)))) { @@ -154,7 +154,7 @@ MFT_RECORD *map_mft_record(ntfs_inode *ni) mutex_lock(&ni->mrec_lock); m = map_mft_record_page(ni); - if (likely(!IS_ERR(m))) + if (!IS_ERR(m)) return m; mutex_unlock(&ni->mrec_lock); @@ -271,7 +271,7 @@ MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref, m = map_mft_record(ni); /* map_mft_record() has incremented this on success. */ atomic_dec(&ni->count); - if (likely(!IS_ERR(m))) { + if (!IS_ERR(m)) { /* Verify the sequence number. */ if (likely(le16_to_cpu(m->sequence_number) == seq_no)) { ntfs_debug("Done 1."); @@ -1303,7 +1303,7 @@ static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol) read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); rl = ntfs_attr_find_vcn_nolock(mftbmp_ni, (ll - 1) >> vol->cluster_size_bits, NULL); - if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) { + if (IS_ERR(rl) || unlikely(!rl->length || rl->lcn < 0)) { up_write(&mftbmp_ni->runlist.lock); ntfs_error(vol->sb, "Failed to determine last allocated " "cluster of mft bitmap attribute."); @@ -1734,7 +1734,7 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol) read_unlock_irqrestore(&mft_ni->size_lock, flags); rl = ntfs_attr_find_vcn_nolock(mft_ni, (ll - 1) >> vol->cluster_size_bits, NULL); - if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) { + if (IS_ERR(rl) || unlikely(!rl->length || rl->lcn < 0)) { up_write(&mft_ni->runlist.lock); ntfs_error(vol->sb, "Failed to determine last allocated " "cluster of mft data attribute."); @@ -1776,7 +1776,7 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol) do { rl2 = ntfs_cluster_alloc(vol, old_last_vcn, nr, lcn, MFT_ZONE, true); - if (likely(!IS_ERR(rl2))) + if (!IS_ERR(rl2)) break; if (PTR_ERR(rl2) != -ENOSPC || nr == min_nr) { ntfs_error(vol->sb, "Failed to allocate the minimal " diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c index 2d3cc9e3395d..4e6a44bc654c 100644 --- a/fs/ntfs/namei.c +++ b/fs/ntfs/namei.c @@ -115,7 +115,7 @@ static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent, dent_ino = MREF(mref); ntfs_debug("Found inode 0x%lx. Calling ntfs_iget.", dent_ino); dent_inode = ntfs_iget(vol->sb, dent_ino); - if (likely(!IS_ERR(dent_inode))) { + if (!IS_ERR(dent_inode)) { /* Consistency check. */ if (is_bad_inode(dent_inode) || MSEQNO(mref) == NTFS_I(dent_inode)->seq_no || diff --git a/fs/ntfs/runlist.c b/fs/ntfs/runlist.c index 508744a93180..97932fb5179c 100644 --- a/fs/ntfs/runlist.c +++ b/fs/ntfs/runlist.c @@ -951,7 +951,7 @@ runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol, } /* Now combine the new and old runlists checking for overlaps. */ old_rl = ntfs_runlists_merge(old_rl, rl); - if (likely(!IS_ERR(old_rl))) + if (!IS_ERR(old_rl)) return old_rl; ntfs_free(rl); ntfs_error(vol->sb, "Failed to merge runlists."); diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 29621d40f448..7dc3bc604f78 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -1475,7 +1475,7 @@ static bool load_and_init_usnjrnl(ntfs_volume *vol) kfree(name); /* Get the inode. */ tmp_ino = ntfs_iget(vol->sb, MREF(mref)); - if (unlikely(IS_ERR(tmp_ino) || is_bad_inode(tmp_ino))) { + if (IS_ERR(tmp_ino) || unlikely(is_bad_inode(tmp_ino))) { if (!IS_ERR(tmp_ino)) iput(tmp_ino); ntfs_error(vol->sb, "Failed to load $UsnJrnl."); From b4ed71f557e458257e0f71b11969954acb389240 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 25 Sep 2019 16:49:46 -0700 Subject: [PATCH 719/721] mm: treewide: clarify pgtable_page_{ctor,dtor}() naming The naming of pgtable_page_{ctor,dtor}() seems to have confused a few people, and until recently arm64 used these erroneously/pointlessly for other levels of page table. To make it incredibly clear that these only apply to the PTE level, and to align with the naming of pgtable_pmd_page_{ctor,dtor}(), let's rename them to pgtable_pte_page_{ctor,dtor}(). These changes were generated with the following shell script: ---- git grep -lw 'pgtable_page_.tor' | while read FILE; do sed -i '{s/pgtable_page_ctor/pgtable_pte_page_ctor/}' $FILE; sed -i '{s/pgtable_page_dtor/pgtable_pte_page_dtor/}' $FILE; done ---- ... with the documentation re-flowed to remain under 80 columns, and whitespace fixed up in macros to keep backslashes aligned. There should be no functional change as a result of this patch. Link: http://lkml.kernel.org/r/20190722141133.3116-1-mark.rutland@arm.com Signed-off-by: Mark Rutland Reviewed-by: Mike Rapoport Acked-by: Geert Uytterhoeven [m68k] Cc: Anshuman Khandual Cc: Matthew Wilcox Cc: Michal Hocko Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/split_page_table_lock.rst | 10 +++++----- arch/arc/include/asm/pgalloc.h | 4 ++-- arch/arm/include/asm/tlb.h | 2 +- arch/arm/mm/mmu.c | 2 +- arch/arm64/include/asm/tlb.h | 2 +- arch/arm64/mm/mmu.c | 2 +- arch/csky/include/asm/pgalloc.h | 2 +- arch/hexagon/include/asm/pgalloc.h | 2 +- arch/m68k/include/asm/mcf_pgalloc.h | 6 +++--- arch/m68k/include/asm/motorola_pgalloc.h | 6 +++--- arch/m68k/include/asm/sun3_pgalloc.h | 2 +- arch/mips/include/asm/pgalloc.h | 2 +- arch/nios2/include/asm/pgalloc.h | 2 +- arch/openrisc/include/asm/pgalloc.h | 6 +++--- arch/powerpc/mm/pgtable-frag.c | 6 +++--- arch/riscv/include/asm/pgalloc.h | 2 +- arch/s390/mm/pgalloc.c | 6 +++--- arch/sh/include/asm/pgalloc.h | 2 +- arch/sparc/mm/init_64.c | 4 ++-- arch/sparc/mm/srmmu.c | 4 ++-- arch/um/include/asm/pgalloc.h | 2 +- arch/unicore32/include/asm/tlb.h | 2 +- arch/x86/mm/pgtable.c | 2 +- arch/xtensa/include/asm/pgalloc.h | 4 ++-- include/asm-generic/pgalloc.h | 8 ++++---- include/linux/mm.h | 4 ++-- 26 files changed, 48 insertions(+), 48 deletions(-) diff --git a/Documentation/vm/split_page_table_lock.rst b/Documentation/vm/split_page_table_lock.rst index 889b00be469f..ff51f4a5494d 100644 --- a/Documentation/vm/split_page_table_lock.rst +++ b/Documentation/vm/split_page_table_lock.rst @@ -54,9 +54,9 @@ Hugetlb-specific helpers: Support of split page table lock by an architecture =================================================== -There's no need in special enabling of PTE split page table lock: -everything required is done by pgtable_page_ctor() and pgtable_page_dtor(), -which must be called on PTE table allocation / freeing. +There's no need in special enabling of PTE split page table lock: everything +required is done by pgtable_pte_page_ctor() and pgtable_pte_page_dtor(), which +must be called on PTE table allocation / freeing. Make sure the architecture doesn't use slab allocator for page table allocation: slab uses page->slab_cache for its pages. @@ -74,7 +74,7 @@ paths: i.e X86_PAE preallocate few PMDs on pgd_alloc(). With everything in place you can set CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK. -NOTE: pgtable_page_ctor() and pgtable_pmd_page_ctor() can fail -- it must +NOTE: pgtable_pte_page_ctor() and pgtable_pmd_page_ctor() can fail -- it must be handled properly. page->ptl @@ -94,7 +94,7 @@ trick: split lock with enabled DEBUG_SPINLOCK or DEBUG_LOCK_ALLOC, but costs one more cache line for indirect access; -The spinlock_t allocated in pgtable_page_ctor() for PTE table and in +The spinlock_t allocated in pgtable_pte_page_ctor() for PTE table and in pgtable_pmd_page_ctor() for PMD table. Please, never access page->ptl directly -- use appropriate helper. diff --git a/arch/arc/include/asm/pgalloc.h b/arch/arc/include/asm/pgalloc.h index 4751f2251cd9..b747f2ec2928 100644 --- a/arch/arc/include/asm/pgalloc.h +++ b/arch/arc/include/asm/pgalloc.h @@ -108,7 +108,7 @@ pte_alloc_one(struct mm_struct *mm) return 0; memzero((void *)pte_pg, PTRS_PER_PTE * sizeof(pte_t)); page = virt_to_page(pte_pg); - if (!pgtable_page_ctor(page)) { + if (!pgtable_pte_page_ctor(page)) { __free_page(page); return 0; } @@ -123,7 +123,7 @@ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) static inline void pte_free(struct mm_struct *mm, pgtable_t ptep) { - pgtable_page_dtor(virt_to_page(ptep)); + pgtable_pte_page_dtor(virt_to_page(ptep)); free_pages((unsigned long)ptep, __get_order_pte()); } diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h index b75ea15b85c0..669474add486 100644 --- a/arch/arm/include/asm/tlb.h +++ b/arch/arm/include/asm/tlb.h @@ -44,7 +44,7 @@ static inline void __tlb_remove_table(void *_table) static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, unsigned long addr) { - pgtable_page_dtor(pte); + pgtable_pte_page_dtor(pte); #ifndef CONFIG_ARM_LPAE /* diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index 25da9b2d9610..48c2888297dd 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -731,7 +731,7 @@ static void *__init late_alloc(unsigned long sz) { void *ptr = (void *)__get_free_pages(GFP_PGTABLE_KERNEL, get_order(sz)); - if (!ptr || !pgtable_page_ctor(virt_to_page(ptr))) + if (!ptr || !pgtable_pte_page_ctor(virt_to_page(ptr))) BUG(); return ptr; } diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h index a95d1fcb7e21..b76df828e6b7 100644 --- a/arch/arm64/include/asm/tlb.h +++ b/arch/arm64/include/asm/tlb.h @@ -44,7 +44,7 @@ static inline void tlb_flush(struct mmu_gather *tlb) static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, unsigned long addr) { - pgtable_page_dtor(pte); + pgtable_pte_page_dtor(pte); tlb_remove_table(tlb, pte); } diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 53dc6f24cfb7..60c929f3683b 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -384,7 +384,7 @@ static phys_addr_t pgd_pgtable_alloc(int shift) * folded, and if so pgtable_pmd_page_ctor() becomes nop. */ if (shift == PAGE_SHIFT) - BUG_ON(!pgtable_page_ctor(phys_to_page(pa))); + BUG_ON(!pgtable_pte_page_ctor(phys_to_page(pa))); else if (shift == PMD_SHIFT) BUG_ON(!pgtable_pmd_page_ctor(phys_to_page(pa))); diff --git a/arch/csky/include/asm/pgalloc.h b/arch/csky/include/asm/pgalloc.h index d089113fe41f..c7c1ed27e348 100644 --- a/arch/csky/include/asm/pgalloc.h +++ b/arch/csky/include/asm/pgalloc.h @@ -71,7 +71,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) #define __pte_free_tlb(tlb, pte, address) \ do { \ - pgtable_page_dtor(pte); \ + pgtable_pte_page_dtor(pte); \ tlb_remove_page(tlb, pte); \ } while (0) diff --git a/arch/hexagon/include/asm/pgalloc.h b/arch/hexagon/include/asm/pgalloc.h index 5a6e79e7926d..cc9be514a676 100644 --- a/arch/hexagon/include/asm/pgalloc.h +++ b/arch/hexagon/include/asm/pgalloc.h @@ -94,7 +94,7 @@ static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, #define __pte_free_tlb(tlb, pte, addr) \ do { \ - pgtable_page_dtor((pte)); \ + pgtable_pte_page_dtor((pte)); \ tlb_remove_page((tlb), (pte)); \ } while (0) diff --git a/arch/m68k/include/asm/mcf_pgalloc.h b/arch/m68k/include/asm/mcf_pgalloc.h index 4399d712f6db..b34d44d666a4 100644 --- a/arch/m68k/include/asm/mcf_pgalloc.h +++ b/arch/m68k/include/asm/mcf_pgalloc.h @@ -41,7 +41,7 @@ extern inline pmd_t *pmd_alloc_kernel(pgd_t *pgd, unsigned long address) static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t page, unsigned long address) { - pgtable_page_dtor(page); + pgtable_pte_page_dtor(page); __free_page(page); } @@ -54,7 +54,7 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm) if (!page) return NULL; - if (!pgtable_page_ctor(page)) { + if (!pgtable_pte_page_ctor(page)) { __free_page(page); return NULL; } @@ -73,7 +73,7 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm) static inline void pte_free(struct mm_struct *mm, struct page *page) { - pgtable_page_dtor(page); + pgtable_pte_page_dtor(page); __free_page(page); } diff --git a/arch/m68k/include/asm/motorola_pgalloc.h b/arch/m68k/include/asm/motorola_pgalloc.h index d04d9ba9b976..acab315c851f 100644 --- a/arch/m68k/include/asm/motorola_pgalloc.h +++ b/arch/m68k/include/asm/motorola_pgalloc.h @@ -36,7 +36,7 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm) page = alloc_pages(GFP_KERNEL|__GFP_ZERO, 0); if(!page) return NULL; - if (!pgtable_page_ctor(page)) { + if (!pgtable_pte_page_ctor(page)) { __free_page(page); return NULL; } @@ -51,7 +51,7 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm) static inline void pte_free(struct mm_struct *mm, pgtable_t page) { - pgtable_page_dtor(page); + pgtable_pte_page_dtor(page); cache_page(kmap(page)); kunmap(page); __free_page(page); @@ -60,7 +60,7 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t page) static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t page, unsigned long address) { - pgtable_page_dtor(page); + pgtable_pte_page_dtor(page); cache_page(kmap(page)); kunmap(page); __free_page(page); diff --git a/arch/m68k/include/asm/sun3_pgalloc.h b/arch/m68k/include/asm/sun3_pgalloc.h index 1a8ddbd0d23c..856121122b91 100644 --- a/arch/m68k/include/asm/sun3_pgalloc.h +++ b/arch/m68k/include/asm/sun3_pgalloc.h @@ -21,7 +21,7 @@ extern const char bad_pmd_string[]; #define __pte_free_tlb(tlb,pte,addr) \ do { \ - pgtable_page_dtor(pte); \ + pgtable_pte_page_dtor(pte); \ tlb_remove_page((tlb), pte); \ } while (0) diff --git a/arch/mips/include/asm/pgalloc.h b/arch/mips/include/asm/pgalloc.h index aa73cb187a07..166842337eb2 100644 --- a/arch/mips/include/asm/pgalloc.h +++ b/arch/mips/include/asm/pgalloc.h @@ -54,7 +54,7 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) #define __pte_free_tlb(tlb,pte,address) \ do { \ - pgtable_page_dtor(pte); \ + pgtable_pte_page_dtor(pte); \ tlb_remove_page((tlb), pte); \ } while (0) diff --git a/arch/nios2/include/asm/pgalloc.h b/arch/nios2/include/asm/pgalloc.h index 750d18d5980b..0b146d773c85 100644 --- a/arch/nios2/include/asm/pgalloc.h +++ b/arch/nios2/include/asm/pgalloc.h @@ -41,7 +41,7 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) #define __pte_free_tlb(tlb, pte, addr) \ do { \ - pgtable_page_dtor(pte); \ + pgtable_pte_page_dtor(pte); \ tlb_remove_page((tlb), (pte)); \ } while (0) diff --git a/arch/openrisc/include/asm/pgalloc.h b/arch/openrisc/include/asm/pgalloc.h index 787c1b9d2f6d..da12a4c38c4b 100644 --- a/arch/openrisc/include/asm/pgalloc.h +++ b/arch/openrisc/include/asm/pgalloc.h @@ -75,7 +75,7 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm) if (!pte) return NULL; clear_page(page_address(pte)); - if (!pgtable_page_ctor(pte)) { + if (!pgtable_pte_page_ctor(pte)) { __free_page(pte); return NULL; } @@ -89,13 +89,13 @@ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) static inline void pte_free(struct mm_struct *mm, struct page *pte) { - pgtable_page_dtor(pte); + pgtable_pte_page_dtor(pte); __free_page(pte); } #define __pte_free_tlb(tlb, pte, addr) \ do { \ - pgtable_page_dtor(pte); \ + pgtable_pte_page_dtor(pte); \ tlb_remove_page((tlb), (pte)); \ } while (0) diff --git a/arch/powerpc/mm/pgtable-frag.c b/arch/powerpc/mm/pgtable-frag.c index a7b05214760c..ee4bd6d38602 100644 --- a/arch/powerpc/mm/pgtable-frag.c +++ b/arch/powerpc/mm/pgtable-frag.c @@ -25,7 +25,7 @@ void pte_frag_destroy(void *pte_frag) count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT; /* We allow PTE_FRAG_NR fragments from a PTE page */ if (atomic_sub_and_test(PTE_FRAG_NR - count, &page->pt_frag_refcount)) { - pgtable_page_dtor(page); + pgtable_pte_page_dtor(page); __free_page(page); } } @@ -61,7 +61,7 @@ static pte_t *__alloc_for_ptecache(struct mm_struct *mm, int kernel) page = alloc_page(PGALLOC_GFP | __GFP_ACCOUNT); if (!page) return NULL; - if (!pgtable_page_ctor(page)) { + if (!pgtable_pte_page_ctor(page)) { __free_page(page); return NULL; } @@ -113,7 +113,7 @@ void pte_fragment_free(unsigned long *table, int kernel) BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0); if (atomic_dec_and_test(&page->pt_frag_refcount)) { if (!kernel) - pgtable_page_dtor(page); + pgtable_pte_page_dtor(page); __free_page(page); } } diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h index f66a00d8cb19..d59ea92285ec 100644 --- a/arch/riscv/include/asm/pgalloc.h +++ b/arch/riscv/include/asm/pgalloc.h @@ -78,7 +78,7 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) #define __pte_free_tlb(tlb, pte, buf) \ do { \ - pgtable_page_dtor(pte); \ + pgtable_pte_page_dtor(pte); \ tlb_remove_page((tlb), pte); \ } while (0) diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 54fcdf66ae96..3dd253f81a77 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -210,7 +210,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm) page = alloc_page(GFP_KERNEL); if (!page) return NULL; - if (!pgtable_page_ctor(page)) { + if (!pgtable_pte_page_ctor(page)) { __free_page(page); return NULL; } @@ -256,7 +256,7 @@ void page_table_free(struct mm_struct *mm, unsigned long *table) atomic_xor_bits(&page->_refcount, 3U << 24); } - pgtable_page_dtor(page); + pgtable_pte_page_dtor(page); __free_page(page); } @@ -308,7 +308,7 @@ void __tlb_remove_table(void *_table) case 3: /* 4K page table with pgstes */ if (mask & 3) atomic_xor_bits(&page->_refcount, 3 << 24); - pgtable_page_dtor(page); + pgtable_pte_page_dtor(page); __free_page(page); break; } diff --git a/arch/sh/include/asm/pgalloc.h b/arch/sh/include/asm/pgalloc.h index 8c6341a4d807..22d968bfe9bb 100644 --- a/arch/sh/include/asm/pgalloc.h +++ b/arch/sh/include/asm/pgalloc.h @@ -29,7 +29,7 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, #define __pte_free_tlb(tlb,pte,addr) \ do { \ - pgtable_page_dtor(pte); \ + pgtable_pte_page_dtor(pte); \ tlb_remove_page((tlb), (pte)); \ } while (0) diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 4b099dd7a767..e6d91819da92 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2903,7 +2903,7 @@ pgtable_t pte_alloc_one(struct mm_struct *mm) struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!page) return NULL; - if (!pgtable_page_ctor(page)) { + if (!pgtable_pte_page_ctor(page)) { free_unref_page(page); return NULL; } @@ -2919,7 +2919,7 @@ static void __pte_free(pgtable_t pte) { struct page *page = virt_to_page(pte); - pgtable_page_dtor(page); + pgtable_pte_page_dtor(page); __free_page(page); } diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c index aaebbc00d262..cc3ad64479ac 100644 --- a/arch/sparc/mm/srmmu.c +++ b/arch/sparc/mm/srmmu.c @@ -378,7 +378,7 @@ pgtable_t pte_alloc_one(struct mm_struct *mm) if ((pte = (unsigned long)pte_alloc_one_kernel(mm)) == 0) return NULL; page = pfn_to_page(__nocache_pa(pte) >> PAGE_SHIFT); - if (!pgtable_page_ctor(page)) { + if (!pgtable_pte_page_ctor(page)) { __free_page(page); return NULL; } @@ -389,7 +389,7 @@ void pte_free(struct mm_struct *mm, pgtable_t pte) { unsigned long p; - pgtable_page_dtor(pte); + pgtable_pte_page_dtor(pte); p = (unsigned long)page_address(pte); /* Cached address (for test) */ if (p == 0) BUG(); diff --git a/arch/um/include/asm/pgalloc.h b/arch/um/include/asm/pgalloc.h index 446e0c0f4018..881e76da1938 100644 --- a/arch/um/include/asm/pgalloc.h +++ b/arch/um/include/asm/pgalloc.h @@ -29,7 +29,7 @@ extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); #define __pte_free_tlb(tlb,pte, address) \ do { \ - pgtable_page_dtor(pte); \ + pgtable_pte_page_dtor(pte); \ tlb_remove_page((tlb),(pte)); \ } while (0) diff --git a/arch/unicore32/include/asm/tlb.h b/arch/unicore32/include/asm/tlb.h index 10d2356bfddd..4663d8cc80ef 100644 --- a/arch/unicore32/include/asm/tlb.h +++ b/arch/unicore32/include/asm/tlb.h @@ -15,7 +15,7 @@ #define __pte_free_tlb(tlb, pte, addr) \ do { \ - pgtable_page_dtor(pte); \ + pgtable_pte_page_dtor(pte); \ tlb_remove_page((tlb), (pte)); \ } while (0) diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 463940faf52f..3e4b9035bb9a 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -45,7 +45,7 @@ early_param("userpte", setup_userpte); void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { - pgtable_page_dtor(pte); + pgtable_pte_page_dtor(pte); paravirt_release_pte(page_to_pfn(pte)); paravirt_tlb_remove_table(tlb, pte); } diff --git a/arch/xtensa/include/asm/pgalloc.h b/arch/xtensa/include/asm/pgalloc.h index dd744aa450fa..1d38f0e755ba 100644 --- a/arch/xtensa/include/asm/pgalloc.h +++ b/arch/xtensa/include/asm/pgalloc.h @@ -55,7 +55,7 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm) if (!pte) return NULL; page = virt_to_page(pte); - if (!pgtable_page_ctor(page)) { + if (!pgtable_pte_page_ctor(page)) { __free_page(page); return NULL; } @@ -69,7 +69,7 @@ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) static inline void pte_free(struct mm_struct *mm, pgtable_t pte) { - pgtable_page_dtor(pte); + pgtable_pte_page_dtor(pte); __free_page(pte); } #define pmd_pgtable(pmd) pmd_page(pmd) diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h index 6f8cc06ee44e..73f7421413cb 100644 --- a/include/asm-generic/pgalloc.h +++ b/include/asm-generic/pgalloc.h @@ -49,7 +49,7 @@ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) * @mm: the mm_struct of the current context * @gfp: GFP flags to use for the allocation * - * Allocates a page and runs the pgtable_page_ctor(). + * Allocates a page and runs the pgtable_pte_page_ctor(). * * This function is intended for architectures that need * anything beyond simple page allocation or must have custom GFP flags. @@ -63,7 +63,7 @@ static inline pgtable_t __pte_alloc_one(struct mm_struct *mm, gfp_t gfp) pte = alloc_page(gfp); if (!pte) return NULL; - if (!pgtable_page_ctor(pte)) { + if (!pgtable_pte_page_ctor(pte)) { __free_page(pte); return NULL; } @@ -76,7 +76,7 @@ static inline pgtable_t __pte_alloc_one(struct mm_struct *mm, gfp_t gfp) * pte_alloc_one - allocate a page for PTE-level user page table * @mm: the mm_struct of the current context * - * Allocates a page and runs the pgtable_page_ctor(). + * Allocates a page and runs the pgtable_pte_page_ctor(). * * Return: `struct page` initialized as page table or %NULL on error */ @@ -98,7 +98,7 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm) */ static inline void pte_free(struct mm_struct *mm, struct page *pte_page) { - pgtable_page_dtor(pte_page); + pgtable_pte_page_dtor(pte_page); __free_page(pte_page); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 294a67b94147..cc292273e6ba 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1949,7 +1949,7 @@ static inline void pgtable_init(void) pgtable_cache_init(); } -static inline bool pgtable_page_ctor(struct page *page) +static inline bool pgtable_pte_page_ctor(struct page *page) { if (!ptlock_init(page)) return false; @@ -1958,7 +1958,7 @@ static inline bool pgtable_page_ctor(struct page *page) return true; } -static inline void pgtable_page_dtor(struct page *page) +static inline void pgtable_pte_page_dtor(struct page *page) { ptlock_free(page); __ClearPageTable(page); From a22fea94992a2bc5328005e62f368413ede49c14 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 26 Sep 2019 07:28:17 -0700 Subject: [PATCH 720/721] arch/sparc/include/asm/pgtable_64.h: fix build A last-minute fixlet which I'd failed to merge at the appropriate time had the predictable effect. Fixes: f672e2c217e2d4b2 ("lib: untag user pointers in strn*_user") Cc: Andrey Konovalov Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc/include/asm/pgtable_64.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 6ec514fe3bef..6ae8016ef4ec 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -1099,7 +1099,7 @@ static inline unsigned long __untagged_addr(unsigned long start) return start; } #define untagged_addr(addr) \ - ((__typeof__(addr))(__untagged_addr((unsigned long)(addr))) + ((__typeof__(addr))(__untagged_addr((unsigned long)(addr)))) static inline bool pte_access_permitted(pte_t pte, bool write) { From 7be3cb019db1cbd5fd5ffe6d64a23fefa4b6f229 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 26 Sep 2019 10:15:25 -0700 Subject: [PATCH 721/721] binfmt_elf: Do not move brk for INTERP-less ET_EXEC When brk was moved for binaries without an interpreter, it should have been limited to ET_DYN only. In other words, the special case was an ET_DYN that lacks an INTERP, not just an executable that lacks INTERP. The bug manifested for giant static executables, where the brk would end up in the middle of the text area on 32-bit architectures. Reported-and-tested-by: Richard Kojedzinszky Fixes: bbdc6076d2e5 ("binfmt_elf: move brk out of mmap when doing direct loader exec") Cc: stable@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index cec3b4146440..ad4c6b1d5074 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1121,7 +1121,8 @@ static int load_elf_binary(struct linux_binprm *bprm) * (since it grows up, and may collide early with the stack * growing down), and into the unused ELF_ET_DYN_BASE region. */ - if (IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) && !interpreter) + if (IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) && + loc->elf_ex.e_type == ET_DYN && !interpreter) current->mm->brk = current->mm->start_brk = ELF_ET_DYN_BASE;