From 26db22a6397b62b34d4e8abecc56d54496a0ec32 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 4 Sep 2023 12:34:40 -0400 Subject: [PATCH 001/228] NFS: Use the correct commit info in nfs_join_page_group() [ Upstream commit b193a78ddb5ee7dba074d3f28dc050069ba083c0 ] Ensure that nfs_clear_request_commit() updates the correct counters when it removes them from the commit list. Fixes: ed5d588fe47f ("NFS: Try to join page groups before an O_DIRECT retransmission") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker Signed-off-by: Sasha Levin --- fs/nfs/direct.c | 8 +++++--- fs/nfs/write.c | 23 ++++++++++++----------- include/linux/nfs_page.h | 4 +++- 3 files changed, 20 insertions(+), 15 deletions(-) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 018af6ec97b4..5d86ffa72cea 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -525,7 +525,9 @@ static void nfs_direct_add_page_head(struct list_head *list, kref_get(&head->wb_kref); } -static void nfs_direct_join_group(struct list_head *list, struct inode *inode) +static void nfs_direct_join_group(struct list_head *list, + struct nfs_commit_info *cinfo, + struct inode *inode) { struct nfs_page *req, *subreq; @@ -547,7 +549,7 @@ static void nfs_direct_join_group(struct list_head *list, struct inode *inode) nfs_release_request(subreq); } } while ((subreq = subreq->wb_this_page) != req); - nfs_join_page_group(req, inode); + nfs_join_page_group(req, cinfo, inode); } } @@ -573,7 +575,7 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) nfs_init_cinfo_from_dreq(&cinfo, dreq); nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo); - nfs_direct_join_group(&reqs, dreq->inode); + nfs_direct_join_group(&reqs, &cinfo, dreq->inode); dreq->count = 0; dreq->max_count = 0; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index dc08a0c02f09..d3cd099ffb6e 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -58,7 +58,8 @@ static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops; static const struct nfs_commit_completion_ops nfs_commit_completion_ops; static const struct nfs_rw_ops nfs_rw_write_ops; static void nfs_inode_remove_request(struct nfs_page *req); -static void nfs_clear_request_commit(struct nfs_page *req); +static void nfs_clear_request_commit(struct nfs_commit_info *cinfo, + struct nfs_page *req); static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo, struct inode *inode); static struct nfs_page * @@ -500,8 +501,8 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list, * the (former) group. All subrequests are removed from any write or commit * lists, unlinked from the group and destroyed. 
*/ -void -nfs_join_page_group(struct nfs_page *head, struct inode *inode) +void nfs_join_page_group(struct nfs_page *head, struct nfs_commit_info *cinfo, + struct inode *inode) { struct nfs_page *subreq; struct nfs_page *destroy_list = NULL; @@ -531,7 +532,7 @@ nfs_join_page_group(struct nfs_page *head, struct inode *inode) * Commit list removal accounting is done after locks are dropped */ subreq = head; do { - nfs_clear_request_commit(subreq); + nfs_clear_request_commit(cinfo, subreq); subreq = subreq->wb_this_page; } while (subreq != head); @@ -565,8 +566,10 @@ nfs_lock_and_join_requests(struct page *page) { struct inode *inode = page_file_mapping(page)->host; struct nfs_page *head; + struct nfs_commit_info cinfo; int ret; + nfs_init_cinfo_from_inode(&cinfo, inode); /* * A reference is taken only on the head request which acts as a * reference to the whole page group - the group will not be destroyed @@ -583,7 +586,7 @@ nfs_lock_and_join_requests(struct page *page) return ERR_PTR(ret); } - nfs_join_page_group(head, inode); + nfs_join_page_group(head, &cinfo, inode); return head; } @@ -944,18 +947,16 @@ nfs_clear_page_commit(struct page *page) } /* Called holding the request lock on @req */ -static void -nfs_clear_request_commit(struct nfs_page *req) +static void nfs_clear_request_commit(struct nfs_commit_info *cinfo, + struct nfs_page *req) { if (test_bit(PG_CLEAN, &req->wb_flags)) { struct nfs_open_context *ctx = nfs_req_openctx(req); struct inode *inode = d_inode(ctx->dentry); - struct nfs_commit_info cinfo; - nfs_init_cinfo_from_inode(&cinfo, inode); mutex_lock(&NFS_I(inode)->commit_mutex); - if (!pnfs_clear_request_commit(req, &cinfo)) { - nfs_request_remove_commit_list(req, &cinfo); + if (!pnfs_clear_request_commit(req, cinfo)) { + nfs_request_remove_commit_list(req, cinfo); } mutex_unlock(&NFS_I(inode)->commit_mutex); nfs_clear_page_commit(req->wb_page); diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index f0373a6cb5fb..40aa09a21f75 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -145,7 +145,9 @@ extern void nfs_unlock_request(struct nfs_page *req); extern void nfs_unlock_and_release_request(struct nfs_page *); extern struct nfs_page *nfs_page_group_lock_head(struct nfs_page *req); extern int nfs_page_group_lock_subrequests(struct nfs_page *head); -extern void nfs_join_page_group(struct nfs_page *head, struct inode *inode); +extern void nfs_join_page_group(struct nfs_page *head, + struct nfs_commit_info *cinfo, + struct inode *inode); extern int nfs_page_group_lock(struct nfs_page *); extern void nfs_page_group_unlock(struct nfs_page *); extern bool nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int); From 25ae2b2fdb1239a90f42933a0f75f868d271bd98 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 4 Sep 2023 12:43:58 -0400 Subject: [PATCH 002/228] NFS/pNFS: Report EINVAL errors from connect() to the server [ Upstream commit dd7d7ee3ba2a70d12d02defb478790cf57d5b87b ] With IPv6, connect() can occasionally return EINVAL if a route is unavailable. If this happens during I/O to a data server, we want to report it using LAYOUTERROR as an inability to connect. 
Fixes: dd52128afdde ("NFSv4.1/pnfs Ensure flexfiles reports all connection related errors") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker Signed-off-by: Sasha Levin --- fs/nfs/flexfilelayout/flexfilelayout.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index a8a02081942d..e4f2820ba5a5 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1240,6 +1240,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, case -EPFNOSUPPORT: case -EPROTONOSUPPORT: case -EOPNOTSUPP: + case -EINVAL: case -ECONNREFUSED: case -ECONNRESET: case -EHOSTDOWN: From 3f7df02fa0d4c688ecd073e9612f52f5532d0b46 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 4 Sep 2023 12:50:09 -0400 Subject: [PATCH 003/228] SUNRPC: Mark the cred for revalidation if the server rejects it [ Upstream commit 611fa42dfa9d2f3918ac5f4dd5705dfad81b323d ] If the server rejects the credential as being stale, or bad, then we should mark it for revalidation before retransmitting. Fixes: 7f5667a5f8c4 ("SUNRPC: Clean up rpc_verify_header()") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker Signed-off-by: Sasha Levin --- net/sunrpc/clnt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index e1ce0f261f0b..e9a3fca4aedc 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2630,6 +2630,7 @@ rpc_decode_header(struct rpc_task *task, struct xdr_stream *xdr) case rpc_autherr_rejectedverf: case rpcsec_gsserr_credproblem: case rpcsec_gsserr_ctxproblem: + rpcauth_invalcred(task); if (!task->tk_cred_retry) break; task->tk_cred_retry--; From 407bf1c140f0757706c0b28604bcc90837d45ce2 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 6 Sep 2023 22:47:12 -0400 Subject: [PATCH 004/228] tracing: Increase trace array ref count on enable and filter files [ Upstream commit f5ca233e2e66dc1c249bf07eefa37e34a6c9346a ] When the trace event enable and filter files are opened, increment the trace array ref counter, otherwise they can be accessed when the trace array is being deleted. The ref counter keeps the trace array from being deleted while those files are opened. Link: https://lkml.kernel.org/r/20230907024803.456187066@goodmis.org Link: https://lore.kernel.org/all/1cb3aee2-19af-c472-e265-05176fe9bd84@huawei.com/ Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Andrew Morton Fixes: 8530dec63e7b4 ("tracing: Add tracing_check_open_get_tr()") Tested-by: Linux Kernel Functional Testing Tested-by: Naresh Kamboju Reported-by: Zheng Yejian Signed-off-by: Steven Rostedt (Google) Signed-off-by: Sasha Levin --- kernel/trace/trace.c | 27 +++++++++++++++++++++++++++ kernel/trace/trace.h | 2 ++ kernel/trace/trace_events.c | 6 ++++-- 3 files changed, 33 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7a64c0cd8819..196eec0423ff 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4559,6 +4559,33 @@ int tracing_open_generic_tr(struct inode *inode, struct file *filp) return 0; } +/* + * The private pointer of the inode is the trace_event_file. + * Update the tr ref count associated to it. 
+ */ +int tracing_open_file_tr(struct inode *inode, struct file *filp) +{ + struct trace_event_file *file = inode->i_private; + int ret; + + ret = tracing_check_open_get_tr(file->tr); + if (ret) + return ret; + + filp->private_data = inode->i_private; + + return 0; +} + +int tracing_release_file_tr(struct inode *inode, struct file *filp) +{ + struct trace_event_file *file = inode->i_private; + + trace_array_put(file->tr); + + return 0; +} + static int tracing_release(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index dfde855dafda..7fa00b83dfa4 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -730,6 +730,8 @@ void tracing_reset_all_online_cpus(void); void tracing_reset_all_online_cpus_unlocked(void); int tracing_open_generic(struct inode *inode, struct file *filp); int tracing_open_generic_tr(struct inode *inode, struct file *filp); +int tracing_open_file_tr(struct inode *inode, struct file *filp); +int tracing_release_file_tr(struct inode *inode, struct file *filp); bool tracing_is_disabled(void); bool tracer_tracing_is_on(struct trace_array *tr); void tracer_tracing_on(struct trace_array *tr); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index a46d34d840f6..321cfda1b333 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1855,9 +1855,10 @@ static const struct file_operations ftrace_set_event_notrace_pid_fops = { }; static const struct file_operations ftrace_enable_fops = { - .open = tracing_open_generic, + .open = tracing_open_file_tr, .read = event_enable_read, .write = event_enable_write, + .release = tracing_release_file_tr, .llseek = default_llseek, }; @@ -1874,9 +1875,10 @@ static const struct file_operations ftrace_event_id_fops = { }; static const struct file_operations ftrace_event_filter_fops = { - .open = tracing_open_generic, + .open = tracing_open_file_tr, .read = event_filter_read, .write = event_filter_write, + .release = tracing_release_file_tr, .llseek = default_llseek, }; From 11d15a115c905034cd6842d55158ca60cb9e39ff Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 21 Dec 2021 08:20:47 +0100 Subject: [PATCH 005/228] ata: ahci: Drop pointless VPRINTK() calls and convert the remaining ones [ Upstream commit 93c7711494f47f9c829321e2a8711671b02f6e4c ] Drop pointless VPRINTK() calls for entering and existing interrupt routines and convert the remaining calls to dev_dbg(). 
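For illustration only, a minimal sketch of the conversion pattern this patch applies (the helper name and the PCI context here are hypothetical; the register read mirrors the hunks below): messages that used to go through the driver-local VPRINTK() macro now go through dev_dbg(), which ties them to a struct device and is controlled by dynamic debug, while the bare ENTER/EXIT calls are dropped outright rather than converted.

    #include <linux/device.h>
    #include <linux/pci.h>

    /* Hypothetical helper, not part of the patch: shows the conversion. */
    static void example_log_port_irq(struct pci_dev *pdev, u32 tmp)
    {
            /* before: VPRINTK("PORT_IRQ_STAT 0x%x\n", tmp); */
            dev_dbg(&pdev->dev, "PORT_IRQ_STAT 0x%x\n", tmp);
    }
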
Signed-off-by: Hannes Reinecke Signed-off-by: Damien Le Moal Stable-dep-of: 737dd811a3db ("ata: libahci: clear pending interrupt status") Signed-off-by: Sasha Levin --- drivers/ata/ahci.c | 4 +--- drivers/ata/ahci_xgene.c | 4 ---- drivers/ata/libahci.c | 18 ++++-------------- 3 files changed, 5 insertions(+), 21 deletions(-) diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index bf949f7da483..d831a80c25f0 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -703,7 +703,7 @@ static void ahci_pci_init_controller(struct ata_host *host) /* clear port IRQ */ tmp = readl(port_mmio + PORT_IRQ_STAT); - VPRINTK("PORT_IRQ_STAT 0x%x\n", tmp); + dev_dbg(&pdev->dev, "PORT_IRQ_STAT 0x%x\n", tmp); if (tmp) writel(tmp, port_mmio + PORT_IRQ_STAT); } @@ -1495,7 +1495,6 @@ static irqreturn_t ahci_thunderx_irq_handler(int irq, void *dev_instance) u32 irq_stat, irq_masked; unsigned int handled = 1; - VPRINTK("ENTER\n"); hpriv = host->private_data; mmio = hpriv->mmio; irq_stat = readl(mmio + HOST_IRQ_STAT); @@ -1512,7 +1511,6 @@ static irqreturn_t ahci_thunderx_irq_handler(int irq, void *dev_instance) irq_stat = readl(mmio + HOST_IRQ_STAT); spin_unlock(&host->lock); } while (irq_stat); - VPRINTK("EXIT\n"); return IRQ_RETVAL(handled); } diff --git a/drivers/ata/ahci_xgene.c b/drivers/ata/ahci_xgene.c index 16246c843365..e0f0577ac191 100644 --- a/drivers/ata/ahci_xgene.c +++ b/drivers/ata/ahci_xgene.c @@ -588,8 +588,6 @@ static irqreturn_t xgene_ahci_irq_intr(int irq, void *dev_instance) void __iomem *mmio; u32 irq_stat, irq_masked; - VPRINTK("ENTER\n"); - hpriv = host->private_data; mmio = hpriv->mmio; @@ -612,8 +610,6 @@ static irqreturn_t xgene_ahci_irq_intr(int irq, void *dev_instance) spin_unlock(&host->lock); - VPRINTK("EXIT\n"); - return IRQ_RETVAL(rc); } diff --git a/drivers/ata/libahci.c b/drivers/ata/libahci.c index fec2e9754aed..08c4b641691b 100644 --- a/drivers/ata/libahci.c +++ b/drivers/ata/libahci.c @@ -1215,12 +1215,12 @@ static void ahci_port_init(struct device *dev, struct ata_port *ap, /* clear SError */ tmp = readl(port_mmio + PORT_SCR_ERR); - VPRINTK("PORT_SCR_ERR 0x%x\n", tmp); + dev_dbg(dev, "PORT_SCR_ERR 0x%x\n", tmp); writel(tmp, port_mmio + PORT_SCR_ERR); /* clear port IRQ */ tmp = readl(port_mmio + PORT_IRQ_STAT); - VPRINTK("PORT_IRQ_STAT 0x%x\n", tmp); + dev_dbg(dev, "PORT_IRQ_STAT 0x%x\n", tmp); if (tmp) writel(tmp, port_mmio + PORT_IRQ_STAT); @@ -1251,10 +1251,10 @@ void ahci_init_controller(struct ata_host *host) } tmp = readl(mmio + HOST_CTL); - VPRINTK("HOST_CTL 0x%x\n", tmp); + dev_dbg(host->dev, "HOST_CTL 0x%x\n", tmp); writel(tmp | HOST_IRQ_EN, mmio + HOST_CTL); tmp = readl(mmio + HOST_CTL); - VPRINTK("HOST_CTL 0x%x\n", tmp); + dev_dbg(host->dev, "HOST_CTL 0x%x\n", tmp); } EXPORT_SYMBOL_GPL(ahci_init_controller); @@ -1905,8 +1905,6 @@ static irqreturn_t ahci_multi_irqs_intr_hard(int irq, void *dev_instance) void __iomem *port_mmio = ahci_port_base(ap); u32 status; - VPRINTK("ENTER\n"); - status = readl(port_mmio + PORT_IRQ_STAT); writel(status, port_mmio + PORT_IRQ_STAT); @@ -1914,8 +1912,6 @@ static irqreturn_t ahci_multi_irqs_intr_hard(int irq, void *dev_instance) ahci_handle_port_interrupt(ap, port_mmio, status); spin_unlock(ap->lock); - VPRINTK("EXIT\n"); - return IRQ_HANDLED; } @@ -1932,9 +1928,7 @@ u32 ahci_handle_port_intr(struct ata_host *host, u32 irq_masked) ap = host->ports[i]; if (ap) { ahci_port_intr(ap); - VPRINTK("port %u\n", i); } else { - VPRINTK("port %u (no irq)\n", i); if (ata_ratelimit()) dev_warn(host->dev, "interrupt on disabled port %u\n", i); 
@@ -1955,8 +1949,6 @@ static irqreturn_t ahci_single_level_irq_intr(int irq, void *dev_instance) void __iomem *mmio; u32 irq_stat, irq_masked; - VPRINTK("ENTER\n"); - hpriv = host->private_data; mmio = hpriv->mmio; @@ -1984,8 +1976,6 @@ static irqreturn_t ahci_single_level_irq_intr(int irq, void *dev_instance) spin_unlock(&host->lock); - VPRINTK("EXIT\n"); - return IRQ_RETVAL(rc); } From 69c966583022fc3d1e52da717d9942a55907bff4 Mon Sep 17 00:00:00 2001 From: Szuying Chen Date: Thu, 7 Sep 2023 16:17:10 +0800 Subject: [PATCH 006/228] ata: libahci: clear pending interrupt status [ Upstream commit 737dd811a3dbfd7edd4ad2ba5152e93d99074f83 ] When a CRC error occurs, the HBA asserts an interrupt to indicate an interface fatal error (PxIS.IFS). The ISR clears PxIE and PxIS, then does error recovery. If the adapter receives another SDB FIS with an error (PxIS.TFES) from the device before the start of the EH recovery process, the interrupt signaling the new SDB cannot be serviced as PxIE was cleared already. This in turn results in the HBA inability to issue any command during the error recovery process after setting PxCMD.ST to 1 because PxIS.TFES is still set. According to AHCI 1.3.1 specifications section 6.2.2, fatal errors notified by setting PxIS.HBFS, PxIS.HBDS, PxIS.IFS or PxIS.TFES will cause the HBA to enter the ERR:Fatal state. In this state, the HBA shall not issue any new commands. To avoid this situation, introduce the function ahci_port_clear_pending_irq() to clear pending interrupts before executing a COMRESET. This follows the AHCI 1.3.1 - section 6.2.2.2 specification. Signed-off-by: Szuying Chen Fixes: e0bfd149973d ("[PATCH] ahci: stop engine during hard reset") Cc: stable@vger.kernel.org Reviewed-by: Niklas Cassel Signed-off-by: Damien Le Moal Signed-off-by: Sasha Levin --- drivers/ata/libahci.c | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/drivers/ata/libahci.c b/drivers/ata/libahci.c index 08c4b641691b..e188850f65ff 100644 --- a/drivers/ata/libahci.c +++ b/drivers/ata/libahci.c @@ -1199,6 +1199,26 @@ static ssize_t ahci_activity_show(struct ata_device *dev, char *buf) return sprintf(buf, "%d\n", emp->blink_policy); } +static void ahci_port_clear_pending_irq(struct ata_port *ap) +{ + struct ahci_host_priv *hpriv = ap->host->private_data; + void __iomem *port_mmio = ahci_port_base(ap); + u32 tmp; + + /* clear SError */ + tmp = readl(port_mmio + PORT_SCR_ERR); + dev_dbg(ap->host->dev, "PORT_SCR_ERR 0x%x\n", tmp); + writel(tmp, port_mmio + PORT_SCR_ERR); + + /* clear port IRQ */ + tmp = readl(port_mmio + PORT_IRQ_STAT); + dev_dbg(ap->host->dev, "PORT_IRQ_STAT 0x%x\n", tmp); + if (tmp) + writel(tmp, port_mmio + PORT_IRQ_STAT); + + writel(1 << ap->port_no, hpriv->mmio + HOST_IRQ_STAT); +} + static void ahci_port_init(struct device *dev, struct ata_port *ap, int port_no, void __iomem *mmio, void __iomem *port_mmio) @@ -1213,18 +1233,7 @@ static void ahci_port_init(struct device *dev, struct ata_port *ap, if (rc) dev_warn(dev, "%s (%d)\n", emsg, rc); - /* clear SError */ - tmp = readl(port_mmio + PORT_SCR_ERR); - dev_dbg(dev, "PORT_SCR_ERR 0x%x\n", tmp); - writel(tmp, port_mmio + PORT_SCR_ERR); - - /* clear port IRQ */ - tmp = readl(port_mmio + PORT_IRQ_STAT); - dev_dbg(dev, "PORT_IRQ_STAT 0x%x\n", tmp); - if (tmp) - writel(tmp, port_mmio + PORT_IRQ_STAT); - - writel(1 << port_no, mmio + HOST_IRQ_STAT); + ahci_port_clear_pending_irq(ap); /* mark esata ports */ tmp = readl(port_mmio + PORT_CMD); @@ -1554,6 +1563,8 @@ int 
ahci_do_hardreset(struct ata_link *link, unsigned int *class, tf.command = ATA_BUSY; ata_tf_to_fis(&tf, 0, 0, d2h_fis); + ahci_port_clear_pending_irq(ap); + rc = sata_link_hardreset(link, timing, deadline, online, ahci_check_ready); From 6f5fc957dfb7b2665c8b82c400dec3a38e961a9a Mon Sep 17 00:00:00 2001 From: Wang Jianchao Date: Sat, 24 Jul 2021 15:41:20 +0800 Subject: [PATCH 007/228] ext4: remove the 'group' parameter of ext4_trim_extent [ Upstream commit bd2eea8d0a6b6a9aca22f20bf74f73b71d8808af ] Get rid of the 'group' parameter of ext4_trim_extent as we can get it from the 'e4b'. Reviewed-by: Andreas Dilger Signed-off-by: Wang Jianchao Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20210724074124.25731-2-jianchao.wan9@gmail.com Signed-off-by: Theodore Ts'o Stable-dep-of: 45e4ab320c9b ("ext4: move setting of trimmed bit into ext4_try_to_trim_range()") Signed-off-by: Sasha Levin --- fs/ext4/mballoc.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 2f6ed59d81f0..342225c1cf33 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -5859,19 +5859,19 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, * @sb: super block for the file system * @start: starting block of the free extent in the alloc. group * @count: number of blocks to TRIM - * @group: alloc. group we are working with * @e4b: ext4 buddy for the group * * Trim "count" blocks starting at "start" in the "group". To assure that no * one will allocate those blocks, mark it as used in buddy bitmap. This must * be called with under the group lock. */ -static int ext4_trim_extent(struct super_block *sb, int start, int count, - ext4_group_t group, struct ext4_buddy *e4b) +static int ext4_trim_extent(struct super_block *sb, + int start, int count, struct ext4_buddy *e4b) __releases(bitlock) __acquires(bitlock) { struct ext4_free_extent ex; + ext4_group_t group = e4b->bd_group; int ret = 0; trace_ext4_trim_extent(sb, group, start, count); @@ -5947,8 +5947,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, next = mb_find_next_bit(bitmap, max + 1, start); if ((next - start) >= minblocks) { - ret = ext4_trim_extent(sb, start, - next - start, group, &e4b); + ret = ext4_trim_extent(sb, start, next - start, &e4b); if (ret && ret != -EOPNOTSUPP) break; ret = 0; From da1895f731f3d1fb7d79c3812d88ef1beadd30f2 Mon Sep 17 00:00:00 2001 From: Wang Jianchao Date: Sat, 24 Jul 2021 15:41:21 +0800 Subject: [PATCH 008/228] ext4: add new helper interface ext4_try_to_trim_range() [ Upstream commit 6920b3913235f517728bb69abe9b39047a987113 ] There is no functional change in this patch but just split the codes, which serachs free block and does trim, into a new function ext4_try_to_trim_range. This is preparing for the following async backgroup discard. 
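As a rough sketch of the resulting call shape (simplified from the hunks below; buddy loading, the trimmed-bit check and error handling are omitted), the caller keeps holding the group lock and delegates the scan-and-trim loop to the new helper, which returns the number of trimmed clusters or a negative error:

    ext4_lock_group(sb, group);
    cnt = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks);
    ext4_unlock_group(sb, group);
    ext4_mb_unload_buddy(&e4b);
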
Reviewed-by: Andreas Dilger Signed-off-by: Wang Jianchao Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20210724074124.25731-3-jianchao.wan9@gmail.com Signed-off-by: Theodore Ts'o Stable-dep-of: 45e4ab320c9b ("ext4: move setting of trimmed bit into ext4_try_to_trim_range()") Signed-off-by: Sasha Levin --- fs/ext4/mballoc.c | 102 ++++++++++++++++++++++++++-------------------- 1 file changed, 57 insertions(+), 45 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 342225c1cf33..54c718d47197 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -5894,6 +5894,54 @@ __acquires(bitlock) return ret; } +static int ext4_try_to_trim_range(struct super_block *sb, + struct ext4_buddy *e4b, ext4_grpblk_t start, + ext4_grpblk_t max, ext4_grpblk_t minblocks) +{ + ext4_grpblk_t next, count, free_count; + void *bitmap; + int ret = 0; + + bitmap = e4b->bd_bitmap; + start = (e4b->bd_info->bb_first_free > start) ? + e4b->bd_info->bb_first_free : start; + count = 0; + free_count = 0; + + while (start <= max) { + start = mb_find_next_zero_bit(bitmap, max + 1, start); + if (start > max) + break; + next = mb_find_next_bit(bitmap, max + 1, start); + + if ((next - start) >= minblocks) { + ret = ext4_trim_extent(sb, start, next - start, e4b); + if (ret && ret != -EOPNOTSUPP) + break; + ret = 0; + count += next - start; + } + free_count += next - start; + start = next + 1; + + if (fatal_signal_pending(current)) { + count = -ERESTARTSYS; + break; + } + + if (need_resched()) { + ext4_unlock_group(sb, e4b->bd_group); + cond_resched(); + ext4_lock_group(sb, e4b->bd_group); + } + + if ((e4b->bd_info->bb_free - free_count) < minblocks) + break; + } + + return count; +} + /** * ext4_trim_all_free -- function to trim all free space in alloc. group * @sb: super block for file system @@ -5917,10 +5965,8 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks) { - void *bitmap; - ext4_grpblk_t next, count = 0, free_count = 0; struct ext4_buddy e4b; - int ret = 0; + int ret; trace_ext4_trim_all_free(sb, group, start, max); @@ -5930,57 +5976,23 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, ret, group); return ret; } - bitmap = e4b.bd_bitmap; ext4_lock_group(sb, group); - if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) && - minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks)) - goto out; - start = (e4b.bd_info->bb_first_free > start) ? 
- e4b.bd_info->bb_first_free : start; - - while (start <= max) { - start = mb_find_next_zero_bit(bitmap, max + 1, start); - if (start > max) - break; - next = mb_find_next_bit(bitmap, max + 1, start); - - if ((next - start) >= minblocks) { - ret = ext4_trim_extent(sb, start, next - start, &e4b); - if (ret && ret != -EOPNOTSUPP) - break; - ret = 0; - count += next - start; - } - free_count += next - start; - start = next + 1; - - if (fatal_signal_pending(current)) { - count = -ERESTARTSYS; - break; - } - - if (need_resched()) { - ext4_unlock_group(sb, group); - cond_resched(); - ext4_lock_group(sb, group); - } - - if ((e4b.bd_info->bb_free - free_count) < minblocks) - break; + if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) || + minblocks < atomic_read(&EXT4_SB(sb)->s_last_trim_minblks)) { + ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks); + if (ret >= 0) + EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); + } else { + ret = 0; } - if (!ret) { - ret = count; - EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); - } -out: ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); ext4_debug("trimmed %d blocks in the group %d\n", - count, group); + ret, group); return ret; } From 24a86315a3533d3c5fc6efcf56635484970561ee Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Fri, 20 Aug 2021 14:08:53 +0200 Subject: [PATCH 009/228] ext4: scope ret locally in ext4_try_to_trim_range() [ Upstream commit afcc4e32f606dbfb47aa7309172c89174b86e74c ] As commit 6920b3913235 ("ext4: add new helper interface ext4_try_to_trim_range()") moves some code into the separate function ext4_try_to_trim_range(), the use of the variable ret within that function is more limited and can be adjusted as well. Scope the use of the variable ret locally and drop dead assignments. No functional change. Signed-off-by: Lukas Bulwahn Link: https://lore.kernel.org/r/20210820120853.23134-1-lukas.bulwahn@gmail.com Signed-off-by: Theodore Ts'o Stable-dep-of: 45e4ab320c9b ("ext4: move setting of trimmed bit into ext4_try_to_trim_range()") Signed-off-by: Sasha Levin --- fs/ext4/mballoc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 54c718d47197..e0428cec6ff6 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -5900,7 +5900,6 @@ static int ext4_try_to_trim_range(struct super_block *sb, { ext4_grpblk_t next, count, free_count; void *bitmap; - int ret = 0; bitmap = e4b->bd_bitmap; start = (e4b->bd_info->bb_first_free > start) ? @@ -5915,10 +5914,10 @@ static int ext4_try_to_trim_range(struct super_block *sb, next = mb_find_next_bit(bitmap, max + 1, start); if ((next - start) >= minblocks) { - ret = ext4_trim_extent(sb, start, next - start, e4b); + int ret = ext4_trim_extent(sb, start, next - start, e4b); + if (ret && ret != -EOPNOTSUPP) break; - ret = 0; count += next - start; } free_count += next - start; From 5eaf4a1e06cf80240f7b62858eacc703ca500dba Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Wed, 3 Nov 2021 15:51:21 +0100 Subject: [PATCH 010/228] ext4: change s_last_trim_minblks type to unsigned long [ Upstream commit 2327fb2e23416cfb2795ccca2f77d4d65925be99 ] There is no good reason for the s_last_trim_minblks to be atomic. There is no data integrity needed and there is no real danger in setting and reading it in a racy manner. Change it to be unsigned long, the same type as s_clusters_per_group which is the maximum that's allowed. 
Signed-off-by: Lukas Czerner Suggested-by: Andreas Dilger Reviewed-by: Andreas Dilger Link: https://lore.kernel.org/r/20211103145122.17338-1-lczerner@redhat.com Signed-off-by: Theodore Ts'o Stable-dep-of: 45e4ab320c9b ("ext4: move setting of trimmed bit into ext4_try_to_trim_range()") Signed-off-by: Sasha Levin --- fs/ext4/ext4.h | 2 +- fs/ext4/mballoc.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index c3e9cb503763..fec021e6bb60 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1580,7 +1580,7 @@ struct ext4_sb_info { struct task_struct *s_mmp_tsk; /* record the last minlen when FITRIM is called. */ - atomic_t s_last_trim_minblks; + unsigned long s_last_trim_minblks; /* Reference to checksum algorithm driver via cryptoapi */ struct crypto_shash *s_chksum_driver; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index e0428cec6ff6..7bc17eb5ea74 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -5979,7 +5979,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, ext4_lock_group(sb, group); if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) || - minblocks < atomic_read(&EXT4_SB(sb)->s_last_trim_minblks)) { + minblocks < EXT4_SB(sb)->s_last_trim_minblks) { ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks); if (ret >= 0) EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); @@ -6090,7 +6090,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) } if (!ret) - atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen); + EXT4_SB(sb)->s_last_trim_minblks = minlen; out: range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits; From cbf6a0f65404baa967806ac7ef402297dc88333c Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Sun, 17 Apr 2022 20:03:15 +0300 Subject: [PATCH 011/228] ext4: mark group as trimmed only if it was fully scanned [ Upstream commit d63c00ea435a5352f486c259665a4ced60399421 ] Otherwise nonaligned fstrim calls will works inconveniently for iterative scanners, for example: // trim [0,16MB] for group-1, but mark full group as trimmed fstrim -o $((1024*1024*128)) -l $((1024*1024*16)) ./m // handle [16MB,16MB] for group-1, do nothing because group already has the flag. fstrim -o $((1024*1024*144)) -l $((1024*1024*16)) ./m [ Update function documentation for ext4_trim_all_free -- TYT ] Signed-off-by: Dmitry Monakhov Link: https://lore.kernel.org/r/1650214995-860245-1-git-send-email-dmtrmonakhov@yandex-team.ru Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Stable-dep-of: 45e4ab320c9b ("ext4: move setting of trimmed bit into ext4_try_to_trim_range()") Signed-off-by: Sasha Levin --- fs/ext4/mballoc.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 7bc17eb5ea74..5c650e28dcb6 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -5948,6 +5948,7 @@ static int ext4_try_to_trim_range(struct super_block *sb, * @start: first group block to examine * @max: last group block to examine * @minblocks: minimum extent block count + * @set_trimmed: set the trimmed flag if at least one block is trimmed * * ext4_trim_all_free walks through group's buddy bitmap searching for free * extents. 
When the free block is found, ext4_trim_extent is called to TRIM @@ -5962,7 +5963,7 @@ static int ext4_try_to_trim_range(struct super_block *sb, static ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t max, - ext4_grpblk_t minblocks) + ext4_grpblk_t minblocks, bool set_trimmed) { struct ext4_buddy e4b; int ret; @@ -5981,7 +5982,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) || minblocks < EXT4_SB(sb)->s_last_trim_minblks) { ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks); - if (ret >= 0) + if (ret >= 0 && set_trimmed) EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); } else { ret = 0; @@ -6018,6 +6019,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) ext4_fsblk_t first_data_blk = le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es); + bool whole_group, eof = false; int ret = 0; start = range->start >> sb->s_blocksize_bits; @@ -6036,8 +6038,10 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) if (minlen > EXT4_CLUSTERS_PER_GROUP(sb)) goto out; } - if (end >= max_blks) + if (end >= max_blks - 1) { end = max_blks - 1; + eof = true; + } if (end <= first_data_blk) goto out; if (start < first_data_blk) @@ -6051,6 +6055,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) /* end now represents the last cluster to discard in this group */ end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; + whole_group = true; for (group = first_group; group <= last_group; group++) { grp = ext4_get_group_info(sb, group); @@ -6069,12 +6074,13 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) * change it for the last group, note that last_cluster is * already computed earlier by ext4_get_group_no_and_offset() */ - if (group == last_group) + if (group == last_group) { end = last_cluster; - + whole_group = eof ? true : end == EXT4_CLUSTERS_PER_GROUP(sb) - 1; + } if (grp->bb_free >= minlen) { cnt = ext4_trim_all_free(sb, group, first_cluster, - end, minlen); + end, minlen, whole_group); if (cnt < 0) { ret = cnt; break; From e78e9f08a24e2fdc3dd23a8c03ff12860a46aa2d Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 1 Aug 2023 22:32:00 +0800 Subject: [PATCH 012/228] ext4: replace the traditional ternary conditional operator with with max()/min() [ Upstream commit de8bf0e5ee7482585450357c6d4eddec8efc5cb7 ] Replace the traditional ternary conditional operator with with max()/min() Signed-off-by: Kemeng Shi Reviewed-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/20230801143204.2284343-7-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o Stable-dep-of: 45e4ab320c9b ("ext4: move setting of trimmed bit into ext4_try_to_trim_range()") Signed-off-by: Sasha Levin --- fs/ext4/mballoc.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 5c650e28dcb6..c06f304f38ed 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -5902,8 +5902,7 @@ static int ext4_try_to_trim_range(struct super_block *sb, void *bitmap; bitmap = e4b->bd_bitmap; - start = (e4b->bd_info->bb_first_free > start) ? - e4b->bd_info->bb_first_free : start; + start = max(e4b->bd_info->bb_first_free, start); count = 0; free_count = 0; @@ -6125,8 +6124,7 @@ ext4_mballoc_query_range( ext4_lock_group(sb, group); - start = (e4b.bd_info->bb_first_free > start) ? 
- e4b.bd_info->bb_first_free : start; + start = max(e4b.bd_info->bb_first_free, start); if (end >= EXT4_CLUSTERS_PER_GROUP(sb)) end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; From c502b09d9befc39b483ec9e8dfb54754d594b779 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 13 Sep 2023 17:04:54 +0200 Subject: [PATCH 013/228] ext4: move setting of trimmed bit into ext4_try_to_trim_range() [ Upstream commit 45e4ab320c9b5fa67b1fc3b6a9b381cfcc0c8488 ] Currently we set the group's trimmed bit in ext4_trim_all_free() based on return value of ext4_try_to_trim_range(). However when we will want to abort trimming because of suspend attempt, we want to return success from ext4_try_to_trim_range() but not set the trimmed bit. Instead implementing awkward propagation of this information, just move setting of trimmed bit into ext4_try_to_trim_range() when the whole group is trimmed. Cc: stable@kernel.org Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230913150504.9054-1-jack@suse.cz Signed-off-by: Theodore Ts'o Signed-off-by: Sasha Levin --- fs/ext4/mballoc.c | 46 +++++++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index c06f304f38ed..2907bf57744a 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -5894,14 +5894,27 @@ __acquires(bitlock) return ret; } +static ext4_grpblk_t ext4_last_grp_cluster(struct super_block *sb, + ext4_group_t grp) +{ + if (grp < ext4_get_groups_count(sb)) + return EXT4_CLUSTERS_PER_GROUP(sb) - 1; + return (ext4_blocks_count(EXT4_SB(sb)->s_es) - + ext4_group_first_block_no(sb, grp) - 1) >> + EXT4_CLUSTER_BITS(sb); +} + static int ext4_try_to_trim_range(struct super_block *sb, struct ext4_buddy *e4b, ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks) { ext4_grpblk_t next, count, free_count; + bool set_trimmed = false; void *bitmap; bitmap = e4b->bd_bitmap; + if (start == 0 && max >= ext4_last_grp_cluster(sb, e4b->bd_group)) + set_trimmed = true; start = max(e4b->bd_info->bb_first_free, start); count = 0; free_count = 0; @@ -5916,16 +5929,14 @@ static int ext4_try_to_trim_range(struct super_block *sb, int ret = ext4_trim_extent(sb, start, next - start, e4b); if (ret && ret != -EOPNOTSUPP) - break; + return count; count += next - start; } free_count += next - start; start = next + 1; - if (fatal_signal_pending(current)) { - count = -ERESTARTSYS; - break; - } + if (fatal_signal_pending(current)) + return -ERESTARTSYS; if (need_resched()) { ext4_unlock_group(sb, e4b->bd_group); @@ -5937,6 +5948,9 @@ static int ext4_try_to_trim_range(struct super_block *sb, break; } + if (set_trimmed) + EXT4_MB_GRP_SET_TRIMMED(e4b->bd_info); + return count; } @@ -5947,7 +5961,6 @@ static int ext4_try_to_trim_range(struct super_block *sb, * @start: first group block to examine * @max: last group block to examine * @minblocks: minimum extent block count - * @set_trimmed: set the trimmed flag if at least one block is trimmed * * ext4_trim_all_free walks through group's buddy bitmap searching for free * extents. 
When the free block is found, ext4_trim_extent is called to TRIM @@ -5962,7 +5975,7 @@ static int ext4_try_to_trim_range(struct super_block *sb, static ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t max, - ext4_grpblk_t minblocks, bool set_trimmed) + ext4_grpblk_t minblocks) { struct ext4_buddy e4b; int ret; @@ -5979,13 +5992,10 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, ext4_lock_group(sb, group); if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) || - minblocks < EXT4_SB(sb)->s_last_trim_minblks) { + minblocks < EXT4_SB(sb)->s_last_trim_minblks) ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks); - if (ret >= 0 && set_trimmed) - EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); - } else { + else ret = 0; - } ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); @@ -6018,7 +6028,6 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) ext4_fsblk_t first_data_blk = le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es); - bool whole_group, eof = false; int ret = 0; start = range->start >> sb->s_blocksize_bits; @@ -6037,10 +6046,8 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) if (minlen > EXT4_CLUSTERS_PER_GROUP(sb)) goto out; } - if (end >= max_blks - 1) { + if (end >= max_blks - 1) end = max_blks - 1; - eof = true; - } if (end <= first_data_blk) goto out; if (start < first_data_blk) @@ -6054,7 +6061,6 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) /* end now represents the last cluster to discard in this group */ end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; - whole_group = true; for (group = first_group; group <= last_group; group++) { grp = ext4_get_group_info(sb, group); @@ -6073,13 +6079,11 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) * change it for the last group, note that last_cluster is * already computed earlier by ext4_get_group_no_and_offset() */ - if (group == last_group) { + if (group == last_group) end = last_cluster; - whole_group = eof ? true : end == EXT4_CLUSTERS_PER_GROUP(sb) - 1; - } if (grp->bb_free >= minlen) { cnt = ext4_trim_all_free(sb, group, first_cluster, - end, minlen, whole_group); + end, minlen); if (cnt < 0) { ret = cnt; break; From f8a86ab3c4a4acde551aaa86776a55cbba2d4a53 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 13 Sep 2023 17:04:55 +0200 Subject: [PATCH 014/228] ext4: do not let fstrim block system suspend [ Upstream commit 5229a658f6453362fbb9da6bf96872ef25a7097e ] Len Brown has reported that system suspend sometimes fail due to inability to freeze a task working in ext4_trim_fs() for one minute. Trimming a large filesystem on a disk that slowly processes discard requests can indeed take a long time. Since discard is just an advisory call, it is perfectly fine to interrupt it at any time and the return number of discarded blocks until that moment. Do that when we detect the task is being frozen. 
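To make the new bail-out concrete, a minimal sketch of the check (the helper name here is illustrative; the patch adds an equivalent ext4_trim_interrupted() in the hunks below):

    #include <linux/freezer.h>
    #include <linux/sched/signal.h>

    /* Stop a long-running trim scan on a fatal signal or a freeze request. */
    static bool example_trim_interrupted(void)
    {
            return fatal_signal_pending(current) || freezing(current);
    }

Because discard is only advisory, the scan loop that hits this condition simply returns the number of blocks trimmed so far instead of -ERESTARTSYS, and that partial count is what gets reported back to userspace.
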
Cc: stable@kernel.org Reported-by: Len Brown Suggested-by: Dave Chinner References: https://bugzilla.kernel.org/show_bug.cgi?id=216322 Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230913150504.9054-2-jack@suse.cz Signed-off-by: Theodore Ts'o Signed-off-by: Sasha Levin --- fs/ext4/mballoc.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 2907bf57744a..b35d59d41c89 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -16,6 +16,7 @@ #include #include #include +#include #include /* @@ -5904,6 +5905,11 @@ static ext4_grpblk_t ext4_last_grp_cluster(struct super_block *sb, EXT4_CLUSTER_BITS(sb); } +static bool ext4_trim_interrupted(void) +{ + return fatal_signal_pending(current) || freezing(current); +} + static int ext4_try_to_trim_range(struct super_block *sb, struct ext4_buddy *e4b, ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks) @@ -5935,8 +5941,8 @@ static int ext4_try_to_trim_range(struct super_block *sb, free_count += next - start; start = next + 1; - if (fatal_signal_pending(current)) - return -ERESTARTSYS; + if (ext4_trim_interrupted()) + return count; if (need_resched()) { ext4_unlock_group(sb, e4b->bd_group); @@ -6063,6 +6069,8 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; for (group = first_group; group <= last_group; group++) { + if (ext4_trim_interrupted()) + break; grp = ext4_get_group_info(sb, group); if (!grp) continue; From 9b65bff30a61ee538b197fd147135b687b915b4d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 6 Sep 2023 22:47:16 -0400 Subject: [PATCH 015/228] tracing: Have event inject files inc the trace array ref count [ Upstream commit e5c624f027ac74f97e97c8f36c69228ac9f1102d ] The event inject files add events for a specific trace array. For an instance, if the file is opened and the instance is deleted, reading or writing to the file will cause a use after free. Up the ref count of the trace_array when a event inject file is opened. Link: https://lkml.kernel.org/r/20230907024804.292337868@goodmis.org Link: https://lore.kernel.org/all/1cb3aee2-19af-c472-e265-05176fe9bd84@huawei.com/ Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Andrew Morton Cc: Zheng Yejian Fixes: 6c3edaf9fd6a ("tracing: Introduce trace event injection") Tested-by: Linux Kernel Functional Testing Tested-by: Naresh Kamboju Signed-off-by: Steven Rostedt (Google) Signed-off-by: Sasha Levin --- kernel/trace/trace_events_inject.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_events_inject.c b/kernel/trace/trace_events_inject.c index 22bcf7c51d1e..149c7dc6a447 100644 --- a/kernel/trace/trace_events_inject.c +++ b/kernel/trace/trace_events_inject.c @@ -323,7 +323,8 @@ event_inject_read(struct file *file, char __user *buf, size_t size, } const struct file_operations event_inject_fops = { - .open = tracing_open_generic, + .open = tracing_open_file_tr, .read = event_inject_read, .write = event_inject_write, + .release = tracing_release_file_tr, }; From f8bf7706151a8d41c3107fc1ac37818a87b6afcd Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 22 Sep 2023 19:01:02 +0200 Subject: [PATCH 016/228] netfilter: nf_tables: integrate pipapo into commit protocol commit 212ed75dc5fb9d1423b3942c8f872a868cda3466 upstream. The pipapo set backend follows copy-on-update approach, maintaining one clone of the existing datastructure that is being updated. 
The clone and current datastructures are swapped via rcu from the commit step. The existing integration with the commit protocol is flawed because there is no operation to clean up the clone if the transaction is aborted. Moreover, the datastructure swap happens on set element activation. This patch adds two new operations for sets: commit and abort, these new operations are invoked from the commit and abort steps, after the transactions have been digested, and it updates the pipapo set backend to use it. This patch adds a new ->pending_update field to sets to maintain a list of sets that require this new commit and abort operations. Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- include/net/netfilter/nf_tables.h | 4 ++- net/netfilter/nf_tables_api.c | 56 +++++++++++++++++++++++++++++++ net/netfilter/nft_set_pipapo.c | 55 +++++++++++++++++++++--------- 3 files changed, 99 insertions(+), 16 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index eec29dd6681c..a3068ed0f316 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -373,7 +373,8 @@ struct nft_set_ops { const struct nft_set *set, const struct nft_set_elem *elem, unsigned int flags); - + void (*commit)(const struct nft_set *set); + void (*abort)(const struct nft_set *set); u64 (*privsize)(const struct nlattr * const nla[], const struct nft_set_desc *desc); bool (*estimate)(const struct nft_set_desc *desc, @@ -454,6 +455,7 @@ struct nft_set { u16 udlen; unsigned char *udata; struct nft_expr *expr; + struct list_head pending_update; /* runtime data below here */ const struct nft_set_ops *ops ____cacheline_aligned; u16 flags:14, diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 2669999d1bc9..430dcd0f6c3b 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4509,6 +4509,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, } set->handle = nf_tables_alloc_handle(table); + INIT_LIST_HEAD(&set->pending_update); err = nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set); if (err < 0) @@ -8141,10 +8142,25 @@ static void nf_tables_commit_audit_log(struct list_head *adl, u32 generation) } } +static void nft_set_commit_update(struct list_head *set_update_list) +{ + struct nft_set *set, *next; + + list_for_each_entry_safe(set, next, set_update_list, pending_update) { + list_del_init(&set->pending_update); + + if (!set->ops->commit) + continue; + + set->ops->commit(set); + } +} + static int nf_tables_commit(struct net *net, struct sk_buff *skb) { struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); struct nft_trans *trans, *next; + LIST_HEAD(set_update_list); struct nft_trans_elem *te; struct nft_chain *chain; struct nft_table *table; @@ -8310,6 +8326,11 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nf_tables_setelem_notify(&trans->ctx, te->set, &te->elem, NFT_MSG_NEWSETELEM, 0); + if (te->set->ops->commit && + list_empty(&te->set->pending_update)) { + list_add_tail(&te->set->pending_update, + &set_update_list); + } nft_trans_destroy(trans); break; case NFT_MSG_DELSETELEM: @@ -8321,6 +8342,11 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) te->set->ops->remove(net, te->set, &te->elem); atomic_dec(&te->set->nelems); te->set->ndeact--; + if (te->set->ops->commit && + list_empty(&te->set->pending_update)) { + 
list_add_tail(&te->set->pending_update, + &set_update_list); + } break; case NFT_MSG_NEWOBJ: if (nft_trans_obj_update(trans)) { @@ -8381,6 +8407,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) } } + nft_set_commit_update(&set_update_list); + nft_commit_notify(net, NETLINK_CB(skb).portid); nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN); nf_tables_commit_audit_log(&adl, nft_net->base_seq); @@ -8437,10 +8465,25 @@ static void nf_tables_abort_release(struct nft_trans *trans) kfree(trans); } +static void nft_set_abort_update(struct list_head *set_update_list) +{ + struct nft_set *set, *next; + + list_for_each_entry_safe(set, next, set_update_list, pending_update) { + list_del_init(&set->pending_update); + + if (!set->ops->abort) + continue; + + set->ops->abort(set); + } +} + static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) { struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); struct nft_trans *trans, *next; + LIST_HEAD(set_update_list); struct nft_trans_elem *te; if (action == NFNL_ABORT_VALIDATE && @@ -8529,6 +8572,12 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) te = (struct nft_trans_elem *)trans->data; te->set->ops->remove(net, te->set, &te->elem); atomic_dec(&te->set->nelems); + + if (te->set->ops->abort && + list_empty(&te->set->pending_update)) { + list_add_tail(&te->set->pending_update, + &set_update_list); + } break; case NFT_MSG_DELSETELEM: te = (struct nft_trans_elem *)trans->data; @@ -8537,6 +8586,11 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) te->set->ops->activate(net, te->set, &te->elem); te->set->ndeact--; + if (te->set->ops->abort && + list_empty(&te->set->pending_update)) { + list_add_tail(&te->set->pending_update, + &set_update_list); + } nft_trans_destroy(trans); break; case NFT_MSG_NEWOBJ: @@ -8577,6 +8631,8 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) } } + nft_set_abort_update(&set_update_list); + synchronize_rcu(); list_for_each_entry_safe_reverse(trans, next, diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 50f840e312b0..ce6c07ea7244 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -1603,17 +1603,10 @@ static void pipapo_free_fields(struct nft_pipapo_match *m) } } -/** - * pipapo_reclaim_match - RCU callback to free fields from old matching data - * @rcu: RCU head - */ -static void pipapo_reclaim_match(struct rcu_head *rcu) +static void pipapo_free_match(struct nft_pipapo_match *m) { - struct nft_pipapo_match *m; int i; - m = container_of(rcu, struct nft_pipapo_match, rcu); - for_each_possible_cpu(i) kfree(*per_cpu_ptr(m->scratch, i)); @@ -1628,7 +1621,19 @@ static void pipapo_reclaim_match(struct rcu_head *rcu) } /** - * pipapo_commit() - Replace lookup data with current working copy + * pipapo_reclaim_match - RCU callback to free fields from old matching data + * @rcu: RCU head + */ +static void pipapo_reclaim_match(struct rcu_head *rcu) +{ + struct nft_pipapo_match *m; + + m = container_of(rcu, struct nft_pipapo_match, rcu); + pipapo_free_match(m); +} + +/** + * nft_pipapo_commit() - Replace lookup data with current working copy * @set: nftables API set representation * * While at it, check if we should perform garbage collection on the working @@ -1638,7 +1643,7 @@ static void pipapo_reclaim_match(struct rcu_head *rcu) * We also need to create a new working copy for subsequent insertions and * deletions. 
*/ -static void pipapo_commit(const struct nft_set *set) +static void nft_pipapo_commit(const struct nft_set *set) { struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_match *new_clone, *old; @@ -1663,6 +1668,26 @@ static void pipapo_commit(const struct nft_set *set) priv->clone = new_clone; } +static void nft_pipapo_abort(const struct nft_set *set) +{ + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_match *new_clone, *m; + + if (!priv->dirty) + return; + + m = rcu_dereference(priv->match); + + new_clone = pipapo_clone(m); + if (IS_ERR(new_clone)) + return; + + priv->dirty = false; + + pipapo_free_match(priv->clone); + priv->clone = new_clone; +} + /** * nft_pipapo_activate() - Mark element reference as active given key, commit * @net: Network namespace @@ -1670,8 +1695,7 @@ static void pipapo_commit(const struct nft_set *set) * @elem: nftables API element representation containing key data * * On insertion, elements are added to a copy of the matching data currently - * in use for lookups, and not directly inserted into current lookup data, so - * we'll take care of that by calling pipapo_commit() here. Both + * in use for lookups, and not directly inserted into current lookup data. Both * nft_pipapo_insert() and nft_pipapo_activate() are called once for each * element, hence we can't purpose either one as a real commit operation. */ @@ -1687,8 +1711,6 @@ static void nft_pipapo_activate(const struct net *net, nft_set_elem_change_active(net, set, &e->ext); nft_set_elem_clear_busy(&e->ext); - - pipapo_commit(set); } /** @@ -1938,7 +1960,6 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, if (i == m->field_count) { priv->dirty = true; pipapo_drop(m, rulemap); - pipapo_commit(set); return; } @@ -2245,6 +2266,8 @@ const struct nft_set_type nft_set_pipapo_type = { .init = nft_pipapo_init, .destroy = nft_pipapo_destroy, .gc_init = nft_pipapo_gc_init, + .commit = nft_pipapo_commit, + .abort = nft_pipapo_abort, .elemsize = offsetof(struct nft_pipapo_elem, ext), }, }; @@ -2267,6 +2290,8 @@ const struct nft_set_type nft_set_pipapo_avx2_type = { .init = nft_pipapo_init, .destroy = nft_pipapo_destroy, .gc_init = nft_pipapo_gc_init, + .commit = nft_pipapo_commit, + .abort = nft_pipapo_abort, .elemsize = offsetof(struct nft_pipapo_elem, ext), }, }; From b15ea4017af82011dd55225ce77cce3d4dfc169c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 22 Sep 2023 19:01:03 +0200 Subject: [PATCH 017/228] netfilter: nf_tables: don't skip expired elements during walk commit 24138933b97b055d486e8064b4a1721702442a9b upstream. There is an asymmetry between commit/abort and preparation phase if the following conditions are met: 1. set is a verdict map ("1.2.3.4 : jump foo") 2. timeouts are enabled In this case, following sequence is problematic: 1. element E in set S refers to chain C 2. userspace requests removal of set S 3. kernel does a set walk to decrement chain->use count for all elements from preparation phase 4. kernel does another set walk to remove elements from the commit phase (or another walk to do a chain->use increment for all elements from abort phase) If E has already expired in 1), it will be ignored during list walk, so its use count won't have been changed. 
Then, when set is culled, ->destroy callback will zap the element via nf_tables_set_elem_destroy(), but this function is only safe for elements that have been deactivated earlier from the preparation phase: lack of earlier deactivate removes the element but leaks the chain use count, which results in a WARN splat when the chain gets removed later, plus a leak of the nft_chain structure. Update pipapo_get() not to skip expired elements, otherwise flush command reports bogus ENOENT errors. Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges") Fixes: 8d8540c4f5e0 ("netfilter: nft_set_rbtree: add timeout support") Fixes: 9d0982927e79 ("netfilter: nft_hash: add support for timeouts") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- net/netfilter/nf_tables_api.c | 4 ++++ net/netfilter/nft_set_hash.c | 2 -- net/netfilter/nft_set_pipapo.c | 18 ++++++++++++------ net/netfilter/nft_set_rbtree.c | 2 -- 4 files changed, 16 insertions(+), 10 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 430dcd0f6c3b..5eef671578a2 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4929,8 +4929,12 @@ static int nf_tables_dump_setelem(const struct nft_ctx *ctx, const struct nft_set_iter *iter, struct nft_set_elem *elem) { + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); struct nft_set_dump_args *args; + if (nft_set_elem_expired(ext)) + return 0; + args = container_of(iter, struct nft_set_dump_args, iter); return nf_tables_fill_setelem(args->skb, set, elem); } diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 51d3e6f0934a..ea7bd8549bea 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -277,8 +277,6 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, if (iter->count < iter->skip) goto cont; - if (nft_set_elem_expired(&he->ext)) - goto cont; if (!nft_set_elem_active(&he->ext, iter->genmask)) goto cont; diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index ce6c07ea7244..89fa1fedadf7 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -566,8 +566,7 @@ static struct nft_pipapo_elem *pipapo_get(const struct net *net, goto out; if (last) { - if (nft_set_elem_expired(&f->mt[b].e->ext) || - (genmask && + if ((genmask && !nft_set_elem_active(&f->mt[b].e->ext, genmask))) goto next_match; @@ -601,8 +600,17 @@ static struct nft_pipapo_elem *pipapo_get(const struct net *net, static void *nft_pipapo_get(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, unsigned int flags) { - return pipapo_get(net, set, (const u8 *)elem->key.val.data, - nft_genmask_cur(net)); + struct nft_pipapo_elem *ret; + + ret = pipapo_get(net, set, (const u8 *)elem->key.val.data, + nft_genmask_cur(net)); + if (IS_ERR(ret)) + return ret; + + if (nft_set_elem_expired(&ret->ext)) + return ERR_PTR(-ENOENT); + + return ret; } /** @@ -2009,8 +2017,6 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, goto cont; e = f->mt[r].e; - if (nft_set_elem_expired(&e->ext)) - goto cont; elem.priv = e; diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index eae760adae4d..2aa3776c5fbb 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -551,8 +551,6 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, if (iter->count < iter->skip) goto cont; - 
if (nft_set_elem_expired(&rbe->ext)) - goto cont; if (!nft_set_elem_active(&rbe->ext, iter->genmask)) goto cont; From 448be0774882f95a74fa5eb7519761152add601b Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 22 Sep 2023 19:01:04 +0200 Subject: [PATCH 018/228] netfilter: nf_tables: GC transaction API to avoid race with control plane commit 5f68718b34a531a556f2f50300ead2862278da26 upstream. The set types rhashtable and rbtree use a GC worker to reclaim memory. From system work queue, in periodic intervals, a scan of the table is done. The major caveat here is that the nft transaction mutex is not held. This causes a race between control plane and GC when they attempt to delete the same element. We cannot grab the netlink mutex from the work queue, because the control plane has to wait for the GC work queue in case the set is to be removed, so we get following deadlock: cpu 1 cpu2 GC work transaction comes in , lock nft mutex `acquire nft mutex // BLOCKS transaction asks to remove the set set destruction calls cancel_work_sync() cancel_work_sync will now block forever, because it is waiting for the mutex the caller already owns. This patch adds a new API that deals with garbage collection in two steps: 1) Lockless GC of expired elements sets on the NFT_SET_ELEM_DEAD_BIT so they are not visible via lookup. Annotate current GC sequence in the GC transaction. Enqueue GC transaction work as soon as it is full. If ruleset is updated, then GC transaction is aborted and retried later. 2) GC work grabs the mutex. If GC sequence has changed then this GC transaction lost race with control plane, abort it as it contains stale references to objects and let GC try again later. If the ruleset is intact, then this GC transaction deactivates and removes the elements and it uses call_rcu() to destroy elements. Note that no elements are removed from GC lockless path, the _DEAD bit is set and pointers are collected. GC catchall does not remove the elements anymore too. There is a new set->dead flag that is set on to abort the GC transaction to deal with set->ops->destroy() path which removes the remaining elements in the set from commit_release, where no mutex is held. To deal with GC when mutex is held, which allows safe deactivate and removal, add sync GC API which releases the set element object via call_rcu(). This is used by rbtree and pipapo backends which also perform garbage collection from control plane path. Since element removal from sets can happen from control plane and element garbage collection/timeout, it is necessary to keep the set structure alive until all elements have been deactivated and destroyed. We cannot do a cancel_work_sync or flush_work in nft_set_destroy because its called with the transaction mutex held, but the aforementioned async work queue might be blocked on the very mutex that nft_set_destroy() callchain is sitting on. This gives us the choice of ABBA deadlock or UaF. To avoid both, add set->refs refcount_t member. The GC API can then increment the set refcount and release it once the elements have been free'd. Set backends are adapted to use the GC transaction API in a follow up patch entitled: ("netfilter: nf_tables: use gc transaction API in set backends") This is joint work with Florian Westphal. 
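For orientation, a rough sketch (not taken from the upstream commit) of how a set backend's periodic GC worker is expected to drive the async half of this API. example_elem, elem_list and example_set_gc_work() are hypothetical stand-ins for a backend's private data and worker; the nft_* helpers are the ones introduced below. Real backends additionally re-check nft_net->gc_seq while walking and abort the run if the ruleset changed underneath them.

/* Illustrative sketch only, not part of this patch. */
struct example_elem {			/* hypothetical backend element */
	struct list_head	list;
	struct nft_set_ext	ext;
};

static void example_set_gc_work(struct nft_set *set,
				struct list_head *elem_list,
				unsigned int gc_seq)
{
	struct nft_trans_gc *gc;
	struct example_elem *e;

	gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL);
	if (!gc)
		return;

	rcu_read_lock();
	list_for_each_entry_rcu(e, elem_list, list) {
		if (!nft_set_elem_expired(&e->ext))
			continue;

		/* 1) hide the element from the lockless lookup path */
		nft_set_elem_dead(&e->ext);

		/* 2) make sure the batch has room, queueing it once full */
		gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC);
		if (!gc)
			break;		/* allocation failed, retry next run */

		/* 3) record the element, it is destroyed later via call_rcu() */
		nft_trans_gc_elem_add(gc, e);
	}
	rcu_read_unlock();

	/* queue whatever is left, or release an empty batch */
	if (gc)
		nft_trans_gc_queue_async_done(gc);
}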
Fixes: cfed7e1b1f8e ("netfilter: nf_tables: add set garbage collection helpers") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- include/net/netfilter/nf_tables.h | 61 ++++++++- net/netfilter/nf_tables_api.c | 216 ++++++++++++++++++++++++++++-- 2 files changed, 267 insertions(+), 10 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index a3068ed0f316..39a0b37e8a1a 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -407,6 +407,7 @@ struct nft_set_type { * * @list: table set list node * @bindings: list of set bindings + * @refs: internal refcounting for async set destruction * @table: table this set belongs to * @net: netnamespace this set belongs to * @name: name of the set @@ -436,6 +437,7 @@ struct nft_set_type { struct nft_set { struct list_head list; struct list_head bindings; + refcount_t refs; struct nft_table *table; possible_net_t net; char *name; @@ -458,7 +460,8 @@ struct nft_set { struct list_head pending_update; /* runtime data below here */ const struct nft_set_ops *ops ____cacheline_aligned; - u16 flags:14, + u16 flags:13, + dead:1, genmask:2; u8 klen; u8 dlen; @@ -1450,6 +1453,32 @@ static inline void nft_set_elem_clear_busy(struct nft_set_ext *ext) clear_bit(NFT_SET_ELEM_BUSY_BIT, word); } +#define NFT_SET_ELEM_DEAD_MASK (1 << 3) + +#if defined(__LITTLE_ENDIAN_BITFIELD) +#define NFT_SET_ELEM_DEAD_BIT 3 +#elif defined(__BIG_ENDIAN_BITFIELD) +#define NFT_SET_ELEM_DEAD_BIT (BITS_PER_LONG - BITS_PER_BYTE + 3) +#else +#error +#endif + +static inline void nft_set_elem_dead(struct nft_set_ext *ext) +{ + unsigned long *word = (unsigned long *)ext; + + BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0); + set_bit(NFT_SET_ELEM_DEAD_BIT, word); +} + +static inline int nft_set_elem_is_dead(const struct nft_set_ext *ext) +{ + unsigned long *word = (unsigned long *)ext; + + BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0); + return test_bit(NFT_SET_ELEM_DEAD_BIT, word); +} + /** * struct nft_trans - nf_tables object update in transaction * @@ -1575,6 +1604,35 @@ struct nft_trans_flowtable { #define nft_trans_flowtable_flags(trans) \ (((struct nft_trans_flowtable *)trans->data)->flags) +#define NFT_TRANS_GC_BATCHCOUNT 256 + +struct nft_trans_gc { + struct list_head list; + struct net *net; + struct nft_set *set; + u32 seq; + u8 count; + void *priv[NFT_TRANS_GC_BATCHCOUNT]; + struct rcu_head rcu; +}; + +struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set, + unsigned int gc_seq, gfp_t gfp); +void nft_trans_gc_destroy(struct nft_trans_gc *trans); + +struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc, + unsigned int gc_seq, gfp_t gfp); +void nft_trans_gc_queue_async_done(struct nft_trans_gc *gc); + +struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp); +void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans); + +void nft_trans_gc_elem_add(struct nft_trans_gc *gc, void *priv); + +void nft_setelem_data_deactivate(const struct net *net, + const struct nft_set *set, + struct nft_set_elem *elem); + int __init nft_chain_filter_init(void); void nft_chain_filter_fini(void); @@ -1595,6 +1653,7 @@ struct nftables_pernet { struct mutex commit_mutex; unsigned int base_seq; u8 validate_state; + unsigned int gc_seq; }; #endif /* _NET_NF_TABLES_H */ diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 5eef671578a2..1f06dd065d75 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ 
-32,7 +32,9 @@ static LIST_HEAD(nf_tables_expressions); static LIST_HEAD(nf_tables_objects); static LIST_HEAD(nf_tables_flowtables); static LIST_HEAD(nf_tables_destroy_list); +static LIST_HEAD(nf_tables_gc_list); static DEFINE_SPINLOCK(nf_tables_destroy_list_lock); +static DEFINE_SPINLOCK(nf_tables_gc_list_lock); static u64 table_handle; enum { @@ -124,6 +126,9 @@ static void nft_validate_state_update(struct net *net, u8 new_validate_state) static void nf_tables_trans_destroy_work(struct work_struct *w); static DECLARE_WORK(trans_destroy_work, nf_tables_trans_destroy_work); +static void nft_trans_gc_work(struct work_struct *work); +static DECLARE_WORK(trans_gc_work, nft_trans_gc_work); + static void nft_ctx_init(struct nft_ctx *ctx, struct net *net, const struct sk_buff *skb, @@ -559,10 +564,6 @@ static int nft_trans_set_add(const struct nft_ctx *ctx, int msg_type, return 0; } -static void nft_setelem_data_deactivate(const struct net *net, - const struct nft_set *set, - struct nft_set_elem *elem); - static int nft_mapelem_deactivate(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, @@ -4474,6 +4475,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, } INIT_LIST_HEAD(&set->bindings); + refcount_set(&set->refs, 1); set->table = table; write_pnet(&set->net, net); set->ops = ops; @@ -4534,6 +4536,14 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, return err; } +static void nft_set_put(struct nft_set *set) +{ + if (refcount_dec_and_test(&set->refs)) { + kfree(set->name); + kvfree(set); + } +} + static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) { if (WARN_ON(set->use > 0)) @@ -4543,8 +4553,7 @@ static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) nft_expr_destroy(ctx, set->expr); set->ops->destroy(ctx, set); - kfree(set->name); - kvfree(set); + nft_set_put(set); } static int nf_tables_delset(struct net *net, struct sock *nlsk, @@ -5768,9 +5777,9 @@ static void nft_setelem_data_activate(const struct net *net, nft_use_inc_restore(&(*nft_set_ext_obj(ext))->use); } -static void nft_setelem_data_deactivate(const struct net *net, - const struct nft_set *set, - struct nft_set_elem *elem) +void nft_setelem_data_deactivate(const struct net *net, + const struct nft_set *set, + struct nft_set_elem *elem) { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); @@ -8002,6 +8011,179 @@ void nft_chain_del(struct nft_chain *chain) list_del_rcu(&chain->list); } +static void nft_trans_gc_setelem_remove(struct nft_ctx *ctx, + struct nft_trans_gc *trans) +{ + void **priv = trans->priv; + unsigned int i; + + for (i = 0; i < trans->count; i++) { + struct nft_set_elem elem = { + .priv = priv[i], + }; + + nft_setelem_data_deactivate(ctx->net, trans->set, &elem); + trans->set->ops->remove(trans->net, trans->set, &elem); + } +} + +void nft_trans_gc_destroy(struct nft_trans_gc *trans) +{ + nft_set_put(trans->set); + put_net(trans->net); + kfree(trans); +} + +static void nft_trans_gc_trans_free(struct rcu_head *rcu) +{ + struct nft_set_elem elem = {}; + struct nft_trans_gc *trans; + struct nft_ctx ctx = {}; + unsigned int i; + + trans = container_of(rcu, struct nft_trans_gc, rcu); + ctx.net = read_pnet(&trans->set->net); + + for (i = 0; i < trans->count; i++) { + elem.priv = trans->priv[i]; + atomic_dec(&trans->set->nelems); + + nf_tables_set_elem_destroy(&ctx, trans->set, elem.priv); + } + + nft_trans_gc_destroy(trans); +} + +static bool nft_trans_gc_work_done(struct nft_trans_gc *trans) +{ + 
struct nftables_pernet *nft_net; + struct nft_ctx ctx = {}; + + nft_net = net_generic(trans->net, nf_tables_net_id); + + mutex_lock(&nft_net->commit_mutex); + + /* Check for race with transaction, otherwise this batch refers to + * stale objects that might not be there anymore. Skip transaction if + * set has been destroyed from control plane transaction in case gc + * worker loses race. + */ + if (READ_ONCE(nft_net->gc_seq) != trans->seq || trans->set->dead) { + mutex_unlock(&nft_net->commit_mutex); + return false; + } + + ctx.net = trans->net; + ctx.table = trans->set->table; + + nft_trans_gc_setelem_remove(&ctx, trans); + mutex_unlock(&nft_net->commit_mutex); + + return true; +} + +static void nft_trans_gc_work(struct work_struct *work) +{ + struct nft_trans_gc *trans, *next; + LIST_HEAD(trans_gc_list); + + spin_lock(&nf_tables_destroy_list_lock); + list_splice_init(&nf_tables_gc_list, &trans_gc_list); + spin_unlock(&nf_tables_destroy_list_lock); + + list_for_each_entry_safe(trans, next, &trans_gc_list, list) { + list_del(&trans->list); + if (!nft_trans_gc_work_done(trans)) { + nft_trans_gc_destroy(trans); + continue; + } + call_rcu(&trans->rcu, nft_trans_gc_trans_free); + } +} + +struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set, + unsigned int gc_seq, gfp_t gfp) +{ + struct net *net = read_pnet(&set->net); + struct nft_trans_gc *trans; + + trans = kzalloc(sizeof(*trans), gfp); + if (!trans) + return NULL; + + refcount_inc(&set->refs); + trans->set = set; + trans->net = get_net(net); + trans->seq = gc_seq; + + return trans; +} + +void nft_trans_gc_elem_add(struct nft_trans_gc *trans, void *priv) +{ + trans->priv[trans->count++] = priv; +} + +static void nft_trans_gc_queue_work(struct nft_trans_gc *trans) +{ + spin_lock(&nf_tables_gc_list_lock); + list_add_tail(&trans->list, &nf_tables_gc_list); + spin_unlock(&nf_tables_gc_list_lock); + + schedule_work(&trans_gc_work); +} + +static int nft_trans_gc_space(struct nft_trans_gc *trans) +{ + return NFT_TRANS_GC_BATCHCOUNT - trans->count; +} + +struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc, + unsigned int gc_seq, gfp_t gfp) +{ + if (nft_trans_gc_space(gc)) + return gc; + + nft_trans_gc_queue_work(gc); + + return nft_trans_gc_alloc(gc->set, gc_seq, gfp); +} + +void nft_trans_gc_queue_async_done(struct nft_trans_gc *trans) +{ + if (trans->count == 0) { + nft_trans_gc_destroy(trans); + return; + } + + nft_trans_gc_queue_work(trans); +} + +struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp) +{ + if (WARN_ON_ONCE(!lockdep_commit_lock_is_held(gc->net))) + return NULL; + + if (nft_trans_gc_space(gc)) + return gc; + + call_rcu(&gc->rcu, nft_trans_gc_trans_free); + + return nft_trans_gc_alloc(gc->set, 0, gfp); +} + +void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans) +{ + WARN_ON_ONCE(!lockdep_commit_lock_is_held(trans->net)); + + if (trans->count == 0) { + nft_trans_gc_destroy(trans); + return; + } + + call_rcu(&trans->rcu, nft_trans_gc_trans_free); +} + static void nf_tables_module_autoload_cleanup(struct net *net) { struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); @@ -8168,6 +8350,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) struct nft_trans_elem *te; struct nft_chain *chain; struct nft_table *table; + unsigned int gc_seq; LIST_HEAD(adl); int err; @@ -8240,6 +8423,10 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) while (++nft_net->base_seq == 0) ; + /* Bump gc counter, it becomes odd, this is the busy mark. 
*/ + gc_seq = READ_ONCE(nft_net->gc_seq); + WRITE_ONCE(nft_net->gc_seq, ++gc_seq); + /* step 3. Start new generation, rules_gen_X now in use. */ net->nft.gencursor = nft_gencursor_next(net); @@ -8319,6 +8506,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_trans_destroy(trans); break; case NFT_MSG_DELSET: + nft_trans_set(trans)->dead = 1; list_del_rcu(&nft_trans_set(trans)->list); nf_tables_set_notify(&trans->ctx, nft_trans_set(trans), NFT_MSG_DELSET, GFP_KERNEL); @@ -8416,6 +8604,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_commit_notify(net, NETLINK_CB(skb).portid); nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN); nf_tables_commit_audit_log(&adl, nft_net->base_seq); + + WRITE_ONCE(nft_net->gc_seq, ++gc_seq); nf_tables_commit_release(net); return 0; @@ -9353,6 +9543,7 @@ static int __net_init nf_tables_init_net(struct net *net) mutex_init(&nft_net->commit_mutex); nft_net->base_seq = 1; nft_net->validate_state = NFT_VALIDATE_SKIP; + nft_net->gc_seq = 0; return 0; } @@ -9380,10 +9571,16 @@ static void __net_exit nf_tables_exit_net(struct net *net) WARN_ON_ONCE(!list_empty(&nft_net->notify_list)); } +static void nf_tables_exit_batch(struct list_head *net_exit_list) +{ + flush_work(&trans_gc_work); +} + static struct pernet_operations nf_tables_net_ops = { .init = nf_tables_init_net, .pre_exit = nf_tables_pre_exit_net, .exit = nf_tables_exit_net, + .exit_batch = nf_tables_exit_batch, .id = &nf_tables_net_id, .size = sizeof(struct nftables_pernet), }; @@ -9448,6 +9645,7 @@ static void __exit nf_tables_module_exit(void) nft_chain_filter_fini(); nft_chain_route_fini(); unregister_pernet_subsys(&nf_tables_net_ops); + cancel_work_sync(&trans_gc_work); cancel_work_sync(&trans_destroy_work); rcu_barrier(); rhltable_destroy(&nft_objname_ht); From 146c76866795553dbc19998f36718d7986ad302b Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 22 Sep 2023 19:01:05 +0200 Subject: [PATCH 019/228] netfilter: nf_tables: adapt set backend to use GC transaction API commit f6c383b8c31a93752a52697f8430a71dcbc46adf upstream. Use the GC transaction API to replace the old and buggy gc API and the busy mark approach. No set elements are removed from async garbage collection anymore, instead the _DEAD bit is set on so the set element is not visible from lookup path anymore. Async GC enqueues transaction work that might be aborted and retried later. rbtree and pipapo set backends does not set on the _DEAD bit from the sync GC path since this runs in control plane path where mutex is held. In this case, set elements are deactivated, removed and then released via RCU callback, sync GC never fails. 
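Again for orientation only, a condensed sketch (not from the upstream commit) of the sync GC pattern the rbtree and pipapo hunks below follow. example_sync_gc_one() is a hypothetical helper; it assumes the commit mutex is held, which is why elements are deactivated and unlinked directly instead of being flagged with NFT_SET_ELEM_DEAD_BIT, and why nft_trans_gc_queue_sync() may be used.

/* Illustrative sketch only, not part of this patch. Caller holds the
 * commit mutex.
 */
static int example_sync_gc_one(struct net *net, struct nft_set *set,
			       void *elem_priv)
{
	struct nft_set_elem elem = { .priv = elem_priv };
	struct nft_trans_gc *gc;

	gc = nft_trans_gc_alloc(set, 0, GFP_ATOMIC);
	if (!gc)
		return -ENOMEM;

	/* mutex held: deactivate and unlink right away, no DEAD bit needed */
	nft_setelem_data_deactivate(net, set, &elem);
	/* backend-specific unlink goes here, e.g. rb_erase() or pipapo_drop() */

	gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC);
	if (!gc)
		return -ENOMEM;
	nft_trans_gc_elem_add(gc, elem_priv);

	/* collected elements are released via call_rcu() */
	nft_trans_gc_queue_sync_done(gc);

	return 0;
}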
Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges") Fixes: 8d8540c4f5e0 ("netfilter: nft_set_rbtree: add timeout support") Fixes: 9d0982927e79 ("netfilter: nft_hash: add support for timeouts") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- net/netfilter/nft_set_hash.c | 75 ++++++++++++------ net/netfilter/nft_set_pipapo.c | 43 +++++++--- net/netfilter/nft_set_rbtree.c | 138 +++++++++++++++++++++------------ 3 files changed, 171 insertions(+), 85 deletions(-) diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index ea7bd8549bea..6ae99b3107bc 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -17,6 +17,9 @@ #include #include #include +#include + +extern unsigned int nf_tables_net_id; /* We target a hash table size of 4, element hint is 75% of final size */ #define NFT_RHASH_ELEMENT_HINT 3 @@ -59,6 +62,8 @@ static inline int nft_rhash_cmp(struct rhashtable_compare_arg *arg, if (memcmp(nft_set_ext_key(&he->ext), x->key, x->set->klen)) return 1; + if (nft_set_elem_is_dead(&he->ext)) + return 1; if (nft_set_elem_expired(&he->ext)) return 1; if (!nft_set_elem_active(&he->ext, x->genmask)) @@ -187,7 +192,6 @@ static void nft_rhash_activate(const struct net *net, const struct nft_set *set, struct nft_rhash_elem *he = elem->priv; nft_set_elem_change_active(net, set, &he->ext); - nft_set_elem_clear_busy(&he->ext); } static bool nft_rhash_flush(const struct net *net, @@ -195,12 +199,9 @@ static bool nft_rhash_flush(const struct net *net, { struct nft_rhash_elem *he = priv; - if (!nft_set_elem_mark_busy(&he->ext) || - !nft_is_active(net, &he->ext)) { - nft_set_elem_change_active(net, set, &he->ext); - return true; - } - return false; + nft_set_elem_change_active(net, set, &he->ext); + + return true; } static void *nft_rhash_deactivate(const struct net *net, @@ -217,9 +218,8 @@ static void *nft_rhash_deactivate(const struct net *net, rcu_read_lock(); he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params); - if (he != NULL && - !nft_rhash_flush(net, set, he)) - he = NULL; + if (he) + nft_set_elem_change_active(net, set, &he->ext); rcu_read_unlock(); @@ -295,49 +295,75 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, static void nft_rhash_gc(struct work_struct *work) { + struct nftables_pernet *nft_net; struct nft_set *set; struct nft_rhash_elem *he; struct nft_rhash *priv; - struct nft_set_gc_batch *gcb = NULL; struct rhashtable_iter hti; + struct nft_trans_gc *gc; + struct net *net; + u32 gc_seq; priv = container_of(work, struct nft_rhash, gc_work.work); set = nft_set_container_of(priv); + net = read_pnet(&set->net); + nft_net = net_generic(net, nf_tables_net_id); + gc_seq = READ_ONCE(nft_net->gc_seq); + + gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL); + if (!gc) + goto done; rhashtable_walk_enter(&priv->ht, &hti); rhashtable_walk_start(&hti); while ((he = rhashtable_walk_next(&hti))) { if (IS_ERR(he)) { - if (PTR_ERR(he) != -EAGAIN) - break; + if (PTR_ERR(he) != -EAGAIN) { + nft_trans_gc_destroy(gc); + gc = NULL; + goto try_later; + } continue; } + /* Ruleset has been updated, try later. 
*/ + if (READ_ONCE(nft_net->gc_seq) != gc_seq) { + nft_trans_gc_destroy(gc); + gc = NULL; + goto try_later; + } + + if (nft_set_elem_is_dead(&he->ext)) + goto dead_elem; + if (nft_set_ext_exists(&he->ext, NFT_SET_EXT_EXPR)) { struct nft_expr *expr = nft_set_ext_expr(&he->ext); if (expr->ops->gc && expr->ops->gc(read_pnet(&set->net), expr)) - goto gc; + goto needs_gc_run; } + if (!nft_set_elem_expired(&he->ext)) continue; -gc: - if (nft_set_elem_mark_busy(&he->ext)) - continue; +needs_gc_run: + nft_set_elem_dead(&he->ext); +dead_elem: + gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); + if (!gc) + goto try_later; - gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); - if (gcb == NULL) - break; - rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params); - atomic_dec(&set->nelems); - nft_set_gc_batch_add(gcb, he); + nft_trans_gc_elem_add(gc, he); } + +try_later: rhashtable_walk_stop(&hti); rhashtable_walk_exit(&hti); - nft_set_gc_batch_complete(gcb); + if (gc) + nft_trans_gc_queue_async_done(gc); +done: queue_delayed_work(system_power_efficient_wq, &priv->gc_work, nft_set_gc_interval(set)); } @@ -400,7 +426,6 @@ static void nft_rhash_destroy(const struct nft_ctx *ctx, }; cancel_delayed_work_sync(&priv->gc_work); - rcu_barrier(); rhashtable_free_and_destroy(&priv->ht, nft_rhash_elem_destroy, (void *)&rhash_ctx); } diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 89fa1fedadf7..63d0723950d3 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -1544,15 +1544,32 @@ static void pipapo_drop(struct nft_pipapo_match *m, } } +static void nft_pipapo_gc_deactivate(struct net *net, struct nft_set *set, + struct nft_pipapo_elem *e) +{ + struct nft_set_elem elem = { + .priv = e, + }; + + nft_setelem_data_deactivate(net, set, &elem); +} + /** * pipapo_gc() - Drop expired entries from set, destroy start and end elements - * @set: nftables API set representation + * @_set: nftables API set representation * @m: Matching data */ -static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m) +static void pipapo_gc(const struct nft_set *_set, struct nft_pipapo_match *m) { + struct nft_set *set = (struct nft_set *) _set; struct nft_pipapo *priv = nft_set_priv(set); + struct net *net = read_pnet(&set->net); int rules_f0, first_rule = 0; + struct nft_trans_gc *gc; + + gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL); + if (!gc) + return; while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) { union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; @@ -1577,13 +1594,19 @@ static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m) f--; i--; e = f->mt[rulemap[i].to].e; - if (nft_set_elem_expired(&e->ext) && - !nft_set_elem_mark_busy(&e->ext)) { + /* synchronous gc never fails, there is no need to set on + * NFT_SET_ELEM_DEAD_BIT. + */ + if (nft_set_elem_expired(&e->ext)) { priv->dirty = true; - pipapo_drop(m, rulemap); - rcu_barrier(); - nft_set_elem_destroy(set, e, true); + gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); + if (!gc) + break; + + nft_pipapo_gc_deactivate(net, set, e); + pipapo_drop(m, rulemap); + nft_trans_gc_elem_add(gc, e); /* And check again current first rule, which is now the * first we haven't checked. 
@@ -1593,7 +1616,10 @@ static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m) } } - priv->last_gc = jiffies; + if (gc) { + nft_trans_gc_queue_sync_done(gc); + priv->last_gc = jiffies; + } } /** @@ -1718,7 +1744,6 @@ static void nft_pipapo_activate(const struct net *net, return; nft_set_elem_change_active(net, set, &e->ext); - nft_set_elem_clear_busy(&e->ext); } /** diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 2aa3776c5fbb..ed14849aa47f 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -14,6 +14,9 @@ #include #include #include +#include + +extern unsigned int nf_tables_net_id; struct nft_rbtree { struct rb_root root; @@ -46,6 +49,12 @@ static int nft_rbtree_cmp(const struct nft_set *set, set->klen); } +static bool nft_rbtree_elem_expired(const struct nft_rbtree_elem *rbe) +{ + return nft_set_elem_expired(&rbe->ext) || + nft_set_elem_is_dead(&rbe->ext); +} + static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set, const u32 *key, const struct nft_set_ext **ext, unsigned int seq) @@ -80,7 +89,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set continue; } - if (nft_set_elem_expired(&rbe->ext)) + if (nft_rbtree_elem_expired(rbe)) return false; if (nft_rbtree_interval_end(rbe)) { @@ -98,7 +107,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set if (set->flags & NFT_SET_INTERVAL && interval != NULL && nft_set_elem_active(&interval->ext, genmask) && - !nft_set_elem_expired(&interval->ext) && + !nft_rbtree_elem_expired(interval) && nft_rbtree_interval_start(interval)) { *ext = &interval->ext; return true; @@ -214,6 +223,18 @@ static void *nft_rbtree_get(const struct net *net, const struct nft_set *set, return rbe; } +static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set, + struct nft_rbtree *priv, + struct nft_rbtree_elem *rbe) +{ + struct nft_set_elem elem = { + .priv = rbe, + }; + + nft_setelem_data_deactivate(net, set, &elem); + rb_erase(&rbe->node, &priv->root); +} + static int nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv, struct nft_rbtree_elem *rbe, @@ -221,11 +242,12 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set, { struct nft_set *set = (struct nft_set *)__set; struct rb_node *prev = rb_prev(&rbe->node); + struct net *net = read_pnet(&set->net); struct nft_rbtree_elem *rbe_prev; - struct nft_set_gc_batch *gcb; + struct nft_trans_gc *gc; - gcb = nft_set_gc_batch_check(set, NULL, GFP_ATOMIC); - if (!gcb) + gc = nft_trans_gc_alloc(set, 0, GFP_ATOMIC); + if (!gc) return -ENOMEM; /* search for end interval coming before this element. @@ -243,17 +265,28 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set, if (prev) { rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); + nft_rbtree_gc_remove(net, set, priv, rbe_prev); - rb_erase(&rbe_prev->node, &priv->root); - atomic_dec(&set->nelems); - nft_set_gc_batch_add(gcb, rbe_prev); + /* There is always room in this trans gc for this element, + * memory allocation never actually happens, hence, the warning + * splat in such case. No need to set NFT_SET_ELEM_DEAD_BIT, + * this is synchronous gc which never fails. 
+ */ + gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); + if (WARN_ON_ONCE(!gc)) + return -ENOMEM; + + nft_trans_gc_elem_add(gc, rbe_prev); } - rb_erase(&rbe->node, &priv->root); - atomic_dec(&set->nelems); + nft_rbtree_gc_remove(net, set, priv, rbe); + gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); + if (WARN_ON_ONCE(!gc)) + return -ENOMEM; - nft_set_gc_batch_add(gcb, rbe); - nft_set_gc_batch_complete(gcb); + nft_trans_gc_elem_add(gc, rbe); + + nft_trans_gc_queue_sync_done(gc); return 0; } @@ -481,7 +514,6 @@ static void nft_rbtree_activate(const struct net *net, struct nft_rbtree_elem *rbe = elem->priv; nft_set_elem_change_active(net, set, &rbe->ext); - nft_set_elem_clear_busy(&rbe->ext); } static bool nft_rbtree_flush(const struct net *net, @@ -489,12 +521,9 @@ static bool nft_rbtree_flush(const struct net *net, { struct nft_rbtree_elem *rbe = priv; - if (!nft_set_elem_mark_busy(&rbe->ext) || - !nft_is_active(net, &rbe->ext)) { - nft_set_elem_change_active(net, set, &rbe->ext); - return true; - } - return false; + nft_set_elem_change_active(net, set, &rbe->ext); + + return true; } static void *nft_rbtree_deactivate(const struct net *net, @@ -569,26 +598,40 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, static void nft_rbtree_gc(struct work_struct *work) { - struct nft_rbtree_elem *rbe, *rbe_end = NULL, *rbe_prev = NULL; - struct nft_set_gc_batch *gcb = NULL; + struct nft_rbtree_elem *rbe, *rbe_end = NULL; + struct nftables_pernet *nft_net; struct nft_rbtree *priv; + struct nft_trans_gc *gc; struct rb_node *node; struct nft_set *set; + unsigned int gc_seq; struct net *net; - u8 genmask; priv = container_of(work, struct nft_rbtree, gc_work.work); set = nft_set_container_of(priv); net = read_pnet(&set->net); - genmask = nft_genmask_cur(net); + nft_net = net_generic(net, nf_tables_net_id); + gc_seq = READ_ONCE(nft_net->gc_seq); + + gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL); + if (!gc) + goto done; write_lock_bh(&priv->lock); write_seqcount_begin(&priv->count); for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) { + + /* Ruleset has been updated, try later. 
*/ + if (READ_ONCE(nft_net->gc_seq) != gc_seq) { + nft_trans_gc_destroy(gc); + gc = NULL; + goto try_later; + } + rbe = rb_entry(node, struct nft_rbtree_elem, node); - if (!nft_set_elem_active(&rbe->ext, genmask)) - continue; + if (nft_set_elem_is_dead(&rbe->ext)) + goto dead_elem; /* elements are reversed in the rbtree for historical reasons, * from highest to lowest value, that is why end element is @@ -601,40 +644,33 @@ static void nft_rbtree_gc(struct work_struct *work) if (!nft_set_elem_expired(&rbe->ext)) continue; - if (nft_set_elem_mark_busy(&rbe->ext)) { - rbe_end = NULL; + nft_set_elem_dead(&rbe->ext); + + if (!rbe_end) continue; - } - if (rbe_prev) { - rb_erase(&rbe_prev->node, &priv->root); - rbe_prev = NULL; - } - gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); - if (!gcb) - break; + nft_set_elem_dead(&rbe_end->ext); - atomic_dec(&set->nelems); - nft_set_gc_batch_add(gcb, rbe); - rbe_prev = rbe; + gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); + if (!gc) + goto try_later; - if (rbe_end) { - atomic_dec(&set->nelems); - nft_set_gc_batch_add(gcb, rbe_end); - rb_erase(&rbe_end->node, &priv->root); - rbe_end = NULL; - } - node = rb_next(node); - if (!node) - break; + nft_trans_gc_elem_add(gc, rbe_end); + rbe_end = NULL; +dead_elem: + gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); + if (!gc) + goto try_later; + + nft_trans_gc_elem_add(gc, rbe); } - if (rbe_prev) - rb_erase(&rbe_prev->node, &priv->root); +try_later: write_seqcount_end(&priv->count); write_unlock_bh(&priv->lock); - nft_set_gc_batch_complete(gcb); - + if (gc) + nft_trans_gc_queue_async_done(gc); +done: queue_delayed_work(system_power_efficient_wq, &priv->gc_work, nft_set_gc_interval(set)); } From 77046cb00850e35ba935944b5100996b2ce34bba Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 22 Sep 2023 19:01:06 +0200 Subject: [PATCH 020/228] netfilter: nft_set_hash: mark set element as dead when deleting from packet path commit c92db3030492b8ad1d0faace7a93bbcf53850d0c upstream. Set on the NFT_SET_ELEM_DEAD_BIT flag on this element, instead of performing element removal which might race with an ongoing transaction. Enable gc when dynamic flag is set on since dynset deletion requires garbage collection after this patch. Fixes: d0a8d877da97 ("netfilter: nft_dynset: support for element deletion") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- net/netfilter/nft_set_hash.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 6ae99b3107bc..9cdf348b048a 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -251,7 +251,9 @@ static bool nft_rhash_delete(const struct nft_set *set, if (he == NULL) return false; - return rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params) == 0; + nft_set_elem_dead(&he->ext); + + return true; } static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, @@ -398,7 +400,7 @@ static int nft_rhash_init(const struct nft_set *set, return err; INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rhash_gc); - if (set->flags & NFT_SET_TIMEOUT) + if (set->flags & (NFT_SET_TIMEOUT | NFT_SET_EVAL)) nft_rhash_gc_init(set); return 0; From 911dd3cdf1083f4c2e7df72aaab486a1d6dbcc0a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 22 Sep 2023 19:01:07 +0200 Subject: [PATCH 021/228] netfilter: nf_tables: remove busy mark and gc batch API commit a2dd0233cbc4d8a0abb5f64487487ffc9265beb5 upstream. 
Ditch it, it has been replace it by the GC transaction API and it has no clients anymore. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- include/net/netfilter/nf_tables.h | 97 +------------------------------ net/netfilter/nf_tables_api.c | 26 +-------- 2 files changed, 5 insertions(+), 118 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 39a0b37e8a1a..9182b583d429 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -695,62 +695,6 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem, void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, const struct nft_set *set, void *elem); -/** - * struct nft_set_gc_batch_head - nf_tables set garbage collection batch - * - * @rcu: rcu head - * @set: set the elements belong to - * @cnt: count of elements - */ -struct nft_set_gc_batch_head { - struct rcu_head rcu; - const struct nft_set *set; - unsigned int cnt; -}; - -#define NFT_SET_GC_BATCH_SIZE ((PAGE_SIZE - \ - sizeof(struct nft_set_gc_batch_head)) / \ - sizeof(void *)) - -/** - * struct nft_set_gc_batch - nf_tables set garbage collection batch - * - * @head: GC batch head - * @elems: garbage collection elements - */ -struct nft_set_gc_batch { - struct nft_set_gc_batch_head head; - void *elems[NFT_SET_GC_BATCH_SIZE]; -}; - -struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set, - gfp_t gfp); -void nft_set_gc_batch_release(struct rcu_head *rcu); - -static inline void nft_set_gc_batch_complete(struct nft_set_gc_batch *gcb) -{ - if (gcb != NULL) - call_rcu(&gcb->head.rcu, nft_set_gc_batch_release); -} - -static inline struct nft_set_gc_batch * -nft_set_gc_batch_check(const struct nft_set *set, struct nft_set_gc_batch *gcb, - gfp_t gfp) -{ - if (gcb != NULL) { - if (gcb->head.cnt + 1 < ARRAY_SIZE(gcb->elems)) - return gcb; - nft_set_gc_batch_complete(gcb); - } - return nft_set_gc_batch_alloc(set, gfp); -} - -static inline void nft_set_gc_batch_add(struct nft_set_gc_batch *gcb, - void *elem) -{ - gcb->elems[gcb->head.cnt++] = elem; -} - struct nft_expr_ops; /** * struct nft_expr_type - nf_tables expression type @@ -1418,47 +1362,12 @@ static inline void nft_set_elem_change_active(const struct net *net, #endif /* IS_ENABLED(CONFIG_NF_TABLES) */ -/* - * We use a free bit in the genmask field to indicate the element - * is busy, meaning it is currently being processed either by - * the netlink API or GC. - * - * Even though the genmask is only a single byte wide, this works - * because the extension structure if fully constant once initialized, - * so there are no non-atomic write accesses unless it is already - * marked busy. 
- */ -#define NFT_SET_ELEM_BUSY_MASK (1 << 2) +#define NFT_SET_ELEM_DEAD_MASK (1 << 2) #if defined(__LITTLE_ENDIAN_BITFIELD) -#define NFT_SET_ELEM_BUSY_BIT 2 +#define NFT_SET_ELEM_DEAD_BIT 2 #elif defined(__BIG_ENDIAN_BITFIELD) -#define NFT_SET_ELEM_BUSY_BIT (BITS_PER_LONG - BITS_PER_BYTE + 2) -#else -#error -#endif - -static inline int nft_set_elem_mark_busy(struct nft_set_ext *ext) -{ - unsigned long *word = (unsigned long *)ext; - - BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0); - return test_and_set_bit(NFT_SET_ELEM_BUSY_BIT, word); -} - -static inline void nft_set_elem_clear_busy(struct nft_set_ext *ext) -{ - unsigned long *word = (unsigned long *)ext; - - clear_bit(NFT_SET_ELEM_BUSY_BIT, word); -} - -#define NFT_SET_ELEM_DEAD_MASK (1 << 3) - -#if defined(__LITTLE_ENDIAN_BITFIELD) -#define NFT_SET_ELEM_DEAD_BIT 3 -#elif defined(__BIG_ENDIAN_BITFIELD) -#define NFT_SET_ELEM_DEAD_BIT (BITS_PER_LONG - BITS_PER_BYTE + 3) +#define NFT_SET_ELEM_DEAD_BIT (BITS_PER_LONG - BITS_PER_BYTE + 2) #else #error #endif diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 1f06dd065d75..206755eb35f3 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5637,7 +5637,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, goto err_elem_expr; } - ext->genmask = nft_genmask_cur(ctx->net) | NFT_SET_ELEM_BUSY_MASK; + ext->genmask = nft_genmask_cur(ctx->net); + err = set->ops->insert(ctx->net, set, &elem, &ext2); if (err) { if (err == -EEXIST) { @@ -5945,29 +5946,6 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk, return err; } -void nft_set_gc_batch_release(struct rcu_head *rcu) -{ - struct nft_set_gc_batch *gcb; - unsigned int i; - - gcb = container_of(rcu, struct nft_set_gc_batch, head.rcu); - for (i = 0; i < gcb->head.cnt; i++) - nft_set_elem_destroy(gcb->head.set, gcb->elems[i], true); - kfree(gcb); -} - -struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set, - gfp_t gfp) -{ - struct nft_set_gc_batch *gcb; - - gcb = kzalloc(sizeof(*gcb), gfp); - if (gcb == NULL) - return gcb; - gcb->head.set = set; - return gcb; -} - /* * Stateful objects */ From 891ca5dfe3b718b441fc786014a7ba8f517da188 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 22 Sep 2023 19:01:08 +0200 Subject: [PATCH 022/228] netfilter: nf_tables: don't fail inserts if duplicate has expired commit 7845914f45f066497ac75b30c50dbc735e84e884 upstream. nftables selftests fail: run-tests.sh testcases/sets/0044interval_overlap_0 Expected: 0-2 . 0-3, got: W: [FAILED] ./testcases/sets/0044interval_overlap_0: got 1 Insertion must ignore duplicate but expired entries. Moreover, there is a strange asymmetry in nft_pipapo_activate: It refetches the current element, whereas the other ->activate callbacks (bitmap, hash, rhash, rbtree) use elem->priv. Same for .remove: other set implementations take elem->priv, nft_pipapo_remove fetches elem->priv, then does a relookup, remove this. I suspect this was the reason for the change that prompted the removal of the expired check in pipapo_get() in the first place, but skipping exired elements there makes no sense to me, this helper is used for normal get requests, insertions (duplicate check) and deactivate callback. In first two cases expired elements must be skipped. For ->deactivate(), this gets called for DELSETELEM, so it seems to me that expired elements should be skipped as well, i.e. delete request should fail with -ENOENT error. 
Fixes: 24138933b97b ("netfilter: nf_tables: don't skip expired elements during walk") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- net/netfilter/nft_set_pipapo.c | 25 +++++-------------------- 1 file changed, 5 insertions(+), 20 deletions(-) diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 63d0723950d3..80440ac5d44c 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -566,6 +566,8 @@ static struct nft_pipapo_elem *pipapo_get(const struct net *net, goto out; if (last) { + if (nft_set_elem_expired(&f->mt[b].e->ext)) + goto next_match; if ((genmask && !nft_set_elem_active(&f->mt[b].e->ext, genmask))) goto next_match; @@ -600,17 +602,8 @@ static struct nft_pipapo_elem *pipapo_get(const struct net *net, static void *nft_pipapo_get(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, unsigned int flags) { - struct nft_pipapo_elem *ret; - - ret = pipapo_get(net, set, (const u8 *)elem->key.val.data, - nft_genmask_cur(net)); - if (IS_ERR(ret)) - return ret; - - if (nft_set_elem_expired(&ret->ext)) - return ERR_PTR(-ENOENT); - - return ret; + return pipapo_get(net, set, (const u8 *)elem->key.val.data, + nft_genmask_cur(net)); } /** @@ -1737,11 +1730,7 @@ static void nft_pipapo_activate(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem) { - struct nft_pipapo_elem *e; - - e = pipapo_get(net, set, (const u8 *)elem->key.val.data, 0); - if (IS_ERR(e)) - return; + struct nft_pipapo_elem *e = elem->priv; nft_set_elem_change_active(net, set, &e->ext); } @@ -1955,10 +1944,6 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, data = (const u8 *)nft_set_ext_key(&e->ext); - e = pipapo_get(net, set, data, 0); - if (IS_ERR(e)) - return; - while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) { union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; const u8 *match_start, *match_end; From 4046f2b56e5a7ba7e123ff961dd51187b8d59e78 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 22 Sep 2023 19:01:09 +0200 Subject: [PATCH 023/228] netfilter: nf_tables: fix GC transaction races with netns and netlink event exit path commit 6a33d8b73dfac0a41f3877894b38082bd0c9a5bc upstream. Netlink event path is missing a synchronization point with GC transactions. Add GC sequence number update to netns release path and netlink event path, any GC transaction losing race will be discarded. Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal Signed-off-by: Sasha Levin --- net/netfilter/nf_tables_api.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 206755eb35f3..43da2f0a5262 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8320,6 +8320,22 @@ static void nft_set_commit_update(struct list_head *set_update_list) } } +static unsigned int nft_gc_seq_begin(struct nftables_pernet *nft_net) +{ + unsigned int gc_seq; + + /* Bump gc counter, it becomes odd, this is the busy mark. 
*/ + gc_seq = READ_ONCE(nft_net->gc_seq); + WRITE_ONCE(nft_net->gc_seq, ++gc_seq); + + return gc_seq; +} + +static void nft_gc_seq_end(struct nftables_pernet *nft_net, unsigned int gc_seq) +{ + WRITE_ONCE(nft_net->gc_seq, ++gc_seq); +} + static int nf_tables_commit(struct net *net, struct sk_buff *skb) { struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); @@ -8401,9 +8417,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) while (++nft_net->base_seq == 0) ; - /* Bump gc counter, it becomes odd, this is the busy mark. */ - gc_seq = READ_ONCE(nft_net->gc_seq); - WRITE_ONCE(nft_net->gc_seq, ++gc_seq); + gc_seq = nft_gc_seq_begin(nft_net); /* step 3. Start new generation, rules_gen_X now in use. */ net->nft.gencursor = nft_gencursor_next(net); @@ -8583,7 +8597,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN); nf_tables_commit_audit_log(&adl, nft_net->base_seq); - WRITE_ONCE(nft_net->gc_seq, ++gc_seq); + nft_gc_seq_end(nft_net, gc_seq); nf_tables_commit_release(net); return 0; @@ -9538,11 +9552,18 @@ static void __net_exit nf_tables_pre_exit_net(struct net *net) static void __net_exit nf_tables_exit_net(struct net *net) { struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + unsigned int gc_seq; mutex_lock(&nft_net->commit_mutex); + + gc_seq = nft_gc_seq_begin(nft_net); + if (!list_empty(&nft_net->commit_list)) __nf_tables_abort(net, NFNL_ABORT_NONE); __nft_release_tables(net); + + nft_gc_seq_end(nft_net, gc_seq); + mutex_unlock(&nft_net->commit_mutex); WARN_ON_ONCE(!list_empty(&nft_net->tables)); WARN_ON_ONCE(!list_empty(&nft_net->module_list)); From dc0b1f019554e601f57e78d8f5c70e59d77e49a5 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 22 Sep 2023 19:01:10 +0200 Subject: [PATCH 024/228] netfilter: nf_tables: GC transaction race with netns dismantle commit 02c6c24402bf1c1e986899c14ba22a10b510916b upstream. Use maybe_get_net() since GC workqueue might race with netns exit path. Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal Signed-off-by: Sasha Levin --- net/netfilter/nf_tables_api.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 43da2f0a5262..78bf82f89ecd 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8089,9 +8089,14 @@ struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set, if (!trans) return NULL; + trans->net = maybe_get_net(net); + if (!trans->net) { + kfree(trans); + return NULL; + } + refcount_inc(&set->refs); trans->set = set; - trans->net = get_net(net); trans->seq = gc_seq; return trans; From 23292bdfda5f04e704a843b8f97b0eb95ace1ca6 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 22 Sep 2023 19:01:11 +0200 Subject: [PATCH 025/228] netfilter: nf_tables: GC transaction race with abort path commit 720344340fb9be2765bbaab7b292ece0a4570eae upstream. Abort path is missing a synchronization point with GC transactions. Add GC sequence number hence any GC transaction losing race will be discarded. 
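To summarize the synchronization scheme this and the previous fixes converge on (illustration only, not from the upstream commit): every control plane path that mutates the ruleset, commit, abort and netns teardown alike, brackets itself with nft_gc_seq_begin()/nft_gc_seq_end() under the commit mutex; async GC batches record the sequence value sampled at allocation time, and the GC worker discards any batch whose recorded value no longer matches nft_net->gc_seq. example_mutation_path() below is a hypothetical composite of those call sites.

/* Illustrative sketch only, not part of this patch. */
static void example_mutation_path(struct net *net)
{
	struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id);
	unsigned int gc_seq;

	mutex_lock(&nft_net->commit_mutex);
	gc_seq = nft_gc_seq_begin(nft_net);	/* counter becomes odd: busy */

	/* ... apply or abort the transaction batch, or release tables ... */

	nft_gc_seq_end(nft_net, gc_seq);	/* counter becomes even again */
	mutex_unlock(&nft_net->commit_mutex);
}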
Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- net/netfilter/nf_tables_api.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 78bf82f89ecd..1f67931b86d8 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8844,7 +8844,12 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb, enum nfnl_abort_action action) { struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); - int ret = __nf_tables_abort(net, action); + unsigned int gc_seq; + int ret; + + gc_seq = nft_gc_seq_begin(nft_net); + ret = __nf_tables_abort(net, action); + nft_gc_seq_end(nft_net, gc_seq); mutex_unlock(&nft_net->commit_mutex); From b71dcee2fc9c07289f63c0122a5d39108f476ce1 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 22 Sep 2023 19:01:12 +0200 Subject: [PATCH 026/228] netfilter: nf_tables: use correct lock to protect gc_list commit 8357bc946a2abc2a10ca40e5a2105d2b4c57515e upstream. Use nf_tables_gc_list_lock spinlock, not nf_tables_destroy_list_lock to protect the gc_list. Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- net/netfilter/nf_tables_api.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 1f67931b86d8..9fc302a6836b 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8065,9 +8065,9 @@ static void nft_trans_gc_work(struct work_struct *work) struct nft_trans_gc *trans, *next; LIST_HEAD(trans_gc_list); - spin_lock(&nf_tables_destroy_list_lock); + spin_lock(&nf_tables_gc_list_lock); list_splice_init(&nf_tables_gc_list, &trans_gc_list); - spin_unlock(&nf_tables_destroy_list_lock); + spin_unlock(&nf_tables_gc_list_lock); list_for_each_entry_safe(trans, next, &trans_gc_list, list) { list_del(&trans->list); From 09f2dda1e5762a60bad113bc1c04eeaf805ee565 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 22 Sep 2023 19:01:13 +0200 Subject: [PATCH 027/228] netfilter: nf_tables: defer gc run if previous batch is still pending commit 8e51830e29e12670b4c10df070a4ea4c9593e961 upstream. Don't queue more gc work, else we may queue the same elements multiple times. If an element is flagged as dead, this can mean that either the previous gc request was invalidated/discarded by a transaction or that the previous request is still pending in the system work queue. The latter will happen if the gc interval is set to a very low value, e.g. 1ms, and system work queue is backlogged. The sets refcount is 1 if no previous gc requeusts are queued, so add a helper for this and skip gc run if old requests are pending. Add a helper for this and skip the gc run in this case. 
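For the record, why the refcount check is a reliable "GC pending" test (illustration only, not from the upstream commit): nft_trans_gc_alloc() takes an extra reference on the set and nft_trans_gc_destroy() drops it, so on top of the set's base reference of 1 any additional reference means an earlier batch has not been processed yet. The helper added below therefore reduces to:

/* Illustrative sketch only: refcount != 1 means GC batches are in flight. */
static bool example_gc_already_pending(const struct nft_set *set)
{
	return refcount_read(&set->refs) != 1;
}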
Fixes: f6c383b8c31a ("netfilter: nf_tables: adapt set backend to use GC transaction API") Signed-off-by: Florian Westphal Signed-off-by: Sasha Levin --- include/net/netfilter/nf_tables.h | 5 +++++ net/netfilter/nft_set_hash.c | 3 +++ net/netfilter/nft_set_rbtree.c | 3 +++ 3 files changed, 11 insertions(+) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 9182b583d429..bbe472c07d07 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -479,6 +479,11 @@ static inline void *nft_set_priv(const struct nft_set *set) return (void *)set->data; } +static inline bool nft_set_gc_is_pending(const struct nft_set *s) +{ + return refcount_read(&s->refs) != 1; +} + static inline struct nft_set *nft_set_container_of(const void *priv) { return (void *)priv - offsetof(struct nft_set, data); diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 9cdf348b048a..68a16ee37b3d 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -312,6 +312,9 @@ static void nft_rhash_gc(struct work_struct *work) nft_net = net_generic(net, nf_tables_net_id); gc_seq = READ_ONCE(nft_net->gc_seq); + if (nft_set_gc_is_pending(set)) + goto done; + gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL); if (!gc) goto done; diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index ed14849aa47f..9b0bdd421615 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -613,6 +613,9 @@ static void nft_rbtree_gc(struct work_struct *work) nft_net = net_generic(net, nf_tables_net_id); gc_seq = READ_ONCE(nft_net->gc_seq); + if (nft_set_gc_is_pending(set)) + goto done; + gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL); if (!gc) goto done; From c323ed65f66e5387ee0a73452118d49f1dae81b8 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 22 Sep 2023 19:01:14 +0200 Subject: [PATCH 028/228] netfilter: nft_set_rbtree: skip sync GC for new elements in this transaction commit 2ee52ae94baabf7ee09cf2a8d854b990dac5d0e4 upstream. New elements in this transaction might expired before such transaction ends. Skip sync GC for such elements otherwise commit path might walk over an already released object. Once transaction is finished, async GC will collect such expired element. Fixes: f6c383b8c31a ("netfilter: nf_tables: adapt set backend to use GC transaction API") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- net/netfilter/nft_set_rbtree.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 9b0bdd421615..535076b4de53 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -314,6 +314,7 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, struct nft_rbtree_elem *rbe, *rbe_le = NULL, *rbe_ge = NULL; struct rb_node *node, *next, *parent, **p, *first = NULL; struct nft_rbtree *priv = nft_set_priv(set); + u8 cur_genmask = nft_genmask_cur(net); u8 genmask = nft_genmask_next(net); int d, err; @@ -359,8 +360,11 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, if (!nft_set_elem_active(&rbe->ext, genmask)) continue; - /* perform garbage collection to avoid bogus overlap reports. */ - if (nft_set_elem_expired(&rbe->ext)) { + /* perform garbage collection to avoid bogus overlap reports + * but skip new elements in this transaction. 
+ */ + if (nft_set_elem_expired(&rbe->ext) && + nft_set_elem_active(&rbe->ext, cur_genmask)) { err = nft_rbtree_gc_elem(set, priv, rbe, genmask); if (err < 0) return err; From b796c4e4bf29bbf9481eac0517449e4ce6385ad8 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 22 Sep 2023 19:01:15 +0200 Subject: [PATCH 029/228] netfilter: nft_set_rbtree: use read spinlock to avoid datapath contention commit 96b33300fba880ec0eafcf3d82486f3463b4b6da upstream. rbtree GC does not modify the datastructure, instead it collects expired elements and it enqueues a GC transaction. Use a read spinlock instead to avoid data contention while GC worker is running. Fixes: f6c383b8c31a ("netfilter: nf_tables: adapt set backend to use GC transaction API") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- net/netfilter/nft_set_rbtree.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 535076b4de53..cc32e19b4041 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -624,8 +624,7 @@ static void nft_rbtree_gc(struct work_struct *work) if (!gc) goto done; - write_lock_bh(&priv->lock); - write_seqcount_begin(&priv->count); + read_lock_bh(&priv->lock); for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) { /* Ruleset has been updated, try later. */ @@ -672,8 +671,7 @@ static void nft_rbtree_gc(struct work_struct *work) nft_trans_gc_elem_add(gc, rbe); } try_later: - write_seqcount_end(&priv->count); - write_unlock_bh(&priv->lock); + read_unlock_bh(&priv->lock); if (gc) nft_trans_gc_queue_async_done(gc); From 26d0e4d632f8afee2ac73a5c6fb28e40144403af Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 22 Sep 2023 19:01:16 +0200 Subject: [PATCH 030/228] netfilter: nft_set_pipapo: stop GC iteration if GC transaction allocation fails commit 6d365eabce3c018a80f6e0379b17df2abb17405e upstream. nft_trans_gc_queue_sync() enqueues the GC transaction and it allocates a new one. If this allocation fails, then stop this GC sync run and retry later. Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- net/netfilter/nft_set_pipapo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 80440ac5d44c..fbfcc3275cad 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -1595,7 +1595,7 @@ static void pipapo_gc(const struct nft_set *_set, struct nft_pipapo_match *m) gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); if (!gc) - break; + return; nft_pipapo_gc_deactivate(net, set, e); pipapo_drop(m, rulemap); From 4deaf1316b42ae9f6dff0911ba75435ab3475d5c Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 22 Sep 2023 19:01:17 +0200 Subject: [PATCH 031/228] netfilter: nft_set_hash: try later when GC hits EAGAIN on iteration commit b079155faae94e9b3ab9337e82100a914ebb4e8d upstream. Skip GC run if iterator rewinds to the beginning with EAGAIN, otherwise GC might collect the same element more than once. 
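Background for the hunk below, as an illustration only (not from the upstream commit): rhashtable_walk_next() reports a concurrent resize with ERR_PTR(-EAGAIN), and per the commit message above the iterator then rewinds, so elements already queued into the current GC batch could be visited and queued a second time. A plain walker may simply continue; a batching GC run is safer dropping the whole batch and retrying on the next interval, roughly as sketched in the hypothetical example_rhash_gc_walk().

/* Illustrative sketch only, not part of this patch. */
static void example_rhash_gc_walk(struct rhashtable *ht, struct nft_trans_gc *gc)
{
	struct rhashtable_iter hti;
	void *he;

	rhashtable_walk_enter(ht, &hti);
	rhashtable_walk_start(&hti);

	while ((he = rhashtable_walk_next(&hti))) {
		if (IS_ERR(he)) {
			/* -EAGAIN (resize) or a real error: elements may repeat,
			 * so give up this run rather than keep collecting.
			 */
			nft_trans_gc_destroy(gc);
			gc = NULL;
			break;
		}
		/* ... expiry checks and nft_trans_gc_elem_add() go here ... */
	}

	rhashtable_walk_stop(&hti);
	rhashtable_walk_exit(&hti);

	if (gc)
		nft_trans_gc_queue_async_done(gc);
}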
Fixes: f6c383b8c31a ("netfilter: nf_tables: adapt set backend to use GC transaction API") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- net/netfilter/nft_set_hash.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 68a16ee37b3d..f0a9ad1c4ea4 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -324,12 +324,9 @@ static void nft_rhash_gc(struct work_struct *work) while ((he = rhashtable_walk_next(&hti))) { if (IS_ERR(he)) { - if (PTR_ERR(he) != -EAGAIN) { - nft_trans_gc_destroy(gc); - gc = NULL; - goto try_later; - } - continue; + nft_trans_gc_destroy(gc); + gc = NULL; + goto try_later; } /* Ruleset has been updated, try later. */ From 09c85f2d21ab6b5acba31a037985b13e8e6565b8 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 22 Sep 2023 19:01:18 +0200 Subject: [PATCH 032/228] netfilter: nf_tables: fix memleak when more than 255 elements expired commit cf5000a7787cbc10341091d37245a42c119d26c5 upstream. When more than 255 elements expired we're supposed to switch to a new gc container structure. This never happens: u8 type will wrap before reaching the boundary and nft_trans_gc_space() always returns true. This means we recycle the initial gc container structure and lose track of the elements that came before. While at it, don't deref 'gc' after we've passed it to call_rcu. Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") Reported-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- include/net/netfilter/nf_tables.h | 2 +- net/netfilter/nf_tables_api.c | 10 ++++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index bbe472c07d07..5619642b9ad4 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1525,7 +1525,7 @@ struct nft_trans_gc { struct net *net; struct nft_set *set; u32 seq; - u8 count; + u16 count; void *priv[NFT_TRANS_GC_BATCHCOUNT]; struct rcu_head rcu; }; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 9fc302a6836b..32c97cc87ddc 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8124,12 +8124,15 @@ static int nft_trans_gc_space(struct nft_trans_gc *trans) struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc, unsigned int gc_seq, gfp_t gfp) { + struct nft_set *set; + if (nft_trans_gc_space(gc)) return gc; + set = gc->set; nft_trans_gc_queue_work(gc); - return nft_trans_gc_alloc(gc->set, gc_seq, gfp); + return nft_trans_gc_alloc(set, gc_seq, gfp); } void nft_trans_gc_queue_async_done(struct nft_trans_gc *trans) @@ -8144,15 +8147,18 @@ void nft_trans_gc_queue_async_done(struct nft_trans_gc *trans) struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp) { + struct nft_set *set; + if (WARN_ON_ONCE(!lockdep_commit_lock_is_held(gc->net))) return NULL; if (nft_trans_gc_space(gc)) return gc; + set = gc->set; call_rcu(&gc->rcu, nft_trans_gc_trans_free); - return nft_trans_gc_alloc(gc->set, 0, gfp); + return nft_trans_gc_alloc(set, 0, gfp); } void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans) From a45632f9971315aca1330213fd27e128c77dfc4d Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Thu, 7 Sep 2023 11:05:04 +0200 Subject: [PATCH 033/228] ASoC: meson: spdifin: start hw on dai probe [ Upstream 
commit aedf323b66b2b875137422ecb7d2525179759076 ] For spdif input to report the locked rate correctly, even when no capture is running, the HW and reference clock must be started as soon as the dai is probed. Fixes: 5ce5658375e6 ("ASoC: meson: add axg spdif input") Signed-off-by: Jerome Brunet Link: https://lore.kernel.org/r/20230907090504.12700-1-jbrunet@baylibre.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- sound/soc/meson/axg-spdifin.c | 49 ++++++++++++----------------------- 1 file changed, 17 insertions(+), 32 deletions(-) diff --git a/sound/soc/meson/axg-spdifin.c b/sound/soc/meson/axg-spdifin.c index d0d09f945b48..7aaded1fc376 100644 --- a/sound/soc/meson/axg-spdifin.c +++ b/sound/soc/meson/axg-spdifin.c @@ -112,34 +112,6 @@ static int axg_spdifin_prepare(struct snd_pcm_substream *substream, return 0; } -static int axg_spdifin_startup(struct snd_pcm_substream *substream, - struct snd_soc_dai *dai) -{ - struct axg_spdifin *priv = snd_soc_dai_get_drvdata(dai); - int ret; - - ret = clk_prepare_enable(priv->refclk); - if (ret) { - dev_err(dai->dev, - "failed to enable spdifin reference clock\n"); - return ret; - } - - regmap_update_bits(priv->map, SPDIFIN_CTRL0, SPDIFIN_CTRL0_EN, - SPDIFIN_CTRL0_EN); - - return 0; -} - -static void axg_spdifin_shutdown(struct snd_pcm_substream *substream, - struct snd_soc_dai *dai) -{ - struct axg_spdifin *priv = snd_soc_dai_get_drvdata(dai); - - regmap_update_bits(priv->map, SPDIFIN_CTRL0, SPDIFIN_CTRL0_EN, 0); - clk_disable_unprepare(priv->refclk); -} - static void axg_spdifin_write_mode_param(struct regmap *map, int mode, unsigned int val, unsigned int num_per_reg, @@ -251,25 +223,38 @@ static int axg_spdifin_dai_probe(struct snd_soc_dai *dai) ret = axg_spdifin_sample_mode_config(dai, priv); if (ret) { dev_err(dai->dev, "mode configuration failed\n"); - clk_disable_unprepare(priv->pclk); - return ret; + goto pclk_err; } + ret = clk_prepare_enable(priv->refclk); + if (ret) { + dev_err(dai->dev, + "failed to enable spdifin reference clock\n"); + goto pclk_err; + } + + regmap_update_bits(priv->map, SPDIFIN_CTRL0, SPDIFIN_CTRL0_EN, + SPDIFIN_CTRL0_EN); + return 0; + +pclk_err: + clk_disable_unprepare(priv->pclk); + return ret; } static int axg_spdifin_dai_remove(struct snd_soc_dai *dai) { struct axg_spdifin *priv = snd_soc_dai_get_drvdata(dai); + regmap_update_bits(priv->map, SPDIFIN_CTRL0, SPDIFIN_CTRL0_EN, 0); + clk_disable_unprepare(priv->refclk); clk_disable_unprepare(priv->pclk); return 0; } static const struct snd_soc_dai_ops axg_spdifin_ops = { .prepare = axg_spdifin_prepare, - .startup = axg_spdifin_startup, - .shutdown = axg_spdifin_shutdown, }; static int axg_spdifin_iec958_info(struct snd_kcontrol *kcontrol, From 6dc85d848c264f9fae506c2b6967697222c9b58e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 10 Sep 2023 19:04:45 +0200 Subject: [PATCH 034/228] netfilter: nf_tables: disallow element removal on anonymous sets [ Upstream commit 23a3bfd4ba7acd36abf52b78605f61b21bdac216 ] Anonymous sets need to be populated once at creation and then they are bound to rule since 938154b93be8 ("netfilter: nf_tables: reject unbound anonymous set before commit phase"), otherwise transaction reports EINVAL. Userspace does not need to delete elements of anonymous sets that are not yet bound, reject this with EOPNOTSUPP. From flush command path, skip anonymous sets, they are expected to be bound already. Otherwise, EINVAL is hit at the end of this transaction for unbound sets. 
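Condensed, the checks in the element-delete path end up as follows (illustrative sketch only; the complete hunk is below):

    if (nft_set_is_anonymous(set))
            return -EOPNOTSUPP;     /* anonymous sets are filled at creation, then bound */

    if (!list_empty(&set->bindings) && (set->flags & NFT_SET_CONSTANT))
            return -EBUSY;          /* bound constant sets must not be modified */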
Fixes: 96518518cc41 ("netfilter: add nftables") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- net/netfilter/nf_tables_api.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 32c97cc87ddc..52c776b5967e 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1267,8 +1267,7 @@ static int nft_flush_table(struct nft_ctx *ctx) if (!nft_is_active_next(ctx->net, set)) continue; - if (nft_set_is_anonymous(set) && - !list_empty(&set->bindings)) + if (nft_set_is_anonymous(set)) continue; err = nft_delset(ctx, set); @@ -5922,8 +5921,10 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk, if (IS_ERR(set)) return PTR_ERR(set); - if (!list_empty(&set->bindings) && - (set->flags & (NFT_SET_CONSTANT | NFT_SET_ANONYMOUS))) + if (nft_set_is_anonymous(set)) + return -EOPNOTSUPP; + + if (!list_empty(&set->bindings) && (set->flags & NFT_SET_CONSTANT)) return -EBUSY; if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) { From 388c9d3eefaea99828ee5000c693128a5b41ee7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Mon, 11 Sep 2023 15:28:14 +0200 Subject: [PATCH 035/228] bpf: Avoid deadlock when using queue and stack maps from NMI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit a34a9f1a19afe9c60ca0ea61dfeee63a1c2baac8 ] Sysbot discovered that the queue and stack maps can deadlock if they are being used from a BPF program that can be called from NMI context (such as one that is attached to a perf HW counter event). To fix this, add an in_nmi() check and use raw_spin_trylock() in NMI context, erroring out if grabbing the lock fails. Fixes: f1a2e44a3aec ("bpf: add queue and stack maps") Reported-by: Hsin-Wei Hung Tested-by: Hsin-Wei Hung Co-developed-by: Hsin-Wei Hung Signed-off-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20230911132815.717240-1-toke@redhat.com Signed-off-by: Alexei Starovoitov Signed-off-by: Sasha Levin --- kernel/bpf/queue_stack_maps.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 0ee2347ba510..a047a2053d41 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -111,7 +111,12 @@ static int __queue_map_get(struct bpf_map *map, void *value, bool delete) int err = 0; void *ptr; - raw_spin_lock_irqsave(&qs->lock, flags); + if (in_nmi()) { + if (!raw_spin_trylock_irqsave(&qs->lock, flags)) + return -EBUSY; + } else { + raw_spin_lock_irqsave(&qs->lock, flags); + } if (queue_stack_map_is_empty(qs)) { memset(value, 0, qs->map.value_size); @@ -141,7 +146,12 @@ static int __stack_map_get(struct bpf_map *map, void *value, bool delete) void *ptr; u32 index; - raw_spin_lock_irqsave(&qs->lock, flags); + if (in_nmi()) { + if (!raw_spin_trylock_irqsave(&qs->lock, flags)) + return -EBUSY; + } else { + raw_spin_lock_irqsave(&qs->lock, flags); + } if (queue_stack_map_is_empty(qs)) { memset(value, 0, qs->map.value_size); @@ -206,7 +216,12 @@ static int queue_stack_map_push_elem(struct bpf_map *map, void *value, if (flags & BPF_NOEXIST || flags > BPF_EXIST) return -EINVAL; - raw_spin_lock_irqsave(&qs->lock, irq_flags); + if (in_nmi()) { + if (!raw_spin_trylock_irqsave(&qs->lock, irq_flags)) + return -EBUSY; + } else { + raw_spin_lock_irqsave(&qs->lock, irq_flags); + } if (queue_stack_map_is_full(qs)) { if (!replace) { From 
78ef69b6e7705e0da62d2bb23ee92b95a00a1070 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 26 May 2021 20:27:19 -0700 Subject: [PATCH 036/228] selftests/tls: Add {} to avoid static checker warning [ Upstream commit f50688b47c5858d2ff315d020332bf4cb6710837 ] This silences a static checker warning due to the unusual macro construction of EXPECT_*() by adding explicit {}s around the enclosing while loop. Reported-by: Dan Carpenter Fixes: 7f657d5bf507 ("selftests: tls: add selftests for TLS sockets") Signed-off-by: Kees Cook Signed-off-by: Shuah Khan Stable-dep-of: c326ca98446e ("selftests: tls: swap the TX and RX sockets in some tests") Signed-off-by: Sasha Levin --- tools/testing/selftests/net/tls.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index b599f1fa99b5..44984741bd41 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -387,8 +387,9 @@ TEST_F(tls, sendmsg_large) EXPECT_EQ(sendmsg(self->cfd, &msg, 0), send_len); } - while (recvs++ < sends) + while (recvs++ < sends) { EXPECT_NE(recv(self->fd, mem, send_len, 0), -1); + } free(mem); } From c4ecedf980b0b7e11947d2c97c7320ec771e998b Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Tue, 12 Sep 2023 16:16:25 +0200 Subject: [PATCH 037/228] selftests: tls: swap the TX and RX sockets in some tests [ Upstream commit c326ca98446e0ae4fee43a40acf79412b74cfedb ] tls.sendmsg_large and tls.sendmsg_multiple are trying to send through the self->cfd socket (only configured with TLS_RX) and to receive through the self->fd socket (only configured with TLS_TX), so they're not using kTLS at all. Swap the sockets. Fixes: 7f657d5bf507 ("selftests: tls: add selftests for TLS sockets") Signed-off-by: Sabrina Dubroca Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- tools/testing/selftests/net/tls.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index 44984741bd41..44a25a9f1f72 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -384,11 +384,11 @@ TEST_F(tls, sendmsg_large) msg.msg_iov = &vec; msg.msg_iovlen = 1; - EXPECT_EQ(sendmsg(self->cfd, &msg, 0), send_len); + EXPECT_EQ(sendmsg(self->fd, &msg, 0), send_len); } while (recvs++ < sends) { - EXPECT_NE(recv(self->fd, mem, send_len, 0), -1); + EXPECT_NE(recv(self->cfd, mem, send_len, 0), -1); } free(mem); @@ -417,9 +417,9 @@ TEST_F(tls, sendmsg_multiple) msg.msg_iov = vec; msg.msg_iovlen = iov_len; - EXPECT_EQ(sendmsg(self->cfd, &msg, 0), total_len); + EXPECT_EQ(sendmsg(self->fd, &msg, 0), total_len); buf = malloc(total_len); - EXPECT_NE(recv(self->fd, buf, total_len, 0), -1); + EXPECT_NE(recv(self->cfd, buf, total_len, 0), -1); for (i = 0; i < iov_len; i++) { EXPECT_EQ(memcmp(test_strs[i], buf + len_cmp, strlen(test_strs[i])), From a91861446f1c98b302ebad7c8681cfb5b39e9a9a Mon Sep 17 00:00:00 2001 From: Shengjiu Wang Date: Fri, 15 Sep 2023 14:02:11 +0800 Subject: [PATCH 038/228] ASoC: imx-audmix: Fix return error with devm_clk_get() [ Upstream commit b19a5733de255cabba5feecabf6e900638b582d1 ] The devm_clk_get() can return -EPROBE_DEFER error, modify the error code to be -EINVAL is not correct, which cause the -EPROBE_DEFER error is not correctly handled. This patch is to fix the return error code. 
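As an aside (not part of this backport), newer kernels usually express this pattern with dev_err_probe(), which preserves the original error code, including -EPROBE_DEFER, and records the deferral reason instead of spamming the log; a hedged sketch assuming the same "mclk1" clock lookup as in this probe path:

    priv->cpu_mclk = devm_clk_get(&cpu_pdev->dev, "mclk1");
    if (IS_ERR(priv->cpu_mclk))
            return dev_err_probe(&cpu_pdev->dev, PTR_ERR(priv->cpu_mclk),
                                 "failed to get DAI mclk1\n");

The actual fix below keeps the existing dev_err() and simply returns ret instead of the hard-coded -EINVAL.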
Fixes: b86ef5367761 ("ASoC: fsl: Add Audio Mixer machine driver") Signed-off-by: Shengjiu Wang Reviewed-by: Daniel Baluta Link: https://lore.kernel.org/r/1694757731-18308-1-git-send-email-shengjiu.wang@nxp.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- sound/soc/fsl/imx-audmix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/fsl/imx-audmix.c b/sound/soc/fsl/imx-audmix.c index 77d8234c7ac4..bb2aab1d2389 100644 --- a/sound/soc/fsl/imx-audmix.c +++ b/sound/soc/fsl/imx-audmix.c @@ -322,7 +322,7 @@ static int imx_audmix_probe(struct platform_device *pdev) if (IS_ERR(priv->cpu_mclk)) { ret = PTR_ERR(priv->cpu_mclk); dev_err(&cpu_pdev->dev, "failed to get DAI mclk1: %d\n", ret); - return -EINVAL; + return ret; } priv->audmix_pdev = audmix_pdev; From 47907ebeb77a96edfdb5ea1baa83cd334b5e0522 Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Thu, 7 Sep 2023 17:44:57 +0200 Subject: [PATCH 039/228] i40e: Fix VF VLAN offloading when port VLAN is configured [ Upstream commit d0d362ffa33da4acdcf7aee2116ceef8c8fef658 ] If port VLAN is configured on a VF then any other VLANs on top of this VF are broken. During i40e_ndo_set_vf_port_vlan() call the i40e driver reset the VF and iavf driver asks PF (using VIRTCHNL_OP_GET_VF_RESOURCES) for VF capabilities but this reset occurs too early, prior setting of vf->info.pvid field and because this field can be zero during i40e_vc_get_vf_resources_msg() then VIRTCHNL_VF_OFFLOAD_VLAN capability is reported to iavf driver. This is wrong because iavf driver should not report VLAN offloading capability when port VLAN is configured as i40e does not support QinQ offloading. Fix the issue by moving VF reset after setting of vf->port_vlan_id field. Without this patch: $ echo 1 > /sys/class/net/enp2s0f0/device/sriov_numvfs $ ip link set enp2s0f0 vf 0 vlan 3 $ ip link set enp2s0f0v0 up $ ip link add link enp2s0f0v0 name vlan4 type vlan id 4 $ ip link set vlan4 up ... $ ethtool -k enp2s0f0v0 | grep vlan-offload rx-vlan-offload: on tx-vlan-offload: on $ dmesg -l err | grep iavf [1292500.742914] iavf 0000:02:02.0: Failed to add VLAN filter, error IAVF_ERR_INVALID_QP_ID With this patch: $ echo 1 > /sys/class/net/enp2s0f0/device/sriov_numvfs $ ip link set enp2s0f0 vf 0 vlan 3 $ ip link set enp2s0f0v0 up $ ip link add link enp2s0f0v0 name vlan4 type vlan id 4 $ ip link set vlan4 up ... $ ethtool -k enp2s0f0v0 | grep vlan-offload rx-vlan-offload: off [requested on] tx-vlan-offload: off [requested on] $ dmesg -l err | grep iavf Fixes: f9b4b6278d51 ("i40e: Reset the VF upon conflicting VLAN configuration") Signed-off-by: Ivan Vecera Reviewed-by: Jesse Brandeburg Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin --- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index bb2a79b70c3a..dfaa34f2473a 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -4332,9 +4332,6 @@ int i40e_ndo_set_vf_port_vlan(struct net_device *netdev, int vf_id, /* duplicate request, so just return success */ goto error_pvid; - i40e_vc_reset_vf(vf, true); - /* During reset the VF got a new VSI, so refresh a pointer. 
*/ - vsi = pf->vsi[vf->lan_vsi_idx]; /* Locked once because multiple functions below iterate list */ spin_lock_bh(&vsi->mac_filter_hash_lock); @@ -4420,6 +4417,10 @@ int i40e_ndo_set_vf_port_vlan(struct net_device *netdev, int vf_id, */ vf->port_vlan_id = le16_to_cpu(vsi->info.pvid); + i40e_vc_reset_vf(vf, true); + /* During reset the VF got a new VSI, so refresh a pointer. */ + vsi = pf->vsi[vf->lan_vsi_idx]; + ret = i40e_config_vf_promiscuous_mode(vf, vsi->id, allmulti, alluni); if (ret) { dev_err(&pf->pdev->dev, "Unable to config vf promiscuous mode\n"); From 8689c9ace976d6c078e6dc844b09598796e84099 Mon Sep 17 00:00:00 2001 From: Kyle Zeng Date: Thu, 14 Sep 2023 22:12:57 -0700 Subject: [PATCH 040/228] ipv4: fix null-deref in ipv4_link_failure [ Upstream commit 0113d9c9d1ccc07f5a3710dac4aa24b6d711278c ] Currently, we assume the skb is associated with a device before calling __ip_options_compile, which is not always the case if it is re-routed by ipvs. When skb->dev is NULL, dev_net(skb->dev) will become null-dereference. This patch adds a check for the edge case and switch to use the net_device from the rtable when skb->dev is NULL. Fixes: ed0de45a1008 ("ipv4: recompile ip options in ipv4_link_failure") Suggested-by: David Ahern Signed-off-by: Kyle Zeng Cc: Stephen Suryaputra Cc: Vadim Fedorenko Reviewed-by: David Ahern Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/ipv4/route.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 3ddeb4fc0d08..445b1a2966d7 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1240,6 +1240,7 @@ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) static void ipv4_send_dest_unreach(struct sk_buff *skb) { + struct net_device *dev; struct ip_options opt; int res; @@ -1257,7 +1258,8 @@ static void ipv4_send_dest_unreach(struct sk_buff *skb) opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr); rcu_read_lock(); - res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL); + dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev; + res = __ip_options_compile(dev_net(dev), &opt, skb, NULL); rcu_read_unlock(); if (res) From 09a1c790e1b9d2b5a58b668f29760c1e5c5ea1e9 Mon Sep 17 00:00:00 2001 From: Kajol Jain Date: Fri, 25 Aug 2023 11:26:01 +0530 Subject: [PATCH 041/228] powerpc/perf/hv-24x7: Update domain value check [ Upstream commit 4ff3ba4db5943cac1045e3e4a3c0463ea10f6930 ] Valid domain value is in range 1 to HV_PERF_DOMAIN_MAX. Current code has check for domain value greater than or equal to HV_PERF_DOMAIN_MAX. But the check for domain value 0 is missing. Fix this issue by adding check for domain value 0. Before: # ./perf stat -v -e hv_24x7/CPM_ADJUNCT_INST,domain=0,core=1/ sleep 1 Using CPUID 00800200 Control descriptor is not initialized Error: The sys_perf_event_open() syscall returned with 5 (Input/output error) for event (hv_24x7/CPM_ADJUNCT_INST,domain=0,core=1/). /bin/dmesg | grep -i perf may provide additional information. Result from dmesg: [ 37.819387] hv-24x7: hcall failed: [0 0x60040000 0x100 0] => ret 0xfffffffffffffffc (-4) detail=0x2000000 failing ix=0 After: # ./perf stat -v -e hv_24x7/CPM_ADJUNCT_INST,domain=0,core=1/ sleep 1 Using CPUID 00800200 Control descriptor is not initialized Warning: hv_24x7/CPM_ADJUNCT_INST,domain=0,core=1/ event is not supported by the kernel. 
failed to read counter hv_24x7/CPM_ADJUNCT_INST,domain=0,core=1/ Fixes: ebd4a5a3ebd9 ("powerpc/perf/hv-24x7: Minor improvements") Reported-by: Krishan Gopal Sarawast Signed-off-by: Kajol Jain Tested-by: Disha Goel Signed-off-by: Michael Ellerman Link: https://msgid.link/20230825055601.360083-1-kjain@linux.ibm.com Signed-off-by: Sasha Levin --- arch/powerpc/perf/hv-24x7.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c index 1cd2351d241e..61a08747b164 100644 --- a/arch/powerpc/perf/hv-24x7.c +++ b/arch/powerpc/perf/hv-24x7.c @@ -1410,7 +1410,7 @@ static int h_24x7_event_init(struct perf_event *event) } domain = event_get_domain(event); - if (domain >= HV_PERF_DOMAIN_MAX) { + if (domain == 0 || domain >= HV_PERF_DOMAIN_MAX) { pr_devel("invalid domain %d\n", domain); return -EINVAL; } From 60d73c62e3e4464f375758b6f2459c13d46465b6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 15 Sep 2023 19:00:35 +0000 Subject: [PATCH 042/228] dccp: fix dccp_v4_err()/dccp_v6_err() again [ Upstream commit 6af289746a636f71f4c0535a9801774118486c7a ] dh->dccph_x is the 9th byte (offset 8) in "struct dccp_hdr", not in the "byte 7" as Jann claimed. We need to make sure the ICMP messages are big enough, using more standard ways (no more assumptions). syzbot reported: BUG: KMSAN: uninit-value in pskb_may_pull_reason include/linux/skbuff.h:2667 [inline] BUG: KMSAN: uninit-value in pskb_may_pull include/linux/skbuff.h:2681 [inline] BUG: KMSAN: uninit-value in dccp_v6_err+0x426/0x1aa0 net/dccp/ipv6.c:94 pskb_may_pull_reason include/linux/skbuff.h:2667 [inline] pskb_may_pull include/linux/skbuff.h:2681 [inline] dccp_v6_err+0x426/0x1aa0 net/dccp/ipv6.c:94 icmpv6_notify+0x4c7/0x880 net/ipv6/icmp.c:867 icmpv6_rcv+0x19d5/0x30d0 ip6_protocol_deliver_rcu+0xda6/0x2a60 net/ipv6/ip6_input.c:438 ip6_input_finish net/ipv6/ip6_input.c:483 [inline] NF_HOOK include/linux/netfilter.h:304 [inline] ip6_input+0x15d/0x430 net/ipv6/ip6_input.c:492 ip6_mc_input+0xa7e/0xc80 net/ipv6/ip6_input.c:586 dst_input include/net/dst.h:468 [inline] ip6_rcv_finish+0x5db/0x870 net/ipv6/ip6_input.c:79 NF_HOOK include/linux/netfilter.h:304 [inline] ipv6_rcv+0xda/0x390 net/ipv6/ip6_input.c:310 __netif_receive_skb_one_core net/core/dev.c:5523 [inline] __netif_receive_skb+0x1a6/0x5a0 net/core/dev.c:5637 netif_receive_skb_internal net/core/dev.c:5723 [inline] netif_receive_skb+0x58/0x660 net/core/dev.c:5782 tun_rx_batched+0x83b/0x920 tun_get_user+0x564c/0x6940 drivers/net/tun.c:2002 tun_chr_write_iter+0x3af/0x5d0 drivers/net/tun.c:2048 call_write_iter include/linux/fs.h:1985 [inline] new_sync_write fs/read_write.c:491 [inline] vfs_write+0x8ef/0x15c0 fs/read_write.c:584 ksys_write+0x20f/0x4c0 fs/read_write.c:637 __do_sys_write fs/read_write.c:649 [inline] __se_sys_write fs/read_write.c:646 [inline] __x64_sys_write+0x93/0xd0 fs/read_write.c:646 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd Uninit was created at: slab_post_alloc_hook+0x12f/0xb70 mm/slab.h:767 slab_alloc_node mm/slub.c:3478 [inline] kmem_cache_alloc_node+0x577/0xa80 mm/slub.c:3523 kmalloc_reserve+0x13d/0x4a0 net/core/skbuff.c:559 __alloc_skb+0x318/0x740 net/core/skbuff.c:650 alloc_skb include/linux/skbuff.h:1286 [inline] alloc_skb_with_frags+0xc8/0xbd0 net/core/skbuff.c:6313 sock_alloc_send_pskb+0xa80/0xbf0 net/core/sock.c:2795 tun_alloc_skb drivers/net/tun.c:1531 [inline] tun_get_user+0x23cf/0x6940 
drivers/net/tun.c:1846 tun_chr_write_iter+0x3af/0x5d0 drivers/net/tun.c:2048 call_write_iter include/linux/fs.h:1985 [inline] new_sync_write fs/read_write.c:491 [inline] vfs_write+0x8ef/0x15c0 fs/read_write.c:584 ksys_write+0x20f/0x4c0 fs/read_write.c:637 __do_sys_write fs/read_write.c:649 [inline] __se_sys_write fs/read_write.c:646 [inline] __x64_sys_write+0x93/0xd0 fs/read_write.c:646 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd CPU: 0 PID: 4995 Comm: syz-executor153 Not tainted 6.6.0-rc1-syzkaller-00014-ga747acc0b752 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 08/04/2023 Fixes: 977ad86c2a1b ("dccp: Fix out of bounds access in DCCP error handler") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Jann Horn Reviewed-by: Jann Horn Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/dccp/ipv4.c | 9 ++------- net/dccp/ipv6.c | 9 ++------- 2 files changed, 4 insertions(+), 14 deletions(-) diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 398dc3e47d0c..f2a0a4e6dd74 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -243,13 +243,8 @@ static int dccp_v4_err(struct sk_buff *skb, u32 info) int err; struct net *net = dev_net(skb->dev); - /* For the first __dccp_basic_hdr_len() check, we only need dh->dccph_x, - * which is in byte 7 of the dccp header. - * Our caller (icmp_socket_deliver()) already pulled 8 bytes for us. - * - * Later on, we want to access the sequence number fields, which are - * beyond 8 bytes, so we have to pskb_may_pull() ourselves. - */ + if (!pskb_may_pull(skb, offset + sizeof(*dh))) + return -EINVAL; dh = (struct dccp_hdr *)(skb->data + offset); if (!pskb_may_pull(skb, offset + __dccp_basic_hdr_len(dh))) return -EINVAL; diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index bfe11e96af7c..6d6bbd43a141 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -76,13 +76,8 @@ static int dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, __u64 seq; struct net *net = dev_net(skb->dev); - /* For the first __dccp_basic_hdr_len() check, we only need dh->dccph_x, - * which is in byte 7 of the dccp header. - * Our caller (icmpv6_notify()) already pulled 8 bytes for us. - * - * Later on, we want to access the sequence number fields, which are - * beyond 8 bytes, so we have to pskb_may_pull() ourselves. - */ + if (!pskb_may_pull(skb, offset + sizeof(*dh))) + return -EINVAL; dh = (struct dccp_hdr *)(skb->data + offset); if (!pskb_may_pull(skb, offset + __dccp_basic_hdr_len(dh))) return -EINVAL; From c463898b6e72e5327703dc11e223b6385fdb5e10 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Wed, 13 Sep 2023 14:27:19 -0700 Subject: [PATCH 043/228] platform/x86: intel_scu_ipc: Check status after timeout in busy_loop() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit e0b4ab3bb92bda8d12f55842614362989d5b2cb3 ] It's possible for the polling loop in busy_loop() to get scheduled away for a long time. status = ipc_read_status(scu); // status = IPC_STATUS_BUSY if (!(status & IPC_STATUS_BUSY)) If this happens, then the status bit could change while the task is scheduled away and this function would never read the status again after timing out. Instead, the function will return -ETIMEDOUT when it's possible that scheduling didn't work out and the status bit was cleared. Bit polling code should always check the bit being polled one more time after the timeout in case this happens. 
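The general shape of such a loop, hand-rolled for illustration (a simplified sketch, not the exact driver code):

    end = jiffies + IPC_TIMEOUT;
    for (;;) {
            status = ipc_read_status(scu);
            if (!(status & IPC_STATUS_BUSY))
                    break;                          /* done, evaluate the error bit */
            if (time_after(jiffies, end)) {
                    /* We may have been scheduled away past the deadline while
                     * the busy bit was already cleared, so look one last time.
                     */
                    status = ipc_read_status(scu);
                    if (status & IPC_STATUS_BUSY)
                            return -ETIMEDOUT;
                    break;
            }
            usleep_range(50, 100);
    }
    return (status & IPC_STATUS_ERR) ? -EIO : 0;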
Fix this by reading the status once more after the while loop breaks. The readl_poll_timeout() macro implements all of this, and it is shorter, so use that macro here to consolidate code and fix this. There were some concerns with using readl_poll_timeout() because it uses timekeeping, and timekeeping isn't running early on or during the late stages of system suspend or early stages of system resume, but an audit of the code concluded that this code isn't called during those times so it is safe to use the macro. Cc: Prashant Malani Reviewed-by: Andy Shevchenko Reviewed-by: Mika Westerberg Reviewed-by: Kuppuswamy Sathyanarayanan Fixes: e7b7ab3847c9 ("platform/x86: intel_scu_ipc: Sleeping is fine when polling") Signed-off-by: Stephen Boyd Link: https://lore.kernel.org/r/20230913212723.3055315-2-swboyd@chromium.org Reviewed-by: Ilpo Järvinen Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede Signed-off-by: Sasha Levin --- drivers/platform/x86/intel_scu_ipc.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/drivers/platform/x86/intel_scu_ipc.c b/drivers/platform/x86/intel_scu_ipc.c index bdeb888c0fea..0b5029bca4a4 100644 --- a/drivers/platform/x86/intel_scu_ipc.c +++ b/drivers/platform/x86/intel_scu_ipc.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -232,19 +233,15 @@ static inline u32 ipc_data_readl(struct intel_scu_ipc_dev *scu, u32 offset) /* Wait till scu status is busy */ static inline int busy_loop(struct intel_scu_ipc_dev *scu) { - unsigned long end = jiffies + IPC_TIMEOUT; + u8 status; + int err; - do { - u32 status; + err = readx_poll_timeout(ipc_read_status, scu, status, !(status & IPC_STATUS_BUSY), + 100, jiffies_to_usecs(IPC_TIMEOUT)); + if (err) + return err; - status = ipc_read_status(scu); - if (!(status & IPC_STATUS_BUSY)) - return (status & IPC_STATUS_ERR) ? -EIO : 0; - - usleep_range(50, 100); - } while (time_before(jiffies, end)); - - return -ETIMEDOUT; + return (status & IPC_STATUS_ERR) ? -EIO : 0; } /* Wait till ipc ioc interrupt is received or timeout in 10 HZ */ From 441b61d742effd40e186e6c5dcd9b5b01caae514 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Wed, 13 Sep 2023 14:27:20 -0700 Subject: [PATCH 044/228] platform/x86: intel_scu_ipc: Check status upon timeout in ipc_wait_for_interrupt() [ Upstream commit 427fada620733e6474d783ae6037a66eae42bf8c ] It's possible for the completion in ipc_wait_for_interrupt() to timeout, simply because the interrupt was delayed in being processed. A timeout in itself is not an error. This driver should check the status register upon a timeout to ensure that scheduling or interrupt processing delays don't affect the outcome of the IPC return value. CPU0 SCU ---- --- ipc_wait_for_interrupt() wait_for_completion_timeout(&scu->cmd_complete) [TIMEOUT] status[IPC_STATUS_BUSY]=0 Fix this problem by reading the status bit in all cases, regardless of the timeout. If the completion times out, we'll assume the problem was that the IPC_STATUS_BUSY bit was still set, but if the status bit is cleared in the meantime we know that we hit some scheduling delay and we should just check the error bit. 
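In other words, the completion timeout becomes advisory and the hardware status register is authoritative; the resulting logic, condensed from the hunk below:

    wait_for_completion_timeout(&scu->cmd_complete, IPC_TIMEOUT);

    status = ipc_read_status(scu);
    if (status & IPC_STATUS_BUSY)
            return -ETIMEDOUT;      /* really still busy: a genuine timeout */
    if (status & IPC_STATUS_ERR)
            return -EIO;            /* command finished but reported an error */
    return 0;                       /* finished fine, the interrupt was merely late */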
Cc: Prashant Malani Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Andy Shevchenko Reviewed-by: Mika Westerberg Fixes: ed12f295bfd5 ("ipc: Added support for IPC interrupt mode") Signed-off-by: Stephen Boyd Link: https://lore.kernel.org/r/20230913212723.3055315-3-swboyd@chromium.org Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede Signed-off-by: Sasha Levin --- drivers/platform/x86/intel_scu_ipc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/platform/x86/intel_scu_ipc.c b/drivers/platform/x86/intel_scu_ipc.c index 0b5029bca4a4..4c053c715cde 100644 --- a/drivers/platform/x86/intel_scu_ipc.c +++ b/drivers/platform/x86/intel_scu_ipc.c @@ -249,10 +249,12 @@ static inline int ipc_wait_for_interrupt(struct intel_scu_ipc_dev *scu) { int status; - if (!wait_for_completion_timeout(&scu->cmd_complete, IPC_TIMEOUT)) - return -ETIMEDOUT; + wait_for_completion_timeout(&scu->cmd_complete, IPC_TIMEOUT); status = ipc_read_status(scu); + if (status & IPC_STATUS_BUSY) + return -ETIMEDOUT; + if (status & IPC_STATUS_ERR) return -EIO; From 4c5eaf6d8bb4ea30f736946bb42b6f3de25c43ff Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Wed, 13 Sep 2023 14:27:21 -0700 Subject: [PATCH 045/228] platform/x86: intel_scu_ipc: Don't override scu in intel_scu_ipc_dev_simple_command() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit efce78584e583226e9a1f6cb2fb555d6ff47c3e7 ] Andy discovered this bug during patch review. The 'scu' argument to this function shouldn't be overridden by the function itself. It doesn't make any sense. Looking at the commit history, we see that commit f57fa18583f5 ("platform/x86: intel_scu_ipc: Introduce new SCU IPC API") removed the setting of the scu to ipcdev in other functions, but not this one. That was an oversight. Remove this line so that we stop overriding the scu instance that is used by this function. Reported-by: Andy Shevchenko Closes: https://lore.kernel.org/r/ZPjdZ3xNmBEBvNiS@smile.fi.intel.com Cc: Prashant Malani Reviewed-by: Andy Shevchenko Reviewed-by: Mika Westerberg Fixes: f57fa18583f5 ("platform/x86: intel_scu_ipc: Introduce new SCU IPC API") Signed-off-by: Stephen Boyd Link: https://lore.kernel.org/r/20230913212723.3055315-4-swboyd@chromium.org Reviewed-by: Ilpo Järvinen Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede Signed-off-by: Sasha Levin --- drivers/platform/x86/intel_scu_ipc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/platform/x86/intel_scu_ipc.c b/drivers/platform/x86/intel_scu_ipc.c index 4c053c715cde..60e7f95bc555 100644 --- a/drivers/platform/x86/intel_scu_ipc.c +++ b/drivers/platform/x86/intel_scu_ipc.c @@ -444,7 +444,6 @@ int intel_scu_ipc_dev_simple_command(struct intel_scu_ipc_dev *scu, int cmd, mutex_unlock(&ipclock); return -ENODEV; } - scu = ipcdev; cmdval = sub << 12 | cmd; ipc_command(scu, cmdval); err = intel_scu_ipc_check_status(scu); From 1ec40ef6f7658e852f794e33b9c475f2557d4148 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Wed, 13 Sep 2023 14:27:22 -0700 Subject: [PATCH 046/228] platform/x86: intel_scu_ipc: Fail IPC send if still busy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 85e654c9f722853a595fa941dca60c157b707b86 ] It's possible for interrupts to get significantly delayed to the point that callers of intel_scu_ipc_dev_command() and friends can call the function once, hit a timeout, and call it again while the interrupt still hasn't been processed. 
This driver will get seriously confused if the interrupt is finally processed after the second IPC has been sent with ipc_command(). It won't know which IPC has been completed. This could be quite disastrous if calling code assumes something has happened upon return from intel_scu_ipc_dev_simple_command() when it actually hasn't. Let's avoid this scenario by simply returning -EBUSY in this case. Hopefully higher layers will know to back off or fail gracefully when this happens. It's all highly unlikely anyway, but it's better to be correct here as we have no way to know which IPC the status register is telling us about if we send a second IPC while the previous IPC is still processing. Cc: Prashant Malani Cc: Kuppuswamy Sathyanarayanan Reviewed-by: Andy Shevchenko Reviewed-by: Mika Westerberg Fixes: ed12f295bfd5 ("ipc: Added support for IPC interrupt mode") Signed-off-by: Stephen Boyd Link: https://lore.kernel.org/r/20230913212723.3055315-5-swboyd@chromium.org Reviewed-by: Ilpo Järvinen Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede Signed-off-by: Sasha Levin --- drivers/platform/x86/intel_scu_ipc.c | 40 +++++++++++++++++++--------- 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/drivers/platform/x86/intel_scu_ipc.c b/drivers/platform/x86/intel_scu_ipc.c index 60e7f95bc555..84ed82869463 100644 --- a/drivers/platform/x86/intel_scu_ipc.c +++ b/drivers/platform/x86/intel_scu_ipc.c @@ -266,6 +266,24 @@ static int intel_scu_ipc_check_status(struct intel_scu_ipc_dev *scu) return scu->irq > 0 ? ipc_wait_for_interrupt(scu) : busy_loop(scu); } +static struct intel_scu_ipc_dev *intel_scu_ipc_get(struct intel_scu_ipc_dev *scu) +{ + u8 status; + + if (!scu) + scu = ipcdev; + if (!scu) + return ERR_PTR(-ENODEV); + + status = ipc_read_status(scu); + if (status & IPC_STATUS_BUSY) { + dev_dbg(&scu->dev, "device is busy\n"); + return ERR_PTR(-EBUSY); + } + + return scu; +} + /* Read/Write power control(PMIC in Langwell, MSIC in PenWell) registers */ static int pwr_reg_rdwr(struct intel_scu_ipc_dev *scu, u16 *addr, u8 *data, u32 count, u32 op, u32 id) @@ -279,11 +297,10 @@ static int pwr_reg_rdwr(struct intel_scu_ipc_dev *scu, u16 *addr, u8 *data, memset(cbuf, 0, sizeof(cbuf)); mutex_lock(&ipclock); - if (!scu) - scu = ipcdev; - if (!scu) { + scu = intel_scu_ipc_get(scu); + if (IS_ERR(scu)) { mutex_unlock(&ipclock); - return -ENODEV; + return PTR_ERR(scu); } for (nc = 0; nc < count; nc++, offset += 2) { @@ -438,12 +455,12 @@ int intel_scu_ipc_dev_simple_command(struct intel_scu_ipc_dev *scu, int cmd, int err; mutex_lock(&ipclock); - if (!scu) - scu = ipcdev; - if (!scu) { + scu = intel_scu_ipc_get(scu); + if (IS_ERR(scu)) { mutex_unlock(&ipclock); - return -ENODEV; + return PTR_ERR(scu); } + cmdval = sub << 12 | cmd; ipc_command(scu, cmdval); err = intel_scu_ipc_check_status(scu); @@ -483,11 +500,10 @@ int intel_scu_ipc_dev_command_with_size(struct intel_scu_ipc_dev *scu, int cmd, return -EINVAL; mutex_lock(&ipclock); - if (!scu) - scu = ipcdev; - if (!scu) { + scu = intel_scu_ipc_get(scu); + if (IS_ERR(scu)) { mutex_unlock(&ipclock); - return -ENODEV; + return PTR_ERR(scu); } memcpy(inbuf, in, inlen); From 423ba1b3a5a768935b12b3f9dde6937fe7949039 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 4 Sep 2023 22:04:45 -0700 Subject: [PATCH 047/228] x86/srso: Fix srso_show_state() side effect [ Upstream commit a8cf700c17d9ca6cb8ee7dc5c9330dbac3948237 ] Reading the 'spec_rstack_overflow' sysfs file can trigger an unnecessary MSR write, and possibly even a (handled) exception if 
the microcode hasn't been updated. Avoid all that by just checking X86_FEATURE_IBPB_BRTYPE instead, which gets set by srso_select_mitigation() if the updated microcode exists. Fixes: fb3bd914b3ec ("x86/srso: Add a Speculative RAS Overflow mitigation") Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Nikolay Borisov Acked-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/27d128899cb8aee9eb2b57ddc996742b0c1d776b.1693889988.git.jpoimboe@kernel.org Signed-off-by: Sasha Levin --- arch/x86/kernel/cpu/bugs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 4d11a50089b2..4719089029f0 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -2622,7 +2622,7 @@ static ssize_t srso_show_state(char *buf) return sysfs_emit(buf, "%s%s\n", srso_strings[srso_mitigation], - (cpu_has_ibpb_brtype_microcode() ? "" : ", no microcode")); + boot_cpu_has(X86_FEATURE_IBPB_BRTYPE) ? "" : ", no microcode"); } static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, From ae806c74c0634b0c23855066d8ba28d850fd1260 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 4 Sep 2023 22:04:48 -0700 Subject: [PATCH 048/228] x86/srso: Fix SBPB enablement for spec_rstack_overflow=off [ Upstream commit 01b057b2f4cc2d905a0bd92195657dbd9a7005ab ] If the user has requested no SRSO mitigation, other mitigations can use the lighter-weight SBPB instead of IBPB. Fixes: fb3bd914b3ec ("x86/srso: Add a Speculative RAS Overflow mitigation") Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Signed-off-by: Borislav Petkov (AMD) Acked-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/b20820c3cfd1003171135ec8d762a0b957348497.1693889988.git.jpoimboe@kernel.org Signed-off-by: Sasha Levin --- arch/x86/kernel/cpu/bugs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 4719089029f0..ec3ddb9a456b 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -2344,7 +2344,7 @@ static void __init srso_select_mitigation(void) switch (srso_cmd) { case SRSO_CMD_OFF: - return; + goto pred_cmd; case SRSO_CMD_MICROCODE: if (has_microcode) { From 309af4a39b8e1e38e73929297bde9f03973af2c8 Mon Sep 17 00:00:00 2001 From: Jian Shen Date: Mon, 18 Sep 2023 15:48:38 +0800 Subject: [PATCH 049/228] net: hns3: only enable unicast promisc when mac table full [ Upstream commit f2ed304922a55690529bcca59678dd92d7466ce8 ] Currently, the driver will enable unicast promisc for the function once configure mac address fail. It's unreasonable when the failure is caused by using same mac address with other functions. So only enable unicast promisc when mac table full. 
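The change itself is a one-line condition (annotated sketch of the hunk below):

    if (mac_type == HCLGE_MAC_ADDR_UC) {
            if (is_all_added)
                    vport->overflow_promisc_flags &= ~HNAE3_OVERFLOW_UPE;
            else if (hclge_is_umv_space_full(vport, true))
                    /* fall back to unicast promisc only when the MAC table is
                     * genuinely full, not e.g. on a duplicate-address failure
                     */
                    vport->overflow_promisc_flags |= HNAE3_OVERFLOW_UPE;
    }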
Fixes: c631c696823c ("net: hns3: refactor the promisc mode setting") Signed-off-by: Jian Shen Signed-off-by: Jijie Shao Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 47f8f66cf7ec..49eeeb0c9a1f 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -7850,7 +7850,7 @@ static void hclge_update_overflow_flags(struct hclge_vport *vport, if (mac_type == HCLGE_MAC_ADDR_UC) { if (is_all_added) vport->overflow_promisc_flags &= ~HNAE3_OVERFLOW_UPE; - else + else if (hclge_is_umv_space_full(vport, true)) vport->overflow_promisc_flags |= HNAE3_OVERFLOW_UPE; } else { if (is_all_added) From 1671dc1b25e5a338fcada0f8e692008b6fe6bcf6 Mon Sep 17 00:00:00 2001 From: Jie Wang Date: Mon, 18 Sep 2023 15:48:40 +0800 Subject: [PATCH 050/228] net: hns3: add 5ms delay before clear firmware reset irq source [ Upstream commit 0770063096d5da4a8e467b6e73c1646a75589628 ] Currently the reset process in hns3 and firmware watchdog init process is asynchronous. we think firmware watchdog initialization is completed before hns3 clear the firmware interrupt source. However, firmware initialization may not complete early. so we add delay before hns3 clear firmware interrupt source and 5 ms delay is enough to avoid second firmware reset interrupt. Fixes: c1a81619d73a ("net: hns3: Add mailbox interrupt handling to PF driver") Signed-off-by: Jie Wang Signed-off-by: Jijie Shao Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 49eeeb0c9a1f..deba485ced1b 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -3125,8 +3125,13 @@ static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval) static void hclge_clear_event_cause(struct hclge_dev *hdev, u32 event_type, u32 regclr) { +#define HCLGE_IMP_RESET_DELAY 5 + switch (event_type) { case HCLGE_VECTOR0_EVENT_RST: + if (regclr == BIT(HCLGE_VECTOR0_IMPRESET_INT_B)) + mdelay(HCLGE_IMP_RESET_DELAY); + hclge_write_dev(&hdev->hw, HCLGE_MISC_RESET_STS_REG, regclr); break; case HCLGE_VECTOR0_EVENT_MBX: From 04cc361f029c14dd067ad180525c7392334c9bfd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 18 Sep 2023 09:13:51 +0000 Subject: [PATCH 051/228] net: bridge: use DEV_STATS_INC() [ Upstream commit 44bdb313da57322c9b3c108eb66981c6ec6509f4 ] syzbot/KCSAN reported data-races in br_handle_frame_finish() [1] This function can run from multiple cpus without mutual exclusion. Adopt SMP safe DEV_STATS_INC() to update dev->stats fields. Handles updates to dev->stats.tx_dropped while we are at it. 
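The conversion itself is mechanical; for illustration (DEV_STATS_INC() is the helper in <linux/netdevice.h> that bumps the field with an atomic increment, so concurrent callers cannot lose updates):

    /* before: plain read-modify-write, increments can be lost under concurrency */
    br->dev->stats.multicast++;

    /* after: */
    DEV_STATS_INC(br->dev, multicast);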
[1] BUG: KCSAN: data-race in br_handle_frame_finish / br_handle_frame_finish read-write to 0xffff8881374b2178 of 8 bytes by interrupt on cpu 1: br_handle_frame_finish+0xd4f/0xef0 net/bridge/br_input.c:189 br_nf_hook_thresh+0x1ed/0x220 br_nf_pre_routing_finish_ipv6+0x50f/0x540 NF_HOOK include/linux/netfilter.h:304 [inline] br_nf_pre_routing_ipv6+0x1e3/0x2a0 net/bridge/br_netfilter_ipv6.c:178 br_nf_pre_routing+0x526/0xba0 net/bridge/br_netfilter_hooks.c:508 nf_hook_entry_hookfn include/linux/netfilter.h:144 [inline] nf_hook_bridge_pre net/bridge/br_input.c:272 [inline] br_handle_frame+0x4c9/0x940 net/bridge/br_input.c:417 __netif_receive_skb_core+0xa8a/0x21e0 net/core/dev.c:5417 __netif_receive_skb_one_core net/core/dev.c:5521 [inline] __netif_receive_skb+0x57/0x1b0 net/core/dev.c:5637 process_backlog+0x21f/0x380 net/core/dev.c:5965 __napi_poll+0x60/0x3b0 net/core/dev.c:6527 napi_poll net/core/dev.c:6594 [inline] net_rx_action+0x32b/0x750 net/core/dev.c:6727 __do_softirq+0xc1/0x265 kernel/softirq.c:553 run_ksoftirqd+0x17/0x20 kernel/softirq.c:921 smpboot_thread_fn+0x30a/0x4a0 kernel/smpboot.c:164 kthread+0x1d7/0x210 kernel/kthread.c:388 ret_from_fork+0x48/0x60 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:304 read-write to 0xffff8881374b2178 of 8 bytes by interrupt on cpu 0: br_handle_frame_finish+0xd4f/0xef0 net/bridge/br_input.c:189 br_nf_hook_thresh+0x1ed/0x220 br_nf_pre_routing_finish_ipv6+0x50f/0x540 NF_HOOK include/linux/netfilter.h:304 [inline] br_nf_pre_routing_ipv6+0x1e3/0x2a0 net/bridge/br_netfilter_ipv6.c:178 br_nf_pre_routing+0x526/0xba0 net/bridge/br_netfilter_hooks.c:508 nf_hook_entry_hookfn include/linux/netfilter.h:144 [inline] nf_hook_bridge_pre net/bridge/br_input.c:272 [inline] br_handle_frame+0x4c9/0x940 net/bridge/br_input.c:417 __netif_receive_skb_core+0xa8a/0x21e0 net/core/dev.c:5417 __netif_receive_skb_one_core net/core/dev.c:5521 [inline] __netif_receive_skb+0x57/0x1b0 net/core/dev.c:5637 process_backlog+0x21f/0x380 net/core/dev.c:5965 __napi_poll+0x60/0x3b0 net/core/dev.c:6527 napi_poll net/core/dev.c:6594 [inline] net_rx_action+0x32b/0x750 net/core/dev.c:6727 __do_softirq+0xc1/0x265 kernel/softirq.c:553 do_softirq+0x5e/0x90 kernel/softirq.c:454 __local_bh_enable_ip+0x64/0x70 kernel/softirq.c:381 __raw_spin_unlock_bh include/linux/spinlock_api_smp.h:167 [inline] _raw_spin_unlock_bh+0x36/0x40 kernel/locking/spinlock.c:210 spin_unlock_bh include/linux/spinlock.h:396 [inline] batadv_tt_local_purge+0x1a8/0x1f0 net/batman-adv/translation-table.c:1356 batadv_tt_purge+0x2b/0x630 net/batman-adv/translation-table.c:3560 process_one_work kernel/workqueue.c:2630 [inline] process_scheduled_works+0x5b8/0xa30 kernel/workqueue.c:2703 worker_thread+0x525/0x730 kernel/workqueue.c:2784 kthread+0x1d7/0x210 kernel/kthread.c:388 ret_from_fork+0x48/0x60 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:304 value changed: 0x00000000000d7190 -> 0x00000000000d7191 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 14848 Comm: kworker/u4:11 Not tainted 6.6.0-rc1-syzkaller-00236-gad8a69f361b9 #0 Fixes: 1c29fc4989bc ("[BRIDGE]: keep track of received multicast packets") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Roopa Prabhu Cc: Nikolay Aleksandrov Cc: bridge@lists.linux-foundation.org Acked-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20230918091351.1356153-1-edumazet@google.com Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- net/bridge/br_forward.c | 4 ++-- net/bridge/br_input.c | 4 
++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 4610f3a13966..f2ef75c7ccc6 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -118,7 +118,7 @@ static int deliver_clone(const struct net_bridge_port *prev, skb = skb_clone(skb, GFP_ATOMIC); if (!skb) { - dev->stats.tx_dropped++; + DEV_STATS_INC(dev, tx_dropped); return -ENOMEM; } @@ -255,7 +255,7 @@ static void maybe_deliver_addr(struct net_bridge_port *p, struct sk_buff *skb, skb = skb_copy(skb, GFP_ATOMIC); if (!skb) { - dev->stats.tx_dropped++; + DEV_STATS_INC(dev, tx_dropped); return; } diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index bf5bf148091f..52dd0708fd14 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -145,12 +145,12 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb if ((mdst && mdst->host_joined) || br_multicast_is_router(br)) { local_rcv = true; - br->dev->stats.multicast++; + DEV_STATS_INC(br->dev, multicast); } mcast_hit = true; } else { local_rcv = true; - br->dev->stats.multicast++; + DEV_STATS_INC(br->dev, multicast); } break; case BR_PKT_UNICAST: From b44dd92e2afd89eb6e9d27616858e72a67bdc1a7 Mon Sep 17 00:00:00 2001 From: Ziyang Xuan Date: Mon, 18 Sep 2023 20:30:11 +0800 Subject: [PATCH 052/228] team: fix null-ptr-deref when team device type is changed [ Upstream commit 492032760127251e5540a5716a70996bacf2a3fd ] Get a null-ptr-deref bug as follows with reproducer [1]. BUG: kernel NULL pointer dereference, address: 0000000000000228 ... RIP: 0010:vlan_dev_hard_header+0x35/0x140 [8021q] ... Call Trace: ? __die+0x24/0x70 ? page_fault_oops+0x82/0x150 ? exc_page_fault+0x69/0x150 ? asm_exc_page_fault+0x26/0x30 ? vlan_dev_hard_header+0x35/0x140 [8021q] ? vlan_dev_hard_header+0x8e/0x140 [8021q] neigh_connected_output+0xb2/0x100 ip6_finish_output2+0x1cb/0x520 ? nf_hook_slow+0x43/0xc0 ? ip6_mtu+0x46/0x80 ip6_finish_output+0x2a/0xb0 mld_sendpack+0x18f/0x250 mld_ifc_work+0x39/0x160 process_one_work+0x1e6/0x3f0 worker_thread+0x4d/0x2f0 ? __pfx_worker_thread+0x10/0x10 kthread+0xe5/0x120 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x34/0x50 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1b/0x30 [1] $ teamd -t team0 -d -c '{"runner": {"name": "loadbalance"}}' $ ip link add name t-dummy type dummy $ ip link add link t-dummy name t-dummy.100 type vlan id 100 $ ip link add name t-nlmon type nlmon $ ip link set t-nlmon master team0 $ ip link set t-nlmon nomaster $ ip link set t-dummy up $ ip link set team0 up $ ip link set t-dummy.100 down $ ip link set t-dummy.100 master team0 When enslave a vlan device to team device and team device type is changed from non-ether to ether, header_ops of team device is changed to vlan_header_ops. That is incorrect and will trigger null-ptr-deref for vlan->real_dev in vlan_dev_hard_header() because team device is not a vlan device. Cache eth_header_ops in team_setup(), then assign cached header_ops to header_ops of team net device when its type is changed from non-ether to ether to fix the bug. 
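The essence of the fix, condensed from the hunks below: snapshot the Ethernet header_ops once at creation time and restore that snapshot, never the port's own ops, when the type flips back to ARPHRD_ETHER.

    /* team_setup(), right after ether_setup(dev): */
    team->header_ops_cache = dev->header_ops;

    /* team_setup_by_port(): */
    if (port_dev->type == ARPHRD_ETHER)
            dev->header_ops = team->header_ops_cache;   /* not e.g. vlan_header_ops */
    else
            dev->header_ops = port_dev->header_ops;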
Fixes: 1d76efe1577b ("team: add support for non-ethernet devices") Suggested-by: Hangbin Liu Reviewed-by: Hangbin Liu Signed-off-by: Ziyang Xuan Reviewed-by: Jiri Pirko Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20230918123011.1884401-1-william.xuanziyang@huawei.com Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- drivers/net/team/team.c | 10 +++++++++- include/linux/if_team.h | 2 ++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index 721b536ce886..97a77dabed64 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -2122,7 +2122,12 @@ static const struct ethtool_ops team_ethtool_ops = { static void team_setup_by_port(struct net_device *dev, struct net_device *port_dev) { - dev->header_ops = port_dev->header_ops; + struct team *team = netdev_priv(dev); + + if (port_dev->type == ARPHRD_ETHER) + dev->header_ops = team->header_ops_cache; + else + dev->header_ops = port_dev->header_ops; dev->type = port_dev->type; dev->hard_header_len = port_dev->hard_header_len; dev->needed_headroom = port_dev->needed_headroom; @@ -2169,8 +2174,11 @@ static int team_dev_type_check_change(struct net_device *dev, static void team_setup(struct net_device *dev) { + struct team *team = netdev_priv(dev); + ether_setup(dev); dev->max_mtu = ETH_MAX_MTU; + team->header_ops_cache = dev->header_ops; dev->netdev_ops = &team_netdev_ops; dev->ethtool_ops = &team_ethtool_ops; diff --git a/include/linux/if_team.h b/include/linux/if_team.h index 5dd1657947b7..762c77d13e7d 100644 --- a/include/linux/if_team.h +++ b/include/linux/if_team.h @@ -189,6 +189,8 @@ struct team { struct net_device *dev; /* associated netdevice */ struct team_pcpu_stats __percpu *pcpu_stats; + const struct header_ops *header_ops_cache; + struct mutex lock; /* used for overall locking, e.g. port lists write */ /* From f1893feb20ea033bcd9c449b55df3dab3802c907 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Tue, 19 Sep 2023 20:04:45 +0200 Subject: [PATCH 053/228] netfilter: ipset: Fix race between IPSET_CMD_CREATE and IPSET_CMD_SWAP [ Upstream commit 7433b6d2afd512d04398c73aa984d1e285be125b ] Kyle Zeng reported that there is a race between IPSET_CMD_ADD and IPSET_CMD_SWAP in netfilter/ip_set, which can lead to the invocation of `__ip_set_put` on a wrong `set`, triggering the `BUG_ON(set->ref == 0);` check in it. The race is caused by using the wrong reference counter, i.e. the ref counter instead of ref_netlink. 
Fixes: 24e227896bbf ("netfilter: ipset: Add schedule point in call_ad().") Reported-by: Kyle Zeng Closes: https://lore.kernel.org/netfilter-devel/ZPZqetxOmH+w%2Fmyc@westworld/#r Tested-by: Kyle Zeng Signed-off-by: Jozsef Kadlecsik Signed-off-by: Florian Westphal Signed-off-by: Sasha Levin --- net/netfilter/ipset/ip_set_core.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 55ac0cc12657..26613e3731d0 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -682,6 +682,14 @@ __ip_set_put(struct ip_set *set) /* set->ref can be swapped out by ip_set_swap, netlink events (like dump) need * a separate reference counter */ +static void +__ip_set_get_netlink(struct ip_set *set) +{ + write_lock_bh(&ip_set_ref_lock); + set->ref_netlink++; + write_unlock_bh(&ip_set_ref_lock); +} + static void __ip_set_put_netlink(struct ip_set *set) { @@ -1705,11 +1713,11 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, do { if (retried) { - __ip_set_get(set); + __ip_set_get_netlink(set); nfnl_unlock(NFNL_SUBSYS_IPSET); cond_resched(); nfnl_lock(NFNL_SUBSYS_IPSET); - __ip_set_put(set); + __ip_set_put_netlink(set); } ip_set_lock(set); From a8460ee6c80b6bae053797842fbbdaa718166095 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 26 Oct 2020 17:50:38 +0100 Subject: [PATCH 054/228] seqlock: avoid -Wshadow warnings [ Upstream commit a07c45312f06e288417049208c344ad76074627d ] When building with W=2, there is a flood of warnings about the seqlock macros shadowing local variables: 19806 linux/seqlock.h:331:11: warning: declaration of 'seq' shadows a previous local [-Wshadow] 48 linux/seqlock.h:348:11: warning: declaration of 'seq' shadows a previous local [-Wshadow] 8 linux/seqlock.h:379:11: warning: declaration of 'seq' shadows a previous local [-Wshadow] Prefix the local variables to make the warning useful elsewhere again. 
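A self-contained userspace illustration of the problem (GNU C statement expressions; the program, variable names and values are invented for the demo):

    /* build with: gcc -std=gnu11 -Wall -Wshadow shadow-demo.c */
    #include <stdio.h>

    static unsigned counter = 41;

    /* Like the seqlock read macros, this statement expression declares a
     * local named "seq", so every caller that also has a "seq" of its own
     * gets a -Wshadow warning at the use site.
     */
    #define read_counter()              \
    ({                                  \
            unsigned seq;               \
            seq = counter + 1;          \
            seq;                        \
    })

    int main(void)
    {
            unsigned seq = 7;                   /* the caller's own "seq" */
            unsigned snap = read_counter();     /* warning: declaration of 'seq' shadows a previous local */

            printf("%u %u\n", seq, snap);
            return 0;
    }

Renaming the macro-local to __seq, as this patch does, silences the noise without changing behaviour and keeps -Wshadow useful for real bugs.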
Fixes: 52ac39e5db51 ("seqlock: seqcount_t: Implement all read APIs as statement expressions") Signed-off-by: Arnd Bergmann Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201026165044.3722931-1-arnd@kernel.org Stable-dep-of: 41b43b6c6e30 ("locking/seqlock: Do the lockdep annotation before locking in do_write_seqcount_begin_nested()") Signed-off-by: Sasha Levin --- include/linux/seqlock.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 1ac20d75b061..fb89b05066f4 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -328,13 +328,13 @@ SEQCOUNT_LOCKNAME(ww_mutex, struct ww_mutex, true, &s->lock->base, ww_mu */ #define __read_seqcount_begin(s) \ ({ \ - unsigned seq; \ + unsigned __seq; \ \ - while ((seq = __seqcount_sequence(s)) & 1) \ + while ((__seq = __seqcount_sequence(s)) & 1) \ cpu_relax(); \ \ kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \ - seq; \ + __seq; \ }) /** @@ -345,10 +345,10 @@ SEQCOUNT_LOCKNAME(ww_mutex, struct ww_mutex, true, &s->lock->base, ww_mu */ #define raw_read_seqcount_begin(s) \ ({ \ - unsigned seq = __read_seqcount_begin(s); \ + unsigned _seq = __read_seqcount_begin(s); \ \ smp_rmb(); \ - seq; \ + _seq; \ }) /** @@ -376,11 +376,11 @@ SEQCOUNT_LOCKNAME(ww_mutex, struct ww_mutex, true, &s->lock->base, ww_mu */ #define raw_read_seqcount(s) \ ({ \ - unsigned seq = __seqcount_sequence(s); \ + unsigned __seq = __seqcount_sequence(s); \ \ smp_rmb(); \ kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \ - seq; \ + __seq; \ }) /** From ac01a0dd790593c3d448d7729a01c95bd37e002f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 10 Nov 2020 13:44:17 +0100 Subject: [PATCH 055/228] seqlock: Rename __seqprop() users [ Upstream commit ab440b2c604b60fe90885270fcfeb5c3dd5d6fae ] More consistent naming should make it easier to untangle the _Generic token pasting maze called __seqprop(). 
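For readers unfamiliar with the construct, here is a tiny self-contained illustration of the _Generic-based dispatch that __seqprop() builds via token pasting (userspace, heavily simplified; the struct and function names are invented for the demo, the kernel generates the real helpers with SEQCOUNT_LOCKNAME()):

    #include <stdio.h>

    struct seqcount_spinlock { unsigned seq; /* plus a spinlock pointer in the kernel */ };
    struct seqcount_mutex    { unsigned seq; /* plus a mutex pointer in the kernel    */ };

    static unsigned seqprop_spinlock_sequence(const struct seqcount_spinlock *s) { return s->seq; }
    static unsigned seqprop_mutex_sequence(const struct seqcount_mutex *s)       { return s->seq; }

    /* Pick the right helper based on the static type of *s, then call it. */
    #define seqprop_sequence(s)                                               \
            _Generic(*(s),                                                    \
                     struct seqcount_spinlock: seqprop_spinlock_sequence,     \
                     struct seqcount_mutex:    seqprop_mutex_sequence)(s)

    int main(void)
    {
            struct seqcount_spinlock a = { .seq = 2 };
            struct seqcount_mutex b    = { .seq = 4 };

            printf("%u %u\n", seqprop_sequence(&a), seqprop_sequence(&b));
            return 0;
    }

The renamed seqprop_*() wrappers in this patch are just clearer spellings of exactly this kind of dispatch.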
Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201110115358.GE2594@hirez.programming.kicks-ass.net Stable-dep-of: 41b43b6c6e30 ("locking/seqlock: Do the lockdep annotation before locking in do_write_seqcount_begin_nested()") Signed-off-by: Sasha Levin --- include/linux/seqlock.h | 46 ++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index fb89b05066f4..66993e9ef90d 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -307,10 +307,10 @@ SEQCOUNT_LOCKNAME(ww_mutex, struct ww_mutex, true, &s->lock->base, ww_mu __seqprop_case((s), mutex, prop), \ __seqprop_case((s), ww_mutex, prop)) -#define __seqcount_ptr(s) __seqprop(s, ptr) -#define __seqcount_sequence(s) __seqprop(s, sequence) -#define __seqcount_lock_preemptible(s) __seqprop(s, preemptible) -#define __seqcount_assert_lock_held(s) __seqprop(s, assert) +#define seqprop_ptr(s) __seqprop(s, ptr) +#define seqprop_sequence(s) __seqprop(s, sequence) +#define seqprop_preemptible(s) __seqprop(s, preemptible) +#define seqprop_assert(s) __seqprop(s, assert) /** * __read_seqcount_begin() - begin a seqcount_t read section w/o barrier @@ -330,7 +330,7 @@ SEQCOUNT_LOCKNAME(ww_mutex, struct ww_mutex, true, &s->lock->base, ww_mu ({ \ unsigned __seq; \ \ - while ((__seq = __seqcount_sequence(s)) & 1) \ + while ((__seq = seqprop_sequence(s)) & 1) \ cpu_relax(); \ \ kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \ @@ -359,7 +359,7 @@ SEQCOUNT_LOCKNAME(ww_mutex, struct ww_mutex, true, &s->lock->base, ww_mu */ #define read_seqcount_begin(s) \ ({ \ - seqcount_lockdep_reader_access(__seqcount_ptr(s)); \ + seqcount_lockdep_reader_access(seqprop_ptr(s)); \ raw_read_seqcount_begin(s); \ }) @@ -376,7 +376,7 @@ SEQCOUNT_LOCKNAME(ww_mutex, struct ww_mutex, true, &s->lock->base, ww_mu */ #define raw_read_seqcount(s) \ ({ \ - unsigned __seq = __seqcount_sequence(s); \ + unsigned __seq = seqprop_sequence(s); \ \ smp_rmb(); \ kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \ @@ -425,7 +425,7 @@ SEQCOUNT_LOCKNAME(ww_mutex, struct ww_mutex, true, &s->lock->base, ww_mu * Return: true if a read section retry is required, else false */ #define __read_seqcount_retry(s, start) \ - __read_seqcount_t_retry(__seqcount_ptr(s), start) + __read_seqcount_t_retry(seqprop_ptr(s), start) static inline int __read_seqcount_t_retry(const seqcount_t *s, unsigned start) { @@ -445,7 +445,7 @@ static inline int __read_seqcount_t_retry(const seqcount_t *s, unsigned start) * Return: true if a read section retry is required, else false */ #define read_seqcount_retry(s, start) \ - read_seqcount_t_retry(__seqcount_ptr(s), start) + read_seqcount_t_retry(seqprop_ptr(s), start) static inline int read_seqcount_t_retry(const seqcount_t *s, unsigned start) { @@ -459,10 +459,10 @@ static inline int read_seqcount_t_retry(const seqcount_t *s, unsigned start) */ #define raw_write_seqcount_begin(s) \ do { \ - if (__seqcount_lock_preemptible(s)) \ + if (seqprop_preemptible(s)) \ preempt_disable(); \ \ - raw_write_seqcount_t_begin(__seqcount_ptr(s)); \ + raw_write_seqcount_t_begin(seqprop_ptr(s)); \ } while (0) static inline void raw_write_seqcount_t_begin(seqcount_t *s) @@ -478,9 +478,9 @@ static inline void raw_write_seqcount_t_begin(seqcount_t *s) */ #define raw_write_seqcount_end(s) \ do { \ - raw_write_seqcount_t_end(__seqcount_ptr(s)); \ + raw_write_seqcount_t_end(seqprop_ptr(s)); \ \ - if (__seqcount_lock_preemptible(s)) \ + if (seqprop_preemptible(s)) \ 
preempt_enable(); \ } while (0) @@ -501,12 +501,12 @@ static inline void raw_write_seqcount_t_end(seqcount_t *s) */ #define write_seqcount_begin_nested(s, subclass) \ do { \ - __seqcount_assert_lock_held(s); \ + seqprop_assert(s); \ \ - if (__seqcount_lock_preemptible(s)) \ + if (seqprop_preemptible(s)) \ preempt_disable(); \ \ - write_seqcount_t_begin_nested(__seqcount_ptr(s), subclass); \ + write_seqcount_t_begin_nested(seqprop_ptr(s), subclass); \ } while (0) static inline void write_seqcount_t_begin_nested(seqcount_t *s, int subclass) @@ -528,12 +528,12 @@ static inline void write_seqcount_t_begin_nested(seqcount_t *s, int subclass) */ #define write_seqcount_begin(s) \ do { \ - __seqcount_assert_lock_held(s); \ + seqprop_assert(s); \ \ - if (__seqcount_lock_preemptible(s)) \ + if (seqprop_preemptible(s)) \ preempt_disable(); \ \ - write_seqcount_t_begin(__seqcount_ptr(s)); \ + write_seqcount_t_begin(seqprop_ptr(s)); \ } while (0) static inline void write_seqcount_t_begin(seqcount_t *s) @@ -549,9 +549,9 @@ static inline void write_seqcount_t_begin(seqcount_t *s) */ #define write_seqcount_end(s) \ do { \ - write_seqcount_t_end(__seqcount_ptr(s)); \ + write_seqcount_t_end(seqprop_ptr(s)); \ \ - if (__seqcount_lock_preemptible(s)) \ + if (seqprop_preemptible(s)) \ preempt_enable(); \ } while (0) @@ -603,7 +603,7 @@ static inline void write_seqcount_t_end(seqcount_t *s) * } */ #define raw_write_seqcount_barrier(s) \ - raw_write_seqcount_t_barrier(__seqcount_ptr(s)) + raw_write_seqcount_t_barrier(seqprop_ptr(s)) static inline void raw_write_seqcount_t_barrier(seqcount_t *s) { @@ -623,7 +623,7 @@ static inline void raw_write_seqcount_t_barrier(seqcount_t *s) * will complete successfully and see data older than this. */ #define write_seqcount_invalidate(s) \ - write_seqcount_t_invalidate(__seqcount_ptr(s)) + write_seqcount_t_invalidate(seqprop_ptr(s)) static inline void write_seqcount_t_invalidate(seqcount_t *s) { From a8dd21118b0fa33efd09b713cef79d02e72719e2 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Sun, 6 Dec 2020 17:21:42 +0100 Subject: [PATCH 056/228] seqlock: Prefix internal seqcount_t-only macros with a "do_" [ Upstream commit 66bcfcdf89d00f2409f4b5da0f8c20c08318dc72 ] When the seqcount_LOCKNAME_t group of data types were introduced, two classes of seqlock.h sequence counter macros were added: - An external public API which can either take a plain seqcount_t or any of the seqcount_LOCKNAME_t variants. - An internal API which takes only a plain seqcount_t. To distinguish between the two groups, the "*_seqcount_t_*" pattern was used for the latter. This confused a number of mm/ call-site developers, and Linus also commented that it was not a standard practice for marking seqlock.h internal APIs. Distinguish the latter group of macros by prefixing a "do_". Signed-off-by: Ahmed S. 
Darwish Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/CAHk-=wikhGExmprXgaW+MVXG1zsGpztBbVwOb23vetk41EtTBQ@mail.gmail.com Stable-dep-of: 41b43b6c6e30 ("locking/seqlock: Do the lockdep annotation before locking in do_write_seqcount_begin_nested()") Signed-off-by: Sasha Levin --- include/linux/seqlock.h | 66 ++++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 66993e9ef90d..008fa88ad58e 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -425,9 +425,9 @@ SEQCOUNT_LOCKNAME(ww_mutex, struct ww_mutex, true, &s->lock->base, ww_mu * Return: true if a read section retry is required, else false */ #define __read_seqcount_retry(s, start) \ - __read_seqcount_t_retry(seqprop_ptr(s), start) + do___read_seqcount_retry(seqprop_ptr(s), start) -static inline int __read_seqcount_t_retry(const seqcount_t *s, unsigned start) +static inline int do___read_seqcount_retry(const seqcount_t *s, unsigned start) { kcsan_atomic_next(0); return unlikely(READ_ONCE(s->sequence) != start); @@ -445,12 +445,12 @@ static inline int __read_seqcount_t_retry(const seqcount_t *s, unsigned start) * Return: true if a read section retry is required, else false */ #define read_seqcount_retry(s, start) \ - read_seqcount_t_retry(seqprop_ptr(s), start) + do_read_seqcount_retry(seqprop_ptr(s), start) -static inline int read_seqcount_t_retry(const seqcount_t *s, unsigned start) +static inline int do_read_seqcount_retry(const seqcount_t *s, unsigned start) { smp_rmb(); - return __read_seqcount_t_retry(s, start); + return do___read_seqcount_retry(s, start); } /** @@ -462,10 +462,10 @@ do { \ if (seqprop_preemptible(s)) \ preempt_disable(); \ \ - raw_write_seqcount_t_begin(seqprop_ptr(s)); \ + do_raw_write_seqcount_begin(seqprop_ptr(s)); \ } while (0) -static inline void raw_write_seqcount_t_begin(seqcount_t *s) +static inline void do_raw_write_seqcount_begin(seqcount_t *s) { kcsan_nestable_atomic_begin(); s->sequence++; @@ -478,13 +478,13 @@ static inline void raw_write_seqcount_t_begin(seqcount_t *s) */ #define raw_write_seqcount_end(s) \ do { \ - raw_write_seqcount_t_end(seqprop_ptr(s)); \ + do_raw_write_seqcount_end(seqprop_ptr(s)); \ \ if (seqprop_preemptible(s)) \ preempt_enable(); \ } while (0) -static inline void raw_write_seqcount_t_end(seqcount_t *s) +static inline void do_raw_write_seqcount_end(seqcount_t *s) { smp_wmb(); s->sequence++; @@ -506,12 +506,12 @@ do { \ if (seqprop_preemptible(s)) \ preempt_disable(); \ \ - write_seqcount_t_begin_nested(seqprop_ptr(s), subclass); \ + do_write_seqcount_begin_nested(seqprop_ptr(s), subclass); \ } while (0) -static inline void write_seqcount_t_begin_nested(seqcount_t *s, int subclass) +static inline void do_write_seqcount_begin_nested(seqcount_t *s, int subclass) { - raw_write_seqcount_t_begin(s); + do_raw_write_seqcount_begin(s); seqcount_acquire(&s->dep_map, subclass, 0, _RET_IP_); } @@ -533,12 +533,12 @@ do { \ if (seqprop_preemptible(s)) \ preempt_disable(); \ \ - write_seqcount_t_begin(seqprop_ptr(s)); \ + do_write_seqcount_begin(seqprop_ptr(s)); \ } while (0) -static inline void write_seqcount_t_begin(seqcount_t *s) +static inline void do_write_seqcount_begin(seqcount_t *s) { - write_seqcount_t_begin_nested(s, 0); + do_write_seqcount_begin_nested(s, 0); } /** @@ -549,16 +549,16 @@ static inline void write_seqcount_t_begin(seqcount_t *s) */ #define write_seqcount_end(s) \ do { \ - write_seqcount_t_end(seqprop_ptr(s)); \ + 
do_write_seqcount_end(seqprop_ptr(s)); \ \ if (seqprop_preemptible(s)) \ preempt_enable(); \ } while (0) -static inline void write_seqcount_t_end(seqcount_t *s) +static inline void do_write_seqcount_end(seqcount_t *s) { seqcount_release(&s->dep_map, _RET_IP_); - raw_write_seqcount_t_end(s); + do_raw_write_seqcount_end(s); } /** @@ -603,9 +603,9 @@ static inline void write_seqcount_t_end(seqcount_t *s) * } */ #define raw_write_seqcount_barrier(s) \ - raw_write_seqcount_t_barrier(seqprop_ptr(s)) + do_raw_write_seqcount_barrier(seqprop_ptr(s)) -static inline void raw_write_seqcount_t_barrier(seqcount_t *s) +static inline void do_raw_write_seqcount_barrier(seqcount_t *s) { kcsan_nestable_atomic_begin(); s->sequence++; @@ -623,9 +623,9 @@ static inline void raw_write_seqcount_t_barrier(seqcount_t *s) * will complete successfully and see data older than this. */ #define write_seqcount_invalidate(s) \ - write_seqcount_t_invalidate(seqprop_ptr(s)) + do_write_seqcount_invalidate(seqprop_ptr(s)) -static inline void write_seqcount_t_invalidate(seqcount_t *s) +static inline void do_write_seqcount_invalidate(seqcount_t *s) { smp_wmb(); kcsan_nestable_atomic_begin(); @@ -862,9 +862,9 @@ static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start) } /* - * For all seqlock_t write side functions, use write_seqcount_*t*_begin() - * instead of the generic write_seqcount_begin(). This way, no redundant - * lockdep_assert_held() checks are added. + * For all seqlock_t write side functions, use the the internal + * do_write_seqcount_begin() instead of generic write_seqcount_begin(). + * This way, no redundant lockdep_assert_held() checks are added. */ /** @@ -883,7 +883,7 @@ static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start) static inline void write_seqlock(seqlock_t *sl) { spin_lock(&sl->lock); - write_seqcount_t_begin(&sl->seqcount.seqcount); + do_write_seqcount_begin(&sl->seqcount.seqcount); } /** @@ -895,7 +895,7 @@ static inline void write_seqlock(seqlock_t *sl) */ static inline void write_sequnlock(seqlock_t *sl) { - write_seqcount_t_end(&sl->seqcount.seqcount); + do_write_seqcount_end(&sl->seqcount.seqcount); spin_unlock(&sl->lock); } @@ -909,7 +909,7 @@ static inline void write_sequnlock(seqlock_t *sl) static inline void write_seqlock_bh(seqlock_t *sl) { spin_lock_bh(&sl->lock); - write_seqcount_t_begin(&sl->seqcount.seqcount); + do_write_seqcount_begin(&sl->seqcount.seqcount); } /** @@ -922,7 +922,7 @@ static inline void write_seqlock_bh(seqlock_t *sl) */ static inline void write_sequnlock_bh(seqlock_t *sl) { - write_seqcount_t_end(&sl->seqcount.seqcount); + do_write_seqcount_end(&sl->seqcount.seqcount); spin_unlock_bh(&sl->lock); } @@ -936,7 +936,7 @@ static inline void write_sequnlock_bh(seqlock_t *sl) static inline void write_seqlock_irq(seqlock_t *sl) { spin_lock_irq(&sl->lock); - write_seqcount_t_begin(&sl->seqcount.seqcount); + do_write_seqcount_begin(&sl->seqcount.seqcount); } /** @@ -948,7 +948,7 @@ static inline void write_seqlock_irq(seqlock_t *sl) */ static inline void write_sequnlock_irq(seqlock_t *sl) { - write_seqcount_t_end(&sl->seqcount.seqcount); + do_write_seqcount_end(&sl->seqcount.seqcount); spin_unlock_irq(&sl->lock); } @@ -957,7 +957,7 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl) unsigned long flags; spin_lock_irqsave(&sl->lock, flags); - write_seqcount_t_begin(&sl->seqcount.seqcount); + do_write_seqcount_begin(&sl->seqcount.seqcount); return flags; } @@ -986,7 +986,7 @@ static inline unsigned long 
__write_seqlock_irqsave(seqlock_t *sl) static inline void write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags) { - write_seqcount_t_end(&sl->seqcount.seqcount); + do_write_seqcount_end(&sl->seqcount.seqcount); spin_unlock_irqrestore(&sl->lock, flags); } From 78106529b3908ca9a1e307044a80b0a72465a816 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 20 Sep 2023 12:46:27 +0200 Subject: [PATCH 057/228] locking/seqlock: Do the lockdep annotation before locking in do_write_seqcount_begin_nested() [ Upstream commit 41b43b6c6e30a832c790b010a06772e793bca193 ] It was brought up by Tetsuo that the following sequence: write_seqlock_irqsave() printk_deferred_enter() could lead to a deadlock if the lockdep annotation within write_seqlock_irqsave() triggers. The problem is that the sequence counter is incremented before the lockdep annotation is performed. The lockdep splat would then attempt to invoke printk() but the reader side, of the same seqcount, could have a tty_port::lock acquired waiting for the sequence number to become even again. The other lockdep annotations come before the actual locking because "we want to see the locking error before it happens". There is no reason why seqcount should be different here. Do the lockdep annotation first then perform the locking operation (the sequence increment). Fixes: 1ca7d67cf5d5a ("seqcount: Add lockdep functionality to seqcount/seqlock structures") Reported-by: Tetsuo Handa Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230920104627._DTHgPyA@linutronix.de Closes: https://lore.kernel.org/20230621130641.-5iueY1I@linutronix.de Signed-off-by: Sasha Levin --- include/linux/seqlock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 008fa88ad58e..0928a60b8f82 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -511,8 +511,8 @@ do { \ static inline void do_write_seqcount_begin_nested(seqcount_t *s, int subclass) { - do_raw_write_seqcount_begin(s); seqcount_acquire(&s->dep_map, subclass, 0, _RET_IP_); + do_raw_write_seqcount_begin(s); } /** From 466e88548e19914e5d0b0d8406f95f47d9232753 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 18 Sep 2023 17:36:10 +0200 Subject: [PATCH 058/228] bnxt_en: Flush XDP for bnxt_poll_nitroa0()'s NAPI [ Upstream commit edc0140cc3b7b91874ebe70eb7d2a851e8817ccc ] bnxt_poll_nitroa0() invokes bnxt_rx_pkt() which can run a XDP program which in turn can return XDP_REDIRECT. bnxt_rx_pkt() is also used by __bnxt_poll_work() which flushes (xdp_do_flush()) the packets after each round. bnxt_poll_nitroa0() lacks this feature. xdp_do_flush() should be invoked before leaving the NAPI callback. Invoke xdp_do_flush() after a redirect in bnxt_poll_nitroa0() NAPI. 
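As a rough sketch of the pattern this fix applies (illustrative only; demo_rx_one() and DEMO_REDIRECT_EVENT are invented stand-ins, while xdp_do_flush() and napi_complete_done() are the real kernel calls): remember whether any packet in the NAPI round was redirected, and flush once before the poll routine finishes.

static int demo_poll(struct napi_struct *napi, int budget)
{
	bool flush_xdp = false;
	int work_done = 0;
	unsigned int event;

	while (work_done < budget && demo_rx_one(napi, &event)) {
		if (event & DEMO_REDIRECT_EVENT)	/* a packet was XDP_REDIRECTed */
			flush_xdp = true;
		work_done++;
	}

	if (flush_xdp)
		xdp_do_flush();		/* flush redirected frames before leaving NAPI */

	if (work_done < budget)
		napi_complete_done(napi, work_done);

	return work_done;
}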
Cc: Michael Chan Fixes: f18c2b77b2e4e ("bnxt_en: optimized XDP_REDIRECT support") Reviewed-by: Andy Gospodarek Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Michael Chan Acked-by: Jesper Dangaard Brouer Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index d8366351cf14..c67a108c2c07 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -2404,6 +2404,7 @@ static int bnxt_poll_nitroa0(struct napi_struct *napi, int budget) struct rx_cmp_ext *rxcmp1; u32 cp_cons, tmp_raw_cons; u32 raw_cons = cpr->cp_raw_cons; + bool flush_xdp = false; u32 rx_pkts = 0; u8 event = 0; @@ -2438,6 +2439,8 @@ static int bnxt_poll_nitroa0(struct napi_struct *napi, int budget) rx_pkts++; else if (rc == -EBUSY) /* partial completion */ break; + if (event & BNXT_REDIRECT_EVENT) + flush_xdp = true; } else if (unlikely(TX_CMP_TYPE(txcmp) == CMPL_BASE_TYPE_HWRM_DONE)) { bnxt_hwrm_handler(bp, txcmp); @@ -2457,6 +2460,8 @@ static int bnxt_poll_nitroa0(struct napi_struct *napi, int budget) if (event & BNXT_AGG_EVENT) bnxt_db_write(bp, &rxr->rx_agg_db, rxr->rx_agg_prod); + if (flush_xdp) + xdp_do_flush(); if (!bnxt_has_work(bp, cpr) && rx_pkts < budget) { napi_complete_done(napi, rx_pkts); From f515112e833791001aaa8ab886af3ca78503617f Mon Sep 17 00:00:00 2001 From: Artem Chernyshev Date: Mon, 18 Sep 2023 16:56:23 +0300 Subject: [PATCH 059/228] net: rds: Fix possible NULL-pointer dereference [ Upstream commit f1d95df0f31048f1c59092648997686e3f7d9478 ] In rds_rdma_cm_event_handler_cmn() check, if conn pointer exists before dereferencing it as rdma_set_service_type() argument Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: fd261ce6a30e ("rds: rdma: update rdma transport for tos") Signed-off-by: Artem Chernyshev Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/rds/rdma_transport.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c index 5f741e51b4ba..bb38124a5d3d 100644 --- a/net/rds/rdma_transport.c +++ b/net/rds/rdma_transport.c @@ -86,10 +86,12 @@ static int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id, break; case RDMA_CM_EVENT_ADDR_RESOLVED: - rdma_set_service_type(cm_id, conn->c_tos); - /* XXX do we need to clean up if this fails? */ - ret = rdma_resolve_route(cm_id, + if (conn) { + rdma_set_service_type(cm_id, conn->c_tos); + /* XXX do we need to clean up if this fails? */ + ret = rdma_resolve_route(cm_id, RDS_RDMA_RESOLVE_TIMEOUT_MS); + } break; case RDMA_CM_EVENT_ROUTE_RESOLVED: From 0a78bcc2d526b08d1a3694d132ae5d5dbafde51c Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 3 Sep 2023 08:13:21 +0200 Subject: [PATCH 060/228] gpio: tb10x: Fix an error handling path in tb10x_gpio_probe() [ Upstream commit b547b5e52a0587e6b25ea520bf2f9e03d00cbcb6 ] If an error occurs after a successful irq_domain_add_linear() call, it should be undone by a corresponding irq_domain_remove(), as already done in the remove function. 
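The shape of the fix, as a sketch (illustrative; demo_later_step() is an invented placeholder and the irq_generic_chip_ops choice only mirrors what this driver happens to use): once irq_domain_add_linear() has succeeded, every later failure in probe must unwind through irq_domain_remove(), matching what the remove path already does.

static int demo_probe(struct platform_device *pdev)
{
	struct irq_domain *domain;
	int ret;

	domain = irq_domain_add_linear(pdev->dev.of_node, 32,
				       &irq_generic_chip_ops, NULL);
	if (!domain)
		return -ENOMEM;

	ret = demo_later_step(pdev);	/* any subsequent step that can fail */
	if (ret)
		goto err_remove_domain;

	return 0;

err_remove_domain:
	irq_domain_remove(domain);
	return ret;
}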
Fixes: c6ce2b6bffe5 ("gpio: add TB10x GPIO driver") Signed-off-by: Christophe JAILLET Signed-off-by: Bartosz Golaszewski Signed-off-by: Sasha Levin --- drivers/gpio/gpio-tb10x.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-tb10x.c b/drivers/gpio/gpio-tb10x.c index 866201cf5f65..4a9dcaad4a6c 100644 --- a/drivers/gpio/gpio-tb10x.c +++ b/drivers/gpio/gpio-tb10x.c @@ -195,7 +195,7 @@ static int tb10x_gpio_probe(struct platform_device *pdev) handle_edge_irq, IRQ_NOREQUEST, IRQ_NOPROBE, IRQ_GC_INIT_MASK_CACHE); if (ret) - return ret; + goto err_remove_domain; gc = tb10x_gpio->domain->gc->gc[0]; gc->reg_base = tb10x_gpio->base; @@ -209,6 +209,10 @@ static int tb10x_gpio_probe(struct platform_device *pdev) } return 0; + +err_remove_domain: + irq_domain_remove(tb10x_gpio->domain); + return ret; } static int tb10x_gpio_remove(struct platform_device *pdev) From 5e95c88e906161faab0a403d021d4414f4a29cc6 Mon Sep 17 00:00:00 2001 From: Xiaoke Wang Date: Thu, 3 Mar 2022 20:39:14 +0800 Subject: [PATCH 061/228] i2c: mux: demux-pinctrl: check the return value of devm_kstrdup() [ Upstream commit 7c0195fa9a9e263df204963f88a22b21688ffb66 ] devm_kstrdup() returns pointer to allocated string on success, NULL on failure. So it is better to check the return value of it. Fixes: e35478eac030 ("i2c: mux: demux-pinctrl: run properly with multiple instances") Signed-off-by: Xiaoke Wang Signed-off-by: Wolfram Sang Signed-off-by: Sasha Levin --- drivers/i2c/muxes/i2c-demux-pinctrl.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/i2c/muxes/i2c-demux-pinctrl.c b/drivers/i2c/muxes/i2c-demux-pinctrl.c index f7a7405d4350..8e8688e8de0f 100644 --- a/drivers/i2c/muxes/i2c-demux-pinctrl.c +++ b/drivers/i2c/muxes/i2c-demux-pinctrl.c @@ -243,6 +243,10 @@ static int i2c_demux_pinctrl_probe(struct platform_device *pdev) props[i].name = devm_kstrdup(&pdev->dev, "status", GFP_KERNEL); props[i].value = devm_kstrdup(&pdev->dev, "ok", GFP_KERNEL); + if (!props[i].name || !props[i].value) { + err = -ENOMEM; + goto err_rollback; + } props[i].length = 3; of_changeset_init(&priv->chan[i].chgset); From e51f30826bc5384801df98d76109c94953d1df64 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 27 Sep 2023 17:30:06 +0200 Subject: [PATCH 062/228] netfilter: nf_tables: unregister flowtable hooks on netns exit commit 6069da443bf65f513bb507bb21e2f87cfb1ad0b6 upstream. Unregister flowtable hooks before they are releases via nf_tables_flowtable_destroy() otherwise hook core reports UAF. 
BUG: KASAN: use-after-free in nf_hook_entries_grow+0x5a7/0x700 net/netfilter/core.c:142 net/netfilter/core.c:142 Read of size 4 at addr ffff8880736f7438 by task syz-executor579/3666 CPU: 0 PID: 3666 Comm: syz-executor579 Not tainted 5.16.0-rc5-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] __dump_stack lib/dump_stack.c:88 [inline] lib/dump_stack.c:106 dump_stack_lvl+0x1dc/0x2d8 lib/dump_stack.c:106 lib/dump_stack.c:106 print_address_description+0x65/0x380 mm/kasan/report.c:247 mm/kasan/report.c:247 __kasan_report mm/kasan/report.c:433 [inline] __kasan_report mm/kasan/report.c:433 [inline] mm/kasan/report.c:450 kasan_report+0x19a/0x1f0 mm/kasan/report.c:450 mm/kasan/report.c:450 nf_hook_entries_grow+0x5a7/0x700 net/netfilter/core.c:142 net/netfilter/core.c:142 __nf_register_net_hook+0x27e/0x8d0 net/netfilter/core.c:429 net/netfilter/core.c:429 nf_register_net_hook+0xaa/0x180 net/netfilter/core.c:571 net/netfilter/core.c:571 nft_register_flowtable_net_hooks+0x3c5/0x730 net/netfilter/nf_tables_api.c:7232 net/netfilter/nf_tables_api.c:7232 nf_tables_newflowtable+0x2022/0x2cf0 net/netfilter/nf_tables_api.c:7430 net/netfilter/nf_tables_api.c:7430 nfnetlink_rcv_batch net/netfilter/nfnetlink.c:513 [inline] nfnetlink_rcv_skb_batch net/netfilter/nfnetlink.c:634 [inline] nfnetlink_rcv_batch net/netfilter/nfnetlink.c:513 [inline] net/netfilter/nfnetlink.c:652 nfnetlink_rcv_skb_batch net/netfilter/nfnetlink.c:634 [inline] net/netfilter/nfnetlink.c:652 nfnetlink_rcv+0x10e6/0x2550 net/netfilter/nfnetlink.c:652 net/netfilter/nfnetlink.c:652 __nft_release_hook() calls nft_unregister_flowtable_net_hooks() which only unregisters the hooks, then after RCU grace period, it is guaranteed that no packets add new entries to the flowtable (no flow offload rules and flowtable hooks are reachable from packet path), so it is safe to call nf_flow_table_free() which cleans up the remaining entries from the flowtable (both software and hardware) and it unbinds the flow_block. 
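The ordering described above boils down to the following sketch (illustrative only; the demo_* names are invented and synchronize_rcu() stands in for the grace period that the netns exit machinery provides): make the flowtable unreachable from the packet path first, wait out the RCU grace period, and only then free its entries and unbind the flow_block.

static void demo_release_flowtable(struct net *net, struct demo_flowtable *ft)
{
	/* 1) unregister hooks: no new packets can reach the flowtable */
	demo_unregister_hooks(net, &ft->hook_list);

	/* 2) let readers already on the packet path finish */
	synchronize_rcu();

	/* 3) now it is safe to drop remaining entries and unbind the flow_block */
	demo_flow_table_free(ft);
}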
Fixes: ff4bf2f42a40 ("netfilter: nf_tables: add nft_unregister_flowtable_hook()") Reported-by: syzbot+e918523f77e62790d6d9@syzkaller.appspotmail.com Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- net/netfilter/nf_tables_api.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 52c776b5967e..efbcf85cd6b7 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -9466,16 +9466,24 @@ int __nft_release_basechain(struct nft_ctx *ctx) } EXPORT_SYMBOL_GPL(__nft_release_basechain); +static void __nft_release_hook(struct net *net, struct nft_table *table) +{ + struct nft_flowtable *flowtable; + struct nft_chain *chain; + + list_for_each_entry(chain, &table->chains, list) + nf_tables_unregister_hook(net, table, chain); + list_for_each_entry(flowtable, &table->flowtables, list) + nft_unregister_flowtable_net_hooks(net, &flowtable->hook_list); +} + static void __nft_release_hooks(struct net *net) { struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); struct nft_table *table; - struct nft_chain *chain; - list_for_each_entry(table, &nft_net->tables, list) { - list_for_each_entry(chain, &table->chains, list) - nf_tables_unregister_hook(net, table, chain); - } + list_for_each_entry(table, &nft_net->tables, list) + __nft_release_hook(net, table); } static void __nft_release_table(struct net *net, struct nft_table *table) From 3fac8ce48fa9fd61ee9056d3ed48b2edefca8b82 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 27 Sep 2023 17:30:07 +0200 Subject: [PATCH 063/228] netfilter: nf_tables: double hook unregistration in netns path commit f9a43007d3f7ba76d5e7f9421094f00f2ef202f8 upstream. [ This backport includes ab5e5c062f67 ("netfilter: nf_tables: use kfree_rcu(ptr, rcu) to release hooks in clean_net path") ] __nft_release_hooks() is called from pre_netns exit path which unregisters the hooks, then the NETDEV_UNREGISTER event is triggered which unregisters the hooks again. [ 565.221461] WARNING: CPU: 18 PID: 193 at net/netfilter/core.c:495 __nf_unregister_net_hook+0x247/0x270 [...] [ 565.246890] CPU: 18 PID: 193 Comm: kworker/u64:1 Tainted: G E 5.18.0-rc7+ #27 [ 565.253682] Workqueue: netns cleanup_net [ 565.257059] RIP: 0010:__nf_unregister_net_hook+0x247/0x270 [...] [ 565.297120] Call Trace: [ 565.300900] [ 565.304683] nf_tables_flowtable_event+0x16a/0x220 [nf_tables] [ 565.308518] raw_notifier_call_chain+0x63/0x80 [ 565.312386] unregister_netdevice_many+0x54f/0xb50 Unregister and destroy netdev hook from netns pre_exit via kfree_rcu so the NETDEV_UNREGISTER path see unregistered hooks. 
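One detail worth spelling out: kfree_rcu(hook, rcu) defers the kfree() past a grace period and requires an rcu_head member with that name inside the structure. A sketch of the required shape (not necessarily the exact struct nft_hook layout):

struct demo_hook {
	struct list_head	list;	/* unlinked with list_del() by the writer */
	struct nf_hook_ops	ops;
	struct rcu_head		rcu;	/* consumed by kfree_rcu(hook, rcu) */
};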
Fixes: 767d1216bff8 ("netfilter: nftables: fix possible UAF over chains from packet path in netns") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- net/netfilter/nf_tables_api.c | 54 ++++++++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 13 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index efbcf85cd6b7..16e2500e8590 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -303,12 +303,18 @@ static int nft_netdev_register_hooks(struct net *net, } static void nft_netdev_unregister_hooks(struct net *net, - struct list_head *hook_list) + struct list_head *hook_list, + bool release_netdev) { - struct nft_hook *hook; + struct nft_hook *hook, *next; - list_for_each_entry(hook, hook_list, list) + list_for_each_entry_safe(hook, next, hook_list, list) { nf_unregister_net_hook(net, &hook->ops); + if (release_netdev) { + list_del(&hook->list); + kfree_rcu(hook, rcu); + } + } } static int nf_tables_register_hook(struct net *net, @@ -334,9 +340,10 @@ static int nf_tables_register_hook(struct net *net, return nf_register_net_hook(net, &basechain->ops); } -static void nf_tables_unregister_hook(struct net *net, - const struct nft_table *table, - struct nft_chain *chain) +static void __nf_tables_unregister_hook(struct net *net, + const struct nft_table *table, + struct nft_chain *chain, + bool release_netdev) { struct nft_base_chain *basechain; const struct nf_hook_ops *ops; @@ -351,11 +358,19 @@ static void nf_tables_unregister_hook(struct net *net, return basechain->type->ops_unregister(net, ops); if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) - nft_netdev_unregister_hooks(net, &basechain->hook_list); + nft_netdev_unregister_hooks(net, &basechain->hook_list, + release_netdev); else nf_unregister_net_hook(net, &basechain->ops); } +static void nf_tables_unregister_hook(struct net *net, + const struct nft_table *table, + struct nft_chain *chain) +{ + return __nf_tables_unregister_hook(net, table, chain, false); +} + static void nft_trans_commit_list_add_tail(struct net *net, struct nft_trans *trans) { struct nftables_pernet *nft_net; @@ -6822,13 +6837,25 @@ static void nft_unregister_flowtable_hook(struct net *net, FLOW_BLOCK_UNBIND); } +static void __nft_unregister_flowtable_net_hooks(struct net *net, + struct list_head *hook_list, + bool release_netdev) +{ + struct nft_hook *hook, *next; + + list_for_each_entry_safe(hook, next, hook_list, list) { + nf_unregister_net_hook(net, &hook->ops); + if (release_netdev) { + list_del(&hook->list); + kfree_rcu(hook, rcu); + } + } +} + static void nft_unregister_flowtable_net_hooks(struct net *net, struct list_head *hook_list) { - struct nft_hook *hook; - - list_for_each_entry(hook, hook_list, list) - nf_unregister_net_hook(net, &hook->ops); + __nft_unregister_flowtable_net_hooks(net, hook_list, false); } static int nft_register_flowtable_net_hooks(struct net *net, @@ -9472,9 +9499,10 @@ static void __nft_release_hook(struct net *net, struct nft_table *table) struct nft_chain *chain; list_for_each_entry(chain, &table->chains, list) - nf_tables_unregister_hook(net, table, chain); + __nf_tables_unregister_hook(net, table, chain, true); list_for_each_entry(flowtable, &table->flowtables, list) - nft_unregister_flowtable_net_hooks(net, &flowtable->hook_list); + __nft_unregister_flowtable_net_hooks(net, &flowtable->hook_list, + true); } static void __nft_release_hooks(struct net *net) From 677bff659fd37579bf7d4f0ea57795af9afa7e20 Mon Sep 17 00:00:00 2001 
From: Huacai Chen Date: Sat, 1 Oct 2022 14:28:34 -0700 Subject: [PATCH 064/228] Input: i8042 - rename i8042-x86ia64io.h to i8042-acpipnpio.h [ Upstream commit 8761b9b580d53162cca7868385069c0d4354c9e0 ] Now i8042-x86ia64io.h is shared by X86 and IA64, but it can be shared by more platforms (such as LoongArch) with ACPI firmware on which PNP typed keyboard and mouse is configured in DSDT. So rename it to i8042- acpipnpio.h. Signed-off-by: Huacai Chen Reviewed-by: Mattijs Korpershoek Link: https://lore.kernel.org/r/20220917064020.1639709-1-chenhuacai@loongson.cn Signed-off-by: Dmitry Torokhov Stable-dep-of: eb09074bdb05 ("Input: i8042 - add quirk for TUXEDO Gemini 17 Gen1/Clevo PD70PN") Signed-off-by: Sasha Levin --- .../input/serio/{i8042-x86ia64io.h => i8042-acpipnpio.h} | 6 +++--- drivers/input/serio/i8042.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) rename drivers/input/serio/{i8042-x86ia64io.h => i8042-acpipnpio.h} (99%) diff --git a/drivers/input/serio/i8042-x86ia64io.h b/drivers/input/serio/i8042-acpipnpio.h similarity index 99% rename from drivers/input/serio/i8042-x86ia64io.h rename to drivers/input/serio/i8042-acpipnpio.h index 9dcdf21c50bd..ced72b45aedc 100644 --- a/drivers/input/serio/i8042-x86ia64io.h +++ b/drivers/input/serio/i8042-acpipnpio.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef _I8042_X86IA64IO_H -#define _I8042_X86IA64IO_H +#ifndef _I8042_ACPIPNPIO_H +#define _I8042_ACPIPNPIO_H #ifdef CONFIG_X86 @@ -1587,4 +1587,4 @@ static inline void i8042_platform_exit(void) i8042_pnp_exit(); } -#endif /* _I8042_X86IA64IO_H */ +#endif /* _I8042_ACPIPNPIO_H */ diff --git a/drivers/input/serio/i8042.h b/drivers/input/serio/i8042.h index 55381783dc82..bf2592fa9a78 100644 --- a/drivers/input/serio/i8042.h +++ b/drivers/input/serio/i8042.h @@ -20,7 +20,7 @@ #elif defined(CONFIG_SPARC) #include "i8042-sparcio.h" #elif defined(CONFIG_X86) || defined(CONFIG_IA64) -#include "i8042-x86ia64io.h" +#include "i8042-acpipnpio.h" #else #include "i8042-io.h" #endif From e492f8125133fcba80595ec6e233c000c8e014bd Mon Sep 17 00:00:00 2001 From: Werner Sembach Date: Wed, 12 Jul 2023 11:56:51 -0700 Subject: [PATCH 065/228] Input: i8042 - add quirk for TUXEDO Gemini 17 Gen1/Clevo PD70PN [ Upstream commit eb09074bdb05ffd6bfe77f8b4a41b76ef78c997b ] The touchpad of this device is both connected via PS/2 and i2c. This causes strange behavior when both driver fight for control. The easy fix is to prevent the PS/2 driver from accessing the mouse port as the full feature set of the touchpad is only supported in the i2c interface anyway. The strange behavior in this case is, that when an external screen is connected and the notebook is closed, the pointer on the external screen is moving to the lower right corner. When the notebook is opened again, this movement stops, but the touchpad clicks are unresponsive afterwards until reboot. 
Signed-off-by: Werner Sembach Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20230607173331.851192-1-wse@tuxedocomputers.com Signed-off-by: Dmitry Torokhov Signed-off-by: Sasha Levin --- drivers/input/serio/i8042-acpipnpio.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/input/serio/i8042-acpipnpio.h b/drivers/input/serio/i8042-acpipnpio.h index ced72b45aedc..1bd5898abb97 100644 --- a/drivers/input/serio/i8042-acpipnpio.h +++ b/drivers/input/serio/i8042-acpipnpio.h @@ -1184,6 +1184,13 @@ static const struct dmi_system_id i8042_dmi_quirk_table[] __initconst = { .driver_data = (void *)(SERIO_QUIRK_NOMUX | SERIO_QUIRK_RESET_ALWAYS | SERIO_QUIRK_NOLOOP | SERIO_QUIRK_NOPNP) }, + /* See comment on TUXEDO InfinityBook S17 Gen6 / Clevo NS70MU above */ + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "PD5x_7xPNP_PNR_PNN_PNT"), + }, + .driver_data = (void *)(SERIO_QUIRK_NOAUX) + }, { .matches = { DMI_MATCH(DMI_BOARD_NAME, "X170SM"), From 7217ceb61a47efd8c4ddd8f5d96702718fd9bfe9 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 10 Nov 2020 15:20:56 +0100 Subject: [PATCH 066/228] mmc: renesas_sdhi: probe into TMIO after SCC parameters have been setup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit b161d87dfd3d9f3fb064a089a9e521d0e5d3e38f ] Setting up the SCC parameters does not need a probed TMIO device. But in the near future, probing the TMIO device needs the SCC parameters setup. So, fix the ordering. Signed-off-by: Wolfram Sang Reviewed-by: Niklas Söderlund Reviewed-by: Yoshihiro Shimoda Tested-by: Yoshihiro Shimoda Link: https://lore.kernel.org/r/20201110142058.36393-3-wsa+renesas@sang-engineering.com Signed-off-by: Ulf Hansson Stable-dep-of: 74f45de394d9 ("mmc: renesas_sdhi: register irqs before registering controller") Signed-off-by: Sasha Levin --- drivers/mmc/host/renesas_sdhi_core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/mmc/host/renesas_sdhi_core.c b/drivers/mmc/host/renesas_sdhi_core.c index a49b8fe2a098..2cf7360d6cad 100644 --- a/drivers/mmc/host/renesas_sdhi_core.c +++ b/drivers/mmc/host/renesas_sdhi_core.c @@ -1070,10 +1070,6 @@ int renesas_sdhi_probe(struct platform_device *pdev, quirks->hs400_calib_table + 1); } - ret = tmio_mmc_host_probe(host); - if (ret < 0) - goto edisclk; - /* Enable tuning iff we have an SCC and a supported mode */ if (of_data && of_data->scc_offset && (host->mmc->caps & MMC_CAP_UHS_SDR104 || @@ -1105,6 +1101,10 @@ int renesas_sdhi_probe(struct platform_device *pdev, host->ops.hs400_complete = renesas_sdhi_hs400_complete; } + ret = tmio_mmc_host_probe(host); + if (ret < 0) + goto edisclk; + num_irqs = platform_irq_count(pdev); if (num_irqs < 0) { ret = num_irqs; From 97eb045386dee8b269fdc647c5fac8f089c50b60 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 10 Nov 2020 15:20:57 +0100 Subject: [PATCH 067/228] mmc: renesas_sdhi: populate SCC pointer at the proper place MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit d14ac691bb6f6ebaa7eeec21ca04dd47300ff5b6 ] The SCC pointer is currently filled whenever the SoC is Gen2+. This is wrong because there is a Gen2-variant without SCC (SDHI_VER_GEN2_SDR50). We have been lucky because the writes to unintended registers have not caused problems so far. But further refactoring work exposed the problem. So, move the pointer initialization to the place where we know that the SDHI instance supports tuning. 
And also populate the 'reset' pointer unconditionally to make sure the interrupt enable register is always properly set for Gen2+. Signed-off-by: Wolfram Sang Reviewed-by: Niklas Söderlund Reviewed-by: Yoshihiro Shimoda Tested-by: Yoshihiro Shimoda Link: https://lore.kernel.org/r/20201110142058.36393-4-wsa+renesas@sang-engineering.com Signed-off-by: Ulf Hansson Stable-dep-of: 74f45de394d9 ("mmc: renesas_sdhi: register irqs before registering controller") Signed-off-by: Sasha Levin --- drivers/mmc/host/renesas_sdhi_core.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/mmc/host/renesas_sdhi_core.c b/drivers/mmc/host/renesas_sdhi_core.c index 2cf7360d6cad..bf8d934fb751 100644 --- a/drivers/mmc/host/renesas_sdhi_core.c +++ b/drivers/mmc/host/renesas_sdhi_core.c @@ -1010,11 +1010,7 @@ int renesas_sdhi_probe(struct platform_device *pdev, host->ops.start_signal_voltage_switch = renesas_sdhi_start_signal_voltage_switch; host->sdcard_irq_setbit_mask = TMIO_STAT_ALWAYS_SET_27; - - if (of_data && of_data->scc_offset) { - priv->scc_ctl = host->ctl + of_data->scc_offset; - host->reset = renesas_sdhi_reset; - } + host->reset = renesas_sdhi_reset; } /* Orginally registers were 16 bit apart, could be 32 or 64 nowadays */ @@ -1094,6 +1090,7 @@ int renesas_sdhi_probe(struct platform_device *pdev, if (!hit) dev_warn(&host->pdev->dev, "Unknown clock rate for tuning\n"); + priv->scc_ctl = host->ctl + of_data->scc_offset; host->check_retune = renesas_sdhi_check_scc_error; host->ops.execute_tuning = renesas_sdhi_execute_tuning; host->ops.prepare_hs400_tuning = renesas_sdhi_prepare_hs400_tuning; From 995ef65e4b5c824cb0e920761d4da924b2404899 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 23 Feb 2021 11:08:29 +0100 Subject: [PATCH 068/228] mmc: tmio: support custom irq masks [ Upstream commit 0d856c4c68c639f96cb12c26aaeb906353b9a76e ] SDHI Gen2+ has a different value for TMIO_MASK_ALL, so add a member to support that. If the member is not used, the previous default value is applied. 
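Expected usage, as a sketch (the condition and mask value below are hypothetical; only sdcard_irq_mask_all, TMIO_MASK_ALL and tmio_mmc_host_probe() come from the code): a glue driver sets the member before handing the host to the core, or leaves it zero to keep the old behaviour.

	/* in a platform glue driver, before registering the host: */
	if (soc_has_extra_irq_bits)				/* hypothetical condition */
		host->sdcard_irq_mask_all = DEMO_SOC_IRQ_MASK;	/* hypothetical SoC mask */
	/* otherwise leave it 0 and the core falls back to TMIO_MASK_ALL */

	ret = tmio_mmc_host_probe(host);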
Signed-off-by: Wolfram Sang Reviewed-by: Yoshihiro Shimoda Tested-by: Yoshihiro Shimoda Link: https://lore.kernel.org/r/20210223100830.25125-2-wsa+renesas@sang-engineering.com Signed-off-by: Ulf Hansson Stable-dep-of: 74f45de394d9 ("mmc: renesas_sdhi: register irqs before registering controller") Signed-off-by: Sasha Levin --- drivers/mmc/host/tmio_mmc.h | 1 + drivers/mmc/host/tmio_mmc_core.c | 8 +++++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/mmc/host/tmio_mmc.h b/drivers/mmc/host/tmio_mmc.h index 9546e542619c..d6ed5e1f8386 100644 --- a/drivers/mmc/host/tmio_mmc.h +++ b/drivers/mmc/host/tmio_mmc.h @@ -161,6 +161,7 @@ struct tmio_mmc_host { u32 sdio_irq_mask; unsigned int clk_cache; u32 sdcard_irq_setbit_mask; + u32 sdcard_irq_mask_all; spinlock_t lock; /* protect host private data */ unsigned long last_req_ts; diff --git a/drivers/mmc/host/tmio_mmc_core.c b/drivers/mmc/host/tmio_mmc_core.c index ac4e7874a3f1..abf36acb2641 100644 --- a/drivers/mmc/host/tmio_mmc_core.c +++ b/drivers/mmc/host/tmio_mmc_core.c @@ -1158,7 +1158,9 @@ int tmio_mmc_host_probe(struct tmio_mmc_host *_host) tmio_mmc_reset(_host); _host->sdcard_irq_mask = sd_ctrl_read16_and_16_as_32(_host, CTL_IRQ_MASK); - tmio_mmc_disable_mmc_irqs(_host, TMIO_MASK_ALL); + if (!_host->sdcard_irq_mask_all) + _host->sdcard_irq_mask_all = TMIO_MASK_ALL; + tmio_mmc_disable_mmc_irqs(_host, _host->sdcard_irq_mask_all); if (_host->native_hotplug) tmio_mmc_enable_mmc_irqs(_host, @@ -1212,7 +1214,7 @@ void tmio_mmc_host_remove(struct tmio_mmc_host *host) cancel_work_sync(&host->done); cancel_delayed_work_sync(&host->delayed_reset_work); tmio_mmc_release_dma(host); - tmio_mmc_disable_mmc_irqs(host, TMIO_MASK_ALL); + tmio_mmc_disable_mmc_irqs(host, host->sdcard_irq_mask_all); if (host->native_hotplug) pm_runtime_put_noidle(&pdev->dev); @@ -1242,7 +1244,7 @@ int tmio_mmc_host_runtime_suspend(struct device *dev) { struct tmio_mmc_host *host = dev_get_drvdata(dev); - tmio_mmc_disable_mmc_irqs(host, TMIO_MASK_ALL); + tmio_mmc_disable_mmc_irqs(host, host->sdcard_irq_mask_all); if (host->clk_cache) host->set_clock(host, 0); From 6d3745bbc3341d3b52f0e8f63987a1403f55f1f8 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Wed, 12 Jul 2023 16:00:11 +0200 Subject: [PATCH 069/228] mmc: renesas_sdhi: register irqs before registering controller [ Upstream commit 74f45de394d979cc7770271f92fafa53e1ed3119 ] IRQs should be ready to serve when we call mmc_add_host() via tmio_mmc_host_probe(). To achieve that, ensure that all irqs are masked before registering the handlers. 
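Condensed, the probe path after this change reads roughly as follows (a sketch with error handling and the irq loop trimmed; the names are the driver's own):

	/* 1) silence the controller before any handler can run */
	sd_ctrl_write32_as_16_and_16(host, CTL_IRQ_MASK, host->sdcard_irq_mask_all);

	/* 2) install the interrupt handlers */
	ret = devm_request_irq(&pdev->dev, irq, tmio_mmc_irq, 0,
			       dev_name(&pdev->dev), host);
	if (ret)
		goto eirq;

	/* 3) only now register the controller; mmc_add_host() happens in here */
	ret = tmio_mmc_host_probe(host);
	if (ret < 0)
		goto edisclk;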
Signed-off-by: Wolfram Sang Tested-by: Biju Das Reviewed-by: Geert Uytterhoeven Tested-by: Geert Uytterhoeven Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20230712140011.18602-1-wsa+renesas@sang-engineering.com Signed-off-by: Ulf Hansson Signed-off-by: Sasha Levin --- drivers/mmc/host/renesas_sdhi_core.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/mmc/host/renesas_sdhi_core.c b/drivers/mmc/host/renesas_sdhi_core.c index bf8d934fb751..95abd421d0d2 100644 --- a/drivers/mmc/host/renesas_sdhi_core.c +++ b/drivers/mmc/host/renesas_sdhi_core.c @@ -1011,6 +1011,8 @@ int renesas_sdhi_probe(struct platform_device *pdev, renesas_sdhi_start_signal_voltage_switch; host->sdcard_irq_setbit_mask = TMIO_STAT_ALWAYS_SET_27; host->reset = renesas_sdhi_reset; + } else { + host->sdcard_irq_mask_all = TMIO_MASK_ALL; } /* Orginally registers were 16 bit apart, could be 32 or 64 nowadays */ @@ -1098,9 +1100,7 @@ int renesas_sdhi_probe(struct platform_device *pdev, host->ops.hs400_complete = renesas_sdhi_hs400_complete; } - ret = tmio_mmc_host_probe(host); - if (ret < 0) - goto edisclk; + sd_ctrl_write32_as_16_and_16(host, CTL_IRQ_MASK, host->sdcard_irq_mask_all); num_irqs = platform_irq_count(pdev); if (num_irqs < 0) { @@ -1127,6 +1127,10 @@ int renesas_sdhi_probe(struct platform_device *pdev, goto eirq; } + ret = tmio_mmc_host_probe(host); + if (ret < 0) + goto edisclk; + dev_info(&pdev->dev, "%s base at %pa, max clock rate %u MHz\n", mmc_hostname(host->mmc), &res->start, host->mmc->f_max / 1000000); From 4ccdeb68da0cc5b1e0443e0923683b421bd53dcb Mon Sep 17 00:00:00 2001 From: Bryan O'Donoghue Date: Fri, 2 Apr 2021 12:06:27 +0200 Subject: [PATCH 070/228] media: venus: core: Add io base variables for each block [ Upstream commit b4053a2097ec2f8ea622e817ae5a46a83b23aefe ] New silicon means that the pre-determined offsets we have been using in this driver no longer hold. Existing blocks of registers can exist at different offsets relative to the IO base address. This commit adds a routine to assign the IO base hooks a subsequent commit will convert from absolute to relative addressing. 
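For orientation, the assignment routine boils down to computing one pointer per register block at probe time, so later code can use block-relative offsets; the V6-style re-basing in the comment below is hypothetical and only illustrates why per-block bases help on new silicon.

	/* computed once in venus_assign_register_offsets() at probe time */
	core->cpu_cs_base  = core->base + CPU_CS_BASE;
	core->wrapper_base = core->base + WRAPPER_BASE;

	/*
	 * On newer silicon the same blocks can simply be re-based, e.g.
	 * (hypothetical macro names):
	 *	core->cpu_cs_base  = core->base + CPU_CS_BASE_V6;
	 *	core->wrapper_base = core->base + WRAPPER_BASE_V6;
	 */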
Signed-off-by: Bryan O'Donoghue Signed-off-by: Stanimir Varbanov Signed-off-by: Mauro Carvalho Chehab Stable-dep-of: d74e48160980 ("media: venus: hfi_venus: Write to VIDC_CTRL_INIT after unmasking interrupts") Signed-off-by: Sasha Levin --- drivers/media/platform/qcom/venus/core.c | 12 ++++++++++++ drivers/media/platform/qcom/venus/core.h | 10 ++++++++++ 2 files changed, 22 insertions(+) diff --git a/drivers/media/platform/qcom/venus/core.c b/drivers/media/platform/qcom/venus/core.c index 62d11c6e41d6..5f7ac2807e5f 100644 --- a/drivers/media/platform/qcom/venus/core.c +++ b/drivers/media/platform/qcom/venus/core.c @@ -21,6 +21,7 @@ #include "core.h" #include "firmware.h" #include "pm_helpers.h" +#include "hfi_venus_io.h" static void venus_event_notify(struct venus_core *core, u32 event) { @@ -210,6 +211,15 @@ static int venus_enumerate_codecs(struct venus_core *core, u32 type) return ret; } +static void venus_assign_register_offsets(struct venus_core *core) +{ + core->vbif_base = core->base + VBIF_BASE; + core->cpu_base = core->base + CPU_BASE; + core->cpu_cs_base = core->base + CPU_CS_BASE; + core->cpu_ic_base = core->base + CPU_IC_BASE; + core->wrapper_base = core->base + WRAPPER_BASE; +} + static int venus_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; @@ -276,6 +286,8 @@ static int venus_probe(struct platform_device *pdev) if (ret) goto err_core_put; + venus_assign_register_offsets(core); + ret = v4l2_device_register(dev, &core->v4l2_dev); if (ret) goto err_core_deinit; diff --git a/drivers/media/platform/qcom/venus/core.h b/drivers/media/platform/qcom/venus/core.h index aebd4c664bfa..50eb0a9fb134 100644 --- a/drivers/media/platform/qcom/venus/core.h +++ b/drivers/media/platform/qcom/venus/core.h @@ -119,6 +119,11 @@ struct venus_caps { * struct venus_core - holds core parameters valid for all instances * * @base: IO memory base address + * @vbif_base IO memory vbif base address + * @cpu_base IO memory cpu base address + * @cpu_cs_base IO memory cpu_cs base address + * @cpu_ic_base IO memory cpu_ic base address + * @wrapper_base IO memory wrapper base address * @irq: Venus irq * @clks: an array of struct clk pointers * @vcodec0_clks: an array of vcodec0 struct clk pointers @@ -152,6 +157,11 @@ struct venus_caps { */ struct venus_core { void __iomem *base; + void __iomem *vbif_base; + void __iomem *cpu_base; + void __iomem *cpu_cs_base; + void __iomem *cpu_ic_base; + void __iomem *wrapper_base; int irq; struct clk *clks[VIDC_CLKS_NUM_MAX]; struct clk *vcodec0_clks[VIDC_VCODEC_CLKS_NUM_MAX]; From ebccb53232ccde59da77874fc5392cb2c79e3c3b Mon Sep 17 00:00:00 2001 From: Bryan O'Donoghue Date: Fri, 2 Apr 2021 12:06:28 +0200 Subject: [PATCH 071/228] media: venus: hfi,pm,firmware: Convert to block relative addressing [ Upstream commit ff2a7013b3e6a3d34d2b5c7786b8a73093d25319 ] An upcoming silicon change places a number of existing blocks within the Venus at different relative offsets to the base address of IO region. In order to handle this difference this patch changes the address offsets of the registers to function as offsets relative to the relevant sub-block of registers within the IO region not the base address of the IO region. As a result of this change venus_readl() and venus_writel() are deleted. 
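The conversion is easiest to see on a single access, before and after (val is a placeholder):

	/* before: the macro carried the absolute offset, venus_writel() added core->base */
	venus_writel(hdev, WRAPPER_INTR_MASK, val);	/* base + 0xe0000 + 0x10 */

	/* after: plain writel() against the block's own base pointer */
	writel(val, hdev->core->wrapper_base + WRAPPER_INTR_MASK);	/* offset 0x10 */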
Co-developed-by: Dikshita Agarwal Signed-off-by: Dikshita Agarwal Signed-off-by: Bryan O'Donoghue Signed-off-by: Stanimir Varbanov Signed-off-by: Mauro Carvalho Chehab Stable-dep-of: d74e48160980 ("media: venus: hfi_venus: Write to VIDC_CTRL_INIT after unmasking interrupts") Signed-off-by: Sasha Levin --- drivers/media/platform/qcom/venus/firmware.c | 28 +++---- drivers/media/platform/qcom/venus/hfi_venus.c | 82 ++++++++++--------- .../media/platform/qcom/venus/hfi_venus_io.h | 80 +++++++++--------- .../media/platform/qcom/venus/pm_helpers.c | 12 +-- 4 files changed, 105 insertions(+), 97 deletions(-) diff --git a/drivers/media/platform/qcom/venus/firmware.c b/drivers/media/platform/qcom/venus/firmware.c index 1db64a854b88..67b9138a7c5f 100644 --- a/drivers/media/platform/qcom/venus/firmware.c +++ b/drivers/media/platform/qcom/venus/firmware.c @@ -27,19 +27,19 @@ static void venus_reset_cpu(struct venus_core *core) { u32 fw_size = core->fw.mapped_mem_size; - void __iomem *base = core->base; + void __iomem *wrapper_base = core->wrapper_base; - writel(0, base + WRAPPER_FW_START_ADDR); - writel(fw_size, base + WRAPPER_FW_END_ADDR); - writel(0, base + WRAPPER_CPA_START_ADDR); - writel(fw_size, base + WRAPPER_CPA_END_ADDR); - writel(fw_size, base + WRAPPER_NONPIX_START_ADDR); - writel(fw_size, base + WRAPPER_NONPIX_END_ADDR); - writel(0x0, base + WRAPPER_CPU_CGC_DIS); - writel(0x0, base + WRAPPER_CPU_CLOCK_CONFIG); + writel(0, wrapper_base + WRAPPER_FW_START_ADDR); + writel(fw_size, wrapper_base + WRAPPER_FW_END_ADDR); + writel(0, wrapper_base + WRAPPER_CPA_START_ADDR); + writel(fw_size, wrapper_base + WRAPPER_CPA_END_ADDR); + writel(fw_size, wrapper_base + WRAPPER_NONPIX_START_ADDR); + writel(fw_size, wrapper_base + WRAPPER_NONPIX_END_ADDR); + writel(0x0, wrapper_base + WRAPPER_CPU_CGC_DIS); + writel(0x0, wrapper_base + WRAPPER_CPU_CLOCK_CONFIG); /* Bring ARM9 out of reset */ - writel(0, base + WRAPPER_A9SS_SW_RESET); + writel(0, wrapper_base + WRAPPER_A9SS_SW_RESET); } int venus_set_hw_state(struct venus_core *core, bool resume) @@ -56,7 +56,7 @@ int venus_set_hw_state(struct venus_core *core, bool resume) if (resume) venus_reset_cpu(core); else - writel(1, core->base + WRAPPER_A9SS_SW_RESET); + writel(1, core->wrapper_base + WRAPPER_A9SS_SW_RESET); return 0; } @@ -159,12 +159,12 @@ static int venus_shutdown_no_tz(struct venus_core *core) size_t unmapped; u32 reg; struct device *dev = core->fw.dev; - void __iomem *base = core->base; + void __iomem *wrapper_base = core->wrapper_base; /* Assert the reset to ARM9 */ - reg = readl_relaxed(base + WRAPPER_A9SS_SW_RESET); + reg = readl_relaxed(wrapper_base + WRAPPER_A9SS_SW_RESET); reg |= WRAPPER_A9SS_SW_RESET_BIT; - writel_relaxed(reg, base + WRAPPER_A9SS_SW_RESET); + writel_relaxed(reg, wrapper_base + WRAPPER_A9SS_SW_RESET); /* Make sure reset is asserted before the mapping is removed */ mb(); diff --git a/drivers/media/platform/qcom/venus/hfi_venus.c b/drivers/media/platform/qcom/venus/hfi_venus.c index 4be4a75ddcb6..3d705fc5e109 100644 --- a/drivers/media/platform/qcom/venus/hfi_venus.c +++ b/drivers/media/platform/qcom/venus/hfi_venus.c @@ -345,16 +345,6 @@ static void venus_free(struct venus_hfi_device *hdev, struct mem_desc *mem) dma_free_attrs(dev, mem->size, mem->kva, mem->da, mem->attrs); } -static void venus_writel(struct venus_hfi_device *hdev, u32 reg, u32 value) -{ - writel(value, hdev->core->base + reg); -} - -static u32 venus_readl(struct venus_hfi_device *hdev, u32 reg) -{ - return readl(hdev->core->base + reg); -} - static void 
venus_set_registers(struct venus_hfi_device *hdev) { const struct venus_resources *res = hdev->core->res; @@ -363,12 +353,14 @@ static void venus_set_registers(struct venus_hfi_device *hdev) unsigned int i; for (i = 0; i < count; i++) - venus_writel(hdev, tbl[i].reg, tbl[i].value); + writel(tbl[i].value, hdev->core->base + tbl[i].reg); } static void venus_soft_int(struct venus_hfi_device *hdev) { - venus_writel(hdev, CPU_IC_SOFTINT, BIT(CPU_IC_SOFTINT_H2A_SHIFT)); + void __iomem *cpu_ic_base = hdev->core->cpu_ic_base; + + writel(BIT(CPU_IC_SOFTINT_H2A_SHIFT), cpu_ic_base + CPU_IC_SOFTINT); } static int venus_iface_cmdq_write_nolock(struct venus_hfi_device *hdev, @@ -441,14 +433,16 @@ static int venus_boot_core(struct venus_hfi_device *hdev) static const unsigned int max_tries = 100; u32 ctrl_status = 0; unsigned int count = 0; + void __iomem *cpu_cs_base = hdev->core->cpu_cs_base; + void __iomem *wrapper_base = hdev->core->wrapper_base; int ret = 0; - venus_writel(hdev, VIDC_CTRL_INIT, BIT(VIDC_CTRL_INIT_CTRL_SHIFT)); - venus_writel(hdev, WRAPPER_INTR_MASK, WRAPPER_INTR_MASK_A2HVCODEC_MASK); - venus_writel(hdev, CPU_CS_SCIACMDARG3, 1); + writel(BIT(VIDC_CTRL_INIT_CTRL_SHIFT), cpu_cs_base + VIDC_CTRL_INIT); + writel(WRAPPER_INTR_MASK_A2HVCODEC_MASK, wrapper_base + WRAPPER_INTR_MASK); + writel(1, cpu_cs_base + CPU_CS_SCIACMDARG3); while (!ctrl_status && count < max_tries) { - ctrl_status = venus_readl(hdev, CPU_CS_SCIACMDARG0); + ctrl_status = readl(cpu_cs_base + CPU_CS_SCIACMDARG0); if ((ctrl_status & CPU_CS_SCIACMDARG0_ERROR_STATUS_MASK) == 4) { dev_err(dev, "invalid setting for UC_REGION\n"); ret = -EINVAL; @@ -468,9 +462,11 @@ static int venus_boot_core(struct venus_hfi_device *hdev) static u32 venus_hwversion(struct venus_hfi_device *hdev) { struct device *dev = hdev->core->dev; - u32 ver = venus_readl(hdev, WRAPPER_HW_VERSION); + void __iomem *wrapper_base = hdev->core->wrapper_base; + u32 ver; u32 major, minor, step; + ver = readl(wrapper_base + WRAPPER_HW_VERSION); major = ver & WRAPPER_HW_VERSION_MAJOR_VERSION_MASK; major = major >> WRAPPER_HW_VERSION_MAJOR_VERSION_SHIFT; minor = ver & WRAPPER_HW_VERSION_MINOR_VERSION_MASK; @@ -485,6 +481,7 @@ static u32 venus_hwversion(struct venus_hfi_device *hdev) static int venus_run(struct venus_hfi_device *hdev) { struct device *dev = hdev->core->dev; + void __iomem *cpu_cs_base = hdev->core->cpu_cs_base; int ret; /* @@ -493,12 +490,12 @@ static int venus_run(struct venus_hfi_device *hdev) */ venus_set_registers(hdev); - venus_writel(hdev, UC_REGION_ADDR, hdev->ifaceq_table.da); - venus_writel(hdev, UC_REGION_SIZE, SHARED_QSIZE); - venus_writel(hdev, CPU_CS_SCIACMDARG2, hdev->ifaceq_table.da); - venus_writel(hdev, CPU_CS_SCIACMDARG1, 0x01); + writel(hdev->ifaceq_table.da, cpu_cs_base + UC_REGION_ADDR); + writel(SHARED_QSIZE, cpu_cs_base + UC_REGION_SIZE); + writel(hdev->ifaceq_table.da, cpu_cs_base + CPU_CS_SCIACMDARG2); + writel(0x01, cpu_cs_base + CPU_CS_SCIACMDARG1); if (hdev->sfr.da) - venus_writel(hdev, SFR_ADDR, hdev->sfr.da); + writel(hdev->sfr.da, cpu_cs_base + SFR_ADDR); ret = venus_boot_core(hdev); if (ret) { @@ -513,17 +510,18 @@ static int venus_run(struct venus_hfi_device *hdev) static int venus_halt_axi(struct venus_hfi_device *hdev) { - void __iomem *base = hdev->core->base; + void __iomem *wrapper_base = hdev->core->wrapper_base; + void __iomem *vbif_base = hdev->core->vbif_base; struct device *dev = hdev->core->dev; u32 val; int ret; if (IS_V4(hdev->core)) { - val = venus_readl(hdev, WRAPPER_CPU_AXI_HALT); + val = 
readl(wrapper_base + WRAPPER_CPU_AXI_HALT); val |= WRAPPER_CPU_AXI_HALT_HALT; - venus_writel(hdev, WRAPPER_CPU_AXI_HALT, val); + writel(val, wrapper_base + WRAPPER_CPU_AXI_HALT); - ret = readl_poll_timeout(base + WRAPPER_CPU_AXI_HALT_STATUS, + ret = readl_poll_timeout(wrapper_base + WRAPPER_CPU_AXI_HALT_STATUS, val, val & WRAPPER_CPU_AXI_HALT_STATUS_IDLE, POLL_INTERVAL_US, @@ -537,12 +535,12 @@ static int venus_halt_axi(struct venus_hfi_device *hdev) } /* Halt AXI and AXI IMEM VBIF Access */ - val = venus_readl(hdev, VBIF_AXI_HALT_CTRL0); + val = readl(vbif_base + VBIF_AXI_HALT_CTRL0); val |= VBIF_AXI_HALT_CTRL0_HALT_REQ; - venus_writel(hdev, VBIF_AXI_HALT_CTRL0, val); + writel(val, vbif_base + VBIF_AXI_HALT_CTRL0); /* Request for AXI bus port halt */ - ret = readl_poll_timeout(base + VBIF_AXI_HALT_CTRL1, val, + ret = readl_poll_timeout(vbif_base + VBIF_AXI_HALT_CTRL1, val, val & VBIF_AXI_HALT_CTRL1_HALT_ACK, POLL_INTERVAL_US, VBIF_AXI_HALT_ACK_TIMEOUT_US); @@ -1035,19 +1033,21 @@ static irqreturn_t venus_isr(struct venus_core *core) { struct venus_hfi_device *hdev = to_hfi_priv(core); u32 status; + void __iomem *cpu_cs_base = hdev->core->cpu_cs_base; + void __iomem *wrapper_base = hdev->core->wrapper_base; if (!hdev) return IRQ_NONE; - status = venus_readl(hdev, WRAPPER_INTR_STATUS); + status = readl(wrapper_base + WRAPPER_INTR_STATUS); if (status & WRAPPER_INTR_STATUS_A2H_MASK || status & WRAPPER_INTR_STATUS_A2HWD_MASK || status & CPU_CS_SCIACMDARG0_INIT_IDLE_MSG_MASK) hdev->irq_status = status; - venus_writel(hdev, CPU_CS_A2HSOFTINTCLR, 1); - venus_writel(hdev, WRAPPER_INTR_CLEAR, status); + writel(1, cpu_cs_base + CPU_CS_A2HSOFTINTCLR); + writel(status, wrapper_base + WRAPPER_INTR_CLEAR); return IRQ_WAKE_THREAD; } @@ -1380,6 +1380,7 @@ static int venus_suspend_1xx(struct venus_core *core) { struct venus_hfi_device *hdev = to_hfi_priv(core); struct device *dev = core->dev; + void __iomem *cpu_cs_base = hdev->core->cpu_cs_base; u32 ctrl_status; int ret; @@ -1414,7 +1415,7 @@ static int venus_suspend_1xx(struct venus_core *core) return -EINVAL; } - ctrl_status = venus_readl(hdev, CPU_CS_SCIACMDARG0); + ctrl_status = readl(cpu_cs_base + CPU_CS_SCIACMDARG0); if (!(ctrl_status & CPU_CS_SCIACMDARG0_PC_READY)) { mutex_unlock(&hdev->lock); return -EINVAL; @@ -1435,10 +1436,12 @@ static int venus_suspend_1xx(struct venus_core *core) static bool venus_cpu_and_video_core_idle(struct venus_hfi_device *hdev) { + void __iomem *wrapper_base = hdev->core->wrapper_base; + void __iomem *cpu_cs_base = hdev->core->cpu_cs_base; u32 ctrl_status, cpu_status; - cpu_status = venus_readl(hdev, WRAPPER_CPU_STATUS); - ctrl_status = venus_readl(hdev, CPU_CS_SCIACMDARG0); + cpu_status = readl(wrapper_base + WRAPPER_CPU_STATUS); + ctrl_status = readl(cpu_cs_base + CPU_CS_SCIACMDARG0); if (cpu_status & WRAPPER_CPU_STATUS_WFI && ctrl_status & CPU_CS_SCIACMDARG0_INIT_IDLE_MSG_MASK) @@ -1449,10 +1452,12 @@ static bool venus_cpu_and_video_core_idle(struct venus_hfi_device *hdev) static bool venus_cpu_idle_and_pc_ready(struct venus_hfi_device *hdev) { + void __iomem *wrapper_base = hdev->core->wrapper_base; + void __iomem *cpu_cs_base = hdev->core->cpu_cs_base; u32 ctrl_status, cpu_status; - cpu_status = venus_readl(hdev, WRAPPER_CPU_STATUS); - ctrl_status = venus_readl(hdev, CPU_CS_SCIACMDARG0); + cpu_status = readl(wrapper_base + WRAPPER_CPU_STATUS); + ctrl_status = readl(cpu_cs_base + CPU_CS_SCIACMDARG0); if (cpu_status & WRAPPER_CPU_STATUS_WFI && ctrl_status & CPU_CS_SCIACMDARG0_PC_READY) @@ -1465,6 +1470,7 @@ static 
int venus_suspend_3xx(struct venus_core *core) { struct venus_hfi_device *hdev = to_hfi_priv(core); struct device *dev = core->dev; + void __iomem *cpu_cs_base = hdev->core->cpu_cs_base; u32 ctrl_status; bool val; int ret; @@ -1481,7 +1487,7 @@ static int venus_suspend_3xx(struct venus_core *core) return -EINVAL; } - ctrl_status = venus_readl(hdev, CPU_CS_SCIACMDARG0); + ctrl_status = readl(cpu_cs_base + CPU_CS_SCIACMDARG0); if (ctrl_status & CPU_CS_SCIACMDARG0_PC_READY) goto power_off; diff --git a/drivers/media/platform/qcom/venus/hfi_venus_io.h b/drivers/media/platform/qcom/venus/hfi_venus_io.h index 3b52f98478db..4c392b67252c 100644 --- a/drivers/media/platform/qcom/venus/hfi_venus_io.h +++ b/drivers/media/platform/qcom/venus/hfi_venus_io.h @@ -8,27 +8,28 @@ #define VBIF_BASE 0x80000 -#define VBIF_AXI_HALT_CTRL0 (VBIF_BASE + 0x208) -#define VBIF_AXI_HALT_CTRL1 (VBIF_BASE + 0x20c) +#define VBIF_AXI_HALT_CTRL0 0x208 +#define VBIF_AXI_HALT_CTRL1 0x20c #define VBIF_AXI_HALT_CTRL0_HALT_REQ BIT(0) #define VBIF_AXI_HALT_CTRL1_HALT_ACK BIT(0) #define VBIF_AXI_HALT_ACK_TIMEOUT_US 500000 #define CPU_BASE 0xc0000 + #define CPU_CS_BASE (CPU_BASE + 0x12000) #define CPU_IC_BASE (CPU_BASE + 0x1f000) -#define CPU_CS_A2HSOFTINTCLR (CPU_CS_BASE + 0x1c) +#define CPU_CS_A2HSOFTINTCLR 0x1c -#define VIDC_CTRL_INIT (CPU_CS_BASE + 0x48) +#define VIDC_CTRL_INIT 0x48 #define VIDC_CTRL_INIT_RESERVED_BITS31_1_MASK 0xfffffffe #define VIDC_CTRL_INIT_RESERVED_BITS31_1_SHIFT 1 #define VIDC_CTRL_INIT_CTRL_MASK 0x1 #define VIDC_CTRL_INIT_CTRL_SHIFT 0 /* HFI control status */ -#define CPU_CS_SCIACMDARG0 (CPU_CS_BASE + 0x4c) +#define CPU_CS_SCIACMDARG0 0x4c #define CPU_CS_SCIACMDARG0_MASK 0xff #define CPU_CS_SCIACMDARG0_SHIFT 0x0 #define CPU_CS_SCIACMDARG0_ERROR_STATUS_MASK 0xfe @@ -39,42 +40,43 @@ #define CPU_CS_SCIACMDARG0_INIT_IDLE_MSG_MASK BIT(30) /* HFI queue table info */ -#define CPU_CS_SCIACMDARG1 (CPU_CS_BASE + 0x50) +#define CPU_CS_SCIACMDARG1 0x50 /* HFI queue table address */ -#define CPU_CS_SCIACMDARG2 (CPU_CS_BASE + 0x54) +#define CPU_CS_SCIACMDARG2 0x54 /* Venus cpu */ -#define CPU_CS_SCIACMDARG3 (CPU_CS_BASE + 0x58) +#define CPU_CS_SCIACMDARG3 0x58 -#define SFR_ADDR (CPU_CS_BASE + 0x5c) -#define MMAP_ADDR (CPU_CS_BASE + 0x60) -#define UC_REGION_ADDR (CPU_CS_BASE + 0x64) -#define UC_REGION_SIZE (CPU_CS_BASE + 0x68) +#define SFR_ADDR 0x5c +#define MMAP_ADDR 0x60 +#define UC_REGION_ADDR 0x64 +#define UC_REGION_SIZE 0x68 -#define CPU_IC_SOFTINT (CPU_IC_BASE + 0x18) +/* Relative to CPU_IC_BASE */ +#define CPU_IC_SOFTINT 0x18 #define CPU_IC_SOFTINT_H2A_MASK 0x8000 #define CPU_IC_SOFTINT_H2A_SHIFT 0xf /* Venus wrapper */ #define WRAPPER_BASE 0x000e0000 -#define WRAPPER_HW_VERSION (WRAPPER_BASE + 0x00) +#define WRAPPER_HW_VERSION 0x00 #define WRAPPER_HW_VERSION_MAJOR_VERSION_MASK 0x78000000 #define WRAPPER_HW_VERSION_MAJOR_VERSION_SHIFT 28 #define WRAPPER_HW_VERSION_MINOR_VERSION_MASK 0xfff0000 #define WRAPPER_HW_VERSION_MINOR_VERSION_SHIFT 16 #define WRAPPER_HW_VERSION_STEP_VERSION_MASK 0xffff -#define WRAPPER_CLOCK_CONFIG (WRAPPER_BASE + 0x04) +#define WRAPPER_CLOCK_CONFIG 0x04 -#define WRAPPER_INTR_STATUS (WRAPPER_BASE + 0x0c) +#define WRAPPER_INTR_STATUS 0x0c #define WRAPPER_INTR_STATUS_A2HWD_MASK 0x10 #define WRAPPER_INTR_STATUS_A2HWD_SHIFT 0x4 #define WRAPPER_INTR_STATUS_A2H_MASK 0x4 #define WRAPPER_INTR_STATUS_A2H_SHIFT 0x2 -#define WRAPPER_INTR_MASK (WRAPPER_BASE + 0x10) +#define WRAPPER_INTR_MASK 0x10 #define WRAPPER_INTR_MASK_A2HWD_BASK 0x10 #define WRAPPER_INTR_MASK_A2HWD_SHIFT 0x4 #define 
WRAPPER_INTR_MASK_A2HVCODEC_MASK 0x8 @@ -82,41 +84,41 @@ #define WRAPPER_INTR_MASK_A2HCPU_MASK 0x4 #define WRAPPER_INTR_MASK_A2HCPU_SHIFT 0x2 -#define WRAPPER_INTR_CLEAR (WRAPPER_BASE + 0x14) +#define WRAPPER_INTR_CLEAR 0x14 #define WRAPPER_INTR_CLEAR_A2HWD_MASK 0x10 #define WRAPPER_INTR_CLEAR_A2HWD_SHIFT 0x4 #define WRAPPER_INTR_CLEAR_A2H_MASK 0x4 #define WRAPPER_INTR_CLEAR_A2H_SHIFT 0x2 -#define WRAPPER_POWER_STATUS (WRAPPER_BASE + 0x44) -#define WRAPPER_VDEC_VCODEC_POWER_CONTROL (WRAPPER_BASE + 0x48) -#define WRAPPER_VENC_VCODEC_POWER_CONTROL (WRAPPER_BASE + 0x4c) -#define WRAPPER_VDEC_VENC_AHB_BRIDGE_SYNC_RESET (WRAPPER_BASE + 0x64) +#define WRAPPER_POWER_STATUS 0x44 +#define WRAPPER_VDEC_VCODEC_POWER_CONTROL 0x48 +#define WRAPPER_VENC_VCODEC_POWER_CONTROL 0x4c +#define WRAPPER_VDEC_VENC_AHB_BRIDGE_SYNC_RESET 0x64 -#define WRAPPER_CPU_CLOCK_CONFIG (WRAPPER_BASE + 0x2000) -#define WRAPPER_CPU_AXI_HALT (WRAPPER_BASE + 0x2008) +#define WRAPPER_CPU_CLOCK_CONFIG 0x2000 +#define WRAPPER_CPU_AXI_HALT 0x2008 #define WRAPPER_CPU_AXI_HALT_HALT BIT(16) -#define WRAPPER_CPU_AXI_HALT_STATUS (WRAPPER_BASE + 0x200c) +#define WRAPPER_CPU_AXI_HALT_STATUS 0x200c #define WRAPPER_CPU_AXI_HALT_STATUS_IDLE BIT(24) -#define WRAPPER_CPU_CGC_DIS (WRAPPER_BASE + 0x2010) -#define WRAPPER_CPU_STATUS (WRAPPER_BASE + 0x2014) +#define WRAPPER_CPU_CGC_DIS 0x2010 +#define WRAPPER_CPU_STATUS 0x2014 #define WRAPPER_CPU_STATUS_WFI BIT(0) -#define WRAPPER_SW_RESET (WRAPPER_BASE + 0x3000) -#define WRAPPER_CPA_START_ADDR (WRAPPER_BASE + 0x1020) -#define WRAPPER_CPA_END_ADDR (WRAPPER_BASE + 0x1024) -#define WRAPPER_FW_START_ADDR (WRAPPER_BASE + 0x1028) -#define WRAPPER_FW_END_ADDR (WRAPPER_BASE + 0x102C) -#define WRAPPER_NONPIX_START_ADDR (WRAPPER_BASE + 0x1030) -#define WRAPPER_NONPIX_END_ADDR (WRAPPER_BASE + 0x1034) -#define WRAPPER_A9SS_SW_RESET (WRAPPER_BASE + 0x3000) +#define WRAPPER_SW_RESET 0x3000 +#define WRAPPER_CPA_START_ADDR 0x1020 +#define WRAPPER_CPA_END_ADDR 0x1024 +#define WRAPPER_FW_START_ADDR 0x1028 +#define WRAPPER_FW_END_ADDR 0x102C +#define WRAPPER_NONPIX_START_ADDR 0x1030 +#define WRAPPER_NONPIX_END_ADDR 0x1034 +#define WRAPPER_A9SS_SW_RESET 0x3000 #define WRAPPER_A9SS_SW_RESET_BIT BIT(4) /* Venus 4xx */ -#define WRAPPER_VCODEC0_MMCC_POWER_STATUS (WRAPPER_BASE + 0x90) -#define WRAPPER_VCODEC0_MMCC_POWER_CONTROL (WRAPPER_BASE + 0x94) +#define WRAPPER_VCODEC0_MMCC_POWER_STATUS 0x90 +#define WRAPPER_VCODEC0_MMCC_POWER_CONTROL 0x94 -#define WRAPPER_VCODEC1_MMCC_POWER_STATUS (WRAPPER_BASE + 0x110) -#define WRAPPER_VCODEC1_MMCC_POWER_CONTROL (WRAPPER_BASE + 0x114) +#define WRAPPER_VCODEC1_MMCC_POWER_STATUS 0x110 +#define WRAPPER_VCODEC1_MMCC_POWER_CONTROL 0x114 #endif diff --git a/drivers/media/platform/qcom/venus/pm_helpers.c b/drivers/media/platform/qcom/venus/pm_helpers.c index f7de02352f1b..6bf9c5c319de 100644 --- a/drivers/media/platform/qcom/venus/pm_helpers.c +++ b/drivers/media/platform/qcom/venus/pm_helpers.c @@ -304,9 +304,9 @@ vcodec_control_v3(struct venus_core *core, u32 session_type, bool enable) void __iomem *ctrl; if (session_type == VIDC_SESSION_TYPE_DEC) - ctrl = core->base + WRAPPER_VDEC_VCODEC_POWER_CONTROL; + ctrl = core->wrapper_base + WRAPPER_VDEC_VCODEC_POWER_CONTROL; else - ctrl = core->base + WRAPPER_VENC_VCODEC_POWER_CONTROL; + ctrl = core->wrapper_base + WRAPPER_VENC_VCODEC_POWER_CONTROL; if (enable) writel(0, ctrl); @@ -381,11 +381,11 @@ static int vcodec_control_v4(struct venus_core *core, u32 coreid, bool enable) int ret; if (coreid == VIDC_CORE_ID_1) { - ctrl = core->base + 
WRAPPER_VCODEC0_MMCC_POWER_CONTROL; - stat = core->base + WRAPPER_VCODEC0_MMCC_POWER_STATUS; + ctrl = core->wrapper_base + WRAPPER_VCODEC0_MMCC_POWER_CONTROL; + stat = core->wrapper_base + WRAPPER_VCODEC0_MMCC_POWER_STATUS; } else { - ctrl = core->base + WRAPPER_VCODEC1_MMCC_POWER_CONTROL; - stat = core->base + WRAPPER_VCODEC1_MMCC_POWER_STATUS; + ctrl = core->wrapper_base + WRAPPER_VCODEC1_MMCC_POWER_CONTROL; + stat = core->wrapper_base + WRAPPER_VCODEC1_MMCC_POWER_STATUS; } if (enable) { From 3ed9d3dc244b0d819addf40f02f94bffde06585e Mon Sep 17 00:00:00 2001 From: Dikshita Agarwal Date: Fri, 2 Apr 2021 12:06:30 +0200 Subject: [PATCH 072/228] media: venus: hfi: Define additional 6xx registers [ Upstream commit 7f6631295f46070ee5cdbe939136ce48cc617272 ] - Add X2 RPMh registers and definitions from the downstream example. - Add 6xx core power definitions - Add 6xx AON definitions - Add 6xx wrapper tz definitions - Add 6xx wrapper interrupt definitions - Add 6xx soft interrupt definitions - Define wrapper LPI register offsets Signed-off-by: Dikshita Agarwal Co-developed-by: Bryan O'Donoghue Signed-off-by: Bryan O'Donoghue Signed-off-by: Stanimir Varbanov Signed-off-by: Mauro Carvalho Chehab Stable-dep-of: d74e48160980 ("media: venus: hfi_venus: Write to VIDC_CTRL_INIT after unmasking interrupts") Signed-off-by: Sasha Levin --- .../media/platform/qcom/venus/hfi_venus_io.h | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/drivers/media/platform/qcom/venus/hfi_venus_io.h b/drivers/media/platform/qcom/venus/hfi_venus_io.h index 4c392b67252c..9cad15eac9e8 100644 --- a/drivers/media/platform/qcom/venus/hfi_venus_io.h +++ b/drivers/media/platform/qcom/venus/hfi_venus_io.h @@ -53,10 +53,22 @@ #define UC_REGION_ADDR 0x64 #define UC_REGION_SIZE 0x68 +#define CPU_CS_H2XSOFTINTEN_V6 0x148 + +#define CPU_CS_X2RPMH_V6 0x168 +#define CPU_CS_X2RPMH_MASK0_BMSK_V6 0x1 +#define CPU_CS_X2RPMH_MASK0_SHFT_V6 0x0 +#define CPU_CS_X2RPMH_MASK1_BMSK_V6 0x2 +#define CPU_CS_X2RPMH_MASK1_SHFT_V6 0x1 +#define CPU_CS_X2RPMH_SWOVERRIDE_BMSK_V6 0x4 +#define CPU_CS_X2RPMH_SWOVERRIDE_SHFT_V6 0x3 + /* Relative to CPU_IC_BASE */ #define CPU_IC_SOFTINT 0x18 +#define CPU_IC_SOFTINT_V6 0x150 #define CPU_IC_SOFTINT_H2A_MASK 0x8000 #define CPU_IC_SOFTINT_H2A_SHIFT 0xf +#define CPU_IC_SOFTINT_H2A_SHIFT_V6 0x0 /* Venus wrapper */ #define WRAPPER_BASE 0x000e0000 @@ -84,6 +96,9 @@ #define WRAPPER_INTR_MASK_A2HCPU_MASK 0x4 #define WRAPPER_INTR_MASK_A2HCPU_SHIFT 0x2 +#define WRAPPER_INTR_STATUS_A2HWD_MASK_V6 0x8 +#define WRAPPER_INTR_MASK_A2HWD_BASK_V6 0x8 + #define WRAPPER_INTR_CLEAR 0x14 #define WRAPPER_INTR_CLEAR_A2HWD_MASK 0x10 #define WRAPPER_INTR_CLEAR_A2HWD_SHIFT 0x4 @@ -93,6 +108,8 @@ #define WRAPPER_POWER_STATUS 0x44 #define WRAPPER_VDEC_VCODEC_POWER_CONTROL 0x48 #define WRAPPER_VENC_VCODEC_POWER_CONTROL 0x4c +#define WRAPPER_DEBUG_BRIDGE_LPI_CONTROL_V6 0x54 +#define WRAPPER_DEBUG_BRIDGE_LPI_STATUS_V6 0x58 #define WRAPPER_VDEC_VENC_AHB_BRIDGE_SYNC_RESET 0x64 #define WRAPPER_CPU_CLOCK_CONFIG 0x2000 @@ -121,4 +138,17 @@ #define WRAPPER_VCODEC1_MMCC_POWER_STATUS 0x110 #define WRAPPER_VCODEC1_MMCC_POWER_CONTROL 0x114 +/* Venus 6xx */ +#define WRAPPER_CORE_POWER_STATUS_V6 0x80 +#define WRAPPER_CORE_POWER_CONTROL_V6 0x84 + +/* Wrapper TZ 6xx */ +#define WRAPPER_TZ_BASE_V6 0x000c0000 +#define WRAPPER_TZ_CPU_STATUS_V6 0x10 + +/* Venus AON */ +#define AON_BASE_V6 0x000e0000 +#define AON_WRAPPER_MVP_NOC_LPI_CONTROL 0x00 +#define AON_WRAPPER_MVP_NOC_LPI_STATUS 0x04 + #endif From 4596fece3c2448c64e6b5b156f67eb752a4cf6a3 
Mon Sep 17 00:00:00 2001 From: Bryan O'Donoghue Date: Fri, 2 Apr 2021 12:06:31 +0200 Subject: [PATCH 073/228] media: venus: core: Add differentiator IS_V6(core) [ Upstream commit ff027906308fcda1661e05beac6abdcbe2b93f6d ] This commit adds the macro helper IS_V6() which will be used to differentiate iris2/v6 silicon from previous versions. Signed-off-by: Bryan O'Donoghue Signed-off-by: Stanimir Varbanov Signed-off-by: Mauro Carvalho Chehab Stable-dep-of: d74e48160980 ("media: venus: hfi_venus: Write to VIDC_CTRL_INIT after unmasking interrupts") Signed-off-by: Sasha Levin --- drivers/media/platform/qcom/venus/core.h | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/media/platform/qcom/venus/core.h b/drivers/media/platform/qcom/venus/core.h index 50eb0a9fb134..75d006803327 100644 --- a/drivers/media/platform/qcom/venus/core.h +++ b/drivers/media/platform/qcom/venus/core.h @@ -426,6 +426,7 @@ struct venus_inst { #define IS_V1(core) ((core)->res->hfi_version == HFI_VERSION_1XX) #define IS_V3(core) ((core)->res->hfi_version == HFI_VERSION_3XX) #define IS_V4(core) ((core)->res->hfi_version == HFI_VERSION_4XX) +#define IS_V6(core) ((core)->res->hfi_version == HFI_VERSION_6XX) #define ctrl_to_inst(ctrl) \ container_of((ctrl)->handler, struct venus_inst, ctrl_handler) From c4cc1f690f191c97d1305b4666851db6816c3fee Mon Sep 17 00:00:00 2001 From: Dikshita Agarwal Date: Fri, 2 Apr 2021 12:06:35 +0200 Subject: [PATCH 074/228] media: venus: hfi: Add a 6xx boot logic [ Upstream commit 255385ca433ce5ff621732f26a759211a27c8f85 ] This patch adds a 6xx specific boot logic. The goal is to share as much code as possible between 3xx, 4xx and 6xx silicon. We need to do a different write to WRAPPER_INTR_MASK with an additional write to CPU_CS_H2XSOFTINTEN_V6 and CPU_CS_X2RPMh_V6. The other writes are the same for 6xx and non-6xx silicon albeit at different absolute relative locations to the base of the venus address space. 
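As an aside for readers of this series (a stand-alone user-space sketch, not driver code): the snippet below mimics the masking difference described above. On 6xx the current WRAPPER_INTR_MASK value is read back and only the A2HWD/A2HCPU bits are cleared, while older parts simply write the A2HVCODEC mask. The register bit values are the ones defined in hfi_venus_io.h; fake_readl(), boot_intr_mask() and the all-ones reset value are made up for the example.

/* user-space sketch only, mirrors the 6xx vs. pre-6xx mask handling */
#include <stdio.h>
#include <stdbool.h>

#define WRAPPER_INTR_MASK_A2HVCODEC_MASK	0x8
#define WRAPPER_INTR_MASK_A2HWD_BASK_V6		0x8
#define WRAPPER_INTR_MASK_A2HCPU_MASK		0x4

/* Assumption: pretend the register currently reads back as all ones. */
static unsigned int fake_readl(void)
{
	return 0xffffffff;
}

static unsigned int boot_intr_mask(bool is_v6)
{
	unsigned int mask_val;

	if (is_v6) {
		/* 6xx: preserve unrelated bits, unmask only the WD and CPU irqs */
		mask_val = fake_readl();
		mask_val &= ~(WRAPPER_INTR_MASK_A2HWD_BASK_V6 |
			      WRAPPER_INTR_MASK_A2HCPU_MASK);
	} else {
		/* pre-6xx: a plain write of the VCODEC mask is enough */
		mask_val = WRAPPER_INTR_MASK_A2HVCODEC_MASK;
	}
	return mask_val;
}

int main(void)
{
	printf("v6 mask:     0x%08x\n", boot_intr_mask(true));
	printf("pre-v6 mask: 0x%08x\n", boot_intr_mask(false));
	return 0;
}

Either way the resulting value ends up in WRAPPER_INTR_MASK; only the 6xx path is a read-modify-write.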
Signed-off-by: Dikshita Agarwal Signed-off-by: Bryan O'Donoghue Signed-off-by: Stanimir Varbanov Signed-off-by: Mauro Carvalho Chehab Stable-dep-of: d74e48160980 ("media: venus: hfi_venus: Write to VIDC_CTRL_INIT after unmasking interrupts") Signed-off-by: Sasha Levin --- drivers/media/platform/qcom/venus/hfi_venus.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/media/platform/qcom/venus/hfi_venus.c b/drivers/media/platform/qcom/venus/hfi_venus.c index 3d705fc5e109..97d36cafd8cb 100644 --- a/drivers/media/platform/qcom/venus/hfi_venus.c +++ b/drivers/media/platform/qcom/venus/hfi_venus.c @@ -431,14 +431,21 @@ static int venus_boot_core(struct venus_hfi_device *hdev) { struct device *dev = hdev->core->dev; static const unsigned int max_tries = 100; - u32 ctrl_status = 0; + u32 ctrl_status = 0, mask_val; unsigned int count = 0; void __iomem *cpu_cs_base = hdev->core->cpu_cs_base; void __iomem *wrapper_base = hdev->core->wrapper_base; int ret = 0; writel(BIT(VIDC_CTRL_INIT_CTRL_SHIFT), cpu_cs_base + VIDC_CTRL_INIT); - writel(WRAPPER_INTR_MASK_A2HVCODEC_MASK, wrapper_base + WRAPPER_INTR_MASK); + if (IS_V6(hdev->core)) { + mask_val = readl(wrapper_base + WRAPPER_INTR_MASK); + mask_val &= ~(WRAPPER_INTR_MASK_A2HWD_BASK_V6 | + WRAPPER_INTR_MASK_A2HCPU_MASK); + } else { + mask_val = WRAPPER_INTR_MASK_A2HVCODEC_MASK; + } + writel(mask_val, wrapper_base + WRAPPER_INTR_MASK); writel(1, cpu_cs_base + CPU_CS_SCIACMDARG3); while (!ctrl_status && count < max_tries) { @@ -456,6 +463,9 @@ static int venus_boot_core(struct venus_hfi_device *hdev) if (count >= max_tries) ret = -ETIMEDOUT; + if (IS_V6(hdev->core)) + writel(0x0, cpu_cs_base + CPU_CS_X2RPMH_V6); + return ret; } From 2d9ea86f3c4a50c62597096c293d99f378af968a Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Tue, 30 May 2023 14:30:36 +0200 Subject: [PATCH 075/228] media: venus: hfi_venus: Write to VIDC_CTRL_INIT after unmasking interrupts [ Upstream commit d74e481609808330b4625b3691cf01e1f56e255e ] The startup procedure shouldn't be started with interrupts masked, as that may entail silent failures. Kick off initialization only after the interrupts are unmasked. 
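A deliberately simplified illustration, not part of the patch: the toy model below shows why the order matters. It is not a hardware model; toy_core, kick_firmware() and the "response is dropped while masked" behaviour are assumptions made only to visualise the silent-failure risk described above (real hardware latches status bits, but the host still should not start the firmware before it can hear back from it).

#include <stdio.h>
#include <stdbool.h>

struct toy_core {
	bool unmasked;		/* WRAPPER_INTR_MASK has been written */
	bool event_seen;	/* host noticed the firmware response */
};

static void kick_firmware(struct toy_core *c)
{
	/* firmware answers immediately in this model */
	if (c->unmasked)
		c->event_seen = true;
	/* else: the response goes unnoticed */
}

static void boot(bool fixed_order)
{
	struct toy_core c = { 0 };

	if (!fixed_order)
		kick_firmware(&c);	/* old order: VIDC_CTRL_INIT while masked */

	c.unmasked = true;		/* unmask host interrupts */

	if (fixed_order)
		kick_firmware(&c);	/* new order: VIDC_CTRL_INIT afterwards */

	printf("%s order: firmware event %s\n",
	       fixed_order ? "fixed" : "old",
	       c.event_seen ? "seen" : "lost");
}

int main(void)
{
	boot(false);
	boot(true);
	return 0;
}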
Cc: stable@vger.kernel.org # v4.12+ Fixes: d96d3f30c0f2 ("[media] media: venus: hfi: add Venus HFI files") Signed-off-by: Konrad Dybcio Signed-off-by: Stanimir Varbanov Signed-off-by: Hans Verkuil Signed-off-by: Sasha Levin --- drivers/media/platform/qcom/venus/hfi_venus.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/platform/qcom/venus/hfi_venus.c b/drivers/media/platform/qcom/venus/hfi_venus.c index 97d36cafd8cb..9d939f63d16f 100644 --- a/drivers/media/platform/qcom/venus/hfi_venus.c +++ b/drivers/media/platform/qcom/venus/hfi_venus.c @@ -437,7 +437,6 @@ static int venus_boot_core(struct venus_hfi_device *hdev) void __iomem *wrapper_base = hdev->core->wrapper_base; int ret = 0; - writel(BIT(VIDC_CTRL_INIT_CTRL_SHIFT), cpu_cs_base + VIDC_CTRL_INIT); if (IS_V6(hdev->core)) { mask_val = readl(wrapper_base + WRAPPER_INTR_MASK); mask_val &= ~(WRAPPER_INTR_MASK_A2HWD_BASK_V6 | @@ -448,6 +447,7 @@ static int venus_boot_core(struct venus_hfi_device *hdev) writel(mask_val, wrapper_base + WRAPPER_INTR_MASK); writel(1, cpu_cs_base + CPU_CS_SCIACMDARG3); + writel(BIT(VIDC_CTRL_INIT_CTRL_SHIFT), cpu_cs_base + VIDC_CTRL_INIT); while (!ctrl_status && count < max_tries) { ctrl_status = readl(cpu_cs_base + CPU_CS_SCIACMDARG0); if ((ctrl_status & CPU_CS_SCIACMDARG0_ERROR_STATUS_MASK) == 4) { From 858ca1921639e122c152abac7ac93dfa83bfe239 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Sat, 21 Nov 2020 12:11:51 +0100 Subject: [PATCH 076/228] netfilter: use actual socket sk for REJECT action [ Upstream commit 04295878beac396dae47ba93141cae0d9386e7ef ] True to the message of commit v5.10-rc1-105-g46d6c5ae953c, _do_ actually make use of state->sk when possible, such as in the REJECT modules. Reported-by: Minqiang Chen Cc: Jason A. 
Donenfeld Signed-off-by: Jan Engelhardt Signed-off-by: Pablo Neira Ayuso Stable-dep-of: 28427f368f0e ("netfilter: nft_exthdr: Fix non-linear header modification") Signed-off-by: Sasha Levin --- include/net/netfilter/ipv4/nf_reject.h | 4 ++-- include/net/netfilter/ipv6/nf_reject.h | 5 ++--- net/ipv4/netfilter/ipt_REJECT.c | 3 ++- net/ipv4/netfilter/nf_reject_ipv4.c | 6 +++--- net/ipv4/netfilter/nft_reject_ipv4.c | 3 ++- net/ipv6/netfilter/ip6t_REJECT.c | 2 +- net/ipv6/netfilter/nf_reject_ipv6.c | 5 +++-- net/ipv6/netfilter/nft_reject_ipv6.c | 3 ++- net/netfilter/nft_reject_inet.c | 6 ++++-- 9 files changed, 21 insertions(+), 16 deletions(-) diff --git a/include/net/netfilter/ipv4/nf_reject.h b/include/net/netfilter/ipv4/nf_reject.h index 40e0e0623f46..d8207a82d761 100644 --- a/include/net/netfilter/ipv4/nf_reject.h +++ b/include/net/netfilter/ipv4/nf_reject.h @@ -8,8 +8,8 @@ #include void nf_send_unreach(struct sk_buff *skb_in, int code, int hook); -void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook); - +void nf_send_reset(struct net *net, struct sock *, struct sk_buff *oldskb, + int hook); const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb, struct tcphdr *_oth, int hook); struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb, diff --git a/include/net/netfilter/ipv6/nf_reject.h b/include/net/netfilter/ipv6/nf_reject.h index 4a3ef9ebdf6f..86e87bc2c516 100644 --- a/include/net/netfilter/ipv6/nf_reject.h +++ b/include/net/netfilter/ipv6/nf_reject.h @@ -7,9 +7,8 @@ void nf_send_unreach6(struct net *net, struct sk_buff *skb_in, unsigned char code, unsigned int hooknum); - -void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook); - +void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb, + int hook); const struct tcphdr *nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb, struct tcphdr *otcph, unsigned int *otcplen, int hook); diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index e16b98ee6266..4b8840734762 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -56,7 +56,8 @@ reject_tg(struct sk_buff *skb, const struct xt_action_param *par) nf_send_unreach(skb, ICMP_PKT_FILTERED, hook); break; case IPT_TCP_RESET: - nf_send_reset(xt_net(par), skb, hook); + nf_send_reset(xt_net(par), par->state->sk, skb, hook); + break; case IPT_ICMP_ECHOREPLY: /* Doesn't happen. 
*/ break; diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index 93b07739807b..efe14a6a5d9b 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -112,7 +112,8 @@ static int nf_reject_fill_skb_dst(struct sk_buff *skb_in) } /* Send RST reply */ -void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook) +void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb, + int hook) { struct net_device *br_indev __maybe_unused; struct sk_buff *nskb; @@ -144,8 +145,7 @@ void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook) niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP, ip4_dst_hoplimit(skb_dst(nskb))); nf_reject_ip_tcphdr_put(nskb, oldskb, oth); - - if (ip_route_me_harder(net, nskb->sk, nskb, RTN_UNSPEC)) + if (ip_route_me_harder(net, sk, nskb, RTN_UNSPEC)) goto free_nskb; niph = ip_hdr(nskb); diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c index e408f813f5d8..ff437e4ed6db 100644 --- a/net/ipv4/netfilter/nft_reject_ipv4.c +++ b/net/ipv4/netfilter/nft_reject_ipv4.c @@ -27,7 +27,8 @@ static void nft_reject_ipv4_eval(const struct nft_expr *expr, nf_send_unreach(pkt->skb, priv->icmp_code, nft_hook(pkt)); break; case NFT_REJECT_TCP_RST: - nf_send_reset(nft_net(pkt), pkt->skb, nft_hook(pkt)); + nf_send_reset(nft_net(pkt), pkt->xt.state->sk, pkt->skb, + nft_hook(pkt)); break; default: break; diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c index 3ac5485049f0..a35019d2e480 100644 --- a/net/ipv6/netfilter/ip6t_REJECT.c +++ b/net/ipv6/netfilter/ip6t_REJECT.c @@ -61,7 +61,7 @@ reject_tg6(struct sk_buff *skb, const struct xt_action_param *par) /* Do nothing */ break; case IP6T_TCP_RESET: - nf_send_reset6(net, skb, xt_hooknum(par)); + nf_send_reset6(net, par->state->sk, skb, xt_hooknum(par)); break; case IP6T_ICMP6_POLICY_FAIL: nf_send_unreach6(net, skb, ICMPV6_POLICY_FAIL, xt_hooknum(par)); diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index bf95513736c9..832d9f9cd10a 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -141,7 +141,8 @@ static int nf_reject6_fill_skb_dst(struct sk_buff *skb_in) return 0; } -void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook) +void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb, + int hook) { struct net_device *br_indev __maybe_unused; struct sk_buff *nskb; @@ -233,7 +234,7 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook) dev_queue_xmit(nskb); } else #endif - ip6_local_out(net, nskb->sk, nskb); + ip6_local_out(net, sk, nskb); } EXPORT_SYMBOL_GPL(nf_send_reset6); diff --git a/net/ipv6/netfilter/nft_reject_ipv6.c b/net/ipv6/netfilter/nft_reject_ipv6.c index c1098a1968e1..7969d1f3018d 100644 --- a/net/ipv6/netfilter/nft_reject_ipv6.c +++ b/net/ipv6/netfilter/nft_reject_ipv6.c @@ -28,7 +28,8 @@ static void nft_reject_ipv6_eval(const struct nft_expr *expr, nft_hook(pkt)); break; case NFT_REJECT_TCP_RST: - nf_send_reset6(nft_net(pkt), pkt->skb, nft_hook(pkt)); + nf_send_reset6(nft_net(pkt), pkt->xt.state->sk, pkt->skb, + nft_hook(pkt)); break; default: break; diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c index cf8f2646e93c..36b219e2e896 100644 --- a/net/netfilter/nft_reject_inet.c +++ b/net/netfilter/nft_reject_inet.c @@ -28,7 +28,8 @@ static void nft_reject_inet_eval(const struct nft_expr *expr, 
nft_hook(pkt)); break; case NFT_REJECT_TCP_RST: - nf_send_reset(nft_net(pkt), pkt->skb, nft_hook(pkt)); + nf_send_reset(nft_net(pkt), pkt->xt.state->sk, + pkt->skb, nft_hook(pkt)); break; case NFT_REJECT_ICMPX_UNREACH: nf_send_unreach(pkt->skb, @@ -44,7 +45,8 @@ static void nft_reject_inet_eval(const struct nft_expr *expr, priv->icmp_code, nft_hook(pkt)); break; case NFT_REJECT_TCP_RST: - nf_send_reset6(nft_net(pkt), pkt->skb, nft_hook(pkt)); + nf_send_reset6(nft_net(pkt), pkt->xt.state->sk, + pkt->skb, nft_hook(pkt)); break; case NFT_REJECT_ICMPX_UNREACH: nf_send_unreach6(nft_net(pkt), pkt->skb, From af844ba799b56b5d866536cca21e35be8e2febaf Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Tue, 4 May 2021 17:54:06 +0200 Subject: [PATCH 077/228] netfilter: nft_exthdr: Support SCTP chunks [ Upstream commit 133dc203d77dff617d9c4673973ef3859be2c476 ] Chunks are SCTP header extensions similar in implementation to IPv6 extension headers or TCP options. Reusing exthdr expression to find and extract field values from them is therefore pretty straightforward. For now, this supports extracting data from chunks at a fixed offset (and length) only - chunks themselves are an extensible data structure; in order to make all fields available, a nested extension search is needed. Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso Stable-dep-of: 28427f368f0e ("netfilter: nft_exthdr: Fix non-linear header modification") Signed-off-by: Sasha Levin --- include/uapi/linux/netfilter/nf_tables.h | 2 + net/netfilter/nft_exthdr.c | 51 ++++++++++++++++++++++++ 2 files changed, 53 insertions(+) diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 98272cb5f617..1d8dd58f83a5 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -797,11 +797,13 @@ enum nft_exthdr_flags { * @NFT_EXTHDR_OP_IPV6: match against ipv6 extension headers * @NFT_EXTHDR_OP_TCP: match against tcp options * @NFT_EXTHDR_OP_IPV4: match against ipv4 options + * @NFT_EXTHDR_OP_SCTP: match against sctp chunks */ enum nft_exthdr_op { NFT_EXTHDR_OP_IPV6, NFT_EXTHDR_OP_TCPOPT, NFT_EXTHDR_OP_IPV4, + NFT_EXTHDR_OP_SCTP, __NFT_EXTHDR_OP_MAX }; #define NFT_EXTHDR_OP_MAX (__NFT_EXTHDR_OP_MAX - 1) diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index 670dd146fb2b..2f852ea67e5d 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -10,8 +10,10 @@ #include #include #include +#include #include #include +#include #include struct nft_exthdr { @@ -303,6 +305,43 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr, } } +static void nft_exthdr_sctp_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + unsigned int offset = pkt->xt.thoff + sizeof(struct sctphdr); + struct nft_exthdr *priv = nft_expr_priv(expr); + u32 *dest = ®s->data[priv->dreg]; + const struct sctp_chunkhdr *sch; + struct sctp_chunkhdr _sch; + + do { + sch = skb_header_pointer(pkt->skb, offset, sizeof(_sch), &_sch); + if (!sch || !sch->length) + break; + + if (sch->type == priv->type) { + if (priv->flags & NFT_EXTHDR_F_PRESENT) { + nft_reg_store8(dest, true); + return; + } + if (priv->offset + priv->len > ntohs(sch->length) || + offset + ntohs(sch->length) > pkt->skb->len) + break; + + dest[priv->len / NFT_REG32_SIZE] = 0; + memcpy(dest, (char *)sch + priv->offset, priv->len); + return; + } + offset += SCTP_PAD4(ntohs(sch->length)); + } while (offset < pkt->skb->len); + + if (priv->flags & 
NFT_EXTHDR_F_PRESENT) + nft_reg_store8(dest, false); + else + regs->verdict.code = NFT_BREAK; +} + static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = { [NFTA_EXTHDR_DREG] = { .type = NLA_U32 }, [NFTA_EXTHDR_TYPE] = { .type = NLA_U8 }, @@ -502,6 +541,14 @@ static const struct nft_expr_ops nft_exthdr_tcp_set_ops = { .dump = nft_exthdr_dump_set, }; +static const struct nft_expr_ops nft_exthdr_sctp_ops = { + .type = &nft_exthdr_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), + .eval = nft_exthdr_sctp_eval, + .init = nft_exthdr_init, + .dump = nft_exthdr_dump, +}; + static const struct nft_expr_ops * nft_exthdr_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) @@ -532,6 +579,10 @@ nft_exthdr_select_ops(const struct nft_ctx *ctx, return &nft_exthdr_ipv4_ops; } break; + case NFT_EXTHDR_OP_SCTP: + if (tb[NFTA_EXTHDR_DREG]) + return &nft_exthdr_sctp_ops; + break; } return ERR_PTR(-EOPNOTSUPP); From 39546418b84d4abdbba9cc0db2458e577ff9758c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 28 May 2021 12:30:05 +0200 Subject: [PATCH 078/228] netfilter: nf_tables: add and use nft_sk helper [ Upstream commit 85554eb981e5a8b0b8947611193aef1737081ef2 ] This allows to change storage placement later on without changing readers. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Stable-dep-of: 28427f368f0e ("netfilter: nft_exthdr: Fix non-linear header modification") Signed-off-by: Sasha Levin --- include/net/netfilter/nf_tables.h | 5 +++++ net/ipv4/netfilter/nft_reject_ipv4.c | 2 +- net/ipv6/netfilter/nft_reject_ipv6.c | 2 +- net/netfilter/nft_reject_inet.c | 4 ++-- 4 files changed, 9 insertions(+), 4 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 5619642b9ad4..013f11c9de85 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -28,6 +28,11 @@ struct nft_pktinfo { struct xt_action_param xt; }; +static inline struct sock *nft_sk(const struct nft_pktinfo *pkt) +{ + return pkt->xt.state->sk; +} + static inline struct net *nft_net(const struct nft_pktinfo *pkt) { return pkt->xt.state->net; diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c index ff437e4ed6db..55fc23a8f7a7 100644 --- a/net/ipv4/netfilter/nft_reject_ipv4.c +++ b/net/ipv4/netfilter/nft_reject_ipv4.c @@ -27,7 +27,7 @@ static void nft_reject_ipv4_eval(const struct nft_expr *expr, nf_send_unreach(pkt->skb, priv->icmp_code, nft_hook(pkt)); break; case NFT_REJECT_TCP_RST: - nf_send_reset(nft_net(pkt), pkt->xt.state->sk, pkt->skb, + nf_send_reset(nft_net(pkt), nft_sk(pkt), pkt->skb, nft_hook(pkt)); break; default: diff --git a/net/ipv6/netfilter/nft_reject_ipv6.c b/net/ipv6/netfilter/nft_reject_ipv6.c index 7969d1f3018d..ed69c768797e 100644 --- a/net/ipv6/netfilter/nft_reject_ipv6.c +++ b/net/ipv6/netfilter/nft_reject_ipv6.c @@ -28,7 +28,7 @@ static void nft_reject_ipv6_eval(const struct nft_expr *expr, nft_hook(pkt)); break; case NFT_REJECT_TCP_RST: - nf_send_reset6(nft_net(pkt), pkt->xt.state->sk, pkt->skb, + nf_send_reset6(nft_net(pkt), nft_sk(pkt), pkt->skb, nft_hook(pkt)); break; default: diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c index 36b219e2e896..c00b94a16682 100644 --- a/net/netfilter/nft_reject_inet.c +++ b/net/netfilter/nft_reject_inet.c @@ -28,7 +28,7 @@ static void nft_reject_inet_eval(const struct nft_expr *expr, nft_hook(pkt)); break; case NFT_REJECT_TCP_RST: - nf_send_reset(nft_net(pkt), 
pkt->xt.state->sk, + nf_send_reset(nft_net(pkt), nft_sk(pkt), pkt->skb, nft_hook(pkt)); break; case NFT_REJECT_ICMPX_UNREACH: @@ -45,7 +45,7 @@ static void nft_reject_inet_eval(const struct nft_expr *expr, priv->icmp_code, nft_hook(pkt)); break; case NFT_REJECT_TCP_RST: - nf_send_reset6(nft_net(pkt), pkt->xt.state->sk, + nf_send_reset6(nft_net(pkt), nft_sk(pkt), pkt->skb, nft_hook(pkt)); break; case NFT_REJECT_ICMPX_UNREACH: From 45b3eb6afcffe8dda873ff1bf0cd179755143129 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 28 May 2021 12:30:06 +0200 Subject: [PATCH 079/228] netfilter: nf_tables: add and use nft_thoff helper [ Upstream commit 2d7b4ace0754ebaaf71c6824880178d46aa0ab33 ] This allows to change storage placement later on without changing readers. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Stable-dep-of: 28427f368f0e ("netfilter: nft_exthdr: Fix non-linear header modification") Signed-off-by: Sasha Levin --- include/net/netfilter/nf_tables.h | 5 +++++ net/netfilter/nf_tables_core.c | 2 +- net/netfilter/nf_tables_trace.c | 6 +++--- net/netfilter/nft_exthdr.c | 8 ++++---- net/netfilter/nft_flow_offload.c | 2 +- net/netfilter/nft_payload.c | 10 +++++----- net/netfilter/nft_synproxy.c | 4 ++-- net/netfilter/nft_tproxy.c | 4 ++-- 8 files changed, 23 insertions(+), 18 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 013f11c9de85..152cd46915d6 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -33,6 +33,11 @@ static inline struct sock *nft_sk(const struct nft_pktinfo *pkt) return pkt->xt.state->sk; } +static inline unsigned int nft_thoff(const struct nft_pktinfo *pkt) +{ + return pkt->xt.thoff; +} + static inline struct net *nft_net(const struct nft_pktinfo *pkt) { return pkt->xt.state->net; diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 9dc18429ed87..b0d711d498c6 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -125,7 +125,7 @@ static bool nft_payload_fast_eval(const struct nft_expr *expr, else { if (!pkt->tprot_set) return false; - ptr = skb_network_header(skb) + pkt->xt.thoff; + ptr = skb_network_header(skb) + nft_thoff(pkt); } ptr += priv->offset; diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c index 0cf3278007ba..e4fe2f0780eb 100644 --- a/net/netfilter/nf_tables_trace.c +++ b/net/netfilter/nf_tables_trace.c @@ -113,17 +113,17 @@ static int nf_trace_fill_pkt_info(struct sk_buff *nlskb, int off = skb_network_offset(skb); unsigned int len, nh_end; - nh_end = pkt->tprot_set ? pkt->xt.thoff : skb->len; + nh_end = pkt->tprot_set ? 
nft_thoff(pkt) : skb->len; len = min_t(unsigned int, nh_end - skb_network_offset(skb), NFT_TRACETYPE_NETWORK_HSIZE); if (trace_fill_header(nlskb, NFTA_TRACE_NETWORK_HEADER, skb, off, len)) return -1; if (pkt->tprot_set) { - len = min_t(unsigned int, skb->len - pkt->xt.thoff, + len = min_t(unsigned int, skb->len - nft_thoff(pkt), NFT_TRACETYPE_TRANSPORT_HSIZE); if (trace_fill_header(nlskb, NFTA_TRACE_TRANSPORT_HEADER, skb, - pkt->xt.thoff, len)) + nft_thoff(pkt), len)) return -1; } diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index 2f852ea67e5d..73f82483f242 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -170,7 +170,7 @@ nft_tcp_header_pointer(const struct nft_pktinfo *pkt, if (!pkt->tprot_set || pkt->tprot != IPPROTO_TCP) return NULL; - tcph = skb_header_pointer(pkt->skb, pkt->xt.thoff, sizeof(*tcph), buffer); + tcph = skb_header_pointer(pkt->skb, nft_thoff(pkt), sizeof(*tcph), buffer); if (!tcph) return NULL; @@ -178,7 +178,7 @@ nft_tcp_header_pointer(const struct nft_pktinfo *pkt, if (*tcphdr_len < sizeof(*tcph) || *tcphdr_len > len) return NULL; - return skb_header_pointer(pkt->skb, pkt->xt.thoff, *tcphdr_len, buffer); + return skb_header_pointer(pkt->skb, nft_thoff(pkt), *tcphdr_len, buffer); } static void nft_exthdr_tcp_eval(const struct nft_expr *expr, @@ -254,7 +254,7 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr, return; if (skb_ensure_writable(pkt->skb, - pkt->xt.thoff + i + priv->len)) + nft_thoff(pkt) + i + priv->len)) return; tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff, @@ -309,7 +309,7 @@ static void nft_exthdr_sctp_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { - unsigned int offset = pkt->xt.thoff + sizeof(struct sctphdr); + unsigned int offset = nft_thoff(pkt) + sizeof(struct sctphdr); struct nft_exthdr *priv = nft_expr_priv(expr); u32 *dest = ®s->data[priv->dreg]; const struct sctp_chunkhdr *sch; diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index d868eade6017..a44340dd3ce6 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -90,7 +90,7 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum) { case IPPROTO_TCP: - tcph = skb_header_pointer(pkt->skb, pkt->xt.thoff, + tcph = skb_header_pointer(pkt->skb, nft_thoff(pkt), sizeof(_tcph), &_tcph); if (unlikely(!tcph || tcph->fin || tcph->rst)) goto out; diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 74c220eeec1a..b2b63c3653d4 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -110,7 +110,7 @@ void nft_payload_eval(const struct nft_expr *expr, case NFT_PAYLOAD_TRANSPORT_HEADER: if (!pkt->tprot_set) goto err; - offset = pkt->xt.thoff; + offset = nft_thoff(pkt); break; default: BUG(); @@ -510,7 +510,7 @@ static int nft_payload_l4csum_offset(const struct nft_pktinfo *pkt, *l4csum_offset = offsetof(struct tcphdr, check); break; case IPPROTO_UDP: - if (!nft_payload_udp_checksum(skb, pkt->xt.thoff)) + if (!nft_payload_udp_checksum(skb, nft_thoff(pkt))) return -1; fallthrough; case IPPROTO_UDPLITE: @@ -523,7 +523,7 @@ static int nft_payload_l4csum_offset(const struct nft_pktinfo *pkt, return -1; } - *l4csum_offset += pkt->xt.thoff; + *l4csum_offset += nft_thoff(pkt); return 0; } @@ -615,7 +615,7 @@ static void nft_payload_set_eval(const struct nft_expr *expr, case NFT_PAYLOAD_TRANSPORT_HEADER: if (!pkt->tprot_set) 
goto err; - offset = pkt->xt.thoff; + offset = nft_thoff(pkt); break; default: BUG(); @@ -646,7 +646,7 @@ static void nft_payload_set_eval(const struct nft_expr *expr, if (priv->csum_type == NFT_PAYLOAD_CSUM_SCTP && pkt->tprot == IPPROTO_SCTP && skb->ip_summed != CHECKSUM_PARTIAL) { - if (nft_payload_csum_sctp(skb, pkt->xt.thoff)) + if (nft_payload_csum_sctp(skb, nft_thoff(pkt))) goto err; } diff --git a/net/netfilter/nft_synproxy.c b/net/netfilter/nft_synproxy.c index 59c4dfaf2ea1..1133e06f3c40 100644 --- a/net/netfilter/nft_synproxy.c +++ b/net/netfilter/nft_synproxy.c @@ -109,7 +109,7 @@ static void nft_synproxy_do_eval(const struct nft_synproxy *priv, { struct synproxy_options opts = {}; struct sk_buff *skb = pkt->skb; - int thoff = pkt->xt.thoff; + int thoff = nft_thoff(pkt); const struct tcphdr *tcp; struct tcphdr _tcph; @@ -123,7 +123,7 @@ static void nft_synproxy_do_eval(const struct nft_synproxy *priv, return; } - tcp = skb_header_pointer(skb, pkt->xt.thoff, + tcp = skb_header_pointer(skb, thoff, sizeof(struct tcphdr), &_tcph); if (!tcp) { diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c index c49d318f8e6e..f8d277e05ef4 100644 --- a/net/netfilter/nft_tproxy.c +++ b/net/netfilter/nft_tproxy.c @@ -88,9 +88,9 @@ static void nft_tproxy_eval_v6(const struct nft_expr *expr, const struct nft_tproxy *priv = nft_expr_priv(expr); struct sk_buff *skb = pkt->skb; const struct ipv6hdr *iph = ipv6_hdr(skb); - struct in6_addr taddr; - int thoff = pkt->xt.thoff; + int thoff = nft_thoff(pkt); struct udphdr _hdr, *hp; + struct in6_addr taddr; __be16 tport = 0; struct sock *sk; int l4proto; From 10670abe111568c7f662981df79ebe4de23ce658 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 30 Nov 2021 11:34:04 +0100 Subject: [PATCH 080/228] netfilter: nft_exthdr: break evaluation if setting TCP option fails [ Upstream commit 962e5a40358787105f126ab1dc01604da3d169e9 ] Break rule evaluation on malformed TCP options. 
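For readers following the series, here is a stand-alone sketch of the same error-label shape, not the nft_exthdr code itself: parse_tcp_opts() and the sample buffers are invented for the example, and the -1 return stands in for setting the verdict to NFT_BREAK.

#include <stdio.h>
#include <stddef.h>

#define TCPOPT_EOL	0
#define TCPOPT_NOP	1

/* Returns 0 on success, -1 ("NFT_BREAK") when an option overruns the buffer. */
static int parse_tcp_opts(const unsigned char *opt, size_t len)
{
	size_t i = 0;

	while (i < len) {
		unsigned char kind = opt[i];
		size_t optl;

		if (kind == TCPOPT_EOL)
			return 0;
		if (kind == TCPOPT_NOP) {
			i++;
			continue;
		}
		if (i + 1 >= len)
			goto err;		/* length byte missing */
		optl = opt[i + 1];
		if (optl < 2 || i + optl > len)
			goto err;		/* truncated or malformed option */
		printf("option %u, %zu bytes\n", kind, optl);
		i += optl;
	}
	return 0;
err:
	return -1;				/* caller breaks rule evaluation */
}

int main(void)
{
	const unsigned char good[] = { 2, 4, 0x05, 0xb4, 1, 0 };	/* MSS, NOP, EOL */
	const unsigned char bad[]  = { 2, 10, 0x05 };			/* claims 10, has 3 */

	printf("good: %d\n", parse_tcp_opts(good, sizeof(good)));
	printf("bad:  %d\n", parse_tcp_opts(bad, sizeof(bad)));
	return 0;
}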
Fixes: 99d1712bc41c ("netfilter: exthdr: tcp option set support") Signed-off-by: Pablo Neira Ayuso Stable-dep-of: 28427f368f0e ("netfilter: nft_exthdr: Fix non-linear header modification") Signed-off-by: Sasha Levin --- net/netfilter/nft_exthdr.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index 73f82483f242..10a510fef75c 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -236,7 +236,7 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr, tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff, &tcphdr_len); if (!tcph) - return; + goto err; opt = (u8 *)tcph; for (i = sizeof(*tcph); i < tcphdr_len - 1; i += optl) { @@ -251,16 +251,16 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr, continue; if (i + optl > tcphdr_len || priv->len + priv->offset > optl) - return; + goto err; if (skb_ensure_writable(pkt->skb, nft_thoff(pkt) + i + priv->len)) - return; + goto err; tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff, &tcphdr_len); if (!tcph) - return; + goto err; offset = i + priv->offset; @@ -303,6 +303,9 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr, return; } + return; +err: + regs->verdict.code = NFT_BREAK; } static void nft_exthdr_sctp_eval(const struct nft_expr *expr, From ed60b8014c9a7ea22a3f06d8beb8faad651e59f8 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 28 Jan 2022 13:00:36 +0100 Subject: [PATCH 081/228] netfilter: exthdr: add support for tcp option removal [ Upstream commit 7890cbea66e78a3a6037b2a12827118d7243270b ] This allows to replace a tcp option with nop padding to selectively disable a particular tcp option. Optstrip mode is chosen when userspace passes the exthdr expression with neither a source nor a destination register attribute. This is identical to xtables TCPOPTSTRIP extension. The only difference is that TCPOPTSTRIP allows to pass in a bitmap of options to remove rather than a single number. Unlike TCPOPTSTRIP this expression can be used multiple times in the same rule to get the same effect. We could add a new nested attribute later on in case there is a use case for single-expression-multi-remove. 
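As a stand-alone illustration of the idea (not the kernel implementation): the core of optstrip is "find the option, overwrite it with NOP padding so the header length stays the same". The sketch below shows that on a plain byte buffer; strip_tcp_option() is an invented helper, and the incremental checksum fixup done by the real code is deliberately left out.

#include <stdio.h>
#include <string.h>
#include <stddef.h>

#define TCPOPT_EOL	0
#define TCPOPT_NOP	1

/* Overwrite the first option of the given kind with NOPs; returns 1 if stripped. */
static int strip_tcp_option(unsigned char *opt, size_t len, unsigned char kind)
{
	size_t i = 0;

	while (i < len) {
		if (opt[i] == TCPOPT_EOL)
			break;
		if (opt[i] == TCPOPT_NOP) {
			i++;
			continue;
		}
		if (i + 1 >= len || opt[i + 1] < 2 || i + opt[i + 1] > len)
			break;				/* malformed, give up */
		if (opt[i] == kind) {
			memset(opt + i, TCPOPT_NOP, opt[i + 1]);
			return 1;
		}
		i += opt[i + 1];
	}
	return 0;
}

int main(void)
{
	/* MSS (kind 2, len 4), SACK-permitted (kind 4, len 2), NOP, NOP */
	unsigned char opts[] = { 2, 4, 0x05, 0xb4, 4, 2, 1, 1 };
	size_t i;

	strip_tcp_option(opts, sizeof(opts), 4);	/* drop SACK-permitted */
	for (i = 0; i < sizeof(opts); i++)
		printf("%02x ", opts[i]);
	printf("\n");
	return 0;
}

Because the option is padded rather than removed, neither the data offset nor the segment length changes, which is what makes this safe to do on a packet in flight.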
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Stable-dep-of: 28427f368f0e ("netfilter: nft_exthdr: Fix non-linear header modification") Signed-off-by: Sasha Levin --- net/netfilter/nft_exthdr.c | 96 +++++++++++++++++++++++++++++++++++++- 1 file changed, 95 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index 10a510fef75c..7a00867aa64b 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -308,6 +308,63 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr, regs->verdict.code = NFT_BREAK; } +static void nft_exthdr_tcp_strip_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + u8 buff[sizeof(struct tcphdr) + MAX_TCP_OPTION_SPACE]; + struct nft_exthdr *priv = nft_expr_priv(expr); + unsigned int i, tcphdr_len, optl; + struct tcphdr *tcph; + u8 *opt; + + tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff, &tcphdr_len); + if (!tcph) + goto err; + + if (skb_ensure_writable(pkt->skb, nft_thoff(pkt) + tcphdr_len)) + goto drop; + + opt = (u8 *)nft_tcp_header_pointer(pkt, sizeof(buff), buff, &tcphdr_len); + if (!opt) + goto err; + for (i = sizeof(*tcph); i < tcphdr_len - 1; i += optl) { + unsigned int j; + + optl = optlen(opt, i); + if (priv->type != opt[i]) + continue; + + if (i + optl > tcphdr_len) + goto drop; + + for (j = 0; j < optl; ++j) { + u16 n = TCPOPT_NOP; + u16 o = opt[i+j]; + + if ((i + j) % 2 == 0) { + o <<= 8; + n <<= 8; + } + inet_proto_csum_replace2(&tcph->check, pkt->skb, htons(o), + htons(n), false); + } + memset(opt + i, TCPOPT_NOP, optl); + return; + } + + /* option not found, continue. This allows to do multiple + * option removals per rule. + */ + return; +err: + regs->verdict.code = NFT_BREAK; + return; +drop: + /* can't remove, no choice but to drop */ + regs->verdict.code = NF_DROP; +} + static void nft_exthdr_sctp_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -452,6 +509,28 @@ static int nft_exthdr_tcp_set_init(const struct nft_ctx *ctx, priv->len); } +static int nft_exthdr_tcp_strip_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_exthdr *priv = nft_expr_priv(expr); + + if (tb[NFTA_EXTHDR_SREG] || + tb[NFTA_EXTHDR_DREG] || + tb[NFTA_EXTHDR_FLAGS] || + tb[NFTA_EXTHDR_OFFSET] || + tb[NFTA_EXTHDR_LEN]) + return -EINVAL; + + if (!tb[NFTA_EXTHDR_TYPE]) + return -EINVAL; + + priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]); + priv->op = NFT_EXTHDR_OP_TCPOPT; + + return 0; +} + static int nft_exthdr_ipv4_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) @@ -512,6 +591,13 @@ static int nft_exthdr_dump_set(struct sk_buff *skb, const struct nft_expr *expr) return nft_exthdr_dump_common(skb, priv); } +static int nft_exthdr_dump_strip(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_exthdr *priv = nft_expr_priv(expr); + + return nft_exthdr_dump_common(skb, priv); +} + static const struct nft_expr_ops nft_exthdr_ipv6_ops = { .type = &nft_exthdr_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), @@ -544,6 +630,14 @@ static const struct nft_expr_ops nft_exthdr_tcp_set_ops = { .dump = nft_exthdr_dump_set, }; +static const struct nft_expr_ops nft_exthdr_tcp_strip_ops = { + .type = &nft_exthdr_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), + .eval = nft_exthdr_tcp_strip_eval, + .init = nft_exthdr_tcp_strip_init, + .dump = nft_exthdr_dump_strip, +}; + 
static const struct nft_expr_ops nft_exthdr_sctp_ops = { .type = &nft_exthdr_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), @@ -571,7 +665,7 @@ nft_exthdr_select_ops(const struct nft_ctx *ctx, return &nft_exthdr_tcp_set_ops; if (tb[NFTA_EXTHDR_DREG]) return &nft_exthdr_tcp_ops; - break; + return &nft_exthdr_tcp_strip_ops; case NFT_EXTHDR_OP_IPV6: if (tb[NFTA_EXTHDR_DREG]) return &nft_exthdr_ipv6_ops; From 9f0d346630253327ab961d3fff61e4243d12bce5 Mon Sep 17 00:00:00 2001 From: Xiao Liang Date: Fri, 25 Aug 2023 13:33:27 +0800 Subject: [PATCH 082/228] netfilter: nft_exthdr: Fix non-linear header modification [ Upstream commit 28427f368f0e08d504ed06e74bc7cc79d6d06511 ] Fix skb_ensure_writable() size. Don't use nft_tcp_header_pointer() to make it explicit that pointers point to the packet (not local buffer). Fixes: 99d1712bc41c ("netfilter: exthdr: tcp option set support") Fixes: 7890cbea66e7 ("netfilter: exthdr: add support for tcp option removal") Cc: stable@vger.kernel.org Signed-off-by: Xiao Liang Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- net/netfilter/nft_exthdr.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index 7a00867aa64b..b4682aeabab9 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -238,7 +238,12 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr, if (!tcph) goto err; + if (skb_ensure_writable(pkt->skb, nft_thoff(pkt) + tcphdr_len)) + goto err; + + tcph = (struct tcphdr *)(pkt->skb->data + nft_thoff(pkt)); opt = (u8 *)tcph; + for (i = sizeof(*tcph); i < tcphdr_len - 1; i += optl) { union { __be16 v16; @@ -253,15 +258,6 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr, if (i + optl > tcphdr_len || priv->len + priv->offset > optl) goto err; - if (skb_ensure_writable(pkt->skb, - nft_thoff(pkt) + i + priv->len)) - goto err; - - tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff, - &tcphdr_len); - if (!tcph) - goto err; - offset = i + priv->offset; switch (priv->len) { @@ -325,9 +321,9 @@ static void nft_exthdr_tcp_strip_eval(const struct nft_expr *expr, if (skb_ensure_writable(pkt->skb, nft_thoff(pkt) + tcphdr_len)) goto drop; - opt = (u8 *)nft_tcp_header_pointer(pkt, sizeof(buff), buff, &tcphdr_len); - if (!opt) - goto err; + tcph = (struct tcphdr *)(pkt->skb->data + nft_thoff(pkt)); + opt = (u8 *)tcph; + for (i = sizeof(*tcph); i < tcphdr_len - 1; i += optl) { unsigned int j; From 0156cce71f8e5e2625638d79c27b775c026bb450 Mon Sep 17 00:00:00 2001 From: Paul Menzel Date: Wed, 5 Jan 2022 16:36:16 +0100 Subject: [PATCH 083/228] ata: libata: Rename link flag ATA_LFLAG_NO_DB_DELAY [ Upstream commit b9ba367c513dbc165dd6c01266a59db4be2a3564 ] Rename the link flag ATA_LFLAG_NO_DB_DELAY to ATA_LFLAG_NO_DEBOUNCE_DELAY. The new name is longer, but clearer. 
Signed-off-by: Paul Menzel Signed-off-by: Damien Le Moal Stable-dep-of: 2a2df98ec592 ("ata: ahci: Add Elkhart Lake AHCI controller") Signed-off-by: Sasha Levin --- drivers/ata/ahci_brcm.c | 2 +- drivers/ata/libata-sata.c | 2 +- include/linux/libata.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/ata/ahci_brcm.c b/drivers/ata/ahci_brcm.c index 5b32df5d33ad..2e4252545fd2 100644 --- a/drivers/ata/ahci_brcm.c +++ b/drivers/ata/ahci_brcm.c @@ -332,7 +332,7 @@ static struct ata_port_operations ahci_brcm_platform_ops = { static const struct ata_port_info ahci_brcm_port_info = { .flags = AHCI_FLAG_COMMON | ATA_FLAG_NO_DIPM, - .link_flags = ATA_LFLAG_NO_DB_DELAY, + .link_flags = ATA_LFLAG_NO_DEBOUNCE_DELAY, .pio_mask = ATA_PIO4, .udma_mask = ATA_UDMA6, .port_ops = &ahci_brcm_platform_ops, diff --git a/drivers/ata/libata-sata.c b/drivers/ata/libata-sata.c index 4fd9a107fe7f..45656067c547 100644 --- a/drivers/ata/libata-sata.c +++ b/drivers/ata/libata-sata.c @@ -317,7 +317,7 @@ int sata_link_resume(struct ata_link *link, const unsigned long *params, * immediately after resuming. Delay 200ms before * debouncing. */ - if (!(link->flags & ATA_LFLAG_NO_DB_DELAY)) + if (!(link->flags & ATA_LFLAG_NO_DEBOUNCE_DELAY)) ata_msleep(link->ap, 200); /* is SControl restored correctly? */ diff --git a/include/linux/libata.h b/include/linux/libata.h index 5ca9347bd8ef..2de6b4a61394 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -187,7 +187,7 @@ enum { ATA_LFLAG_NO_LPM = (1 << 8), /* disable LPM on this link */ ATA_LFLAG_RST_ONCE = (1 << 9), /* limit recovery to one reset */ ATA_LFLAG_CHANGED = (1 << 10), /* LPM state changed on this link */ - ATA_LFLAG_NO_DB_DELAY = (1 << 11), /* no debounce delay on link resume */ + ATA_LFLAG_NO_DEBOUNCE_DELAY = (1 << 11), /* no debounce delay on link resume */ /* struct ata_port flags */ ATA_FLAG_SLAVE_POSS = (1 << 0), /* host supports slave dev */ From 8061c399c83b9e5fd46f6cfe5e4aa4bb25ba877b Mon Sep 17 00:00:00 2001 From: Paul Menzel Date: Wed, 5 Jan 2022 16:36:18 +0100 Subject: [PATCH 084/228] ata: ahci: Add support for AMD A85 FCH (Hudson D4) [ Upstream commit a17ab7aba5df4135ef77d7f6d7105e1ea414936f ] Add support for the AMD A85 FCH (Hudson D4) AHCI adapter. Since this adapter does not require the default 200 ms debounce delay in sata_link_resume(), create a new board board_ahci_no_debounce_delay with the link flag ATA_LFLAG_NO_DEBOUNCE_DELAY, and, for now, configure the AMD A85 FCH (Hudson D4) to use it. On the ASUS F2A85-M PRO it reduces the Linux kernel boot time by the expected 200 ms from 787 ms to 585 ms. 
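A toy illustration, not part of the change: the saving comes from a single conditional in sata_link_resume(). The function below, with a plain flags word standing in for struct ata_link, shows how the per-link debounce delay collapses to zero once the new link flag is set; resume_debounce_ms() is made up for the example.

#include <stdio.h>

#define ATA_LFLAG_NO_DEBOUNCE_DELAY	(1u << 11)

static unsigned int resume_debounce_ms(unsigned int link_flags)
{
	return (link_flags & ATA_LFLAG_NO_DEBOUNCE_DELAY) ? 0 : 200;
}

int main(void)
{
	printf("default board:           %u ms\n", resume_debounce_ms(0));
	printf("no_debounce_delay board: %u ms\n",
	       resume_debounce_ms(ATA_LFLAG_NO_DEBOUNCE_DELAY));
	return 0;
}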
Signed-off-by: Paul Menzel Cc: Tejun Heo Signed-off-by: Damien Le Moal Stable-dep-of: 2a2df98ec592 ("ata: ahci: Add Elkhart Lake AHCI controller") Signed-off-by: Sasha Levin --- drivers/ata/ahci.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index d831a80c25f0..1a3608f4209e 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -51,6 +51,7 @@ enum board_ids { board_ahci, board_ahci_ign_iferr, board_ahci_mobile, + board_ahci_no_debounce_delay, board_ahci_nomsi, board_ahci_noncq, board_ahci_nosntf, @@ -142,6 +143,13 @@ static const struct ata_port_info ahci_port_info[] = { .udma_mask = ATA_UDMA6, .port_ops = &ahci_ops, }, + [board_ahci_no_debounce_delay] = { + .flags = AHCI_FLAG_COMMON, + .link_flags = ATA_LFLAG_NO_DEBOUNCE_DELAY, + .pio_mask = ATA_PIO4, + .udma_mask = ATA_UDMA6, + .port_ops = &ahci_ops, + }, [board_ahci_nomsi] = { AHCI_HFLAGS (AHCI_HFLAG_NO_MSI), .flags = AHCI_FLAG_COMMON, @@ -442,6 +450,7 @@ static const struct pci_device_id ahci_pci_tbl[] = { board_ahci_al }, /* AMD */ { PCI_VDEVICE(AMD, 0x7800), board_ahci }, /* AMD Hudson-2 */ + { PCI_VDEVICE(AMD, 0x7801), board_ahci_no_debounce_delay }, /* AMD Hudson-2 (AHCI mode) */ { PCI_VDEVICE(AMD, 0x7900), board_ahci }, /* AMD CZ */ { PCI_VDEVICE(AMD, 0x7901), board_ahci_mobile }, /* AMD Green Sardine */ /* AMD is using RAID class only for ahci controllers */ From 51d190cc98de3c72f197be963a3aa7bbd1d22459 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Fri, 25 Feb 2022 11:23:17 -0600 Subject: [PATCH 085/228] ata: ahci: Rename board_ahci_mobile [ Upstream commit 099849af27f74981c7e660dd93ff6a987307c1f2 ] This board definition was originally created for mobile devices to designate default link power managmeent policy to influence runtime power consumption. As this is interesting for more than just mobile designs, rename the board to `board_ahci_low_power` to make it clear it is about default policy. 
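Not part of the change, but handy for verifying the result on a running system: whatever default policy such a board ends up with can be read back through the scsi_host sysfs attribute. The stand-alone snippet below does that for host0; the host number is an assumption, and on kernels of this vintage the default for low-power boards typically comes from the SATA_MOBILE_LPM_POLICY Kconfig option (often med_power_with_dipm on distribution kernels).

#include <stdio.h>

int main(void)
{
	const char *path =
		"/sys/class/scsi_host/host0/link_power_management_policy";
	char policy[64];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return 1;
	}
	if (fgets(policy, sizeof(policy), f))
		printf("host0 LPM policy: %s", policy);
	fclose(f);
	return 0;
}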
Reviewed-by: Hans de Goede Reviewed-by: Paul Menzel Signed-off-by: Mario Limonciello Signed-off-by: Damien Le Moal Stable-dep-of: 2a2df98ec592 ("ata: ahci: Add Elkhart Lake AHCI controller") Signed-off-by: Sasha Levin --- drivers/ata/ahci.c | 96 +++++++++++++++++++++++----------------------- 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index 1a3608f4209e..547c0d8460e8 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -50,7 +50,7 @@ enum board_ids { /* board IDs by feature in alphabetical order */ board_ahci, board_ahci_ign_iferr, - board_ahci_mobile, + board_ahci_low_power, board_ahci_no_debounce_delay, board_ahci_nomsi, board_ahci_noncq, @@ -136,7 +136,7 @@ static const struct ata_port_info ahci_port_info[] = { .udma_mask = ATA_UDMA6, .port_ops = &ahci_ops, }, - [board_ahci_mobile] = { + [board_ahci_low_power] = { AHCI_HFLAGS (AHCI_HFLAG_IS_MOBILE), .flags = AHCI_FLAG_COMMON, .pio_mask = ATA_PIO4, @@ -276,13 +276,13 @@ static const struct pci_device_id ahci_pci_tbl[] = { { PCI_VDEVICE(INTEL, 0x2924), board_ahci }, /* ICH9 */ { PCI_VDEVICE(INTEL, 0x2925), board_ahci }, /* ICH9 */ { PCI_VDEVICE(INTEL, 0x2927), board_ahci }, /* ICH9 */ - { PCI_VDEVICE(INTEL, 0x2929), board_ahci_mobile }, /* ICH9M */ - { PCI_VDEVICE(INTEL, 0x292a), board_ahci_mobile }, /* ICH9M */ - { PCI_VDEVICE(INTEL, 0x292b), board_ahci_mobile }, /* ICH9M */ - { PCI_VDEVICE(INTEL, 0x292c), board_ahci_mobile }, /* ICH9M */ - { PCI_VDEVICE(INTEL, 0x292f), board_ahci_mobile }, /* ICH9M */ + { PCI_VDEVICE(INTEL, 0x2929), board_ahci_low_power }, /* ICH9M */ + { PCI_VDEVICE(INTEL, 0x292a), board_ahci_low_power }, /* ICH9M */ + { PCI_VDEVICE(INTEL, 0x292b), board_ahci_low_power }, /* ICH9M */ + { PCI_VDEVICE(INTEL, 0x292c), board_ahci_low_power }, /* ICH9M */ + { PCI_VDEVICE(INTEL, 0x292f), board_ahci_low_power }, /* ICH9M */ { PCI_VDEVICE(INTEL, 0x294d), board_ahci }, /* ICH9 */ - { PCI_VDEVICE(INTEL, 0x294e), board_ahci_mobile }, /* ICH9M */ + { PCI_VDEVICE(INTEL, 0x294e), board_ahci_low_power }, /* ICH9M */ { PCI_VDEVICE(INTEL, 0x502a), board_ahci }, /* Tolapai */ { PCI_VDEVICE(INTEL, 0x502b), board_ahci }, /* Tolapai */ { PCI_VDEVICE(INTEL, 0x3a05), board_ahci }, /* ICH10 */ @@ -292,9 +292,9 @@ static const struct pci_device_id ahci_pci_tbl[] = { { PCI_VDEVICE(INTEL, 0x3b23), board_ahci }, /* PCH AHCI */ { PCI_VDEVICE(INTEL, 0x3b24), board_ahci }, /* PCH RAID */ { PCI_VDEVICE(INTEL, 0x3b25), board_ahci }, /* PCH RAID */ - { PCI_VDEVICE(INTEL, 0x3b29), board_ahci_mobile }, /* PCH M AHCI */ + { PCI_VDEVICE(INTEL, 0x3b29), board_ahci_low_power }, /* PCH M AHCI */ { PCI_VDEVICE(INTEL, 0x3b2b), board_ahci }, /* PCH RAID */ - { PCI_VDEVICE(INTEL, 0x3b2c), board_ahci_mobile }, /* PCH M RAID */ + { PCI_VDEVICE(INTEL, 0x3b2c), board_ahci_low_power }, /* PCH M RAID */ { PCI_VDEVICE(INTEL, 0x3b2f), board_ahci }, /* PCH AHCI */ { PCI_VDEVICE(INTEL, 0x19b0), board_ahci_pcs7 }, /* DNV AHCI */ { PCI_VDEVICE(INTEL, 0x19b1), board_ahci_pcs7 }, /* DNV AHCI */ @@ -317,9 +317,9 @@ static const struct pci_device_id ahci_pci_tbl[] = { { PCI_VDEVICE(INTEL, 0x19cE), board_ahci_pcs7 }, /* DNV AHCI */ { PCI_VDEVICE(INTEL, 0x19cF), board_ahci_pcs7 }, /* DNV AHCI */ { PCI_VDEVICE(INTEL, 0x1c02), board_ahci }, /* CPT AHCI */ - { PCI_VDEVICE(INTEL, 0x1c03), board_ahci_mobile }, /* CPT M AHCI */ + { PCI_VDEVICE(INTEL, 0x1c03), board_ahci_low_power }, /* CPT M AHCI */ { PCI_VDEVICE(INTEL, 0x1c04), board_ahci }, /* CPT RAID */ - { PCI_VDEVICE(INTEL, 0x1c05), board_ahci_mobile }, /* CPT M 
RAID */ + { PCI_VDEVICE(INTEL, 0x1c05), board_ahci_low_power }, /* CPT M RAID */ { PCI_VDEVICE(INTEL, 0x1c06), board_ahci }, /* CPT RAID */ { PCI_VDEVICE(INTEL, 0x1c07), board_ahci }, /* CPT RAID */ { PCI_VDEVICE(INTEL, 0x1d02), board_ahci }, /* PBG AHCI */ @@ -328,29 +328,29 @@ static const struct pci_device_id ahci_pci_tbl[] = { { PCI_VDEVICE(INTEL, 0x2826), board_ahci }, /* PBG RAID */ { PCI_VDEVICE(INTEL, 0x2323), board_ahci }, /* DH89xxCC AHCI */ { PCI_VDEVICE(INTEL, 0x1e02), board_ahci }, /* Panther Point AHCI */ - { PCI_VDEVICE(INTEL, 0x1e03), board_ahci_mobile }, /* Panther M AHCI */ + { PCI_VDEVICE(INTEL, 0x1e03), board_ahci_low_power }, /* Panther M AHCI */ { PCI_VDEVICE(INTEL, 0x1e04), board_ahci }, /* Panther Point RAID */ { PCI_VDEVICE(INTEL, 0x1e05), board_ahci }, /* Panther Point RAID */ { PCI_VDEVICE(INTEL, 0x1e06), board_ahci }, /* Panther Point RAID */ - { PCI_VDEVICE(INTEL, 0x1e07), board_ahci_mobile }, /* Panther M RAID */ + { PCI_VDEVICE(INTEL, 0x1e07), board_ahci_low_power }, /* Panther M RAID */ { PCI_VDEVICE(INTEL, 0x1e0e), board_ahci }, /* Panther Point RAID */ { PCI_VDEVICE(INTEL, 0x8c02), board_ahci }, /* Lynx Point AHCI */ - { PCI_VDEVICE(INTEL, 0x8c03), board_ahci_mobile }, /* Lynx M AHCI */ + { PCI_VDEVICE(INTEL, 0x8c03), board_ahci_low_power }, /* Lynx M AHCI */ { PCI_VDEVICE(INTEL, 0x8c04), board_ahci }, /* Lynx Point RAID */ - { PCI_VDEVICE(INTEL, 0x8c05), board_ahci_mobile }, /* Lynx M RAID */ + { PCI_VDEVICE(INTEL, 0x8c05), board_ahci_low_power }, /* Lynx M RAID */ { PCI_VDEVICE(INTEL, 0x8c06), board_ahci }, /* Lynx Point RAID */ - { PCI_VDEVICE(INTEL, 0x8c07), board_ahci_mobile }, /* Lynx M RAID */ + { PCI_VDEVICE(INTEL, 0x8c07), board_ahci_low_power }, /* Lynx M RAID */ { PCI_VDEVICE(INTEL, 0x8c0e), board_ahci }, /* Lynx Point RAID */ - { PCI_VDEVICE(INTEL, 0x8c0f), board_ahci_mobile }, /* Lynx M RAID */ - { PCI_VDEVICE(INTEL, 0x9c02), board_ahci_mobile }, /* Lynx LP AHCI */ - { PCI_VDEVICE(INTEL, 0x9c03), board_ahci_mobile }, /* Lynx LP AHCI */ - { PCI_VDEVICE(INTEL, 0x9c04), board_ahci_mobile }, /* Lynx LP RAID */ - { PCI_VDEVICE(INTEL, 0x9c05), board_ahci_mobile }, /* Lynx LP RAID */ - { PCI_VDEVICE(INTEL, 0x9c06), board_ahci_mobile }, /* Lynx LP RAID */ - { PCI_VDEVICE(INTEL, 0x9c07), board_ahci_mobile }, /* Lynx LP RAID */ - { PCI_VDEVICE(INTEL, 0x9c0e), board_ahci_mobile }, /* Lynx LP RAID */ - { PCI_VDEVICE(INTEL, 0x9c0f), board_ahci_mobile }, /* Lynx LP RAID */ - { PCI_VDEVICE(INTEL, 0x9dd3), board_ahci_mobile }, /* Cannon Lake PCH-LP AHCI */ + { PCI_VDEVICE(INTEL, 0x8c0f), board_ahci_low_power }, /* Lynx M RAID */ + { PCI_VDEVICE(INTEL, 0x9c02), board_ahci_low_power }, /* Lynx LP AHCI */ + { PCI_VDEVICE(INTEL, 0x9c03), board_ahci_low_power }, /* Lynx LP AHCI */ + { PCI_VDEVICE(INTEL, 0x9c04), board_ahci_low_power }, /* Lynx LP RAID */ + { PCI_VDEVICE(INTEL, 0x9c05), board_ahci_low_power }, /* Lynx LP RAID */ + { PCI_VDEVICE(INTEL, 0x9c06), board_ahci_low_power }, /* Lynx LP RAID */ + { PCI_VDEVICE(INTEL, 0x9c07), board_ahci_low_power }, /* Lynx LP RAID */ + { PCI_VDEVICE(INTEL, 0x9c0e), board_ahci_low_power }, /* Lynx LP RAID */ + { PCI_VDEVICE(INTEL, 0x9c0f), board_ahci_low_power }, /* Lynx LP RAID */ + { PCI_VDEVICE(INTEL, 0x9dd3), board_ahci_low_power }, /* Cannon Lake PCH-LP AHCI */ { PCI_VDEVICE(INTEL, 0x1f22), board_ahci }, /* Avoton AHCI */ { PCI_VDEVICE(INTEL, 0x1f23), board_ahci }, /* Avoton AHCI */ { PCI_VDEVICE(INTEL, 0x1f24), board_ahci }, /* Avoton RAID */ @@ -382,26 +382,26 @@ static const struct pci_device_id ahci_pci_tbl[] = { { 
PCI_VDEVICE(INTEL, 0x8d66), board_ahci }, /* Wellsburg RAID */ { PCI_VDEVICE(INTEL, 0x8d6e), board_ahci }, /* Wellsburg RAID */ { PCI_VDEVICE(INTEL, 0x23a3), board_ahci }, /* Coleto Creek AHCI */ - { PCI_VDEVICE(INTEL, 0x9c83), board_ahci_mobile }, /* Wildcat LP AHCI */ - { PCI_VDEVICE(INTEL, 0x9c85), board_ahci_mobile }, /* Wildcat LP RAID */ - { PCI_VDEVICE(INTEL, 0x9c87), board_ahci_mobile }, /* Wildcat LP RAID */ - { PCI_VDEVICE(INTEL, 0x9c8f), board_ahci_mobile }, /* Wildcat LP RAID */ + { PCI_VDEVICE(INTEL, 0x9c83), board_ahci_low_power }, /* Wildcat LP AHCI */ + { PCI_VDEVICE(INTEL, 0x9c85), board_ahci_low_power }, /* Wildcat LP RAID */ + { PCI_VDEVICE(INTEL, 0x9c87), board_ahci_low_power }, /* Wildcat LP RAID */ + { PCI_VDEVICE(INTEL, 0x9c8f), board_ahci_low_power }, /* Wildcat LP RAID */ { PCI_VDEVICE(INTEL, 0x8c82), board_ahci }, /* 9 Series AHCI */ - { PCI_VDEVICE(INTEL, 0x8c83), board_ahci_mobile }, /* 9 Series M AHCI */ + { PCI_VDEVICE(INTEL, 0x8c83), board_ahci_low_power }, /* 9 Series M AHCI */ { PCI_VDEVICE(INTEL, 0x8c84), board_ahci }, /* 9 Series RAID */ - { PCI_VDEVICE(INTEL, 0x8c85), board_ahci_mobile }, /* 9 Series M RAID */ + { PCI_VDEVICE(INTEL, 0x8c85), board_ahci_low_power }, /* 9 Series M RAID */ { PCI_VDEVICE(INTEL, 0x8c86), board_ahci }, /* 9 Series RAID */ - { PCI_VDEVICE(INTEL, 0x8c87), board_ahci_mobile }, /* 9 Series M RAID */ + { PCI_VDEVICE(INTEL, 0x8c87), board_ahci_low_power }, /* 9 Series M RAID */ { PCI_VDEVICE(INTEL, 0x8c8e), board_ahci }, /* 9 Series RAID */ - { PCI_VDEVICE(INTEL, 0x8c8f), board_ahci_mobile }, /* 9 Series M RAID */ - { PCI_VDEVICE(INTEL, 0x9d03), board_ahci_mobile }, /* Sunrise LP AHCI */ - { PCI_VDEVICE(INTEL, 0x9d05), board_ahci_mobile }, /* Sunrise LP RAID */ - { PCI_VDEVICE(INTEL, 0x9d07), board_ahci_mobile }, /* Sunrise LP RAID */ + { PCI_VDEVICE(INTEL, 0x8c8f), board_ahci_low_power }, /* 9 Series M RAID */ + { PCI_VDEVICE(INTEL, 0x9d03), board_ahci_low_power }, /* Sunrise LP AHCI */ + { PCI_VDEVICE(INTEL, 0x9d05), board_ahci_low_power }, /* Sunrise LP RAID */ + { PCI_VDEVICE(INTEL, 0x9d07), board_ahci_low_power }, /* Sunrise LP RAID */ { PCI_VDEVICE(INTEL, 0xa102), board_ahci }, /* Sunrise Point-H AHCI */ - { PCI_VDEVICE(INTEL, 0xa103), board_ahci_mobile }, /* Sunrise M AHCI */ + { PCI_VDEVICE(INTEL, 0xa103), board_ahci_low_power }, /* Sunrise M AHCI */ { PCI_VDEVICE(INTEL, 0xa105), board_ahci }, /* Sunrise Point-H RAID */ { PCI_VDEVICE(INTEL, 0xa106), board_ahci }, /* Sunrise Point-H RAID */ - { PCI_VDEVICE(INTEL, 0xa107), board_ahci_mobile }, /* Sunrise M RAID */ + { PCI_VDEVICE(INTEL, 0xa107), board_ahci_low_power }, /* Sunrise M RAID */ { PCI_VDEVICE(INTEL, 0xa10f), board_ahci }, /* Sunrise Point-H RAID */ { PCI_VDEVICE(INTEL, 0x2822), board_ahci }, /* Lewisburg RAID*/ { PCI_VDEVICE(INTEL, 0x2823), board_ahci }, /* Lewisburg AHCI*/ @@ -418,13 +418,13 @@ static const struct pci_device_id ahci_pci_tbl[] = { { PCI_VDEVICE(INTEL, 0xa356), board_ahci }, /* Cannon Lake PCH-H RAID */ { PCI_VDEVICE(INTEL, 0x06d7), board_ahci }, /* Comet Lake-H RAID */ { PCI_VDEVICE(INTEL, 0xa386), board_ahci }, /* Comet Lake PCH-V RAID */ - { PCI_VDEVICE(INTEL, 0x0f22), board_ahci_mobile }, /* Bay Trail AHCI */ - { PCI_VDEVICE(INTEL, 0x0f23), board_ahci_mobile }, /* Bay Trail AHCI */ - { PCI_VDEVICE(INTEL, 0x22a3), board_ahci_mobile }, /* Cherry Tr. 
AHCI */ - { PCI_VDEVICE(INTEL, 0x5ae3), board_ahci_mobile }, /* ApolloLake AHCI */ - { PCI_VDEVICE(INTEL, 0x34d3), board_ahci_mobile }, /* Ice Lake LP AHCI */ - { PCI_VDEVICE(INTEL, 0x02d3), board_ahci_mobile }, /* Comet Lake PCH-U AHCI */ - { PCI_VDEVICE(INTEL, 0x02d7), board_ahci_mobile }, /* Comet Lake PCH RAID */ + { PCI_VDEVICE(INTEL, 0x0f22), board_ahci_low_power }, /* Bay Trail AHCI */ + { PCI_VDEVICE(INTEL, 0x0f23), board_ahci_low_power }, /* Bay Trail AHCI */ + { PCI_VDEVICE(INTEL, 0x22a3), board_ahci_low_power }, /* Cherry Tr. AHCI */ + { PCI_VDEVICE(INTEL, 0x5ae3), board_ahci_low_power }, /* ApolloLake AHCI */ + { PCI_VDEVICE(INTEL, 0x34d3), board_ahci_low_power }, /* Ice Lake LP AHCI */ + { PCI_VDEVICE(INTEL, 0x02d3), board_ahci_low_power }, /* Comet Lake PCH-U AHCI */ + { PCI_VDEVICE(INTEL, 0x02d7), board_ahci_low_power }, /* Comet Lake PCH RAID */ /* JMicron 360/1/3/5/6, match class to avoid IDE function */ { PCI_VENDOR_ID_JMICRON, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, @@ -452,7 +452,7 @@ static const struct pci_device_id ahci_pci_tbl[] = { { PCI_VDEVICE(AMD, 0x7800), board_ahci }, /* AMD Hudson-2 */ { PCI_VDEVICE(AMD, 0x7801), board_ahci_no_debounce_delay }, /* AMD Hudson-2 (AHCI mode) */ { PCI_VDEVICE(AMD, 0x7900), board_ahci }, /* AMD CZ */ - { PCI_VDEVICE(AMD, 0x7901), board_ahci_mobile }, /* AMD Green Sardine */ + { PCI_VDEVICE(AMD, 0x7901), board_ahci_low_power }, /* AMD Green Sardine */ /* AMD is using RAID class only for ahci controllers */ { PCI_VENDOR_ID_AMD, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, PCI_CLASS_STORAGE_RAID << 8, 0xffffff, board_ahci }, From a1f85bc9bc69e7f9146a5f1068c7cc94c6678d27 Mon Sep 17 00:00:00 2001 From: Werner Fischer Date: Tue, 29 Aug 2023 13:33:58 +0200 Subject: [PATCH 086/228] ata: ahci: Add Elkhart Lake AHCI controller MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 2a2df98ec592667927b5c1351afa6493ea125c9f ] Elkhart Lake is the successor of Apollo Lake and Gemini Lake. These CPUs and their PCHs are used in mobile and embedded environments. With this patch I suggest that Elkhart Lake SATA controllers [1] should use the default LPM policy for mobile chipsets. The disadvantage of missing hot-plug support with this setting should not be an issue, as those CPUs are used in embedded environments and not in servers with hot-plug backplanes. We discovered that the Elkhart Lake SATA controllers have been missing in ahci.c after a customer reported the throttling of his SATA SSD after a short period of higher I/O. We determined the high temperature of the SSD controller in idle mode as the root cause for that. Depending on the used SSD, we have seen up to 1.8 Watt lower system idle power usage and up to 30°C lower SSD controller temperatures in our tests, when we set med_power_with_dipm manually. I have provided a table showing seven different SATA SSDs from ATP, Intel/Solidigm and Samsung [2]. Intel lists a total of 3 SATA controller IDs (4B60, 4B62, 4B63) in [1] for those mobile PCHs. This commit just adds 0x4b63 as I do not have test systems with 0x4b60 and 0x4b62 SATA controllers. I have tested this patch with a system which uses 0x4b63 as SATA controller. 
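For reference, the practical effect of tagging a PCI ID with board_ahci_low_power is that the driver applies the configured default mobile/low-power link power management policy (typically med_power_with_dipm) instead of max_performance. The following is a simplified standalone sketch of that selection only; the type, field and function names are placeholders and not the actual ahci.c symbols.

/* Illustrative sketch only -- placeholder names, not the ahci.c symbols. */
#include <stdbool.h>

enum sketch_lpm_policy {
        SKETCH_LPM_MAX_PERFORMANCE,
        SKETCH_LPM_MED_POWER_WITH_DIPM,
};

struct sketch_port {
        bool low_power_board;           /* PCI ID matched the low-power board type */
        enum sketch_lpm_policy target_lpm_policy;
};

static void sketch_set_default_lpm(struct sketch_port *p)
{
        /* trade-off: lower idle power and SSD temperature vs. working hot-plug */
        p->target_lpm_policy = p->low_power_board ?
                SKETCH_LPM_MED_POWER_WITH_DIPM : SKETCH_LPM_MAX_PERFORMANCE;
}
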
[1] https://sata-io.org/product/8803 [2] https://www.thomas-krenn.com/en/wiki/SATA_Link_Power_Management#Example_LES_v4 Signed-off-by: Werner Fischer Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Signed-off-by: Sasha Levin --- drivers/ata/ahci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index 547c0d8460e8..4297a8d69dbf 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -425,6 +425,8 @@ static const struct pci_device_id ahci_pci_tbl[] = { { PCI_VDEVICE(INTEL, 0x34d3), board_ahci_low_power }, /* Ice Lake LP AHCI */ { PCI_VDEVICE(INTEL, 0x02d3), board_ahci_low_power }, /* Comet Lake PCH-U AHCI */ { PCI_VDEVICE(INTEL, 0x02d7), board_ahci_low_power }, /* Comet Lake PCH RAID */ + /* Elkhart Lake IDs 0x4b60 & 0x4b62 https://sata-io.org/product/8803 not tested yet */ + { PCI_VDEVICE(INTEL, 0x4b63), board_ahci_low_power }, /* Elkhart Lake AHCI */ /* JMicron 360/1/3/5/6, match class to avoid IDE function */ { PCI_VENDOR_ID_JMICRON, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, From d678c078f30268a29237595f573a467ff678356b Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 19 Sep 2023 11:44:42 +0930 Subject: [PATCH 087/228] btrfs: reset destination buffer when read_extent_buffer() gets invalid range [ Upstream commit 74ee79142c0a344d4eae2eb7012ebc4e82254109 ] Commit f98b6215d7d1 ("btrfs: extent_io: do extra check for extent buffer read write functions") changed how we handle invalid extent buffer range for read_extent_buffer(). Previously if the range is invalid we just set the destination to zero, but after the patch we do nothing and error out. This can lead to smatch static checker errors like: fs/btrfs/print-tree.c:186 print_uuid_item() error: uninitialized symbol 'subvol_id'. fs/btrfs/tests/extent-io-tests.c:338 check_eb_bitmap() error: uninitialized symbol 'has'. fs/btrfs/tests/extent-io-tests.c:353 check_eb_bitmap() error: uninitialized symbol 'has'. fs/btrfs/uuid-tree.c:203 btrfs_uuid_tree_remove() error: uninitialized symbol 'read_subid'. fs/btrfs/uuid-tree.c:353 btrfs_uuid_tree_iterate() error: uninitialized symbol 'subid_le'. fs/btrfs/uuid-tree.c:72 btrfs_uuid_tree_lookup() error: uninitialized symbol 'data'. fs/btrfs/volumes.c:7415 btrfs_dev_stats_value() error: uninitialized symbol 'val'. Fix those warnings by reverting back to the old memset() behavior. By this we keep the static checker happy and would still make a lot of noise when such invalid ranges are passed in. Reported-by: Dan Carpenter Fixes: f98b6215d7d1 ("btrfs: extent_io: do extra check for extent buffer read write functions") Signed-off-by: Qu Wenruo Signed-off-by: David Sterba Signed-off-by: Sasha Levin --- fs/btrfs/extent_io.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 0e266772beae..685a375bb6af 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5634,8 +5634,14 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv, char *dst = (char *)dstv; unsigned long i = start >> PAGE_SHIFT; - if (check_eb_range(eb, start, len)) + if (check_eb_range(eb, start, len)) { + /* + * Invalid range hit, reset the memory, so callers won't get + * some random garbage for their uninitialzed memory. 
+ */ + memset(dstv, 0, len); return; + } offset = offset_in_page(start); From a8ee76d72737d7bfa8d6f63593334be55d3fac11 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 5 Sep 2023 09:06:56 +0200 Subject: [PATCH 088/228] MIPS: Alchemy: only build mmc support helpers if au1xmmc is enabled [ Upstream commit ef8f8f04a0b25e8f294b24350e8463a8d6a9ba0b ] While commit d4a5c59a955b ("mmc: au1xmmc: force non-modular build and remove symbol_get usage") to be built in, it can still build a kernel without MMC support and thuse no mmc_detect_change symbol at all. Add ifdefs to build the mmc support code in the alchemy arch code conditional on mmc support. Fixes: d4a5c59a955b ("mmc: au1xmmc: force non-modular build and remove symbol_get usage") Reported-by: kernel test robot Signed-off-by: Christoph Hellwig Acked-by: Randy Dunlap Tested-by: Randy Dunlap # build-tested Signed-off-by: Thomas Bogendoerfer Signed-off-by: Sasha Levin --- arch/mips/alchemy/devboards/db1000.c | 4 ++++ arch/mips/alchemy/devboards/db1200.c | 6 ++++++ arch/mips/alchemy/devboards/db1300.c | 4 ++++ 3 files changed, 14 insertions(+) diff --git a/arch/mips/alchemy/devboards/db1000.c b/arch/mips/alchemy/devboards/db1000.c index 50de86eb8784..3183df60ad33 100644 --- a/arch/mips/alchemy/devboards/db1000.c +++ b/arch/mips/alchemy/devboards/db1000.c @@ -164,6 +164,7 @@ static struct platform_device db1x00_audio_dev = { /******************************************************************************/ +#ifdef CONFIG_MMC_AU1X static irqreturn_t db1100_mmc_cd(int irq, void *ptr) { mmc_detect_change(ptr, msecs_to_jiffies(500)); @@ -369,6 +370,7 @@ static struct platform_device db1100_mmc1_dev = { .num_resources = ARRAY_SIZE(au1100_mmc1_res), .resource = au1100_mmc1_res, }; +#endif /* CONFIG_MMC_AU1X */ /******************************************************************************/ @@ -432,8 +434,10 @@ static struct platform_device *db1x00_devs[] = { static struct platform_device *db1100_devs[] = { &au1100_lcd_device, +#ifdef CONFIG_MMC_AU1X &db1100_mmc0_dev, &db1100_mmc1_dev, +#endif }; int __init db1000_dev_setup(void) diff --git a/arch/mips/alchemy/devboards/db1200.c b/arch/mips/alchemy/devboards/db1200.c index b70e2cf8a27b..414f92eacb5e 100644 --- a/arch/mips/alchemy/devboards/db1200.c +++ b/arch/mips/alchemy/devboards/db1200.c @@ -326,6 +326,7 @@ static struct platform_device db1200_ide_dev = { /**********************************************************************/ +#ifdef CONFIG_MMC_AU1X /* SD carddetects: they're supposed to be edge-triggered, but ack * doesn't seem to work (CPLD Rev 2). Instead, the screaming one * is disabled and its counterpart enabled. 
The 200ms timeout is @@ -584,6 +585,7 @@ static struct platform_device pb1200_mmc1_dev = { .num_resources = ARRAY_SIZE(au1200_mmc1_res), .resource = au1200_mmc1_res, }; +#endif /* CONFIG_MMC_AU1X */ /**********************************************************************/ @@ -751,7 +753,9 @@ static struct platform_device db1200_audiodma_dev = { static struct platform_device *db1200_devs[] __initdata = { NULL, /* PSC0, selected by S6.8 */ &db1200_ide_dev, +#ifdef CONFIG_MMC_AU1X &db1200_mmc0_dev, +#endif &au1200_lcd_dev, &db1200_eth_dev, &db1200_nand_dev, @@ -762,7 +766,9 @@ static struct platform_device *db1200_devs[] __initdata = { }; static struct platform_device *pb1200_devs[] __initdata = { +#ifdef CONFIG_MMC_AU1X &pb1200_mmc1_dev, +#endif }; /* Some peripheral base addresses differ on the PB1200 */ diff --git a/arch/mips/alchemy/devboards/db1300.c b/arch/mips/alchemy/devboards/db1300.c index ca71e5ed51ab..c965d0007481 100644 --- a/arch/mips/alchemy/devboards/db1300.c +++ b/arch/mips/alchemy/devboards/db1300.c @@ -450,6 +450,7 @@ static struct platform_device db1300_ide_dev = { /**********************************************************************/ +#ifdef CONFIG_MMC_AU1X static irqreturn_t db1300_mmc_cd(int irq, void *ptr) { disable_irq_nosync(irq); @@ -632,6 +633,7 @@ static struct platform_device db1300_sd0_dev = { .resource = au1300_sd0_res, .num_resources = ARRAY_SIZE(au1300_sd0_res), }; +#endif /* CONFIG_MMC_AU1X */ /**********************************************************************/ @@ -776,8 +778,10 @@ static struct platform_device *db1300_dev[] __initdata = { &db1300_5waysw_dev, &db1300_nand_dev, &db1300_ide_dev, +#ifdef CONFIG_MMC_AU1X &db1300_sd0_dev, &db1300_sd1_dev, +#endif &db1300_lcd_dev, &db1300_ac97_dev, &db1300_i2s_dev, From 1d4d846e2a4948ec9ff19fb9b023dee5dd160baf Mon Sep 17 00:00:00 2001 From: Julien Panis Date: Mon, 21 Aug 2023 16:24:18 +0200 Subject: [PATCH 089/228] bus: ti-sysc: Use fsleep() instead of usleep_range() in sysc_reset() [ Upstream commit d929b2b7464f95ec01e47f560b1e687482ba8929 ] The am335x-evm started producing boot errors because of subtle timing changes: Unhandled fault: external abort on non-linefetch (0x1008) at 0xf03c1010 ... sysc_reset from sysc_probe+0xf60/0x1514 sysc_probe from platform_probe+0x5c/0xbc ... The fix consists in using the appropriate sleep function in sysc reset. For flexible sleeping, fsleep is recommended. Here, sysc delay parameter can take any value in [0 - 255] us range. As a result, fsleep() should be used, calling udelay() for a sysc delay lower than 10 us. 
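For reference, fsleep() chooses the sleep primitive based on the requested delay, roughly as follows (paraphrased from include/linux/delay.h; exact thresholds may differ between kernel versions):

static inline void fsleep(unsigned long usecs)
{
        if (usecs <= 10)
                udelay(usecs);          /* busy-wait: too short for hrtimer sleep */
        else if (usecs <= 20000)
                usleep_range(usecs, 2 * usecs);
        else
                msleep(DIV_ROUND_UP(usecs, 1000));
}

For the sysc reset delay of 0-255 us this means udelay() for delays of 10 us or less and usleep_range(d, 2 * d) otherwise, which matches the old behaviour for the longer delays.
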
Signed-off-by: Julien Panis Fixes: e709ed70d122 ("bus: ti-sysc: Fix missing reset delay handling") Message-ID: <20230821-fix-ti-sysc-reset-v1-1-5a0a5d8fae55@baylibre.com> Signed-off-by: Tony Lindgren Signed-off-by: Sasha Levin --- drivers/bus/ti-sysc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/bus/ti-sysc.c b/drivers/bus/ti-sysc.c index 5e8c078efd22..24d589b43dfe 100644 --- a/drivers/bus/ti-sysc.c +++ b/drivers/bus/ti-sysc.c @@ -2085,8 +2085,7 @@ static int sysc_reset(struct sysc *ddata) } if (ddata->cfg.srst_udelay) - usleep_range(ddata->cfg.srst_udelay, - ddata->cfg.srst_udelay * 2); + fsleep(ddata->cfg.srst_udelay); if (ddata->post_reset_quirk) ddata->post_reset_quirk(ddata); From 0fd5839e250488d22dd8500d0905bfbf15ef90b8 Mon Sep 17 00:00:00 2001 From: Adam Ford Date: Wed, 6 Sep 2023 18:34:42 -0500 Subject: [PATCH 090/228] bus: ti-sysc: Fix missing AM35xx SoC matching [ Upstream commit 11729caa520950e17cd81bc43ffc477c46cf791e ] Commit feaa8baee82a ("bus: ti-sysc: Implement SoC revision handling") created a list of SoC types searching for strings based on names and wildcards which associates the SoC to different families. The OMAP34xx and OMAP35xx are treated as SOC_3430 while OMAP36xx and OMAP37xx are treated as SOC_3630, but the AM35xx isn't listed. The AM35xx is mostly an OMAP3430, and a later commit a12315d6d270 ("bus: ti-sysc: Make omap3 gpt12 quirk handling SoC specific") looks for the SOC type and behaves in a certain way if it's SOC_3430. This caused a regression on the AM3517 causing it to return two errors: ti-sysc: probe of 48318000.target-module failed with error -16 ti-sysc: probe of 49032000.target-module failed with error -16 Fix this by treating the creating SOC_AM35 and inserting it between the SOC_3430 and SOC_3630. If it is treaed the same way as the SOC_3430 when checking the status of sysc_check_active_timer, the error conditions will disappear. Fixes: a12315d6d270 ("bus: ti-sysc: Make omap3 gpt12 quirk handling SoC specific") Fixes: feaa8baee82a ("bus: ti-sysc: Implement SoC revision handling") Signed-off-by: Adam Ford Message-ID: <20230906233442.270835-1-aford173@gmail.com> Signed-off-by: Tony Lindgren Signed-off-by: Sasha Levin --- drivers/bus/ti-sysc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/bus/ti-sysc.c b/drivers/bus/ti-sysc.c index 24d589b43dfe..5dba06ed61bf 100644 --- a/drivers/bus/ti-sysc.c +++ b/drivers/bus/ti-sysc.c @@ -38,6 +38,7 @@ enum sysc_soc { SOC_2420, SOC_2430, SOC_3430, + SOC_AM35, SOC_3630, SOC_4430, SOC_4460, @@ -1818,7 +1819,7 @@ static void sysc_pre_reset_quirk_dss(struct sysc *ddata) dev_warn(ddata->dev, "%s: timed out %08x !+ %08x\n", __func__, val, irq_mask); - if (sysc_soc->soc == SOC_3430) { + if (sysc_soc->soc == SOC_3430 || sysc_soc->soc == SOC_AM35) { /* Clear DSS_SDI_CONTROL */ sysc_write(ddata, 0x44, 0); @@ -2959,6 +2960,7 @@ static void ti_sysc_idle(struct work_struct *work) static const struct soc_device_attribute sysc_soc_match[] = { SOC_FLAG("OMAP242*", SOC_2420), SOC_FLAG("OMAP243*", SOC_2430), + SOC_FLAG("AM35*", SOC_AM35), SOC_FLAG("OMAP3[45]*", SOC_3430), SOC_FLAG("OMAP3[67]*", SOC_3630), SOC_FLAG("OMAP443*", SOC_4430), @@ -3146,7 +3148,7 @@ static int sysc_check_active_timer(struct sysc *ddata) * can be dropped if we stop supporting old beagleboard revisions * A to B4 at some point. 
*/ - if (sysc_soc->soc == SOC_3430) + if (sysc_soc->soc == SOC_3430 || sysc_soc->soc == SOC_AM35) error = -ENXIO; else error = -EBUSY; From 50789f37239cf7ca06bce126e2d92ce2203a6f3e Mon Sep 17 00:00:00 2001 From: Timo Alho Date: Tue, 12 Sep 2023 14:29:50 +0300 Subject: [PATCH 091/228] clk: tegra: fix error return case for recalc_rate [ Upstream commit a47b44fbb13f5e7a981b4515dcddc93a321ae89c ] tegra-bpmp clocks driver makes implicit conversion of signed error code to unsigned value in recalc_rate operation. The behavior for recalc_rate, according to it's specification, should be that "If the driver cannot figure out a rate for this clock, it must return 0." Fixes: ca6f2796eef7 ("clk: tegra: Add BPMP clock driver") Signed-off-by: Timo Alho Signed-off-by: Mikko Perttunen Link: https://lore.kernel.org/r/20230912112951.2330497-1-cyndis@kapsi.fi Signed-off-by: Stephen Boyd Signed-off-by: Sasha Levin --- drivers/clk/tegra/clk-bpmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clk/tegra/clk-bpmp.c b/drivers/clk/tegra/clk-bpmp.c index a66263b6490d..00845044c98e 100644 --- a/drivers/clk/tegra/clk-bpmp.c +++ b/drivers/clk/tegra/clk-bpmp.c @@ -159,7 +159,7 @@ static unsigned long tegra_bpmp_clk_recalc_rate(struct clk_hw *hw, err = tegra_bpmp_clk_transfer(clk->bpmp, &msg); if (err < 0) - return err; + return 0; return response.rate; } From 28e5423ad8fb13dda0fe541b40bb6eddee1ec0f3 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sun, 2 Oct 2022 11:20:02 +0200 Subject: [PATCH 092/228] ARM: dts: omap: correct indentation [ Upstream commit 8ae9c7a69fa14e95d032e64d8d758e3f85bee132 ] Do not use spaces for indentation. Link: https://lore.kernel.org/r/20221002092002.68880-1-krzysztof.kozlowski@linaro.org Signed-off-by: Krzysztof Kozlowski Stable-dep-of: 6469b2feade8 ("ARM: dts: ti: omap: Fix bandgap thermal cells addressing for omap3/4") Signed-off-by: Sasha Levin --- arch/arm/boot/dts/omap-gpmc-smsc911x.dtsi | 6 +-- arch/arm/boot/dts/omap-gpmc-smsc9221.dtsi | 6 +-- arch/arm/boot/dts/omap3-cm-t3517.dts | 12 ++--- arch/arm/boot/dts/omap3-gta04.dtsi | 6 +-- arch/arm/boot/dts/omap3-ldp.dts | 2 +- arch/arm/boot/dts/omap3-n900.dts | 38 +++++++-------- arch/arm/boot/dts/omap3-zoom3.dts | 44 +++++++++--------- arch/arm/boot/dts/omap4-cpu-thermal.dtsi | 24 +++++----- arch/arm/boot/dts/omap5-cm-t54.dts | 56 +++++++++++------------ 9 files changed, 97 insertions(+), 97 deletions(-) diff --git a/arch/arm/boot/dts/omap-gpmc-smsc911x.dtsi b/arch/arm/boot/dts/omap-gpmc-smsc911x.dtsi index ded7e8fec9eb..9cf52650f073 100644 --- a/arch/arm/boot/dts/omap-gpmc-smsc911x.dtsi +++ b/arch/arm/boot/dts/omap-gpmc-smsc911x.dtsi @@ -8,9 +8,9 @@ / { vddvario: regulator-vddvario { - compatible = "regulator-fixed"; - regulator-name = "vddvario"; - regulator-always-on; + compatible = "regulator-fixed"; + regulator-name = "vddvario"; + regulator-always-on; }; vdd33a: regulator-vdd33a { diff --git a/arch/arm/boot/dts/omap-gpmc-smsc9221.dtsi b/arch/arm/boot/dts/omap-gpmc-smsc9221.dtsi index e7534fe9c53c..bc8961f3690f 100644 --- a/arch/arm/boot/dts/omap-gpmc-smsc9221.dtsi +++ b/arch/arm/boot/dts/omap-gpmc-smsc9221.dtsi @@ -12,9 +12,9 @@ / { vddvario: regulator-vddvario { - compatible = "regulator-fixed"; - regulator-name = "vddvario"; - regulator-always-on; + compatible = "regulator-fixed"; + regulator-name = "vddvario"; + regulator-always-on; }; vdd33a: regulator-vdd33a { diff --git a/arch/arm/boot/dts/omap3-cm-t3517.dts b/arch/arm/boot/dts/omap3-cm-t3517.dts index 3b8349094baa..f25c0a84a190 100644 --- 
a/arch/arm/boot/dts/omap3-cm-t3517.dts +++ b/arch/arm/boot/dts/omap3-cm-t3517.dts @@ -11,12 +11,12 @@ / { model = "CompuLab CM-T3517"; compatible = "compulab,omap3-cm-t3517", "ti,am3517", "ti,omap3"; - vmmc: regulator-vmmc { - compatible = "regulator-fixed"; - regulator-name = "vmmc"; - regulator-min-microvolt = <3300000>; - regulator-max-microvolt = <3300000>; - }; + vmmc: regulator-vmmc { + compatible = "regulator-fixed"; + regulator-name = "vmmc"; + regulator-min-microvolt = <3300000>; + regulator-max-microvolt = <3300000>; + }; wl12xx_vmmc2: wl12xx_vmmc2 { compatible = "regulator-fixed"; diff --git a/arch/arm/boot/dts/omap3-gta04.dtsi b/arch/arm/boot/dts/omap3-gta04.dtsi index e61e5ddbf202..0333ca0e87fe 100644 --- a/arch/arm/boot/dts/omap3-gta04.dtsi +++ b/arch/arm/boot/dts/omap3-gta04.dtsi @@ -332,7 +332,7 @@ OMAP3_CORE1_IOPAD(0x2106, PIN_OUTPUT | MUX_MODE0) /* dss_data21.dss_data21 */ OMAP3_CORE1_IOPAD(0x2108, PIN_OUTPUT | MUX_MODE0) /* dss_data22.dss_data22 */ OMAP3_CORE1_IOPAD(0x210a, PIN_OUTPUT | MUX_MODE0) /* dss_data23.dss_data23 */ >; - }; + }; gps_pins: pinmux_gps_pins { pinctrl-single,pins = < @@ -866,8 +866,8 @@ &mcbsp4 { /* GSM voice PCM */ }; &hdqw1w { - pinctrl-names = "default"; - pinctrl-0 = <&hdq_pins>; + pinctrl-names = "default"; + pinctrl-0 = <&hdq_pins>; }; /* image signal processor within OMAP3 SoC */ diff --git a/arch/arm/boot/dts/omap3-ldp.dts b/arch/arm/boot/dts/omap3-ldp.dts index 9c6a92724590..b898e2f6f41d 100644 --- a/arch/arm/boot/dts/omap3-ldp.dts +++ b/arch/arm/boot/dts/omap3-ldp.dts @@ -301,5 +301,5 @@ &usb_otg_hs { &vaux1 { /* Needed for ads7846 */ - regulator-name = "vcc"; + regulator-name = "vcc"; }; diff --git a/arch/arm/boot/dts/omap3-n900.dts b/arch/arm/boot/dts/omap3-n900.dts index d40c3d2c4914..fdd929bc65d7 100644 --- a/arch/arm/boot/dts/omap3-n900.dts +++ b/arch/arm/boot/dts/omap3-n900.dts @@ -236,27 +236,27 @@ gpmc_pins: pinmux_gpmc_pins { pinctrl-single,pins = < /* address lines */ - OMAP3_CORE1_IOPAD(0x207a, PIN_OUTPUT | MUX_MODE0) /* gpmc_a1.gpmc_a1 */ - OMAP3_CORE1_IOPAD(0x207c, PIN_OUTPUT | MUX_MODE0) /* gpmc_a2.gpmc_a2 */ - OMAP3_CORE1_IOPAD(0x207e, PIN_OUTPUT | MUX_MODE0) /* gpmc_a3.gpmc_a3 */ + OMAP3_CORE1_IOPAD(0x207a, PIN_OUTPUT | MUX_MODE0) /* gpmc_a1.gpmc_a1 */ + OMAP3_CORE1_IOPAD(0x207c, PIN_OUTPUT | MUX_MODE0) /* gpmc_a2.gpmc_a2 */ + OMAP3_CORE1_IOPAD(0x207e, PIN_OUTPUT | MUX_MODE0) /* gpmc_a3.gpmc_a3 */ /* data lines, gpmc_d0..d7 not muxable according to TRM */ - OMAP3_CORE1_IOPAD(0x209e, PIN_INPUT | MUX_MODE0) /* gpmc_d8.gpmc_d8 */ - OMAP3_CORE1_IOPAD(0x20a0, PIN_INPUT | MUX_MODE0) /* gpmc_d9.gpmc_d9 */ - OMAP3_CORE1_IOPAD(0x20a2, PIN_INPUT | MUX_MODE0) /* gpmc_d10.gpmc_d10 */ - OMAP3_CORE1_IOPAD(0x20a4, PIN_INPUT | MUX_MODE0) /* gpmc_d11.gpmc_d11 */ - OMAP3_CORE1_IOPAD(0x20a6, PIN_INPUT | MUX_MODE0) /* gpmc_d12.gpmc_d12 */ - OMAP3_CORE1_IOPAD(0x20a8, PIN_INPUT | MUX_MODE0) /* gpmc_d13.gpmc_d13 */ - OMAP3_CORE1_IOPAD(0x20aa, PIN_INPUT | MUX_MODE0) /* gpmc_d14.gpmc_d14 */ - OMAP3_CORE1_IOPAD(0x20ac, PIN_INPUT | MUX_MODE0) /* gpmc_d15.gpmc_d15 */ + OMAP3_CORE1_IOPAD(0x209e, PIN_INPUT | MUX_MODE0) /* gpmc_d8.gpmc_d8 */ + OMAP3_CORE1_IOPAD(0x20a0, PIN_INPUT | MUX_MODE0) /* gpmc_d9.gpmc_d9 */ + OMAP3_CORE1_IOPAD(0x20a2, PIN_INPUT | MUX_MODE0) /* gpmc_d10.gpmc_d10 */ + OMAP3_CORE1_IOPAD(0x20a4, PIN_INPUT | MUX_MODE0) /* gpmc_d11.gpmc_d11 */ + OMAP3_CORE1_IOPAD(0x20a6, PIN_INPUT | MUX_MODE0) /* gpmc_d12.gpmc_d12 */ + OMAP3_CORE1_IOPAD(0x20a8, PIN_INPUT | MUX_MODE0) /* gpmc_d13.gpmc_d13 */ + OMAP3_CORE1_IOPAD(0x20aa, PIN_INPUT | 
MUX_MODE0) /* gpmc_d14.gpmc_d14 */ + OMAP3_CORE1_IOPAD(0x20ac, PIN_INPUT | MUX_MODE0) /* gpmc_d15.gpmc_d15 */ /* * gpmc_ncs0, gpmc_nadv_ale, gpmc_noe, gpmc_nwe, gpmc_wait0 not muxable * according to TRM. OneNAND seems to require PIN_INPUT on clock. */ - OMAP3_CORE1_IOPAD(0x20b0, PIN_OUTPUT | MUX_MODE0) /* gpmc_ncs1.gpmc_ncs1 */ - OMAP3_CORE1_IOPAD(0x20be, PIN_INPUT | MUX_MODE0) /* gpmc_clk.gpmc_clk */ - >; + OMAP3_CORE1_IOPAD(0x20b0, PIN_OUTPUT | MUX_MODE0) /* gpmc_ncs1.gpmc_ncs1 */ + OMAP3_CORE1_IOPAD(0x20be, PIN_INPUT | MUX_MODE0) /* gpmc_clk.gpmc_clk */ + >; }; i2c1_pins: pinmux_i2c1_pins { @@ -738,12 +738,12 @@ tpa6130a2: tpa6130a2@60 { si4713: si4713@63 { compatible = "silabs,si4713"; - reg = <0x63>; + reg = <0x63>; - interrupts-extended = <&gpio2 21 IRQ_TYPE_EDGE_FALLING>; /* 53 */ - reset-gpios = <&gpio6 3 GPIO_ACTIVE_HIGH>; /* 163 */ - vio-supply = <&vio>; - vdd-supply = <&vaux1>; + interrupts-extended = <&gpio2 21 IRQ_TYPE_EDGE_FALLING>; /* 53 */ + reset-gpios = <&gpio6 3 GPIO_ACTIVE_HIGH>; /* 163 */ + vio-supply = <&vio>; + vdd-supply = <&vaux1>; }; bq24150a: bq24150a@6b { diff --git a/arch/arm/boot/dts/omap3-zoom3.dts b/arch/arm/boot/dts/omap3-zoom3.dts index 0482676d1830..ce58b1f208e8 100644 --- a/arch/arm/boot/dts/omap3-zoom3.dts +++ b/arch/arm/boot/dts/omap3-zoom3.dts @@ -23,9 +23,9 @@ memory@80000000 { }; vddvario: regulator-vddvario { - compatible = "regulator-fixed"; - regulator-name = "vddvario"; - regulator-always-on; + compatible = "regulator-fixed"; + regulator-name = "vddvario"; + regulator-always-on; }; vdd33a: regulator-vdd33a { @@ -84,28 +84,28 @@ OMAP3_CORE1_IOPAD(0x21d0, PIN_INPUT_PULLUP | MUX_MODE3) /* mcspi1_cs1.sdmmc3_cmd uart1_pins: pinmux_uart1_pins { pinctrl-single,pins = < - OMAP3_CORE1_IOPAD(0x2180, PIN_INPUT | MUX_MODE0) /* uart1_cts.uart1_cts */ - OMAP3_CORE1_IOPAD(0x217e, PIN_OUTPUT | MUX_MODE0) /* uart1_rts.uart1_rts */ - OMAP3_CORE1_IOPAD(0x2182, WAKEUP_EN | PIN_INPUT | MUX_MODE0) /* uart1_rx.uart1_rx */ - OMAP3_CORE1_IOPAD(0x217c, PIN_OUTPUT | MUX_MODE0) /* uart1_tx.uart1_tx */ + OMAP3_CORE1_IOPAD(0x2180, PIN_INPUT | MUX_MODE0) /* uart1_cts.uart1_cts */ + OMAP3_CORE1_IOPAD(0x217e, PIN_OUTPUT | MUX_MODE0) /* uart1_rts.uart1_rts */ + OMAP3_CORE1_IOPAD(0x2182, WAKEUP_EN | PIN_INPUT | MUX_MODE0) /* uart1_rx.uart1_rx */ + OMAP3_CORE1_IOPAD(0x217c, PIN_OUTPUT | MUX_MODE0) /* uart1_tx.uart1_tx */ >; }; uart2_pins: pinmux_uart2_pins { pinctrl-single,pins = < - OMAP3_CORE1_IOPAD(0x2174, PIN_INPUT_PULLUP | MUX_MODE0) /* uart2_cts.uart2_cts */ - OMAP3_CORE1_IOPAD(0x2176, PIN_OUTPUT | MUX_MODE0) /* uart2_rts.uart2_rts */ - OMAP3_CORE1_IOPAD(0x217a, PIN_INPUT | MUX_MODE0) /* uart2_rx.uart2_rx */ - OMAP3_CORE1_IOPAD(0x2178, PIN_OUTPUT | MUX_MODE0) /* uart2_tx.uart2_tx */ + OMAP3_CORE1_IOPAD(0x2174, PIN_INPUT_PULLUP | MUX_MODE0) /* uart2_cts.uart2_cts */ + OMAP3_CORE1_IOPAD(0x2176, PIN_OUTPUT | MUX_MODE0) /* uart2_rts.uart2_rts */ + OMAP3_CORE1_IOPAD(0x217a, PIN_INPUT | MUX_MODE0) /* uart2_rx.uart2_rx */ + OMAP3_CORE1_IOPAD(0x2178, PIN_OUTPUT | MUX_MODE0) /* uart2_tx.uart2_tx */ >; }; uart3_pins: pinmux_uart3_pins { pinctrl-single,pins = < - OMAP3_CORE1_IOPAD(0x219a, PIN_INPUT_PULLDOWN | MUX_MODE0) /* uart3_cts_rctx.uart3_cts_rctx */ - OMAP3_CORE1_IOPAD(0x219c, PIN_OUTPUT | MUX_MODE0) /* uart3_rts_sd.uart3_rts_sd */ - OMAP3_CORE1_IOPAD(0x219e, PIN_INPUT | MUX_MODE0) /* uart3_rx_irrx.uart3_rx_irrx */ - OMAP3_CORE1_IOPAD(0x21a0, PIN_OUTPUT | MUX_MODE0) /* uart3_tx_irtx.uart3_tx_irtx */ + OMAP3_CORE1_IOPAD(0x219a, PIN_INPUT_PULLDOWN | MUX_MODE0) /* 
uart3_cts_rctx.uart3_cts_rctx */ + OMAP3_CORE1_IOPAD(0x219c, PIN_OUTPUT | MUX_MODE0) /* uart3_rts_sd.uart3_rts_sd */ + OMAP3_CORE1_IOPAD(0x219e, PIN_INPUT | MUX_MODE0) /* uart3_rx_irrx.uart3_rx_irrx */ + OMAP3_CORE1_IOPAD(0x21a0, PIN_OUTPUT | MUX_MODE0) /* uart3_tx_irtx.uart3_tx_irtx */ >; }; @@ -205,22 +205,22 @@ wlcore: wlcore@2 { }; &uart1 { - pinctrl-names = "default"; - pinctrl-0 = <&uart1_pins>; + pinctrl-names = "default"; + pinctrl-0 = <&uart1_pins>; }; &uart2 { - pinctrl-names = "default"; - pinctrl-0 = <&uart2_pins>; + pinctrl-names = "default"; + pinctrl-0 = <&uart2_pins>; }; &uart3 { - pinctrl-names = "default"; - pinctrl-0 = <&uart3_pins>; + pinctrl-names = "default"; + pinctrl-0 = <&uart3_pins>; }; &uart4 { - status = "disabled"; + status = "disabled"; }; &usb_otg_hs { diff --git a/arch/arm/boot/dts/omap4-cpu-thermal.dtsi b/arch/arm/boot/dts/omap4-cpu-thermal.dtsi index 03d054b2bf9a..c3380c4a80de 100644 --- a/arch/arm/boot/dts/omap4-cpu-thermal.dtsi +++ b/arch/arm/boot/dts/omap4-cpu-thermal.dtsi @@ -16,20 +16,20 @@ cpu_thermal: cpu_thermal { polling-delay = <1000>; /* milliseconds */ /* sensor ID */ - thermal-sensors = <&bandgap 0>; + thermal-sensors = <&bandgap 0>; cpu_trips: trips { - cpu_alert0: cpu_alert { - temperature = <100000>; /* millicelsius */ - hysteresis = <2000>; /* millicelsius */ - type = "passive"; - }; - cpu_crit: cpu_crit { - temperature = <125000>; /* millicelsius */ - hysteresis = <2000>; /* millicelsius */ - type = "critical"; - }; - }; + cpu_alert0: cpu_alert { + temperature = <100000>; /* millicelsius */ + hysteresis = <2000>; /* millicelsius */ + type = "passive"; + }; + cpu_crit: cpu_crit { + temperature = <125000>; /* millicelsius */ + hysteresis = <2000>; /* millicelsius */ + type = "critical"; + }; + }; cpu_cooling_maps: cooling-maps { map0 { diff --git a/arch/arm/boot/dts/omap5-cm-t54.dts b/arch/arm/boot/dts/omap5-cm-t54.dts index e62ea8b6d53f..af288d63a26a 100644 --- a/arch/arm/boot/dts/omap5-cm-t54.dts +++ b/arch/arm/boot/dts/omap5-cm-t54.dts @@ -84,36 +84,36 @@ led1 { }; lcd0: display { - compatible = "startek,startek-kd050c", "panel-dpi"; - label = "lcd"; + compatible = "startek,startek-kd050c", "panel-dpi"; + label = "lcd"; - pinctrl-names = "default"; - pinctrl-0 = <&lcd_pins>; + pinctrl-names = "default"; + pinctrl-0 = <&lcd_pins>; - enable-gpios = <&gpio8 3 GPIO_ACTIVE_HIGH>; + enable-gpios = <&gpio8 3 GPIO_ACTIVE_HIGH>; - panel-timing { - clock-frequency = <33000000>; - hactive = <800>; - vactive = <480>; - hfront-porch = <40>; - hback-porch = <40>; - hsync-len = <43>; - vback-porch = <29>; - vfront-porch = <13>; - vsync-len = <3>; - hsync-active = <0>; - vsync-active = <0>; - de-active = <1>; - pixelclk-active = <1>; - }; + panel-timing { + clock-frequency = <33000000>; + hactive = <800>; + vactive = <480>; + hfront-porch = <40>; + hback-porch = <40>; + hsync-len = <43>; + vback-porch = <29>; + vfront-porch = <13>; + vsync-len = <3>; + hsync-active = <0>; + vsync-active = <0>; + de-active = <1>; + pixelclk-active = <1>; + }; - port { - lcd_in: endpoint { - remote-endpoint = <&dpi_lcd_out>; - }; - }; - }; + port { + lcd_in: endpoint { + remote-endpoint = <&dpi_lcd_out>; + }; + }; + }; hdmi0: connector0 { compatible = "hdmi-connector"; @@ -644,8 +644,8 @@ &usbhsehci { }; &usb3 { - extcon = <&extcon_usb3>; - vbus-supply = <&smps10_out1_reg>; + extcon = <&extcon_usb3>; + vbus-supply = <&smps10_out1_reg>; }; &cpu0 { From 093a9a02d4d56ee030afc8de5d698303dcac43ee Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Mon, 11 Sep 2023 07:07:38 
+0300 Subject: [PATCH 093/228] ARM: dts: ti: omap: Fix bandgap thermal cells addressing for omap3/4 [ Upstream commit 6469b2feade8fd82d224dd3734e146536f3e9f0e ] Fix "thermal_sys: cpu_thermal: Failed to read thermal-sensors cells: -2" error on boot for omap3/4. This is caused by wrong addressing in the dts for bandgap sensor for single sensor instances. Note that omap4-cpu-thermal.dtsi is shared across omap4/5 and dra7, so we can't just change the addressing in omap4-cpu-thermal.dtsi. Cc: Ivaylo Dimitrov Cc: Carl Philipp Klemm Cc: Merlijn Wajer Cc: Pavel Machek Reviewed-by: Sebastian Reichel Fixes: a761d517bbb1 ("ARM: dts: omap3: Add cpu_thermal zone") Fixes: 0bbf6c54d100 ("arm: dts: add omap4 CPU thermal data") Signed-off-by: Tony Lindgren Signed-off-by: Sasha Levin --- arch/arm/boot/dts/omap3-cpu-thermal.dtsi | 3 +-- arch/arm/boot/dts/omap4-cpu-thermal.dtsi | 5 ++++- arch/arm/boot/dts/omap443x.dtsi | 1 + arch/arm/boot/dts/omap4460.dtsi | 1 + 4 files changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/arm/boot/dts/omap3-cpu-thermal.dtsi b/arch/arm/boot/dts/omap3-cpu-thermal.dtsi index 1ed837859374..51e6c2d42be2 100644 --- a/arch/arm/boot/dts/omap3-cpu-thermal.dtsi +++ b/arch/arm/boot/dts/omap3-cpu-thermal.dtsi @@ -15,8 +15,7 @@ cpu_thermal: cpu_thermal { polling-delay = <1000>; /* milliseconds */ coefficients = <0 20000>; - /* sensor ID */ - thermal-sensors = <&bandgap 0>; + thermal-sensors = <&bandgap>; cpu_trips: trips { cpu_alert0: cpu_alert { diff --git a/arch/arm/boot/dts/omap4-cpu-thermal.dtsi b/arch/arm/boot/dts/omap4-cpu-thermal.dtsi index c3380c4a80de..4b3afe298062 100644 --- a/arch/arm/boot/dts/omap4-cpu-thermal.dtsi +++ b/arch/arm/boot/dts/omap4-cpu-thermal.dtsi @@ -15,7 +15,10 @@ cpu_thermal: cpu_thermal { polling-delay-passive = <250>; /* milliseconds */ polling-delay = <1000>; /* milliseconds */ - /* sensor ID */ + /* + * See 44xx files for single sensor addressing, omap5 and dra7 need + * also sensor ID for addressing. + */ thermal-sensors = <&bandgap 0>; cpu_trips: trips { diff --git a/arch/arm/boot/dts/omap443x.dtsi b/arch/arm/boot/dts/omap443x.dtsi index dd8ef58cbaed..cce39dce1428 100644 --- a/arch/arm/boot/dts/omap443x.dtsi +++ b/arch/arm/boot/dts/omap443x.dtsi @@ -72,6 +72,7 @@ abb_iva: regulator-abb-iva { }; &cpu_thermal { + thermal-sensors = <&bandgap>; coefficients = <0 20000>; }; diff --git a/arch/arm/boot/dts/omap4460.dtsi b/arch/arm/boot/dts/omap4460.dtsi index 2d3e54901b6e..d62e2bacca18 100644 --- a/arch/arm/boot/dts/omap4460.dtsi +++ b/arch/arm/boot/dts/omap4460.dtsi @@ -89,6 +89,7 @@ abb_iva: regulator-abb-iva { }; &cpu_thermal { + thermal-sensors = <&bandgap>; coefficients = <348 (-9301)>; }; From afdc40a74ae3269f892fb47885233377e731d11c Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Wed, 30 Dec 2020 10:42:31 +0200 Subject: [PATCH 094/228] ARM: dts: motorola-mapphone: Configure lower temperature passive cooling [ Upstream commit 5c3db2d4d4ed747e714387362afe007e6ae5e2d3 ] The current cooling device temperature is too high at 100C as we have a battery on the device right next to the SoC as pointed out by Carl Philipp Klemm . Let's configure the max temperature to 80C. As we only have a tshut interrupt and no talert interrupt on 4430, we have a passive cooling device configured for 4430. However, we want the poll interval to be 10 seconds instead of 1 second for power management. The value of 10 seconds seems like plenty of time to notice the temperature increase above the 75C temperatures. 
Having the bandgap temperature change seems to take several tens of seconds because of heat dissipation above 75C range as monitored with a full CPU load. Cc: Carl Philipp Klemm Cc: Merlijn Wajer Cc: Pavel Machek Cc: Sebastian Reichel Suggested-by: Carl Philipp Klemm Signed-off-by: Tony Lindgren Stable-dep-of: ac08bda1569b ("ARM: dts: ti: omap: motorola-mapphone: Fix abe_clkctrl warning on boot") Signed-off-by: Sasha Levin --- arch/arm/boot/dts/motorola-mapphone-common.dtsi | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/arm/boot/dts/motorola-mapphone-common.dtsi b/arch/arm/boot/dts/motorola-mapphone-common.dtsi index 5f8f77cfbe59..807042a293d0 100644 --- a/arch/arm/boot/dts/motorola-mapphone-common.dtsi +++ b/arch/arm/boot/dts/motorola-mapphone-common.dtsi @@ -192,6 +192,14 @@ backlight: backlight { }; }; +&cpu_thermal { + polling-delay = <10000>; /* milliseconds */ +}; + +&cpu_alert0 { + temperature = <80000>; /* millicelsius */ +}; + &dss { status = "okay"; }; From 12a28c379ef80d5b31b5f4f17cc29c97ea7413da Mon Sep 17 00:00:00 2001 From: Carl Philipp Klemm Date: Wed, 30 Dec 2020 10:42:32 +0200 Subject: [PATCH 095/228] ARM: dts: motorola-mapphone: Add 1.2GHz OPP [ Upstream commit 19e367147ea8864dff1fb153cfab6d8e8da10324 ] The omap4430 HS HIGH performance devces support 1.2GHz opp, lower speed variants do not. However for mapphone devices Motorola seems to have decided that this does not really matter for the SoC variants they have tested to use, and decided to clock all devices, including the ones with STANDARD performance chips at 1.2GHz upon release of the 3.0.8 vendor kernel shiped with Android 4.0. Therefore it seems safe to do the same, but let's only do it for Motorola devices as the others have not been tested. Note that we prevent overheating with the passive cooling device cpu_alert0 configured in the dts file that starts lowering the speed as needed. This also removes the "failed to find current OPP for freq 1200000000" warning. Cc: Merlijn Wajer Cc: Pavel Machek Cc: Sebastian Reichel Signed-off-by: Carl Philipp Klemm [tony@atomide.com: made motorola specific, updated comments] Signed-off-by: Tony Lindgren Stable-dep-of: ac08bda1569b ("ARM: dts: ti: omap: motorola-mapphone: Fix abe_clkctrl warning on boot") Signed-off-by: Sasha Levin --- arch/arm/boot/dts/motorola-mapphone-common.dtsi | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/arch/arm/boot/dts/motorola-mapphone-common.dtsi b/arch/arm/boot/dts/motorola-mapphone-common.dtsi index 807042a293d0..ab0672131c21 100644 --- a/arch/arm/boot/dts/motorola-mapphone-common.dtsi +++ b/arch/arm/boot/dts/motorola-mapphone-common.dtsi @@ -200,6 +200,21 @@ &cpu_alert0 { temperature = <80000>; /* millicelsius */ }; +&cpu0 { + /* + * Note that the 1.2GiHz mode is enabled for all SoC variants for + * the Motorola Android Linux v3.0.8 based kernel. 
+ */ + operating-points = < + /* kHz uV */ + 300000 1025000 + 600000 1200000 + 800000 1313000 + 1008000 1375000 + 1200000 1375000 + >; +}; + &dss { status = "okay"; }; From ef83f35ced408c20da263420454510c564837544 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 3 Dec 2021 15:10:43 +0100 Subject: [PATCH 096/228] ARM: dts: motorola-mapphone: Drop second ti,wlcore compatible value [ Upstream commit 7ebe6e99f7702dad342486e5b30d989a0a6499af ] The TI wlcore DT bindings specify using a single compatible value for each variant, and the Linux kernel driver matches against the first compatible value since commit 078b30da3f074f2e ("wlcore: add wl1285 compatible") in v4.13. Signed-off-by: Geert Uytterhoeven Reviewed-by: Sebastian Reichel Signed-off-by: Tony Lindgren Stable-dep-of: ac08bda1569b ("ARM: dts: ti: omap: motorola-mapphone: Fix abe_clkctrl warning on boot") Signed-off-by: Sasha Levin --- arch/arm/boot/dts/motorola-mapphone-common.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/motorola-mapphone-common.dtsi b/arch/arm/boot/dts/motorola-mapphone-common.dtsi index ab0672131c21..4227b7f49e46 100644 --- a/arch/arm/boot/dts/motorola-mapphone-common.dtsi +++ b/arch/arm/boot/dts/motorola-mapphone-common.dtsi @@ -407,7 +407,7 @@ &mmc3 { #address-cells = <1>; #size-cells = <0>; wlcore: wlcore@2 { - compatible = "ti,wl1285", "ti,wl1283"; + compatible = "ti,wl1285"; reg = <2>; /* gpio_100 with gpmc_wait2 pad as wakeirq */ interrupts-extended = <&gpio4 4 IRQ_TYPE_LEVEL_HIGH>, From 2d9c9589da6ac5e5d5d720edcb559558ba572578 Mon Sep 17 00:00:00 2001 From: Gireesh Hiremath Date: Fri, 25 Mar 2022 10:06:05 +0000 Subject: [PATCH 097/228] ARM: dts: am335x: Guardian: Update beeper label [ Upstream commit b5bf6b434575d32aeaa70c82ec84b3cec92e2973 ] * Update lable pwm to guardian beeper Signed-off-by: Gireesh Hiremath Message-Id: <20220325100613.1494-8-Gireesh.Hiremath@in.bosch.com> Signed-off-by: Tony Lindgren Stable-dep-of: ac08bda1569b ("ARM: dts: ti: omap: motorola-mapphone: Fix abe_clkctrl warning on boot") Signed-off-by: Sasha Levin --- arch/arm/boot/dts/am335x-guardian.dts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm/boot/dts/am335x-guardian.dts b/arch/arm/boot/dts/am335x-guardian.dts index 1918766c1f80..b113edab7695 100644 --- a/arch/arm/boot/dts/am335x-guardian.dts +++ b/arch/arm/boot/dts/am335x-guardian.dts @@ -100,11 +100,11 @@ panel-info { }; - pwm7: dmtimer-pwm { + guardian_beeper: dmtimer-pwm@7 { compatible = "ti,omap-dmtimer-pwm"; ti,timers = <&timer7>; pinctrl-names = "default"; - pinctrl-0 = <&dmtimer7_pins>; + pinctrl-0 = <&guardian_beeper_pins>; ti,clock-source = <0x01>; }; @@ -343,9 +343,9 @@ AM33XX_IOPAD(0x9b4, PIN_OUTPUT_PULLDOWN | MUX_MODE3) >; }; - dmtimer7_pins: pinmux_dmtimer7_pins { + guardian_beeper_pins: pinmux_dmtimer7_pins { pinctrl-single,pins = < - AM33XX_IOPAD(0x968, PIN_OUTPUT | MUX_MODE5) + AM33XX_IOPAD(0x968, PIN_OUTPUT | MUX_MODE5) /* (E18) timer7 */ >; }; From a2a592adad7cd844f7cb072160738c00da6b60a5 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Fri, 18 Nov 2022 14:19:24 +0200 Subject: [PATCH 098/228] ARM: dts: Unify pwm-omap-dmtimer node names [ Upstream commit 4f15fc7c0f28ffcd6e9a56396db6edcdfa4c9925 ] There is no reg property for pwm-omap-dmtimer. 
Cc: Krzysztof Kozlowski Cc: Rob Herring Signed-off-by: Tony Lindgren Stable-dep-of: ac08bda1569b ("ARM: dts: ti: omap: motorola-mapphone: Fix abe_clkctrl warning on boot") Signed-off-by: Sasha Levin --- arch/arm/boot/dts/am335x-guardian.dts | 3 ++- arch/arm/boot/dts/am3517-evm.dts | 2 +- arch/arm/boot/dts/logicpd-torpedo-baseboard.dtsi | 2 +- arch/arm/boot/dts/motorola-mapphone-common.dtsi | 4 ++-- arch/arm/boot/dts/omap3-gta04.dtsi | 2 +- arch/arm/boot/dts/omap3-n900.dts | 2 +- 6 files changed, 8 insertions(+), 7 deletions(-) diff --git a/arch/arm/boot/dts/am335x-guardian.dts b/arch/arm/boot/dts/am335x-guardian.dts index b113edab7695..9594276acf9d 100644 --- a/arch/arm/boot/dts/am335x-guardian.dts +++ b/arch/arm/boot/dts/am335x-guardian.dts @@ -100,8 +100,9 @@ panel-info { }; - guardian_beeper: dmtimer-pwm@7 { + guardian_beeper: pwm-7 { compatible = "ti,omap-dmtimer-pwm"; + #pwm-cells = <3>; ti,timers = <&timer7>; pinctrl-names = "default"; pinctrl-0 = <&guardian_beeper_pins>; diff --git a/arch/arm/boot/dts/am3517-evm.dts b/arch/arm/boot/dts/am3517-evm.dts index c8b80f156ec9..9cc1ae36c420 100644 --- a/arch/arm/boot/dts/am3517-evm.dts +++ b/arch/arm/boot/dts/am3517-evm.dts @@ -150,7 +150,7 @@ bl: backlight { enable-gpios = <&gpio6 22 GPIO_ACTIVE_HIGH>; /* gpio_182 */ }; - pwm11: dmtimer-pwm@11 { + pwm11: pwm-11 { compatible = "ti,omap-dmtimer-pwm"; pinctrl-names = "default"; pinctrl-0 = <&pwm_pins>; diff --git a/arch/arm/boot/dts/logicpd-torpedo-baseboard.dtsi b/arch/arm/boot/dts/logicpd-torpedo-baseboard.dtsi index 533a47bc4a53..1386a5e63eff 100644 --- a/arch/arm/boot/dts/logicpd-torpedo-baseboard.dtsi +++ b/arch/arm/boot/dts/logicpd-torpedo-baseboard.dtsi @@ -59,7 +59,7 @@ led2 { }; }; - pwm10: dmtimer-pwm { + pwm10: pwm-10 { compatible = "ti,omap-dmtimer-pwm"; pinctrl-names = "default"; pinctrl-0 = <&pwm_pins>; diff --git a/arch/arm/boot/dts/motorola-mapphone-common.dtsi b/arch/arm/boot/dts/motorola-mapphone-common.dtsi index 4227b7f49e46..2b9ae5242cdf 100644 --- a/arch/arm/boot/dts/motorola-mapphone-common.dtsi +++ b/arch/arm/boot/dts/motorola-mapphone-common.dtsi @@ -156,7 +156,7 @@ soundcard { dais = <&mcbsp2_port>, <&mcbsp3_port>; }; - pwm8: dmtimer-pwm-8 { + pwm8: pwm-8 { pinctrl-names = "default"; pinctrl-0 = <&vibrator_direction_pin>; @@ -166,7 +166,7 @@ pwm8: dmtimer-pwm-8 { ti,clock-source = <0x01>; }; - pwm9: dmtimer-pwm-9 { + pwm9: pwm-9 { pinctrl-names = "default"; pinctrl-0 = <&vibrator_enable_pin>; diff --git a/arch/arm/boot/dts/omap3-gta04.dtsi b/arch/arm/boot/dts/omap3-gta04.dtsi index 0333ca0e87fe..68e56b50652a 100644 --- a/arch/arm/boot/dts/omap3-gta04.dtsi +++ b/arch/arm/boot/dts/omap3-gta04.dtsi @@ -147,7 +147,7 @@ backlight: backlight { pinctrl-0 = <&backlight_pins>; }; - pwm11: dmtimer-pwm { + pwm11: pwm-11 { compatible = "ti,omap-dmtimer-pwm"; ti,timers = <&timer11>; #pwm-cells = <3>; diff --git a/arch/arm/boot/dts/omap3-n900.dts b/arch/arm/boot/dts/omap3-n900.dts index fdd929bc65d7..7dafd69b7d35 100644 --- a/arch/arm/boot/dts/omap3-n900.dts +++ b/arch/arm/boot/dts/omap3-n900.dts @@ -156,7 +156,7 @@ battery: n900-battery { io-channel-names = "temp", "bsi", "vbat"; }; - pwm9: dmtimer-pwm { + pwm9: pwm-9 { compatible = "ti,omap-dmtimer-pwm"; #pwm-cells = <3>; ti,timers = <&timer9>; From 1bc88671960f561dc87cdaf075d60cad9aafbc38 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Mon, 11 Sep 2023 07:07:38 +0300 Subject: [PATCH 099/228] ARM: dts: ti: omap: motorola-mapphone: Fix abe_clkctrl warning on boot [ Upstream commit ac08bda1569b06b7a62c7b4dd00d4c3b28ceaaec ] 
Commit 0840242e8875 ("ARM: dts: Configure clock parent for pwm vibra") attempted to fix the PWM settings but ended up causin an additional clock reparenting error: clk: failed to reparent abe-clkctrl:0060:24 to sys_clkin_ck: -22 Only timer9 is in the PER domain and can use the sys_clkin_ck clock source. For timer8, the there is no sys_clkin_ck available as it's in the ABE domain, instead it should use syc_clk_div_ck. However, for power management, we want to use the always on sys_32k_ck instead. Cc: Ivaylo Dimitrov Cc: Carl Philipp Klemm Cc: Merlijn Wajer Cc: Pavel Machek Reviewed-by: Sebastian Reichel Fixes: 0840242e8875 ("ARM: dts: Configure clock parent for pwm vibra") Depends-on: 61978617e905 ("ARM: dts: Add minimal support for Droid Bionic xt875") Signed-off-by: Tony Lindgren Signed-off-by: Sasha Levin --- arch/arm/boot/dts/motorola-mapphone-common.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/boot/dts/motorola-mapphone-common.dtsi b/arch/arm/boot/dts/motorola-mapphone-common.dtsi index 2b9ae5242cdf..8cb26b924d3c 100644 --- a/arch/arm/boot/dts/motorola-mapphone-common.dtsi +++ b/arch/arm/boot/dts/motorola-mapphone-common.dtsi @@ -739,12 +739,12 @@ &rng_target { /* Configure pwm clock source for timers 8 & 9 */ &timer8 { assigned-clocks = <&abe_clkctrl OMAP4_TIMER8_CLKCTRL 24>; - assigned-clock-parents = <&sys_clkin_ck>; + assigned-clock-parents = <&sys_32k_ck>; }; &timer9 { assigned-clocks = <&l4_per_clkctrl OMAP4_TIMER9_CLKCTRL 24>; - assigned-clock-parents = <&sys_clkin_ck>; + assigned-clock-parents = <&sys_32k_ck>; }; /* From eff55feb8b871a7ab0a2a6033b60949c319ecd3b Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Thu, 7 Sep 2023 08:53:28 +0300 Subject: [PATCH 100/228] bus: ti-sysc: Fix SYSC_QUIRK_SWSUP_SIDLE_ACT handling for uart wake-up [ Upstream commit e5deb8f76e64d94ccef715e75ebafffd0c312d80 ] The uarts should be tagged with SYSC_QUIRK_SWSUP_SIDLE instead of SYSC_QUIRK_SWSUP_SIDLE_ACT. The difference is that SYSC_QUIRK_SWSUP_SIDLE is used to force idle target modules rather than block idle during usage. The SYSC_QUIRK_SWSUP_SIDLE_ACT should disable autoidle and wake-up when a target module is active, and configure autoidle and wake-up when a target module is inactive. We are missing configuring the target module on sysc_disable_module(), and missing toggling of the wake-up bit. Let's fix the issue to allow uart wake-up to work. 
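In other words, with SYSC_QUIRK_SWSUP_SIDLE_ACT the wake-up enable bit should be toggled together with the module state. The following standalone sketch only distills that intent (simplified names and a made-up bit position); the real handling conditionally toggles the module's ENAWAKEUP bit in the SYSCONFIG register as shown in the ti-sysc.c hunks below.

/* Simplified sketch of the intended SYSC_QUIRK_SWSUP_SIDLE_ACT behaviour. */
#define SKETCH_ENAWAKEUP        (1U << 2)       /* bit position is module specific */

static unsigned int sketch_sysconfig_on_enable(unsigned int sysconfig)
{
        /* module in use: no-idle, wake-up events not needed */
        return sysconfig & ~SKETCH_ENAWAKEUP;
}

static unsigned int sketch_sysconfig_on_disable(unsigned int sysconfig)
{
        /* module idle: arm wake-up so activity (e.g. uart RX) can wake it */
        return sysconfig | SKETCH_ENAWAKEUP;
}
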
Fixes: fb685f1c190e ("bus: ti-sysc: Handle swsup idle mode quirks") Tested-by: Dhruva Gole Tested-by: Kevin Hilman Signed-off-by: Tony Lindgren Signed-off-by: Sasha Levin --- drivers/bus/ti-sysc.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/drivers/bus/ti-sysc.c b/drivers/bus/ti-sysc.c index 5dba06ed61bf..ef8c7bfd79a8 100644 --- a/drivers/bus/ti-sysc.c +++ b/drivers/bus/ti-sysc.c @@ -1114,6 +1114,11 @@ static int sysc_enable_module(struct device *dev) if (ddata->cfg.quirks & (SYSC_QUIRK_SWSUP_SIDLE | SYSC_QUIRK_SWSUP_SIDLE_ACT)) { best_mode = SYSC_IDLE_NO; + + /* Clear WAKEUP */ + if (regbits->enwkup_shift >= 0 && + ddata->cfg.sysc_val & BIT(regbits->enwkup_shift)) + reg &= ~BIT(regbits->enwkup_shift); } else { best_mode = fls(ddata->cfg.sidlemodes) - 1; if (best_mode > SYSC_IDLE_MASK) { @@ -1234,6 +1239,13 @@ static int sysc_disable_module(struct device *dev) } } + if (ddata->cfg.quirks & SYSC_QUIRK_SWSUP_SIDLE_ACT) { + /* Set WAKEUP */ + if (regbits->enwkup_shift >= 0 && + ddata->cfg.sysc_val & BIT(regbits->enwkup_shift)) + reg |= BIT(regbits->enwkup_shift); + } + reg &= ~(SYSC_IDLE_MASK << regbits->sidle_shift); reg |= best_mode << regbits->sidle_shift; if (regbits->autoidle_shift >= 0 && @@ -1497,16 +1509,16 @@ static const struct sysc_revision_quirk sysc_revision_quirks[] = { SYSC_QUIRK("smartreflex", 0, -ENODEV, 0x38, -ENODEV, 0x00000000, 0xffffffff, SYSC_QUIRK_LEGACY_IDLE), SYSC_QUIRK("uart", 0, 0x50, 0x54, 0x58, 0x00000046, 0xffffffff, - SYSC_QUIRK_SWSUP_SIDLE | SYSC_QUIRK_LEGACY_IDLE), + SYSC_QUIRK_SWSUP_SIDLE_ACT | SYSC_QUIRK_LEGACY_IDLE), SYSC_QUIRK("uart", 0, 0x50, 0x54, 0x58, 0x00000052, 0xffffffff, - SYSC_QUIRK_SWSUP_SIDLE | SYSC_QUIRK_LEGACY_IDLE), + SYSC_QUIRK_SWSUP_SIDLE_ACT | SYSC_QUIRK_LEGACY_IDLE), /* Uarts on omap4 and later */ SYSC_QUIRK("uart", 0, 0x50, 0x54, 0x58, 0x50411e03, 0xffff00ff, - SYSC_QUIRK_SWSUP_SIDLE | SYSC_QUIRK_LEGACY_IDLE), + SYSC_QUIRK_SWSUP_SIDLE_ACT | SYSC_QUIRK_LEGACY_IDLE), SYSC_QUIRK("uart", 0, 0x50, 0x54, 0x58, 0x47422e03, 0xffffffff, - SYSC_QUIRK_SWSUP_SIDLE | SYSC_QUIRK_LEGACY_IDLE), + SYSC_QUIRK_SWSUP_SIDLE_ACT | SYSC_QUIRK_LEGACY_IDLE), SYSC_QUIRK("uart", 0, 0x50, 0x54, 0x58, 0x47424e03, 0xffffffff, - SYSC_QUIRK_SWSUP_SIDLE | SYSC_QUIRK_LEGACY_IDLE), + SYSC_QUIRK_SWSUP_SIDLE_ACT | SYSC_QUIRK_LEGACY_IDLE), /* Quirks that need to be set based on the module address */ SYSC_QUIRK("mcpdm", 0x40132000, 0, 0x10, -ENODEV, 0x50000800, 0xffffffff, From 3696261859c5acca841df14783af4df9ea999966 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 19 Jun 2023 12:44:17 +0300 Subject: [PATCH 101/228] power: supply: ucs1002: fix error code in ucs1002_get_property() [ Upstream commit e35059949daa83f8dadf710d0f829ab3c3a72fe2 ] This function is supposed to return 0 for success instead of returning the val->intval. This makes it the same as the other case statements in this function. 
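The bug class, shown as a minimal standalone sketch (not the driver source): a power-supply property getter must pass the value back through the out-parameter and reserve its return value for an error code, otherwise a non-zero property value is misinterpreted as a failure by the core.

/* Minimal sketch of the bug class; not the ucs1002 source. */
struct sketch_propval { int intval; };

/* Wrong: leaks the property value into the error-code channel, so a
 * non-zero health value is reported as an error by the caller. */
static int get_health_buggy(struct sketch_propval *val, int health)
{
        return val->intval = health;
}

/* Right: fill the out-parameter, return 0 for success. */
static int get_health_fixed(struct sketch_propval *val, int health)
{
        val->intval = health;
        return 0;
}
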
Fixes: 81196e2e57fc ("power: supply: ucs1002: fix some health status issues") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/687f64a4-4c6e-4536-8204-98ad1df934e5@moroto.mountain Signed-off-by: Sebastian Reichel Signed-off-by: Sasha Levin --- drivers/power/supply/ucs1002_power.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/power/supply/ucs1002_power.c b/drivers/power/supply/ucs1002_power.c index ef673ec3db56..332cb50d9fb4 100644 --- a/drivers/power/supply/ucs1002_power.c +++ b/drivers/power/supply/ucs1002_power.c @@ -384,7 +384,8 @@ static int ucs1002_get_property(struct power_supply *psy, case POWER_SUPPLY_PROP_USB_TYPE: return ucs1002_get_usb_type(info, val); case POWER_SUPPLY_PROP_HEALTH: - return val->intval = info->health; + val->intval = info->health; + return 0; case POWER_SUPPLY_PROP_PRESENT: val->intval = info->present; return 0; From 608af5511a8f9f247f405c8471c17ed52fe405ec Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Wed, 20 Sep 2023 04:15:22 -0700 Subject: [PATCH 102/228] xtensa: add default definition for XCHAL_HAVE_DIV32 [ Upstream commit 494e87ffa0159b3f879694a9231089707792a44d ] When variant FSF is set, XCHAL_HAVE_DIV32 is not defined. Add default definition for that macro to prevent build warnings: arch/xtensa/lib/divsi3.S:9:5: warning: "XCHAL_HAVE_DIV32" is not defined, evaluates to 0 [-Wundef] 9 | #if XCHAL_HAVE_DIV32 arch/xtensa/lib/modsi3.S:9:5: warning: "XCHAL_HAVE_DIV32" is not defined, evaluates to 0 [-Wundef] 9 | #if XCHAL_HAVE_DIV32 Fixes: 173d6681380a ("xtensa: remove extra header files") Suggested-by: Randy Dunlap Signed-off-by: Max Filippov Reported-by: kernel test robot Closes: lore.kernel.org/r/202309150556.t0yCdv3g-lkp@intel.com Signed-off-by: Sasha Levin --- arch/xtensa/include/asm/core.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/xtensa/include/asm/core.h b/arch/xtensa/include/asm/core.h index a4e40166ff4b..0fa3649649e9 100644 --- a/arch/xtensa/include/asm/core.h +++ b/arch/xtensa/include/asm/core.h @@ -6,6 +6,10 @@ #include +#ifndef XCHAL_HAVE_DIV32 +#define XCHAL_HAVE_DIV32 0 +#endif + #ifndef XCHAL_HAVE_EXCLUSIVE #define XCHAL_HAVE_EXCLUSIVE 0 #endif From a10bfbe599b7b8def7368789f215cc57e52b9b2d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 19 Sep 2023 22:21:36 -0700 Subject: [PATCH 103/228] xtensa: iss/network: make functions static [ Upstream commit 1b59efeb59851277266318f4e0132aa61ce3455e ] Make 2 functions static to prevent build warnings: arch/xtensa/platforms/iss/network.c:204:16: warning: no previous prototype for 'tuntap_protocol' [-Wmissing-prototypes] 204 | unsigned short tuntap_protocol(struct sk_buff *skb) arch/xtensa/platforms/iss/network.c:444:6: warning: no previous prototype for 'iss_net_user_timer_expire' [-Wmissing-prototypes] 444 | void iss_net_user_timer_expire(struct timer_list *unused) Fixes: 7282bee78798 ("xtensa: Architecture support for Tensilica Xtensa Part 8") Fixes: d8479a21a98b ("xtensa: Convert timers to use timer_setup()") Signed-off-by: Randy Dunlap Cc: Chris Zankel Cc: Max Filippov Message-Id: <20230920052139.10570-14-rdunlap@infradead.org> Signed-off-by: Max Filippov Signed-off-by: Sasha Levin --- arch/xtensa/platforms/iss/network.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/xtensa/platforms/iss/network.c b/arch/xtensa/platforms/iss/network.c index 1270de83435e..e8491ac0d5b9 100644 --- a/arch/xtensa/platforms/iss/network.c +++ b/arch/xtensa/platforms/iss/network.c @@ -204,7 +204,7 @@ static int tuntap_write(struct 
iss_net_private *lp, struct sk_buff **skb) return simc_write(lp->tp.info.tuntap.fd, (*skb)->data, (*skb)->len); } -unsigned short tuntap_protocol(struct sk_buff *skb) +static unsigned short tuntap_protocol(struct sk_buff *skb) { return eth_type_trans(skb, skb->dev); } @@ -477,7 +477,7 @@ static int iss_net_change_mtu(struct net_device *dev, int new_mtu) return -EINVAL; } -void iss_net_user_timer_expire(struct timer_list *unused) +static void iss_net_user_timer_expire(struct timer_list *unused) { } From be17dfdcc87a54c818524cabc3e1a206988b7e83 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 19 Sep 2023 22:21:37 -0700 Subject: [PATCH 104/228] xtensa: boot: don't add include-dirs [ Upstream commit 54d3d7d363823782c3444ddc41bb8cf1edc80514 ] Drop the -I options to prevent build warnings since there is not boot/include directory: cc1: warning: arch/xtensa/boot/include: No such file or directory [-Wmissing-include-dirs] Fixes: 437374e9a950 ("restore arch/{ppc/xtensa}/boot cflags") Fixes: 4bedea945451 ("xtensa: Architecture support for Tensilica Xtensa Part 2") Signed-off-by: Randy Dunlap Cc: Chris Zankel Cc: Max Filippov Message-Id: <20230920052139.10570-15-rdunlap@infradead.org> Signed-off-by: Max Filippov Signed-off-by: Sasha Levin --- arch/xtensa/boot/Makefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/xtensa/boot/Makefile b/arch/xtensa/boot/Makefile index f6bb352f94b4..c8fd705d08b2 100644 --- a/arch/xtensa/boot/Makefile +++ b/arch/xtensa/boot/Makefile @@ -9,8 +9,7 @@ # KBUILD_CFLAGS used when building rest of boot (takes effect recursively) -KBUILD_CFLAGS += -fno-builtin -Iarch/$(ARCH)/boot/include -HOSTFLAGS += -Iarch/$(ARCH)/boot/include +KBUILD_CFLAGS += -fno-builtin BIG_ENDIAN := $(shell echo __XTENSA_EB__ | $(CC) -E - | grep -v "\#") From b317f69871ef38e1ecc2381fd021917f25c50cf3 Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Wed, 20 Sep 2023 04:41:09 -0700 Subject: [PATCH 105/228] xtensa: boot/lib: fix function prototypes [ Upstream commit f54d02c8f2cc4b46ba2a3bd8252a6750453b6f2b ] Add function prototype for gunzip() to the boot library code and make exit() and zalloc() static. 
arch/xtensa/boot/lib/zmem.c:8:6: warning: no previous prototype for 'exit' [-Wmissing-prototypes] 8 | void exit (void) arch/xtensa/boot/lib/zmem.c:13:7: warning: no previous prototype for 'zalloc' [-Wmissing-prototypes] 13 | void *zalloc(unsigned size) arch/xtensa/boot/lib/zmem.c:35:6: warning: no previous prototype for 'gunzip' [-Wmissing-prototypes] 35 | void gunzip (void *dst, int dstlen, unsigned char *src, int *lenp) Fixes: 4bedea945451 ("xtensa: Architecture support for Tensilica Xtensa Part 2") Fixes: e7d163f76665 ("xtensa: Removed local copy of zlib and fixed O= support") Suggested-by: Randy Dunlap Signed-off-by: Max Filippov Signed-off-by: Sasha Levin --- arch/xtensa/boot/lib/zmem.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/xtensa/boot/lib/zmem.c b/arch/xtensa/boot/lib/zmem.c index e3ecd743c515..b89189355122 100644 --- a/arch/xtensa/boot/lib/zmem.c +++ b/arch/xtensa/boot/lib/zmem.c @@ -4,13 +4,14 @@ /* bits taken from ppc */ extern void *avail_ram, *end_avail; +void gunzip(void *dst, int dstlen, unsigned char *src, int *lenp); -void exit (void) +static void exit(void) { for (;;); } -void *zalloc(unsigned size) +static void *zalloc(unsigned int size) { void *p = avail_ram; From dd81e91b2efc73ee09c867a31d9f964b19c8e90e Mon Sep 17 00:00:00 2001 From: Wenhua Lin Date: Thu, 21 Sep 2023 20:25:27 +0800 Subject: [PATCH 106/228] gpio: pmic-eic-sprd: Add can_sleep flag for PMIC EIC chip [ Upstream commit 26d9e5640d2130ee16df7b1fb6a908f460ab004c ] The drivers uses a mutex and I2C bus access in its PMIC EIC chip get implementation. This means these functions can sleep and the PMIC EIC chip should set the can_sleep property to true. This will ensure that a warning is printed when trying to get the value from a context that potentially can't sleep. Fixes: 348f3cde84ab ("gpio: Add Spreadtrum PMIC EIC driver support") Signed-off-by: Wenhua Lin Signed-off-by: Bartosz Golaszewski Signed-off-by: Sasha Levin --- drivers/gpio/gpio-pmic-eic-sprd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpio/gpio-pmic-eic-sprd.c b/drivers/gpio/gpio-pmic-eic-sprd.c index 938285190566..e969ce9131dd 100644 --- a/drivers/gpio/gpio-pmic-eic-sprd.c +++ b/drivers/gpio/gpio-pmic-eic-sprd.c @@ -338,6 +338,7 @@ static int sprd_pmic_eic_probe(struct platform_device *pdev) pmic_eic->chip.set_config = sprd_pmic_eic_set_config; pmic_eic->chip.set = sprd_pmic_eic_set; pmic_eic->chip.get = sprd_pmic_eic_get; + pmic_eic->chip.can_sleep = true; pmic_eic->intc.name = dev_name(&pdev->dev); pmic_eic->intc.irq_mask = sprd_pmic_eic_irq_mask; From 061f4027533827978daa432e31d2107d07e1f308 Mon Sep 17 00:00:00 2001 From: "William A. Kennington III" Date: Sat, 23 Sep 2023 18:02:14 -0700 Subject: [PATCH 107/228] i2c: npcm7xx: Fix callback completion ordering [ Upstream commit 92e73d807b68b2214fcafca4e130b5300a9d4b3c ] Sometimes, our completions race with new master transfers and override the bus->operation and bus->master_or_slave variables. This causes transactions to timeout and kernel crashes less frequently. To remedy this, we re-order all completions to the very end of the function. Fixes: 56a1485b102e ("i2c: npcm7xx: Add Nuvoton NPCM I2C controller driver") Signed-off-by: William A. 
Kennington III Reviewed-by: Tali Perry Signed-off-by: Wolfram Sang Signed-off-by: Sasha Levin --- drivers/i2c/busses/i2c-npcm7xx.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/drivers/i2c/busses/i2c-npcm7xx.c b/drivers/i2c/busses/i2c-npcm7xx.c index c1b679737240..73c808ef1bfe 100644 --- a/drivers/i2c/busses/i2c-npcm7xx.c +++ b/drivers/i2c/busses/i2c-npcm7xx.c @@ -675,6 +675,7 @@ static void npcm_i2c_callback(struct npcm_i2c *bus, { struct i2c_msg *msgs; int msgs_num; + bool do_complete = false; msgs = bus->msgs; msgs_num = bus->msgs_num; @@ -701,23 +702,17 @@ static void npcm_i2c_callback(struct npcm_i2c *bus, msgs[1].flags & I2C_M_RD) msgs[1].len = info; } - if (completion_done(&bus->cmd_complete) == false) - complete(&bus->cmd_complete); - break; - + do_complete = true; + break; case I2C_NACK_IND: /* MASTER transmit got a NACK before tx all bytes */ bus->cmd_err = -ENXIO; - if (bus->master_or_slave == I2C_MASTER) - complete(&bus->cmd_complete); - + do_complete = true; break; case I2C_BUS_ERR_IND: /* Bus error */ bus->cmd_err = -EAGAIN; - if (bus->master_or_slave == I2C_MASTER) - complete(&bus->cmd_complete); - + do_complete = true; break; case I2C_WAKE_UP_IND: /* I2C wake up */ @@ -731,6 +726,8 @@ static void npcm_i2c_callback(struct npcm_i2c *bus, if (bus->slave) bus->master_or_slave = I2C_SLAVE; #endif + if (do_complete) + complete(&bus->cmd_complete); } static u8 npcm_i2c_fifo_usage(struct npcm_i2c *bus) From c79300599923daaa30f417c75555d5566b3d31ae Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 16 Aug 2023 11:32:21 +0900 Subject: [PATCH 108/228] dma-debug: don't call __dma_entry_alloc_check_leak() under free_entries_lock [ Upstream commit fb5a4315591dae307a65fc246ca80b5159d296e1 ] __dma_entry_alloc_check_leak() calls into printk -> serial console output (qcom geni) and grabs port->lock under free_entries_lock spin lock, which is a reverse locking dependency chain as qcom_geni IRQ handler can call into dma-debug code and grab free_entries_lock under port->lock. Move __dma_entry_alloc_check_leak() call out of free_entries_lock scope so that we don't acquire serial console's port->lock under it. Trimmed-down lockdep splat: The existing dependency chain (in reverse order) is: -> #2 (free_entries_lock){-.-.}-{2:2}: _raw_spin_lock_irqsave+0x60/0x80 dma_entry_alloc+0x38/0x110 debug_dma_map_page+0x60/0xf8 dma_map_page_attrs+0x1e0/0x230 dma_map_single_attrs.constprop.0+0x6c/0xc8 geni_se_rx_dma_prep+0x40/0xcc qcom_geni_serial_isr+0x310/0x510 __handle_irq_event_percpu+0x110/0x244 handle_irq_event_percpu+0x20/0x54 handle_irq_event+0x50/0x88 handle_fasteoi_irq+0xa4/0xcc handle_irq_desc+0x28/0x40 generic_handle_domain_irq+0x24/0x30 gic_handle_irq+0xc4/0x148 do_interrupt_handler+0xa4/0xb0 el1_interrupt+0x34/0x64 el1h_64_irq_handler+0x18/0x24 el1h_64_irq+0x64/0x68 arch_local_irq_enable+0x4/0x8 ____do_softirq+0x18/0x24 ... 
-> #1 (&port_lock_key){-.-.}-{2:2}: _raw_spin_lock_irqsave+0x60/0x80 qcom_geni_serial_console_write+0x184/0x1dc console_flush_all+0x344/0x454 console_unlock+0x94/0xf0 vprintk_emit+0x238/0x24c vprintk_default+0x3c/0x48 vprintk+0xb4/0xbc _printk+0x68/0x90 register_console+0x230/0x38c uart_add_one_port+0x338/0x494 qcom_geni_serial_probe+0x390/0x424 platform_probe+0x70/0xc0 really_probe+0x148/0x280 __driver_probe_device+0xfc/0x114 driver_probe_device+0x44/0x100 __device_attach_driver+0x64/0xdc bus_for_each_drv+0xb0/0xd8 __device_attach+0xe4/0x140 device_initial_probe+0x1c/0x28 bus_probe_device+0x44/0xb0 device_add+0x538/0x668 of_device_add+0x44/0x50 of_platform_device_create_pdata+0x94/0xc8 of_platform_bus_create+0x270/0x304 of_platform_populate+0xac/0xc4 devm_of_platform_populate+0x60/0xac geni_se_probe+0x154/0x160 platform_probe+0x70/0xc0 ... -> #0 (console_owner){-...}-{0:0}: __lock_acquire+0xdf8/0x109c lock_acquire+0x234/0x284 console_flush_all+0x330/0x454 console_unlock+0x94/0xf0 vprintk_emit+0x238/0x24c vprintk_default+0x3c/0x48 vprintk+0xb4/0xbc _printk+0x68/0x90 dma_entry_alloc+0xb4/0x110 debug_dma_map_sg+0xdc/0x2f8 __dma_map_sg_attrs+0xac/0xe4 dma_map_sgtable+0x30/0x4c get_pages+0x1d4/0x1e4 [msm] msm_gem_pin_pages_locked+0x38/0xac [msm] msm_gem_pin_vma_locked+0x58/0x88 [msm] msm_ioctl_gem_submit+0xde4/0x13ac [msm] drm_ioctl_kernel+0xe0/0x15c drm_ioctl+0x2e8/0x3f4 vfs_ioctl+0x30/0x50 ... Chain exists of: console_owner --> &port_lock_key --> free_entries_lock Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(free_entries_lock); lock(&port_lock_key); lock(free_entries_lock); lock(console_owner); *** DEADLOCK *** Call trace: dump_backtrace+0xb4/0xf0 show_stack+0x20/0x30 dump_stack_lvl+0x60/0x84 dump_stack+0x18/0x24 print_circular_bug+0x1cc/0x234 check_noncircular+0x78/0xac __lock_acquire+0xdf8/0x109c lock_acquire+0x234/0x284 console_flush_all+0x330/0x454 console_unlock+0x94/0xf0 vprintk_emit+0x238/0x24c vprintk_default+0x3c/0x48 vprintk+0xb4/0xbc _printk+0x68/0x90 dma_entry_alloc+0xb4/0x110 debug_dma_map_sg+0xdc/0x2f8 __dma_map_sg_attrs+0xac/0xe4 dma_map_sgtable+0x30/0x4c get_pages+0x1d4/0x1e4 [msm] msm_gem_pin_pages_locked+0x38/0xac [msm] msm_gem_pin_vma_locked+0x58/0x88 [msm] msm_ioctl_gem_submit+0xde4/0x13ac [msm] drm_ioctl_kernel+0xe0/0x15c drm_ioctl+0x2e8/0x3f4 vfs_ioctl+0x30/0x50 ... Reported-by: Rob Clark Signed-off-by: Sergey Senozhatsky Acked-by: Robin Murphy Signed-off-by: Christoph Hellwig Signed-off-by: Sasha Levin --- kernel/dma/debug.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index ae9fc1ee6d20..026398308909 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -606,15 +606,19 @@ static struct dma_debug_entry *__dma_entry_alloc(void) return entry; } -static void __dma_entry_alloc_check_leak(void) +/* + * This should be called outside of free_entries_lock scope to avoid potential + * deadlocks with serial consoles that use DMA. 
+ */ +static void __dma_entry_alloc_check_leak(u32 nr_entries) { - u32 tmp = nr_total_entries % nr_prealloc_entries; + u32 tmp = nr_entries % nr_prealloc_entries; /* Shout each time we tick over some multiple of the initial pool */ if (tmp < DMA_DEBUG_DYNAMIC_ENTRIES) { pr_info("dma_debug_entry pool grown to %u (%u00%%)\n", - nr_total_entries, - (nr_total_entries / nr_prealloc_entries)); + nr_entries, + (nr_entries / nr_prealloc_entries)); } } @@ -625,8 +629,10 @@ static void __dma_entry_alloc_check_leak(void) */ static struct dma_debug_entry *dma_entry_alloc(void) { + bool alloc_check_leak = false; struct dma_debug_entry *entry; unsigned long flags; + u32 nr_entries; spin_lock_irqsave(&free_entries_lock, flags); if (num_free_entries == 0) { @@ -636,13 +642,17 @@ static struct dma_debug_entry *dma_entry_alloc(void) pr_err("debugging out of memory - disabling\n"); return NULL; } - __dma_entry_alloc_check_leak(); + alloc_check_leak = true; + nr_entries = nr_total_entries; } entry = __dma_entry_alloc(); spin_unlock_irqrestore(&free_entries_lock, flags); + if (alloc_check_leak) + __dma_entry_alloc_check_leak(nr_entries); + #ifdef CONFIG_STACKTRACE entry->stack_len = stack_trace_save(entry->stack_entries, ARRAY_SIZE(entry->stack_entries), From 7b2440c2d64f2d418fb726e3432ee46dec96c2e2 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Wed, 30 Aug 2023 08:10:01 +0200 Subject: [PATCH 109/228] parisc: sba: Fix compile warning wrt list of SBA devices [ Upstream commit eb3255ee8f6f4691471a28fbf22db5e8901116cd ] Fix this makecheck warning: drivers/parisc/sba_iommu.c:98:19: warning: symbol 'sba_list' was not declared. Should it be static? Signed-off-by: Helge Deller Signed-off-by: Sasha Levin --- arch/parisc/include/asm/ropes.h | 3 +++ drivers/char/agp/parisc-agp.c | 2 -- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/parisc/include/asm/ropes.h b/arch/parisc/include/asm/ropes.h index 8e51c775c80a..62399c7ea94a 100644 --- a/arch/parisc/include/asm/ropes.h +++ b/arch/parisc/include/asm/ropes.h @@ -86,6 +86,9 @@ struct sba_device { struct ioc ioc[MAX_IOC]; }; +/* list of SBA's in system, see drivers/parisc/sba_iommu.c */ +extern struct sba_device *sba_list; + #define ASTRO_RUNWAY_PORT 0x582 #define IKE_MERCED_PORT 0x803 #define REO_MERCED_PORT 0x804 diff --git a/drivers/char/agp/parisc-agp.c b/drivers/char/agp/parisc-agp.c index 514f9f287a78..c6f181702b9a 100644 --- a/drivers/char/agp/parisc-agp.c +++ b/drivers/char/agp/parisc-agp.c @@ -394,8 +394,6 @@ find_quicksilver(struct device *dev, void *data) static int __init parisc_agp_init(void) { - extern struct sba_device *sba_list; - int err = -1; struct parisc_device *sba = NULL, *lba = NULL; struct lba_device *lbadev = NULL; From d967a9472bf9c94b2d2d28bb4aa6ab49d684ac17 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Wed, 30 Aug 2023 11:59:55 +0200 Subject: [PATCH 110/228] parisc: iosapic.c: Fix sparse warnings [ Upstream commit 927c6c8aa27c284a799b8c18784e37d3373af908 ] Signed-off-by: Helge Deller Signed-off-by: Sasha Levin --- drivers/parisc/iosapic.c | 4 ++-- drivers/parisc/iosapic_private.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/parisc/iosapic.c b/drivers/parisc/iosapic.c index fd99735dca3e..6ef663bbcdb0 100644 --- a/drivers/parisc/iosapic.c +++ b/drivers/parisc/iosapic.c @@ -202,9 +202,9 @@ static inline void iosapic_write(void __iomem *iosapic, unsigned int reg, u32 va static DEFINE_SPINLOCK(iosapic_lock); -static inline void iosapic_eoi(void __iomem *addr, unsigned int data) +static inline 
void iosapic_eoi(__le32 __iomem *addr, __le32 data) { - __raw_writel(data, addr); + __raw_writel((__force u32)data, addr); } /* diff --git a/drivers/parisc/iosapic_private.h b/drivers/parisc/iosapic_private.h index 73ecc657ad95..bd8ff40162b4 100644 --- a/drivers/parisc/iosapic_private.h +++ b/drivers/parisc/iosapic_private.h @@ -118,8 +118,8 @@ struct iosapic_irt { struct vector_info { struct iosapic_info *iosapic; /* I/O SAPIC this vector is on */ struct irt_entry *irte; /* IRT entry */ - u32 __iomem *eoi_addr; /* precalculate EOI reg address */ - u32 eoi_data; /* IA64: ? PA: swapped txn_data */ + __le32 __iomem *eoi_addr; /* precalculate EOI reg address */ + __le32 eoi_data; /* IA64: ? PA: swapped txn_data */ int txn_irq; /* virtual IRQ number for processor */ ulong txn_addr; /* IA64: id_eid PA: partial HPA */ u32 txn_data; /* CPU interrupt bit */ From f47efdffdc130c53a3ea08fe46372e60af471188 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Thu, 31 Aug 2023 22:08:32 +0200 Subject: [PATCH 111/228] parisc: drivers: Fix sparse warning [ Upstream commit b137b9d60b8add5620a06c687a71ce18776730b0 ] Fix "warning: directive in macro's argument list" warning. Signed-off-by: Helge Deller Signed-off-by: Sasha Levin --- arch/parisc/kernel/drivers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/parisc/kernel/drivers.c b/arch/parisc/kernel/drivers.c index d95157488832..d11a3123f3dc 100644 --- a/arch/parisc/kernel/drivers.c +++ b/arch/parisc/kernel/drivers.c @@ -925,9 +925,9 @@ static __init void qemu_header(void) pr_info("#define PARISC_MODEL \"%s\"\n\n", boot_cpu_data.pdc.sys_model_name); + #define p ((unsigned long *)&boot_cpu_data.pdc.model) pr_info("#define PARISC_PDC_MODEL 0x%lx, 0x%lx, 0x%lx, " "0x%lx, 0x%lx, 0x%lx, 0x%lx, 0x%lx, 0x%lx\n\n", - #define p ((unsigned long *)&boot_cpu_data.pdc.model) p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8]); #undef p From 2081b2a15b08933273f887e9b4018cf90599fa17 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Thu, 31 Aug 2023 22:36:12 +0200 Subject: [PATCH 112/228] parisc: irq: Make irq_stack_union static to avoid sparse warning [ Upstream commit b1bef1388c427cdad7331a9c8eb4ebbbe5b954b0 ] Signed-off-by: Helge Deller Signed-off-by: Sasha Levin --- arch/parisc/kernel/irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/parisc/kernel/irq.c b/arch/parisc/kernel/irq.c index 60f5829d476f..2762e8540672 100644 --- a/arch/parisc/kernel/irq.c +++ b/arch/parisc/kernel/irq.c @@ -388,7 +388,7 @@ union irq_stack_union { volatile unsigned int lock[1]; }; -DEFINE_PER_CPU(union irq_stack_union, irq_stack_union) = { +static DEFINE_PER_CPU(union irq_stack_union, irq_stack_union) = { .slock = { 1,1,1,1 }, }; #endif From 38ef4b2e4dca1770261d5eb88ba592727f2c24bb Mon Sep 17 00:00:00 2001 From: Javed Hasan Date: Fri, 1 Sep 2023 11:36:46 +0530 Subject: [PATCH 113/228] scsi: qedf: Add synchronization between I/O completions and abort [ Upstream commit 7df0b2605489bef3f4223ad66f1f9bb8d50d4cd2 ] Avoid race condition between I/O completion and abort processing by protecting the cmd_type with the rport lock. Signed-off-by: Javed Hasan Signed-off-by: Saurav Kashyap Link: https://lore.kernel.org/r/20230901060646.27885-1-skashyap@marvell.com Signed-off-by: Martin K. 
Petersen Signed-off-by: Sasha Levin --- drivers/scsi/qedf/qedf_io.c | 10 ++++++++-- drivers/scsi/qedf/qedf_main.c | 7 ++++++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/qedf/qedf_io.c b/drivers/scsi/qedf/qedf_io.c index 472374d83ced..1f8e81296beb 100644 --- a/drivers/scsi/qedf/qedf_io.c +++ b/drivers/scsi/qedf/qedf_io.c @@ -1924,6 +1924,7 @@ int qedf_initiate_abts(struct qedf_ioreq *io_req, bool return_scsi_cmd_on_abts) goto drop_rdata_kref; } + spin_lock_irqsave(&fcport->rport_lock, flags); if (!test_bit(QEDF_CMD_OUTSTANDING, &io_req->flags) || test_bit(QEDF_CMD_IN_CLEANUP, &io_req->flags) || test_bit(QEDF_CMD_IN_ABORT, &io_req->flags)) { @@ -1931,17 +1932,20 @@ int qedf_initiate_abts(struct qedf_ioreq *io_req, bool return_scsi_cmd_on_abts) "io_req xid=0x%x sc_cmd=%p already in cleanup or abort processing or already completed.\n", io_req->xid, io_req->sc_cmd); rc = 1; + spin_unlock_irqrestore(&fcport->rport_lock, flags); goto drop_rdata_kref; } + /* Set the command type to abort */ + io_req->cmd_type = QEDF_ABTS; + spin_unlock_irqrestore(&fcport->rport_lock, flags); + kref_get(&io_req->refcount); xid = io_req->xid; qedf->control_requests++; qedf->packet_aborts++; - /* Set the command type to abort */ - io_req->cmd_type = QEDF_ABTS; io_req->return_scsi_cmd_on_abts = return_scsi_cmd_on_abts; set_bit(QEDF_CMD_IN_ABORT, &io_req->flags); @@ -2230,7 +2234,9 @@ int qedf_initiate_cleanup(struct qedf_ioreq *io_req, refcount, fcport, fcport->rdata->ids.port_id); /* Cleanup cmds re-use the same TID as the original I/O */ + spin_lock_irqsave(&fcport->rport_lock, flags); io_req->cmd_type = QEDF_CLEANUP; + spin_unlock_irqrestore(&fcport->rport_lock, flags); io_req->return_scsi_cmd_on_abts = return_scsi_cmd_on_abts; init_completion(&io_req->cleanup_done); diff --git a/drivers/scsi/qedf/qedf_main.c b/drivers/scsi/qedf/qedf_main.c index 63c9368bafcf..6923862be3fb 100644 --- a/drivers/scsi/qedf/qedf_main.c +++ b/drivers/scsi/qedf/qedf_main.c @@ -2803,6 +2803,8 @@ void qedf_process_cqe(struct qedf_ctx *qedf, struct fcoe_cqe *cqe) struct qedf_ioreq *io_req; struct qedf_rport *fcport; u32 comp_type; + u8 io_comp_type; + unsigned long flags; comp_type = (cqe->cqe_data >> FCOE_CQE_CQE_TYPE_SHIFT) & FCOE_CQE_CQE_TYPE_MASK; @@ -2836,11 +2838,14 @@ void qedf_process_cqe(struct qedf_ctx *qedf, struct fcoe_cqe *cqe) return; } + spin_lock_irqsave(&fcport->rport_lock, flags); + io_comp_type = io_req->cmd_type; + spin_unlock_irqrestore(&fcport->rport_lock, flags); switch (comp_type) { case FCOE_GOOD_COMPLETION_CQE_TYPE: atomic_inc(&fcport->free_sqes); - switch (io_req->cmd_type) { + switch (io_comp_type) { case QEDF_SCSI_CMD: qedf_scsi_completion(qedf, cqe, io_req); break; From 5dfcb92905b3593ed8aa56bb50fab4d8cb09d181 Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Mon, 26 Jun 2023 08:11:44 +0800 Subject: [PATCH 114/228] selftests/ftrace: Correctly enable event in instance-event.tc [ Upstream commit f4e4ada586995b17f828c6d147d1800eb1471450 ] Function instance_set() expects to enable event 'sched_switch', so we should set 1 to its 'enable' file. 
Testcase passed after this patch: # ./ftracetest test.d/instances/instance-event.tc === Ftrace unit tests === [1] Test creation and deletion of trace instances while setting an event [PASS] # of passed: 1 # of failed: 0 # of unresolved: 0 # of untested: 0 # of unsupported: 0 # of xfailed: 0 # of undefined(test bug): 0 Signed-off-by: Zheng Yejian Acked-by: Masami Hiramatsu (Google) Acked-by: Steven Rostedt (Google) Signed-off-by: Shuah Khan Signed-off-by: Sasha Levin --- .../testing/selftests/ftrace/test.d/instances/instance-event.tc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/ftrace/test.d/instances/instance-event.tc b/tools/testing/selftests/ftrace/test.d/instances/instance-event.tc index 0eb47fbb3f44..42422e425107 100644 --- a/tools/testing/selftests/ftrace/test.d/instances/instance-event.tc +++ b/tools/testing/selftests/ftrace/test.d/instances/instance-event.tc @@ -39,7 +39,7 @@ instance_read() { instance_set() { while :; do - echo 1 > foo/events/sched/sched_switch + echo 1 > foo/events/sched/sched_switch/enable done 2> /dev/null } From 9ccce21bd77b117ede7d890bc4cef156768d8049 Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Wed, 6 Sep 2023 16:19:30 +0800 Subject: [PATCH 115/228] ring-buffer: Avoid softlockup in ring_buffer_resize() [ Upstream commit f6bd2c92488c30ef53b5bd80c52f0a7eee9d545a ] When a user resizes all trace ring buffers through the file 'buffer_size_kb', ring_buffer_resize() allocates buffer pages for each cpu in a loop. If the kernel preemption model is PREEMPT_NONE and there are many cpus and many buffer pages to be allocated, the loop may not give up the cpu for a long time and finally cause a softlockup. To avoid it, call cond_resched() after each cpu buffer allocation. Link: https://lore.kernel.org/linux-trace-kernel/20230906081930.3939106-1-zhengyejian1@huawei.com Cc: Signed-off-by: Zheng Yejian Signed-off-by: Steven Rostedt (Google) Signed-off-by: Sasha Levin --- kernel/trace/ring_buffer.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f8126fa0630e..752e9549a59e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2080,6 +2080,8 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, err = -ENOMEM; goto out_err; } + + cond_resched(); } get_online_cpus(); From 2956e33fb4f83c0d9c7381e198f2bf13a2d376cd Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marliere" Date: Tue, 22 Aug 2023 18:09:40 -0300 Subject: [PATCH 116/228] selftests: fix dependency checker script [ Upstream commit 5f9dd2e896a91bfca90f8463eb6808c03d535d8a ] This patch fixes inconsistencies in the parsing rules of levels 1 and 2 of kselftest_deps.sh. Levels 4 and 5 were added to account for a few edge cases that are present in some tests, and some minor indentation styling has been fixed (s/ /\t/g). Signed-off-by: Ricardo B.
Marliere Signed-off-by: Shuah Khan Signed-off-by: Sasha Levin --- tools/testing/selftests/kselftest_deps.sh | 77 +++++++++++++++++++---- 1 file changed, 65 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/kselftest_deps.sh b/tools/testing/selftests/kselftest_deps.sh index bbc04646346b..e6010de67820 100755 --- a/tools/testing/selftests/kselftest_deps.sh +++ b/tools/testing/selftests/kselftest_deps.sh @@ -46,11 +46,11 @@ fi print_targets=0 while getopts "p" arg; do - case $arg in - p) + case $arg in + p) print_targets=1 shift;; - esac + esac done if [ $# -eq 0 ] @@ -92,6 +92,10 @@ pass_cnt=0 # Get all TARGETS from selftests Makefile targets=$(egrep "^TARGETS +|^TARGETS =" Makefile | cut -d "=" -f2) +# Initially, in LDLIBS related lines, the dep checker needs +# to ignore lines containing the following strings: +filter="\$(VAR_LDLIBS)\|pkg-config\|PKG_CONFIG\|IOURING_EXTRA_LIBS" + # Single test case if [ $# -eq 2 ] then @@ -100,6 +104,8 @@ then l1_test $test l2_test $test l3_test $test + l4_test $test + l5_test $test print_results $1 $2 exit $? @@ -113,7 +119,7 @@ fi # Append space at the end of the list to append more tests. l1_tests=$(grep -r --include=Makefile "^LDLIBS" | \ - grep -v "VAR_LDLIBS" | awk -F: '{print $1}') + grep -v "$filter" | awk -F: '{print $1}' | uniq) # Level 2: LDLIBS set dynamically. # @@ -126,7 +132,7 @@ l1_tests=$(grep -r --include=Makefile "^LDLIBS" | \ # Append space at the end of the list to append more tests. l2_tests=$(grep -r --include=Makefile ": LDLIBS" | \ - grep -v "VAR_LDLIBS" | awk -F: '{print $1}') + grep -v "$filter" | awk -F: '{print $1}' | uniq) # Level 3 # gpio, memfd and others use pkg-config to find mount and fuse libs @@ -140,11 +146,32 @@ l2_tests=$(grep -r --include=Makefile ": LDLIBS" | \ # VAR_LDLIBS := $(shell pkg-config fuse --libs 2>/dev/null) l3_tests=$(grep -r --include=Makefile "^VAR_LDLIBS" | \ - grep -v "pkg-config" | awk -F: '{print $1}') + grep -v "pkg-config\|PKG_CONFIG" | awk -F: '{print $1}' | uniq) -#echo $l1_tests -#echo $l2_1_tests -#echo $l3_tests +# Level 4 +# some tests may fall back to default using `|| echo -l` +# if pkg-config doesn't find the libs, instead of using VAR_LDLIBS +# as per level 3 checks. +# e.g: +# netfilter/Makefile +# LDLIBS += $(shell $(HOSTPKG_CONFIG) --libs libmnl 2>/dev/null || echo -lmnl) +l4_tests=$(grep -r --include=Makefile "^LDLIBS" | \ + grep "pkg-config\|PKG_CONFIG" | awk -F: '{print $1}' | uniq) + +# Level 5 +# some tests may use IOURING_EXTRA_LIBS to add extra libs to LDLIBS, +# which in turn may be defined in a sub-Makefile +# e.g.: +# mm/Makefile +# $(OUTPUT)/gup_longterm: LDLIBS += $(IOURING_EXTRA_LIBS) +l5_tests=$(grep -r --include=Makefile "LDLIBS +=.*\$(IOURING_EXTRA_LIBS)" | \ + awk -F: '{print $1}' | uniq) + +#echo l1_tests $l1_tests +#echo l2_tests $l2_tests +#echo l3_tests $l3_tests +#echo l4_tests $l4_tests +#echo l5_tests $l5_tests all_tests print_results $1 $2 @@ -166,24 +193,32 @@ all_tests() for test in $l3_tests; do l3_test $test done + + for test in $l4_tests; do + l4_test $test + done + + for test in $l5_tests; do + l5_test $test + done } # Use same parsing used for l1_tests and pick libraries this time. l1_test() { test_libs=$(grep --include=Makefile "^LDLIBS" $test | \ - grep -v "VAR_LDLIBS" | \ + grep -v "$filter" | \ sed -e 's/\:/ /' | \ sed -e 's/+/ /' | cut -d "=" -f 2) check_libs $test $test_libs } -# Use same parsing used for l2__tests and pick libraries this time. +# Use same parsing used for l2_tests and pick libraries this time. 
l2_test() { test_libs=$(grep --include=Makefile ": LDLIBS" $test | \ - grep -v "VAR_LDLIBS" | \ + grep -v "$filter" | \ sed -e 's/\:/ /' | sed -e 's/+/ /' | \ cut -d "=" -f 2) @@ -199,6 +234,24 @@ l3_test() check_libs $test $test_libs } +l4_test() +{ + test_libs=$(grep --include=Makefile "^VAR_LDLIBS\|^LDLIBS" $test | \ + grep "\(pkg-config\|PKG_CONFIG\).*|| echo " | \ + sed -e 's/.*|| echo //' | sed -e 's/)$//') + + check_libs $test $test_libs +} + +l5_test() +{ + tests=$(find $(dirname "$test") -type f -name "*.mk") + test_libs=$(grep "^IOURING_EXTRA_LIBS +\?=" $tests | \ + cut -d "=" -f 2) + + check_libs $test $test_libs +} + check_libs() { From cee5151c5410e868826b8afecfb356f3799ebea3 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 7 Sep 2023 12:28:20 -0400 Subject: [PATCH 117/228] ring-buffer: Do not attempt to read past "commit" [ Upstream commit 95a404bd60af6c4d9d8db01ad14fe8957ece31ca ] When iterating over the ring buffer while the ring buffer is active, the writer can corrupt the reader. There's barriers to help detect this and handle it, but that code missed the case where the last event was at the very end of the page and has only 4 bytes left. The checks to detect the corruption by the writer to reads needs to see the length of the event. If the length in the first 4 bytes is zero then the length is stored in the second 4 bytes. But if the writer is in the process of updating that code, there's a small window where the length in the first 4 bytes could be zero even though the length is only 4 bytes. That will cause rb_event_length() to read the next 4 bytes which could happen to be off the allocated page. To protect against this, fail immediately if the next event pointer is less than 8 bytes from the end of the commit (last byte of data), as all events must be a minimum of 8 bytes anyway. Link: https://lore.kernel.org/all/20230905141245.26470-1-Tze-nan.Wu@mediatek.com/ Link: https://lore.kernel.org/linux-trace-kernel/20230907122820.0899019c@gandalf.local.home Cc: Masami Hiramatsu Cc: Mark Rutland Reported-by: Tze-nan Wu Signed-off-by: Steven Rostedt (Google) Signed-off-by: Sasha Levin --- kernel/trace/ring_buffer.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 752e9549a59e..812ec380da82 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2260,6 +2260,11 @@ rb_iter_head_event(struct ring_buffer_iter *iter) */ commit = rb_page_commit(iter_head_page); smp_rmb(); + + /* An event needs to be at least 8 bytes in size */ + if (iter->head > commit - 8) + goto reset; + event = __rb_page_index(iter_head_page, iter->head); length = rb_event_length(event); From f44e66447c4f18b425a91cf6698ecdbf07414f95 Mon Sep 17 00:00:00 2001 From: David Thompson Date: Tue, 5 Sep 2023 09:32:43 -0400 Subject: [PATCH 118/228] platform/mellanox: mlxbf-bootctl: add NET dependency into Kconfig [ Upstream commit c2dffda1d8f7511505bbbf16ba282f2079b30089 ] The latest version of the mlxbf_bootctl driver utilizes "sysfs_format_mac", and this API is only available if NET is defined in the kernel configuration. This patch changes the mlxbf_bootctl Kconfig to depend on NET. 
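For illustration only, a minimal sketch of the kind of sysfs "show" callback that pulls in this symbol; the callback name and MAC buffer below are assumptions for the sketch, not code taken from mlxbf_bootctl:

	/*
	 * Illustrative sketch: sysfs_format_mac() is provided by the networking
	 * core (declared in linux/netdevice.h), which is why any driver calling
	 * it has to be able to depend on NET in Kconfig.
	 */
	static ssize_t example_mac_show(struct device *dev,
					struct device_attribute *attr, char *buf)
	{
		u8 mac[ETH_ALEN] = { 0 };	/* assumed: filled in from firmware */

		return sysfs_format_mac(buf, mac, ETH_ALEN);
	}
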
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202309031058.JvwNDBKt-lkp@intel.com/ Reported-by: Randy Dunlap Signed-off-by: David Thompson Link: https://lore.kernel.org/r/20230905133243.31550-1-davthompson@nvidia.com Signed-off-by: Hans de Goede Signed-off-by: Sasha Levin --- drivers/platform/mellanox/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/platform/mellanox/Kconfig b/drivers/platform/mellanox/Kconfig index 916b39dc11bc..1a11d1a441b5 100644 --- a/drivers/platform/mellanox/Kconfig +++ b/drivers/platform/mellanox/Kconfig @@ -48,6 +48,7 @@ config MLXBF_BOOTCTL tristate "Mellanox BlueField Firmware Boot Control driver" depends on ARM64 depends on ACPI + depends on NET help The Mellanox BlueField firmware implements functionality to request swapping the primary and alternate eMMC boot partition, From 0decc581e1dcd122c60a257f011ecffca33aed8e Mon Sep 17 00:00:00 2001 From: Michal Grzedzicki Date: Wed, 13 Sep 2023 08:56:10 -0700 Subject: [PATCH 119/228] scsi: pm80xx: Use phy-specific SAS address when sending PHY_START command [ Upstream commit 71996bb835aed58c7ec4967be1d05190a27339ec ] Some cards have more than one SAS address. Using an incorrect address causes communication issues with some devices like expanders. Closes: https://lore.kernel.org/linux-kernel/A57AEA84-5CA0-403E-8053-106033C73C70@fb.com/ Signed-off-by: Michal Grzedzicki Link: https://lore.kernel.org/r/20230913155611.3183612-1-mge@meta.com Acked-by: Jack Wang Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/scsi/pm8001/pm8001_hwi.c | 2 +- drivers/scsi/pm8001/pm80xx_hwi.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/pm8001/pm8001_hwi.c b/drivers/scsi/pm8001/pm8001_hwi.c index e9b3485baee0..2b20c6a0293f 100644 --- a/drivers/scsi/pm8001/pm8001_hwi.c +++ b/drivers/scsi/pm8001/pm8001_hwi.c @@ -4344,7 +4344,7 @@ pm8001_chip_phy_start_req(struct pm8001_hba_info *pm8001_ha, u8 phy_id) payload.sas_identify.dev_type = SAS_END_DEVICE; payload.sas_identify.initiator_bits = SAS_PROTOCOL_ALL; memcpy(payload.sas_identify.sas_addr, - pm8001_ha->sas_addr, SAS_ADDR_SIZE); + &pm8001_ha->phy[phy_id].dev_sas_addr, SAS_ADDR_SIZE); payload.sas_identify.phy_id = phy_id; ret = pm8001_mpi_build_cmd(pm8001_ha, circularQ, opcode, &payload, sizeof(payload), 0); diff --git a/drivers/scsi/pm8001/pm80xx_hwi.c b/drivers/scsi/pm8001/pm80xx_hwi.c index c98c0a53a018..ed01c9306209 100644 --- a/drivers/scsi/pm8001/pm80xx_hwi.c +++ b/drivers/scsi/pm8001/pm80xx_hwi.c @@ -4741,7 +4741,7 @@ pm80xx_chip_phy_start_req(struct pm8001_hba_info *pm8001_ha, u8 phy_id) payload.sas_identify.dev_type = SAS_END_DEVICE; payload.sas_identify.initiator_bits = SAS_PROTOCOL_ALL; memcpy(payload.sas_identify.sas_addr, - &pm8001_ha->sas_addr, SAS_ADDR_SIZE); + &pm8001_ha->phy[phy_id].dev_sas_addr, SAS_ADDR_SIZE); payload.sas_identify.phy_id = phy_id; ret = pm8001_mpi_build_cmd(pm8001_ha, circularQ, opcode, &payload, sizeof(payload), 0); From 2afd8fcee0c4d65a482e30c3ad2a92c25e5e92d4 Mon Sep 17 00:00:00 2001 From: Michal Grzedzicki Date: Mon, 11 Sep 2023 10:03:40 -0700 Subject: [PATCH 120/228] scsi: pm80xx: Avoid leaking tags when processing OPC_INB_SET_CONTROLLER_CONFIG command [ Upstream commit c13e7331745852d0dd7c35eabbe181cbd5b01172 ] Tags allocated for OPC_INB_SET_CONTROLLER_CONFIG command need to be freed when we receive the response. 
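For context, a minimal sketch of the completion-handler shape this implies, with the tag read from the response payload and handed back to the allocator; the function name is hypothetical and the handling is abridged, only pm8001_tag_free(), pm8001_dbg() and the payload fields come from the driver:

	/*
	 * Sketch only: a firmware response handler that must return its tag,
	 * otherwise every SET_CONTROLLER_CONFIG request leaks one tag.
	 */
	static int example_set_ctrl_cfg_resp(struct pm8001_hba_info *pm8001_ha,
					     void *piomb)
	{
		struct set_ctrl_cfg_resp *pPayload =
			(struct set_ctrl_cfg_resp *)(piomb + 4);
		u32 status = le32_to_cpu(pPayload->status);
		u32 tag = le32_to_cpu(pPayload->tag);

		/* ... act on status as before ... */
		pm8001_dbg(pm8001_ha, MSG, "SET CONTROLLER RESP: status 0x%x\n",
			   status);

		pm8001_tag_free(pm8001_ha, tag);	/* return the tag to the pool */
		return 0;
	}
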
Signed-off-by: Michal Grzedzicki Link: https://lore.kernel.org/r/20230911170340.699533-2-mge@meta.com Acked-by: Jack Wang Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/scsi/pm8001/pm80xx_hwi.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/scsi/pm8001/pm80xx_hwi.c b/drivers/scsi/pm8001/pm80xx_hwi.c index ed01c9306209..89051722e04d 100644 --- a/drivers/scsi/pm8001/pm80xx_hwi.c +++ b/drivers/scsi/pm8001/pm80xx_hwi.c @@ -3722,10 +3722,12 @@ static int mpi_set_controller_config_resp(struct pm8001_hba_info *pm8001_ha, (struct set_ctrl_cfg_resp *)(piomb + 4); u32 status = le32_to_cpu(pPayload->status); u32 err_qlfr_pgcd = le32_to_cpu(pPayload->err_qlfr_pgcd); + u32 tag = le32_to_cpu(pPayload->tag); pm8001_dbg(pm8001_ha, MSG, "SET CONTROLLER RESP: status 0x%x qlfr_pgcd 0x%x\n", status, err_qlfr_pgcd); + pm8001_tag_free(pm8001_ha, tag); return 0; } From f6cf19c1b3134b777f1f3243bed7bcbc4e456a2b Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Thu, 14 Sep 2023 00:19:16 +0200 Subject: [PATCH 121/228] ata: libata-eh: do not clear ATA_PFLAG_EH_PENDING in ata_eh_reset() [ Upstream commit 80cc944eca4f0baa9c381d0706f3160e491437f2 ] ata_scsi_port_error_handler() starts off by clearing ATA_PFLAG_EH_PENDING, before calling ap->ops->error_handler() (without holding the ap->lock). If an error IRQ is received while ap->ops->error_handler() is running, the irq handler will set ATA_PFLAG_EH_PENDING. Once ap->ops->error_handler() returns, ata_scsi_port_error_handler() checks if ATA_PFLAG_EH_PENDING is set, and if it is, another iteration of ATA EH is performed. The problem is that ATA_PFLAG_EH_PENDING is not only cleared by ata_scsi_port_error_handler(), it is also cleared by ata_eh_reset(). ata_eh_reset() is called by ap->ops->error_handler(). This additional clearing done by ata_eh_reset() breaks the whole retry logic in ata_scsi_port_error_handler(). Thus, if an error IRQ is received while ap->ops->error_handler() is running, the port will currently remain frozen and will never get re-enabled. The additional clearing in ata_eh_reset() was introduced in commit 1e641060c4b5 ("libata: clear eh_info on reset completion"). Looking at the original error report: https://marc.info/?l=linux-ide&m=124765325828495&w=2 We can see the following happening: [ 1.074659] ata3: XXX port freeze [ 1.074700] ata3: XXX hardresetting link, stopping engine [ 1.074746] ata3: XXX flipping SControl [ 1.411471] ata3: XXX irq_stat=400040 CONN|PHY [ 1.411475] ata3: XXX port freeze [ 1.420049] ata3: XXX starting engine [ 1.420096] ata3: XXX rc=0, class=1 [ 1.420142] ata3: XXX clearing IRQs for thawing [ 1.420188] ata3: XXX port thawed [ 1.420234] ata3: SATA link up 3.0 Gbps (SStatus 123 SControl 300) We are not supposed to be able to receive an error IRQ while the port is frozen (PxIE is set to 0, i.e. all IRQs for the port are disabled). AHCI 1.3.1 section 10.7.1.1 First Tier (IS Register) states: "Each bit location can be thought of as reporting a '1' if the virtual "interrupt line" for that port is indicating it wishes to generate an interrupt. That is, if a port has one or more interrupt status bit set, and the enables for those status bits are set, then this bit shall be set." Additionally, AHCI state P:ComInit clearly shows that the state machine will only jump to P:ComInitSetIS (which sets IS.IPS(x) to '1'), if PxIE.PCE is set to '1'. In our case, PxIE is set to 0, so IS.IPS(x) won't get set. So IS.IPS(x) only gets set if PxIS and PxIE is set. 
AHCI 1.3.1 section 10.7.1.1 First Tier (IS Register) also states: "The bits in this register are read/write clear. It is set by the level of the virtual interrupt line being a set, and cleared by a write of '1' from the software." So if IS.IPS(x) is set, you need to explicitly clear it by writing a 1 to IS.IPS(x) for that port. Since PxIE is cleared, the only way to get an interrupt while the port is frozen, is if IS.IPS(x) is set, and the only way IS.IPS(x) can be set when the port is frozen, is if it was set before the port was frozen. However, since commit 737dd811a3db ("ata: libahci: clear pending interrupt status"), we clear both PxIS and IS.IPS(x) after freezing the port, but before the COMRESET, so the problem that commit 1e641060c4b5 ("libata: clear eh_info on reset completion") fixed can no longer happen. Thus, revert commit 1e641060c4b5 ("libata: clear eh_info on reset completion"), so that the retry logic in ata_scsi_port_error_handler() works once again. (The retry logic is still needed, since we can still get an error IRQ _after_ the port has been thawed, but before ata_scsi_port_error_handler() takes the ap->lock in order to check if ATA_PFLAG_EH_PENDING is set.) Signed-off-by: Niklas Cassel Signed-off-by: Damien Le Moal Signed-off-by: Sasha Levin --- drivers/ata/libata-eh.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c index 973f4d34d7cd..5fb3eda0a280 100644 --- a/drivers/ata/libata-eh.c +++ b/drivers/ata/libata-eh.c @@ -2703,18 +2703,11 @@ int ata_eh_reset(struct ata_link *link, int classify, postreset(slave, classes); } - /* - * Some controllers can't be frozen very well and may set spurious - * error conditions during reset. Clear accumulated error - * information and re-thaw the port if frozen. As reset is the - * final recovery action and we cross check link onlineness against - * device classification later, no hotplug event is lost by this. - */ + /* clear cached SError */ spin_lock_irqsave(link->ap->lock, flags); - memset(&link->eh_info, 0, sizeof(link->eh_info)); + link->eh_info.serror = 0; if (slave) - memset(&slave->eh_info, 0, sizeof(link->eh_info)); - ap->pflags &= ~ATA_PFLAG_EH_PENDING; + slave->eh_info.serror = 0; spin_unlock_irqrestore(link->ap->lock, flags); if (ap->pflags & ATA_PFLAG_FROZEN) From 0118244848a596c56efd8ed1805a59d37c7d9214 Mon Sep 17 00:00:00 2001 From: Han Xu Date: Wed, 6 Sep 2023 13:32:54 -0500 Subject: [PATCH 122/228] spi: nxp-fspi: reset the FLSHxCR1 registers [ Upstream commit 18495676f7886e105133f1dc06c1d5e8d5436f32 ] Reset the FLSHxCR1 registers to default value. ROM may set the register value and it affects the SPI NAND normal functions. Signed-off-by: Han Xu Link: https://lore.kernel.org/r/20230906183254.235847-1-han.xu@nxp.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- drivers/spi/spi-nxp-fspi.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/spi/spi-nxp-fspi.c b/drivers/spi/spi-nxp-fspi.c index bcc0b5a3a459..90b5fbc914ae 100644 --- a/drivers/spi/spi-nxp-fspi.c +++ b/drivers/spi/spi-nxp-fspi.c @@ -950,6 +950,13 @@ static int nxp_fspi_default_setup(struct nxp_fspi *f) fspi_writel(f, FSPI_AHBCR_PREF_EN | FSPI_AHBCR_RDADDROPT, base + FSPI_AHBCR); + /* Reset the FLSHxCR1 registers. 
*/ + reg = FSPI_FLSHXCR1_TCSH(0x3) | FSPI_FLSHXCR1_TCSS(0x3); + fspi_writel(f, reg, base + FSPI_FLSHA1CR1); + fspi_writel(f, reg, base + FSPI_FLSHA2CR1); + fspi_writel(f, reg, base + FSPI_FLSHB1CR1); + fspi_writel(f, reg, base + FSPI_FLSHB2CR1); + /* AHB Read - Set lut sequence ID for all CS. */ fspi_writel(f, SEQID_LUT, base + FSPI_FLSHA1CR2); fspi_writel(f, SEQID_LUT, base + FSPI_FLSHA2CR2); From ebc91848062e6abbe29265ce7e21fb3eded81601 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 11 Sep 2023 12:47:30 -0700 Subject: [PATCH 123/228] bpf: Clarify error expectations from bpf_clone_redirect [ Upstream commit 7cb779a6867fea00b4209bcf6de2f178a743247d ] Commit 151e887d8ff9 ("veth: Fixing transmit return status for dropped packets") exposed the fact that bpf_clone_redirect is capable of returning raw NET_XMIT_XXX return codes. This is in the conflict with its UAPI doc which says the following: "0 on success, or a negative error in case of failure." Update the UAPI to reflect the fact that bpf_clone_redirect can return positive error numbers, but don't explicitly define their meaning. Reported-by: Daniel Borkmann Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20230911194731.286342-1-sdf@google.com Signed-off-by: Sasha Levin --- include/uapi/linux/bpf.h | 4 +++- tools/include/uapi/linux/bpf.h | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2a234023821e..36ddfb98b70e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -976,7 +976,9 @@ union bpf_attr { * performed again, if the helper is used in combination with * direct packet access. * Return - * 0 on success, or a negative error in case of failure. + * 0 on success, or a negative error in case of failure. Positive + * error indicates a potential drop or congestion in the target + * device. The particular positive error codes are not defined. * * u64 bpf_get_current_pid_tgid(void) * Return diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 7943e748916d..fd1a4d843e6f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -976,7 +976,9 @@ union bpf_attr { * performed again, if the helper is used in combination with * direct packet access. * Return - * 0 on success, or a negative error in case of failure. + * 0 on success, or a negative error in case of failure. Positive + * error indicates a potential drop or congestion in the target + * device. The particular positive error codes are not defined. * * u64 bpf_get_current_pid_tgid(void) * Return From f5bdbed0361cf56546942ea3f4cbedb154bf6466 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Thu, 17 Aug 2023 12:41:32 +0200 Subject: [PATCH 124/228] media: vb2: frame_vector.c: replace WARN_ONCE with a comment [ Upstream commit 735de5caf79e06cc9fb96b1b4f4974674ae3e917 ] The WARN_ONCE was issued also in cases that had nothing to do with VM_IO (e.g. if the start address was just a random value and uaccess fails with -EFAULT). There are no reports of WARN_ONCE being issued for actual VM_IO cases, so just drop it and instead add a note to the comment before the function. 
Signed-off-by: Hans Verkuil Reviewed-by: David Hildenbrand Reported-by: Yikebaer Aizezi Signed-off-by: Sasha Levin --- mm/frame_vector.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/frame_vector.c b/mm/frame_vector.c index 0e589a9a8801..1cd81d38ad2d 100644 --- a/mm/frame_vector.c +++ b/mm/frame_vector.c @@ -29,6 +29,10 @@ * different type underlying the specified range of virtual addresses. * When the function isn't able to map a single page, it returns error. * + * Note that get_vaddr_frames() cannot follow VM_IO mappings. It used + * to be able to do that, but that could (racily) return non-refcounted + * pfns. + * * This function takes care of grabbing mmap_lock as necessary. */ int get_vaddr_frames(unsigned long start, unsigned int nr_frames, @@ -77,8 +81,6 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames, goto out; } - /* This used to (racily) return non-refcounted pfns. Let people know */ - WARN_ONCE(1, "get_vaddr_frames() cannot follow VM_IO mapping"); vec->nr_frames = 0; out: From 8c2500228b8f5ea9abcc4522dedca0963edd2f3b Mon Sep 17 00:00:00 2001 From: Benjamin Gray Date: Tue, 29 Aug 2023 16:34:55 +1000 Subject: [PATCH 125/228] powerpc/watchpoints: Disable preemption in thread_change_pc() [ Upstream commit cc879ab3ce39bc39f9b1d238b283f43a5f6f957d ] thread_change_pc() uses CPU local data, so must be protected from swapping CPUs while it is reading the breakpoint struct. The error is more noticeable after 1e60f3564bad ("powerpc/watchpoints: Track perf single step directly on the breakpoint"), which added an unconditional __this_cpu_read() call in thread_change_pc(). However the existing __this_cpu_read() that runs if a breakpoint does need to be re-inserted has the same issue. Signed-off-by: Benjamin Gray Signed-off-by: Michael Ellerman Link: https://msgid.link/20230829063457.54157-2-bgray@linux.ibm.com Signed-off-by: Sasha Levin --- arch/powerpc/kernel/hw_breakpoint.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c index f4e8f21046f5..6e5bed50c357 100644 --- a/arch/powerpc/kernel/hw_breakpoint.c +++ b/arch/powerpc/kernel/hw_breakpoint.c @@ -479,11 +479,13 @@ void thread_change_pc(struct task_struct *tsk, struct pt_regs *regs) struct arch_hw_breakpoint *info; int i; + preempt_disable(); + for (i = 0; i < nr_wp_slots(); i++) { if (unlikely(tsk->thread.last_hit_ubp[i])) goto reset; } - return; + goto out; reset: regs->msr &= ~MSR_SE; @@ -492,6 +494,9 @@ void thread_change_pc(struct task_struct *tsk, struct pt_regs *regs) __set_breakpoint(i, info); tsk->thread.last_hit_ubp[i] = NULL; } + +out: + preempt_enable(); } static bool is_larx_stcx_instr(int type) From ffc459a93065e554f1041f86c1fbe297476ca521 Mon Sep 17 00:00:00 2001 From: Johnathan Mantey Date: Fri, 15 Sep 2023 09:12:35 -0700 Subject: [PATCH 126/228] ncsi: Propagate carrier gain/loss events to the NCSI controller [ Upstream commit 3780bb29311eccb7a1c9641032a112eed237f7e3 ] Report the carrier/no-carrier state for the network interface shared between the BMC and the passthrough channel. Without this functionality the BMC is unable to reconfigure the NIC in the event of a re-cabling to a different subnet. Signed-off-by: Johnathan Mantey Signed-off-by: David S. 
Miller Signed-off-by: Sasha Levin --- net/ncsi/ncsi-aen.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/ncsi/ncsi-aen.c b/net/ncsi/ncsi-aen.c index 62fb1031763d..f8854bff286c 100644 --- a/net/ncsi/ncsi-aen.c +++ b/net/ncsi/ncsi-aen.c @@ -89,6 +89,11 @@ static int ncsi_aen_handler_lsc(struct ncsi_dev_priv *ndp, if ((had_link == has_link) || chained) return 0; + if (had_link) + netif_carrier_off(ndp->ndev.dev); + else + netif_carrier_on(ndp->ndev.dev); + if (!ndp->multi_package && !nc->package->multi_channel) { if (had_link) { ndp->flags |= NCSI_DEV_RESHUFFLE; From da91481c5d2bd1db68f48c8455d79259416d3cf1 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 18 Sep 2023 11:03:49 +0200 Subject: [PATCH 127/228] fbdev/sh7760fb: Depend on FB=y [ Upstream commit f75f71b2c418a27a7c05139bb27a0c83adf88d19 ] Fix linker error if FB=m about missing fb_io_read and fb_io_write. The linker's error message suggests that this config setting has already been broken for other symbols. All errors (new ones prefixed by >>): sh4-linux-ld: drivers/video/fbdev/sh7760fb.o: in function `sh7760fb_probe': sh7760fb.c:(.text+0x374): undefined reference to `framebuffer_alloc' sh4-linux-ld: sh7760fb.c:(.text+0x394): undefined reference to `fb_videomode_to_var' sh4-linux-ld: sh7760fb.c:(.text+0x39c): undefined reference to `fb_alloc_cmap' sh4-linux-ld: sh7760fb.c:(.text+0x3a4): undefined reference to `register_framebuffer' sh4-linux-ld: sh7760fb.c:(.text+0x3ac): undefined reference to `fb_dealloc_cmap' sh4-linux-ld: sh7760fb.c:(.text+0x434): undefined reference to `framebuffer_release' sh4-linux-ld: drivers/video/fbdev/sh7760fb.o: in function `sh7760fb_remove': sh7760fb.c:(.text+0x800): undefined reference to `unregister_framebuffer' sh4-linux-ld: sh7760fb.c:(.text+0x804): undefined reference to `fb_dealloc_cmap' sh4-linux-ld: sh7760fb.c:(.text+0x814): undefined reference to `framebuffer_release' >> sh4-linux-ld: drivers/video/fbdev/sh7760fb.o:(.rodata+0xc): undefined reference to `fb_io_read' >> sh4-linux-ld: drivers/video/fbdev/sh7760fb.o:(.rodata+0x10): undefined reference to `fb_io_write' sh4-linux-ld: drivers/video/fbdev/sh7760fb.o:(.rodata+0x2c): undefined reference to `cfb_fillrect' sh4-linux-ld: drivers/video/fbdev/sh7760fb.o:(.rodata+0x30): undefined reference to `cfb_copyarea' sh4-linux-ld: drivers/video/fbdev/sh7760fb.o:(.rodata+0x34): undefined reference to `cfb_imageblit' Suggested-by: Randy Dunlap Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202309130632.LS04CPWu-lkp@intel.com/ Signed-off-by: Thomas Zimmermann Reviewed-by: Javier Martinez Canillas Acked-by: John Paul Adrian Glaubitz Link: https://patchwork.freedesktop.org/patch/msgid/20230918090400.13264-1-tzimmermann@suse.de Signed-off-by: Sasha Levin --- drivers/video/fbdev/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/video/fbdev/Kconfig b/drivers/video/fbdev/Kconfig index 3ac78db17e46..dd5958463097 100644 --- a/drivers/video/fbdev/Kconfig +++ b/drivers/video/fbdev/Kconfig @@ -2014,7 +2014,7 @@ config FB_COBALT config FB_SH7760 bool "SH7760/SH7763/SH7720/SH7721 LCDC support" - depends on FB && (CPU_SUBTYPE_SH7760 || CPU_SUBTYPE_SH7763 \ + depends on FB=y && (CPU_SUBTYPE_SH7760 || CPU_SUBTYPE_SH7763 \ || CPU_SUBTYPE_SH7720 || CPU_SUBTYPE_SH7721) select FB_CFB_FILLRECT select FB_CFB_COPYAREA From ef3c728ca0d4d25776cb0c6c656a3e6e078c1194 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 28 Jul 2023 17:26:54 -0300 Subject: [PATCH 128/228] perf build: Define 
YYNOMEM as YYNOABORT for bison < 3.81 [ Upstream commit 88cc47e24597971b05b6e94c28a2fc81d2a8d61a ] YYNOMEM was introduced in bison 3.81, so define it as YYABORT for older versions, which should provide the previous perf behaviour. Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Sasha Levin --- tools/perf/util/Build | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 0cf27354aa45..0f9732d5452e 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -253,6 +253,12 @@ ifeq ($(BISON_GE_35),1) else bison_flags += -w endif + +BISON_LT_381 := $(shell expr $(shell $(BISON) --version | grep bison | sed -e 's/.\+ \([0-9]\+\).\([0-9]\+\).\([0-9]\+\)/\1\2\3/g') \< 381) +ifeq ($(BISON_LT_381),1) + bison_flags += -DYYNOMEM=YYABORT +endif + CFLAGS_parse-events-bison.o += $(bison_flags) CFLAGS_pmu-bison.o += -DYYLTYPE_IS_TRIVIAL=0 $(bison_flags) CFLAGS_expr-bison.o += -DYYLTYPE_IS_TRIVIAL=0 $(bison_flags) From f8e8e72c58c7b9ca5e5f8ca41feb0f92d732c6a6 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 29 Sep 2023 16:14:15 +0300 Subject: [PATCH 129/228] sched/cpuacct: Fix user/system in shown cpuacct.usage* commit dd02d4234c9a2214a81c57a16484304a1a51872a upstream. cpuacct has 2 different ways of accounting and showing user and system times. The first one uses cpuacct_account_field() to account times and cpuacct.stat file to expose them. And this one seems to work ok. The second one uses the cpuacct_charge() function for accounting and a set of cpuacct.usage* files to show times. Despite some attempts to fix it in the past, it still doesn't work. Sometimes, while running a KVM guest, cpuacct_charge() accounts most of the guest time as system time. This doesn't match the user and system times shown in cpuacct.stat or proc//stat. Demonstration: # git clone https://github.com/aryabinin/kvmsample # make # mkdir /sys/fs/cgroup/cpuacct/test # echo $$ > /sys/fs/cgroup/cpuacct/test/tasks # ./kvmsample & # for i in {1..5}; do cat /sys/fs/cgroup/cpuacct/test/cpuacct.usage_sys; sleep 1; done 1976535645 2979839428 3979832704 4983603153 5983604157 Use cpustats accounted in cpuacct_account_field() as the source of user/sys times for cpuacct.usage* files. Make cpuacct_charge() account only the summary execution time.
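For illustration, a minimal sketch (not the patch itself; the helper name is hypothetical) of deriving the two usage buckets from the kernel_cpustat fields that cpuacct_account_field() already maintains:

	/*
	 * Sketch: user time comes from the USER and NICE buckets, system time
	 * from SYSTEM plus the IRQ/SOFTIRQ buckets, matching cpuacct.stat.
	 */
	static u64 example_cpuusage_from_cpustat(const u64 *cpustat,
						 enum cpuacct_stat_index index)
	{
		switch (index) {
		case CPUACCT_STAT_USER:
			return cpustat[CPUTIME_USER] + cpustat[CPUTIME_NICE];
		case CPUACCT_STAT_SYSTEM:
			return cpustat[CPUTIME_SYSTEM] + cpustat[CPUTIME_IRQ] +
			       cpustat[CPUTIME_SOFTIRQ];
		default:
			return 0;
		}
	}
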
Fixes: d740037fac70 ("sched/cpuacct: Split usage accounting into user_usage and sys_usage") Signed-off-by: Andrey Ryabinin Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Daniel Jordan Acked-by: Tejun Heo Cc: Link: https://lore.kernel.org/r/20211115164607.23784-3-arbn@yandex-team.com [OP: adjusted context for v5.10] Signed-off-by: Ovidiu Panait Signed-off-by: Sasha Levin --- kernel/sched/cpuacct.c | 79 +++++++++++++++++------------------------- 1 file changed, 32 insertions(+), 47 deletions(-) diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 941c28cf9738..8a260115a137 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -21,15 +21,11 @@ static const char * const cpuacct_stat_desc[] = { [CPUACCT_STAT_SYSTEM] = "system", }; -struct cpuacct_usage { - u64 usages[CPUACCT_STAT_NSTATS]; -}; - /* track CPU usage of a group of tasks and its child groups */ struct cpuacct { struct cgroup_subsys_state css; /* cpuusage holds pointer to a u64-type object on every CPU */ - struct cpuacct_usage __percpu *cpuusage; + u64 __percpu *cpuusage; struct kernel_cpustat __percpu *cpustat; }; @@ -49,7 +45,7 @@ static inline struct cpuacct *parent_ca(struct cpuacct *ca) return css_ca(ca->css.parent); } -static DEFINE_PER_CPU(struct cpuacct_usage, root_cpuacct_cpuusage); +static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); static struct cpuacct root_cpuacct = { .cpustat = &kernel_cpustat, .cpuusage = &root_cpuacct_cpuusage, @@ -68,7 +64,7 @@ cpuacct_css_alloc(struct cgroup_subsys_state *parent_css) if (!ca) goto out; - ca->cpuusage = alloc_percpu(struct cpuacct_usage); + ca->cpuusage = alloc_percpu(u64); if (!ca->cpuusage) goto out_free_ca; @@ -99,7 +95,8 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css) static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, enum cpuacct_stat_index index) { - struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat; u64 data; /* @@ -115,14 +112,17 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, raw_spin_lock_irq(&cpu_rq(cpu)->lock); #endif - if (index == CPUACCT_STAT_NSTATS) { - int i = 0; - - data = 0; - for (i = 0; i < CPUACCT_STAT_NSTATS; i++) - data += cpuusage->usages[i]; - } else { - data = cpuusage->usages[index]; + switch (index) { + case CPUACCT_STAT_USER: + data = cpustat[CPUTIME_USER] + cpustat[CPUTIME_NICE]; + break; + case CPUACCT_STAT_SYSTEM: + data = cpustat[CPUTIME_SYSTEM] + cpustat[CPUTIME_IRQ] + + cpustat[CPUTIME_SOFTIRQ]; + break; + case CPUACCT_STAT_NSTATS: + data = *cpuusage; + break; } #ifndef CONFIG_64BIT @@ -132,10 +132,14 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, return data; } -static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) +static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu) { - struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); - int i; + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat; + + /* Don't allow to reset global kernel_cpustat */ + if (ca == &root_cpuacct) + return; #ifndef CONFIG_64BIT /* @@ -143,9 +147,10 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) */ raw_spin_lock_irq(&cpu_rq(cpu)->lock); #endif - - for (i = 0; i < CPUACCT_STAT_NSTATS; i++) - cpuusage->usages[i] = val; + *cpuusage = 0; + cpustat[CPUTIME_USER] = cpustat[CPUTIME_NICE] = 0; + cpustat[CPUTIME_SYSTEM] = cpustat[CPUTIME_IRQ] = 0; + 
cpustat[CPUTIME_SOFTIRQ] = 0; #ifndef CONFIG_64BIT raw_spin_unlock_irq(&cpu_rq(cpu)->lock); @@ -196,7 +201,7 @@ static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, return -EINVAL; for_each_possible_cpu(cpu) - cpuacct_cpuusage_write(ca, cpu, 0); + cpuacct_cpuusage_write(ca, cpu); return 0; } @@ -243,25 +248,10 @@ static int cpuacct_all_seq_show(struct seq_file *m, void *V) seq_puts(m, "\n"); for_each_possible_cpu(cpu) { - struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); - seq_printf(m, "%d", cpu); - - for (index = 0; index < CPUACCT_STAT_NSTATS; index++) { -#ifndef CONFIG_64BIT - /* - * Take rq->lock to make 64-bit read safe on 32-bit - * platforms. - */ - raw_spin_lock_irq(&cpu_rq(cpu)->lock); -#endif - - seq_printf(m, " %llu", cpuusage->usages[index]); - -#ifndef CONFIG_64BIT - raw_spin_unlock_irq(&cpu_rq(cpu)->lock); -#endif - } + for (index = 0; index < CPUACCT_STAT_NSTATS; index++) + seq_printf(m, " %llu", + cpuacct_cpuusage_read(ca, cpu, index)); seq_puts(m, "\n"); } return 0; @@ -339,16 +329,11 @@ static struct cftype files[] = { void cpuacct_charge(struct task_struct *tsk, u64 cputime) { struct cpuacct *ca; - int index = CPUACCT_STAT_SYSTEM; - struct pt_regs *regs = get_irq_regs() ? : task_pt_regs(tsk); - - if (regs && user_mode(regs)) - index = CPUACCT_STAT_USER; rcu_read_lock(); for (ca = task_ca(tsk); ca; ca = parent_ca(ca)) - __this_cpu_add(ca->cpuusage->usages[index], cputime); + __this_cpu_add(*ca->cpuusage, cputime); rcu_read_unlock(); } From 82756d8a23943bf80c3da55f4c20a1fdedfd40e4 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 29 Sep 2023 16:14:16 +0300 Subject: [PATCH 130/228] sched/cpuacct: Fix charge percpu cpuusage commit 248cc9993d1cc12b8e9ed716cc3fc09f6c3517dd upstream. The cpuacct_account_field() is always called by the current task itself, so it's ok to use __this_cpu_add() to charge the tick time. But cpuacct_charge() maybe called by update_curr() in load_balance() on a random CPU, different from the CPU on which the task is running. So __this_cpu_add() will charge that cputime to a random incorrect CPU. Fixes: 73e6aafd9ea8 ("sched/cpuacct: Simplify the cpuacct code") Reported-by: Minye Zhu Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220220051426.5274-1-zhouchengming@bytedance.com Signed-off-by: Ovidiu Panait Signed-off-by: Sasha Levin --- kernel/sched/cpuacct.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 8a260115a137..3c59c541dd31 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -328,12 +328,13 @@ static struct cftype files[] = { */ void cpuacct_charge(struct task_struct *tsk, u64 cputime) { + unsigned int cpu = task_cpu(tsk); struct cpuacct *ca; rcu_read_lock(); for (ca = task_ca(tsk); ca; ca = parent_ca(ca)) - __this_cpu_add(*ca->cpuusage, cputime); + *per_cpu_ptr(ca->cpuusage, cpu) += cputime; rcu_read_unlock(); } From b2788f6d492497e2b7014dab2387e9b3549a5ff9 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 29 Sep 2023 16:14:17 +0300 Subject: [PATCH 131/228] sched/cpuacct: Optimize away RCU read lock commit dc6e0818bc9a0336d9accf3ea35d146d72aa7a18 upstream. Since cpuacct_charge() is called from the scheduler update_curr(), we must already have rq lock held, then the RCU read lock can be optimized away. 
And do the same thing in its wrapper cgroup_account_cputime(), but we can't use lockdep_assert_rq_held() there, since it is defined in kernel/sched/sched.h. Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220220051426.5274-2-zhouchengming@bytedance.com [OP: adjusted lockdep_assert_rq_held() -> lockdep_assert_held()] Signed-off-by: Ovidiu Panait Signed-off-by: Sasha Levin --- include/linux/cgroup.h | 2 -- kernel/sched/cpuacct.c | 4 +--- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 959b370733f0..7653f5418950 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -779,11 +779,9 @@ static inline void cgroup_account_cputime(struct task_struct *task, cpuacct_charge(task, delta_exec); - rcu_read_lock(); cgrp = task_dfl_cgroup(task); if (cgroup_parent(cgrp)) __cgroup_account_cputime(cgrp, delta_exec); - rcu_read_unlock(); } static inline void cgroup_account_cputime_field(struct task_struct *task, diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 3c59c541dd31..8ee298321d78 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -331,12 +331,10 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime) unsigned int cpu = task_cpu(tsk); struct cpuacct *ca; - rcu_read_lock(); + lockdep_assert_held(&cpu_rq(cpu)->lock); for (ca = task_ca(tsk); ca; ca = parent_ca(ca)) *per_cpu_ptr(ca->cpuusage, cpu) += cputime; - - rcu_read_unlock(); } /* From 67025d56545099f4d6789be628bb2750e6064ac5 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 29 Sep 2023 16:14:18 +0300 Subject: [PATCH 132/228] cgroup: Fix suspicious rcu_dereference_check() usage warning commit f2aa197e4794bf4c2c0c9570684f86e6fa103e8b upstream. task_css_set_check() will use rcu_dereference_check() to check for rcu_read_lock_held() on the read-side, which is not true after commit dc6e0818bc9a ("sched/cpuacct: Optimize away RCU read lock"). That commit dropped the explicit rcu_read_lock() and changed to an RCU-sched read-side critical section. So fix the RCU warning by adding a check for rcu_read_lock_sched_held(). Fixes: dc6e0818bc9a ("sched/cpuacct: Optimize away RCU read lock") Reported-by: Linux Kernel Functional Testing Reported-by: syzbot+16e3f2c77e7c5a0113f9@syzkaller.appspotmail.com Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo Tested-by: Zhouyi Zhou Tested-by: Marek Szyprowski Link: https://lore.kernel.org/r/20220305034103.57123-1-zhouchengming@bytedance.com Signed-off-by: Ovidiu Panait Signed-off-by: Sasha Levin --- include/linux/cgroup.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 7653f5418950..c9c430712d47 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -451,6 +451,7 @@ extern struct mutex cgroup_mutex; extern spinlock_t css_set_lock; #define task_css_set_check(task, __c) \ rcu_dereference_check((task)->cgroups, \ + rcu_read_lock_sched_held() || \ lockdep_is_held(&cgroup_mutex) || \ lockdep_is_held(&css_set_lock) || \ ((task)->flags & PF_EXITING) || (__c)) From 38f82cf8609689b14cf3b52c7a2518344b4a070d Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 9 Jun 2021 13:40:17 -0500 Subject: [PATCH 133/228] ACPI: Check StorageD3Enable _DSD property in ACPI code [ Upstream commit 2744d7a0733503931b71c00d156119ced002f22c ] Although first implemented for NVME, this check may be usable by other drivers as well.
Microsoft's specification explicitly mentions that is may be usable by SATA and AHCI devices. Google also indicates that they have used this with SDHCI in a downstream kernel tree that a user can plug a storage device into. Link: https://docs.microsoft.com/en-us/windows-hardware/design/component-guidelines/power-management-for-storage-hardware-devices-intro Suggested-by: Keith Busch CC: Shyam-sundar S-k CC: Alexander Deucher CC: Rafael J. Wysocki CC: Prike Liang Signed-off-by: Mario Limonciello Reviewed-by: Rafael J. Wysocki Signed-off-by: Christoph Hellwig Stable-dep-of: dad651b2a44e ("nvme-pci: do not set the NUMA node of device if it has none") Signed-off-by: Sasha Levin --- drivers/acpi/device_pm.c | 29 +++++++++++++++++++++++++++++ drivers/nvme/host/pci.c | 28 +--------------------------- include/linux/acpi.h | 5 +++++ 3 files changed, 35 insertions(+), 27 deletions(-) diff --git a/drivers/acpi/device_pm.c b/drivers/acpi/device_pm.c index ecd2ddc2215f..66e53df75865 100644 --- a/drivers/acpi/device_pm.c +++ b/drivers/acpi/device_pm.c @@ -1326,4 +1326,33 @@ int acpi_dev_pm_attach(struct device *dev, bool power_on) return 1; } EXPORT_SYMBOL_GPL(acpi_dev_pm_attach); + +/** + * acpi_storage_d3 - Check if D3 should be used in the suspend path + * @dev: Device to check + * + * Return %true if the platform firmware wants @dev to be programmed + * into D3hot or D3cold (if supported) in the suspend path, or %false + * when there is no specific preference. On some platforms, if this + * hint is ignored, @dev may remain unresponsive after suspending the + * platform as a whole. + * + * Although the property has storage in the name it actually is + * applied to the PCIe slot and plugging in a non-storage device the + * same platform restrictions will likely apply. + */ +bool acpi_storage_d3(struct device *dev) +{ + struct acpi_device *adev = ACPI_COMPANION(dev); + u8 val; + + if (!adev) + return false; + if (fwnode_property_read_u8(acpi_fwnode_handle(adev), "StorageD3Enable", + &val)) + return false; + return val == 1; +} +EXPORT_SYMBOL_GPL(acpi_storage_d3); + #endif /* CONFIG_PM */ diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 3aaead9b3a57..e384ade6c2cd 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2840,32 +2840,6 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev) return 0; } -#ifdef CONFIG_ACPI -static bool nvme_acpi_storage_d3(struct pci_dev *dev) -{ - struct acpi_device *adev = ACPI_COMPANION(&dev->dev); - u8 val; - - /* - * Look for _DSD property specifying that the storage device on the port - * must use D3 to support deep platform power savings during - * suspend-to-idle. - */ - - if (!adev) - return false; - if (fwnode_property_read_u8(acpi_fwnode_handle(adev), "StorageD3Enable", - &val)) - return false; - return val == 1; -} -#else -static inline bool nvme_acpi_storage_d3(struct pci_dev *dev) -{ - return false; -} -#endif /* CONFIG_ACPI */ - static void nvme_async_probe(void *data, async_cookie_t cookie) { struct nvme_dev *dev = data; @@ -2915,7 +2889,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) quirks |= check_vendor_combination_bug(pdev); - if (!noacpi && nvme_acpi_storage_d3(pdev)) { + if (!noacpi && acpi_storage_d3(&pdev->dev)) { /* * Some systems use a bios work around to ask for D3 on * platforms that support kernel managed suspend. 
diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 96d69404a54f..9c184dbceba4 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1001,6 +1001,7 @@ int acpi_dev_resume(struct device *dev); int acpi_subsys_runtime_suspend(struct device *dev); int acpi_subsys_runtime_resume(struct device *dev); int acpi_dev_pm_attach(struct device *dev, bool power_on); +bool acpi_storage_d3(struct device *dev); #else static inline int acpi_subsys_runtime_suspend(struct device *dev) { return 0; } static inline int acpi_subsys_runtime_resume(struct device *dev) { return 0; } @@ -1008,6 +1009,10 @@ static inline int acpi_dev_pm_attach(struct device *dev, bool power_on) { return 0; } +static inline bool acpi_storage_d3(struct device *dev) +{ + return false; +} #endif #if defined(CONFIG_ACPI) && defined(CONFIG_PM_SLEEP) From 71357c751fb25c9c7ced051419dbddd02250b91d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Nov 2022 09:41:41 +0100 Subject: [PATCH 134/228] nvme-pci: factor the iod mempool creation into a helper [ Upstream commit 081a7d958ce4b65f9aab6e70e65b0b2e0b92297c ] Add a helper to create the iod mempool. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Tested-by Gerd Bayer Stable-dep-of: dad651b2a44e ("nvme-pci: do not set the NUMA node of device if it has none") Signed-off-by: Sasha Levin --- drivers/nvme/host/pci.c | 41 ++++++++++++++++++----------------------- 1 file changed, 18 insertions(+), 23 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index e384ade6c2cd..48886355ce90 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -387,14 +387,6 @@ static int nvme_pci_npages_sgl(void) NVME_CTRL_PAGE_SIZE); } -static size_t nvme_pci_iod_alloc_size(void) -{ - size_t npages = max(nvme_pci_npages_prp(), nvme_pci_npages_sgl()); - - return sizeof(__le64 *) * npages + - sizeof(struct scatterlist) * NVME_MAX_SEGS; -} - static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) { @@ -2557,6 +2549,22 @@ static void nvme_release_prp_pools(struct nvme_dev *dev) dma_pool_destroy(dev->prp_small_pool); } +static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev) +{ + size_t npages = max(nvme_pci_npages_prp(), nvme_pci_npages_sgl()); + size_t alloc_size = sizeof(__le64 *) * npages + + sizeof(struct scatterlist) * NVME_MAX_SEGS; + + WARN_ON_ONCE(alloc_size > PAGE_SIZE); + dev->iod_mempool = mempool_create_node(1, + mempool_kmalloc, mempool_kfree, + (void *)alloc_size, GFP_KERNEL, + dev_to_node(dev->dev)); + if (!dev->iod_mempool) + return -ENOMEM; + return 0; +} + static void nvme_free_tagset(struct nvme_dev *dev) { if (dev->tagset.tags) @@ -2854,7 +2862,6 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) int node, result = -ENOMEM; struct nvme_dev *dev; unsigned long quirks = id->driver_data; - size_t alloc_size; node = dev_to_node(&pdev->dev); if (node == NUMA_NO_NODE) @@ -2899,21 +2906,9 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) quirks |= NVME_QUIRK_SIMPLE_SUSPEND; } - /* - * Double check that our mempool alloc size will cover the biggest - * command we support. 
- */ - alloc_size = nvme_pci_iod_alloc_size(); - WARN_ON_ONCE(alloc_size > PAGE_SIZE); - - dev->iod_mempool = mempool_create_node(1, mempool_kmalloc, - mempool_kfree, - (void *) alloc_size, - GFP_KERNEL, node); - if (!dev->iod_mempool) { - result = -ENOMEM; + result = nvme_pci_alloc_iod_mempool(dev); + if (result) goto release_pools; - } result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops, quirks); From 97e148dcb97d2b1fefedc83bbc2238dada68d224 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Nov 2022 09:44:00 +0100 Subject: [PATCH 135/228] nvme-pci: factor out a nvme_pci_alloc_dev helper [ Upstream commit 2e87570be9d2746e7c4e7ab1cc18fd3ca7de2768 ] Add a helper that allocates the nvme_dev structure up to the point where we can call nvme_init_ctrl. This pairs with the free_ctrl method and can thus be used to cleanup the teardown path and make it more symmetric. Note that this now calls nvme_init_ctrl a lot earlier during probing, which also means the per-controller character device shows up earlier. Due to the controller state no commnds can be send on it, but it might make sense to delay the cdev registration until nvme_init_ctrl_finish. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Tested-by Gerd Bayer Stable-dep-of: dad651b2a44e ("nvme-pci: do not set the NUMA node of device if it has none") Signed-off-by: Sasha Levin --- drivers/nvme/host/pci.c | 83 +++++++++++++++++++++++------------------ 1 file changed, 47 insertions(+), 36 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 48886355ce90..c329f73dbbf3 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2572,6 +2572,7 @@ static void nvme_free_tagset(struct nvme_dev *dev) dev->ctrl.tagset = NULL; } +/* pairs with nvme_pci_alloc_dev */ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl) { struct nvme_dev *dev = to_nvme_dev(ctrl); @@ -2857,19 +2858,23 @@ static void nvme_async_probe(void *data, async_cookie_t cookie) nvme_put_ctrl(&dev->ctrl); } -static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) +static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev, + const struct pci_device_id *id) { - int node, result = -ENOMEM; - struct nvme_dev *dev; unsigned long quirks = id->driver_data; + int node = dev_to_node(&pdev->dev); + struct nvme_dev *dev; + int ret = -ENOMEM; - node = dev_to_node(&pdev->dev); if (node == NUMA_NO_NODE) set_dev_node(&pdev->dev, first_memory_node); dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node); if (!dev) - return -ENOMEM; + return NULL; + INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work); + INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work); + mutex_init(&dev->shutdown_lock); dev->nr_write_queues = write_queues; dev->nr_poll_queues = poll_queues; @@ -2877,25 +2882,11 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) dev->queues = kcalloc_node(dev->nr_allocated_queues, sizeof(struct nvme_queue), GFP_KERNEL, node); if (!dev->queues) - goto free; + goto out_free_dev; dev->dev = get_device(&pdev->dev); - pci_set_drvdata(pdev, dev); - - result = nvme_dev_map(dev); - if (result) - goto put_pci; - - INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work); - INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work); - mutex_init(&dev->shutdown_lock); - - result = nvme_setup_prp_pools(dev); - if (result) - goto unmap; quirks |= check_vendor_combination_bug(pdev); - if (!noacpi && acpi_storage_d3(&pdev->dev)) { /* * 
Some systems use a bios work around to ask for D3 on @@ -2905,34 +2896,54 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) "platform quirk: setting simple suspend\n"); quirks |= NVME_QUIRK_SIMPLE_SUSPEND; } + ret = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops, + quirks); + if (ret) + goto out_put_device; + return dev; + +out_put_device: + put_device(dev->dev); + kfree(dev->queues); +out_free_dev: + kfree(dev); + return ERR_PTR(ret); +} + +static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct nvme_dev *dev; + int result = -ENOMEM; + + dev = nvme_pci_alloc_dev(pdev, id); + if (!dev) + return -ENOMEM; + + result = nvme_dev_map(dev); + if (result) + goto out_uninit_ctrl; + + result = nvme_setup_prp_pools(dev); + if (result) + goto out_dev_unmap; result = nvme_pci_alloc_iod_mempool(dev); if (result) - goto release_pools; - - result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops, - quirks); - if (result) - goto release_mempool; + goto out_release_prp_pools; dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev)); + pci_set_drvdata(pdev, dev); nvme_reset_ctrl(&dev->ctrl); async_schedule(nvme_async_probe, dev); - return 0; - release_mempool: - mempool_destroy(dev->iod_mempool); - release_pools: +out_release_prp_pools: nvme_release_prp_pools(dev); - unmap: +out_dev_unmap: nvme_dev_unmap(dev); - put_pci: - put_device(dev->dev); - free: - kfree(dev->queues); - kfree(dev); +out_uninit_ctrl: + nvme_uninit_ctrl(&dev->ctrl); return result; } From 0d599a3f57a51014209ebd7a177da03fe9c6040f Mon Sep 17 00:00:00 2001 From: Pratyush Yadav Date: Tue, 12 Sep 2023 17:52:49 +0200 Subject: [PATCH 136/228] nvme-pci: do not set the NUMA node of device if it has none [ Upstream commit dad651b2a44eb6b201738f810254279dca29d30d ] If a device has no NUMA node information associated with it, the driver puts the device in node first_memory_node (say node 0). Not having a NUMA node and being associated with node 0 are completely different things and it makes little sense to mix the two. Signed-off-by: Pratyush Yadav Signed-off-by: Keith Busch Signed-off-by: Sasha Levin --- drivers/nvme/host/pci.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index c329f73dbbf3..7bb42d0e087a 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2866,9 +2866,6 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev, struct nvme_dev *dev; int ret = -ENOMEM; - if (node == NUMA_NO_NODE) - set_dev_node(&pdev->dev, first_memory_node); - dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node); if (!dev) return NULL; From 13b7d49f339a59de67ef721f9fd324b3acf94688 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2023 09:05:34 +0200 Subject: [PATCH 137/228] watchdog: iTCO_wdt: No need to stop the timer in probe commit 1ae3e78c08209ac657c59f6f7ea21bbbd7f6a1d4 upstream. The watchdog core can handle pinging of the watchdog before userspace opens the device. For this reason instead of stopping the timer, just mark it as running and let the watchdog core take care of it. 
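As a rough sketch of that idea (not the iTCO code itself; the helper name, parameters and the devm registration call are chosen here only for illustration), a driver that finds the hardware timer already ticking at probe time only has to report that fact to the watchdog core, which then pings the device until user space opens it:

    #include <linux/watchdog.h>

    /* Sketch: report an already-running hardware timer instead of stopping it. */
    static int example_wdt_register(struct device *dev,
                                    struct watchdog_device *wdd,
                                    bool hw_already_running)
    {
            if (hw_already_running)
                    set_bit(WDOG_HW_RUNNING, &wdd->status);

            /* The watchdog core keeps the timer fed until user space takes over. */
            return devm_watchdog_register_device(dev, wdd);
    }
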
Cc: Malin Jonsson Signed-off-by: Mika Westerberg Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20210921102900.61586-1-mika.westerberg@linux.intel.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck Signed-off-by: Sasha Levin --- drivers/watchdog/iTCO_wdt.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/watchdog/iTCO_wdt.c b/drivers/watchdog/iTCO_wdt.c index a370a185a41c..9048fa44897f 100644 --- a/drivers/watchdog/iTCO_wdt.c +++ b/drivers/watchdog/iTCO_wdt.c @@ -426,6 +426,16 @@ static unsigned int iTCO_wdt_get_timeleft(struct watchdog_device *wd_dev) return time_left; } +static void iTCO_wdt_set_running(struct iTCO_wdt_private *p) +{ + u16 val; + + /* Bit 11: TCO Timer Halt -> 0 = The TCO timer is * enabled */ + val = inw(TCO1_CNT(p)); + if (!(val & BIT(11))) + set_bit(WDOG_HW_RUNNING, &p->wddev.status); +} + /* * Kernel Interfaces */ @@ -568,8 +578,7 @@ static int iTCO_wdt_probe(struct platform_device *pdev) watchdog_set_drvdata(&p->wddev, p); platform_set_drvdata(pdev, p); - /* Make sure the watchdog is not running */ - iTCO_wdt_stop(&p->wddev); + iTCO_wdt_set_running(p); /* Check that the heartbeat value is within it's range; if not reset to the default */ From 152b8ac839c97ab1c26669869a9ca400e05807dc Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Oct 2023 09:05:35 +0200 Subject: [PATCH 138/228] watchdog: iTCO_wdt: Set NO_REBOOT if the watchdog is not already running MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit ef9b7bf52c2f47f0a9bf988543c577b92c92d15e upstream. Daniel reported that the commit 1ae3e78c0820 ("watchdog: iTCO_wdt: No need to stop the timer in probe") makes QEMU implementation of the iTCO watchdog not to trigger reboot anymore when NO_REBOOT flag is initially cleared using this option (in QEMU command line): -global ICH9-LPC.noreboot=false The problem with the commit is that it left the unconditional setting of NO_REBOOT that is not cleared anymore when the kernel keeps pinging the watchdog (as opposed to the previous code that called iTCO_wdt_stop() that cleared it). Fix this so that we only set NO_REBOOT if the watchdog was not initially running. Fixes: 1ae3e78c0820 ("watchdog: iTCO_wdt: No need to stop the timer in probe") Reported-by: Daniel P. Berrangé Signed-off-by: Mika Westerberg Tested-by: Daniel P. Berrangé Reviewed-by: Daniel P. 
Berrangé Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20221028062750.45451-1-mika.westerberg@linux.intel.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck Signed-off-by: Sasha Levin --- drivers/watchdog/iTCO_wdt.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/drivers/watchdog/iTCO_wdt.c b/drivers/watchdog/iTCO_wdt.c index 9048fa44897f..50c874d48860 100644 --- a/drivers/watchdog/iTCO_wdt.c +++ b/drivers/watchdog/iTCO_wdt.c @@ -426,14 +426,18 @@ static unsigned int iTCO_wdt_get_timeleft(struct watchdog_device *wd_dev) return time_left; } -static void iTCO_wdt_set_running(struct iTCO_wdt_private *p) +/* Returns true if the watchdog was running */ +static bool iTCO_wdt_set_running(struct iTCO_wdt_private *p) { u16 val; - /* Bit 11: TCO Timer Halt -> 0 = The TCO timer is * enabled */ + /* Bit 11: TCO Timer Halt -> 0 = The TCO timer is enabled */ val = inw(TCO1_CNT(p)); - if (!(val & BIT(11))) + if (!(val & BIT(11))) { set_bit(WDOG_HW_RUNNING, &p->wddev.status); + return true; + } + return false; } /* @@ -524,9 +528,6 @@ static int iTCO_wdt_probe(struct platform_device *pdev) return -ENODEV; /* Cannot reset NO_REBOOT bit */ } - /* Set the NO_REBOOT bit to prevent later reboots, just for sure */ - p->update_no_reboot_bit(p->no_reboot_priv, true); - if (turn_SMI_watchdog_clear_off >= p->iTCO_version) { /* * Bit 13: TCO_EN -> 0 @@ -578,7 +579,13 @@ static int iTCO_wdt_probe(struct platform_device *pdev) watchdog_set_drvdata(&p->wddev, p); platform_set_drvdata(pdev, p); - iTCO_wdt_set_running(p); + if (!iTCO_wdt_set_running(p)) { + /* + * If the watchdog was not running set NO_REBOOT now to + * prevent later reboots. + */ + p->update_no_reboot_bit(p->no_reboot_priv, true); + } /* Check that the heartbeat value is within it's range; if not reset to the default */ From e18216cd0ec7fef9bb87cf8b4124bc1212e5add1 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 11 Jun 2021 19:06:45 +0200 Subject: [PATCH 139/228] netfilter: nft_exthdr: Search chunks in SCTP packets only [ Upstream commit 5acc44f39458f43dac9724cefa4da29847cfe997 ] Since user space does not generate a payload dependency, plain sctp chunk matches cause searching in non-SCTP packets, too. Avoid this potential mis-interpretation of packet data by checking pkt->tprot. 
Fixes: 133dc203d77df ("netfilter: nft_exthdr: Support SCTP chunks") Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- net/netfilter/nft_exthdr.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index b4682aeabab9..274c5f008518 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -371,6 +371,9 @@ static void nft_exthdr_sctp_eval(const struct nft_expr *expr, const struct sctp_chunkhdr *sch; struct sctp_chunkhdr _sch; + if (pkt->tprot != IPPROTO_SCTP) + goto err; + do { sch = skb_header_pointer(pkt->skb, offset, sizeof(_sch), &_sch); if (!sch || !sch->length) @@ -391,7 +394,7 @@ static void nft_exthdr_sctp_eval(const struct nft_expr *expr, } offset += SCTP_PAD4(ntohs(sch->length)); } while (offset < pkt->skb->len); - +err: if (priv->flags & NFT_EXTHDR_F_PRESENT) nft_reg_store8(dest, false); else From fb28f89d50c0fad6da851b98526368c3709f17be Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Tue, 8 Jun 2021 11:40:57 +0200 Subject: [PATCH 140/228] netfilter: nft_exthdr: Fix for unsafe packet data read [ Upstream commit cf6b5ffdce5a78b2fcb0e53b3a2487c490bcbf7f ] While iterating through an SCTP packet's chunks, skb_header_pointer() is called for the minimum expected chunk header size. If (that part of) the skbuff is non-linear, the following memcpy() may read data past temporary buffer '_sch'. Use skb_copy_bits() instead which does the right thing in this situation. Fixes: 133dc203d77df ("netfilter: nft_exthdr: Support SCTP chunks") Suggested-by: Florian Westphal Signed-off-by: Phil Sutter Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- net/netfilter/nft_exthdr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index 274c5f008518..eb183c024ac4 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -389,7 +389,9 @@ static void nft_exthdr_sctp_eval(const struct nft_expr *expr, break; dest[priv->len / NFT_REG32_SIZE] = 0; - memcpy(dest, (char *)sch + priv->offset, priv->len); + if (skb_copy_bits(pkt->skb, offset + priv->offset, + dest, priv->len) < 0) + break; return; } offset += SCTP_PAD4(ntohs(sch->length)); From 831f18c735e2fc16bf7bfb1ec41cc30e6be9eb0f Mon Sep 17 00:00:00 2001 From: Irvin Cote Date: Thu, 9 Feb 2023 17:43:57 -0300 Subject: [PATCH 141/228] nvme-pci: always return an ERR_PTR from nvme_pci_alloc_dev [ Upstream commit dc785d69d753a3894c93afc23b91404652382ead ] Don't mix NULL and ERR_PTR returns. 
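The one-line fix enforces a common kernel convention; as a minimal, hypothetical sketch (struct foo and both helpers are invented for illustration, they are not nvme functions), a constructor that can fail should return either a valid pointer or an ERR_PTR()-encoded errno, never a mix of NULL and ERR_PTR(), so every caller can use IS_ERR()/PTR_ERR() uniformly:

    #include <linux/err.h>
    #include <linux/slab.h>

    struct foo { int dummy; };                 /* hypothetical type */

    static struct foo *foo_alloc(void)         /* hypothetical constructor */
    {
            struct foo *f = kzalloc(sizeof(*f), GFP_KERNEL);

            if (!f)
                    return ERR_PTR(-ENOMEM);   /* never a bare NULL */
            return f;
    }

    static int foo_probe(void)
    {
            struct foo *f = foo_alloc();

            if (IS_ERR(f))                     /* one error check in callers */
                    return PTR_ERR(f);
            kfree(f);
            return 0;
    }
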
Fixes: 2e87570be9d2 ("nvme-pci: factor out a nvme_pci_alloc_dev helper") Signed-off-by: Irvin Cote Reviewed-by: Keith Busch Signed-off-by: Christoph Hellwig Signed-off-by: Sasha Levin --- drivers/nvme/host/pci.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 7bb42d0e087a..9c67ebd4eac3 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2868,7 +2868,7 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev, dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node); if (!dev) - return NULL; + return ERR_PTR(-ENOMEM); INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work); INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work); mutex_init(&dev->shutdown_lock); @@ -2913,8 +2913,8 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) int result = -ENOMEM; dev = nvme_pci_alloc_dev(pdev, id); - if (!dev) - return -ENOMEM; + if (IS_ERR(dev)) + return PTR_ERR(dev); result = nvme_dev_map(dev); if (result) From 41de7a6b95df8b22c77750421a2f9dfe241df731 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Mon, 8 May 2023 19:02:34 +0200 Subject: [PATCH 142/228] smack: Record transmuting in smk_transmuted [ Upstream commit 2c085f3a8f23c9b444e8b99d93c15d7ce870fc4e ] smack_dentry_create_files_as() determines whether transmuting should occur based on the label of the parent directory the new inode will be added to, and not the label of the directory where it is created. This helps for example to do transmuting on overlayfs, since the latter first creates the inode in the working directory, and then moves it to the correct destination. However, despite smack_dentry_create_files_as() provides the correct label, smack_inode_init_security() does not know from passed information whether or not transmuting occurred. Without this information, smack_inode_init_security() cannot set SMK_INODE_CHANGED in smk_flags, which will result in the SMACK64TRANSMUTE xattr not being set in smack_d_instantiate(). Thus, add the smk_transmuted field to the task_smack structure, and set it in smack_dentry_create_files_as() to smk_task if transmuting occurred. If smk_task is equal to smk_transmuted in smack_inode_init_security(), act as if transmuting was successful but without taking the label from the parent directory (the inode label was already set correctly from the current credentials in smack_inode_alloc_security()). 
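The hand-off between the two hooks can be sketched like this (types and names are deliberately simplified and are not the Smack structures; this only illustrates the record-then-compare idea): the earlier hook records the transmuting decision in the cred blob, and the later hook detects it by comparing the two labels.

    /* Sketch of the mechanism only. */
    struct task_smack_sketch {
            void *smk_task;         /* label used for access control */
            void *smk_transmuted;   /* set equal to smk_task when transmuting */
    };

    static void record_transmute(struct task_smack_sketch *tsp, void *dir_label)
    {
            tsp->smk_task = dir_label;
            tsp->smk_transmuted = tsp->smk_task;
    }

    static bool transmute_already_done(const struct task_smack_sketch *tsp)
    {
            return tsp->smk_task == tsp->smk_transmuted;
    }
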
Signed-off-by: Roberto Sassu Signed-off-by: Casey Schaufler Signed-off-by: Sasha Levin --- security/smack/smack.h | 1 + security/smack/smack_lsm.c | 41 +++++++++++++++++++++++++++----------- 2 files changed, 30 insertions(+), 12 deletions(-) diff --git a/security/smack/smack.h b/security/smack/smack.h index a9768b12716b..b5187915e074 100644 --- a/security/smack/smack.h +++ b/security/smack/smack.h @@ -120,6 +120,7 @@ struct inode_smack { struct task_smack { struct smack_known *smk_task; /* label for access control */ struct smack_known *smk_forked; /* label when forked */ + struct smack_known *smk_transmuted;/* label when transmuted */ struct list_head smk_rules; /* per task access rules */ struct mutex smk_rules_lock; /* lock for the rules */ struct list_head smk_relabel; /* transit allowed labels */ diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index b36b8668f1f4..e7f6f55bbae2 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -972,8 +972,9 @@ static int smack_inode_init_security(struct inode *inode, struct inode *dir, const struct qstr *qstr, const char **name, void **value, size_t *len) { + struct task_smack *tsp = smack_cred(current_cred()); struct inode_smack *issp = smack_inode(inode); - struct smack_known *skp = smk_of_current(); + struct smack_known *skp = smk_of_task(tsp); struct smack_known *isp = smk_of_inode(inode); struct smack_known *dsp = smk_of_inode(dir); int may; @@ -982,20 +983,34 @@ static int smack_inode_init_security(struct inode *inode, struct inode *dir, *name = XATTR_SMACK_SUFFIX; if (value && len) { - rcu_read_lock(); - may = smk_access_entry(skp->smk_known, dsp->smk_known, - &skp->smk_rules); - rcu_read_unlock(); + /* + * If equal, transmuting already occurred in + * smack_dentry_create_files_as(). No need to check again. + */ + if (tsp->smk_task != tsp->smk_transmuted) { + rcu_read_lock(); + may = smk_access_entry(skp->smk_known, dsp->smk_known, + &skp->smk_rules); + rcu_read_unlock(); + } /* - * If the access rule allows transmutation and - * the directory requests transmutation then - * by all means transmute. + * In addition to having smk_task equal to smk_transmuted, + * if the access rule allows transmutation and the directory + * requests transmutation then by all means transmute. * Mark the inode as changed. */ - if (may > 0 && ((may & MAY_TRANSMUTE) != 0) && - smk_inode_transmutable(dir)) { - isp = dsp; + if ((tsp->smk_task == tsp->smk_transmuted) || + (may > 0 && ((may & MAY_TRANSMUTE) != 0) && + smk_inode_transmutable(dir))) { + /* + * The caller of smack_dentry_create_files_as() + * should have overridden the current cred, so the + * inode label was already set correctly in + * smack_inode_alloc_security(). + */ + if (tsp->smk_task != tsp->smk_transmuted) + isp = dsp; issp->smk_flags |= SMK_INODE_CHANGED; } @@ -4685,8 +4700,10 @@ static int smack_dentry_create_files_as(struct dentry *dentry, int mode, * providing access is transmuting use the containing * directory label instead of the process label. 
*/ - if (may > 0 && (may & MAY_TRANSMUTE)) + if (may > 0 && (may & MAY_TRANSMUTE)) { ntsp->smk_task = isp->smk_inode; + ntsp->smk_transmuted = ntsp->smk_task; + } } return 0; } From 297c51c63fe156ef75eff20ec39b6a0b6c42d758 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Mon, 8 May 2023 19:02:33 +0200 Subject: [PATCH 143/228] smack: Retrieve transmuting information in smack_inode_getsecurity() [ Upstream commit 3a3d8fce31a49363cc31880dce5e3b0617c9c38b ] Enhance smack_inode_getsecurity() to retrieve the value for SMACK64TRANSMUTE from the inode security blob, similarly to SMACK64. This helps to display accurate values in the situation where the security labels come from mount options and not from xattrs. Signed-off-by: Roberto Sassu Signed-off-by: Casey Schaufler Signed-off-by: Sasha Levin --- security/smack/smack_lsm.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index e7f6f55bbae2..6bfc3c8d9310 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -1444,10 +1444,19 @@ static int smack_inode_getsecurity(struct inode *inode, struct super_block *sbp; struct inode *ip = (struct inode *)inode; struct smack_known *isp; + struct inode_smack *ispp; + size_t label_len; + char *label = NULL; - if (strcmp(name, XATTR_SMACK_SUFFIX) == 0) + if (strcmp(name, XATTR_SMACK_SUFFIX) == 0) { isp = smk_of_inode(inode); - else { + } else if (strcmp(name, XATTR_SMACK_TRANSMUTE) == 0) { + ispp = smack_inode(inode); + if (ispp->smk_flags & SMK_INODE_TRANSMUTE) + label = TRANS_TRUE; + else + label = ""; + } else { /* * The rest of the Smack xattrs are only on sockets. */ @@ -1469,13 +1478,18 @@ static int smack_inode_getsecurity(struct inode *inode, return -EOPNOTSUPP; } + if (!label) + label = isp->smk_known; + + label_len = strlen(label); + if (alloc) { - *buffer = kstrdup(isp->smk_known, GFP_KERNEL); + *buffer = kstrdup(label, GFP_KERNEL); if (*buffer == NULL) return -ENOMEM; } - return strlen(isp->smk_known); + return label_len; } From 14443223e08c8bf4f2bf535e9551505143ced256 Mon Sep 17 00:00:00 2001 From: Vishal Goel Date: Fri, 17 Sep 2021 13:08:14 +0530 Subject: [PATCH 144/228] Smack:- Use overlay inode label in smack_inode_copy_up() [ Upstream commit 387ef964460f14fe1c1ea29aba70e22731ea7cf7 ] Currently in "smack_inode_copy_up()" function, process label is changed with the label on parent inode. Due to which, process is assigned directory label and whatever file or directory created by the process are also getting directory label which is wrong label. Changes has been done to use label of overlay inode instead of parent inode. 
Signed-off-by: Vishal Goel Signed-off-by: Casey Schaufler Signed-off-by: Sasha Levin --- security/smack/smack_lsm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 6bfc3c8d9310..814518ad4402 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -4663,7 +4663,7 @@ static int smack_inode_copy_up(struct dentry *dentry, struct cred **new) /* * Get label from overlay inode and set it in create_sid */ - isp = smack_inode(d_inode(dentry->d_parent)); + isp = smack_inode(d_inode(dentry)); skp = isp->smk_inode; tsp->smk_task = skp; *new = new_creds; From 6d5c8862932d31a810b6545f7d69ecc124402c6e Mon Sep 17 00:00:00 2001 From: Daniel Starke Date: Thu, 14 Sep 2023 07:15:07 +0200 Subject: [PATCH 145/228] Revert "tty: n_gsm: fix UAF in gsm_cleanup_mux" commit 29346e217b8ab8a52889b88f00b268278d6b7668 upstream. This reverts commit 9b9c8195f3f0d74a826077fc1c01b9ee74907239. The commit above is reverted as it did not solve the original issue. gsm_cleanup_mux() tries to free up the virtual ttys by calling gsm_dlci_release() for each available DLCI. There, dlci_put() is called to decrease the reference counter for the DLCI via tty_port_put() which finally calls gsm_dlci_free(). This already clears the pointer which is being checked in gsm_cleanup_mux() before calling gsm_dlci_release(). Therefore, it is not necessary to clear this pointer in gsm_cleanup_mux() as done in the reverted commit. The commit introduces a null pointer dereference: ? __die+0x1f/0x70 ? page_fault_oops+0x156/0x420 ? search_exception_tables+0x37/0x50 ? fixup_exception+0x21/0x310 ? exc_page_fault+0x69/0x150 ? asm_exc_page_fault+0x26/0x30 ? tty_port_put+0x19/0xa0 gsmtty_cleanup+0x29/0x80 [n_gsm] release_one_tty+0x37/0xe0 process_one_work+0x1e6/0x3e0 worker_thread+0x4c/0x3d0 ? __pfx_worker_thread+0x10/0x10 kthread+0xe1/0x110 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x2f/0x50 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1b/0x30 The actual issue is that nothing guards dlci_put() from being called multiple times while the tty driver was triggered but did not yet finished calling gsm_dlci_free(). Fixes: 9b9c8195f3f0 ("tty: n_gsm: fix UAF in gsm_cleanup_mux") Cc: stable Signed-off-by: Daniel Starke Link: https://lore.kernel.org/r/20230914051507.3240-1-daniel.starke@siemens.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/n_gsm.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/tty/n_gsm.c b/drivers/tty/n_gsm.c index 94c963462d74..3693ad9f4521 100644 --- a/drivers/tty/n_gsm.c +++ b/drivers/tty/n_gsm.c @@ -2179,10 +2179,8 @@ static void gsm_cleanup_mux(struct gsm_mux *gsm, bool disc) /* Free up any link layer users and finally the control channel */ for (i = NUM_DLCI - 1; i >= 0; i--) - if (gsm->dlci[i]) { + if (gsm->dlci[i]) gsm_dlci_release(gsm->dlci[i]); - gsm->dlci[i] = NULL; - } mutex_unlock(&gsm->mutex); /* Now wipe the queues */ tty_ldisc_flush(gsm->tty); From e14afa4450cb7e4cf93e993a765801203d41d014 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 1 Sep 2023 01:25:55 +0300 Subject: [PATCH 146/228] serial: 8250_port: Check IRQ data before use commit cce7fc8b29961b64fadb1ce398dc5ff32a79643b upstream. In case the leaf driver wants to use IRQ polling (irq = 0) and IIR register shows that an interrupt happened in the 8250 hardware the IRQ data can be NULL. In such a case we need to skip the wake event as we came to this path from the timer interrupt and quite likely system is already awake. 
Without this fix we have got an Oops: serial8250: ttyS0 at I/O 0x3f8 (irq = 0, base_baud = 115200) is a 16550A ... BUG: kernel NULL pointer dereference, address: 0000000000000010 RIP: 0010:serial8250_handle_irq+0x7c/0x240 Call Trace: ? serial8250_handle_irq+0x7c/0x240 ? __pfx_serial8250_timeout+0x10/0x10 Fixes: 0ba9e3a13c6a ("serial: 8250: Add missing wakeup event reporting") Cc: stable Signed-off-by: Andy Shevchenko Reviewed-by: Florian Fainelli Link: https://lore.kernel.org/r/20230831222555.614426-1-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_port.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c index 7499954c9aa7..8b49ac4856d2 100644 --- a/drivers/tty/serial/8250/8250_port.c +++ b/drivers/tty/serial/8250/8250_port.c @@ -1914,7 +1914,10 @@ int serial8250_handle_irq(struct uart_port *port, unsigned int iir) skip_rx = true; if (status & (UART_LSR_DR | UART_LSR_BI) && !skip_rx) { - if (irqd_is_wakeup_set(irq_get_irq_data(port->irq))) + struct irq_data *d; + + d = irq_get_irq_data(port->irq); + if (d && irqd_is_wakeup_set(d)) pm_wakeup_event(tport->tty->dev, 0); if (!up->dma || handle_rx_dma(up, iir)) status = serial8250_rx_chars(up, status); From 7130a87ca32396eb9bf48b71a2d42259ae44c6c7 Mon Sep 17 00:00:00 2001 From: Pan Bian Date: Thu, 21 Sep 2023 23:17:31 +0900 Subject: [PATCH 147/228] nilfs2: fix potential use after free in nilfs_gccache_submit_read_data() commit 7ee29facd8a9c5a26079148e36bcf07141b3a6bc upstream. In nilfs_gccache_submit_read_data(), brelse(bh) is called to drop the reference count of bh when the call to nilfs_dat_translate() fails. If the reference count hits 0 and its owner page gets unlocked, bh may be freed. However, bh->b_page is dereferenced to put the page after that, which may result in a use-after-free bug. This patch moves the release operation after unlocking and putting the page. NOTE: The function in question is only called in GC, and in combination with current userland tools, address translation using DAT does not occur in that function, so the code path that causes this issue will not be executed. However, it is possible to run that code path by intentionally modifying the userland GC library or by calling the GC ioctl directly. 
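The ordering rule behind the fix can be sketched generically (illustrative only; it mirrors the reordering in the diff below rather than adding anything new): the buffer_head reference is what keeps bh->b_page safe to touch, so it must be dropped only after the last dereference of the page.

    #include <linux/buffer_head.h>
    #include <linux/pagemap.h>

    /* Sketch: finish all uses of bh->b_page before dropping the bh reference. */
    static int example_failed_path(struct buffer_head *bh, int err)
    {
            unlock_page(bh->b_page);
            put_page(bh->b_page);
            if (unlikely(err))
                    brelse(bh);     /* last step: bh (and its page pin) may go away */
            return err;
    }
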
[konishi.ryusuke@gmail.com: NOTE added to the commit log] Link: https://lkml.kernel.org/r/1543201709-53191-1-git-send-email-bianpan2016@163.com Link: https://lkml.kernel.org/r/20230921141731.10073-1-konishi.ryusuke@gmail.com Fixes: a3d93f709e89 ("nilfs2: block cache for garbage collection") Signed-off-by: Pan Bian Reported-by: Ferry Meng Closes: https://lkml.kernel.org/r/20230818092022.111054-1-mengferry@linux.alibaba.com Signed-off-by: Ryusuke Konishi Tested-by: Ryusuke Konishi Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- fs/nilfs2/gcinode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index aadea660c66c..b0077f5f7112 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -73,10 +73,8 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff, struct the_nilfs *nilfs = inode->i_sb->s_fs_info; err = nilfs_dat_translate(nilfs->ns_dat, vbn, &pbn); - if (unlikely(err)) { /* -EIO, -ENOMEM, -ENOENT */ - brelse(bh); + if (unlikely(err)) /* -EIO, -ENOMEM, -ENOENT */ goto failed; - } } lock_buffer(bh); @@ -102,6 +100,8 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff, failed: unlock_page(bh->b_page); put_page(bh->b_page); + if (unlikely(err)) + brelse(bh); return err; } From 5a03b42ae1ed646eb5f5acceff1fb2b1d85ec077 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 7 Sep 2023 08:22:33 +0200 Subject: [PATCH 148/228] netfilter: nf_tables: disallow rule removal from chain binding [ Upstream commit f15f29fd4779be8a418b66e9d52979bb6d6c2325 ] Chain binding only requires the rule addition/insertion command within the same transaction. Removal of rules from chain bindings within the same transaction makes no sense, userspace does not utilize this feature. Replace nft_chain_is_bound() check to nft_chain_binding() in rule deletion commands. Replace command implies a rule deletion, reject this command too. Rule flush command can also safely rely on this nft_chain_binding() check because unbound chains are not allowed since 62e1e94b246e ("netfilter: nf_tables: reject unbound chain set before commit phase"). 
Fixes: d0e2c7de92c7 ("netfilter: nf_tables: add NFT_CHAIN_BINDING") Reported-by: Kevin Rich Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- net/netfilter/nf_tables_api.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 16e2500e8590..78b268bd7f01 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1268,7 +1268,7 @@ static int nft_flush_table(struct nft_ctx *ctx) if (!nft_is_active_next(ctx->net, chain)) continue; - if (nft_chain_is_bound(chain)) + if (nft_chain_binding(chain)) continue; ctx->chain = chain; @@ -1312,7 +1312,7 @@ static int nft_flush_table(struct nft_ctx *ctx) if (!nft_is_active_next(ctx->net, chain)) continue; - if (nft_chain_is_bound(chain)) + if (nft_chain_binding(chain)) continue; ctx->chain = chain; @@ -2599,6 +2599,9 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk, return PTR_ERR(chain); } + if (nft_chain_binding(chain)) + return -EOPNOTSUPP; + if (nlh->nlmsg_flags & NLM_F_NONREC && chain->use > 0) return -EBUSY; @@ -3498,6 +3501,11 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, } if (nlh->nlmsg_flags & NLM_F_REPLACE) { + if (nft_chain_binding(chain)) { + err = -EOPNOTSUPP; + goto err_destroy_flow_rule; + } + trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule); if (trans == NULL) { err = -ENOMEM; @@ -3606,7 +3614,7 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk, NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); return PTR_ERR(chain); } - if (nft_chain_is_bound(chain)) + if (nft_chain_binding(chain)) return -EOPNOTSUPP; } @@ -3636,7 +3644,7 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk, list_for_each_entry(chain, &table->chains, list) { if (!nft_is_active_next(net, chain)) continue; - if (nft_chain_is_bound(chain)) + if (nft_chain_binding(chain)) continue; ctx.chain = chain; @@ -9529,7 +9537,7 @@ static void __nft_release_table(struct net *net, struct nft_table *table) ctx.family = table->family; ctx.table = table; list_for_each_entry(chain, &table->chains, list) { - if (nft_chain_is_bound(chain)) + if (nft_chain_binding(chain)) continue; ctx.chain = chain; From 25872c67de20d2f33d7c48ef4e38296858bfcc21 Mon Sep 17 00:00:00 2001 From: Kailang Yang Date: Thu, 7 Sep 2023 15:24:34 +0800 Subject: [PATCH 149/228] ALSA: hda: Disable power save for solving pop issue on Lenovo ThinkCentre M70q commit 057a28ef93bdbe84326d34cdb5543afdaab49fe1 upstream. Lenovo ThinkCentre M70q had boot up pop noise. Disable power save will solve pop issue. 
Signed-off-by: Kailang Yang Cc: Link: https://lore.kernel.org/r/315900e2efef42fd9855eacfeb443abd@realtek.com Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/hda_intel.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index 1f641712233e..dfef761d5521 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -2271,6 +2271,7 @@ static const struct snd_pci_quirk power_save_denylist[] = { SND_PCI_QUIRK(0x8086, 0x2068, "Intel NUC7i3BNB", 0), /* https://bugzilla.kernel.org/show_bug.cgi?id=198611 */ SND_PCI_QUIRK(0x17aa, 0x2227, "Lenovo X1 Carbon 3rd Gen", 0), + SND_PCI_QUIRK(0x17aa, 0x316e, "Lenovo ThinkCentre M70q", 0), /* https://bugzilla.redhat.com/show_bug.cgi?id=1689623 */ SND_PCI_QUIRK(0x17aa, 0x367b, "Lenovo IdeaCentre B550", 0), /* https://bugzilla.redhat.com/show_bug.cgi?id=1572975 */ From b664e9db8d2c5e5d4e85277acd0815802a0d76fc Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Mon, 18 Sep 2023 22:24:50 +0200 Subject: [PATCH 150/228] ata: libata-scsi: ignore reserved bits for REPORT SUPPORTED OPERATION CODES commit 3ef600923521616ebe192c893468ad0424de2afb upstream. For REPORT SUPPORTED OPERATION CODES command, the service action field is defined as bits 0-4 in the second byte in the CDB. Bits 5-7 in the second byte are reserved. Only look at the service action field in the second byte when determining if the MAINTENANCE IN opcode is a REPORT SUPPORTED OPERATION CODES command. This matches how we only look at the service action field in the second byte when determining if the SERVICE ACTION IN(16) opcode is a READ CAPACITY(16) command (reserved bits 5-7 in the second byte are ignored). Fixes: 7b2030942859 ("libata: Add support for SCT Write Same") Cc: stable@vger.kernel.org Signed-off-by: Niklas Cassel Signed-off-by: Damien Le Moal Signed-off-by: Greg Kroah-Hartman --- drivers/ata/libata-scsi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index dfa090ccd21c..36f32fa052df 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -4259,7 +4259,7 @@ void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd) break; case MAINTENANCE_IN: - if (scsicmd[1] == MI_REPORT_SUPPORTED_OPERATION_CODES) + if ((scsicmd[1] & 0x1f) == MI_REPORT_SUPPORTED_OPERATION_CODES) ata_scsi_rbuf_fill(&args, ata_scsiop_maint_in); else ata_scsi_set_invalid_field(dev, cmd, 1, 0xff); From e3b8c9e0fc3c467dbf5179eecb869f6a5f5723dd Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 14 Sep 2023 23:08:44 +0200 Subject: [PATCH 151/228] i2c: i801: unregister tco_pdev in i801_probe() error path commit 3914784553f68c931fc666dbe7e86fe881aada38 upstream. We have to unregister tco_pdev also if i2c_add_adapter() fails. 
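The underlying rule is the usual probe() error-path symmetry; a minimal, self-contained sketch (the register_*/unregister_* helpers are placeholders, not i801 functions, standing in for platform_device_register()/i2c_add_adapter() and friends): everything set up before the failing step has to be torn down again, in reverse order of setup.

    static int  register_a(void)   { return 0; }   /* placeholder */
    static void unregister_a(void) { }             /* placeholder */
    static int  register_b(void)   { return 0; }   /* placeholder */

    static int example_probe(void)
    {
            int err;

            err = register_a();
            if (err)
                    return err;

            err = register_b();
            if (err) {
                    unregister_a();   /* undo the earlier step, as the fix does */
                    return err;
            }
            return 0;
    }
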
Fixes: 9424693035a5 ("i2c: i801: Create iTCO device on newer Intel PCHs") Cc: stable@vger.kernel.org Signed-off-by: Heiner Kallweit Reviewed-by: Mika Westerberg Reviewed-by: Jean Delvare Signed-off-by: Wolfram Sang Signed-off-by: Greg Kroah-Hartman --- drivers/i2c/busses/i2c-i801.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/i2c/busses/i2c-i801.c b/drivers/i2c/busses/i2c-i801.c index 45682d30d705..4aec451d7251 100644 --- a/drivers/i2c/busses/i2c-i801.c +++ b/drivers/i2c/busses/i2c-i801.c @@ -1907,6 +1907,7 @@ static int i801_probe(struct pci_dev *dev, const struct pci_device_id *id) "SMBus I801 adapter at %04lx", priv->smba); err = i2c_add_adapter(&priv->adapter); if (err) { + platform_device_unregister(priv->tco_pdev); i801_acpi_remove(priv); return err; } From acc7fc82d0addac5ae0998bb447401e44babbb92 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 17 Sep 2023 19:26:46 -0400 Subject: [PATCH 152/228] Revert "SUNRPC dont update timeout value on connection reset" commit a275ab62606bcd894ddff09460f7d253828313dc upstream. This reverts commit 88428cc4ae7abcc879295fbb19373dd76aad2bdd. The problem this commit is intended to fix was comprehensively fixed in commit 7de62bc09fe6 ("SUNRPC dont update timeout value on connection reset"). Since then, this commit has been preventing the correct timeout of soft mounted requests. Cc: stable@vger.kernel.org # 5.9.x: 09252177d5f9: SUNRPC: Handle major timeout in xprt_adjust_timeout() Cc: stable@vger.kernel.org # 5.9.x: 7de62bc09fe6: SUNRPC dont update timeout value on connection reset Cc: stable@vger.kernel.org # 5.9.x Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker Signed-off-by: Greg Kroah-Hartman --- net/sunrpc/clnt.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index e9a3fca4aedc..c7c1754f8744 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2354,8 +2354,7 @@ call_status(struct rpc_task *task) goto out_exit; } task->tk_action = call_encode; - if (status != -ECONNRESET && status != -ECONNABORTED) - rpc_check_timeout(task); + rpc_check_timeout(task); return; out_exit: rpc_call_rpcerror(task, status); From b7a0df4c08778e1398413a94f19a6319851b97ff Mon Sep 17 00:00:00 2001 From: Ben Wolsieffer Date: Thu, 14 Sep 2023 12:30:20 -0400 Subject: [PATCH 153/228] proc: nommu: /proc//maps: release mmap read lock commit 578d7699e5c2add8c2e9549d9d75dfb56c460cb3 upstream. The no-MMU implementation of /proc//map doesn't normally release the mmap read lock, because it uses !IS_ERR_OR_NULL(_vml) to determine whether to release the lock. Since _vml is NULL when the end of the mappings is reached, the lock is not released. Reading /proc/1/maps twice doesn't cause a hang because it only takes the read lock, which can be taken multiple times and therefore doesn't show any problem if the lock isn't released. Instead, you need to perform some operation that attempts to take the write lock after reading /proc//maps. To actually reproduce the bug, compile the following code as 'proc_maps_bug': #include #include #include int main(int argc, char *argv[]) { void *buf; sleep(1); buf = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); puts("mmap returned"); return 0; } Then, run: ./proc_maps_bug &; cat /proc/$!/maps; fg Without this patch, mmap() will hang and the command will never complete. This code was incorrectly adapted from the MMU implementation, which at the time released the lock in m_next() before returning the last entry. 
The MMU implementation has diverged further from the no-MMU version since then, so this patch brings their locking and error handling into sync, fixing the bug and hopefully avoiding similar issues in the future. Link: https://lkml.kernel.org/r/20230914163019.4050530-2-ben.wolsieffer@hefring.com Fixes: 47fecca15c09 ("fs/proc/task_nommu.c: don't use priv->task->mm") Signed-off-by: Ben Wolsieffer Acked-by: Oleg Nesterov Cc: Giulio Benetti Cc: Greg Ungerer Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- fs/proc/task_nommu.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index a6d21fc0033c..97f387d30e74 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -208,11 +208,16 @@ static void *m_start(struct seq_file *m, loff_t *pos) return ERR_PTR(-ESRCH); mm = priv->mm; - if (!mm || !mmget_not_zero(mm)) + if (!mm || !mmget_not_zero(mm)) { + put_task_struct(priv->task); + priv->task = NULL; return NULL; + } if (mmap_read_lock_killable(mm)) { mmput(mm); + put_task_struct(priv->task); + priv->task = NULL; return ERR_PTR(-EINTR); } @@ -221,23 +226,21 @@ static void *m_start(struct seq_file *m, loff_t *pos) if (n-- == 0) return p; - mmap_read_unlock(mm); - mmput(mm); return NULL; } -static void m_stop(struct seq_file *m, void *_vml) +static void m_stop(struct seq_file *m, void *v) { struct proc_maps_private *priv = m->private; + struct mm_struct *mm = priv->mm; - if (!IS_ERR_OR_NULL(_vml)) { - mmap_read_unlock(priv->mm); - mmput(priv->mm); - } - if (priv->task) { - put_task_struct(priv->task); - priv->task = NULL; - } + if (!priv->task) + return; + + mmap_read_unlock(mm); + mmput(mm); + put_task_struct(priv->task); + priv->task = NULL; } static void *m_next(struct seq_file *m, void *_p, loff_t *pos) From ef47f25e98de9574c5b7ab15570a9854821c97de Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Fri, 29 Sep 2023 18:01:13 -0400 Subject: [PATCH 154/228] ring-buffer: Update "shortest_full" in polling commit 1e0cb399c7653462d9dadf8ab9425337c355d358 upstream. It was discovered that the ring buffer polling was incorrectly stating that read would not block, but that's because polling did not take into account that reads will block if the "buffer-percent" was set. Instead, the ring buffer polling would say reads would not block if there was any data in the ring buffer. This was incorrect behavior from a user space point of view. This was fixed by commit 42fb0a1e84ff by having the polling code check if the ring buffer had more data than what the user specified "buffer percent" had. The problem now is that the polling code did not register itself to the writer that it wanted to wait for a specific "full" value of the ring buffer. The result was that the writer would wake the polling waiter whenever there was a new event. The polling waiter would then wake up, see that there's not enough data in the ring buffer to notify user space and then go back to sleep. The next event would wake it up again. Before the polling fix was added, the code would wake up around 100 times for a hackbench 30 benchmark. After the "fix", due to the constant waking of the writer, it would wake up over 11,0000 times! It would never leave the kernel, so the user space behavior was still "correct", but this definitely is not the desired effect. 
To fix this, have the polling code add what it's waiting for to the "shortest_full" variable, to tell the writer not to wake it up if the buffer is not as full as it expects to be. Note, after this fix, it appears that the waiter is now woken up around 2x the times it was before (~200). This is a tremendous improvement from the 11,000 times, but I will need to spend some time to see why polling is more aggressive in its wakeups than the read blocking code. Link: https://lore.kernel.org/linux-trace-kernel/20230929180113.01c2cae3@rorschach.local.home Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Fixes: 42fb0a1e84ff ("tracing/ring-buffer: Have polling block on watermark") Reported-by: Julia Lawall Tested-by: Julia Lawall Signed-off-by: Steven Rostedt (Google) Signed-off-by: Greg Kroah-Hartman --- kernel/trace/ring_buffer.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 812ec380da82..7d9af09bb006 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1008,6 +1008,9 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu, if (full) { poll_wait(filp, &work->full_waiters, poll_table); work->full_waiters_pending = true; + if (!cpu_buffer->shortest_full || + cpu_buffer->shortest_full > full) + cpu_buffer->shortest_full = full; } else { poll_wait(filp, &work->waiters, poll_table); work->waiters_pending = true; From cbbfdb4bab700a72f681de2e49a8c22780b2bdf9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 18 Sep 2023 10:34:51 -0400 Subject: [PATCH 155/228] btrfs: properly report 0 avail for very full file systems commit 58bfe2ccec5f9f137b41dd38f335290dcc13cd5c upstream. A user reported some issues with smaller file systems that get very full. While investigating this issue I noticed that df wasn't showing 100% full, despite having 0 chunk space and having < 1MiB of available metadata space. This turns out to be an overflow issue, we're doing: total_available_metadata_space - SZ_4M < global_block_rsv_size to determine if there's not enough space to make metadata allocations, which overflows if total_available_metadata_space is < 4M. Fix this by checking to see if our available space is greater than the 4M threshold. This makes df properly report 100% usage on the file system. CC: stable@vger.kernel.org # 4.14+ Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index b33505330e33..ea731fa8bd35 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2267,7 +2267,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) * calculated f_bavail. */ if (!mixed && block_rsv->space_info->full && - total_free_meta - thresh < block_rsv->size) + (total_free_meta < thresh || total_free_meta - thresh < block_rsv->size)) buf->f_bavail = 0; buf->f_type = BTRFS_SUPER_MAGIC; From 6a80578bd4410a3ed95b91048675b36e7611cec2 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 15 Sep 2023 10:34:27 -0700 Subject: [PATCH 156/228] bpf: Fix BTF_ID symbol generation collision commit 8f908db77782630c45ba29dac35c434b5ce0b730 upstream. Marcus and Satya reported an issue where BTF_ID macro generates same symbol in separate objects and that breaks final vmlinux link. 
ld.lld: error: ld-temp.o :14577:1: symbol '__BTF_ID__struct__cgroup__624' is already defined This can be triggered under specific configs when __COUNTER__ happens to be the same for the same symbol in two different translation units, which is already quite unlikely to happen. Add __LINE__ number suffix to make BTF_ID symbol more unique, which is not a complete fix, but it would help for now and meanwhile we can work on better solution as suggested by Andrii. Cc: stable@vger.kernel.org Reported-by: Satya Durga Srinivasu Prabhala Reported-by: Marcus Seyfarth Closes: https://github.com/ClangBuiltLinux/linux/issues/1913 Debugged-by: Nathan Chancellor Link: https://lore.kernel.org/bpf/CAEf4Bzb5KQ2_LmhN769ifMeSJaWfebccUasQOfQKaOd0nQ51tw@mail.gmail.com/ Signed-off-by: Jiri Olsa Signed-off-by: Nick Desaulniers Reviewed-by: Nathan Chancellor Link: https://lore.kernel.org/r/20230915-bpf_collision-v3-1-263fc519c21f@google.com Signed-off-by: Alexei Starovoitov Signed-off-by: Greg Kroah-Hartman --- include/linux/btf_ids.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index 57890b357f85..eca91e7a4d39 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -38,7 +38,7 @@ asm( \ ____BTF_ID(symbol) #define __ID(prefix) \ - __PASTE(prefix, __COUNTER__) + __PASTE(__PASTE(prefix, __COUNTER__), __LINE__) /* * The BTF_ID defines unique symbol for each ID pointing From 72595dbfcae3b3eac6ebdc1143b77a5130517817 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Fri, 15 Sep 2023 10:34:28 -0700 Subject: [PATCH 157/228] bpf: Fix BTF_ID symbol generation collision in tools/ commit c0bb9fb0e52a64601d38b3739b729d9138d4c8a1 upstream. Marcus and Satya reported an issue where BTF_ID macro generates same symbol in separate objects and that breaks final vmlinux link. ld.lld: error: ld-temp.o :14577:1: symbol '__BTF_ID__struct__cgroup__624' is already defined This can be triggered under specific configs when __COUNTER__ happens to be the same for the same symbol in two different translation units, which is already quite unlikely to happen. Add __LINE__ number suffix to make BTF_ID symbol more unique, which is not a complete fix, but it would help for now and meanwhile we can work on better solution as suggested by Andrii. 
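To see why pasting __LINE__ on top of __COUNTER__ helps, here is an illustrative expansion (the paste macros are copied in simplified form after compiler_types.h; the concrete numbers are made up):

    #define ___PASTE(a, b) a##b
    #define __PASTE(a, b)  ___PASTE(a, b)
    #define __ID(prefix)   __PASTE(__PASTE(prefix, __COUNTER__), __LINE__)

    /*
     * In one translation unit, __ID(__BTF_ID__struct__cgroup__) might expand
     * to __BTF_ID__struct__cgroup__624117 (counter 624, line 117). Another
     * unit that also happens to reach counter 624 almost certainly does so on
     * a different line, so the two local symbols no longer collide when the
     * objects are linked into vmlinux.
     */
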
Cc: stable@vger.kernel.org Reported-by: Satya Durga Srinivasu Prabhala Reported-by: Marcus Seyfarth Closes: https://github.com/ClangBuiltLinux/linux/issues/1913 Debugged-by: Nathan Chancellor Co-developed-by: Jiri Olsa Link: https://lore.kernel.org/bpf/CAEf4Bzb5KQ2_LmhN769ifMeSJaWfebccUasQOfQKaOd0nQ51tw@mail.gmail.com/ Signed-off-by: Nick Desaulniers Link: https://lore.kernel.org/r/20230915-bpf_collision-v3-2-263fc519c21f@google.com Signed-off-by: Alexei Starovoitov Signed-off-by: Greg Kroah-Hartman --- tools/include/linux/btf_ids.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/include/linux/btf_ids.h b/tools/include/linux/btf_ids.h index 57890b357f85..eca91e7a4d39 100644 --- a/tools/include/linux/btf_ids.h +++ b/tools/include/linux/btf_ids.h @@ -38,7 +38,7 @@ asm( \ ____BTF_ID(symbol) #define __ID(prefix) \ - __PASTE(prefix, __COUNTER__) + __PASTE(__PASTE(prefix, __COUNTER__), __LINE__) /* * The BTF_ID defines unique symbol for each ID pointing From eaf4496662213a6ebd10441a0bde86031bdc775d Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Wed, 13 Sep 2023 08:26:47 +0300 Subject: [PATCH 158/228] net: thunderbolt: Fix TCPv6 GSO checksum calculation commit e0b65f9b81fef180cf5f103adecbe5505c961153 upstream. Alex reported that running ssh over IPv6 does not work with Thunderbolt/USB4 networking driver. The reason for that is that driver should call skb_is_gso() before calling skb_is_gso_v6(), and it should not return false after calculates the checksum successfully. This probably was a copy paste error from the original driver where it was done properly. Reported-by: Alex Balcanquall Fixes: e69b6c02b4c3 ("net: Add support for networking over Thunderbolt cable") Cc: stable@vger.kernel.org Signed-off-by: Mika Westerberg Reviewed-by: Eric Dumazet Reviewed-by: Jiri Pirko Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/thunderbolt.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/thunderbolt.c b/drivers/net/thunderbolt.c index 5d96dc1b00b3..e05bcf86306c 100644 --- a/drivers/net/thunderbolt.c +++ b/drivers/net/thunderbolt.c @@ -958,12 +958,11 @@ static bool tbnet_xmit_csum_and_map(struct tbnet *net, struct sk_buff *skb, *tucso = ~csum_tcpudp_magic(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 0, ip_hdr(skb)->protocol, 0); - } else if (skb_is_gso_v6(skb)) { + } else if (skb_is_gso(skb) && skb_is_gso_v6(skb)) { tucso = dest + ((void *)&(tcp_hdr(skb)->check) - data); *tucso = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, 0, IPPROTO_TCP, 0); - return false; } else if (protocol == htons(ETH_P_IPV6)) { tucso = dest + skb_checksum_start_offset(skb) + skb->csum_offset; *tucso = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr, From 99d308c31923773683bcd768237e90e8e7d4ca4a Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 4 Sep 2023 20:38:13 +0900 Subject: [PATCH 159/228] ata: libata-core: Fix ata_port_request_pm() locking commit 3b8e0af4a7a331d1510e963b8fd77e2fca0a77f1 upstream. The function ata_port_request_pm() checks the port flag ATA_PFLAG_PM_PENDING and calls ata_port_wait_eh() if this flag is set to ensure that power management operations for a port are not scheduled simultaneously. However, this flag check is done without holding the port lock. Fix this by taking the port lock on entry to the function and checking the flag under this lock. The lock is released and re-taken if ata_port_wait_eh() needs to be called. 
The two WARN_ON() macros checking that the ATA_PFLAG_PM_PENDING flag was cleared are removed as the first call is racy and the second one done without holding the port lock. Fixes: 5ef41082912b ("ata: add ata port system PM callbacks") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Tested-by: Chia-Lin Kao (AceLan) Reviewed-by: Niklas Cassel Tested-by: Geert Uytterhoeven Reviewed-by: Martin K. Petersen Reviewed-by: Bart Van Assche Signed-off-by: Greg Kroah-Hartman --- drivers/ata/libata-core.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 14150767be44..256d0ebd1ac5 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -4974,17 +4974,19 @@ static void ata_port_request_pm(struct ata_port *ap, pm_message_t mesg, struct ata_link *link; unsigned long flags; - /* Previous resume operation might still be in - * progress. Wait for PM_PENDING to clear. - */ - if (ap->pflags & ATA_PFLAG_PM_PENDING) { - ata_port_wait_eh(ap); - WARN_ON(ap->pflags & ATA_PFLAG_PM_PENDING); - } - - /* request PM ops to EH */ spin_lock_irqsave(ap->lock, flags); + /* + * A previous PM operation might still be in progress. Wait for + * ATA_PFLAG_PM_PENDING to clear. + */ + if (ap->pflags & ATA_PFLAG_PM_PENDING) { + spin_unlock_irqrestore(ap->lock, flags); + ata_port_wait_eh(ap); + spin_lock_irqsave(ap->lock, flags); + } + + /* Request PM operation to EH */ ap->pm_mesg = mesg; ap->pflags |= ATA_PFLAG_PM_PENDING; ata_for_each_link(link, ap, HOST_FIRST) { @@ -4996,10 +4998,8 @@ static void ata_port_request_pm(struct ata_port *ap, pm_message_t mesg, spin_unlock_irqrestore(ap->lock, flags); - if (!async) { + if (!async) ata_port_wait_eh(ap); - WARN_ON(ap->pflags & ATA_PFLAG_PM_PENDING); - } } /* From 531d9f6dbfd5027c7a70d679403684c696533c24 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sat, 26 Aug 2023 13:07:36 +0900 Subject: [PATCH 160/228] ata: libata-core: Fix port and device removal commit 84d76529c650f887f1e18caee72d6f0589e1baf9 upstream. Whenever an ATA adapter driver is removed (e.g. rmmod), ata_port_detach() is called repeatedly for all the adapter ports to remove (unload) the devices attached to the port and delete the port device itself. Removing of devices is done using libata EH with the ATA_PFLAG_UNLOADING port flag set. This causes libata EH to execute ata_eh_unload() which disables all devices attached to the port. ata_port_detach() finishes by calling scsi_remove_host() to remove the scsi host associated with the port. This function will trigger the removal of all scsi devices attached to the host and in the case of disks, calls to sd_shutdown() which will flush the device write cache and stop the device. However, given that the devices were already disabled by ata_eh_unload(), the synchronize write cache command and start stop unit commands fail. E.g. running "rmmod ahci" with first removing sd_mod results in error messages like: ata13.00: disable device sd 0:0:0:0: [sda] Synchronizing SCSI cache sd 0:0:0:0: [sda] Synchronize Cache(10) failed: Result: hostbyte=DID_BAD_TARGET driverbyte=DRIVER_OK sd 0:0:0:0: [sda] Stopping disk sd 0:0:0:0: [sda] Start/Stop Unit failed: Result: hostbyte=DID_BAD_TARGET driverbyte=DRIVER_OK Fix this by removing all scsi devices of the ata devices connected to the port before scheduling libata EH to disable the ATA devices. 
Fixes: 720ba12620ee ("[PATCH] libata-hp: update unload-unplug") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Niklas Cassel Tested-by: Chia-Lin Kao (AceLan) Tested-by: Geert Uytterhoeven Reviewed-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/ata/libata-core.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 256d0ebd1ac5..b37d62dfe727 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -5915,11 +5915,30 @@ static void ata_port_detach(struct ata_port *ap) if (!ap->ops->error_handler) goto skip_eh; - /* tell EH we're leaving & flush EH */ + /* Wait for any ongoing EH */ + ata_port_wait_eh(ap); + + mutex_lock(&ap->scsi_scan_mutex); spin_lock_irqsave(ap->lock, flags); + + /* Remove scsi devices */ + ata_for_each_link(link, ap, HOST_FIRST) { + ata_for_each_dev(dev, link, ALL) { + if (dev->sdev) { + spin_unlock_irqrestore(ap->lock, flags); + scsi_remove_device(dev->sdev); + spin_lock_irqsave(ap->lock, flags); + dev->sdev = NULL; + } + } + } + + /* Tell EH to disable all devices */ ap->pflags |= ATA_PFLAG_UNLOADING; ata_port_schedule_eh(ap); + spin_unlock_irqrestore(ap->lock, flags); + mutex_unlock(&ap->scsi_scan_mutex); /* wait till EH commits suicide */ ata_port_wait_eh(ap); From dc0bd0f2da5c530b45da538d02a2314a1eb3e189 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 8 Sep 2023 20:04:52 +0900 Subject: [PATCH 161/228] ata: libata-core: Do not register PM operations for SAS ports commit 75e2bd5f1ede42a2bc88aa34b431e1ace8e0bea0 upstream. libsas does its own domain based power management of ports. For such ports, libata should not use a device type defining power management operations as executing these operations for suspend/resume in addition to libsas calls to ata_sas_port_suspend() and ata_sas_port_resume() is not necessary (and likely dangerous to do, even though problems are not seen currently). Introduce the new ata_port_sas_type device_type for ports managed by libsas. This new device type is used in ata_tport_add() and is defined without power management operations. Fixes: 2fcbdcb4c802 ("[SCSI] libata: export ata_port suspend/resume infrastructure for sas") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Tested-by: Chia-Lin Kao (AceLan) Tested-by: Geert Uytterhoeven Reviewed-by: John Garry Reviewed-by: Martin K. 
Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/ata/libata-core.c | 2 +- drivers/ata/libata-transport.c | 9 ++++++++- drivers/ata/libata.h | 2 ++ 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index b37d62dfe727..702b8e061b36 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -5167,7 +5167,7 @@ EXPORT_SYMBOL_GPL(ata_host_resume); #endif const struct device_type ata_port_type = { - .name = "ata_port", + .name = ATA_PORT_TYPE_NAME, #ifdef CONFIG_PM .pm = &ata_port_pm_ops, #endif diff --git a/drivers/ata/libata-transport.c b/drivers/ata/libata-transport.c index 31a66fc0c31d..513b37942422 100644 --- a/drivers/ata/libata-transport.c +++ b/drivers/ata/libata-transport.c @@ -266,6 +266,10 @@ void ata_tport_delete(struct ata_port *ap) put_device(dev); } +static const struct device_type ata_port_sas_type = { + .name = ATA_PORT_TYPE_NAME, +}; + /** ata_tport_add - initialize a transport ATA port structure * * @parent: parent device @@ -283,7 +287,10 @@ int ata_tport_add(struct device *parent, struct device *dev = &ap->tdev; device_initialize(dev); - dev->type = &ata_port_type; + if (ap->flags & ATA_FLAG_SAS_HOST) + dev->type = &ata_port_sas_type; + else + dev->type = &ata_port_type; dev->parent = parent; ata_host_get(ap->host); diff --git a/drivers/ata/libata.h b/drivers/ata/libata.h index 68cdd81d747c..bf71bd9e66cd 100644 --- a/drivers/ata/libata.h +++ b/drivers/ata/libata.h @@ -30,6 +30,8 @@ enum { ATA_DNXFER_QUIET = (1 << 31), }; +#define ATA_PORT_TYPE_NAME "ata_port" + extern atomic_t ata_print_id; extern int atapi_passthru16; extern int libata_fua; From 8ec1abb59a9888b101c0928f8dde92b900bcb6e7 Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Fri, 22 Sep 2023 22:55:16 +0200 Subject: [PATCH 162/228] ata: libata-sata: increase PMP SRST timeout to 10s commit 753a4d531bc518633ea88ac0ed02b25a16823d51 upstream. On certain SATA controllers, softreset fails after wakeup from S2RAM with the message "softreset failed (1st FIS failed)", sometimes resulting in drives not being detected again. With the increased timeout, this issue is avoided. Instead, "softreset failed (device not ready)" is now logged 1-2 times; this later failure seems to cause fewer problems however, and the drives are detected reliably once they've spun up and the probe is retried. The issue was observed with the primary SATA controller of the QNAP TS-453B, which is an "Intel Corporation Celeron/Pentium Silver Processor SATA Controller [8086:31e3] (rev 06)" integrated in the Celeron J4125 CPU, and the following drives: - Seagate IronWolf ST12000VN0008 - Seagate IronWolf ST8000NE0004 The SATA controller seems to be more relevant to this issue than the drives, as the same drives are always detected reliably on the secondary SATA controller on the same board (an ASMedia 106x) without any "softreset failed" errors even without the increased timeout. Fixes: e7d3ef13d52a ("libata: change drive ready wait after hard reset to 5s") Cc: stable@vger.kernel.org Signed-off-by: Matthias Schiffer Signed-off-by: Damien Le Moal Signed-off-by: Greg Kroah-Hartman --- include/linux/libata.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/libata.h b/include/linux/libata.h index 2de6b4a61394..1ceec830d5f7 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -297,7 +297,7 @@ enum { * advised to wait only for the following duration before * doing SRST. 
*/ - ATA_TMOUT_PMP_SRST_WAIT = 5000, + ATA_TMOUT_PMP_SRST_WAIT = 10000, /* When the LPM policy is set to ATA_LPM_MAX_POWER, there might * be a spurious PHY event, so ignore the first PHY event that From ae03dafc3761e1f5e81cc2c9494fb6b50bdc2617 Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Thu, 7 Sep 2023 11:18:08 +1000 Subject: [PATCH 163/228] fs: binfmt_elf_efpic: fix personality for ELF-FDPIC commit 7c3151585730b7095287be8162b846d31e6eee61 upstream. The elf-fdpic loader hard sets the process personality to either PER_LINUX_FDPIC for true elf-fdpic binaries or to PER_LINUX for normal ELF binaries (in this case they would be constant displacement compiled with -pie for example). The problem with that is that it will lose any other bits that may be in the ELF header personality (such as the "bug emulation" bits). On the ARM architecture the ADDR_LIMIT_32BIT flag is used to signify a normal 32bit binary - as opposed to a legacy 26bit address binary. This matters since start_thread() will set the ARM CPSR register as required based on this flag. If the elf-fdpic loader loses this bit the process will be mis-configured and crash out pretty quickly. Modify elf-fdpic loader personality setting so that it preserves the upper three bytes by using the SET_PERSONALITY macro to set it. This macro in the generic case sets PER_LINUX and preserves the upper bytes. Architectures can override this for their specific use case, and ARM does exactly this. The problem shows up quite easily running under qemu using the ARM architecture, but not necessarily on all types of real ARM hardware. If the underlying ARM processor does not support the legacy 26-bit addressing mode then everything will work as expected. Link: https://lkml.kernel.org/r/20230907011808.2985083-1-gerg@kernel.org Fixes: 1bde925d23547 ("fs/binfmt_elf_fdpic.c: provide NOMMU loader for regular ELF binaries") Signed-off-by: Greg Ungerer Cc: Al Viro Cc: Christian Brauner Cc: Eric W. Biederman Cc: Greg Ungerer Cc: Kees Cook Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- fs/binfmt_elf_fdpic.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 8c3b7cc8e2a1..48bb1290ed2f 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -345,10 +345,9 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm) /* there's now no turning back... the old userspace image is dead, * defunct, deceased, etc. */ + SET_PERSONALITY(exec_params.hdr); if (elf_check_fdpic(&exec_params.hdr)) - set_personality(PER_LINUX_FDPIC); - else - set_personality(PER_LINUX); + current->personality |= PER_LINUX_FDPIC; if (elf_read_implies_exec(&exec_params.hdr, executable_stack)) current->personality |= READ_IMPLIES_EXEC; From 2cdec9c13f81313dd9f41f09c7cdecbfa4bea91d Mon Sep 17 00:00:00 2001 From: Dinghao Liu Date: Thu, 15 Apr 2021 15:46:44 +0800 Subject: [PATCH 164/228] spi: spi-zynqmp-gqspi: Fix runtime PM imbalance in zynqmp_qspi_probe [ Upstream commit a21fbc42807b15b74b0891bd557063e6acf4fcae ] When platform_get_irq() fails, a pairing PM usage counter increment is needed to keep the counter balanced. It's the same for the following error paths. 
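The invariant behind this fix — every exit from probe must leave the runtime-PM usage counter exactly where it found it — can be shown with a standalone C sketch using the kernel's goto-unwind idiom. Here usage_count, acquire() and release() are illustrative stand-ins, not the real pm_runtime_* API:

  #include <errno.h>
  #include <stdio.h>

  static int usage_count;                         /* stand-in for the PM usage counter */

  static int acquire(void) { usage_count++; return 0; }    /* like a pm_runtime get */
  static void release(void) { usage_count--; }             /* like a pm_runtime put */

  static int fake_get_irq(void) { return -1; }             /* simulate platform_get_irq() failing */

  static int probe(void)
  {
          int ret = acquire();

          if (ret)
                  return ret;

          if (fake_get_irq() < 0) {
                  ret = -ENXIO;
                  goto err_put;           /* must undo the acquire() above */
          }

          release();                      /* normal path drops the reference too */
          return 0;

  err_put:
          release();                      /* keeps the counter balanced on error */
          return ret;
  }

  int main(void)
  {
          probe();
          printf("usage_count after probe: %d (expect 0)\n", usage_count);
          return 0;
  }

Whichever path probe() takes, the counter ends at zero; a missing release() on any error path is exactly the kind of imbalance the patch repairs.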
Signed-off-by: Dinghao Liu Link: https://lore.kernel.org/r/20210408092559.3824-1-dinghao.liu@zju.edu.cn Signed-off-by: Mark Brown Stable-dep-of: 1527b076ae2c ("spi: zynqmp-gqspi: fix clock imbalance on probe failure") Signed-off-by: Sasha Levin --- drivers/spi/spi-zynqmp-gqspi.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/spi/spi-zynqmp-gqspi.c b/drivers/spi/spi-zynqmp-gqspi.c index 3d3ac48243eb..ed68e237314f 100644 --- a/drivers/spi/spi-zynqmp-gqspi.c +++ b/drivers/spi/spi-zynqmp-gqspi.c @@ -1147,11 +1147,16 @@ static int zynqmp_qspi_probe(struct platform_device *pdev) pm_runtime_set_autosuspend_delay(&pdev->dev, SPI_AUTOSUSPEND_TIMEOUT); pm_runtime_set_active(&pdev->dev); pm_runtime_enable(&pdev->dev); + + ret = pm_runtime_get_sync(&pdev->dev); + if (ret < 0) { + dev_err(&pdev->dev, "Failed to pm_runtime_get_sync: %d\n", ret); + goto clk_dis_all; + } + /* QSPI controller initializations */ zynqmp_qspi_init_hw(xqspi); - pm_runtime_mark_last_busy(&pdev->dev); - pm_runtime_put_autosuspend(&pdev->dev); xqspi->irq = platform_get_irq(pdev, 0); if (xqspi->irq <= 0) { ret = -ENXIO; @@ -1178,6 +1183,7 @@ static int zynqmp_qspi_probe(struct platform_device *pdev) ctlr->mode_bits = SPI_CPOL | SPI_CPHA | SPI_RX_DUAL | SPI_RX_QUAD | SPI_TX_DUAL | SPI_TX_QUAD; ctlr->dev.of_node = np; + ctlr->auto_runtime_pm = true; ret = devm_spi_register_controller(&pdev->dev, ctlr); if (ret) { @@ -1185,9 +1191,13 @@ static int zynqmp_qspi_probe(struct platform_device *pdev) goto clk_dis_all; } + pm_runtime_mark_last_busy(&pdev->dev); + pm_runtime_put_autosuspend(&pdev->dev); + return 0; clk_dis_all: + pm_runtime_put_sync(&pdev->dev); pm_runtime_set_suspended(&pdev->dev); pm_runtime_disable(&pdev->dev); clk_disable_unprepare(xqspi->refclk); From 19f3d5d13b756b913be582a9e0d0afdeca9c397e Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 22 Jun 2023 10:24:35 +0200 Subject: [PATCH 165/228] spi: zynqmp-gqspi: fix clock imbalance on probe failure [ Upstream commit 1527b076ae2cb6a9c590a02725ed39399fcad1cf ] Make sure that the device is not runtime suspended before explicitly disabling the clocks on probe failure and on driver unbind to avoid a clock enable-count imbalance. 
Fixes: 9e3a000362ae ("spi: zynqmp: Add pm runtime support") Cc: stable@vger.kernel.org # 4.19 Cc: Naga Sureshkumar Relli Cc: Shubhrajyoti Datta Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/Message-Id: <20230622082435.7873-1-johan+linaro@kernel.org> Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- drivers/spi/spi-zynqmp-gqspi.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/spi/spi-zynqmp-gqspi.c b/drivers/spi/spi-zynqmp-gqspi.c index ed68e237314f..12d9c5d6b9e2 100644 --- a/drivers/spi/spi-zynqmp-gqspi.c +++ b/drivers/spi/spi-zynqmp-gqspi.c @@ -1197,9 +1197,9 @@ static int zynqmp_qspi_probe(struct platform_device *pdev) return 0; clk_dis_all: - pm_runtime_put_sync(&pdev->dev); - pm_runtime_set_suspended(&pdev->dev); pm_runtime_disable(&pdev->dev); + pm_runtime_put_noidle(&pdev->dev); + pm_runtime_set_suspended(&pdev->dev); clk_disable_unprepare(xqspi->refclk); clk_dis_pclk: clk_disable_unprepare(xqspi->pclk); @@ -1223,11 +1223,15 @@ static int zynqmp_qspi_remove(struct platform_device *pdev) { struct zynqmp_qspi *xqspi = platform_get_drvdata(pdev); + pm_runtime_get_sync(&pdev->dev); + zynqmp_gqspi_write(xqspi, GQSPI_EN_OFST, 0x0); + + pm_runtime_disable(&pdev->dev); + pm_runtime_put_noidle(&pdev->dev); + pm_runtime_set_suspended(&pdev->dev); clk_disable_unprepare(xqspi->refclk); clk_disable_unprepare(xqspi->pclk); - pm_runtime_set_suspended(&pdev->dev); - pm_runtime_disable(&pdev->dev); return 0; } From 598539f38c72aa569f6af93f4857e8ee3d2a5dc4 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Thu, 20 Apr 2023 12:17:35 -0400 Subject: [PATCH 166/228] NFS: Cleanup unused rpc_clnt variable [ Upstream commit e025f0a73f6acb920d86549b2177a5883535421d ] The root rpc_clnt is not used here, clean it up. Fixes: 4dc73c679114 ("NFSv4: keep state manager thread active if swap is enabled") Signed-off-by: Benjamin Coddington Reviewed-by: NeilBrown Signed-off-by: Anna Schumaker Stable-dep-of: 956fd46f97d2 ("NFSv4: Fix a state manager thread deadlock regression") Signed-off-by: Sasha Levin --- fs/nfs/nfs4state.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index ff6ca05a9d44..3fcef19e9198 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1212,10 +1212,6 @@ void nfs4_schedule_state_manager(struct nfs_client *clp) { struct task_struct *task; char buf[INET6_ADDRSTRLEN + sizeof("-manager") + 1]; - struct rpc_clnt *cl = clp->cl_rpcclient; - - while (cl != cl->cl_parent) - cl = cl->cl_parent; set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); if (test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state) != 0) { From ec4325e80633c61cd82605901a4fc4b3f429dba7 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Thu, 15 Jun 2023 14:07:22 -0400 Subject: [PATCH 167/228] NFS: rename nfs_client_kset to nfs_kset [ Upstream commit 8b18a2edecc0741b0eecf8b18fdb356a0f8682de ] Be brief and match the subsystem name. There's no need to distinguish this kset variable from the server. 
Signed-off-by: Benjamin Coddington Signed-off-by: Trond Myklebust Stable-dep-of: 956fd46f97d2 ("NFSv4: Fix a state manager thread deadlock regression") Signed-off-by: Sasha Levin --- fs/nfs/sysfs.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c index 8cb70755e3c9..f7f778e3e5ca 100644 --- a/fs/nfs/sysfs.c +++ b/fs/nfs/sysfs.c @@ -18,7 +18,7 @@ #include "sysfs.h" struct kobject *nfs_client_kobj; -static struct kset *nfs_client_kset; +static struct kset *nfs_kset; static void nfs_netns_object_release(struct kobject *kobj) { @@ -55,13 +55,13 @@ static struct kobject *nfs_netns_object_alloc(const char *name, int nfs_sysfs_init(void) { - nfs_client_kset = kset_create_and_add("nfs", NULL, fs_kobj); - if (!nfs_client_kset) + nfs_kset = kset_create_and_add("nfs", NULL, fs_kobj); + if (!nfs_kset) return -ENOMEM; - nfs_client_kobj = nfs_netns_object_alloc("net", nfs_client_kset, NULL); + nfs_client_kobj = nfs_netns_object_alloc("net", nfs_kset, NULL); if (!nfs_client_kobj) { - kset_unregister(nfs_client_kset); - nfs_client_kset = NULL; + kset_unregister(nfs_kset); + nfs_kset = NULL; return -ENOMEM; } return 0; @@ -70,7 +70,7 @@ int nfs_sysfs_init(void) void nfs_sysfs_exit(void) { kobject_put(nfs_client_kobj); - kset_unregister(nfs_client_kset); + kset_unregister(nfs_kset); } static ssize_t nfs_netns_identifier_show(struct kobject *kobj, @@ -158,7 +158,7 @@ static struct nfs_netns_client *nfs_netns_client_alloc(struct kobject *parent, p = kzalloc(sizeof(*p), GFP_KERNEL); if (p) { p->net = net; - p->kobject.kset = nfs_client_kset; + p->kobject.kset = nfs_kset; if (kobject_init_and_add(&p->kobject, &nfs_netns_client_type, parent, "nfs_client") == 0) return p; From 2ad1a1d3d61641a75c44c8008e00720782ab375f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 24 Sep 2023 13:14:15 -0400 Subject: [PATCH 168/228] NFSv4: Fix a state manager thread deadlock regression [ Upstream commit 956fd46f97d238032cb5fa4771cdaccc6e760f9a ] Commit 4dc73c679114 reintroduces the deadlock that was fixed by commit aeabb3c96186 ("NFSv4: Fix a NFSv4 state manager deadlock") because it prevents the setup of new threads to handle reboot recovery, while the older recovery thread is stuck returning delegations. 
Fixes: 4dc73c679114 ("NFSv4: keep state manager thread active if swap is enabled") Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker Signed-off-by: Sasha Levin --- fs/nfs/nfs4proc.c | 4 +++- fs/nfs/nfs4state.c | 36 +++++++++++++++++++++++++----------- 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index c34df51a8f2b..1c2ed14bccef 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -10408,7 +10408,9 @@ static void nfs4_disable_swap(struct inode *inode) */ struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; - nfs4_schedule_state_manager(clp); + set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); + wake_up_var(&clp->cl_state); } static const struct inode_operations nfs4_dir_inode_operations = { diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 3fcef19e9198..10946b24c66f 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1212,13 +1212,23 @@ void nfs4_schedule_state_manager(struct nfs_client *clp) { struct task_struct *task; char buf[INET6_ADDRSTRLEN + sizeof("-manager") + 1]; + struct rpc_clnt *clnt = clp->cl_rpcclient; + bool swapon = false; set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); - if (test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state) != 0) { - wake_up_var(&clp->cl_state); - return; + + if (atomic_read(&clnt->cl_swapper)) { + swapon = !test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, + &clp->cl_state); + if (!swapon) { + wake_up_var(&clp->cl_state); + return; + } } - set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state); + + if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) + return; + __module_get(THIS_MODULE); refcount_inc(&clp->cl_count); @@ -1235,8 +1245,9 @@ void nfs4_schedule_state_manager(struct nfs_client *clp) __func__, PTR_ERR(task)); if (!nfs_client_init_is_complete(clp)) nfs_mark_client_ready(clp, PTR_ERR(task)); + if (swapon) + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); nfs4_clear_state_manager_bit(clp); - clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); nfs_put_client(clp); module_put(THIS_MODULE); } @@ -2717,22 +2728,25 @@ static int nfs4_run_state_manager(void *ptr) allow_signal(SIGKILL); again: - set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state); nfs4_state_manager(clp); - if (atomic_read(&cl->cl_swapper)) { + + if (test_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state) && + !test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state)) { wait_var_event_interruptible(&clp->cl_state, test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state)); - if (atomic_read(&cl->cl_swapper) && - test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state)) + if (!atomic_read(&cl->cl_swapper)) + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); + if (refcount_read(&clp->cl_count) > 1 && !signalled() && + !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state)) goto again; /* Either no longer a swapper, or were signalled */ + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); } - clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); if (refcount_read(&clp->cl_count) > 1 && !signalled() && test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state) && - !test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state)) + !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state)) goto again; nfs_put_client(clp); From 0ecde7dd766fcdaf9161a1ca039f036eb4f9b568 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 15 Mar 2023 15:24:46 +0100 Subject: [PATCH 169/228] ring-buffer: remove obsolete comment for 
free_buffer_page() [ Upstream commit a98151ad53b53f010ee364ec2fd06445b328578b ] The comment refers to mm/slob.c which is being removed. It comes from commit ed56829cb319 ("ring_buffer: reset buffer page when freeing") and according to Steven the borrowed code was a page mapcount and mapping reset, which was later removed by commit e4c2ce82ca27 ("ring_buffer: allocate buffer page pointer"). Thus the comment is not accurate anyway, remove it. Link: https://lore.kernel.org/linux-trace-kernel/20230315142446.27040-1-vbabka@suse.cz Cc: Masami Hiramatsu Cc: Ingo Molnar Reported-by: Mike Rapoport Suggested-by: Steven Rostedt (Google) Fixes: e4c2ce82ca27 ("ring_buffer: allocate buffer page pointer") Signed-off-by: Vlastimil Babka Reviewed-by: Mukesh Ojha Signed-off-by: Steven Rostedt (Google) Stable-dep-of: 45d99ea451d0 ("ring-buffer: Fix bytes info in per_cpu buffer stats") Signed-off-by: Sasha Levin --- kernel/trace/ring_buffer.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7d9af09bb006..682540bd5635 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -355,10 +355,6 @@ static void rb_init_page(struct buffer_data_page *bpage) local_set(&bpage->commit, 0); } -/* - * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing - * this issue out. - */ static void free_buffer_page(struct buffer_page *bpage) { free_page((unsigned long)bpage->page); From c33d75a57a8199eedc03ffe7a599ddb7aaec7db9 Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Thu, 21 Sep 2023 20:54:25 +0800 Subject: [PATCH 170/228] ring-buffer: Fix bytes info in per_cpu buffer stats [ Upstream commit 45d99ea451d0c30bfd4864f0fe485d7dac014902 ] The 'bytes' info in file 'per_cpu/cpu/stats' means the number of bytes in cpu buffer that have not been consumed. However, currently after consuming data by reading file 'trace_pipe', the 'bytes' info was not changed as expected. # cat per_cpu/cpu0/stats entries: 0 overrun: 0 commit overrun: 0 bytes: 568 <--- 'bytes' is problematical !!! oldest event ts: 8651.371479 now ts: 8653.912224 dropped events: 0 read events: 8 The root cause is incorrect stat on cpu_buffer->read_bytes. To fix it: 1. When stat 'read_bytes', account consumed event in rb_advance_reader(); 2. When stat 'entries_bytes', exclude the discarded padding event which is smaller than minimum size because it is invisible to reader. Then use rb_page_commit() instead of BUF_PAGE_SIZE at where accounting for page-based read/remove/overrun. Also correct the comments of ring_buffer_bytes_cpu() in this patch. 
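A toy model of the corrected accounting (not the ring buffer itself): if consumed bytes are charged per event as the reader advances, the reported "bytes" figure is simply written bytes minus consumed bytes, instead of only changing when a whole page is recycled.

  #include <stdio.h>

  struct stats { unsigned long entries_bytes, read_bytes; };

  static void write_event(struct stats *s, unsigned int len)   { s->entries_bytes += len; }
  static void consume_event(struct stats *s, unsigned int len) { s->read_bytes   += len; }

  static unsigned long unconsumed(const struct stats *s)
  {
          return s->entries_bytes - s->read_bytes;
  }

  int main(void)
  {
          struct stats s = { 0, 0 };
          unsigned int evs[] = { 24, 40, 16 };

          for (int i = 0; i < 3; i++)
                  write_event(&s, evs[i]);
          consume_event(&s, evs[0]);      /* reader advances past one event */

          printf("bytes: %lu (expect 56)\n", unconsumed(&s));
          return 0;
  }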
Link: https://lore.kernel.org/linux-trace-kernel/20230921125425.1708423-1-zhengyejian1@huawei.com Cc: stable@vger.kernel.org Fixes: c64e148a3be3 ("trace: Add ring buffer stats to measure rate of events") Signed-off-by: Zheng Yejian Signed-off-by: Steven Rostedt (Google) Signed-off-by: Sasha Levin --- kernel/trace/ring_buffer.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 682540bd5635..0938222b4598 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -355,6 +355,11 @@ static void rb_init_page(struct buffer_data_page *bpage) local_set(&bpage->commit, 0); } +static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage) +{ + return local_read(&bpage->page->commit); +} + static void free_buffer_page(struct buffer_page *bpage) { free_page((unsigned long)bpage->page); @@ -1886,7 +1891,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages) * Increment overrun to account for the lost events. */ local_add(page_entries, &cpu_buffer->overrun); - local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); + local_sub(rb_page_commit(to_remove_page), &cpu_buffer->entries_bytes); local_inc(&cpu_buffer->pages_lost); } @@ -2236,11 +2241,6 @@ rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->reader_page->read); } -static __always_inline unsigned rb_page_commit(struct buffer_page *bpage) -{ - return local_read(&bpage->page->commit); -} - static struct ring_buffer_event * rb_iter_head_event(struct ring_buffer_iter *iter) { @@ -2386,7 +2386,7 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer, * the counters. */ local_add(entries, &cpu_buffer->overrun); - local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); + local_sub(rb_page_commit(next_page), &cpu_buffer->entries_bytes); local_inc(&cpu_buffer->pages_lost); /* @@ -2529,9 +2529,6 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, event = __rb_page_index(tail_page, tail); - /* account for padding bytes */ - local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes); - /* * Save the original length to the meta data. * This will be used by the reader to add lost event @@ -2545,7 +2542,8 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, * write counter enough to allow another writer to slip * in on this page. * We put in a discarded commit instead, to make sure - * that this space is not used again. + * that this space is not used again, and this space will + * not be accounted into 'entries_bytes'. * * If we are less than the minimum size, we don't need to * worry about it. @@ -2570,6 +2568,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, /* time delta must be non zero */ event->time_delta = 1; + /* account for padding bytes */ + local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes); + /* Make sure the padding is visible before the tail_page->write update */ smp_wmb(); @@ -3935,7 +3936,7 @@ u64 ring_buffer_oldest_event_ts(struct trace_buffer *buffer, int cpu) EXPORT_SYMBOL_GPL(ring_buffer_oldest_event_ts); /** - * ring_buffer_bytes_cpu - get the number of bytes consumed in a cpu buffer + * ring_buffer_bytes_cpu - get the number of bytes unconsumed in a cpu buffer * @buffer: The ring buffer * @cpu: The per CPU buffer to read from. 
*/ @@ -4443,6 +4444,7 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) length = rb_event_length(event); cpu_buffer->reader_page->read += length; + cpu_buffer->read_bytes += length; } static void rb_advance_iter(struct ring_buffer_iter *iter) @@ -5534,7 +5536,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, } else { /* update the entry counter */ cpu_buffer->read += rb_page_entries(reader); - cpu_buffer->read_bytes += BUF_PAGE_SIZE; + cpu_buffer->read_bytes += rb_page_commit(reader); /* swap the pages */ rb_init_page(bpage); From d0952ce316d133fffc7a632c7ce0ab254702070a Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Fri, 22 Sep 2023 08:51:17 -0700 Subject: [PATCH 171/228] drm/mediatek: Fix backport issue in mtk_drm_gem_prime_vmap() When building with clang: drivers/gpu/drm/mediatek/mtk_drm_gem.c:255:10: error: incompatible integer to pointer conversion returning 'int' from a function with result type 'void *' [-Wint-conversion] 255 | return -ENOMEM; | ^~~~~~~ 1 error generated. GCC reports the same issue as a warning, rather than an error. Prior to commit 7e542ff8b463 ("drm/mediatek: Use struct dma_buf_map in GEM vmap ops"), this function returned a pointer rather than an integer. This function is indirectly called in drm_gem_vmap(), which treats NULL as -ENOMEM through an error pointer. Return NULL in this block to resolve the warning but keep the same end result. Fixes: 43f561e809aa ("drm/mediatek: Fix potential memory leak if vmap() fail") Signed-off-by: Nathan Chancellor Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/mediatek/mtk_drm_gem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/mediatek/mtk_drm_gem.c b/drivers/gpu/drm/mediatek/mtk_drm_gem.c index fe64bf2176f3..b20ea58907c2 100644 --- a/drivers/gpu/drm/mediatek/mtk_drm_gem.c +++ b/drivers/gpu/drm/mediatek/mtk_drm_gem.c @@ -252,7 +252,7 @@ void *mtk_drm_gem_prime_vmap(struct drm_gem_object *obj) if (!mtk_gem->kvaddr) { kfree(sgt); kfree(mtk_gem->pages); - return -ENOMEM; + return NULL; } out: kfree(sgt); From 7c4f11d73b2425420c0a6e2d5d8a37eee36dad17 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 5 Oct 2023 11:59:32 +0200 Subject: [PATCH 172/228] rbd: move rbd_dev_refresh() definition commit 0b035401c57021fc6c300272cbb1c5a889d4fe45 upstream. Move rbd_dev_refresh() definition further down to avoid having to move struct parent_image_info definition in the next commit. This spares some forward declarations too. 
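The "spares some forward declarations" remark is plain C definition ordering: once a function is defined below everything it calls, no prototype for the callee is needed above it. A trivial standalone illustration with made-up names:

  #include <stdio.h>

  /* Defined before its caller, so no forward declaration is required. */
  static int read_header(void)
  {
          return 42;
  }

  /* If refresh() were defined above read_header(), a prototype such as
   * "static int read_header(void);" would have to appear first. */
  static int refresh(void)
  {
          return read_header();
  }

  int main(void)
  {
          printf("%d\n", refresh());
          return 0;
  }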
Signed-off-by: Ilya Dryomov Reviewed-by: Dongsheng Yang [idryomov@gmail.com: backport to 5.10-6.1: context] Signed-off-by: Sasha Levin --- drivers/block/rbd.c | 68 ++++++++++++++++++++++----------------------- 1 file changed, 33 insertions(+), 35 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 95cbd5790ed6..82cf9be4badc 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -633,8 +633,6 @@ static void rbd_dev_remove_parent(struct rbd_device *rbd_dev); static int rbd_dev_refresh(struct rbd_device *rbd_dev); static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev); -static int rbd_dev_header_info(struct rbd_device *rbd_dev); -static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev); static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u64 snap_id); static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, @@ -4989,39 +4987,6 @@ static void rbd_dev_update_size(struct rbd_device *rbd_dev) } } -static int rbd_dev_refresh(struct rbd_device *rbd_dev) -{ - u64 mapping_size; - int ret; - - down_write(&rbd_dev->header_rwsem); - mapping_size = rbd_dev->mapping.size; - - ret = rbd_dev_header_info(rbd_dev); - if (ret) - goto out; - - /* - * If there is a parent, see if it has disappeared due to the - * mapped image getting flattened. - */ - if (rbd_dev->parent) { - ret = rbd_dev_v2_parent_info(rbd_dev); - if (ret) - goto out; - } - - rbd_assert(!rbd_is_snap(rbd_dev)); - rbd_dev->mapping.size = rbd_dev->header.image_size; - -out: - up_write(&rbd_dev->header_rwsem); - if (!ret && mapping_size != rbd_dev->mapping.size) - rbd_dev_update_size(rbd_dev); - - return ret; -} - static const struct blk_mq_ops rbd_mq_ops = { .queue_rq = rbd_queue_rq, }; @@ -7115,6 +7080,39 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) return ret; } +static int rbd_dev_refresh(struct rbd_device *rbd_dev) +{ + u64 mapping_size; + int ret; + + down_write(&rbd_dev->header_rwsem); + mapping_size = rbd_dev->mapping.size; + + ret = rbd_dev_header_info(rbd_dev); + if (ret) + goto out; + + /* + * If there is a parent, see if it has disappeared due to the + * mapped image getting flattened. + */ + if (rbd_dev->parent) { + ret = rbd_dev_v2_parent_info(rbd_dev); + if (ret) + goto out; + } + + rbd_assert(!rbd_is_snap(rbd_dev)); + rbd_dev->mapping.size = rbd_dev->header.image_size; + +out: + up_write(&rbd_dev->header_rwsem); + if (!ret && mapping_size != rbd_dev->mapping.size) + rbd_dev_update_size(rbd_dev); + + return ret; +} + static ssize_t do_rbd_add(struct bus_type *bus, const char *buf, size_t count) From 3ceb306f9b2d2e7a8c8d36f6815417cb056072cd Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 5 Oct 2023 11:59:33 +0200 Subject: [PATCH 173/228] rbd: decouple header read-in from updating rbd_dev->header commit 510a7330c82a7754d5df0117a8589e8a539067c7 upstream. Make rbd_dev_header_info() populate a passed struct rbd_image_header instead of rbd_dev->header and introduce rbd_dev_update_header() for updating mutable fields in rbd_dev->header upon refresh. The initial read-in of both mutable and immutable fields in rbd_dev_image_probe() passes in rbd_dev->header so no update step is required there. rbd_init_layout() is now called directly from rbd_dev_image_probe() instead of individually in format 1 and format 2 implementations. 
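The shape of this refactor — read into a caller-supplied header without touching device state, then apply the result in an explicit update step — can be sketched in standalone C. struct header, read_header() and apply_header() below are illustrative names, not the driver's API:

  #include <stdio.h>

  struct header { unsigned long image_size; };
  struct device { struct header header; };

  /* Read-in only fills the struct handed to it; it touches no device state. */
  static int read_header(struct header *out)
  {
          out->image_size = 1024;         /* pretend this came off the wire */
          return 0;
  }

  /* Updating the device is a separate, explicit step. */
  static void apply_header(struct device *dev, const struct header *h)
  {
          dev->header = *h;
  }

  static int refresh(struct device *dev)
  {
          struct header h = { 0 };
          int ret = read_header(&h);

          if (ret)
                  return ret;             /* nothing was modified on failure */

          apply_header(dev, &h);
          return 0;
  }

  int main(void)
  {
          struct device dev = { { 0 } };
          refresh(&dev);
          printf("image_size = %lu\n", dev.header.image_size);
          return 0;
  }

Keeping the read-in side effect free is what later allows the update step alone to be placed under the write lock.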
Signed-off-by: Ilya Dryomov Reviewed-by: Dongsheng Yang Signed-off-by: Sasha Levin --- drivers/block/rbd.c | 210 ++++++++++++++++++++++++-------------------- 1 file changed, 116 insertions(+), 94 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 82cf9be4badc..73f917a429f3 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -632,7 +632,8 @@ void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...) static void rbd_dev_remove_parent(struct rbd_device *rbd_dev); static int rbd_dev_refresh(struct rbd_device *rbd_dev); -static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev); +static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev, + struct rbd_image_header *header); static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u64 snap_id); static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, @@ -1045,15 +1046,24 @@ static void rbd_init_layout(struct rbd_device *rbd_dev) RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL); } +static void rbd_image_header_cleanup(struct rbd_image_header *header) +{ + kfree(header->object_prefix); + ceph_put_snap_context(header->snapc); + kfree(header->snap_sizes); + kfree(header->snap_names); + + memset(header, 0, sizeof(*header)); +} + /* * Fill an rbd image header with information from the given format 1 * on-disk header. */ -static int rbd_header_from_disk(struct rbd_device *rbd_dev, - struct rbd_image_header_ondisk *ondisk) +static int rbd_header_from_disk(struct rbd_image_header *header, + struct rbd_image_header_ondisk *ondisk, + bool first_time) { - struct rbd_image_header *header = &rbd_dev->header; - bool first_time = header->object_prefix == NULL; struct ceph_snap_context *snapc; char *object_prefix = NULL; char *snap_names = NULL; @@ -1120,11 +1130,6 @@ static int rbd_header_from_disk(struct rbd_device *rbd_dev, if (first_time) { header->object_prefix = object_prefix; header->obj_order = ondisk->options.order; - rbd_init_layout(rbd_dev); - } else { - ceph_put_snap_context(header->snapc); - kfree(header->snap_names); - kfree(header->snap_sizes); } /* The remaining fields always get updated (when we refresh) */ @@ -4914,7 +4919,9 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev, * return, the rbd_dev->header field will contain up-to-date * information about the image. 
*/ -static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev) +static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev, + struct rbd_image_header *header, + bool first_time) { struct rbd_image_header_ondisk *ondisk = NULL; u32 snap_count = 0; @@ -4962,7 +4969,7 @@ static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev) snap_count = le32_to_cpu(ondisk->snap_count); } while (snap_count != want_count); - ret = rbd_header_from_disk(rbd_dev, ondisk); + ret = rbd_header_from_disk(header, ondisk, first_time); out: kfree(ondisk); @@ -5541,17 +5548,12 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, return 0; } -static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev) -{ - return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP, - &rbd_dev->header.obj_order, - &rbd_dev->header.image_size); -} - -static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) +static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev, + char **pobject_prefix) { size_t size; void *reply_buf; + char *object_prefix; int ret; void *p; @@ -5569,16 +5571,16 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) goto out; p = reply_buf; - rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, - p + ret, NULL, GFP_NOIO); + object_prefix = ceph_extract_encoded_string(&p, p + ret, NULL, + GFP_NOIO); + if (IS_ERR(object_prefix)) { + ret = PTR_ERR(object_prefix); + goto out; + } ret = 0; - if (IS_ERR(rbd_dev->header.object_prefix)) { - ret = PTR_ERR(rbd_dev->header.object_prefix); - rbd_dev->header.object_prefix = NULL; - } else { - dout(" object_prefix = %s\n", rbd_dev->header.object_prefix); - } + *pobject_prefix = object_prefix; + dout(" object_prefix = %s\n", object_prefix); out: kfree(reply_buf); @@ -5629,13 +5631,6 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, return 0; } -static int rbd_dev_v2_features(struct rbd_device *rbd_dev) -{ - return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP, - rbd_is_ro(rbd_dev), - &rbd_dev->header.features); -} - /* * These are generic image flags, but since they are used only for * object map, store them in rbd_dev->object_map_flags. 
@@ -5910,14 +5905,14 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) return ret; } -static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev) +static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev, + u64 *stripe_unit, u64 *stripe_count) { struct { __le64 stripe_unit; __le64 stripe_count; } __attribute__ ((packed)) striping_info_buf = { 0 }; size_t size = sizeof (striping_info_buf); - void *p; int ret; ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, @@ -5929,27 +5924,33 @@ static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev) if (ret < size) return -ERANGE; - p = &striping_info_buf; - rbd_dev->header.stripe_unit = ceph_decode_64(&p); - rbd_dev->header.stripe_count = ceph_decode_64(&p); + *stripe_unit = le64_to_cpu(striping_info_buf.stripe_unit); + *stripe_count = le64_to_cpu(striping_info_buf.stripe_count); + dout(" stripe_unit = %llu stripe_count = %llu\n", *stripe_unit, + *stripe_count); + return 0; } -static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev) +static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev, s64 *data_pool_id) { - __le64 data_pool_id; + __le64 data_pool_buf; int ret; ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, &rbd_dev->header_oloc, "get_data_pool", - NULL, 0, &data_pool_id, sizeof(data_pool_id)); + NULL, 0, &data_pool_buf, + sizeof(data_pool_buf)); + dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) return ret; - if (ret < sizeof(data_pool_id)) + if (ret < sizeof(data_pool_buf)) return -EBADMSG; - rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id); - WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL); + *data_pool_id = le64_to_cpu(data_pool_buf); + dout(" data_pool_id = %lld\n", *data_pool_id); + WARN_ON(*data_pool_id == CEPH_NOPOOL); + return 0; } @@ -6141,7 +6142,8 @@ static int rbd_spec_fill_names(struct rbd_device *rbd_dev) return ret; } -static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev) +static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, + struct ceph_snap_context **psnapc) { size_t size; int ret; @@ -6202,9 +6204,7 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev) for (i = 0; i < snap_count; i++) snapc->snaps[i] = ceph_decode_64(&p); - ceph_put_snap_context(rbd_dev->header.snapc); - rbd_dev->header.snapc = snapc; - + *psnapc = snapc; dout(" snap context seq = %llu, snap_count = %u\n", (unsigned long long)seq, (unsigned int)snap_count); out: @@ -6253,38 +6253,42 @@ static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, return snap_name; } -static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev) +static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev, + struct rbd_image_header *header, + bool first_time) { - bool first_time = rbd_dev->header.object_prefix == NULL; int ret; - ret = rbd_dev_v2_image_size(rbd_dev); + ret = _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP, + first_time ? 
&header->obj_order : NULL, + &header->image_size); if (ret) return ret; if (first_time) { - ret = rbd_dev_v2_header_onetime(rbd_dev); + ret = rbd_dev_v2_header_onetime(rbd_dev, header); if (ret) return ret; } - ret = rbd_dev_v2_snap_context(rbd_dev); - if (ret && first_time) { - kfree(rbd_dev->header.object_prefix); - rbd_dev->header.object_prefix = NULL; - } + ret = rbd_dev_v2_snap_context(rbd_dev, &header->snapc); + if (ret) + return ret; - return ret; + return 0; } -static int rbd_dev_header_info(struct rbd_device *rbd_dev) +static int rbd_dev_header_info(struct rbd_device *rbd_dev, + struct rbd_image_header *header, + bool first_time) { rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); + rbd_assert(!header->object_prefix && !header->snapc); if (rbd_dev->image_format == 1) - return rbd_dev_v1_header_info(rbd_dev); + return rbd_dev_v1_header_info(rbd_dev, header, first_time); - return rbd_dev_v2_header_info(rbd_dev); + return rbd_dev_v2_header_info(rbd_dev, header, first_time); } /* @@ -6771,60 +6775,49 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) */ static void rbd_dev_unprobe(struct rbd_device *rbd_dev) { - struct rbd_image_header *header; - rbd_dev_parent_put(rbd_dev); rbd_object_map_free(rbd_dev); rbd_dev_mapping_clear(rbd_dev); /* Free dynamic fields from the header, then zero it out */ - header = &rbd_dev->header; - ceph_put_snap_context(header->snapc); - kfree(header->snap_sizes); - kfree(header->snap_names); - kfree(header->object_prefix); - memset(header, 0, sizeof (*header)); + rbd_image_header_cleanup(&rbd_dev->header); } -static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev) +static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev, + struct rbd_image_header *header) { int ret; - ret = rbd_dev_v2_object_prefix(rbd_dev); + ret = rbd_dev_v2_object_prefix(rbd_dev, &header->object_prefix); if (ret) - goto out_err; + return ret; /* * Get the and check features for the image. Currently the * features are assumed to never change. */ - ret = rbd_dev_v2_features(rbd_dev); + ret = _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP, + rbd_is_ro(rbd_dev), &header->features); if (ret) - goto out_err; + return ret; /* If the image supports fancy striping, get its parameters */ - if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) { - ret = rbd_dev_v2_striping_info(rbd_dev); - if (ret < 0) - goto out_err; - } - - if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) { - ret = rbd_dev_v2_data_pool(rbd_dev); + if (header->features & RBD_FEATURE_STRIPINGV2) { + ret = rbd_dev_v2_striping_info(rbd_dev, &header->stripe_unit, + &header->stripe_count); if (ret) - goto out_err; + return ret; } - rbd_init_layout(rbd_dev); - return 0; + if (header->features & RBD_FEATURE_DATA_POOL) { + ret = rbd_dev_v2_data_pool(rbd_dev, &header->data_pool_id); + if (ret) + return ret; + } -out_err: - rbd_dev->header.features = 0; - kfree(rbd_dev->header.object_prefix); - rbd_dev->header.object_prefix = NULL; - return ret; + return 0; } /* @@ -7019,13 +7012,15 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) if (!depth) down_write(&rbd_dev->header_rwsem); - ret = rbd_dev_header_info(rbd_dev); + ret = rbd_dev_header_info(rbd_dev, &rbd_dev->header, true); if (ret) { if (ret == -ENOENT && !need_watch) rbd_print_dne(rbd_dev, false); goto err_out_probe; } + rbd_init_layout(rbd_dev); + /* * If this image is the one being mapped, we have pool name and * id, image name and id, and snap name - need to fill snap id. 
@@ -7080,15 +7075,39 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) return ret; } +static void rbd_dev_update_header(struct rbd_device *rbd_dev, + struct rbd_image_header *header) +{ + rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); + rbd_assert(rbd_dev->header.object_prefix); /* !first_time */ + + rbd_dev->header.image_size = header->image_size; + + ceph_put_snap_context(rbd_dev->header.snapc); + rbd_dev->header.snapc = header->snapc; + header->snapc = NULL; + + if (rbd_dev->image_format == 1) { + kfree(rbd_dev->header.snap_names); + rbd_dev->header.snap_names = header->snap_names; + header->snap_names = NULL; + + kfree(rbd_dev->header.snap_sizes); + rbd_dev->header.snap_sizes = header->snap_sizes; + header->snap_sizes = NULL; + } +} + static int rbd_dev_refresh(struct rbd_device *rbd_dev) { + struct rbd_image_header header = { 0 }; u64 mapping_size; int ret; down_write(&rbd_dev->header_rwsem); mapping_size = rbd_dev->mapping.size; - ret = rbd_dev_header_info(rbd_dev); + ret = rbd_dev_header_info(rbd_dev, &header, false); if (ret) goto out; @@ -7102,6 +7121,8 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev) goto out; } + rbd_dev_update_header(rbd_dev, &header); + rbd_assert(!rbd_is_snap(rbd_dev)); rbd_dev->mapping.size = rbd_dev->header.image_size; @@ -7110,6 +7131,7 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev) if (!ret && mapping_size != rbd_dev->mapping.size) rbd_dev_update_size(rbd_dev); + rbd_image_header_cleanup(&header); return ret; } From d3d170c5fc0649e1ba9c310029849467cff50dfd Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 5 Oct 2023 11:59:34 +0200 Subject: [PATCH 174/228] rbd: decouple parent info read-in from updating rbd_dev commit c10311776f0a8ddea2276df96e255625b07045a8 upstream. Unlike header read-in, parent info read-in is already decoupled in get_parent_info(), but it's buried in rbd_dev_v2_parent_info() along with the processing logic. Separate the initial read-in and update read-in logic into rbd_dev_setup_parent() and rbd_dev_update_parent() respectively and have rbd_dev_v2_parent_info() just populate struct parent_image_info (i.e. what get_parent_info() did). Some existing QoI issues, like flatten of a standalone clone being disregarded on refresh, remain. Signed-off-by: Ilya Dryomov Reviewed-by: Dongsheng Yang Signed-off-by: Sasha Levin --- drivers/block/rbd.c | 142 +++++++++++++++++++++++++------------------- 1 file changed, 80 insertions(+), 62 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 73f917a429f3..628b986351ee 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -5667,6 +5667,14 @@ struct parent_image_info { u64 overlap; }; +static void rbd_parent_info_cleanup(struct parent_image_info *pii) +{ + kfree(pii->pool_ns); + kfree(pii->image_id); + + memset(pii, 0, sizeof(*pii)); +} + /* * The caller is responsible for @pii. 
*/ @@ -5736,6 +5744,9 @@ static int __get_parent_info(struct rbd_device *rbd_dev, if (pii->has_overlap) ceph_decode_64_safe(&p, end, pii->overlap, e_inval); + dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n", + __func__, pii->pool_id, pii->pool_ns, pii->image_id, pii->snap_id, + pii->has_overlap, pii->overlap); return 0; e_inval: @@ -5774,14 +5785,17 @@ static int __get_parent_info_legacy(struct rbd_device *rbd_dev, pii->has_overlap = true; ceph_decode_64_safe(&p, end, pii->overlap, e_inval); + dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n", + __func__, pii->pool_id, pii->pool_ns, pii->image_id, pii->snap_id, + pii->has_overlap, pii->overlap); return 0; e_inval: return -EINVAL; } -static int get_parent_info(struct rbd_device *rbd_dev, - struct parent_image_info *pii) +static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev, + struct parent_image_info *pii) { struct page *req_page, *reply_page; void *p; @@ -5809,7 +5823,7 @@ static int get_parent_info(struct rbd_device *rbd_dev, return ret; } -static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) +static int rbd_dev_setup_parent(struct rbd_device *rbd_dev) { struct rbd_spec *parent_spec; struct parent_image_info pii = { 0 }; @@ -5819,37 +5833,12 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) if (!parent_spec) return -ENOMEM; - ret = get_parent_info(rbd_dev, &pii); + ret = rbd_dev_v2_parent_info(rbd_dev, &pii); if (ret) goto out_err; - dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n", - __func__, pii.pool_id, pii.pool_ns, pii.image_id, pii.snap_id, - pii.has_overlap, pii.overlap); - - if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap) { - /* - * Either the parent never existed, or we have - * record of it but the image got flattened so it no - * longer has a parent. When the parent of a - * layered image disappears we immediately set the - * overlap to 0. The effect of this is that all new - * requests will be treated as if the image had no - * parent. - * - * If !pii.has_overlap, the parent image spec is not - * applicable. It's there to avoid duplication in each - * snapshot record. - */ - if (rbd_dev->parent_overlap) { - rbd_dev->parent_overlap = 0; - rbd_dev_parent_put(rbd_dev); - pr_info("%s: clone image has been flattened\n", - rbd_dev->disk->disk_name); - } - + if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap) goto out; /* No parent? No problem. */ - } /* The ceph file layout needs to fit pool id in 32 bits */ @@ -5861,46 +5850,34 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) } /* - * The parent won't change (except when the clone is - * flattened, already handled that). So we only need to - * record the parent spec we have not already done so. + * The parent won't change except when the clone is flattened, + * so we only need to record the parent image spec once. 
*/ - if (!rbd_dev->parent_spec) { - parent_spec->pool_id = pii.pool_id; - if (pii.pool_ns && *pii.pool_ns) { - parent_spec->pool_ns = pii.pool_ns; - pii.pool_ns = NULL; - } - parent_spec->image_id = pii.image_id; - pii.image_id = NULL; - parent_spec->snap_id = pii.snap_id; - - rbd_dev->parent_spec = parent_spec; - parent_spec = NULL; /* rbd_dev now owns this */ + parent_spec->pool_id = pii.pool_id; + if (pii.pool_ns && *pii.pool_ns) { + parent_spec->pool_ns = pii.pool_ns; + pii.pool_ns = NULL; } + parent_spec->image_id = pii.image_id; + pii.image_id = NULL; + parent_spec->snap_id = pii.snap_id; + + rbd_assert(!rbd_dev->parent_spec); + rbd_dev->parent_spec = parent_spec; + parent_spec = NULL; /* rbd_dev now owns this */ /* - * We always update the parent overlap. If it's zero we issue - * a warning, as we will proceed as if there was no parent. + * Record the parent overlap. If it's zero, issue a warning as + * we will proceed as if there is no parent. */ - if (!pii.overlap) { - if (parent_spec) { - /* refresh, careful to warn just once */ - if (rbd_dev->parent_overlap) - rbd_warn(rbd_dev, - "clone now standalone (overlap became 0)"); - } else { - /* initial probe */ - rbd_warn(rbd_dev, "clone is standalone (overlap 0)"); - } - } + if (!pii.overlap) + rbd_warn(rbd_dev, "clone is standalone (overlap 0)"); rbd_dev->parent_overlap = pii.overlap; out: ret = 0; out_err: - kfree(pii.pool_ns); - kfree(pii.image_id); + rbd_parent_info_cleanup(&pii); rbd_spec_put(parent_spec); return ret; } @@ -7049,7 +7026,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) } if (rbd_dev->header.features & RBD_FEATURE_LAYERING) { - ret = rbd_dev_v2_parent_info(rbd_dev); + ret = rbd_dev_setup_parent(rbd_dev); if (ret) goto err_out_probe; } @@ -7098,9 +7075,47 @@ static void rbd_dev_update_header(struct rbd_device *rbd_dev, } } +static void rbd_dev_update_parent(struct rbd_device *rbd_dev, + struct parent_image_info *pii) +{ + if (pii->pool_id == CEPH_NOPOOL || !pii->has_overlap) { + /* + * Either the parent never existed, or we have + * record of it but the image got flattened so it no + * longer has a parent. When the parent of a + * layered image disappears we immediately set the + * overlap to 0. The effect of this is that all new + * requests will be treated as if the image had no + * parent. + * + * If !pii.has_overlap, the parent image spec is not + * applicable. It's there to avoid duplication in each + * snapshot record. + */ + if (rbd_dev->parent_overlap) { + rbd_dev->parent_overlap = 0; + rbd_dev_parent_put(rbd_dev); + pr_info("%s: clone has been flattened\n", + rbd_dev->disk->disk_name); + } + } else { + rbd_assert(rbd_dev->parent_spec); + + /* + * Update the parent overlap. If it became zero, issue + * a warning as we will proceed as if there is no parent. + */ + if (!pii->overlap && rbd_dev->parent_overlap) + rbd_warn(rbd_dev, + "clone has become standalone (overlap 0)"); + rbd_dev->parent_overlap = pii->overlap; + } +} + static int rbd_dev_refresh(struct rbd_device *rbd_dev) { struct rbd_image_header header = { 0 }; + struct parent_image_info pii = { 0 }; u64 mapping_size; int ret; @@ -7116,12 +7131,14 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev) * mapped image getting flattened. 
*/ if (rbd_dev->parent) { - ret = rbd_dev_v2_parent_info(rbd_dev); + ret = rbd_dev_v2_parent_info(rbd_dev, &pii); if (ret) goto out; } rbd_dev_update_header(rbd_dev, &header); + if (rbd_dev->parent) + rbd_dev_update_parent(rbd_dev, &pii); rbd_assert(!rbd_is_snap(rbd_dev)); rbd_dev->mapping.size = rbd_dev->header.image_size; @@ -7131,6 +7148,7 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev) if (!ret && mapping_size != rbd_dev->mapping.size) rbd_dev_update_size(rbd_dev); + rbd_parent_info_cleanup(&pii); rbd_image_header_cleanup(&header); return ret; } From 0d6987d4a34c6f224453004944fa27f3e17153d1 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 5 Oct 2023 11:59:35 +0200 Subject: [PATCH 175/228] rbd: take header_rwsem in rbd_dev_refresh() only when updating commit 0b207d02bd9ab8dcc31b262ca9f60dbc1822500d upstream. rbd_dev_refresh() has been holding header_rwsem across header and parent info read-in unnecessarily for ages. With commit 870611e4877e ("rbd: get snapshot context after exclusive lock is ensured to be held"), the potential for deadlocks became much more real owning to a) header_rwsem now nesting inside lock_rwsem and b) rw_semaphores not allowing new readers after a writer is registered. For example, assuming that I/O request 1, I/O request 2 and header read-in request all target the same OSD: 1. I/O request 1 comes in and gets submitted 2. watch error occurs 3. rbd_watch_errcb() takes lock_rwsem for write, clears owner_cid and releases lock_rwsem 4. after reestablishing the watch, rbd_reregister_watch() calls rbd_dev_refresh() which takes header_rwsem for write and submits a header read-in request 5. I/O request 2 comes in: after taking lock_rwsem for read in __rbd_img_handle_request(), it blocks trying to take header_rwsem for read in rbd_img_object_requests() 6. another watch error occurs 7. rbd_watch_errcb() blocks trying to take lock_rwsem for write 8. I/O request 1 completion is received by the messenger but can't be processed because lock_rwsem won't be granted anymore 9. header read-in request completion can't be received, let alone processed, because the messenger is stranded Change rbd_dev_refresh() to take header_rwsem only for actually updating rbd_dev->header. Header and parent info read-in don't need any locking. 
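The resulting structure — do the slow read-in with no lock held, then take the write lock only around the short in-memory update — can be sketched with a pthread rwlock standing in for header_rwsem; read_in() is a placeholder for the network round trip:

  #include <pthread.h>
  #include <stdio.h>
  #include <unistd.h>

  static pthread_rwlock_t header_rwsem = PTHREAD_RWLOCK_INITIALIZER;
  static unsigned long image_size;

  static unsigned long read_in(void)            /* slow round trip, no lock held */
  {
          usleep(1000);
          return 2048;
  }

  static void refresh(void)
  {
          unsigned long new_size = read_in();   /* may block; lock not held */

          pthread_rwlock_wrlock(&header_rwsem); /* held only for the short update */
          image_size = new_size;
          pthread_rwlock_unlock(&header_rwsem);
  }

  int main(void)
  {
          refresh();
          printf("image_size = %lu\n", image_size);
          return 0;
  }

Readers are blocked only for the duration of the assignment, not for the whole round trip, which is what removes the lock-ordering window described above.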
Cc: stable@vger.kernel.org # 0b035401c570: rbd: move rbd_dev_refresh() definition Cc: stable@vger.kernel.org # 510a7330c82a: rbd: decouple header read-in from updating rbd_dev->header Cc: stable@vger.kernel.org # c10311776f0a: rbd: decouple parent info read-in from updating rbd_dev Cc: stable@vger.kernel.org Fixes: 870611e4877e ("rbd: get snapshot context after exclusive lock is ensured to be held") Signed-off-by: Ilya Dryomov Reviewed-by: Dongsheng Yang Signed-off-by: Sasha Levin --- drivers/block/rbd.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 628b986351ee..b0f7930524ba 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -7058,7 +7058,14 @@ static void rbd_dev_update_header(struct rbd_device *rbd_dev, rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); rbd_assert(rbd_dev->header.object_prefix); /* !first_time */ - rbd_dev->header.image_size = header->image_size; + if (rbd_dev->header.image_size != header->image_size) { + rbd_dev->header.image_size = header->image_size; + + if (!rbd_is_snap(rbd_dev)) { + rbd_dev->mapping.size = header->image_size; + rbd_dev_update_size(rbd_dev); + } + } ceph_put_snap_context(rbd_dev->header.snapc); rbd_dev->header.snapc = header->snapc; @@ -7116,11 +7123,9 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev) { struct rbd_image_header header = { 0 }; struct parent_image_info pii = { 0 }; - u64 mapping_size; int ret; - down_write(&rbd_dev->header_rwsem); - mapping_size = rbd_dev->mapping.size; + dout("%s rbd_dev %p\n", __func__, rbd_dev); ret = rbd_dev_header_info(rbd_dev, &header, false); if (ret) @@ -7136,18 +7141,13 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev) goto out; } + down_write(&rbd_dev->header_rwsem); rbd_dev_update_header(rbd_dev, &header); if (rbd_dev->parent) rbd_dev_update_parent(rbd_dev, &pii); - - rbd_assert(!rbd_is_snap(rbd_dev)); - rbd_dev->mapping.size = rbd_dev->header.image_size; + up_write(&rbd_dev->header_rwsem); out: - up_write(&rbd_dev->header_rwsem); - if (!ret && mapping_size != rbd_dev->mapping.size) - rbd_dev_update_size(rbd_dev); - rbd_parent_info_cleanup(&pii); rbd_image_header_cleanup(&header); return ret; From f94471c0cc31d06cb939968950ecdc801e85cc31 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 15 Dec 2022 10:16:29 +0800 Subject: [PATCH 176/228] block: fix use-after-free of q->q_usage_counter commit d36a9ea5e7766961e753ee38d4c331bbe6ef659b upstream. For blk-mq, queue release handler is usually called after blk_mq_freeze_queue_wait() returns. However, the q_usage_counter->release() handler may not be run yet at that time, so this can cause a use-after-free. Fix the issue by moving percpu_ref_exit() into blk_free_queue_rcu(). Since ->release() is called with rcu read lock held, it is agreed that the race should be covered in caller per discussion from the two links. 
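The principle behind the fix can be modelled in userspace, loosely and without RCU: a waiter that merely observes "the count reached zero" must not tear down state the releasing thread may still be using; teardown has to be ordered after the release callback has finished. The names below (struct ref, release(), put()) are illustrative only.

  #include <pthread.h>
  #include <stdatomic.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <unistd.h>

  struct ref {
          atomic_int count;
          atomic_int released;            /* set by release() once it is done with *r */
  };

  static void release(struct ref *r)
  {
          usleep(1000);                   /* still touching *r here... */
          atomic_store(&r->released, 1);  /* ...so signal completion only now */
  }

  static void put(struct ref *r)
  {
          if (atomic_fetch_sub(&r->count, 1) == 1)
                  release(r);
  }

  static void *worker(void *arg)
  {
          put(arg);
          return NULL;
  }

  int main(void)
  {
          struct ref *r = calloc(1, sizeof(*r));
          pthread_t t;

          atomic_store(&r->count, 1);
          pthread_create(&t, NULL, worker, r);

          /* Waiting for count == 0 alone would be the bug: release() may still
           * be running. Wait until release() reports completion instead. */
          while (!atomic_load(&r->released))
                  usleep(100);

          pthread_join(t, NULL);
          free(r);                        /* safe only after release() completed */
          return 0;
  }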
Reported-by: Zhang Wensheng Reported-by: Zhong Jinghua Link: https://lore.kernel.org/linux-block/Y5prfOjyyjQKUrtH@T590/T/#u Link: https://lore.kernel.org/lkml/Y4%2FmzMd4evRg9yDi@fedora/ Cc: Hillf Danton Cc: Yu Kuai Cc: Dennis Zhou Fixes: 2b0d3d3e4fcf ("percpu_ref: reduce memory footprint of percpu_ref in fast path") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20221215021629.74870-1-ming.lei@redhat.com Signed-off-by: Jens Axboe Signed-off-by: Saranya Muruganandam Signed-off-by: Greg Kroah-Hartman --- block/blk-core.c | 2 -- block/blk-sysfs.c | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index d0d0dd8151f7..e5eeec801f56 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -414,8 +414,6 @@ void blk_cleanup_queue(struct request_queue *q) blk_mq_sched_free_requests(q); mutex_unlock(&q->sysfs_lock); - percpu_ref_exit(&q->q_usage_counter); - /* @q is and will stay empty, shutdown and put */ blk_put_queue(q); } diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 8c5816364dd1..9174137a913c 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -726,6 +726,8 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head) { struct request_queue *q = container_of(rcu_head, struct request_queue, rcu_head); + + percpu_ref_exit(&q->q_usage_counter); kmem_cache_free(blk_requestq_cachep, q); } From c2cf152e8bb848608a151bc9c0e13c422175413f Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 7 Oct 2023 13:42:26 +0200 Subject: [PATCH 177/228] Revert "clk: imx: pll14xx: dynamically configure PLL for 393216000/361267200Hz" This reverts commit 972acd701b1982da9cdbeb892bf17eeef2094508 which is commit 72d00e560d10665e6139c9431956a87ded6e9880 upstream. Marek writes: The commit message states 'Cc: stable@vger.kernel.org # v5.18+' and the commit should only be applied to Linux 5.18.y and newer, on anything older it breaks PLL configuration due to missing prerequisite patches. Reported-by: Marek Vasut Cc: Ahmad Fatoum Cc: Marco Felsch Cc: Abel Vesa Link: https://lore.kernel.org/r/4e5fa5b2-66b8-8f0b-ccb9-c2b774054e4e@denx.de Signed-off-by: Greg Kroah-Hartman --- drivers/clk/imx/clk-pll14xx.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/clk/imx/clk-pll14xx.c b/drivers/clk/imx/clk-pll14xx.c index e46311c2e63e..aba36e4217d2 100644 --- a/drivers/clk/imx/clk-pll14xx.c +++ b/drivers/clk/imx/clk-pll14xx.c @@ -60,6 +60,8 @@ static const struct imx_pll14xx_rate_table imx_pll1443x_tbl[] = { PLL_1443X_RATE(650000000U, 325, 3, 2, 0), PLL_1443X_RATE(594000000U, 198, 2, 2, 0), PLL_1443X_RATE(519750000U, 173, 2, 2, 16384), + PLL_1443X_RATE(393216000U, 262, 2, 3, 9437), + PLL_1443X_RATE(361267200U, 361, 3, 3, 17511), }; struct imx_pll14xx_clk imx_1443x_pll = { From 04b6b67a3e7726e427112b17dd34a5a71a9661be Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 7 Oct 2023 13:57:34 +0200 Subject: [PATCH 178/228] Revert "PCI: qcom: Disable write access to read only registers for IP v2.3.3" This reverts commit 48e11e7c81b91002a120a513312a4de9f5ba7f08 which is commit a33d700e8eea76c62120cb3dbf5e01328f18319a upstream. It was applied to the incorrect function as the original function the commit changed is not in this kernel branch. 
Reported-by: Ben Hutchings Link: https://lore.kernel.org/r/f23affddab4d8b3cc07508f2d8735d88d823821d.camel@decadent.org.uk Signed-off-by: Greg Kroah-Hartman --- drivers/pci/controller/dwc/pcie-qcom.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-qcom.c b/drivers/pci/controller/dwc/pcie-qcom.c index 737cc9d6fa6a..c68e14271c02 100644 --- a/drivers/pci/controller/dwc/pcie-qcom.c +++ b/drivers/pci/controller/dwc/pcie-qcom.c @@ -771,8 +771,6 @@ static int qcom_pcie_get_resources_2_4_0(struct qcom_pcie *pcie) return PTR_ERR(res->phy_ahb_reset); } - dw_pcie_dbi_ro_wr_dis(pci); - return 0; } From b9c4b3ca9016b47eb414c3e987e3f029a0b0e02d Mon Sep 17 00:00:00 2001 From: Dinghao Liu Date: Sat, 23 Sep 2023 18:37:23 +0800 Subject: [PATCH 179/228] scsi: zfcp: Fix a double put in zfcp_port_enqueue() commit b481f644d9174670b385c3a699617052cd2a79d3 upstream. When device_register() fails, zfcp_port_release() will be called after put_device(). As a result, zfcp_ccw_adapter_put() will be called twice: one in zfcp_port_release() and one in the error path after device_register(). So the reference on the adapter object is doubly put, which may lead to a premature free. Fix this by adjusting the error tag after device_register(). Fixes: f3450c7b9172 ("[SCSI] zfcp: Replace local reference counting with common kref") Signed-off-by: Dinghao Liu Link: https://lore.kernel.org/r/20230923103723.10320-1-dinghao.liu@zju.edu.cn Acked-by: Benjamin Block Cc: stable@vger.kernel.org # v2.6.33+ Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/s390/scsi/zfcp_aux.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/s390/scsi/zfcp_aux.c b/drivers/s390/scsi/zfcp_aux.c index 18b713a616de..36c2bd2016f2 100644 --- a/drivers/s390/scsi/zfcp_aux.c +++ b/drivers/s390/scsi/zfcp_aux.c @@ -497,12 +497,12 @@ struct zfcp_port *zfcp_port_enqueue(struct zfcp_adapter *adapter, u64 wwpn, if (port) { put_device(&port->dev); retval = -EEXIST; - goto err_out; + goto err_put; } port = kzalloc(sizeof(struct zfcp_port), GFP_KERNEL); if (!port) - goto err_out; + goto err_put; rwlock_init(&port->unit_list_lock); INIT_LIST_HEAD(&port->unit_list); @@ -525,7 +525,7 @@ struct zfcp_port *zfcp_port_enqueue(struct zfcp_adapter *adapter, u64 wwpn, if (dev_set_name(&port->dev, "0x%016llx", (unsigned long long)wwpn)) { kfree(port); - goto err_out; + goto err_put; } retval = -EINVAL; @@ -542,7 +542,8 @@ struct zfcp_port *zfcp_port_enqueue(struct zfcp_adapter *adapter, u64 wwpn, return port; -err_out: +err_put: zfcp_ccw_adapter_put(adapter); +err_out: return ERR_PTR(retval); } From f6f25930fa346035b66e464c6fb35b27fa597168 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Sat, 23 Sep 2023 19:15:59 -0600 Subject: [PATCH 180/228] qed/red_ll2: Fix undefined behavior bug in struct qed_ll2_info commit eea03d18af9c44235865a4bc9bec4d780ef6cf21 upstream. The flexible structure (a structure that contains a flexible-array member at the end) `qed_ll2_tx_packet` is nested within the second layer of `struct qed_ll2_info`: struct qed_ll2_tx_packet { ... /* Flexible Array of bds_set determined by max_bds_per_packet */ struct { struct core_tx_bd *txq_bd; dma_addr_t tx_frag; u16 frag_len; } bds_set[]; }; struct qed_ll2_tx_queue { ... struct qed_ll2_tx_packet cur_completing_packet; }; struct qed_ll2_info { ... 
struct qed_ll2_tx_queue tx_queue; struct qed_ll2_cbs cbs; }; The problem is that member `cbs` in `struct qed_ll2_info` is placed just after an object of type `struct qed_ll2_tx_queue`, which is in itself an implicit flexible structure, which by definition ends in a flexible array member, in this case `bds_set`. This causes an undefined behavior bug at run-time when dynamic memory is allocated for `bds_set`, which could lead to a serious issue if `cbs` in `struct qed_ll2_info` is overwritten by the contents of `bds_set`. Notice that the type of `cbs` is a structure full of function pointers (and a cookie :) ): include/linux/qed/qed_ll2_if.h: 107 typedef 108 void (*qed_ll2_complete_rx_packet_cb)(void *cxt, 109 struct qed_ll2_comp_rx_data *data); 110 111 typedef 112 void (*qed_ll2_release_rx_packet_cb)(void *cxt, 113 u8 connection_handle, 114 void *cookie, 115 dma_addr_t rx_buf_addr, 116 bool b_last_packet); 117 118 typedef 119 void (*qed_ll2_complete_tx_packet_cb)(void *cxt, 120 u8 connection_handle, 121 void *cookie, 122 dma_addr_t first_frag_addr, 123 bool b_last_fragment, 124 bool b_last_packet); 125 126 typedef 127 void (*qed_ll2_release_tx_packet_cb)(void *cxt, 128 u8 connection_handle, 129 void *cookie, 130 dma_addr_t first_frag_addr, 131 bool b_last_fragment, bool b_last_packet); 132 133 typedef 134 void (*qed_ll2_slowpath_cb)(void *cxt, u8 connection_handle, 135 u32 opaque_data_0, u32 opaque_data_1); 136 137 struct qed_ll2_cbs { 138 qed_ll2_complete_rx_packet_cb rx_comp_cb; 139 qed_ll2_release_rx_packet_cb rx_release_cb; 140 qed_ll2_complete_tx_packet_cb tx_comp_cb; 141 qed_ll2_release_tx_packet_cb tx_release_cb; 142 qed_ll2_slowpath_cb slowpath_cb; 143 void *cookie; 144 }; Fix this by moving the declaration of `cbs` to the middle of its containing structure `qed_ll2_info`, preventing it from being overwritten by the contents of `bds_set` at run-time. This bug was introduced in 2017, when `bds_set` was converted to a one-element array, and started to be used as a Variable Length Object (VLO) at run-time. Fixes: f5823fe6897c ("qed: Add ll2 option to limit the number of bds per packet") Cc: stable@vger.kernel.org Signed-off-by: Gustavo A. R. Silva Reviewed-by: Kees Cook Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/ZQ+Nz8DfPg56pIzr@work Signed-off-by: Paolo Abeni Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/qlogic/qed/qed_ll2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.h b/drivers/net/ethernet/qlogic/qed/qed_ll2.h index df88d00053a2..efd025d15655 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_ll2.h +++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.h @@ -111,9 +111,9 @@ struct qed_ll2_info { enum core_tx_dest tx_dest; u8 tx_stats_en; bool main_func_queue; + struct qed_ll2_cbs cbs; struct qed_ll2_rx_queue rx_queue; struct qed_ll2_tx_queue tx_queue; - struct qed_ll2_cbs cbs; }; extern const struct qed_ll2_ops qed_ll2_ops_pass; From 1aeff207e2953e18c26811a49bd93e7c367b5d75 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 24 Aug 2023 21:06:51 -0600 Subject: [PATCH 181/228] wifi: mwifiex: Fix tlv_buf_left calculation commit eec679e4ac5f47507774956fb3479c206e761af7 upstream. In a TLV encoding scheme, the Length part represents the length after the header containing the values for type and length. 
In this case, `tlv_len` should be: tlv_len == (sizeof(*tlv_rxba) - 1) - sizeof(tlv_rxba->header) + tlv_bitmap_len Notice that the `- 1` accounts for the one-element array `bitmap`, which 1-byte size is already included in `sizeof(*tlv_rxba)`. So, if the above is correct, there is a double-counting of some members in `struct mwifiex_ie_types_rxba_sync`, when `tlv_buf_left` and `tmp` are calculated: 968 tlv_buf_left -= (sizeof(*tlv_rxba) + tlv_len); 969 tmp = (u8 *)tlv_rxba + tlv_len + sizeof(*tlv_rxba); in specific, members: drivers/net/wireless/marvell/mwifiex/fw.h:777 777 u8 mac[ETH_ALEN]; 778 u8 tid; 779 u8 reserved; 780 __le16 seq_num; 781 __le16 bitmap_len; This is clearly wrong, and affects the subsequent decoding of data in `event_buf` through `tlv_rxba`: 970 tlv_rxba = (struct mwifiex_ie_types_rxba_sync *)tmp; Fix this by using `sizeof(tlv_rxba->header)` instead of `sizeof(*tlv_rxba)` in the calculation of `tlv_buf_left` and `tmp`. This results in the following binary differences before/after changes: | drivers/net/wireless/marvell/mwifiex/11n_rxreorder.o | @@ -4698,11 +4698,11 @@ | drivers/net/wireless/marvell/mwifiex/11n_rxreorder.c:968 | tlv_buf_left -= (sizeof(tlv_rxba->header) + tlv_len); | - 1da7: lea -0x11(%rbx),%edx | + 1da7: lea -0x4(%rbx),%edx | 1daa: movzwl %bp,%eax | drivers/net/wireless/marvell/mwifiex/11n_rxreorder.c:969 | tmp = (u8 *)tlv_rxba + sizeof(tlv_rxba->header) + tlv_len; | - 1dad: lea 0x11(%r15,%rbp,1),%r15 | + 1dad: lea 0x4(%r15,%rbp,1),%r15 The above reflects the desired change: avoid counting 13 too many bytes; which is the total size of the double-counted members in `struct mwifiex_ie_types_rxba_sync`: $ pahole -C mwifiex_ie_types_rxba_sync drivers/net/wireless/marvell/mwifiex/11n_rxreorder.o struct mwifiex_ie_types_rxba_sync { struct mwifiex_ie_types_header header; /* 0 4 */ |----------------------------------------------------------------------- | u8 mac[6]; /* 4 6 */ | | u8 tid; /* 10 1 */ | | u8 reserved; /* 11 1 */ | | __le16 seq_num; /* 12 2 */ | | __le16 bitmap_len; /* 14 2 */ | | u8 bitmap[1]; /* 16 1 */ | |----------------------------------------------------------------------| | 13 bytes| ----------- /* size: 17, cachelines: 1, members: 7 */ /* last cacheline: 17 bytes */ } __attribute__((__packed__)); Fixes: 99ffe72cdae4 ("mwifiex: process rxba_sync event") Cc: stable@vger.kernel.org Signed-off-by: Gustavo A. R. 
Silva Reviewed-by: Kees Cook Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/06668edd68e7a26bbfeebd1201ae077a2a7a8bce.1692931954.git.gustavoars@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/marvell/mwifiex/11n_rxreorder.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/marvell/mwifiex/11n_rxreorder.c b/drivers/net/wireless/marvell/mwifiex/11n_rxreorder.c index 1046b59647f5..cbe4a200e4ea 100644 --- a/drivers/net/wireless/marvell/mwifiex/11n_rxreorder.c +++ b/drivers/net/wireless/marvell/mwifiex/11n_rxreorder.c @@ -977,8 +977,8 @@ void mwifiex_11n_rxba_sync_event(struct mwifiex_private *priv, } } - tlv_buf_left -= (sizeof(*tlv_rxba) + tlv_len); - tmp = (u8 *)tlv_rxba + tlv_len + sizeof(*tlv_rxba); + tlv_buf_left -= (sizeof(tlv_rxba->header) + tlv_len); + tmp = (u8 *)tlv_rxba + sizeof(tlv_rxba->header) + tlv_len; tlv_rxba = (struct mwifiex_ie_types_rxba_sync *)tmp; } } From 72fc02ebfc93dc818da3fd1cd4c9fb03fd199c9d Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Thu, 21 Sep 2023 18:46:40 -0500 Subject: [PATCH 182/228] net: replace calls to sock->ops->connect() with kernel_connect() commit 26297b4ce1ce4ea40bc9a48ec99f45da3f64d2e2 upstream. commit 0bdf399342c5 ("net: Avoid address overwrite in kernel_connect") ensured that kernel_connect() will not overwrite the address parameter in cases where BPF connect hooks perform an address rewrite. This change replaces direct calls to sock->ops->connect() in net with kernel_connect() to make these call safe. Link: https://lore.kernel.org/netdev/20230912013332.2048422-1-jrife@google.com/ Fixes: d74bad4e74ee ("bpf: Hooks for sys_connect") Cc: stable@vger.kernel.org Reviewed-by: Willem de Bruijn Signed-off-by: Jordan Rife Reviewed-by: Simon Horman Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/netfilter/ipvs/ip_vs_sync.c | 4 ++-- net/rds/tcp_connect.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index fc8db03d3efc..e45ffa762bbe 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1507,8 +1507,8 @@ static int make_send_sock(struct netns_ipvs *ipvs, int id, } get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->mcfg, id); - result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr, - salen, 0); + result = kernel_connect(sock, (struct sockaddr *)&mcast_addr, + salen, 0); if (result < 0) { pr_err("Error connecting to the multicast addr\n"); goto error; diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index 4e64598176b0..2f38dac0160e 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -169,7 +169,7 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp) * own the socket */ rds_tcp_set_callbacks(sock, cp); - ret = sock->ops->connect(sock, addr, addrlen, O_NONBLOCK); + ret = kernel_connect(sock, addr, addrlen, O_NONBLOCK); rdsdebug("connect to address %pI6c returned %d\n", &conn->c_faddr, ret); if (ret == -EINPROGRESS) From 81d03e2518945c4bc7b9a7b3f1935203954bf3ba Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Thu, 21 Sep 2023 18:46:41 -0500 Subject: [PATCH 183/228] net: prevent rewrite of msg_name in sock_sendmsg() commit 86a7e0b69bd5b812e48a20c66c2161744f3caa16 upstream. Callers of sock_sendmsg(), and similarly kernel_sendmsg(), in kernel space may observe their value of msg_name change in cases where BPF sendmsg hooks rewrite the send address. 
This has been confirmed to break NFS mounts running in UDP mode and has the potential to break other systems. This patch: 1) Creates a new function called __sock_sendmsg() with same logic as the old sock_sendmsg() function. 2) Replaces calls to sock_sendmsg() made by __sys_sendto() and __sys_sendmsg() with __sock_sendmsg() to avoid an unnecessary copy, as these system calls are already protected. 3) Modifies sock_sendmsg() so that it makes a copy of msg_name if present before passing it down the stack to insulate callers from changes to the send address. Link: https://lore.kernel.org/netdev/20230912013332.2048422-1-jrife@google.com/ Fixes: 1cedee13d25a ("bpf: Hooks for sys_sendmsg") Cc: stable@vger.kernel.org Reviewed-by: Willem de Bruijn Signed-off-by: Jordan Rife Reviewed-by: Simon Horman Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/socket.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/net/socket.c b/net/socket.c index 1a4033511703..de89ab55d475 100644 --- a/net/socket.c +++ b/net/socket.c @@ -655,6 +655,14 @@ static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg) return ret; } +static int __sock_sendmsg(struct socket *sock, struct msghdr *msg) +{ + int err = security_socket_sendmsg(sock, msg, + msg_data_left(msg)); + + return err ?: sock_sendmsg_nosec(sock, msg); +} + /** * sock_sendmsg - send a message through @sock * @sock: socket @@ -665,10 +673,19 @@ static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg) */ int sock_sendmsg(struct socket *sock, struct msghdr *msg) { - int err = security_socket_sendmsg(sock, msg, - msg_data_left(msg)); + struct sockaddr_storage *save_addr = (struct sockaddr_storage *)msg->msg_name; + struct sockaddr_storage address; + int ret; - return err ?: sock_sendmsg_nosec(sock, msg); + if (msg->msg_name) { + memcpy(&address, msg->msg_name, msg->msg_namelen); + msg->msg_name = &address; + } + + ret = __sock_sendmsg(sock, msg); + msg->msg_name = save_addr; + + return ret; } EXPORT_SYMBOL(sock_sendmsg); @@ -995,7 +1012,7 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from) if (sock->type == SOCK_SEQPACKET) msg.msg_flags |= MSG_EOR; - res = sock_sendmsg(sock, &msg); + res = __sock_sendmsg(sock, &msg); *from = msg.msg_iter; return res; } @@ -1983,7 +2000,7 @@ int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags, if (sock->file->f_flags & O_NONBLOCK) flags |= MSG_DONTWAIT; msg.msg_flags = flags; - err = sock_sendmsg(sock, &msg); + err = __sock_sendmsg(sock, &msg); out_put: fput_light(sock->file, fput_needed); @@ -2356,7 +2373,7 @@ static int ____sys_sendmsg(struct socket *sock, struct msghdr *msg_sys, err = sock_sendmsg_nosec(sock, msg_sys); goto out_freectl; } - err = sock_sendmsg(sock, msg_sys); + err = __sock_sendmsg(sock, msg_sys); /* * If this is sendmmsg() and sending to current destination address was * successful, remember it. From 33420a82067b5c9975426bbba4d551dc24389de6 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Thu, 21 Sep 2023 14:41:51 -0500 Subject: [PATCH 184/228] arm64: Add Cortex-A520 CPU part definition commit a654a69b9f9c06b2e56387d0b99f0e3e6b0ff4ef upstream. Add the CPU Part number for the new Arm design. 
Cc: stable@vger.kernel.org Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20230921194156.1050055-1-robh@kernel.org Signed-off-by: Will Deacon Signed-off-by: Greg Kroah-Hartman --- arch/arm64/include/asm/cputype.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 9cf5d9551e99..c2a1ccd5fd46 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -79,6 +79,7 @@ #define ARM_CPU_PART_CORTEX_A78AE 0xD42 #define ARM_CPU_PART_CORTEX_X1 0xD44 #define ARM_CPU_PART_CORTEX_A510 0xD46 +#define ARM_CPU_PART_CORTEX_A520 0xD80 #define ARM_CPU_PART_CORTEX_A710 0xD47 #define ARM_CPU_PART_CORTEX_X2 0xD48 #define ARM_CPU_PART_NEOVERSE_N2 0xD49 @@ -130,6 +131,7 @@ #define MIDR_CORTEX_A78AE MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78AE) #define MIDR_CORTEX_X1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X1) #define MIDR_CORTEX_A510 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A510) +#define MIDR_CORTEX_A520 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A520) #define MIDR_CORTEX_A710 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A710) #define MIDR_CORTEX_X2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X2) #define MIDR_NEOVERSE_N2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_N2) From a5f643ab11631095c60c9852705220edf1659d6f Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Sun, 23 Apr 2023 19:10:41 +0800 Subject: [PATCH 185/228] ubi: Refuse attaching if mtd's erasesize is 0 [ Upstream commit 017c73a34a661a861712f7cc1393a123e5b2208c ] There exists mtd devices with zero erasesize, which will trigger a divide-by-zero exception while attaching ubi device. Fix it by refusing attaching if mtd's erasesize is 0. Fixes: 801c135ce73d ("UBI: Unsorted Block Images") Reported-by: Yu Hao Link: https://lore.kernel.org/lkml/977347543.226888.1682011999468.JavaMail.zimbra@nod.at/T/ Signed-off-by: Zhihao Cheng Reviewed-by: Miquel Raynal Signed-off-by: Richard Weinberger Signed-off-by: Sasha Levin --- drivers/mtd/ubi/build.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c index 929ce489b062..c689bed64628 100644 --- a/drivers/mtd/ubi/build.c +++ b/drivers/mtd/ubi/build.c @@ -889,6 +889,13 @@ int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num, return -EINVAL; } + /* UBI cannot work on flashes with zero erasesize. 
*/ + if (!mtd->erasesize) { + pr_err("ubi: refuse attaching mtd%d - zero erasesize flash is not supported\n", + mtd->index); + return -EINVAL; + } + if (ubi_num == UBI_DEV_NUM_AUTO) { /* Search for an empty slot in the @ubi_devices array */ for (ubi_num = 0; ubi_num < UBI_MAX_DEVICES; ubi_num++) From 8c15c1bcc5b543c5fd673d8f1f1d40bd7c81c135 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 16 Jun 2023 11:03:34 +0200 Subject: [PATCH 186/228] wifi: iwlwifi: dbg_ini: fix structure packing [ Upstream commit 424c82e8ad56756bb98b08268ffcf68d12d183eb ] The iwl_fw_ini_error_dump_range structure has conflicting alignment requirements for the inner union and the outer struct: In file included from drivers/net/wireless/intel/iwlwifi/fw/dbg.c:9: drivers/net/wireless/intel/iwlwifi/fw/error-dump.h:312:2: error: field within 'struct iwl_fw_ini_error_dump_range' is less aligned than 'union iwl_fw_ini_error_dump_range::(anonymous at drivers/net/wireless/intel/iwlwifi/fw/error-dump.h:312:2)' and is usually due to 'struct iwl_fw_ini_error_dump_range' being packed, which can lead to unaligned accesses [-Werror,-Wunaligned-access] union { As the original intention was apparently to make the entire structure unaligned, mark the innermost members the same way so the union becomes packed as well. Fixes: 973193554cae6 ("iwlwifi: dbg_ini: dump headers cleanup") Signed-off-by: Arnd Bergmann Acked-by: Gregory Greenman Link: https://lore.kernel.org/r/20230616090343.2454061-1-arnd@kernel.org Signed-off-by: Johannes Berg Signed-off-by: Sasha Levin --- drivers/net/wireless/intel/iwlwifi/fw/error-dump.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/fw/error-dump.h b/drivers/net/wireless/intel/iwlwifi/fw/error-dump.h index cb40f509ab61..d08750abac95 100644 --- a/drivers/net/wireless/intel/iwlwifi/fw/error-dump.h +++ b/drivers/net/wireless/intel/iwlwifi/fw/error-dump.h @@ -334,9 +334,9 @@ struct iwl_fw_ini_fifo_hdr { struct iwl_fw_ini_error_dump_range { __le32 range_data_size; union { - __le32 internal_base_addr; - __le64 dram_base_addr; - __le32 page_num; + __le32 internal_base_addr __packed; + __le64 dram_base_addr __packed; + __le32 page_num __packed; struct iwl_fw_ini_fifo_hdr fifo_hdr; struct iwl_cmd_header fw_pkt_hdr; }; From 10a18c8bac7f60d32b7af22da03b66f350beee38 Mon Sep 17 00:00:00 2001 From: Pin-yen Lin Date: Fri, 8 Sep 2023 18:41:12 +0800 Subject: [PATCH 187/228] wifi: mwifiex: Fix oob check condition in mwifiex_process_rx_packet [ Upstream commit aef7a0300047e7b4707ea0411dc9597cba108fc8 ] Only skip the code path trying to access the rfc1042 headers when the buffer is too small, so the driver can still process packets without rfc1042 headers. 
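Reduced to a standalone sketch (hypothetical header layout, not the driver structures), the rule is: reject only frames too short for the part that is always dereferenced, and treat the optional LLC/SNAP trailer as absent rather than fatal when it does not fit.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct mandatory_hdr { uint8_t dst[6]; uint8_t src[6]; uint16_t len; };
struct optional_hdr  { uint8_t llc[3]; uint8_t oui[3]; uint16_t snap_type; };

/* The frame is usable at all only if the mandatory header fits. */
static bool frame_valid(size_t frame_len, size_t offset)
{
        return offset + sizeof(struct mandatory_hdr) <= frame_len;
}

/* Only dereference the optional trailer when it actually fits. */
static bool frame_has_optional(size_t frame_len, size_t offset)
{
        return offset + sizeof(struct mandatory_hdr) +
               sizeof(struct optional_hdr) <= frame_len;
}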
Fixes: 119585281617 ("wifi: mwifiex: Fix OOB and integer underflow when rx packets") Signed-off-by: Pin-yen Lin Acked-by: Brian Norris Reviewed-by: Matthew Wang Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20230908104308.1546501-1-treapking@chromium.org Signed-off-by: Sasha Levin --- drivers/net/wireless/marvell/mwifiex/sta_rx.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/net/wireless/marvell/mwifiex/sta_rx.c b/drivers/net/wireless/marvell/mwifiex/sta_rx.c index 3c555946cb2c..5b16e330014a 100644 --- a/drivers/net/wireless/marvell/mwifiex/sta_rx.c +++ b/drivers/net/wireless/marvell/mwifiex/sta_rx.c @@ -98,7 +98,8 @@ int mwifiex_process_rx_packet(struct mwifiex_private *priv, rx_pkt_len = le16_to_cpu(local_rx_pd->rx_pkt_length); rx_pkt_hdr = (void *)local_rx_pd + rx_pkt_off; - if (sizeof(*rx_pkt_hdr) + rx_pkt_off > skb->len) { + if (sizeof(rx_pkt_hdr->eth803_hdr) + sizeof(rfc1042_header) + + rx_pkt_off > skb->len) { mwifiex_dbg(priv->adapter, ERROR, "wrong rx packet offset: len=%d, rx_pkt_off=%d\n", skb->len, rx_pkt_off); @@ -107,12 +108,13 @@ int mwifiex_process_rx_packet(struct mwifiex_private *priv, return -1; } - if ((!memcmp(&rx_pkt_hdr->rfc1042_hdr, bridge_tunnel_header, - sizeof(bridge_tunnel_header))) || - (!memcmp(&rx_pkt_hdr->rfc1042_hdr, rfc1042_header, - sizeof(rfc1042_header)) && - ntohs(rx_pkt_hdr->rfc1042_hdr.snap_type) != ETH_P_AARP && - ntohs(rx_pkt_hdr->rfc1042_hdr.snap_type) != ETH_P_IPX)) { + if (sizeof(*rx_pkt_hdr) + rx_pkt_off <= skb->len && + ((!memcmp(&rx_pkt_hdr->rfc1042_hdr, bridge_tunnel_header, + sizeof(bridge_tunnel_header))) || + (!memcmp(&rx_pkt_hdr->rfc1042_hdr, rfc1042_header, + sizeof(rfc1042_header)) && + ntohs(rx_pkt_hdr->rfc1042_hdr.snap_type) != ETH_P_AARP && + ntohs(rx_pkt_hdr->rfc1042_hdr.snap_type) != ETH_P_IPX))) { /* * Replace the 803 header and rfc1042 header (llc/snap) with an * EthernetII header, keep the src/dst and snap_type From 93dd471d3a2f37fbd2c5f5fef61cd8b7c8eec027 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Sun, 17 Sep 2023 23:38:46 +0800 Subject: [PATCH 188/228] bpf: Fix tr dereferencing [ Upstream commit b724a6418f1f853bcb39c8923bf14a50c7bdbd07 ] Fix 'tr' dereferencing bug when CONFIG_BPF_JIT is turned off. When CONFIG_BPF_JIT is turned off, 'bpf_trampoline_get()' returns NULL, which is same as the cases when CONFIG_BPF_JIT is turned on. 
Closes: https://lore.kernel.org/r/202309131936.5Nc8eUD0-lkp@intel.com/ Fixes: f7b12b6fea00 ("bpf: verifier: refactor check_attach_btf_id()") Reported-by: kernel test robot Reported-by: Dan Carpenter Signed-off-by: Leon Hwang Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20230917153846.88732-1-hffilwlqm@gmail.com Signed-off-by: Sasha Levin --- include/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b010d45a1ecd..8f4379e93ad4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -725,7 +725,7 @@ static inline int bpf_trampoline_unlink_prog(struct bpf_prog *prog, static inline struct bpf_trampoline *bpf_trampoline_get(u64 key, struct bpf_attach_target_info *tgt_info) { - return ERR_PTR(-EOPNOTSUPP); + return NULL; } static inline void bpf_trampoline_put(struct bpf_trampoline *tr) {} #define DEFINE_BPF_DISPATCHER(name) From 54a4faab2baa7c5756a6d75438a129c183e9beb6 Mon Sep 17 00:00:00 2001 From: Alexandra Diupina Date: Tue, 19 Sep 2023 17:25:02 +0300 Subject: [PATCH 189/228] drivers/net: process the result of hdlc_open() and add call of hdlc_close() in uhdlc_close() [ Upstream commit a59addacf899b1b21a7b7449a1c52c98704c2472 ] Process the result of hdlc_open() and call uhdlc_close() in case of an error. It is necessary to pass the error code up the control flow, similar to a possible error in request_irq(). Also add a hdlc_close() call to the uhdlc_close() because the comment to hdlc_close() says it must be called by the hardware driver when the HDLC device is being closed Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: c19b6d246a35 ("drivers/net: support hdlc function for QE-UCC") Signed-off-by: Alexandra Diupina Reviewed-by: Christophe Leroy Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/wan/fsl_ucc_hdlc.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/net/wan/fsl_ucc_hdlc.c b/drivers/net/wan/fsl_ucc_hdlc.c index ae1ae65e7f90..bc3650c70730 100644 --- a/drivers/net/wan/fsl_ucc_hdlc.c +++ b/drivers/net/wan/fsl_ucc_hdlc.c @@ -34,6 +34,8 @@ #define TDM_PPPOHT_SLIC_MAXIN #define RX_BD_ERRORS (R_CD_S | R_OV_S | R_CR_S | R_AB_S | R_NO_S | R_LG_S) +static int uhdlc_close(struct net_device *dev); + static struct ucc_tdm_info utdm_primary_info = { .uf_info = { .tsa = 0, @@ -708,6 +710,7 @@ static int uhdlc_open(struct net_device *dev) hdlc_device *hdlc = dev_to_hdlc(dev); struct ucc_hdlc_private *priv = hdlc->priv; struct ucc_tdm *utdm = priv->utdm; + int rc = 0; if (priv->hdlc_busy != 1) { if (request_irq(priv->ut_info->uf_info.irq, @@ -731,10 +734,13 @@ static int uhdlc_open(struct net_device *dev) napi_enable(&priv->napi); netdev_reset_queue(dev); netif_start_queue(dev); - hdlc_open(dev); + + rc = hdlc_open(dev); + if (rc) + uhdlc_close(dev); } - return 0; + return rc; } static void uhdlc_memclean(struct ucc_hdlc_private *priv) @@ -824,6 +830,8 @@ static int uhdlc_close(struct net_device *dev) netdev_reset_queue(dev); priv->hdlc_busy = 0; + hdlc_close(dev); + return 0; } From 666cdc43df2475807ae3aaa85e40f87c76ee2629 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Tue, 19 Sep 2023 21:47:47 +0200 Subject: [PATCH 190/228] wifi: mt76: mt76x02: fix MT76x0 external LNA gain handling [ Upstream commit 684e45e120b82deccaf8b85633905304a3bbf56d ] On MT76x0, LNA gain should be applied for both external and internal LNA. On MT76x2, LNA gain should be treated as 0 for external LNA. 
Move the LNA type based logic to mt76x2 in order to fix mt76x0. Fixes: 2daa67588f34 ("mt76x0: unify lna_gain parsing") Reported-by: Shiji Yang Signed-off-by: Felix Fietkau Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20230919194747.31647-1-nbd@nbd.name Signed-off-by: Sasha Levin --- drivers/net/wireless/mediatek/mt76/mt76x02_eeprom.c | 7 ------- drivers/net/wireless/mediatek/mt76/mt76x2/eeprom.c | 13 +++++++++++-- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt76x02_eeprom.c b/drivers/net/wireless/mediatek/mt76/mt76x02_eeprom.c index 0acabba2d1a5..5d402cf2951c 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x02_eeprom.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x02_eeprom.c @@ -131,15 +131,8 @@ u8 mt76x02_get_lna_gain(struct mt76x02_dev *dev, s8 *lna_2g, s8 *lna_5g, struct ieee80211_channel *chan) { - u16 val; u8 lna; - val = mt76x02_eeprom_get(dev, MT_EE_NIC_CONF_1); - if (val & MT_EE_NIC_CONF_1_LNA_EXT_2G) - *lna_2g = 0; - if (val & MT_EE_NIC_CONF_1_LNA_EXT_5G) - memset(lna_5g, 0, sizeof(s8) * 3); - if (chan->band == NL80211_BAND_2GHZ) lna = *lna_2g; else if (chan->hw_value <= 64) diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2/eeprom.c b/drivers/net/wireless/mediatek/mt76/mt76x2/eeprom.c index 410ffce3baff..60478116014f 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x2/eeprom.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x2/eeprom.c @@ -256,7 +256,8 @@ void mt76x2_read_rx_gain(struct mt76x02_dev *dev) struct ieee80211_channel *chan = dev->mphy.chandef.chan; int channel = chan->hw_value; s8 lna_5g[3], lna_2g; - u8 lna; + bool use_lna; + u8 lna = 0; u16 val; if (chan->band == NL80211_BAND_2GHZ) @@ -275,7 +276,15 @@ void mt76x2_read_rx_gain(struct mt76x02_dev *dev) dev->cal.rx.mcu_gain |= (lna_5g[1] & 0xff) << 16; dev->cal.rx.mcu_gain |= (lna_5g[2] & 0xff) << 24; - lna = mt76x02_get_lna_gain(dev, &lna_2g, lna_5g, chan); + val = mt76x02_eeprom_get(dev, MT_EE_NIC_CONF_1); + if (chan->band == NL80211_BAND_2GHZ) + use_lna = !(val & MT_EE_NIC_CONF_1_LNA_EXT_2G); + else + use_lna = !(val & MT_EE_NIC_CONF_1_LNA_EXT_5G); + + if (use_lna) + lna = mt76x02_get_lna_gain(dev, &lna_2g, lna_5g, chan); + dev->cal.rx.lna_gain = mt76x02_sign_extend(lna, 8); } EXPORT_SYMBOL_GPL(mt76x2_read_rx_gain); From 725fd20805590d91d8ab82197fff3fd527d27383 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Fri, 22 Sep 2023 16:37:11 +0100 Subject: [PATCH 191/228] regmap: rbtree: Fix wrong register marked as in-cache when creating new node [ Upstream commit 7a795ac8d49e2433e1b97caf5e99129daf8e1b08 ] When regcache_rbtree_write() creates a new rbtree_node it was passing the wrong bit number to regcache_rbtree_set_register(). The bit number is the offset __in number of registers__, but in the case of creating a new block regcache_rbtree_write() was not dividing by the address stride to get the number of registers. Fix this by dividing by map->reg_stride. Compare with regcache_rbtree_read() where the bit is checked. This bug meant that the wrong register was marked as present. The register that was written to the cache could not be read from the cache because it was not marked as cached. But a nearby register could be marked as having a cached value even if it was never written to the cache. 
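The bookkeeping being corrected, as a small standalone sketch (hypothetical register map, not the regmap code): the per-node bitmap is indexed in units of registers, so a byte offset must be divided by the stride before it is used as a bit number.

#include <assert.h>

static unsigned int present_bit(unsigned int reg, unsigned int base_reg,
                                unsigned int reg_stride)
{
        return (reg - base_reg) / reg_stride;   /* offset in registers, not bytes */
}

int main(void)
{
        /* With a stride of 4, register 0x18 is the third register after 0x10. */
        assert(present_bit(0x18, 0x10, 4) == 2);
        /* The raw byte offset (8) would mark the wrong register as cached. */
        assert((0x18 - 0x10) != present_bit(0x18, 0x10, 4));
        return 0;
}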
Signed-off-by: Richard Fitzgerald Fixes: 3f4ff561bc88 ("regmap: rbtree: Make cache_present bitmap per node") Link: https://lore.kernel.org/r/20230922153711.28103-1-rf@opensource.cirrus.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- drivers/base/regmap/regcache-rbtree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/base/regmap/regcache-rbtree.c b/drivers/base/regmap/regcache-rbtree.c index ae6b8788d5f3..d65715b9e129 100644 --- a/drivers/base/regmap/regcache-rbtree.c +++ b/drivers/base/regmap/regcache-rbtree.c @@ -453,7 +453,8 @@ static int regcache_rbtree_write(struct regmap *map, unsigned int reg, if (!rbnode) return -ENOMEM; regcache_rbtree_set_register(map, rbnode, - reg - rbnode->base_reg, value); + (reg - rbnode->base_reg) / map->reg_stride, + value); regcache_rbtree_insert(map, &rbtree_ctx->root, rbnode); rbtree_ctx->cached_rbnode = rbnode; } From a9430129d8dbfce075b9392dd376ddfb3cc8a805 Mon Sep 17 00:00:00 2001 From: Oleksandr Tymoshenko Date: Thu, 21 Sep 2023 06:45:05 +0000 Subject: [PATCH 192/228] ima: Finish deprecation of IMA_TRUSTED_KEYRING Kconfig [ Upstream commit be210c6d3597faf330cb9af33b9f1591d7b2a983 ] The removal of IMA_TRUSTED_KEYRING made IMA_LOAD_X509 and IMA_BLACKLIST_KEYRING unavailable because the latter two depend on the former. Since IMA_TRUSTED_KEYRING was deprecated in favor of INTEGRITY_TRUSTED_KEYRING use it as a dependency for the two Kconfigs affected by the deprecation. Fixes: 5087fd9e80e5 ("ima: Remove deprecated IMA_TRUSTED_KEYRING Kconfig") Signed-off-by: Oleksandr Tymoshenko Reviewed-by: Nayna Jain Signed-off-by: Mimi Zohar Signed-off-by: Sasha Levin --- security/integrity/ima/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/security/integrity/ima/Kconfig b/security/integrity/ima/Kconfig index 05b8f5bcc37a..d1b490705c2e 100644 --- a/security/integrity/ima/Kconfig +++ b/security/integrity/ima/Kconfig @@ -268,7 +268,7 @@ config IMA_KEYRINGS_PERMIT_SIGNED_BY_BUILTIN_OR_SECONDARY config IMA_BLACKLIST_KEYRING bool "Create IMA machine owner blacklist keyrings (EXPERIMENTAL)" depends on SYSTEM_TRUSTED_KEYRING - depends on IMA_TRUSTED_KEYRING + depends on INTEGRITY_TRUSTED_KEYRING default n help This option creates an IMA blacklist keyring, which contains all @@ -278,7 +278,7 @@ config IMA_BLACKLIST_KEYRING config IMA_LOAD_X509 bool "Load X509 certificate onto the '.ima' trusted keyring" - depends on IMA_TRUSTED_KEYRING + depends on INTEGRITY_TRUSTED_KEYRING default n help File signature verification is based on the public keys From 77f82df960cbd457ae4741048cb405bdb73129c1 Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Mon, 18 Sep 2023 15:58:48 -0700 Subject: [PATCH 193/228] scsi: target: core: Fix deadlock due to recursive locking [ Upstream commit a154f5f643c6ecddd44847217a7a3845b4350003 ] The following call trace shows a deadlock issue due to recursive locking of mutex "device_mutex". First lock acquire is in target_for_each_device() and second in target_free_device(). 
PID: 148266 TASK: ffff8be21ffb5d00 CPU: 10 COMMAND: "iscsi_ttx" #0 [ffffa2bfc9ec3b18] __schedule at ffffffffa8060e7f #1 [ffffa2bfc9ec3ba0] schedule at ffffffffa8061224 #2 [ffffa2bfc9ec3bb8] schedule_preempt_disabled at ffffffffa80615ee #3 [ffffa2bfc9ec3bc8] __mutex_lock at ffffffffa8062fd7 #4 [ffffa2bfc9ec3c40] __mutex_lock_slowpath at ffffffffa80631d3 #5 [ffffa2bfc9ec3c50] mutex_lock at ffffffffa806320c #6 [ffffa2bfc9ec3c68] target_free_device at ffffffffc0935998 [target_core_mod] #7 [ffffa2bfc9ec3c90] target_core_dev_release at ffffffffc092f975 [target_core_mod] #8 [ffffa2bfc9ec3ca0] config_item_put at ffffffffa79d250f #9 [ffffa2bfc9ec3cd0] config_item_put at ffffffffa79d2583 #10 [ffffa2bfc9ec3ce0] target_devices_idr_iter at ffffffffc0933f3a [target_core_mod] #11 [ffffa2bfc9ec3d00] idr_for_each at ffffffffa803f6fc #12 [ffffa2bfc9ec3d60] target_for_each_device at ffffffffc0935670 [target_core_mod] #13 [ffffa2bfc9ec3d98] transport_deregister_session at ffffffffc0946408 [target_core_mod] #14 [ffffa2bfc9ec3dc8] iscsit_close_session at ffffffffc09a44a6 [iscsi_target_mod] #15 [ffffa2bfc9ec3df0] iscsit_close_connection at ffffffffc09a4a88 [iscsi_target_mod] #16 [ffffa2bfc9ec3df8] finish_task_switch at ffffffffa76e5d07 #17 [ffffa2bfc9ec3e78] iscsit_take_action_for_connection_exit at ffffffffc0991c23 [iscsi_target_mod] #18 [ffffa2bfc9ec3ea0] iscsi_target_tx_thread at ffffffffc09a403b [iscsi_target_mod] #19 [ffffa2bfc9ec3f08] kthread at ffffffffa76d8080 #20 [ffffa2bfc9ec3f50] ret_from_fork at ffffffffa8200364 Fixes: 36d4cb460bcb ("scsi: target: Avoid that EXTENDED COPY commands trigger lock inversion") Signed-off-by: Junxiao Bi Link: https://lore.kernel.org/r/20230918225848.66463-1-junxiao.bi@oracle.com Reviewed-by: Mike Christie Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/target/target_core_device.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/target/target_core_device.c b/drivers/target/target_core_device.c index 4664330fb55d..9aeedcff7d02 100644 --- a/drivers/target/target_core_device.c +++ b/drivers/target/target_core_device.c @@ -867,7 +867,6 @@ sector_t target_to_linux_sector(struct se_device *dev, sector_t lb) EXPORT_SYMBOL(target_to_linux_sector); struct devices_idr_iter { - struct config_item *prev_item; int (*fn)(struct se_device *dev, void *data); void *data; }; @@ -877,11 +876,9 @@ static int target_devices_idr_iter(int id, void *p, void *data) { struct devices_idr_iter *iter = data; struct se_device *dev = p; + struct config_item *item; int ret; - config_item_put(iter->prev_item); - iter->prev_item = NULL; - /* * We add the device early to the idr, so it can be used * by backend modules during configuration. 
We do not want @@ -891,12 +888,13 @@ static int target_devices_idr_iter(int id, void *p, void *data) if (!target_dev_configured(dev)) return 0; - iter->prev_item = config_item_get_unless_zero(&dev->dev_group.cg_item); - if (!iter->prev_item) + item = config_item_get_unless_zero(&dev->dev_group.cg_item); + if (!item) return 0; mutex_unlock(&device_mutex); ret = iter->fn(dev, iter->data); + config_item_put(item); mutex_lock(&device_mutex); return ret; @@ -919,7 +917,6 @@ int target_for_each_device(int (*fn)(struct se_device *dev, void *data), mutex_lock(&device_mutex); ret = idr_for_each(&devices_idr, target_devices_idr_iter, &iter); mutex_unlock(&device_mutex); - config_item_put(iter.prev_item); return ret; } From d8f2ba9ec3582ef83e2f3050e4a7c83c1e7d9d17 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 27 Sep 2023 09:22:14 +0200 Subject: [PATCH 194/228] ima: rework CONFIG_IMA dependency block [ Upstream commit 91e326563ee34509c35267808a4b1b3ea3db62a8 ] Changing the direct dependencies of IMA_BLACKLIST_KEYRING and IMA_LOAD_X509 caused them to no longer depend on IMA, but a a configuration without IMA results in link failures: arm-linux-gnueabi-ld: security/integrity/iint.o: in function `integrity_load_keys': iint.c:(.init.text+0xd8): undefined reference to `ima_load_x509' aarch64-linux-ld: security/integrity/digsig_asymmetric.o: in function `asymmetric_verify': digsig_asymmetric.c:(.text+0x104): undefined reference to `ima_blacklist_keyring' Adding explicit dependencies on IMA would fix this, but a more reliable way to do this is to enclose the entire Kconfig file in an 'if IMA' block. This also allows removing the existing direct dependencies. Fixes: be210c6d3597f ("ima: Finish deprecation of IMA_TRUSTED_KEYRING Kconfig") Signed-off-by: Arnd Bergmann Signed-off-by: Mimi Zohar Signed-off-by: Sasha Levin --- security/integrity/ima/Kconfig | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/security/integrity/ima/Kconfig b/security/integrity/ima/Kconfig index d1b490705c2e..d0d3ff58da49 100644 --- a/security/integrity/ima/Kconfig +++ b/security/integrity/ima/Kconfig @@ -29,9 +29,11 @@ config IMA to learn more about IMA. If unsure, say N. +if IMA + config IMA_KEXEC bool "Enable carrying the IMA measurement list across a soft boot" - depends on IMA && TCG_TPM && HAVE_IMA_KEXEC + depends on TCG_TPM && HAVE_IMA_KEXEC default n help TPM PCRs are only reset on a hard reboot. In order to validate @@ -43,7 +45,6 @@ config IMA_KEXEC config IMA_MEASURE_PCR_IDX int - depends on IMA range 8 14 default 10 help @@ -53,7 +54,7 @@ config IMA_MEASURE_PCR_IDX config IMA_LSM_RULES bool - depends on IMA && AUDIT && (SECURITY_SELINUX || SECURITY_SMACK || SECURITY_APPARMOR) + depends on AUDIT && (SECURITY_SELINUX || SECURITY_SMACK || SECURITY_APPARMOR) default y help Disabling this option will disregard LSM based policy rules. @@ -61,7 +62,6 @@ config IMA_LSM_RULES choice prompt "Default template" default IMA_NG_TEMPLATE - depends on IMA help Select the default IMA measurement template. @@ -80,14 +80,12 @@ endchoice config IMA_DEFAULT_TEMPLATE string - depends on IMA default "ima-ng" if IMA_NG_TEMPLATE default "ima-sig" if IMA_SIG_TEMPLATE choice prompt "Default integrity hash algorithm" default IMA_DEFAULT_HASH_SHA1 - depends on IMA help Select the default hash algorithm used for the measurement list, integrity appraisal and audit log. 
The compiled default @@ -117,7 +115,6 @@ endchoice config IMA_DEFAULT_HASH string - depends on IMA default "sha1" if IMA_DEFAULT_HASH_SHA1 default "sha256" if IMA_DEFAULT_HASH_SHA256 default "sha512" if IMA_DEFAULT_HASH_SHA512 @@ -126,7 +123,6 @@ config IMA_DEFAULT_HASH config IMA_WRITE_POLICY bool "Enable multiple writes to the IMA policy" - depends on IMA default n help IMA policy can now be updated multiple times. The new rules get @@ -137,7 +133,6 @@ config IMA_WRITE_POLICY config IMA_READ_POLICY bool "Enable reading back the current IMA policy" - depends on IMA default y if IMA_WRITE_POLICY default n if !IMA_WRITE_POLICY help @@ -147,7 +142,6 @@ config IMA_READ_POLICY config IMA_APPRAISE bool "Appraise integrity measurements" - depends on IMA default n help This option enables local measurement integrity appraisal. @@ -303,7 +297,6 @@ config IMA_APPRAISE_SIGNED_INIT config IMA_MEASURE_ASYMMETRIC_KEYS bool - depends on IMA depends on ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y default y @@ -319,3 +312,5 @@ config IMA_SECURE_AND_OR_TRUSTED_BOOT help This option is selected by architectures to enable secure and/or trusted boot based on IMA runtime policies. + +endif From 225cd4f67bd4c2c6541e7021dd4ebe0ca23d1994 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 17 Sep 2023 19:05:50 -0400 Subject: [PATCH 195/228] NFSv4: Fix a nfs4_state_manager() race [ Upstream commit ed1cc05aa1f7fe8197d300e914afc28ab9818f89 ] If the NFS4CLNT_RUN_MANAGER flag got set just before we cleared NFS4CLNT_MANAGER_RUNNING, then we might have won the race against nfs4_schedule_state_manager(), and are responsible for handling the recovery situation. Fixes: aeabb3c96186 ("NFSv4: Fix a NFSv4 state manager deadlock") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker Signed-off-by: Sasha Levin --- fs/nfs/nfs4state.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 10946b24c66f..afb617a4a7e4 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -2690,6 +2690,13 @@ static void nfs4_state_manager(struct nfs_client *clp) nfs4_end_drain_session(clp); nfs4_clear_state_manager_bit(clp); + if (test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state) && + !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, + &clp->cl_state)) { + memflags = memalloc_nofs_save(); + continue; + } + if (!test_and_set_bit(NFS4CLNT_RECALL_RUNNING, &clp->cl_state)) { if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) { nfs_client_return_marked_delegations(clp); From 6e3d9e5caba870287b857e59c4858fd0e90290cb Mon Sep 17 00:00:00 2001 From: Mauricio Faria de Oliveira Date: Thu, 28 Sep 2023 17:28:07 -0300 Subject: [PATCH 196/228] modpost: add missing else to the "of" check [ Upstream commit cbc3d00cf88fda95dbcafee3b38655b7a8f2650a ] Without this 'else' statement, an "usb" name goes into two handlers: the first/previous 'if' statement _AND_ the for-loop over 'devtable', but the latter is useless as it has no 'usb' device_id entry anyway. Tested with allmodconfig before/after patch; no changes to *.mod.c: git checkout v6.6-rc3 make -j$(nproc) allmodconfig make -j$(nproc) olddefconfig make -j$(nproc) find . -name '*.mod.c' | cpio -pd /tmp/before # apply patch make -j$(nproc) find . 
-name '*.mod.c' | cpio -pd /tmp/after diff -r /tmp/before/ /tmp/after/ # no difference Fixes: acbef7b76629 ("modpost: fix module autoloading for OF devices with generic compatible property") Signed-off-by: Mauricio Faria de Oliveira Signed-off-by: Masahiro Yamada Signed-off-by: Sasha Levin --- scripts/mod/file2alias.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c index 2417dd1dee33..da4df53ee695 100644 --- a/scripts/mod/file2alias.c +++ b/scripts/mod/file2alias.c @@ -1490,7 +1490,7 @@ void handle_moddevtable(struct module *mod, struct elf_info *info, /* First handle the "special" cases */ if (sym_is(name, namelen, "usb")) do_usb_table(symval, sym->st_size, mod); - if (sym_is(name, namelen, "of")) + else if (sym_is(name, namelen, "of")) do_of_table(symval, sym->st_size, mod); else if (sym_is(name, namelen, "pnp")) do_pnp_device_entry(symval, sym->st_size, mod); From 2ea52a2fb8e87067e26bbab4efb8872639240eb0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 21 Sep 2023 08:46:26 +0000 Subject: [PATCH 197/228] net: fix possible store tearing in neigh_periodic_work() [ Upstream commit 25563b581ba3a1f263a00e8c9a97f5e7363be6fd ] While looking at a related syzbot report involving neigh_periodic_work(), I found that I forgot to add an annotation when deleting an RCU protected item from a list. Readers use rcu_deference(*np), we need to use either rcu_assign_pointer() or WRITE_ONCE() on writer side to prevent store tearing. I use rcu_assign_pointer() to have lockdep support, this was the choice made in neigh_flush_dev(). Fixes: 767e97e1e0db ("neigh: RCU conversion of struct neighbour") Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Reviewed-by: Simon Horman Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/core/neighbour.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 3b642c412cf3..15267428c4f8 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -935,7 +935,9 @@ static void neigh_periodic_work(struct work_struct *work) (state == NUD_FAILED || !time_in_range_open(jiffies, n->used, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) { - *np = n->next; + rcu_assign_pointer(*np, + rcu_dereference_protected(n->next, + lockdep_is_held(&tbl->lock))); neigh_mark_dead(n); write_unlock(&n->lock); neigh_cleanup_and_release(n); From 96b2e1090397217839fcd6c9b6d8f5d439e705ed Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 21 Sep 2023 11:41:19 +0100 Subject: [PATCH 198/228] ipv4, ipv6: Fix handling of transhdrlen in __ip{,6}_append_data() [ Upstream commit 9d4c75800f61e5d75c1659ba201b6c0c7ead3070 ] Including the transhdrlen in length is a problem when the packet is partially filled (e.g. something like send(MSG_MORE) happened previously) when appending to an IPv4 or IPv6 packet as we don't want to repeat the transport header or account for it twice. This can happen under some circumstances, such as splicing into an L2TP socket. The symptom observed is a warning in __ip6_append_data(): WARNING: CPU: 1 PID: 5042 at net/ipv6/ip6_output.c:1800 __ip6_append_data.isra.0+0x1be8/0x47f0 net/ipv6/ip6_output.c:1800 that occurs when MSG_SPLICE_PAGES is used to append more data to an already partially occupied skbuff. The warning occurs when 'copy' is larger than the amount of data in the message iterator. This is because the requested length includes the transport header length when it shouldn't. 
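One general C note that helps when reading the ulen assignment in the l2tp hunk further below: the conditional operator binds more loosely than addition, so an expression of the form a + cond ? x : 0 groups as (a + cond) ? x : 0, and parentheses are needed for the guard to apply only to the added term. A minimal standalone illustration (hypothetical values):

#include <assert.h>

int main(void)
{
        int len = 100, queue_empty = 1, hdrlen = 8;

        /* Without parentheses the addition is grouped first ... */
        assert((len + queue_empty ? hdrlen : 0) == hdrlen);
        /* ... parentheses make the guard apply to the added term only. */
        assert((len + (queue_empty ? hdrlen : 0)) == len + hdrlen);
        return 0;
}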
This can be triggered by, for example: sfd = socket(AF_INET6, SOCK_DGRAM, IPPROTO_L2TP); bind(sfd, ...); // ::1 connect(sfd, ...); // ::1 port 7 send(sfd, buffer, 4100, MSG_MORE); sendfile(sfd, dfd, NULL, 1024); Fix this by only adding transhdrlen into the length if the write queue is empty in l2tp_ip6_sendmsg(), analogously to how UDP does things. l2tp_ip_sendmsg() looks like it won't suffer from this problem as it builds the UDP packet itself. Fixes: a32e0eec7042 ("l2tp: introduce L2TPv3 IP encapsulation support for IPv6") Reported-by: syzbot+62cbf263225ae13ff153@syzkaller.appspotmail.com Link: https://lore.kernel.org/r/0000000000001c12b30605378ce8@google.com/ Suggested-by: Willem de Bruijn Signed-off-by: David Howells cc: Eric Dumazet cc: Willem de Bruijn cc: "David S. Miller" cc: David Ahern cc: Paolo Abeni cc: Jakub Kicinski cc: netdev@vger.kernel.org cc: bpf@vger.kernel.org cc: syzkaller-bugs@googlegroups.com Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/l2tp/l2tp_ip6.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 382124d6f764..9746c624a550 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -508,7 +508,6 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) */ if (len > INT_MAX - transhdrlen) return -EMSGSIZE; - ulen = len + transhdrlen; /* Mirror BSD error message compatibility */ if (msg->msg_flags & MSG_OOB) @@ -629,6 +628,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) back_from_confirm: lock_sock(sk); + ulen = len + skb_queue_empty(&sk->sk_write_queue) ? transhdrlen : 0; err = ip6_append_data(sk, ip_generic_getfrag, msg, ulen, transhdrlen, &ipc6, &fl6, (struct rt6_info *)dst, From d44346dda7d4faca4a64b6233a3ea0a09dc615e4 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Fri, 22 Sep 2023 09:47:41 -0300 Subject: [PATCH 199/228] net: dsa: mv88e6xxx: Avoid EEPROM timeout when EEPROM is absent [ Upstream commit 6ccf50d4d4741e064ba35511a95402c63bbe21a8 ] Since commit 23d775f12dcd ("net: dsa: mv88e6xxx: Wait for EEPROM done before HW reset") the following error is seen on a imx8mn board with a 88E6320 switch: mv88e6085 30be0000.ethernet-1:00: Timeout waiting for EEPROM done This board does not have an EEPROM attached to the switch though. This problem is well explained by Andrew Lunn: "If there is an EEPROM, and the EEPROM contains a lot of data, it could be that when we perform a hardware reset towards the end of probe, it interrupts an I2C bus transaction, leaving the I2C bus in a bad state, and future reads of the EEPROM do not work. The work around for this was to poll the EEInt status and wait for it to go true before performing the hardware reset. However, we have discovered that for some boards which do not have an EEPROM, EEInt never indicates complete. As a result, mv88e6xxx_g1_wait_eeprom_done() spins for a second and then prints a warning. We probably need a different solution than calling mv88e6xxx_g1_wait_eeprom_done(). The datasheet for 6352 documents the EEPROM Command register: bit 15 is: EEPROM Unit Busy. This bit must be set to a one to start an EEPROM operation (see EEOp below). Only one EEPROM operation can be executing at one time so this bit must be zero before setting it to a one. When the requested EEPROM operation completes this bit will automatically be cleared to a zero. 
The transition of this bit from a one to a zero can be used to generate an interrupt (the EEInt in Global 1, offset 0x00). and more interesting is bit 11: Register Loader Running. This bit is set to one whenever the register loader is busy executing instructions contained in the EEPROM." Change to using mv88e6xxx_g2_eeprom_wait() to fix the timeout error when the EEPROM chip is not present. Fixes: 23d775f12dcd ("net: dsa: mv88e6xxx: Wait for EEPROM done before HW reset") Suggested-by: Andrew Lunn Signed-off-by: Fabio Estevam Reviewed-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/dsa/mv88e6xxx/chip.c | 6 ++++-- drivers/net/dsa/mv88e6xxx/global1.c | 31 ----------------------------- drivers/net/dsa/mv88e6xxx/global1.h | 1 - drivers/net/dsa/mv88e6xxx/global2.c | 2 +- drivers/net/dsa/mv88e6xxx/global2.h | 1 + 5 files changed, 6 insertions(+), 35 deletions(-) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 177151298d72..53fbef9f4ce5 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -2316,14 +2316,16 @@ static void mv88e6xxx_hardware_reset(struct mv88e6xxx_chip *chip) * from the wrong location resulting in the switch booting * to wrong mode and inoperable. */ - mv88e6xxx_g1_wait_eeprom_done(chip); + if (chip->info->ops->get_eeprom) + mv88e6xxx_g2_eeprom_wait(chip); gpiod_set_value_cansleep(gpiod, 1); usleep_range(10000, 20000); gpiod_set_value_cansleep(gpiod, 0); usleep_range(10000, 20000); - mv88e6xxx_g1_wait_eeprom_done(chip); + if (chip->info->ops->get_eeprom) + mv88e6xxx_g2_eeprom_wait(chip); } } diff --git a/drivers/net/dsa/mv88e6xxx/global1.c b/drivers/net/dsa/mv88e6xxx/global1.c index 9936ae69e5ee..ff43d9c9a7eb 100644 --- a/drivers/net/dsa/mv88e6xxx/global1.c +++ b/drivers/net/dsa/mv88e6xxx/global1.c @@ -75,37 +75,6 @@ static int mv88e6xxx_g1_wait_init_ready(struct mv88e6xxx_chip *chip) return mv88e6xxx_g1_wait_bit(chip, MV88E6XXX_G1_STS, bit, 1); } -void mv88e6xxx_g1_wait_eeprom_done(struct mv88e6xxx_chip *chip) -{ - const unsigned long timeout = jiffies + 1 * HZ; - u16 val; - int err; - - /* Wait up to 1 second for the switch to finish reading the - * EEPROM. - */ - while (time_before(jiffies, timeout)) { - err = mv88e6xxx_g1_read(chip, MV88E6XXX_G1_STS, &val); - if (err) { - dev_err(chip->dev, "Error reading status"); - return; - } - - /* If the switch is still resetting, it may not - * respond on the bus, and so MDIO read returns - * 0xffff. Differentiate between that, and waiting for - * the EEPROM to be done by bit 0 being set. 
- */ - if (val != 0xffff && - val & BIT(MV88E6XXX_G1_STS_IRQ_EEPROM_DONE)) - return; - - usleep_range(1000, 2000); - } - - dev_err(chip->dev, "Timeout waiting for EEPROM done"); -} - /* Offset 0x01: Switch MAC Address Register Bytes 0 & 1 * Offset 0x02: Switch MAC Address Register Bytes 2 & 3 * Offset 0x03: Switch MAC Address Register Bytes 4 & 5 diff --git a/drivers/net/dsa/mv88e6xxx/global1.h b/drivers/net/dsa/mv88e6xxx/global1.h index e05abe61fa11..1e3546f8b072 100644 --- a/drivers/net/dsa/mv88e6xxx/global1.h +++ b/drivers/net/dsa/mv88e6xxx/global1.h @@ -278,7 +278,6 @@ int mv88e6xxx_g1_set_switch_mac(struct mv88e6xxx_chip *chip, u8 *addr); int mv88e6185_g1_reset(struct mv88e6xxx_chip *chip); int mv88e6352_g1_reset(struct mv88e6xxx_chip *chip); int mv88e6250_g1_reset(struct mv88e6xxx_chip *chip); -void mv88e6xxx_g1_wait_eeprom_done(struct mv88e6xxx_chip *chip); int mv88e6185_g1_ppu_enable(struct mv88e6xxx_chip *chip); int mv88e6185_g1_ppu_disable(struct mv88e6xxx_chip *chip); diff --git a/drivers/net/dsa/mv88e6xxx/global2.c b/drivers/net/dsa/mv88e6xxx/global2.c index 75b227d0f73b..8607b2445e1a 100644 --- a/drivers/net/dsa/mv88e6xxx/global2.c +++ b/drivers/net/dsa/mv88e6xxx/global2.c @@ -323,7 +323,7 @@ int mv88e6xxx_g2_pot_clear(struct mv88e6xxx_chip *chip) * Offset 0x15: EEPROM Addr (for 8-bit data access) */ -static int mv88e6xxx_g2_eeprom_wait(struct mv88e6xxx_chip *chip) +int mv88e6xxx_g2_eeprom_wait(struct mv88e6xxx_chip *chip) { int bit = __bf_shf(MV88E6XXX_G2_EEPROM_CMD_BUSY); int err; diff --git a/drivers/net/dsa/mv88e6xxx/global2.h b/drivers/net/dsa/mv88e6xxx/global2.h index 1f42ee656816..de63e3f08e5c 100644 --- a/drivers/net/dsa/mv88e6xxx/global2.h +++ b/drivers/net/dsa/mv88e6xxx/global2.h @@ -349,6 +349,7 @@ int mv88e6xxx_g2_trunk_clear(struct mv88e6xxx_chip *chip); int mv88e6xxx_g2_device_mapping_write(struct mv88e6xxx_chip *chip, int target, int port); +int mv88e6xxx_g2_eeprom_wait(struct mv88e6xxx_chip *chip); extern const struct mv88e6xxx_irq_ops mv88e6097_watchdog_ops; extern const struct mv88e6xxx_irq_ops mv88e6250_watchdog_ops; From 30bc4d7aebe33904b0f2d3aad4b4a9c6029ad0c5 Mon Sep 17 00:00:00 2001 From: Shigeru Yoshida Date: Sun, 24 Sep 2023 02:35:49 +0900 Subject: [PATCH 200/228] net: usb: smsc75xx: Fix uninit-value access in __smsc75xx_read_reg [ Upstream commit e9c65989920f7c28775ec4e0c11b483910fb67b8 ] syzbot reported the following uninit-value access issue: ===================================================== BUG: KMSAN: uninit-value in smsc75xx_wait_ready drivers/net/usb/smsc75xx.c:975 [inline] BUG: KMSAN: uninit-value in smsc75xx_bind+0x5c9/0x11e0 drivers/net/usb/smsc75xx.c:1482 CPU: 0 PID: 8696 Comm: kworker/0:3 Not tainted 5.8.0-rc5-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: usb_hub_wq hub_event Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x21c/0x280 lib/dump_stack.c:118 kmsan_report+0xf7/0x1e0 mm/kmsan/kmsan_report.c:121 __msan_warning+0x58/0xa0 mm/kmsan/kmsan_instr.c:215 smsc75xx_wait_ready drivers/net/usb/smsc75xx.c:975 [inline] smsc75xx_bind+0x5c9/0x11e0 drivers/net/usb/smsc75xx.c:1482 usbnet_probe+0x1152/0x3f90 drivers/net/usb/usbnet.c:1737 usb_probe_interface+0xece/0x1550 drivers/usb/core/driver.c:374 really_probe+0xf20/0x20b0 drivers/base/dd.c:529 driver_probe_device+0x293/0x390 drivers/base/dd.c:701 __device_attach_driver+0x63f/0x830 drivers/base/dd.c:807 bus_for_each_drv+0x2ca/0x3f0 drivers/base/bus.c:431 __device_attach+0x4e2/0x7f0 drivers/base/dd.c:873 
device_initial_probe+0x4a/0x60 drivers/base/dd.c:920 bus_probe_device+0x177/0x3d0 drivers/base/bus.c:491 device_add+0x3b0e/0x40d0 drivers/base/core.c:2680 usb_set_configuration+0x380f/0x3f10 drivers/usb/core/message.c:2032 usb_generic_driver_probe+0x138/0x300 drivers/usb/core/generic.c:241 usb_probe_device+0x311/0x490 drivers/usb/core/driver.c:272 really_probe+0xf20/0x20b0 drivers/base/dd.c:529 driver_probe_device+0x293/0x390 drivers/base/dd.c:701 __device_attach_driver+0x63f/0x830 drivers/base/dd.c:807 bus_for_each_drv+0x2ca/0x3f0 drivers/base/bus.c:431 __device_attach+0x4e2/0x7f0 drivers/base/dd.c:873 device_initial_probe+0x4a/0x60 drivers/base/dd.c:920 bus_probe_device+0x177/0x3d0 drivers/base/bus.c:491 device_add+0x3b0e/0x40d0 drivers/base/core.c:2680 usb_new_device+0x1bd4/0x2a30 drivers/usb/core/hub.c:2554 hub_port_connect drivers/usb/core/hub.c:5208 [inline] hub_port_connect_change drivers/usb/core/hub.c:5348 [inline] port_event drivers/usb/core/hub.c:5494 [inline] hub_event+0x5e7b/0x8a70 drivers/usb/core/hub.c:5576 process_one_work+0x1688/0x2140 kernel/workqueue.c:2269 worker_thread+0x10bc/0x2730 kernel/workqueue.c:2415 kthread+0x551/0x590 kernel/kthread.c:292 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:293 Local variable ----buf.i87@smsc75xx_bind created at: __smsc75xx_read_reg drivers/net/usb/smsc75xx.c:83 [inline] smsc75xx_wait_ready drivers/net/usb/smsc75xx.c:968 [inline] smsc75xx_bind+0x485/0x11e0 drivers/net/usb/smsc75xx.c:1482 __smsc75xx_read_reg drivers/net/usb/smsc75xx.c:83 [inline] smsc75xx_wait_ready drivers/net/usb/smsc75xx.c:968 [inline] smsc75xx_bind+0x485/0x11e0 drivers/net/usb/smsc75xx.c:1482 This issue is caused because usbnet_read_cmd() reads less bytes than requested (zero byte in the reproducer). In this case, 'buf' is not properly filled. This patch fixes the issue by returning -ENODATA if usbnet_read_cmd() reads less bytes than requested. Fixes: d0cad871703b ("smsc75xx: SMSC LAN75xx USB gigabit ethernet adapter driver") Reported-and-tested-by: syzbot+6966546b78d050bb0b5d@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=6966546b78d050bb0b5d Signed-off-by: Shigeru Yoshida Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230923173549.3284502-1-syoshida@redhat.com Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- drivers/net/usb/smsc75xx.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/usb/smsc75xx.c b/drivers/net/usb/smsc75xx.c index fb1389bd0939..6310841aeac7 100644 --- a/drivers/net/usb/smsc75xx.c +++ b/drivers/net/usb/smsc75xx.c @@ -90,7 +90,9 @@ static int __must_check __smsc75xx_read_reg(struct usbnet *dev, u32 index, ret = fn(dev, USB_VENDOR_REQUEST_READ_REGISTER, USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE, 0, index, &buf, 4); - if (unlikely(ret < 0)) { + if (unlikely(ret < 4)) { + ret = ret < 0 ? ret : -ENODATA; + netdev_warn(dev->net, "Failed to read reg index 0x%08x: %d\n", index, ret); return ret; From dba849cc98113b145c6e720122942c00b8012bdb Mon Sep 17 00:00:00 2001 From: Jeremy Cline Date: Fri, 8 Sep 2023 19:58:53 -0400 Subject: [PATCH 201/228] net: nfc: llcp: Add lock when modifying device list [ Upstream commit dfc7f7a988dad34c3bf4c053124fb26aa6c5f916 ] The device list needs its associated lock held when modifying it, or the list could become corrupted, as syzbot discovered. 
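[Editor's illustration, not part of the patch: a minimal userspace sketch of the unlocked-list-update race described above. A pthread mutex stands in for llcp_devices_lock and a hand-rolled singly linked list stands in for the kernel's list_add(); every name below is invented for the sketch.]

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node { int id; struct node *next; };

static struct node *devices;            /* shared list head */
static pthread_mutex_t devices_lock = PTHREAD_MUTEX_INITIALIZER;

static void register_device(int id)
{
        struct node *n = malloc(sizeof(*n));

        if (!n)
                return;
        n->id = id;
        /* Without this lock the two list-head updates below can interleave
         * between threads and drop nodes - the analogue of the corruption
         * the patch prevents by taking llcp_devices_lock around list_add().
         */
        pthread_mutex_lock(&devices_lock);
        n->next = devices;
        devices = n;
        pthread_mutex_unlock(&devices_lock);
}

static void *worker(void *arg)
{
        for (int i = 0; i < 1000; i++)
                register_device((int)(long)arg * 1000 + i);
        return NULL;
}

int main(void)
{
        pthread_t a, b;
        int count = 0;

        pthread_create(&a, NULL, worker, (void *)1L);
        pthread_create(&b, NULL, worker, (void *)2L);
        pthread_join(a, NULL);
        pthread_join(b, NULL);

        for (struct node *n = devices; n; n = n->next)
                count++;
        printf("nodes on list: %d (expect 2000)\n", count);
        return 0;
}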
Reported-and-tested-by: syzbot+c1d0a03d305972dbbe14@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=c1d0a03d305972dbbe14 Signed-off-by: Jeremy Cline Reviewed-by: Simon Horman Fixes: 6709d4b7bc2e ("net: nfc: Fix use-after-free caused by nfc_llcp_find_local") Link: https://lore.kernel.org/r/20230908235853.1319596-1-jeremy@jcline.org Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/nfc/llcp_core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/nfc/llcp_core.c b/net/nfc/llcp_core.c index ddfd159f64e1..b1107570eaee 100644 --- a/net/nfc/llcp_core.c +++ b/net/nfc/llcp_core.c @@ -1646,7 +1646,9 @@ int nfc_llcp_register_device(struct nfc_dev *ndev) timer_setup(&local->sdreq_timer, nfc_llcp_sdreq_timer, 0); INIT_WORK(&local->sdreq_timeout_work, nfc_llcp_sdreq_timeout_work); + spin_lock(&llcp_devices_lock); list_add(&local->list, &llcp_devices); + spin_unlock(&llcp_devices_lock); return 0; } From b212f361a5d1d936cdf2958de1e9dfac1bccf06b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 26 Sep 2023 17:04:43 +0300 Subject: [PATCH 202/228] net: ethernet: ti: am65-cpsw: Fix error code in am65_cpsw_nuss_init_tx_chns() [ Upstream commit 37d4f55567982e445f86dc0ff4ecfa72921abfe8 ] This accidentally returns success, but it should return a negative error code. Fixes: 93a76530316a ("net: ethernet: ti: introduce am65x/j721e gigabit eth subsystem driver") Signed-off-by: Dan Carpenter Reviewed-by: Roger Quadros Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index e4af1f506b83..d10324431354 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -1496,6 +1496,7 @@ static int am65_cpsw_nuss_init_tx_chns(struct am65_cpsw_common *common) if (tx_chn->irq <= 0) { dev_err(dev, "Failed to get tx dma irq %d\n", tx_chn->irq); + ret = tx_chn->irq ?: -ENXIO; goto err; } From 0ba9348532bd66b012fa6c87152be9c4b987a393 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 3 Oct 2023 13:17:53 -0400 Subject: [PATCH 203/228] netfilter: handle the connecting collision properly in nf_conntrack_proto_sctp [ Upstream commit 8e56b063c86569e51eed1c5681ce6361fa97fc7a ] In Scenario A and B below, as the delayed INIT_ACK always changes the peer vtag, SCTP ct with the incorrect vtag may cause packet loss. Scenario A: INIT_ACK is delayed until the peer receives its own INIT_ACK 192.168.1.2 > 192.168.1.1: [INIT] [init tag: 1328086772] 192.168.1.1 > 192.168.1.2: [INIT] [init tag: 1414468151] 192.168.1.2 > 192.168.1.1: [INIT ACK] [init tag: 1328086772] 192.168.1.1 > 192.168.1.2: [INIT ACK] [init tag: 1650211246] * 192.168.1.2 > 192.168.1.1: [COOKIE ECHO] 192.168.1.1 > 192.168.1.2: [COOKIE ECHO] 192.168.1.2 > 192.168.1.1: [COOKIE ACK] Scenario B: INIT_ACK is delayed until the peer completes its own handshake 192.168.1.2 > 192.168.1.1: sctp (1) [INIT] [init tag: 3922216408] 192.168.1.1 > 192.168.1.2: sctp (1) [INIT] [init tag: 144230885] 192.168.1.2 > 192.168.1.1: sctp (1) [INIT ACK] [init tag: 3922216408] 192.168.1.1 > 192.168.1.2: sctp (1) [COOKIE ECHO] 192.168.1.2 > 192.168.1.1: sctp (1) [COOKIE ACK] 192.168.1.1 > 192.168.1.2: sctp (1) [INIT ACK] [init tag: 3914796021] * This patch fixes it as below: In SCTP_CID_INIT processing: - clear ct->proto.sctp.init[!dir] if ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir]. 
(Scenario E) - set ct->proto.sctp.init[dir]. In SCTP_CID_INIT_ACK processing: - drop it if !ct->proto.sctp.init[!dir] && ct->proto.sctp.vtag[!dir] && ct->proto.sctp.vtag[!dir] != ih->init_tag. (Scenario B, Scenario C) - drop it if ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir] && ct->proto.sctp.vtag[!dir] != ih->init_tag. (Scenario A) In SCTP_CID_COOKIE_ACK processing: - clear ct->proto.sctp.init[dir] and ct->proto.sctp.init[!dir]. (Scenario D) Also, it's important to allow the ct state to move forward with cookie_echo and cookie_ack from the opposite dir for the collision scenarios. There are also other Scenarios where it should allow the packet through, addressed by the processing above: Scenario C: new CT is created by INIT_ACK. Scenario D: start INIT on the existing ESTABLISHED ct. Scenario E: start INIT after the old collision on the existing ESTABLISHED ct. 192.168.1.2 > 192.168.1.1: sctp (1) [INIT] [init tag: 3922216408] 192.168.1.1 > 192.168.1.2: sctp (1) [INIT] [init tag: 144230885] (both side are stopped, then start new connection again in hours) 192.168.1.2 > 192.168.1.1: sctp (1) [INIT] [init tag: 242308742] Fixes: 9fb9cbb1082d ("[NETFILTER]: Add nf_conntrack subsystem.") Signed-off-by: Xin Long Signed-off-by: Florian Westphal Signed-off-by: Sasha Levin --- include/linux/netfilter/nf_conntrack_sctp.h | 1 + net/netfilter/nf_conntrack_proto_sctp.c | 43 ++++++++++++++++----- 2 files changed, 34 insertions(+), 10 deletions(-) diff --git a/include/linux/netfilter/nf_conntrack_sctp.h b/include/linux/netfilter/nf_conntrack_sctp.h index 625f491b95de..fb31312825ae 100644 --- a/include/linux/netfilter/nf_conntrack_sctp.h +++ b/include/linux/netfilter/nf_conntrack_sctp.h @@ -9,6 +9,7 @@ struct ip_ct_sctp { enum sctp_conntrack state; __be32 vtag[IP_CT_DIR_MAX]; + u8 init[IP_CT_DIR_MAX]; u8 last_dir; u8 flags; }; diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 21cbaf6dac33..e7545bcca805 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -112,7 +112,7 @@ static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = { /* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA, sSA}, /* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL},/* Can't have Stale cookie*/ /* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA, sCL},/* 5.2.4 - Big TODO */ -/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL},/* Can't come in orig dir */ +/* cookie_ack */ {sCL, sCL, sCW, sES, sES, sSS, sSR, sSA, sCL},/* Can't come in orig dir */ /* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL, sCL}, /* heartbeat */ {sHS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS}, /* heartbeat_ack*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS}, @@ -126,7 +126,7 @@ static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = { /* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV}, /* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV}, /* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA, sIV}, -/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV},/* Can't come in reply dir */ +/* cookie_echo */ {sIV, sCL, sCE, sCE, sES, sSS, sSR, sSA, sIV},/* Can't come in reply dir */ /* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA, sIV}, /* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL, sIV}, /* heartbeat */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS}, @@ -426,6 +426,9 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, /* (D) vtag must be same as init_vtag as found in 
INIT_ACK */ if (sh->vtag != ct->proto.sctp.vtag[dir]) goto out_unlock; + } else if (sch->type == SCTP_CID_COOKIE_ACK) { + ct->proto.sctp.init[dir] = 0; + ct->proto.sctp.init[!dir] = 0; } else if (sch->type == SCTP_CID_HEARTBEAT) { if (ct->proto.sctp.vtag[dir] == 0) { pr_debug("Setting %d vtag %x for dir %d\n", sch->type, sh->vtag, dir); @@ -474,16 +477,18 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, } /* If it is an INIT or an INIT ACK note down the vtag */ - if (sch->type == SCTP_CID_INIT || - sch->type == SCTP_CID_INIT_ACK) { - struct sctp_inithdr _inithdr, *ih; + if (sch->type == SCTP_CID_INIT) { + struct sctp_inithdr _ih, *ih; - ih = skb_header_pointer(skb, offset + sizeof(_sch), - sizeof(_inithdr), &_inithdr); - if (ih == NULL) + ih = skb_header_pointer(skb, offset + sizeof(_sch), sizeof(*ih), &_ih); + if (!ih) goto out_unlock; - pr_debug("Setting vtag %x for dir %d\n", - ih->init_tag, !dir); + + if (ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir]) + ct->proto.sctp.init[!dir] = 0; + ct->proto.sctp.init[dir] = 1; + + pr_debug("Setting vtag %x for dir %d\n", ih->init_tag, !dir); ct->proto.sctp.vtag[!dir] = ih->init_tag; /* don't renew timeout on init retransmit so @@ -494,6 +499,24 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, old_state == SCTP_CONNTRACK_CLOSED && nf_ct_is_confirmed(ct)) ignore = true; + } else if (sch->type == SCTP_CID_INIT_ACK) { + struct sctp_inithdr _ih, *ih; + __be32 vtag; + + ih = skb_header_pointer(skb, offset + sizeof(_sch), sizeof(*ih), &_ih); + if (!ih) + goto out_unlock; + + vtag = ct->proto.sctp.vtag[!dir]; + if (!ct->proto.sctp.init[!dir] && vtag && vtag != ih->init_tag) + goto out_unlock; + /* collision */ + if (ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir] && + vtag != ih->init_tag) + goto out_unlock; + + pr_debug("Setting vtag %x for dir %d\n", ih->init_tag, !dir); + ct->proto.sctp.vtag[!dir] = ih->init_tag; } ct->proto.sctp.state = new_state; From 3a5142f017587b0aede42f19e0c5d05f174acea9 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 28 Sep 2023 15:12:44 +0200 Subject: [PATCH 204/228] netfilter: nf_tables: nft_set_rbtree: fix spurious insertion failure [ Upstream commit 087388278e0f301f4c61ddffb1911d3a180f84b8 ] nft_rbtree_gc_elem() walks back and removes the end interval element that comes before the expired element. There is a small chance that we've cached this element as 'rbe_ge'. If this happens, we hold and test a pointer that has been queued for freeing. It also causes spurious insertion failures: $ cat test-testcases-sets-0044interval_overlap_0.1/testout.log Error: Could not process rule: File exists add element t s { 0 - 2 } ^^^^^^ Failed to insert 0 - 2 given: table ip t { set s { type inet_service flags interval,timeout timeout 2s gc-interval 2s } } The set (rbtree) is empty. The 'failure' doesn't happen on next attempt. Reason is that when we try to insert, the tree may hold an expired element that collides with the range we're adding. While we do evict/erase this element, we can trip over this check: if (rbe_ge && nft_rbtree_interval_end(rbe_ge) && nft_rbtree_interval_end(new)) return -ENOTEMPTY; rbe_ge was erased by the synchronous gc, we should not have done this check. Next attempt won't find it, so retry results in successful insertion. Restart in-kernel to avoid such spurious errors. Such restart are rare, unless userspace intentionally adds very large numbers of elements with very short timeouts while setting a huge gc interval. 
Even in this case, this cannot loop forever, on each retry an existing element has been removed. As the caller is holding the transaction mutex, its impossible for a second entity to add more expiring elements to the tree. After this it also becomes feasible to remove the async gc worker and perform all garbage collection from the commit path. Fixes: c9e6978e2725 ("netfilter: nft_set_rbtree: Switch to node list walk for overlap detection") Signed-off-by: Florian Westphal Signed-off-by: Sasha Levin --- net/netfilter/nft_set_rbtree.c | 46 +++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index cc32e19b4041..17abf17b673e 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -235,10 +235,9 @@ static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set, rb_erase(&rbe->node, &priv->root); } -static int nft_rbtree_gc_elem(const struct nft_set *__set, - struct nft_rbtree *priv, - struct nft_rbtree_elem *rbe, - u8 genmask) +static const struct nft_rbtree_elem * +nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv, + struct nft_rbtree_elem *rbe, u8 genmask) { struct nft_set *set = (struct nft_set *)__set; struct rb_node *prev = rb_prev(&rbe->node); @@ -248,7 +247,7 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set, gc = nft_trans_gc_alloc(set, 0, GFP_ATOMIC); if (!gc) - return -ENOMEM; + return ERR_PTR(-ENOMEM); /* search for end interval coming before this element. * end intervals don't carry a timeout extension, they @@ -263,6 +262,7 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set, prev = rb_prev(prev); } + rbe_prev = NULL; if (prev) { rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); nft_rbtree_gc_remove(net, set, priv, rbe_prev); @@ -274,7 +274,7 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set, */ gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); if (WARN_ON_ONCE(!gc)) - return -ENOMEM; + return ERR_PTR(-ENOMEM); nft_trans_gc_elem_add(gc, rbe_prev); } @@ -282,13 +282,13 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set, nft_rbtree_gc_remove(net, set, priv, rbe); gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); if (WARN_ON_ONCE(!gc)) - return -ENOMEM; + return ERR_PTR(-ENOMEM); nft_trans_gc_elem_add(gc, rbe); nft_trans_gc_queue_sync_done(gc); - return 0; + return rbe_prev; } static bool nft_rbtree_update_first(const struct nft_set *set, @@ -316,7 +316,7 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, struct nft_rbtree *priv = nft_set_priv(set); u8 cur_genmask = nft_genmask_cur(net); u8 genmask = nft_genmask_next(net); - int d, err; + int d; /* Descend the tree to search for an existing element greater than the * key value to insert that is greater than the new element. 
This is the @@ -365,9 +365,14 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, */ if (nft_set_elem_expired(&rbe->ext) && nft_set_elem_active(&rbe->ext, cur_genmask)) { - err = nft_rbtree_gc_elem(set, priv, rbe, genmask); - if (err < 0) - return err; + const struct nft_rbtree_elem *removed_end; + + removed_end = nft_rbtree_gc_elem(set, priv, rbe, genmask); + if (IS_ERR(removed_end)) + return PTR_ERR(removed_end); + + if (removed_end == rbe_le || removed_end == rbe_ge) + return -EAGAIN; continue; } @@ -488,11 +493,18 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set, struct nft_rbtree_elem *rbe = elem->priv; int err; - write_lock_bh(&priv->lock); - write_seqcount_begin(&priv->count); - err = __nft_rbtree_insert(net, set, rbe, ext); - write_seqcount_end(&priv->count); - write_unlock_bh(&priv->lock); + do { + if (fatal_signal_pending(current)) + return -EINTR; + + cond_resched(); + + write_lock_bh(&priv->lock); + write_seqcount_begin(&priv->count); + err = __nft_rbtree_insert(net, set, rbe, ext); + write_seqcount_end(&priv->count); + write_unlock_bh(&priv->lock); + } while (err == -EAGAIN); return err; } From b9f1568ba37f4c821f0c561852f09338df252e1d Mon Sep 17 00:00:00 2001 From: Ben Wolsieffer Date: Wed, 27 Sep 2023 13:57:49 -0400 Subject: [PATCH 205/228] net: stmmac: dwmac-stm32: fix resume on STM32 MCU [ Upstream commit 6f195d6b0da3b689922ba9e302af2f49592fa9fc ] The STM32MP1 keeps clk_rx enabled during suspend, and therefore the driver does not enable the clock in stm32_dwmac_init() if the device was suspended. The problem is that this same code runs on STM32 MCUs, which do disable clk_rx during suspend, causing the clock to never be re-enabled on resume. This patch adds a variant flag to indicate that clk_rx remains enabled during suspend, and uses this to decide whether to enable the clock in stm32_dwmac_init() if the device was suspended. This approach fixes this specific bug with limited opportunity for unintended side-effects, but I have a follow up patch that will refactor the clock configuration and hopefully make it less error prone. 
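[Editor's illustration, not part of the patch: a standalone sketch of the variant-flag decision the change introduces. Only the clk_rx_enable_in_suspend field mirrors the new driver flag; the struct, function and test values below are invented.]

#include <stdbool.h>
#include <stdio.h>

struct variant_ops {
        bool clk_rx_enable_in_suspend;  /* mirrors the new flag in stm32_ops */
};

/* Enable clk_rx during init unless this variant keeps it running across
 * suspend and we are resuming.  The old code skipped the enable whenever
 * resuming, which broke MCU variants whose clk_rx really was gated. */
static bool need_clk_rx_enable(const struct variant_ops *ops, bool resuming)
{
        return !ops->clk_rx_enable_in_suspend || !resuming;
}

int main(void)
{
        const struct variant_ops mp1 = { .clk_rx_enable_in_suspend = true };
        const struct variant_ops mcu = { .clk_rx_enable_in_suspend = false };

        printf("MP1 resume: enable clk_rx? %d (no, it stayed on)\n",
               need_clk_rx_enable(&mp1, true));
        printf("MCU resume: enable clk_rx? %d (yes, it was gated)\n",
               need_clk_rx_enable(&mcu, true));
        return 0;
}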
Fixes: 6528e02cc9ff ("net: ethernet: stmmac: add adaptation for stm32mp157c.") Signed-off-by: Ben Wolsieffer Reviewed-by: Jacob Keller Link: https://lore.kernel.org/r/20230927175749.1419774-1-ben.wolsieffer@hefring.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/stmicro/stmmac/dwmac-stm32.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-stm32.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-stm32.c index 5d4df4c5254e..6623f5a07927 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-stm32.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-stm32.c @@ -105,6 +105,7 @@ struct stm32_ops { int (*parse_data)(struct stm32_dwmac *dwmac, struct device *dev); u32 syscfg_eth_mask; + bool clk_rx_enable_in_suspend; }; static int stm32_dwmac_init(struct plat_stmmacenet_data *plat_dat) @@ -122,7 +123,8 @@ static int stm32_dwmac_init(struct plat_stmmacenet_data *plat_dat) if (ret) return ret; - if (!dwmac->dev->power.is_suspended) { + if (!dwmac->ops->clk_rx_enable_in_suspend || + !dwmac->dev->power.is_suspended) { ret = clk_prepare_enable(dwmac->clk_rx); if (ret) { clk_disable_unprepare(dwmac->clk_tx); @@ -515,7 +517,8 @@ static struct stm32_ops stm32mp1_dwmac_data = { .suspend = stm32mp1_suspend, .resume = stm32mp1_resume, .parse_data = stm32mp1_parse_data, - .syscfg_eth_mask = SYSCFG_MP1_ETH_MASK + .syscfg_eth_mask = SYSCFG_MP1_ETH_MASK, + .clk_rx_enable_in_suspend = true }; static const struct of_device_id stm32_dwmac_match[] = { From 6a24d0661fa389c241d935da38e0f6a5ee8eb1ae Mon Sep 17 00:00:00 2001 From: Chengfeng Ye Date: Wed, 27 Sep 2023 18:14:14 +0000 Subject: [PATCH 206/228] tipc: fix a potential deadlock on &tx->lock [ Upstream commit 08e50cf071847323414df0835109b6f3560d44f5 ] It seems that tipc_crypto_key_revoke() could be be invoked by wokequeue tipc_crypto_work_rx() under process context and timer/rx callback under softirq context, thus the lock acquisition on &tx->lock seems better use spin_lock_bh() to prevent possible deadlock. This flaw was found by an experimental static analysis tool I am developing for irq-related deadlock. 
tipc_crypto_work_rx() --> tipc_crypto_key_distr() --> tipc_bcast_xmit() --> tipc_bcbase_xmit() --> tipc_bearer_bc_xmit() --> tipc_crypto_xmit() --> tipc_ehdr_build() --> tipc_crypto_key_revoke() --> spin_lock(&tx->lock) --> tipc_disc_timeout() --> tipc_bearer_xmit_skb() --> tipc_crypto_xmit() --> tipc_ehdr_build() --> tipc_crypto_key_revoke() --> spin_lock(&tx->lock) Signed-off-by: Chengfeng Ye Reviewed-by: Jacob Keller Acked-by: Jon Maloy Fixes: fc1b6d6de220 ("tipc: introduce TIPC encryption & authentication") Link: https://lore.kernel.org/r/20230927181414.59928-1-dg573847474@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/tipc/crypto.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c index 2784d6989211..b5aa0a835bce 100644 --- a/net/tipc/crypto.c +++ b/net/tipc/crypto.c @@ -1445,14 +1445,14 @@ static int tipc_crypto_key_revoke(struct net *net, u8 tx_key) struct tipc_crypto *tx = tipc_net(net)->crypto_tx; struct tipc_key key; - spin_lock(&tx->lock); + spin_lock_bh(&tx->lock); key = tx->key; WARN_ON(!key.active || tx_key != key.active); /* Free the active key */ tipc_crypto_key_set_state(tx, key.passive, 0, key.pending); tipc_crypto_key_detach(tx->aead[key.active], &tx->lock); - spin_unlock(&tx->lock); + spin_unlock_bh(&tx->lock); pr_warn("%s: key is revoked\n", tx->name); return -EKEYREVOKED; From 677aaa261e7ac0010b1e3ba4c3314f85f39caaf3 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Sun, 1 Oct 2023 11:12:38 -0400 Subject: [PATCH 207/228] tcp: fix quick-ack counting to count actual ACKs of new data [ Upstream commit 059217c18be6757b95bfd77ba53fb50b48b8a816 ] This commit fixes quick-ack counting so that it only considers that a quick-ack has been provided if we are sending an ACK that newly acknowledges data. The code was erroneously using the number of data segments in outgoing skbs when deciding how many quick-ack credits to remove. This logic does not make sense, and could cause poor performance in request-response workloads, like RPC traffic, where requests or responses can be multi-segment skbs. When a TCP connection decides to send N quick-acks, that is to accelerate the cwnd growth of the congestion control module controlling the remote endpoint of the TCP connection. That quick-ack decision is purely about the incoming data and outgoing ACKs. It has nothing to do with the outgoing data or the size of outgoing data. And in particular, an ACK only serves the intended purpose of allowing the remote congestion control to grow the congestion window quickly if the ACK is ACKing or SACKing new data. The fix is simple: only count packets as serving the goal of the quickack mechanism if they are ACKing/SACKing new data. We can tell whether this is the case by checking inet_csk_ack_scheduled(), since we schedule an ACK exactly when we are ACKing/SACKing new data. 
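[Editor's illustration, not part of the patch: a small userspace model of the accounting change described above. The authoritative change is tcp_dec_quickack_mode() in the diff that follows; the helper names and numbers here are invented.]

#include <stdbool.h>
#include <stdio.h>

struct quickack_state {
        unsigned int quick;             /* remaining quick-ack credits */
};

/* Old behaviour: every outgoing data segment consumed credits. */
static void dec_old(struct quickack_state *s, unsigned int data_segs)
{
        if (s->quick && data_segs >= s->quick)
                s->quick = 0;
        else if (s->quick)
                s->quick -= data_segs;
}

/* New behaviour: at most one credit, and only if this packet actually
 * ACKs/SACKs new data (modelled by the acks_new_data flag). */
static void dec_new(struct quickack_state *s, bool acks_new_data)
{
        unsigned int pkts = acks_new_data ? 1 : 0;

        if (s->quick && pkts >= s->quick)
                s->quick = 0;
        else if (s->quick)
                s->quick -= pkts;
}

int main(void)
{
        struct quickack_state a = { .quick = 8 }, b = { .quick = 8 };

        /* One 10-segment RPC response that acknowledges nothing new. */
        dec_old(&a, 10);
        dec_new(&b, false);
        printf("old credits left: %u, new credits left: %u\n",
               a.quick, b.quick);
        return 0;
}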
Fixes: fc6415bcb0f5 ("[TCP]: Fix quick-ack decrementing with TSO.") Signed-off-by: Neal Cardwell Reviewed-by: Yuchung Cheng Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20231001151239.1866845-1-ncardwell.sw@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- include/net/tcp.h | 6 ++++-- net/ipv4/tcp_output.c | 7 +++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index b56f34602035..cb4b2fddd9eb 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -337,12 +337,14 @@ ssize_t tcp_splice_read(struct socket *sk, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); -static inline void tcp_dec_quickack_mode(struct sock *sk, - const unsigned int pkts) +static inline void tcp_dec_quickack_mode(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ack.quick) { + /* How many ACKs S/ACKing new data have we sent? */ + const unsigned int pkts = inet_csk_ack_scheduled(sk) ? 1 : 0; + if (pkts >= icsk->icsk_ack.quick) { icsk->icsk_ack.quick = 0; /* Leaving quickack mode we deflate ATO. */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 86e896351364..6c14d67715d1 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -177,8 +177,7 @@ static void tcp_event_data_sent(struct tcp_sock *tp, } /* Account for an ACK we sent. */ -static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts, - u32 rcv_nxt) +static inline void tcp_event_ack_sent(struct sock *sk, u32 rcv_nxt) { struct tcp_sock *tp = tcp_sk(sk); @@ -192,7 +191,7 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts, if (unlikely(rcv_nxt != tp->rcv_nxt)) return; /* Special ACK sent by DCTCP to reflect ECN */ - tcp_dec_quickack_mode(sk, pkts); + tcp_dec_quickack_mode(sk); inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); } @@ -1374,7 +1373,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, sk, skb); if (likely(tcb->tcp_flags & TCPHDR_ACK)) - tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt); + tcp_event_ack_sent(sk, rcv_nxt); if (skb->len != tcp_header_size) { tcp_event_data_sent(tp, sk); From ff346b01eba52f1657ecb2b34bf684d7c1addd7d Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Sun, 1 Oct 2023 11:12:39 -0400 Subject: [PATCH 208/228] tcp: fix delayed ACKs for MSS boundary condition [ Upstream commit 4720852ed9afb1c5ab84e96135cb5b73d5afde6f ] This commit fixes poor delayed ACK behavior that can cause poor TCP latency in a particular boundary condition: when an application makes a TCP socket write that is an exact multiple of the MSS size. The problem is that there is painful boundary discontinuity in the current delayed ACK behavior. With the current delayed ACK behavior, we have: (1) If an app reads data when > 1*MSS is unacknowledged, then tcp_cleanup_rbuf() ACKs immediately because of: tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss || (2) If an app reads all received data, and the packets were < 1*MSS, and either (a) the app is not ping-pong or (b) we received two packets < 1*MSS, then tcp_cleanup_rbuf() ACKs immediately beecause of: ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) || ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) && !inet_csk_in_pingpong_mode(sk))) && (3) *However*: if an app reads exactly 1*MSS of data, tcp_cleanup_rbuf() does not send an immediate ACK. 
This is true even if the app is not ping-pong and the 1*MSS of data had the PSH bit set, suggesting the sending application completed an application write. Thus if the app is not ping-pong, we have this painful case where >1*MSS gets an immediate ACK, and <1*MSS gets an immediate ACK, but a write whose last skb is an exact multiple of 1*MSS can get a 40ms delayed ACK. This means that any app that transfers data in one direction and takes care to align write size or packet size with MSS can suffer this problem. With receive zero copy making 4KB MSS values more common, it is becoming more common to have application writes naturally align with MSS, and more applications are likely to encounter this delayed ACK problem. The fix in this commit is to refine the delayed ACK heuristics with a simple check: immediately ACK a received 1*MSS skb with PSH bit set if the app reads all data. Why? If an skb has a len of exactly 1*MSS and has the PSH bit set then it is likely the end of an application write. So more data may not be arriving soon, and yet the data sender may be waiting for an ACK if cwnd-bound or using TX zero copy. Thus we set ICSK_ACK_PUSHED in this case so that tcp_cleanup_rbuf() will send an ACK immediately if the app reads all of the data and is not ping-pong. Note that this logic is also executed for the case where len > MSS, but in that case this logic does not matter (and does not hurt) because tcp_cleanup_rbuf() will always ACK immediately if the app reads data and there is more than an MSS of unACKed data. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Neal Cardwell Reviewed-by: Yuchung Cheng Reviewed-by: Eric Dumazet Cc: Xin Guo Link: https://lore.kernel.org/r/20231001151239.1866845-2-ncardwell.sw@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/ipv4/tcp_input.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b8d2c45edbe0..3f2b6a3adf6a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -242,6 +242,19 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb) if (unlikely(len > icsk->icsk_ack.rcv_mss + MAX_TCP_OPTION_SPACE)) tcp_gro_dev_warn(sk, skb, len); + /* If the skb has a len of exactly 1*MSS and has the PSH bit + * set then it is likely the end of an application write. So + * more data may not be arriving soon, and yet the data sender + * may be waiting for an ACK if cwnd-bound or using TX zero + * copy. So we set ICSK_ACK_PUSHED here so that + * tcp_cleanup_rbuf() will send an ACK immediately if the app + * reads all of the data and is not ping-pong. If len > MSS + * then this logic does not matter (and does not hurt) because + * tcp_cleanup_rbuf() will always ACK immediately if the app + * reads data and there is more than an MSS of unACKed data. + */ + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_PSH) + icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; } else { /* Otherwise, we make more careful check taking into account, * that SACKs block is variable. From f87658493898b8fd9bac04ea96b0f710800325ce Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 1 Oct 2023 10:58:45 -0400 Subject: [PATCH 209/228] sctp: update transport state when processing a dupcook packet [ Upstream commit 2222a78075f0c19ca18db53fd6623afb4aff602d ] During the 4-way handshake, the transport's state is set to ACTIVE in sctp_process_init() when processing INIT_ACK chunk on client or COOKIE_ECHO chunk on server. 
In the collision scenario below: 192.168.1.2 > 192.168.1.1: sctp (1) [INIT] [init tag: 3922216408] 192.168.1.1 > 192.168.1.2: sctp (1) [INIT] [init tag: 144230885] 192.168.1.2 > 192.168.1.1: sctp (1) [INIT ACK] [init tag: 3922216408] 192.168.1.1 > 192.168.1.2: sctp (1) [COOKIE ECHO] 192.168.1.2 > 192.168.1.1: sctp (1) [COOKIE ACK] 192.168.1.1 > 192.168.1.2: sctp (1) [INIT ACK] [init tag: 3914796021] when processing COOKIE_ECHO on 192.168.1.2, as it's in COOKIE_WAIT state, sctp_sf_do_dupcook_b() is called by sctp_sf_do_5_2_4_dupcook() where it creates a new association and sets its transport to ACTIVE then updates to the old association in sctp_assoc_update(). However, in sctp_assoc_update(), it will skip the transport update if it finds a transport with the same ipaddr already existing in the old asoc, and this causes the old asoc's transport state not to move to ACTIVE after the handshake. This means if DATA retransmission happens at this moment, it won't be able to enter PF state because of the check 'transport->state == SCTP_ACTIVE' in sctp_do_8_2_transport_strike(). This patch fixes it by updating the transport in sctp_assoc_update() with sctp_assoc_add_peer() where it updates the transport state if there is already a transport with the same ipaddr exists in the old asoc. Signed-off-by: Xin Long Reviewed-by: Simon Horman Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Link: https://lore.kernel.org/r/fd17356abe49713ded425250cc1ae51e9f5846c6.1696172325.git.lucien.xin@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/sctp/associola.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 2d4ec6187755..765eb617776b 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -1151,8 +1151,7 @@ int sctp_assoc_update(struct sctp_association *asoc, /* Add any peer addresses from the new association. */ list_for_each_entry(trans, &new->peer.transport_addr_list, transports) - if (!sctp_assoc_lookup_paddr(asoc, &trans->ipaddr) && - !sctp_assoc_add_peer(asoc, &trans->ipaddr, + if (!sctp_assoc_add_peer(asoc, &trans->ipaddr, GFP_ATOMIC, trans->state)) return -ENOMEM; From 492241613cf44accb4d6059ddda4f4bab836afaa Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 1 Oct 2023 11:04:20 -0400 Subject: [PATCH 210/228] sctp: update hb timer immediately after users change hb_interval [ Upstream commit 1f4e803cd9c9166eb8b6c8b0b8e4124f7499fc07 ] Currently, when hb_interval is changed by users, it won't take effect until the next expiry of hb timer. As the default value is 30s, users have to wait up to 30s to wait its hb_interval update to work. This becomes pretty bad in containers where a much smaller value is usually set on hb_interval. This patch improves it by resetting the hb timer immediately once the value of hb_interval is updated by users. 
Note that we don't address the already existing 'problem' when sending a heartbeat 'on demand' if one hb has just been sent(from the timer) mentioned in: https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg590224.html Signed-off-by: Xin Long Reviewed-by: Simon Horman Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Link: https://lore.kernel.org/r/75465785f8ee5df2fb3acdca9b8fafdc18984098.1696172660.git.lucien.xin@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/sctp/socket.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 68d53e3f0d07..bc4fe944ef85 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -2452,6 +2452,7 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params, if (trans) { trans->hbinterval = msecs_to_jiffies(params->spp_hbinterval); + sctp_transport_reset_hb_timer(trans); } else if (asoc) { asoc->hbinterval = msecs_to_jiffies(params->spp_hbinterval); From 82d87c944ea8d6bac436b88611ea9bf9be068349 Mon Sep 17 00:00:00 2001 From: Ivan Babrou Date: Mon, 4 Jan 2021 15:57:18 -0800 Subject: [PATCH 211/228] cpupower: add Makefile dependencies for install targets commit fb7791e213a64495ec2336869b868fcd8af14346 upstream. This allows building cpupower in parallel rather than serially. Signed-off-by: Ivan Babrou Signed-off-by: Shuah Khan Cc: Hauke Mehrtens Signed-off-by: Greg Kroah-Hartman --- tools/power/cpupower/Makefile | 8 ++++---- tools/power/cpupower/bench/Makefile | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/power/cpupower/Makefile b/tools/power/cpupower/Makefile index c7bcddbd486d..3b1594447f29 100644 --- a/tools/power/cpupower/Makefile +++ b/tools/power/cpupower/Makefile @@ -270,14 +270,14 @@ clean: $(MAKE) -C bench O=$(OUTPUT) clean -install-lib: +install-lib: libcpupower $(INSTALL) -d $(DESTDIR)${libdir} $(CP) $(OUTPUT)libcpupower.so* $(DESTDIR)${libdir}/ $(INSTALL) -d $(DESTDIR)${includedir} $(INSTALL_DATA) lib/cpufreq.h $(DESTDIR)${includedir}/cpufreq.h $(INSTALL_DATA) lib/cpuidle.h $(DESTDIR)${includedir}/cpuidle.h -install-tools: +install-tools: $(OUTPUT)cpupower $(INSTALL) -d $(DESTDIR)${bindir} $(INSTALL_PROGRAM) $(OUTPUT)cpupower $(DESTDIR)${bindir} $(INSTALL) -d $(DESTDIR)${bash_completion_dir} @@ -293,14 +293,14 @@ install-man: $(INSTALL_DATA) -D man/cpupower-info.1 $(DESTDIR)${mandir}/man1/cpupower-info.1 $(INSTALL_DATA) -D man/cpupower-monitor.1 $(DESTDIR)${mandir}/man1/cpupower-monitor.1 -install-gmo: +install-gmo: create-gmo $(INSTALL) -d $(DESTDIR)${localedir} for HLANG in $(LANGUAGES); do \ echo '$(INSTALL_DATA) -D $(OUTPUT)po/$$HLANG.gmo $(DESTDIR)${localedir}/$$HLANG/LC_MESSAGES/cpupower.mo'; \ $(INSTALL_DATA) -D $(OUTPUT)po/$$HLANG.gmo $(DESTDIR)${localedir}/$$HLANG/LC_MESSAGES/cpupower.mo; \ done; -install-bench: +install-bench: compile-bench @#DESTDIR must be set from outside to survive @sbindir=$(sbindir) bindir=$(bindir) docdir=$(docdir) confdir=$(confdir) $(MAKE) -C bench O=$(OUTPUT) install diff --git a/tools/power/cpupower/bench/Makefile b/tools/power/cpupower/bench/Makefile index f68b4bc55273..d9d9923af85c 100644 --- a/tools/power/cpupower/bench/Makefile +++ b/tools/power/cpupower/bench/Makefile @@ -27,7 +27,7 @@ $(OUTPUT)cpufreq-bench: $(OBJS) all: $(OUTPUT)cpufreq-bench -install: +install: $(OUTPUT)cpufreq-bench mkdir -p $(DESTDIR)/$(sbindir) mkdir -p $(DESTDIR)/$(bindir) mkdir -p $(DESTDIR)/$(docdir) From ccd87fe7a0f6f42bbe1c577125d432ebdcc59198 Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Wed, 20 Sep 2023 13:51:16 +0300 
Subject: [PATCH 212/228] dm zoned: free dmz->ddev array in dmz_put_zoned_devices commit 9850ccd5dd88075b2b7fd28d96299d5535f58cc5 upstream. Commit 4dba12881f88 ("dm zoned: support arbitrary number of devices") made the pointers to additional zoned devices to be stored in a dynamically allocated dmz->ddev array. However, this array is not freed. Rename dmz_put_zoned_device to dmz_put_zoned_devices and fix it to free the dmz->ddev array when cleaning up zoned device information. Remove NULL assignment for all dmz->ddev elements and just free the dmz->ddev array instead. Found by Linux Verification Center (linuxtesting.org). Fixes: 4dba12881f88 ("dm zoned: support arbitrary number of devices") Cc: stable@vger.kernel.org Signed-off-by: Fedor Pchelkin Signed-off-by: Mike Snitzer Signed-off-by: Greg Kroah-Hartman --- drivers/md/dm-zoned-target.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 7e88df64d197..48fc723f1ac8 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -750,17 +750,16 @@ static int dmz_get_zoned_device(struct dm_target *ti, char *path, /* * Cleanup zoned device information. */ -static void dmz_put_zoned_device(struct dm_target *ti) +static void dmz_put_zoned_devices(struct dm_target *ti) { struct dmz_target *dmz = ti->private; int i; - for (i = 0; i < dmz->nr_ddevs; i++) { - if (dmz->ddev[i]) { + for (i = 0; i < dmz->nr_ddevs; i++) + if (dmz->ddev[i]) dm_put_device(ti, dmz->ddev[i]); - dmz->ddev[i] = NULL; - } - } + + kfree(dmz->ddev); } static int dmz_fixup_devices(struct dm_target *ti) @@ -951,7 +950,7 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv) err_meta: dmz_dtr_metadata(dmz->metadata); err_dev: - dmz_put_zoned_device(ti); + dmz_put_zoned_devices(ti); err: kfree(dmz->dev); kfree(dmz); @@ -982,7 +981,7 @@ static void dmz_dtr(struct dm_target *ti) bioset_exit(&dmz->bio_set); - dmz_put_zoned_device(ti); + dmz_put_zoned_devices(ti); mutex_destroy(&dmz->chunk_lock); From b74f12f98b7f3bdc57879a25035a071dd9142fe5 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 4 Oct 2023 21:17:49 +0300 Subject: [PATCH 213/228] RDMA/core: Require admin capabilities to set system parameters commit c38d23a54445f9a8aa6831fafc9af0496ba02f9e upstream. Like any other set command, require admin permissions to do it. Cc: stable@vger.kernel.org Fixes: 2b34c5580226 ("RDMA/core: Add command to set ib_core device net namspace sharing mode") Link: https://lore.kernel.org/r/75d329fdd7381b52cbdf87910bef16c9965abb1f.1696443438.git.leon@kernel.org Reviewed-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/core/nldev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index f7f80707af4b..f8dfec7ad7cc 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -2148,6 +2148,7 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { }, [RDMA_NLDEV_CMD_SYS_SET] = { .doit = nldev_set_sys_set_doit, + .flags = RDMA_NL_ADMIN_PERM, }, [RDMA_NLDEV_CMD_STAT_SET] = { .doit = nldev_stat_set_doit, From 204c2d485f860e0d8fcc9468b2388e6a727dd18d Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 8 Sep 2023 10:03:50 +0300 Subject: [PATCH 214/228] of: dynamic: Fix potential memory leak in of_changeset_action() commit 55e95bfccf6db8d26a66c46e1de50d53c59a6774 upstream. 
Smatch complains that the error path where "action" is invalid leaks the "ce" allocation: drivers/of/dynamic.c:935 of_changeset_action() warn: possible memory leak of 'ce' Fix this by doing the validation before the allocation. Note that there is not any actual problem with upstream kernels. All callers of of_changeset_action() are static inlines with fixed action values. Fixes: 914d9d831e61 ("of: dynamic: Refactor action prints to not use "%pOF" inside devtree_lock") Reported-by: kernel test robot Closes: https://lore.kernel.org/r/202309011059.EOdr4im9-lkp@intel.com/ Signed-off-by: Dan Carpenter Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/7dfaf999-30ad-491c-9615-fb1138db121c@moroto.mountain Signed-off-by: Rob Herring Signed-off-by: Greg Kroah-Hartman --- drivers/of/dynamic.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c index be26346085fa..7d0232af9c23 100644 --- a/drivers/of/dynamic.c +++ b/drivers/of/dynamic.c @@ -893,13 +893,13 @@ int of_changeset_action(struct of_changeset *ocs, unsigned long action, { struct of_changeset_entry *ce; + if (WARN_ON(action >= ARRAY_SIZE(action_names))) + return -EINVAL; + ce = kzalloc(sizeof(*ce), GFP_KERNEL); if (!ce) return -ENOMEM; - if (WARN_ON(action >= ARRAY_SIZE(action_names))) - return -EINVAL; - /* get a reference to the node */ ce->action = action; ce->np = of_node_get(np); From d7d8f1a679ece1edd12267f3c29e0533f50b53bb Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 23 Sep 2023 07:55:56 +0200 Subject: [PATCH 215/228] IB/mlx4: Fix the size of a buffer in add_port_entries() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit d7f393430a17c2bfcdf805462a5aa80be4285b27 upstream. In order to be sure that 'buff' is never truncated, its size should be 12, not 11. 
When building with W=1, this fixes the following warnings: drivers/infiniband/hw/mlx4/sysfs.c: In function ‘add_port_entries’: drivers/infiniband/hw/mlx4/sysfs.c:268:34: error: ‘sprintf’ may write a terminating nul past the end of the destination [-Werror=format-overflow=] 268 | sprintf(buff, "%d", i); | ^ drivers/infiniband/hw/mlx4/sysfs.c:268:17: note: ‘sprintf’ output between 2 and 12 bytes into a destination of size 11 268 | sprintf(buff, "%d", i); | ^~~~~~~~~~~~~~~~~~~~~~ drivers/infiniband/hw/mlx4/sysfs.c:286:34: error: ‘sprintf’ may write a terminating nul past the end of the destination [-Werror=format-overflow=] 286 | sprintf(buff, "%d", i); | ^ drivers/infiniband/hw/mlx4/sysfs.c:286:17: note: ‘sprintf’ output between 2 and 12 bytes into a destination of size 11 286 | sprintf(buff, "%d", i); | ^~~~~~~~~~~~~~~~~~~~~~ Fixes: c1e7e466120b ("IB/mlx4: Add iov directory in sysfs under the ib device") Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/0bb1443eb47308bc9be30232cc23004c4d4cf43e.1695448530.git.christophe.jaillet@wanadoo.fr Signed-off-by: Leon Romanovsky Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/hw/mlx4/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx4/sysfs.c b/drivers/infiniband/hw/mlx4/sysfs.c index ea1f3a081b05..6c3a23ee3bc7 100644 --- a/drivers/infiniband/hw/mlx4/sysfs.c +++ b/drivers/infiniband/hw/mlx4/sysfs.c @@ -221,7 +221,7 @@ void del_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num, static int add_port_entries(struct mlx4_ib_dev *device, int port_num) { int i; - char buff[11]; + char buff[12]; struct mlx4_ib_iov_port *port = NULL; int ret = 0 ; struct ib_port_attr attr; From 6ad972e668708f965f963b6886ff67efc5603e9d Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 3 Oct 2023 09:39:26 +0200 Subject: [PATCH 216/228] gpio: aspeed: fix the GPIO number passed to pinctrl_gpio_set_config() commit f9315f17bf778cb8079a29639419fcc8a41a3c84 upstream. pinctrl_gpio_set_config() expects the GPIO number from the global GPIO numberspace, not the controller-relative offset, which needs to be added to the chip base. Fixes: 5ae4cb94b313 ("gpio: aspeed: Add debounce support") Signed-off-by: Bartosz Golaszewski Reviewed-by: Andy Shevchenko Reviewed-by: Andrew Jeffery Signed-off-by: Greg Kroah-Hartman --- drivers/gpio/gpio-aspeed.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-aspeed.c b/drivers/gpio/gpio-aspeed.c index e0d5d80ec8e0..bbd04a63fb12 100644 --- a/drivers/gpio/gpio-aspeed.c +++ b/drivers/gpio/gpio-aspeed.c @@ -966,7 +966,7 @@ static int aspeed_gpio_set_config(struct gpio_chip *chip, unsigned int offset, else if (param == PIN_CONFIG_BIAS_DISABLE || param == PIN_CONFIG_BIAS_PULL_DOWN || param == PIN_CONFIG_DRIVE_STRENGTH) - return pinctrl_gpio_set_config(offset, config); + return pinctrl_gpio_set_config(chip->base + offset, config); else if (param == PIN_CONFIG_DRIVE_OPEN_DRAIN || param == PIN_CONFIG_DRIVE_OPEN_SOURCE) /* Return -ENOTSUPP to trigger emulation, as per datasheet */ From 36953b4da78bc296d203a25330c3192bcad1ccd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Duje=20Mihanovi=C4=87?= Date: Fri, 29 Sep 2023 17:41:57 +0200 Subject: [PATCH 217/228] gpio: pxa: disable pinctrl calls for MMP_GPIO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit f0575116507b981e6a810e78ce3c9040395b958b upstream. 
Similarly to PXA3xx and MMP2, pinctrl-single isn't capable of setting pin direction on MMP either. Fixes: a770d946371e ("gpio: pxa: add pin control gpio direction and request") Signed-off-by: Duje Mihanović Reviewed-by: Andy Shevchenko Signed-off-by: Bartosz Golaszewski Signed-off-by: Greg Kroah-Hartman --- drivers/gpio/gpio-pxa.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpio/gpio-pxa.c b/drivers/gpio/gpio-pxa.c index 0cb6600b8eee..1dbeaf6d0045 100644 --- a/drivers/gpio/gpio-pxa.c +++ b/drivers/gpio/gpio-pxa.c @@ -243,6 +243,7 @@ static bool pxa_gpio_has_pinctrl(void) switch (gpio_type) { case PXA3XX_GPIO: case MMP2_GPIO: + case MMP_GPIO: return false; default: From 7de0e42444e9bc18214eff4aaa04de230eb9eb3d Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Wed, 27 Sep 2023 12:05:11 +0300 Subject: [PATCH 218/228] RDMA/cma: Initialize ib_sa_multicast structure to 0 when join commit e0fe97efdb00f0f32b038a4836406a82886aec9c upstream. Initialize the structure to 0 so that it's fields won't have random values. For example fields like rec.traffic_class (as well as rec.flow_label and rec.sl) is used to generate the user AH through: cma_iboe_join_multicast cma_make_mc_event ib_init_ah_from_mcmember And a random traffic_class causes a random IP DSCP in RoCEv2. Fixes: b5de0c60cc30 ("RDMA/cma: Fix use after free race in roce multicast join") Signed-off-by: Mark Zhang Link: https://lore.kernel.org/r/20230927090511.603595-1-markzhang@nvidia.com Signed-off-by: Leon Romanovsky Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/core/cma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 805678f6fe57..0603d069de92 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -4723,7 +4723,7 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, int err = 0; struct sockaddr *addr = (struct sockaddr *)&mc->addr; struct net_device *ndev = NULL; - struct ib_sa_multicast ib; + struct ib_sa_multicast ib = {}; enum ib_gid_type gid_type; bool send_only; From 626868282c361463f1da40255c346887aa8e44bd Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 11 Sep 2023 15:18:06 +0300 Subject: [PATCH 219/228] RDMA/cma: Fix truncation compilation warning in make_cma_ports MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 18126c767658ae8a831257c6cb7776c5ba5e7249 upstream. The following compilation error is false alarm as RDMA devices don't have such large amount of ports to actually cause to format truncation. 
drivers/infiniband/core/cma_configfs.c: In function ‘make_cma_ports’: drivers/infiniband/core/cma_configfs.c:223:57: error: ‘snprintf’ output may be truncated before the last format character [-Werror=format-truncation=] 223 | snprintf(port_str, sizeof(port_str), "%u", i + 1); | ^ drivers/infiniband/core/cma_configfs.c:223:17: note: ‘snprintf’ output between 2 and 11 bytes into a destination of size 10 223 | snprintf(port_str, sizeof(port_str), "%u", i + 1); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ cc1: all warnings being treated as errors make[5]: *** [scripts/Makefile.build:243: drivers/infiniband/core/cma_configfs.o] Error 1 Fixes: 045959db65c6 ("IB/cma: Add configfs for rdma_cm") Link: https://lore.kernel.org/r/a7e3b347ee134167fa6a3787c56ef231a04bc8c2.1694434639.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/core/cma_configfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/cma_configfs.c b/drivers/infiniband/core/cma_configfs.c index 35d1ec1095f9..d7cc8d5dcf35 100644 --- a/drivers/infiniband/core/cma_configfs.c +++ b/drivers/infiniband/core/cma_configfs.c @@ -221,7 +221,7 @@ static int make_cma_ports(struct cma_dev_group *cma_dev_group, } for (i = 0; i < ports_num; i++) { - char port_str[10]; + char port_str[11]; ports[i].port_num = i + 1; snprintf(port_str, sizeof(port_str), "%u", i + 1); From 5a4a6a47e0740653e416d5dda370c8aed2d7a3d4 Mon Sep 17 00:00:00 2001 From: Konstantin Meskhidze Date: Tue, 5 Sep 2023 18:32:58 +0800 Subject: [PATCH 220/228] RDMA/uverbs: Fix typo of sizeof argument commit c489800e0d48097fc6afebd862c6afa039110a36 upstream. Since size of 'hdr' pointer and '*hdr' structure is equal on 64-bit machines issue probably didn't cause any wrong behavior. But anyway, fixing of typo is required. Fixes: da0f60df7bd5 ("RDMA/uverbs: Prohibit write() calls with too small buffers") Co-developed-by: Ivanov Mikhail Signed-off-by: Ivanov Mikhail Signed-off-by: Konstantin Meskhidze Link: https://lore.kernel.org/r/20230905103258.1738246-1-konstantin.meskhidze@huawei.com Signed-off-by: Leon Romanovsky Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/core/uverbs_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 099f5acc749e..f2fcdb70d903 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -535,7 +535,7 @@ static ssize_t verify_hdr(struct ib_uverbs_cmd_hdr *hdr, if (hdr->in_words * 4 != count) return -EINVAL; - if (count < method_elm->req_size + sizeof(hdr)) { + if (count < method_elm->req_size + sizeof(*hdr)) { /* * rdma-core v18 and v19 have a bug where they send DESTROY_CQ * with a 16 byte write instead of 24. Old kernels didn't From 0d520cdb0cd095eac5d00078dfd318408c9b5eed Mon Sep 17 00:00:00 2001 From: Bernard Metzler Date: Tue, 5 Sep 2023 16:58:22 +0200 Subject: [PATCH 221/228] RDMA/siw: Fix connection failure handling commit 53a3f777049771496f791504e7dc8ef017cba590 upstream. In case immediate MPA request processing fails, the newly created endpoint unlinks the listening endpoint and is ready to be dropped. This special case was not handled correctly by the code handling the later TCP socket close, causing a NULL dereference crash in siw_cm_work_handler() when dereferencing a NULL listener. We now also cancel the useless MPA timeout, if immediate MPA request processing fails. 
This patch furthermore simplifies MPA processing in general: Scheduling a useless TCP socket read in sk_data_ready() upcall is now surpressed, if the socket is already moved out of TCP_ESTABLISHED state. Fixes: 6c52fdc244b5 ("rdma/siw: connection management") Signed-off-by: Bernard Metzler Link: https://lore.kernel.org/r/20230905145822.446263-1-bmt@zurich.ibm.com Signed-off-by: Leon Romanovsky Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/sw/siw/siw_cm.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c index de5ab282ac74..df5f675993c7 100644 --- a/drivers/infiniband/sw/siw/siw_cm.c +++ b/drivers/infiniband/sw/siw/siw_cm.c @@ -973,6 +973,7 @@ static void siw_accept_newconn(struct siw_cep *cep) siw_cep_put(cep); new_cep->listen_cep = NULL; if (rv) { + siw_cancel_mpatimer(new_cep); siw_cep_set_free(new_cep); goto error; } @@ -1097,9 +1098,12 @@ static void siw_cm_work_handler(struct work_struct *w) /* * Socket close before MPA request received. */ - siw_dbg_cep(cep, "no mpareq: drop listener\n"); - siw_cep_put(cep->listen_cep); - cep->listen_cep = NULL; + if (cep->listen_cep) { + siw_dbg_cep(cep, + "no mpareq: drop listener\n"); + siw_cep_put(cep->listen_cep); + cep->listen_cep = NULL; + } } } release_cep = 1; @@ -1222,7 +1226,11 @@ static void siw_cm_llp_data_ready(struct sock *sk) if (!cep) goto out; - siw_dbg_cep(cep, "state: %d\n", cep->state); + siw_dbg_cep(cep, "cep state: %d, socket state %d\n", + cep->state, sk->sk_state); + + if (sk->sk_state != TCP_ESTABLISHED) + goto out; switch (cep->state) { case SIW_EPSTATE_RDMA_MODE: From cfc333393ae66fe00b84c6917bc44f93bedb638a Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Wed, 20 Sep 2023 13:01:56 +0300 Subject: [PATCH 222/228] RDMA/mlx5: Fix NULL string error commit dab994bcc609a172bfdab15a0d4cb7e50e8b5458 upstream. checkpath is complaining about NULL string, change it to 'Unknown'. Fixes: 37aa5c36aa70 ("IB/mlx5: Add UARs write-combining and non-cached mapping") Signed-off-by: Shay Drory Link: https://lore.kernel.org/r/8638e5c14fadbde5fa9961874feae917073af920.1695203958.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/hw/mlx5/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 215d6618839b..d36436d4277a 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2069,7 +2069,7 @@ static inline char *mmap_cmd2str(enum mlx5_ib_mmap_cmd cmd) case MLX5_IB_MMAP_DEVICE_MEM: return "Device Memory"; default: - return NULL; + return "Unknown"; } } From c17446c0080571c2ac9038e47d8a6f856182d6e9 Mon Sep 17 00:00:00 2001 From: John David Anglin Date: Tue, 19 Sep 2023 17:51:40 +0000 Subject: [PATCH 223/228] parisc: Restore __ldcw_align for PA-RISC 2.0 processors commit 914988e099fc658436fbd7b8f240160c352b6552 upstream. Back in 2005, Kyle McMartin removed the 16-byte alignment for ldcw semaphores on PA 2.0 machines (CONFIG_PA20). This broke spinlocks on pre PA8800 processors. The main symptom was random faults in mmap'd memory (e.g., gcc compilations, etc). Unfortunately, the errata for this ldcw change is lost. The issue is the 16-byte alignment required for ldcw semaphore instructions can only be reduced to natural alignment when the ldcw operation can be handled coherently in cache. 
Only PA8800 and PA8900 processors actually support doing the operation in cache. Aligning the spinlock dynamically adds two integer instructions to each spinlock. Tested on rp3440, c8000 and a500. Signed-off-by: John David Anglin Link: https://lore.kernel.org/linux-parisc/6b332788-2227-127f-ba6d-55e99ecf4ed8@bell.net/T/#t Link: https://lore.kernel.org/linux-parisc/20050609050702.GB4641@roadwarrior.mcmartin.ca/ Cc: stable@vger.kernel.org Signed-off-by: Helge Deller Signed-off-by: Greg Kroah-Hartman --- arch/parisc/include/asm/ldcw.h | 36 +++++++++++++----------- arch/parisc/include/asm/spinlock_types.h | 5 ---- 2 files changed, 20 insertions(+), 21 deletions(-) diff --git a/arch/parisc/include/asm/ldcw.h b/arch/parisc/include/asm/ldcw.h index 6d28b5514699..10a061d6899c 100644 --- a/arch/parisc/include/asm/ldcw.h +++ b/arch/parisc/include/asm/ldcw.h @@ -2,14 +2,28 @@ #ifndef __PARISC_LDCW_H #define __PARISC_LDCW_H -#ifndef CONFIG_PA20 /* Because kmalloc only guarantees 8-byte alignment for kmalloc'd data, and GCC only guarantees 8-byte alignment for stack locals, we can't be assured of 16-byte alignment for atomic lock data even if we specify "__attribute ((aligned(16)))" in the type declaration. So, we use a struct containing an array of four ints for the atomic lock type and dynamically select the 16-byte aligned int from the array - for the semaphore. */ + for the semaphore. */ + +/* From: "Jim Hull" + I've attached a summary of the change, but basically, for PA 2.0, as + long as the ",CO" (coherent operation) completer is implemented, then the + 16-byte alignment requirement for ldcw and ldcd is relaxed, and instead + they only require "natural" alignment (4-byte for ldcw, 8-byte for + ldcd). + + Although the cache control hint is accepted by all PA 2.0 processors, + it is only implemented on PA8800/PA8900 CPUs. Prior PA8X00 CPUs still + require 16-byte alignment. If the address is unaligned, the operation + of the instruction is undefined. The ldcw instruction does not generate + unaligned data reference traps so misaligned accesses are not detected. + This hid the problem for years. So, restore the 16-byte alignment dropped + by Kyle McMartin in "Remove __ldcw_align for PA-RISC 2.0 processors". */ #define __PA_LDCW_ALIGNMENT 16 #define __PA_LDCW_ALIGN_ORDER 4 @@ -19,22 +33,12 @@ & ~(__PA_LDCW_ALIGNMENT - 1); \ (volatile unsigned int *) __ret; \ }) -#define __LDCW "ldcw" -#else /*CONFIG_PA20*/ -/* From: "Jim Hull" - I've attached a summary of the change, but basically, for PA 2.0, as - long as the ",CO" (coherent operation) completer is specified, then the - 16-byte alignment requirement for ldcw and ldcd is relaxed, and instead - they only require "natural" alignment (4-byte for ldcw, 8-byte for - ldcd). */ - -#define __PA_LDCW_ALIGNMENT 4 -#define __PA_LDCW_ALIGN_ORDER 2 -#define __ldcw_align(a) (&(a)->slock) +#ifdef CONFIG_PA20 #define __LDCW "ldcw,co" - -#endif /*!CONFIG_PA20*/ +#else +#define __LDCW "ldcw" +#endif /* LDCW, the only atomic read-write operation PA-RISC has. *sigh*. 
We don't explicitly expose that "*a" may be written as reload diff --git a/arch/parisc/include/asm/spinlock_types.h b/arch/parisc/include/asm/spinlock_types.h index ca39ee350c3f..35c5086b74d7 100644 --- a/arch/parisc/include/asm/spinlock_types.h +++ b/arch/parisc/include/asm/spinlock_types.h @@ -3,13 +3,8 @@ #define __ASM_SPINLOCK_TYPES_H typedef struct { -#ifdef CONFIG_PA20 - volatile unsigned int slock; -# define __ARCH_SPIN_LOCK_UNLOCKED { 1 } -#else volatile unsigned int lock[4]; # define __ARCH_SPIN_LOCK_UNLOCKED { { 1, 1, 1, 1 } } -#endif } arch_spinlock_t; From 84f6b686df2d44e662cf5f169b47d7590a00144a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 10 Aug 2023 23:59:03 +0200 Subject: [PATCH 224/228] netfilter: nf_tables: fix kdoc warnings after gc rework commit 08713cb006b6f07434f276c5ee214fb20c7fd965 upstream. Jakub Kicinski says: We've got some new kdoc warnings here: net/netfilter/nft_set_pipapo.c:1557: warning: Function parameter or member '_set' not described in 'pipapo_gc' net/netfilter/nft_set_pipapo.c:1557: warning: Excess function parameter 'set' description in 'pipapo_gc' include/net/netfilter/nf_tables.h:577: warning: Function parameter or member 'dead' not described in 'nft_set' Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") Fixes: f6c383b8c31a ("netfilter: nf_tables: adapt set backend to use GC transaction API") Reported-by: Jakub Kicinski Closes: https://lore.kernel.org/netdev/20230810104638.746e46f1@kernel.org/ Signed-off-by: Florian Westphal Signed-off-by: Greg Kroah-Hartman --- include/net/netfilter/nf_tables.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 152cd46915d6..1d59a109417d 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -439,6 +439,7 @@ struct nft_set_type { * @expr: stateful expression * @ops: set ops * @flags: set flags + * @dead: set will be freed, never cleared * @genmask: generation mask * @klen: key length * @dlen: data length From a7d86a77c33ba1c357a7504341172cc1507f0698 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 5 Sep 2023 23:13:56 +0200 Subject: [PATCH 225/228] netfilter: nftables: exthdr: fix 4-byte stack OOB write commit fd94d9dadee58e09b49075240fe83423eb1dcd36 upstream. If priv->len is a multiple of 4, then dst[len / 4] can write past the destination array, which leads to stack corruption. This construct is necessary to clean the remainder of the register in case ->len is NOT a multiple of the register size, so make it conditional just like nft_payload.c does. The bug was added in the 4.1 cycle and then copied/inherited when tcp/sctp and ip option support was added. It was reported by the Zero Day Initiative project (ZDI-CAN-21950, ZDI-CAN-21951, ZDI-CAN-21961). 
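To make the arithmetic concrete, here is a small, self-contained user-space sketch (the names are made up and memcpy stands in for skb_copy_bits): the destination is an array of 32-bit register words, so dest[len / 4] is the word just past the copied data whenever len is an exact multiple of 4, which is why the zero-fill of the tail word must be conditional, as the patch below does with nft_skb_copy_to_reg().

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define REG32_SIZE 4	/* stands in for NFT_REG32_SIZE */

/* Copy 'len' bytes of payload into the 32-bit register words at 'dest',
 * zero-padding only a partially filled tail word. */
static void copy_to_reg(uint32_t *dest, const uint8_t *src, unsigned int len)
{
	if (len % REG32_SIZE)
		dest[len / REG32_SIZE] = 0;	/* pad the partial tail word */
	/* When len is a multiple of REG32_SIZE there is no partial word;
	 * writing dest[len / REG32_SIZE] unconditionally here would be the
	 * 4-byte out-of-bounds store the patch removes. */
	memcpy(dest, src, len);
}

int main(void)
{
	uint32_t regs[2];			/* exactly big enough for 8 bytes */
	uint8_t opt[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };

	copy_to_reg(regs, opt, sizeof(opt));	/* len == 8: no tail write */
	printf("%08x %08x\n", regs[0], regs[1]);
	return 0;
}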
Fixes: 49499c3e6e18 ("netfilter: nf_tables: switch registers to 32 bit addressing") Fixes: 935b7f643018 ("netfilter: nft_exthdr: add TCP option matching") Fixes: 133dc203d77d ("netfilter: nft_exthdr: Support SCTP chunks") Fixes: dbb5281a1f84 ("netfilter: nf_tables: add support for matching IPv4 options") Signed-off-by: Florian Westphal Signed-off-by: Greg Kroah-Hartman --- net/netfilter/nft_exthdr.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index eb183c024ac4..cb69a299f10c 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -35,6 +35,14 @@ static unsigned int optlen(const u8 *opt, unsigned int offset) return opt[offset + 1]; } +static int nft_skb_copy_to_reg(const struct sk_buff *skb, int offset, u32 *dest, unsigned int len) +{ + if (len % NFT_REG32_SIZE) + dest[len / NFT_REG32_SIZE] = 0; + + return skb_copy_bits(skb, offset, dest, len); +} + static void nft_exthdr_ipv6_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -56,8 +64,7 @@ static void nft_exthdr_ipv6_eval(const struct nft_expr *expr, } offset += priv->offset; - dest[priv->len / NFT_REG32_SIZE] = 0; - if (skb_copy_bits(pkt->skb, offset, dest, priv->len) < 0) + if (nft_skb_copy_to_reg(pkt->skb, offset, dest, priv->len) < 0) goto err; return; err: @@ -153,8 +160,7 @@ static void nft_exthdr_ipv4_eval(const struct nft_expr *expr, } offset += priv->offset; - dest[priv->len / NFT_REG32_SIZE] = 0; - if (skb_copy_bits(pkt->skb, offset, dest, priv->len) < 0) + if (nft_skb_copy_to_reg(pkt->skb, offset, dest, priv->len) < 0) goto err; return; err: @@ -210,7 +216,8 @@ static void nft_exthdr_tcp_eval(const struct nft_expr *expr, if (priv->flags & NFT_EXTHDR_F_PRESENT) { *dest = 1; } else { - dest[priv->len / NFT_REG32_SIZE] = 0; + if (priv->len % NFT_REG32_SIZE) + dest[priv->len / NFT_REG32_SIZE] = 0; memcpy(dest, opt + offset, priv->len); } @@ -388,9 +395,8 @@ static void nft_exthdr_sctp_eval(const struct nft_expr *expr, offset + ntohs(sch->length) > pkt->skb->len) break; - dest[priv->len / NFT_REG32_SIZE] = 0; - if (skb_copy_bits(pkt->skb, offset + priv->offset, - dest, priv->len) < 0) + if (nft_skb_copy_to_reg(pkt->skb, offset + priv->offset, + dest, priv->len) < 0) break; return; } From 1e3d016a95067ab3e6fcd245ba67b644a6b7d698 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 10 Nov 2020 15:20:55 +0100 Subject: [PATCH 226/228] mmc: renesas_sdhi: only reset SCC when its pointer is populated MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 45bffc371fefd8537804b001080a47c6b69d5efa upstream. Only re-initialize SCC and tuning when an SCC was found during probe(). This is currently a noop because all R-Car Gen2+ are considered to have an SCC. But this will change in a later patch, so we need this preparation. 
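As a rough illustration of the preparation described above (the structure and function names here are invented, not the driver's), the reset path simply skips all SCC/tuning work unless probe() populated the SCC pointer:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* 'scc_ctl' stands in for priv->scc_ctl: set by probe() only when a
 * card-tuning block (SCC) was found, left NULL otherwise. */
struct sdhi_example {
	void *scc_ctl;
	bool needs_adjust_hs400;
};

static void reset_example(struct sdhi_example *priv)
{
	if (priv->scc_ctl) {
		/* SCC and HS400 re-initialization would go here ... */
		priv->needs_adjust_hs400 = false;
		puts("SCC present: tuning state reset");
	}
	/* Resets that are safe on every variant continue unconditionally. */
}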
Signed-off-by: Wolfram Sang Reviewed-by: Niklas Söderlund Reviewed-by: Yoshihiro Shimoda Tested-by: Yoshihiro Shimoda Link: https://lore.kernel.org/r/20201110142058.36393-2-wsa+renesas@sang-engineering.com Signed-off-by: Ulf Hansson Signed-off-by: Greg Kroah-Hartman --- drivers/mmc/host/renesas_sdhi_core.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/mmc/host/renesas_sdhi_core.c b/drivers/mmc/host/renesas_sdhi_core.c index 95abd421d0d2..be4c2a848b52 100644 --- a/drivers/mmc/host/renesas_sdhi_core.c +++ b/drivers/mmc/host/renesas_sdhi_core.c @@ -556,16 +556,18 @@ static void renesas_sdhi_reset(struct tmio_mmc_host *host) { struct renesas_sdhi *priv = host_to_priv(host); - renesas_sdhi_reset_scc(host, priv); - renesas_sdhi_reset_hs400_mode(host, priv); - priv->needs_adjust_hs400 = false; + if (priv->scc_ctl) { + renesas_sdhi_reset_scc(host, priv); + renesas_sdhi_reset_hs400_mode(host, priv); + priv->needs_adjust_hs400 = false; - sd_ctrl_write16(host, CTL_SD_CARD_CLK_CTL, CLK_CTL_SCLKEN | - sd_ctrl_read16(host, CTL_SD_CARD_CLK_CTL)); + sd_ctrl_write16(host, CTL_SD_CARD_CLK_CTL, CLK_CTL_SCLKEN | + sd_ctrl_read16(host, CTL_SD_CARD_CLK_CTL)); - sd_scc_write32(host, priv, SH_MOBILE_SDHI_SCC_RVSCNTL, - ~SH_MOBILE_SDHI_SCC_RVSCNTL_RVSEN & - sd_scc_read32(host, priv, SH_MOBILE_SDHI_SCC_RVSCNTL)); + sd_scc_write32(host, priv, SH_MOBILE_SDHI_SCC_RVSCNTL, + ~SH_MOBILE_SDHI_SCC_RVSCNTL_RVSEN & + sd_scc_read32(host, priv, SH_MOBILE_SDHI_SCC_RVSCNTL)); + } if (host->pdata->flags & TMIO_MMC_MIN_RCAR2) sd_ctrl_write32_as_16_and_16(host, CTL_IRQ_MASK, From 660627c71bc1098aa94e5f208f14748b105b73bc Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 28 Aug 2023 08:09:47 +0200 Subject: [PATCH 227/228] xen/events: replace evtchn_rwlock with RCU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 87797fad6cce28ec9be3c13f031776ff4f104cfc upstream. In unprivileged Xen guests, event handling can cause a deadlock with Xen console handling. The evtchn_rwlock and the hvc_lock are taken in opposite sequence in __hvc_poll() and in Xen console IRQ handling. Normally this is no problem, as the evtchn_rwlock is taken as a reader in both paths, but as soon as an event channel is being closed, the lock will be taken as a writer, which will cause read_lock() to block:

CPU0 (IRQ handling)           CPU1 (__hvc_poll())           CPU2 (closing event channel)
read_lock(evtchn_rwlock)
                              spin_lock(hvc_lock)
                                                            write_lock(evtchn_rwlock)
                                                            [blocks]
spin_lock(hvc_lock)
[blocks]
                              read_lock(evtchn_rwlock)
                              [blocks due to writer waiting,
                               and not in_interrupt()]

This issue can be avoided by replacing evtchn_rwlock with RCU in xen_free_irq(). Note that RCU is used only to delay freeing of the irq_info memory. There is no RCU-based dereferencing or replacement of pointers involved. In order to avoid potential races between removing the irq_info reference and handling of interrupts, set the irq_info pointer to NULL only when freeing its memory. The IRQ itself must be freed at that time, too, as otherwise the same IRQ number could be allocated again before handling of the old instance has finished. This is XSA-441 / CVE-2023-34324. 
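The scheme above can be sketched generically as follows (a condensed, hypothetical example, not the actual events_base.c code): readers replace read_lock(&evtchn_rwlock) with rcu_read_lock(), and teardown frees the object through queue_rcu_work(), so kfree() runs only after every current reader has left its RCU read-side critical section.

#include <linux/kernel.h>
#include <linux/printk.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

/* Stand-in for struct irq_info. */
struct obj {
	struct rcu_work rwork;
	int data;
};

static struct obj *live_obj;	/* plain pointer; RCU only delays the free */

static void obj_delayed_free(struct work_struct *work)
{
	struct obj *o = container_of(to_rcu_work(work), struct obj, rwork);

	/* Clear the lookup pointer only now, when no reader can be left. */
	WRITE_ONCE(live_obj, NULL);
	kfree(o);
}

/* Event-handling path: rcu_read_lock() replaces read_lock(&evtchn_rwlock),
 * so it never blocks on a writer and cannot take part in the deadlock. */
static void obj_handle_event(void)
{
	struct obj *o;

	rcu_read_lock();
	o = READ_ONCE(live_obj);
	if (o)
		pr_info("handling event, data=%d\n", o->data);
	rcu_read_unlock();
}

/* Teardown path: no write_lock(); queue the free so kfree() runs only
 * after a grace period, i.e. after all current readers are gone. */
static void obj_remove(void)
{
	struct obj *o = live_obj;

	if (!o)
		return;
	INIT_RCU_WORK(&o->rwork, obj_delayed_free);
	queue_rcu_work(system_wq, &o->rwork);
}

The pointer stays an ordinary pointer here on purpose, matching the note above that RCU is used purely to delay the free rather than for dereferencing or pointer replacement.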
Fixes: 54c9de89895e ("xen/events: add a new "late EOI" evtchn framework") Reported-by: Marek Marczykowski-Górecki Signed-off-by: Juergen Gross Reviewed-by: Julien Grall Signed-off-by: Greg Kroah-Hartman --- drivers/xen/events/events_base.c | 87 +++++++++++++++++--------------- 1 file changed, 46 insertions(+), 41 deletions(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index fba78daee449..52891546e697 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -94,6 +95,7 @@ enum xen_irq_type { struct irq_info { struct list_head list; struct list_head eoi_list; + struct rcu_work rwork; short refcnt; short spurious_cnt; short type; /* type */ @@ -141,23 +143,13 @@ const struct evtchn_ops *evtchn_ops; */ static DEFINE_MUTEX(irq_mapping_update_lock); -/* - * Lock protecting event handling loop against removing event channels. - * Adding of event channels is no issue as the associated IRQ becomes active - * only after everything is setup (before request_[threaded_]irq() the handler - * can't be entered for an event, as the event channel will be unmasked only - * then). - */ -static DEFINE_RWLOCK(evtchn_rwlock); - /* * Lock hierarchy: * * irq_mapping_update_lock - * evtchn_rwlock - * IRQ-desc lock - * percpu eoi_list_lock - * irq_info->lock + * IRQ-desc lock + * percpu eoi_list_lock + * irq_info->lock */ static LIST_HEAD(xen_irq_list_head); @@ -272,6 +264,22 @@ static void set_info_for_irq(unsigned int irq, struct irq_info *info) irq_set_chip_data(irq, info); } +static void delayed_free_irq(struct work_struct *work) +{ + struct irq_info *info = container_of(to_rcu_work(work), struct irq_info, + rwork); + unsigned int irq = info->irq; + + /* Remove the info pointer only now, with no potential users left. */ + set_info_for_irq(irq, NULL); + + kfree(info); + + /* Legacy IRQ descriptors are managed by the arch. */ + if (irq >= nr_legacy_irqs()) + irq_free_desc(irq); +} + /* Constructors for packed IRQ information. 
*/ static int xen_irq_info_common_setup(struct irq_info *info, unsigned irq, @@ -606,33 +614,36 @@ static void xen_irq_lateeoi_worker(struct work_struct *work) eoi = container_of(to_delayed_work(work), struct lateeoi_work, delayed); - read_lock_irqsave(&evtchn_rwlock, flags); + rcu_read_lock(); while (true) { - spin_lock(&eoi->eoi_list_lock); + spin_lock_irqsave(&eoi->eoi_list_lock, flags); info = list_first_entry_or_null(&eoi->eoi_list, struct irq_info, eoi_list); - if (info == NULL || now < info->eoi_time) { - spin_unlock(&eoi->eoi_list_lock); + if (info == NULL) + break; + + if (now < info->eoi_time) { + mod_delayed_work_on(info->eoi_cpu, system_wq, + &eoi->delayed, + info->eoi_time - now); break; } list_del_init(&info->eoi_list); - spin_unlock(&eoi->eoi_list_lock); + spin_unlock_irqrestore(&eoi->eoi_list_lock, flags); info->eoi_time = 0; xen_irq_lateeoi_locked(info, false); } - if (info) - mod_delayed_work_on(info->eoi_cpu, system_wq, - &eoi->delayed, info->eoi_time - now); + spin_unlock_irqrestore(&eoi->eoi_list_lock, flags); - read_unlock_irqrestore(&evtchn_rwlock, flags); + rcu_read_unlock(); } static void xen_cpu_init_eoi(unsigned int cpu) @@ -647,16 +658,15 @@ static void xen_cpu_init_eoi(unsigned int cpu) void xen_irq_lateeoi(unsigned int irq, unsigned int eoi_flags) { struct irq_info *info; - unsigned long flags; - read_lock_irqsave(&evtchn_rwlock, flags); + rcu_read_lock(); info = info_for_irq(irq); if (info) xen_irq_lateeoi_locked(info, eoi_flags & XEN_EOI_FLAG_SPURIOUS); - read_unlock_irqrestore(&evtchn_rwlock, flags); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(xen_irq_lateeoi); @@ -675,6 +685,7 @@ static void xen_irq_init(unsigned irq) info->type = IRQT_UNBOUND; info->refcnt = -1; + INIT_RCU_WORK(&info->rwork, delayed_free_irq); set_info_for_irq(irq, info); @@ -727,31 +738,18 @@ static int __must_check xen_allocate_irq_gsi(unsigned gsi) static void xen_free_irq(unsigned irq) { struct irq_info *info = info_for_irq(irq); - unsigned long flags; if (WARN_ON(!info)) return; - write_lock_irqsave(&evtchn_rwlock, flags); - if (!list_empty(&info->eoi_list)) lateeoi_list_del(info); list_del(&info->list); - set_info_for_irq(irq, NULL); - WARN_ON(info->refcnt > 0); - write_unlock_irqrestore(&evtchn_rwlock, flags); - - kfree(info); - - /* Legacy IRQ descriptors are managed by the arch. */ - if (irq < nr_legacy_irqs()) - return; - - irq_free_desc(irq); + queue_rcu_work(system_wq, &info->rwork); } static void xen_evtchn_close(evtchn_port_t port) @@ -1639,7 +1637,14 @@ static void __xen_evtchn_do_upcall(void) int cpu = smp_processor_id(); struct evtchn_loop_ctrl ctrl = { 0 }; - read_lock(&evtchn_rwlock); + /* + * When closing an event channel the associated IRQ must not be freed + * until all cpus have left the event handling loop. This is ensured + * by taking the rcu_read_lock() while handling events, as freeing of + * the IRQ is handled via queue_rcu_work() _after_ closing the event + * channel. 
+ */ + rcu_read_lock(); do { vcpu_info->evtchn_upcall_pending = 0; @@ -1652,7 +1657,7 @@ static void __xen_evtchn_do_upcall(void) } while (vcpu_info->evtchn_upcall_pending); - read_unlock(&evtchn_rwlock); + rcu_read_unlock(); /* * Increment irq_epoch only now to defer EOIs only for From a8d812240fdd12949c8344379b01d340e36726ba Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 10 Oct 2023 21:53:40 +0200 Subject: [PATCH 228/228] Linux 5.10.198 Link: https://lore.kernel.org/r/20231009130126.697995596@linuxfoundation.org Tested-by: Florian Fainelli Tested-by: Shuah Khan Tested-by: Jon Hunter Tested-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 12986f3532a9..470e11dcf2a3 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 5 PATCHLEVEL = 10 -SUBLEVEL = 197 +SUBLEVEL = 198 EXTRAVERSION = NAME = Dare mighty things