From 0fc4dcc13f090c941abfab453a24945a4005b350 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Tue, 29 Jun 2021 11:39:07 +0200 Subject: [PATCH 1/9] bpf, devmap: Convert remaining READ_ONCE() to rcu_dereference_check() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There were a couple of READ_ONCE() invocations left over from the devmap RCU conversion. Convert these to rcu_dereference_check() as well to avoid complaints from sparse. Fixes: 782347b6bcad ("xdp: Add proper __rcu annotations to redirect map entries") Reported-by: kernel test robot Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Reviewed-by: Paul E. McKenney Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210629093907.573598-1-toke@redhat.com --- kernel/bpf/devmap.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 2546dafd6672..fdc20892837c 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -558,7 +558,8 @@ int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx, if (map->map_type == BPF_MAP_TYPE_DEVMAP) { for (i = 0; i < map->max_entries; i++) { - dst = READ_ONCE(dtab->netdev_map[i]); + dst = rcu_dereference_check(dtab->netdev_map[i], + rcu_read_lock_bh_held()); if (!is_valid_dst(dst, xdp, exclude_ifindex)) continue; @@ -654,7 +655,8 @@ int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, if (map->map_type == BPF_MAP_TYPE_DEVMAP) { for (i = 0; i < map->max_entries; i++) { - dst = READ_ONCE(dtab->netdev_map[i]); + dst = rcu_dereference_check(dtab->netdev_map[i], + rcu_read_lock_bh_held()); if (!dst || dst->dev->ifindex == exclude_ifindex) continue; From 5a0ae9872d5cb5f27590eed168d4b3b144350ed7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3%B8iland-J=C3=B8rgensen?= Date: Mon, 5 Jul 2021 12:38:41 +0200 Subject: [PATCH 2/9] bpf, samples: Add -fno-asynchronous-unwind-tables to BPF Clang invocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The samples/bpf Makefile currently compiles BPF files in a way that will produce an .eh_frame section, which will in turn confuse libbpf and produce errors when loading BPF programs, like: libbpf: elf: skipping unrecognized data section(32) .eh_frame libbpf: elf: skipping relo section(33) .rel.eh_frame for section(32) .eh_frame Fix this by instructing Clang not to produce this section, as it's useless for BPF anyway.
Suggested-by: Daniel Borkmann Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210705103841.180260-1-toke@redhat.com --- samples/bpf/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 520434ea966f..036998d11ded 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -331,6 +331,7 @@ $(obj)/%.o: $(src)/%.c -Wno-gnu-variable-sized-type-not-at-end \ -Wno-address-of-packed-member -Wno-tautological-compare \ -Wno-unknown-warning-option $(CLANG_ARCH_ARGS) \ + -fno-asynchronous-unwind-tables \ -I$(srctree)/samples/bpf/ -include asm_goto_workaround.h \ -O2 -emit-llvm -Xclang -disable-llvm-passes -c $< -o - | \ $(OPT) -O2 -mtriple=bpf-pc-linux | $(LLVM_DIS) | \ From 2620e92ae6ed83260eb46d214554cd308ee35d92 Mon Sep 17 00:00:00 2001 From: Wang Hai Date: Mon, 28 Jun 2021 17:18:15 +0800 Subject: [PATCH 3/9] bpf, samples: Fix xdpsock with '-M' parameter missing unload process Execute the following command and exit, then execute it again; the following error will be reported: $ sudo ./samples/bpf/xdpsock -i ens4f2 -M ^C $ sudo ./samples/bpf/xdpsock -i ens4f2 -M libbpf: elf: skipping unrecognized data section(16) .eh_frame libbpf: elf: skipping relo section(17) .rel.eh_frame for section(16) .eh_frame libbpf: Kernel error message: XDP program already attached ERROR: link set xdp fd failed Commit c9d27c9e8dc7 ("samples: bpf: Do not unload prog within xdpsock") removed the prog unloading code because of the presence of bpf_link. This is fine if XDP_SHARED_UMEM is disabled, but if it is enabled, unloading the prog is still needed. Fixes: c9d27c9e8dc7 ("samples: bpf: Do not unload prog within xdpsock") Signed-off-by: Wang Hai Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Cc: Maciej Fijalkowski Link: https://lore.kernel.org/bpf/20210628091815.2373487-1-wanghai38@huawei.com --- samples/bpf/xdpsock_user.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index 53e300f860bb..33d0bdebbed8 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -96,6 +96,7 @@ static int opt_xsk_frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE; static int opt_timeout = 1000; static bool opt_need_wakeup = true; static u32 opt_num_xsks = 1; +static u32 prog_id; static bool opt_busy_poll; static bool opt_reduced_cap; @@ -461,6 +462,23 @@ static void *poller(void *arg) return NULL; } +static void remove_xdp_program(void) +{ + u32 curr_prog_id = 0; + + if (bpf_get_link_xdp_id(opt_ifindex, &curr_prog_id, opt_xdp_flags)) { + printf("bpf_get_link_xdp_id failed\n"); + exit(EXIT_FAILURE); + } + + if (prog_id == curr_prog_id) + bpf_set_link_xdp_fd(opt_ifindex, -1, opt_xdp_flags); + else if (!curr_prog_id) + printf("couldn't find a prog id on a given interface\n"); + else + printf("program on interface changed, not removing\n"); +} + static void int_exit(int sig) { benchmark_done = true; @@ -471,6 +489,9 @@ static void __exit_with_error(int error, const char *file, const char *func, { fprintf(stderr, "%s:%s:%i: errno: %d/\"%s\"\n", file, func, line, error, strerror(error)); + + if (opt_num_xsks > 1) + remove_xdp_program(); exit(EXIT_FAILURE); } @@ -490,6 +511,9 @@ static void xdpsock_cleanup(void) if (write(sock, &cmd, sizeof(int)) < 0) exit_with_error(errno); } + + if (opt_num_xsks > 1) + remove_xdp_program(); } static void swap_mac_addresses(void *data) @@ -857,6 +881,10 @@ static struct xsk_socket_info
*xsk_configure_socket(struct xsk_umem_info *umem, if (ret) exit_with_error(-ret); + ret = bpf_get_link_xdp_id(opt_ifindex, &prog_id, opt_xdp_flags); + if (ret) + exit_with_error(-ret); + xsk->app_stats.rx_empty_polls = 0; xsk->app_stats.fill_fail_polls = 0; xsk->app_stats.copy_tx_sendtos = 0; From bc832065b60f973771ff3e657214bb21b559833c Mon Sep 17 00:00:00 2001 From: Gu Shengxian Date: Mon, 5 Jul 2021 18:35:43 -0700 Subject: [PATCH 4/9] bpftool: Properly close va_list 'ap' by va_end() on error va_list 'ap' was opened but not closed by va_end() in the error case. It should be closed by va_end() before the return. Fixes: aa52bcbe0e72 ("tools: bpftool: Fix json dump crash on powerpc") Signed-off-by: Gu Shengxian Signed-off-by: Daniel Borkmann Cc: Jiri Olsa Link: https://lore.kernel.org/bpf/20210706013543.671114-1-gushengxian507419@gmail.com --- tools/bpf/bpftool/jit_disasm.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/bpf/bpftool/jit_disasm.c b/tools/bpf/bpftool/jit_disasm.c index e7e7eee9f172..24734f2249d6 100644 --- a/tools/bpf/bpftool/jit_disasm.c +++ b/tools/bpf/bpftool/jit_disasm.c @@ -43,11 +43,13 @@ static int fprintf_json(void *out, const char *fmt, ...) { va_list ap; char *s; + int err; va_start(ap, fmt); - if (vasprintf(&s, fmt, ap) < 0) - return -1; + err = vasprintf(&s, fmt, ap); va_end(ap); + if (err < 0) + return -1; if (!oper_count) { int i; From af0efa050caa66e8f304c42c94c76cb6c480cb7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3%B8iland-J=C3=B8rgensen?= Date: Tue, 6 Jul 2021 14:23:55 +0200 Subject: [PATCH 5/9] libbpf: Restore errno return for functions that were already returning it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The update to streamline libbpf error reporting intended to change all functions to return the errno as a negative return value if LIBBPF_STRICT_DIRECT_ERRS is set. However, if the flag is *not* set, the return value changes for the two functions that were already returning a negative errno unconditionally: bpf_link__unpin() and perf_buffer__poll(). This is a user-visible API change that breaks applications, so let's revert these two functions to unconditionally returning a negative errno value.
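To make the restored contract concrete, here is a minimal caller sketch (illustrative only; poll_once() is a hypothetical helper, and 'pb' is assumed to have been set up elsewhere via perf_buffer__new()). On failure the negative errno comes back directly, so it can be decoded with strerror(-err):

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <bpf/libbpf.h>

/* Poll once; tolerate signal interruption, report hard failures. */
static int poll_once(struct perf_buffer *pb)
{
	int err = perf_buffer__poll(pb, 100 /* timeout, ms */);

	if (err < 0 && err != -EINTR) {
		fprintf(stderr, "perf_buffer__poll: %s\n", strerror(-err));
		return err;
	}
	return 0; /* non-negative: buffers processed */
}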
Fixes: e9fc3ce99b34 ("libbpf: Streamline error reporting for high-level APIs") Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210706122355.236082-1-toke@redhat.com --- tools/lib/bpf/libbpf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 1e04ce724240..6f5e2757bb3c 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -10136,7 +10136,7 @@ int bpf_link__unpin(struct bpf_link *link) err = unlink(link->pin_path); if (err != 0) - return libbpf_err_errno(err); + return -errno; pr_debug("link fd=%d: unpinned from %s\n", link->fd, link->pin_path); zfree(&link->pin_path); @@ -11197,7 +11197,7 @@ int perf_buffer__poll(struct perf_buffer *pb, int timeout_ms) cnt = epoll_wait(pb->epoll_fd, pb->events, pb->cpu_cnt, timeout_ms); if (cnt < 0) - return libbpf_err_errno(cnt); + return -errno; for (i = 0; i < cnt; i++) { struct perf_cpu_buf *cpu_buf = pb->events[i].data.ptr; From 5616e895ecc56db8ba959e53638031a21353e0e2 Mon Sep 17 00:00:00 2001 From: SanjayKumar Jeyakumar Date: Wed, 7 Jul 2021 10:59:14 +0530 Subject: [PATCH 6/9] tools/runqslower: Use __state instead of state Commit 2f064a59a11f ("sched: Change task_struct::state") renamed task->state to task->__state in task_struct. Fix runqslower to use the new name of the field. Fixes: 2f064a59a11f ("sched: Change task_struct::state") Signed-off-by: SanjayKumar Jeyakumar Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210707052914.21473-1-vjsanjay@gmail.com --- tools/bpf/runqslower/runqslower.bpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bpf/runqslower/runqslower.bpf.c b/tools/bpf/runqslower/runqslower.bpf.c index 645530ca7e98..ab9353f2fd46 100644 --- a/tools/bpf/runqslower/runqslower.bpf.c +++ b/tools/bpf/runqslower/runqslower.bpf.c @@ -74,7 +74,7 @@ int handle__sched_switch(u64 *ctx) u32 pid; /* ivcsw: treat like an enqueue event and store timestamp */ - if (prev->state == TASK_RUNNING) + if (prev->__state == TASK_RUNNING) trace_enqueue(prev); pid = next->pid; From 1d719254c139fb62fb8056fb496b6fd007e71550 Mon Sep 17 00:00:00 2001 From: Wei Li Date: Mon, 28 Jun 2021 11:04:09 +0800 Subject: [PATCH 7/9] tools: bpf: Fix error in 'make -C tools/ bpf_install' make[2]: *** No rule to make target 'install'. Stop. make[1]: *** [Makefile:122: runqslower_install] Error 2 make: *** [Makefile:116: bpf_install] Error 2 There is no rule for target 'install' in tools/bpf/runqslower/Makefile, and there is no need to install it, so just remove 'runqslower_install'. 
Fixes: 9c01546d26d2 ("tools/bpf: Add runqslower tool to tools/bpf") Signed-off-by: Wei Li Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210628030409.3459095-1-liwei391@huawei.com --- tools/bpf/Makefile | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tools/bpf/Makefile b/tools/bpf/Makefile index 39bb322707b4..b11cfc86a3d0 100644 --- a/tools/bpf/Makefile +++ b/tools/bpf/Makefile @@ -97,7 +97,7 @@ clean: bpftool_clean runqslower_clean resolve_btfids_clean $(Q)$(RM) -- $(OUTPUT)FEATURE-DUMP.bpf $(Q)$(RM) -r -- $(OUTPUT)feature -install: $(PROGS) bpftool_install runqslower_install +install: $(PROGS) bpftool_install $(call QUIET_INSTALL, bpf_jit_disasm) $(Q)$(INSTALL) -m 0755 -d $(DESTDIR)$(prefix)/bin $(Q)$(INSTALL) $(OUTPUT)bpf_jit_disasm $(DESTDIR)$(prefix)/bin/bpf_jit_disasm @@ -118,9 +118,6 @@ bpftool_clean: runqslower: $(call descend,runqslower) -runqslower_install: - $(call descend,runqslower,install) - runqslower_clean: $(call descend,runqslower,clean) @@ -131,5 +128,5 @@ resolve_btfids_clean: $(call descend,resolve_btfids,clean) .PHONY: all install clean bpftool bpftool_install bpftool_clean \ - runqslower runqslower_install runqslower_clean \ + runqslower runqslower_clean \ resolve_btfids resolve_btfids_clean From f263a81451c12da5a342d90572e317e611846f2c Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 7 Jul 2021 15:38:47 -0700 Subject: [PATCH 8/9] bpf: Track subprog poke descriptors correctly and fix use-after-free Subprograms are calling map_poke_track(), but on program release there is no hook to call map_poke_untrack(). However, on program release, the aux memory (and poke descriptor table) is freed even though we still have a reference to it in the element list of the map aux data. When we run map_poke_run(), we then end up accessing free'd memory, triggering KASAN in prog_array_map_poke_run(): [...] [ 402.824689] BUG: KASAN: use-after-free in prog_array_map_poke_run+0xc2/0x34e [ 402.824698] Read of size 4 at addr ffff8881905a7940 by task hubble-fgs/4337 [ 402.824705] CPU: 1 PID: 4337 Comm: hubble-fgs Tainted: G I 5.12.0+ #399 [ 402.824715] Call Trace: [ 402.824719] dump_stack+0x93/0xc2 [ 402.824727] print_address_description.constprop.0+0x1a/0x140 [ 402.824736] ? prog_array_map_poke_run+0xc2/0x34e [ 402.824740] ? prog_array_map_poke_run+0xc2/0x34e [ 402.824744] kasan_report.cold+0x7c/0xd8 [ 402.824752] ? prog_array_map_poke_run+0xc2/0x34e [ 402.824757] prog_array_map_poke_run+0xc2/0x34e [ 402.824765] bpf_fd_array_map_update_elem+0x124/0x1a0 [...] The elements concerned are walked as follows: for (i = 0; i < elem->aux->size_poke_tab; i++) { poke = &elem->aux->poke_tab[i]; [...] The access to size_poke_tab is a 4 byte read, verified by checking offsets in the KASAN dump: [ 402.825004] The buggy address belongs to the object at ffff8881905a7800 which belongs to the cache kmalloc-1k of size 1024 [ 402.825008] The buggy address is located 320 bytes inside of 1024-byte region [ffff8881905a7800, ffff8881905a7c00) The pahole output of bpf_prog_aux: struct bpf_prog_aux { [...] /* --- cacheline 5 boundary (320 bytes) --- */ u32 size_poke_tab; /* 320 4 */ [...] In general, subprograms do not necessarily manage their own data structures. For example, BTF func_info and linfo are just pointers to the main program structure. This allows reference counting and cleanup to be done on the latter which simplifies their management a bit. The aux->poke_tab struct, however, did not follow this logic. 
The initial proposed fix for this use-after-free bug further embedded poke data tracking into the subprogram with proper reference counting. However, Daniel and Alexei questioned why we were treating these objects as special; I agree, it's unnecessary. The fix here removes the per-subprogram poke table allocation and map tracking and instead simply points the aux->poke_tab pointer at the main program's poke table. This way, map tracking is simplified to the main program and we do not need to manage it per subprogram. This also means that bpf_prog_free_deferred(), which unwinds the program reference counting and kfrees objects, needs to ensure that we don't try to double free the poke_tab when free'ing the subprog structures. This is easily solved by NULL'ing the poke_tab pointer. The second detail is to ensure that per-subprogram JIT logic only does fixups on poke_tab[] entries it owns. To do this, we add a pointer in the poke structure that points at the subprogram value, so JITs can easily check while walking the poke_tab structure whether the current entry belongs to the current program. The aux pointer is stable and therefore suitable for such a comparison. On the jit_subprogs() error path, we omit cleaning up the poke->aux field because these are only ever referenced from the JIT side; on error we will never make it to the JIT, so it's fine to leave them dangling. Removing these pointers would complicate the error path for no reason. However, we do need to untrack all poke descriptors from the main program, as otherwise they could race with the freeing of JIT memory from the subprograms. Lastly, a748c6975dea3 ("bpf: propagate poke descriptors to subprograms") had an off-by-one on the subprogram instruction index range check, as it was testing 'insn_idx >= subprog_start && insn_idx <= subprog_end'. However, subprog_end is the next subprogram's start instruction, so the upper bound must be exclusive ('insn_idx < subprog_end').
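To make the boundary concrete, a small standalone sketch (illustrative only, not kernel code; the helper name is hypothetical) of the ownership check applied while walking poke_tab:

#include <stdbool.h>

/* Subprog i owns insns in the half-open range [subprog_start, subprog_end),
 * since subprog_end is the first insn of subprog i+1.
 */
static bool poke_belongs_to_subprog(unsigned int insn_idx,
				    unsigned int subprog_start,
				    unsigned int subprog_end)
{
	return insn_idx >= subprog_start && insn_idx < subprog_end;
}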
Fixes: a748c6975dea3 ("bpf: propagate poke descriptors to subprograms") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Co-developed-by: Daniel Borkmann Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210707223848.14580-2-john.fastabend@gmail.com --- arch/x86/net/bpf_jit_comp.c | 3 ++ include/linux/bpf.h | 1 + kernel/bpf/core.c | 8 ++++- kernel/bpf/verifier.c | 60 +++++++++++++------------------------ 4 files changed, 32 insertions(+), 40 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index e835164189f1..4b951458c9fc 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -570,6 +570,9 @@ static void bpf_tail_call_direct_fixup(struct bpf_prog *prog) for (i = 0; i < prog->aux->size_poke_tab; i++) { poke = &prog->aux->poke_tab[i]; + if (poke->aux && poke->aux != prog->aux) + continue; + WARN_ON_ONCE(READ_ONCE(poke->tailcall_target_stable)); if (poke->reason != BPF_POKE_REASON_TAIL_CALL) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f309fc1509f2..e8e2b0393ca9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -780,6 +780,7 @@ struct bpf_jit_poke_descriptor { void *tailcall_target; void *tailcall_bypass; void *bypass_addr; + void *aux; union { struct { struct bpf_map *map; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 034ad93a1ad7..9b1577498373 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2236,8 +2236,14 @@ static void bpf_prog_free_deferred(struct work_struct *work) #endif if (aux->dst_trampoline) bpf_trampoline_put(aux->dst_trampoline); - for (i = 0; i < aux->func_cnt; i++) + for (i = 0; i < aux->func_cnt; i++) { + /* We can just unlink the subprog poke descriptor table as + * it was originally linked to the main program and is also + * released along with it. + */ + aux->func[i]->aux->poke_tab = NULL; bpf_jit_free(aux->func[i]); + } if (aux->func_cnt) { kfree(aux->func); bpf_prog_unlock_free(aux->prog); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index be38bb930bf1..42a4063de7cd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12121,33 +12121,19 @@ static int jit_subprogs(struct bpf_verifier_env *env) goto out_free; func[i]->is_func = 1; func[i]->aux->func_idx = i; - /* the btf and func_info will be freed only at prog->aux */ + /* Below members will be freed only at prog->aux */ func[i]->aux->btf = prog->aux->btf; func[i]->aux->func_info = prog->aux->func_info; + func[i]->aux->poke_tab = prog->aux->poke_tab; + func[i]->aux->size_poke_tab = prog->aux->size_poke_tab; for (j = 0; j < prog->aux->size_poke_tab; j++) { - u32 insn_idx = prog->aux->poke_tab[j].insn_idx; - int ret; + struct bpf_jit_poke_descriptor *poke; - if (!(insn_idx >= subprog_start && - insn_idx <= subprog_end)) - continue; - - ret = bpf_jit_add_poke_descriptor(func[i], - &prog->aux->poke_tab[j]); - if (ret < 0) { - verbose(env, "adding tail call poke descriptor failed\n"); - goto out_free; - } - - func[i]->insnsi[insn_idx - subprog_start].imm = ret + 1; - - map_ptr = func[i]->aux->poke_tab[ret].tail_call.map; - ret = map_ptr->ops->map_poke_track(map_ptr, func[i]->aux); - if (ret < 0) { - verbose(env, "tracking tail call prog failed\n"); - goto out_free; - } + poke = &prog->aux->poke_tab[j]; + if (poke->insn_idx < subprog_end && + poke->insn_idx >= subprog_start) + poke->aux = func[i]->aux; } /* Use bpf_prog_F_tag to indicate functions in stack traces. 
@@ -12178,18 +12164,6 @@ static int jit_subprogs(struct bpf_verifier_env *env) cond_resched(); } - /* Untrack main program's aux structs so that during map_poke_run() - * we will not stumble upon the unfilled poke descriptors; each - * of the main program's poke descs got distributed across subprogs - * and got tracked onto map, so we are sure that none of them will - * be missed after the operation below - */ - for (i = 0; i < prog->aux->size_poke_tab; i++) { - map_ptr = prog->aux->poke_tab[i].tail_call.map; - - map_ptr->ops->map_poke_untrack(map_ptr, prog->aux); - } - /* at this point all bpf functions were successfully JITed * now populate all bpf_calls with correct addresses and * run last pass of JIT @@ -12267,14 +12241,22 @@ static int jit_subprogs(struct bpf_verifier_env *env) bpf_prog_jit_attempt_done(prog); return 0; out_free: + /* We failed JIT'ing, so at this point we need to unregister poke + * descriptors from subprogs, so that kernel is not attempting to + * patch it anymore as we're freeing the subprog JIT memory. + */ + for (i = 0; i < prog->aux->size_poke_tab; i++) { + map_ptr = prog->aux->poke_tab[i].tail_call.map; + map_ptr->ops->map_poke_untrack(map_ptr, prog->aux); + } + /* At this point we're guaranteed that poke descriptors are not + * live anymore. We can just unlink its descriptor table as it's + * released with the main prog. + */ for (i = 0; i < env->subprog_cnt; i++) { if (!func[i]) continue; - - for (j = 0; j < func[i]->aux->size_poke_tab; j++) { - map_ptr = func[i]->aux->poke_tab[j].tail_call.map; - map_ptr->ops->map_poke_untrack(map_ptr, func[i]->aux); - } + func[i]->aux->poke_tab = NULL; bpf_jit_free(func[i]); } kfree(func); From 1fb5ba29ad0835c5cbfc69a27f9c2733cb65726e Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 7 Jul 2021 15:38:48 -0700 Subject: [PATCH 9/9] bpf: Selftest to verify mixing bpf2bpf calls and tailcalls with insn patch This adds some extra noise to the tailcall_bpf2bpf4 tests that will cause the verifier to patch insns. This then moves the subprog start/end insn indexes and the poke descriptor insn indexes around, to ensure that the verifier and JIT continue to track these correctly. If done correctly, the verifier should pass this program the same as before and the JIT should emit the tail call logic. Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210707223848.14580-3-john.fastabend@gmail.com --- .../selftests/bpf/prog_tests/tailcalls.c | 36 +++++++++++++------ .../selftests/bpf/progs/tailcall_bpf2bpf4.c | 18 ++++++++++ 2 files changed, 44 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/tailcalls.c b/tools/testing/selftests/bpf/prog_tests/tailcalls.c index ee27d68d2a1c..b5940e6ca67c 100644 --- a/tools/testing/selftests/bpf/prog_tests/tailcalls.c +++ b/tools/testing/selftests/bpf/prog_tests/tailcalls.c @@ -715,6 +715,8 @@ static void test_tailcall_bpf2bpf_3(void) bpf_object__close(obj); } +#include "tailcall_bpf2bpf4.skel.h" + /* test_tailcall_bpf2bpf_4 checks that tailcall counter is correctly preserved * across tailcalls combined with bpf2bpf calls. for making sure that tailcall * counter behaves correctly, bpf program will go through following flow: @@ -727,10 +729,15 @@ static void test_tailcall_bpf2bpf_3(void) * the loop begins. At the end of the test make sure that the global counter is * equal to 31, because tailcall counter includes the first two tailcalls * whereas global counter is incremented only on loop presented on flow above.
+ * + * The noise parameter is used to insert bpf_map_update calls into the logic + * to force verifier to patch instructions. This allows us to ensure jump + * logic remains correct with instruction movement. */ -static void test_tailcall_bpf2bpf_4(void) +static void test_tailcall_bpf2bpf_4(bool noise) { - int err, map_fd, prog_fd, main_fd, data_fd, i, val; + int err, map_fd, prog_fd, main_fd, data_fd, i; + struct tailcall_bpf2bpf4__bss val; struct bpf_map *prog_array, *data_map; struct bpf_program *prog; struct bpf_object *obj; @@ -774,11 +781,6 @@ static void test_tailcall_bpf2bpf_4(void) goto out; } - err = bpf_prog_test_run(main_fd, 1, &pkt_v4, sizeof(pkt_v4), 0, - &duration, &retval, NULL); - CHECK(err || retval != sizeof(pkt_v4) * 3, "tailcall", "err %d errno %d retval %d\n", - err, errno, retval); - data_map = bpf_object__find_map_by_name(obj, "tailcall.bss"); if (CHECK_FAIL(!data_map || !bpf_map__is_internal(data_map))) return; @@ -787,10 +789,22 @@ static void test_tailcall_bpf2bpf_4(void) if (CHECK_FAIL(map_fd < 0)) return; + i = 0; + val.noise = noise; + val.count = 0; + err = bpf_map_update_elem(data_fd, &i, &val, BPF_ANY); + if (CHECK_FAIL(err)) + goto out; + + err = bpf_prog_test_run(main_fd, 1, &pkt_v4, sizeof(pkt_v4), 0, + &duration, &retval, NULL); + CHECK(err || retval != sizeof(pkt_v4) * 3, "tailcall", "err %d errno %d retval %d\n", + err, errno, retval); + i = 0; err = bpf_map_lookup_elem(data_fd, &i, &val); - CHECK(err || val != 31, "tailcall count", "err %d errno %d count %d\n", - err, errno, val); + CHECK(err || val.count != 31, "tailcall count", "err %d errno %d count %d\n", + err, errno, val.count); out: bpf_object__close(obj); @@ -815,5 +829,7 @@ void test_tailcalls(void) if (test__start_subtest("tailcall_bpf2bpf_3")) test_tailcall_bpf2bpf_3(); if (test__start_subtest("tailcall_bpf2bpf_4")) - test_tailcall_bpf2bpf_4(); + test_tailcall_bpf2bpf_4(false); + if (test__start_subtest("tailcall_bpf2bpf_5")) + test_tailcall_bpf2bpf_4(true); } diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf4.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf4.c index 77df6d4db895..e89368a50b97 100644 --- a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf4.c +++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf4.c @@ -2,6 +2,13 @@ #include #include +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} nop_table SEC(".maps"); + struct { __uint(type, BPF_MAP_TYPE_PROG_ARRAY); __uint(max_entries, 3); @@ -10,10 +17,21 @@ struct { } jmp_table SEC(".maps"); int count = 0; +int noise = 0; + +__always_inline int subprog_noise(void) +{ + __u32 key = 0; + + bpf_map_lookup_elem(&nop_table, &key); + return 0; +} __noinline int subprog_tail_2(struct __sk_buff *skb) { + if (noise) + subprog_noise(); bpf_tail_call_static(skb, &jmp_table, 2); return skb->len * 3; }
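For illustration, a minimal userspace sketch of driving the new knob through the generated skeleton (assumptions: the skeleton header comes from 'bpftool gen skeleton' as in the selftests build, the object loads on the running kernel, and the actual test run is elided):

#include <stdio.h>
#include "tailcall_bpf2bpf4.skel.h"

int main(void)
{
	struct tailcall_bpf2bpf4 *skel;

	skel = tailcall_bpf2bpf4__open_and_load();
	if (!skel)
		return 1;

	/* Enable the extra map lookup so the verifier patches insns. */
	skel->bss->noise = 1;
	skel->bss->count = 0;

	/* ... attach and run the entry program, e.g. via bpf_prog_test_run() ... */

	printf("tailcall count: %d\n", skel->bss->count);
	tailcall_bpf2bpf4__destroy(skel);
	return 0;
}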