From 8ac2441e0b7b7d2e76c47e0997a6634abd80cb26 Mon Sep 17 00:00:00 2001
From: Tushar Dave <tushar.n.dave@oracle.com>
Date: Fri, 9 Feb 2018 11:56:12 -0800
Subject: [PATCH 01/18] samples/bpf: adjust rlimit RLIMIT_MEMLOCK for
 xdp_redirect

The default rlimit RLIMIT_MEMLOCK is 64KB, which causes bpf map creation
to fail, e.g.:
[root@labbpf]# ./xdp_redirect $(</sys/class/net/eth2/ifindex) \
> $(</sys/class/net/eth3/ifindex)
failed to create a map: 1 Operation not permitted

The failure is seen when executing xdp_redirect while xdp_monitor
is already running.

Signed-off-by: Tushar Dave <tushar.n.dave@oracle.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 samples/bpf/xdp_redirect_user.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/samples/bpf/xdp_redirect_user.c b/samples/bpf/xdp_redirect_user.c
index d54e91eb6cbf..b701b5c21342 100644
--- a/samples/bpf/xdp_redirect_user.c
+++ b/samples/bpf/xdp_redirect_user.c
@@ -20,6 +20,7 @@
 #include <string.h>
 #include <unistd.h>
 #include <libgen.h>
+#include <sys/resource.h>
 
 #include "bpf_load.h"
 #include "bpf_util.h"
@@ -75,6 +76,7 @@ static void usage(const char *prog)
 
 int main(int argc, char **argv)
 {
+	struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
 	const char *optstr = "SN";
 	char filename[256];
 	int ret, opt, key = 0;
@@ -98,6 +100,11 @@ int main(int argc, char **argv)
 		return 1;
 	}
 
+	if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+		perror("setrlimit(RLIMIT_MEMLOCK)");
+		return 1;
+	}
+
 	ifindex_in = strtoul(argv[optind], NULL, 0);
 	ifindex_out = strtoul(argv[optind + 1], NULL, 0);
 	printf("input: %d output: %d\n", ifindex_in, ifindex_out);

From 9492686c53f3a98e7027d1079db1471ab20e17de Mon Sep 17 00:00:00 2001
From: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Date: Tue, 13 Feb 2018 13:42:49 +0900
Subject: [PATCH 02/18] bpf: samples/sockmap fix Makefile for build error

While building samples/sockmap, an undefined reference error is thrown
for `nla_dump_errormsg'.
Fix it by linking tools/lib/bpf/nlattr.o.

Signed-off-by: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 samples/sockmap/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/samples/sockmap/Makefile b/samples/sockmap/Makefile
index 73f1da4d116c..9bf2881bd11b 100644
--- a/samples/sockmap/Makefile
+++ b/samples/sockmap/Makefile
@@ -2,7 +2,7 @@
 hostprogs-y := sockmap
 
 # Libbpf dependencies
-LIBBPF := ../../tools/lib/bpf/bpf.o
+LIBBPF := ../../tools/lib/bpf/bpf.o ../../tools/lib/bpf/nlattr.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS += -I$(srctree)/tools/lib/

From 444890c3ce7d74bdc20f2bf930b8476b98d3e972 Mon Sep 17 00:00:00 2001
From: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Date: Tue, 13 Feb 2018 13:44:22 +0900
Subject: [PATCH 03/18] bpf: samples/sockmap detach sock ops program

The samples/sockmap program leaves the sock_ops program attached to the
cgroup. Fix this by detaching the program before exit.

Signed-off-by: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 samples/sockmap/sockmap_user.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/samples/sockmap/sockmap_user.c b/samples/sockmap/sockmap_user.c
index 7c25c0c112bc..95a54a89a532 100644
--- a/samples/sockmap/sockmap_user.c
+++ b/samples/sockmap/sockmap_user.c
@@ -566,6 +566,7 @@ run:
 	else
 		fprintf(stderr, "unknown test\n");
 out:
+	bpf_prog_detach2(prog_fd[2], cg_fd, BPF_CGROUP_SOCK_OPS);
 	close(s1);
 	close(s2);
 	close(p1);

From 297dd12cb104151797fd649433a2157b585f1718 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 13 Feb 2018 14:15:36 +0100
Subject: [PATCH 04/18] net: avoid including xdp.h in filter.h

It is sufficient to have a forward declaration of struct xdp_rxq_info in
linux/filter.h, which avoids including net/xdp.h.  This was originally
suggested by John Fastabend during the review phase, but wasn't
included in the final patchset revision.  Thus, this followup.

Suggested-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/filter.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 276932d75975..fdb691b520c0 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -20,7 +20,6 @@
 #include <linux/set_memory.h>
 #include <linux/kallsyms.h>
 
-#include <net/xdp.h>
 #include <net/sch_generic.h>
 
 #include <uapi/linux/filter.h>
@@ -30,6 +29,7 @@ struct sk_buff;
 struct sock;
 struct seccomp_data;
 struct bpf_prog_aux;
+struct xdp_rxq_info;
 
 /* ArgX, context and stack frame pointer register positions. Note,
  * Arg1, Arg2, Arg3, etc are used as argument mappings of function

From 41757dcb0c3d8446c549e55163c9fd9561fcf599 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 13 Feb 2018 14:19:15 +0100
Subject: [PATCH 05/18] selftests/bpf: fix Makefile for cgroup_helpers.c

The current selftests Makefile construct results in cgroup_helpers.c
being compiled together with all the TEST_GEN_PROGS. It also results
in invoking the libbpf Makefile two times (tools/lib/bpf).

These issues were introduced in commit 9d1f15941967 ("bpf: move
cgroup_helpers from samples/bpf/ to tools/testing/selftesting/bpf/").

The only test program that requires the cgroup helpers is 'test_dev_cgroup'.

Thus, create a make target $(OUTPUT)/test_dev_cgroup that extends[1]
the 'prerequisite' for the 'stem' %-style pattern in ../lib.mk,
for this particular test program.

Reviewers, note that the make rules in tools/testing/selftests/lib.mk
differ from the normal kernel kbuild rules, and that it is practical
to use 'make -p' to follow how these 'Implicit/static pattern stem'
rules get expanded.

[1] https://www.gnu.org/software/make/manual/html_node/Static-Usage.html

Fixes: 9d1f15941967 ("bpf: move cgroup_helpers from samples/bpf/ to tools/testing/selftesting/bpf/")
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/Makefile | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 5c43c187f27c..8567a858b789 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -35,12 +35,14 @@ TEST_GEN_PROGS_EXTENDED = test_libbpf_open
 
 include ../lib.mk
 
-BPFOBJ := $(OUTPUT)/libbpf.a cgroup_helpers.c
+BPFOBJ := $(OUTPUT)/libbpf.a
 
 $(TEST_GEN_PROGS): $(BPFOBJ)
 
 $(TEST_GEN_PROGS_EXTENDED): $(OUTPUT)/libbpf.a
 
+$(OUTPUT)/test_dev_cgroup: cgroup_helpers.c
+
 .PHONY: force
 
 # force a rebuild of BPFOBJ when its dependencies are updated

From 615a9474985799c8b48645b8e95a9b9f0691f56a Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Tue, 13 Feb 2018 10:35:05 -0800
Subject: [PATCH 06/18] tools/bpf: adjust rlimit RLIMIT_MEMLOCK for
 test_tcpbpf_user

The default rlimit RLIMIT_MEMLOCK is 64KB. In certain cases,
e.g. in a test machine mimicking our production system, this test may
fail because the memory required for map creation cannot be charged:
   # ./test_tcpbpf_user
   libbpf: failed to create map (name: 'global_map'): Operation not permitted
   libbpf: failed to load object 'test_tcpbpf_kern.o'
   FAILED: load_bpf_file failed for: test_tcpbpf_kern.o

Changing the default rlimit RLIMIT_MEMLOCK to unlimited makes
the test always pass.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/test_tcpbpf_user.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tools/testing/selftests/bpf/test_tcpbpf_user.c b/tools/testing/selftests/bpf/test_tcpbpf_user.c
index 95a370f3d378..5d73db416460 100644
--- a/tools/testing/selftests/bpf/test_tcpbpf_user.c
+++ b/tools/testing/selftests/bpf/test_tcpbpf_user.c
@@ -11,6 +11,8 @@
 #include <linux/ptrace.h>
 #include <linux/bpf.h>
 #include <sys/ioctl.h>
+#include <sys/time.h>
+#include <sys/resource.h>
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <fcntl.h>
@@ -42,6 +44,7 @@ static int bpf_find_map(const char *test, struct bpf_object *obj,
 
 int main(int argc, char **argv)
 {
+	struct rlimit limit  = { RLIM_INFINITY, RLIM_INFINITY };
 	const char *file = "test_tcpbpf_kern.o";
 	struct tcpbpf_globals g = {0};
 	int cg_fd, prog_fd, map_fd;
@@ -54,6 +57,9 @@ int main(int argc, char **argv)
 	int pid;
 	int rv;
 
+	if (setrlimit(RLIMIT_MEMLOCK, &limit) < 0)
+		perror("Unable to lift memlock rlimit");
+
 	if (argc > 1 && strcmp(argv[1], "-d") == 0)
 		debug_flag = true;
 

From 95f87a9706d0a55ff02a652bc8f1b3f7d51bf5eb Mon Sep 17 00:00:00 2001
From: Joe Stringer <joe@wand.net.nz>
Date: Wed, 14 Feb 2018 13:50:34 -0800
Subject: [PATCH 07/18] selftests/bpf: Print unexpected output on fail

This makes it easier to debug off-hand when the error message isn't
exactly as expected.

Signed-off-by: Joe Stringer <joe@wand.net.nz>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/test_verifier.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index c0f16e93f9bd..6cf9bd6f08b7 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -11291,7 +11291,8 @@ static void do_test_single(struct bpf_test *test, bool unpriv,
 			goto fail_log;
 		}
 		if (!strstr(bpf_vlog, expected_err) && !reject_from_alignment) {
-			printf("FAIL\nUnexpected error message!\n");
+			printf("FAIL\nUnexpected error message!\n\tEXP: %s\n\tRES: %s\n",
+			      expected_err, bpf_vlog);
 			goto fail_log;
 		}
 	}

From d0a0e4956f6c20754ef67db6dfb9746e85ecdcb5 Mon Sep 17 00:00:00 2001
From: Joe Stringer <joe@wand.net.nz>
Date: Wed, 14 Feb 2018 13:50:35 -0800
Subject: [PATCH 08/18] selftests/bpf: Count tests skipped by unpriv

When privileged tests are skipped due to user rights, count the number of
skipped tests so it's more obvious that the test did not check everything.

Signed-off-by: Joe Stringer <joe@wand.net.nz>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/test_verifier.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 6cf9bd6f08b7..7ab02526c403 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -11378,7 +11378,7 @@ out:
 
 static int do_test(bool unpriv, unsigned int from, unsigned int to)
 {
-	int i, passes = 0, errors = 0;
+	int i, passes = 0, errors = 0, skips = 0;
 
 	for (i = from; i < to; i++) {
 		struct bpf_test *test = &tests[i];
@@ -11395,13 +11395,17 @@ static int do_test(bool unpriv, unsigned int from, unsigned int to)
 				set_admin(true);
 		}
 
-		if (!unpriv) {
+		if (unpriv) {
+			printf("#%d/p %s SKIP\n", i, test->descr);
+			skips++;
+		} else {
 			printf("#%d/p %s ", i, test->descr);
 			do_test_single(test, false, &passes, &errors);
 		}
 	}
 
-	printf("Summary: %d PASSED, %d FAILED\n", passes, errors);
+	printf("Summary: %d PASSED, %d SKIPPED, %d FAILED\n", passes,
+	       skips, errors);
 	return errors ? EXIT_FAILURE : EXIT_SUCCESS;
 }
 

From 0a67487403683852e05fea97208ad7b0ed820115 Mon Sep 17 00:00:00 2001
From: Joe Stringer <joe@wand.net.nz>
Date: Wed, 14 Feb 2018 13:50:36 -0800
Subject: [PATCH 09/18] selftests/bpf: Only run tests if !bpf_disabled

The "kernel.unprivileged_bpf_disabled" sysctl, if enabled, causes all
unprivileged tests to fail because it permanently disables unprivileged
BPF access for the currently running kernel. Skip the relevant tests if
the user attempts to run the testsuite with this sysctl enabled.

Signed-off-by: Joe Stringer <joe@wand.net.nz>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/test_verifier.c | 26 ++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 7ab02526c403..2971ba2829ac 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -57,6 +57,9 @@
 #define F_NEEDS_EFFICIENT_UNALIGNED_ACCESS	(1 << 0)
 #define F_LOAD_WITH_STRICT_ALIGNMENT		(1 << 1)
 
+#define UNPRIV_SYSCTL "kernel/unprivileged_bpf_disabled"
+static bool unpriv_disabled = false;
+
 struct bpf_test {
 	const char *descr;
 	struct bpf_insn	insns[MAX_INSNS];
@@ -11376,6 +11379,17 @@ out:
 	return ret;
 }
 
+static void get_unpriv_disabled()
+{
+	char buf[2];
+	FILE *fd;
+
+	fd = fopen("/proc/sys/"UNPRIV_SYSCTL, "r");
+	if (fgets(buf, 2, fd) == buf && atoi(buf))
+		unpriv_disabled = true;
+	fclose(fd);
+}
+
 static int do_test(bool unpriv, unsigned int from, unsigned int to)
 {
 	int i, passes = 0, errors = 0, skips = 0;
@@ -11386,7 +11400,10 @@ static int do_test(bool unpriv, unsigned int from, unsigned int to)
 		/* Program types that are not supported by non-root we
 		 * skip right away.
 		 */
-		if (!test->prog_type) {
+		if (!test->prog_type && unpriv_disabled) {
+			printf("#%d/u %s SKIP\n", i, test->descr);
+			skips++;
+		} else if (!test->prog_type) {
 			if (!unpriv)
 				set_admin(false);
 			printf("#%d/u %s ", i, test->descr);
@@ -11433,6 +11450,13 @@ int main(int argc, char **argv)
 		}
 	}
 
+	get_unpriv_disabled();
+	if (unpriv && unpriv_disabled) {
+		printf("Cannot run as unprivileged user with sysctl %s.\n",
+		       UNPRIV_SYSCTL);
+		return EXIT_FAILURE;
+	}
+
 	setrlimit(RLIMIT_MEMLOCK, unpriv ? &rlim : &rinf);
 	return do_test(unpriv, from, to);
 }

From 544bdebc6fcb68c2e8075bc2d3b68e39789d4165 Mon Sep 17 00:00:00 2001
From: Joe Stringer <joe@wand.net.nz>
Date: Wed, 14 Feb 2018 13:50:37 -0800
Subject: [PATCH 10/18] bpf: Remove unused callee_saved array

This array appears to be completely unused, remove it.

Signed-off-by: Joe Stringer <joe@wand.net.nz>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/verifier.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5fb69a85d967..3c74b163eaeb 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -508,10 +508,6 @@ err:
 static const int caller_saved[CALLER_SAVED_REGS] = {
 	BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
 };
-#define CALLEE_SAVED_REGS 5
-static const int callee_saved[CALLEE_SAVED_REGS] = {
-	BPF_REG_6, BPF_REG_7, BPF_REG_8, BPF_REG_9
-};
 
 static void __mark_reg_not_init(struct bpf_reg_state *reg);
 

From ee07862f7b4594d390b978f6636a6a6191632ab3 Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Fri, 23 Feb 2018 14:58:41 +0800
Subject: [PATCH 11/18] bpf: NULL pointer check is not needed in
 BPF_CGROUP_RUN_PROG_INET_SOCK

sk is already allocated in inet_create/inet6_create, hence when
BPF_CGROUP_RUN_PROG_INET_SOCK is executed sk will never be NULL.

The logic is as below:
	sk = sk_alloc();
	if (!sk)
		goto out;
	BPF_CGROUP_RUN_PROG_INET_SOCK(sk);

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf-cgroup.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index a7f16e0f8d68..8a4566691c8f 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -96,7 +96,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
 #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk)				       \
 ({									       \
 	int __ret = 0;							       \
-	if (cgroup_bpf_enabled && sk) {					       \
+	if (cgroup_bpf_enabled) {					       \
 		__ret = __cgroup_bpf_run_filter_sk(sk,			       \
 						 BPF_CGROUP_INET_SOCK_CREATE); \
 	}								       \

From 88e69a1fcc1e67dec3025af64736a84532528242 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 24 Feb 2018 01:07:58 +0100
Subject: [PATCH 12/18] bpf, x64: save one byte per shl/shr/sar when imm is 1

When we shift by one, we can use a different encoding where imm
is not explicitly needed, which saves 1 byte per such op.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/x86/net/bpf_jit_comp.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 4923d92f918d..4bc36bd1b97a 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -640,7 +640,11 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 			case BPF_RSH: b3 = 0xE8; break;
 			case BPF_ARSH: b3 = 0xF8; break;
 			}
-			EMIT3(0xC1, add_1reg(b3, dst_reg), imm32);
+
+			if (imm32 == 1)
+				EMIT2(0xD1, add_1reg(b3, dst_reg));
+			else
+				EMIT3(0xC1, add_1reg(b3, dst_reg), imm32);
 			break;
 
 		case BPF_ALU | BPF_LSH | BPF_X:

From 6fe8b9c1f41dfe3209dabc5bd0726e003a065288 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 24 Feb 2018 01:07:59 +0100
Subject: [PATCH 13/18] bpf, x64: save several bytes by using mov over movabsq
 when possible

While analyzing some of the more complex BPF programs from Cilium,
I found that LLVM generally prefers to emit LD_IMM64 instead of MOV32
BPF instructions for loading unsigned 32-bit immediates into a
register. Given we cannot change the current/stable LLVM versions
that are already out there, let's optimize this case such that the
JIT prefers to emit 'mov %eax, imm32' over 'movabsq %rax, imm64'
whenever suitable in order to reduce the image size by 4-5 bytes per
such load in the typical case, reducing image size on some of the
bigger programs by up to 4%. emit_mov_imm32() and emit_mov_imm64()
have been added as helpers.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/x86/net/bpf_jit_comp.c | 125 +++++++++++++++++++++---------------
 1 file changed, 74 insertions(+), 51 deletions(-)

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 4bc36bd1b97a..f3e5cd8c1e68 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -60,7 +60,12 @@ static bool is_imm8(int value)
 
 static bool is_simm32(s64 value)
 {
-	return value == (s64) (s32) value;
+	return value == (s64)(s32)value;
+}
+
+static bool is_uimm32(u64 value)
+{
+	return value == (u64)(u32)value;
 }
 
 /* mov dst, src */
@@ -355,6 +360,68 @@ static void emit_load_skb_data_hlen(u8 **pprog)
 	*pprog = prog;
 }
 
+static void emit_mov_imm32(u8 **pprog, bool sign_propagate,
+			   u32 dst_reg, const u32 imm32)
+{
+	u8 *prog = *pprog;
+	u8 b1, b2, b3;
+	int cnt = 0;
+
+	/* optimization: if imm32 is positive, use 'mov %eax, imm32'
+	 * (which zero-extends imm32) to save 2 bytes.
+	 */
+	if (sign_propagate && (s32)imm32 < 0) {
+		/* 'mov %rax, imm32' sign extends imm32 */
+		b1 = add_1mod(0x48, dst_reg);
+		b2 = 0xC7;
+		b3 = 0xC0;
+		EMIT3_off32(b1, b2, add_1reg(b3, dst_reg), imm32);
+		goto done;
+	}
+
+	/* optimization: if imm32 is zero, use 'xor %eax, %eax'
+	 * to save 3 bytes.
+	 */
+	if (imm32 == 0) {
+		if (is_ereg(dst_reg))
+			EMIT1(add_2mod(0x40, dst_reg, dst_reg));
+		b2 = 0x31; /* xor */
+		b3 = 0xC0;
+		EMIT2(b2, add_2reg(b3, dst_reg, dst_reg));
+		goto done;
+	}
+
+	/* mov %eax, imm32 */
+	if (is_ereg(dst_reg))
+		EMIT1(add_1mod(0x40, dst_reg));
+	EMIT1_off32(add_1reg(0xB8, dst_reg), imm32);
+done:
+	*pprog = prog;
+}
+
+static void emit_mov_imm64(u8 **pprog, u32 dst_reg,
+			   const u32 imm32_hi, const u32 imm32_lo)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+
+	if (is_uimm32(((u64)imm32_hi << 32) | (u32)imm32_lo)) {
+		/* For emitting plain u32, where sign bit must not be
+		 * propagated LLVM tends to load imm64 over mov32
+		 * directly, so save couple of bytes by just doing
+		 * 'mov %eax, imm32' instead.
+		 */
+		emit_mov_imm32(&prog, false, dst_reg, imm32_lo);
+	} else {
+		/* movabsq %rax, imm64 */
+		EMIT2(add_1mod(0x48, dst_reg), add_1reg(0xB8, dst_reg));
+		EMIT(imm32_lo, 4);
+		EMIT(imm32_hi, 4);
+	}
+
+	*pprog = prog;
+}
+
 static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 		  int oldproglen, struct jit_context *ctx)
 {
@@ -377,7 +444,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 		const s32 imm32 = insn->imm;
 		u32 dst_reg = insn->dst_reg;
 		u32 src_reg = insn->src_reg;
-		u8 b1 = 0, b2 = 0, b3 = 0;
+		u8 b2 = 0, b3 = 0;
 		s64 jmp_offset;
 		u8 jmp_cond;
 		bool reload_skb_data;
@@ -485,58 +552,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 			break;
 
 		case BPF_ALU64 | BPF_MOV | BPF_K:
-			/* optimization: if imm32 is positive,
-			 * use 'mov eax, imm32' (which zero-extends imm32)
-			 * to save 2 bytes
-			 */
-			if (imm32 < 0) {
-				/* 'mov rax, imm32' sign extends imm32 */
-				b1 = add_1mod(0x48, dst_reg);
-				b2 = 0xC7;
-				b3 = 0xC0;
-				EMIT3_off32(b1, b2, add_1reg(b3, dst_reg), imm32);
-				break;
-			}
-
 		case BPF_ALU | BPF_MOV | BPF_K:
-			/* optimization: if imm32 is zero, use 'xor <dst>,<dst>'
-			 * to save 3 bytes.
-			 */
-			if (imm32 == 0) {
-				if (is_ereg(dst_reg))
-					EMIT1(add_2mod(0x40, dst_reg, dst_reg));
-				b2 = 0x31; /* xor */
-				b3 = 0xC0;
-				EMIT2(b2, add_2reg(b3, dst_reg, dst_reg));
-				break;
-			}
-
-			/* mov %eax, imm32 */
-			if (is_ereg(dst_reg))
-				EMIT1(add_1mod(0x40, dst_reg));
-			EMIT1_off32(add_1reg(0xB8, dst_reg), imm32);
+			emit_mov_imm32(&prog, BPF_CLASS(insn->code) == BPF_ALU64,
+				       dst_reg, imm32);
 			break;
 
 		case BPF_LD | BPF_IMM | BPF_DW:
-			/* optimization: if imm64 is zero, use 'xor <dst>,<dst>'
-			 * to save 7 bytes.
-			 */
-			if (insn[0].imm == 0 && insn[1].imm == 0) {
-				b1 = add_2mod(0x48, dst_reg, dst_reg);
-				b2 = 0x31; /* xor */
-				b3 = 0xC0;
-				EMIT3(b1, b2, add_2reg(b3, dst_reg, dst_reg));
-
-				insn++;
-				i++;
-				break;
-			}
-
-			/* movabsq %rax, imm64 */
-			EMIT2(add_1mod(0x48, dst_reg), add_1reg(0xB8, dst_reg));
-			EMIT(insn[0].imm, 4);
-			EMIT(insn[1].imm, 4);
-
+			emit_mov_imm64(&prog, dst_reg, insn[1].imm, insn[0].imm);
 			insn++;
 			i++;
 			break;
@@ -604,7 +626,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 				EMIT_mov(BPF_REG_0, src_reg);
 			else
 				/* mov rax, imm32 */
-				EMIT3_off32(0x48, 0xC7, 0xC0, imm32);
+				emit_mov_imm32(&prog, true,
+					       BPF_REG_0, imm32);
 
 			if (BPF_CLASS(insn->code) == BPF_ALU64)
 				EMIT1(add_1mod(0x48, AUX_REG));

From d806a0cf2a1ddb97c91d902ef1c8219e1e2b2c4c Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 24 Feb 2018 01:08:00 +0100
Subject: [PATCH 14/18] bpf, x64: save several bytes when mul dest is r0/r3
 anyway

Instead of unconditionally performing push/pop on rax/rdx
in case of multiplication, we can save a few bytes in case
of dest register being either BPF r0 (rax) or r3 (rdx)
since the result is written in there anyway.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/x86/net/bpf_jit_comp.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index f3e5cd8c1e68..9895ca383023 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -615,8 +615,10 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 		case BPF_ALU | BPF_MUL | BPF_X:
 		case BPF_ALU64 | BPF_MUL | BPF_K:
 		case BPF_ALU64 | BPF_MUL | BPF_X:
-			EMIT1(0x50); /* push rax */
-			EMIT1(0x52); /* push rdx */
+			if (dst_reg != BPF_REG_0)
+				EMIT1(0x50); /* push rax */
+			if (dst_reg != BPF_REG_3)
+				EMIT1(0x52); /* push rdx */
 
 			/* mov r11, dst_reg */
 			EMIT_mov(AUX_REG, dst_reg);
@@ -636,14 +638,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 			/* mul(q) r11 */
 			EMIT2(0xF7, add_1reg(0xE0, AUX_REG));
 
-			/* mov r11, rax */
-			EMIT_mov(AUX_REG, BPF_REG_0);
-
-			EMIT1(0x5A); /* pop rdx */
-			EMIT1(0x58); /* pop rax */
-
-			/* mov dst_reg, r11 */
-			EMIT_mov(dst_reg, AUX_REG);
+			if (dst_reg != BPF_REG_3)
+				EMIT1(0x5A); /* pop rdx */
+			if (dst_reg != BPF_REG_0) {
+				/* mov dst_reg, rax */
+				EMIT_mov(dst_reg, BPF_REG_0);
+				EMIT1(0x58); /* pop rax */
+			}
 			break;
 
 			/* shifts */

From 4c38e2f386b4fc5fd95d1203c74819948e2e903d Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 24 Feb 2018 01:08:01 +0100
Subject: [PATCH 15/18] bpf, x64: save few bytes when mul is in alu32

Add a generic emit_mov_reg() helper and reuse it in BPF multiplication
to load the src into rax; this saves a few bytes in alu32.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/x86/net/bpf_jit_comp.c | 43 ++++++++++++++++++++++++-------------
 1 file changed, 28 insertions(+), 15 deletions(-)

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 9895ca383023..5b8fc1326aa1 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -422,6 +422,24 @@ static void emit_mov_imm64(u8 **pprog, u32 dst_reg,
 	*pprog = prog;
 }
 
+static void emit_mov_reg(u8 **pprog, bool is64, u32 dst_reg, u32 src_reg)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+
+	if (is64) {
+		/* mov dst, src */
+		EMIT_mov(dst_reg, src_reg);
+	} else {
+		/* mov32 dst, src */
+		if (is_ereg(dst_reg) || is_ereg(src_reg))
+			EMIT1(add_2mod(0x40, dst_reg, src_reg));
+		EMIT2(0x89, add_2reg(0xC0, dst_reg, src_reg));
+	}
+
+	*pprog = prog;
+}
+
 static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 		  int oldproglen, struct jit_context *ctx)
 {
@@ -480,16 +498,11 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 			EMIT2(b2, add_2reg(0xC0, dst_reg, src_reg));
 			break;
 
-			/* mov dst, src */
 		case BPF_ALU64 | BPF_MOV | BPF_X:
-			EMIT_mov(dst_reg, src_reg);
-			break;
-
-			/* mov32 dst, src */
 		case BPF_ALU | BPF_MOV | BPF_X:
-			if (is_ereg(dst_reg) || is_ereg(src_reg))
-				EMIT1(add_2mod(0x40, dst_reg, src_reg));
-			EMIT2(0x89, add_2reg(0xC0, dst_reg, src_reg));
+			emit_mov_reg(&prog,
+				     BPF_CLASS(insn->code) == BPF_ALU64,
+				     dst_reg, src_reg);
 			break;
 
 			/* neg dst */
@@ -615,6 +628,9 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 		case BPF_ALU | BPF_MUL | BPF_X:
 		case BPF_ALU64 | BPF_MUL | BPF_K:
 		case BPF_ALU64 | BPF_MUL | BPF_X:
+		{
+			bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
+
 			if (dst_reg != BPF_REG_0)
 				EMIT1(0x50); /* push rax */
 			if (dst_reg != BPF_REG_3)
@@ -624,14 +640,11 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 			EMIT_mov(AUX_REG, dst_reg);
 
 			if (BPF_SRC(insn->code) == BPF_X)
-				/* mov rax, src_reg */
-				EMIT_mov(BPF_REG_0, src_reg);
+				emit_mov_reg(&prog, is64, BPF_REG_0, src_reg);
 			else
-				/* mov rax, imm32 */
-				emit_mov_imm32(&prog, true,
-					       BPF_REG_0, imm32);
+				emit_mov_imm32(&prog, is64, BPF_REG_0, imm32);
 
-			if (BPF_CLASS(insn->code) == BPF_ALU64)
+			if (is64)
 				EMIT1(add_1mod(0x48, AUX_REG));
 			else if (is_ereg(AUX_REG))
 				EMIT1(add_1mod(0x40, AUX_REG));
@@ -646,7 +659,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 				EMIT1(0x58); /* pop rax */
 			}
 			break;
-
+		}
 			/* shifts */
 		case BPF_ALU | BPF_LSH | BPF_K:
 		case BPF_ALU | BPF_RSH | BPF_K:

From 0869175220b339b81de48872c8198c3ed75782e3 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 24 Feb 2018 01:08:02 +0100
Subject: [PATCH 16/18] bpf, x64: save 5 bytes in prologue when ebpf insns came
 from cbpf

While it's rather cumbersome to reduce prologue for cBPF->eBPF
migrations wrt spill/fill for r15 which is callee saved register
due to bpf_error path in bpf_jit.S that is both used by migrations
as well as native eBPF, we can still trivially save 5 bytes in
prologue for the former since tail calls can never be used there.
cBPF->eBPF migrations also have their own custom prologue in BPF
asm that xors A and X reg anyway, so it's fine we skip this here.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/x86/net/bpf_jit_comp.c | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 5b8fc1326aa1..70f9748da7aa 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -216,7 +216,7 @@ struct jit_context {
 /* emit x64 prologue code for BPF program and check it's size.
  * bpf_tail_call helper will skip it while jumping into another program
  */
-static void emit_prologue(u8 **pprog, u32 stack_depth)
+static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
 {
 	u8 *prog = *pprog;
 	int cnt = 0;
@@ -251,18 +251,21 @@ static void emit_prologue(u8 **pprog, u32 stack_depth)
 	/* mov qword ptr [rbp+24],r15 */
 	EMIT4(0x4C, 0x89, 0x7D, 24);
 
-	/* Clear the tail call counter (tail_call_cnt): for eBPF tail calls
-	 * we need to reset the counter to 0. It's done in two instructions,
-	 * resetting rax register to 0 (xor on eax gets 0 extended), and
-	 * moving it to the counter location.
-	 */
+	if (!ebpf_from_cbpf) {
+		/* Clear the tail call counter (tail_call_cnt): for eBPF tail
+		 * calls we need to reset the counter to 0. It's done in two
+		 * instructions, resetting rax register to 0, and moving it
+		 * to the counter location.
+		 */
 
-	/* xor eax, eax */
-	EMIT2(0x31, 0xc0);
-	/* mov qword ptr [rbp+32], rax */
-	EMIT4(0x48, 0x89, 0x45, 32);
+		/* xor eax, eax */
+		EMIT2(0x31, 0xc0);
+		/* mov qword ptr [rbp+32], rax */
+		EMIT4(0x48, 0x89, 0x45, 32);
+
+		BUILD_BUG_ON(cnt != PROLOGUE_SIZE);
+	}
 
-	BUILD_BUG_ON(cnt != PROLOGUE_SIZE);
 	*pprog = prog;
 }
 
@@ -453,7 +456,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 	int proglen = 0;
 	u8 *prog = temp;
 
-	emit_prologue(&prog, bpf_prog->aux->stack_depth);
+	emit_prologue(&prog, bpf_prog->aux->stack_depth,
+		      bpf_prog_was_classic(bpf_prog));
 
 	if (seen_ld_abs)
 		emit_load_skb_data_hlen(&prog);

From 23d191a82c133c31bb85aa4b4b26851cd4a4b4ac Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 24 Feb 2018 01:08:03 +0100
Subject: [PATCH 17/18] bpf: add various jit test cases

Add a few test cases that check the run-time results under JIT.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/test_verifier.c | 89 +++++++++++++++++++++
 1 file changed, 89 insertions(+)

diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 2971ba2829ac..c987d3a2426f 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -11140,6 +11140,95 @@ static struct bpf_test tests[] = {
 		.result = REJECT,
 		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
 	},
+	{
+		"jit: lsh, rsh, arsh by 1",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_MOV64_IMM(BPF_REG_1, 0xff),
+			BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 1),
+			BPF_ALU32_IMM(BPF_LSH, BPF_REG_1, 1),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0x3fc, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 1),
+			BPF_ALU32_IMM(BPF_RSH, BPF_REG_1, 1),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0xff, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU64_IMM(BPF_ARSH, BPF_REG_1, 1),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0x7f, 1),
+			BPF_EXIT_INSN(),
+			BPF_MOV64_IMM(BPF_REG_0, 2),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.retval = 2,
+	},
+	{
+		"jit: mov32 for ldimm64, 1",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 2),
+			BPF_LD_IMM64(BPF_REG_1, 0xfeffffffffffffffULL),
+			BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 32),
+			BPF_LD_IMM64(BPF_REG_2, 0xfeffffffULL),
+			BPF_JMP_REG(BPF_JEQ, BPF_REG_1, BPF_REG_2, 1),
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.retval = 2,
+	},
+	{
+		"jit: mov32 for ldimm64, 2",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_LD_IMM64(BPF_REG_1, 0x1ffffffffULL),
+			BPF_LD_IMM64(BPF_REG_2, 0xffffffffULL),
+			BPF_JMP_REG(BPF_JEQ, BPF_REG_1, BPF_REG_2, 1),
+			BPF_MOV64_IMM(BPF_REG_0, 2),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.retval = 2,
+	},
+	{
+		"jit: various mul tests",
+		.insns = {
+			BPF_LD_IMM64(BPF_REG_2, 0xeeff0d413122ULL),
+			BPF_LD_IMM64(BPF_REG_0, 0xfefefeULL),
+			BPF_LD_IMM64(BPF_REG_1, 0xefefefULL),
+			BPF_ALU64_REG(BPF_MUL, BPF_REG_0, BPF_REG_1),
+			BPF_JMP_REG(BPF_JEQ, BPF_REG_0, BPF_REG_2, 2),
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+			BPF_LD_IMM64(BPF_REG_3, 0xfefefeULL),
+			BPF_ALU64_REG(BPF_MUL, BPF_REG_3, BPF_REG_1),
+			BPF_JMP_REG(BPF_JEQ, BPF_REG_3, BPF_REG_2, 2),
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+			BPF_MOV32_REG(BPF_REG_2, BPF_REG_2),
+			BPF_LD_IMM64(BPF_REG_0, 0xfefefeULL),
+			BPF_ALU32_REG(BPF_MUL, BPF_REG_0, BPF_REG_1),
+			BPF_JMP_REG(BPF_JEQ, BPF_REG_0, BPF_REG_2, 2),
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+			BPF_LD_IMM64(BPF_REG_3, 0xfefefeULL),
+			BPF_ALU32_REG(BPF_MUL, BPF_REG_3, BPF_REG_1),
+			BPF_JMP_REG(BPF_JEQ, BPF_REG_3, BPF_REG_2, 2),
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+			BPF_LD_IMM64(BPF_REG_0, 0x952a7bbcULL),
+			BPF_LD_IMM64(BPF_REG_1, 0xfefefeULL),
+			BPF_LD_IMM64(BPF_REG_2, 0xeeff0d413122ULL),
+			BPF_ALU32_REG(BPF_MUL, BPF_REG_2, BPF_REG_1),
+			BPF_JMP_REG(BPF_JEQ, BPF_REG_2, BPF_REG_0, 2),
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+			BPF_MOV64_IMM(BPF_REG_0, 2),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.retval = 2,
+	},
+
 };
 
 static int probe_filter_length(const struct bpf_insn *fp)

From c53507778998d45543b27266742d04cd384de356 Mon Sep 17 00:00:00 2001
From: Leo Yan <leo.yan@linaro.org>
Date: Mon, 26 Feb 2018 09:19:12 +0800
Subject: [PATCH 18/18] samples/bpf: Add program for CPU state statistics

A CPU is active when it has running tasks on it, and the CPUFreq
governor can select different operating points (OPP) according to the
workload; we use 'pstate' to represent the CPU state in which tasks are
running at one specific OPP.  On the other hand, a CPU is idle when only
the idle task is on it, and the CPUIdle governor can select one specific
idle state to power off hardware logic; we use 'cstate' to represent the
CPU idle state.

Based on the trace events 'cpu_idle' and 'cpu_frequency' we can collect
duration statistics for every state.  Every time a CPU enters or exits
an idle state, the trace event 'cpu_idle' is recorded; the trace event
'cpu_frequency' records CPU OPP changes, so it is easy to know how long
the CPU stays at a specific OPP, provided the CPU is not in any idle
state.

This patch utilizes the mentioned trace events for pstate and cstate
statistics.  To achieve more accurate profiling data, the program uses
the sequence below to ensure CPU running/idle time isn't missed:

- Before profiling, the user space program wakes up all CPUs once, to
  avoid missing the time accounting for a CPU that stays in an idle
  state for a long time; the program then forces 'scaling_max_freq' to
  the lowest frequency and restores it to the highest frequency.  This
  ensures the frequency is set to the lowest frequency first, so that
  once the workload starts running the frequency can easily be changed
  to a higher one;

- The user space program reads the map data and updates the statistics
  every 5s; this is the same as other sample bpf programs and avoids
  the large overhead that the bpf program itself would introduce;

- When a signal is sent to terminate the program, the signal handler
  wakes up all CPUs, sets the lowest frequency and then restores the
  highest frequency to 'scaling_max_freq'; this is exactly the same as
  the first step and avoids missing CPU pstate and cstate time during
  the last stage.  Finally it reports the latest statistics.

The program has been tested on a Hikey board with octa CA53 CPUs; below
is one example of the statistics output, whose format mainly follows
Jesper Dangaard Brouer's suggestion.

Jesper suggested to 'get printf to pretty print with thousands separators
use %' and setlocale(LC_NUMERIC, "en_US")'.  I tried three different arm64
GCC toolchains (5.4.0 20160609, 6.2.1 20161016, 6.3.0 20170516), but none
of them supports the printf flag character %' on the arm64 platform, so
the numbers are printed without grouping.

CPU states statistics:
state(ms)  cstate-0    cstate-1    cstate-2    pstate-0    pstate-1    pstate-2    pstate-3    pstate-4
CPU-0      767         6111        111863      561         31          756         853         190
CPU-1      241         10606       107956      484         125         646         990         85
CPU-2      413         19721       98735       636         84          696         757         89
CPU-3      84          11711       79989       17516       909         4811        5773        341
CPU-4      152         19610       98229       444         53          649         708         1283
CPU-5      185         8781        108697      666         91          671         677         1365
CPU-6      157         21964       95825       581         67          566         684         1284
CPU-7      125         15238       102704      398         20          665         786         1197

Cc: Daniel Lezcano <daniel.lezcano@linaro.org>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Leo Yan <leo.yan@linaro.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 samples/bpf/Makefile       |   4 +
 samples/bpf/cpustat_kern.c | 281 +++++++++++++++++++++++++++++++++++++
 samples/bpf/cpustat_user.c | 219 +++++++++++++++++++++++++++++
 3 files changed, 504 insertions(+)
 create mode 100644 samples/bpf/cpustat_kern.c
 create mode 100644 samples/bpf/cpustat_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index ec3fc8d88e87..2c2a587e0942 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -43,6 +43,7 @@ hostprogs-y += xdp_redirect_cpu
 hostprogs-y += xdp_monitor
 hostprogs-y += xdp_rxq_info
 hostprogs-y += syscall_tp
+hostprogs-y += cpustat
 
 # Libbpf dependencies
 LIBBPF := ../../tools/lib/bpf/bpf.o ../../tools/lib/bpf/nlattr.o
@@ -93,6 +94,7 @@ xdp_redirect_cpu-objs := bpf_load.o $(LIBBPF) xdp_redirect_cpu_user.o
 xdp_monitor-objs := bpf_load.o $(LIBBPF) xdp_monitor_user.o
 xdp_rxq_info-objs := bpf_load.o $(LIBBPF) xdp_rxq_info_user.o
 syscall_tp-objs := bpf_load.o $(LIBBPF) syscall_tp_user.o
+cpustat-objs := bpf_load.o $(LIBBPF) cpustat_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -144,6 +146,7 @@ always += xdp_monitor_kern.o
 always += xdp_rxq_info_kern.o
 always += xdp2skb_meta_kern.o
 always += syscall_tp_kern.o
+always += cpustat_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS += -I$(srctree)/tools/lib/
@@ -188,6 +191,7 @@ HOSTLOADLIBES_xdp_redirect_cpu += -lelf
 HOSTLOADLIBES_xdp_monitor += -lelf
 HOSTLOADLIBES_xdp_rxq_info += -lelf
 HOSTLOADLIBES_syscall_tp += -lelf
+HOSTLOADLIBES_cpustat += -lelf
 
 # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:
 #  make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
diff --git a/samples/bpf/cpustat_kern.c b/samples/bpf/cpustat_kern.c
new file mode 100644
index 000000000000..68c84da065b1
--- /dev/null
+++ b/samples/bpf/cpustat_kern.c
@@ -0,0 +1,281 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/version.h>
+#include <linux/ptrace.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+/*
+ * The CPU number, cstate number and pstate number are based
+ * on 96boards Hikey with octa CA53 CPUs.
+ *
+ * Every CPU has three idle states for cstate:
+ *   WFI, CPU_OFF, CLUSTER_OFF
+ *
+ * Every CPU has 5 operating points:
+ *   208MHz, 432MHz, 729MHz, 960MHz, 1200MHz
+ *
+ * This code is based on these assumptions; other platforms
+ * need to adjust these definitions.
+ */
+#define MAX_CPU			8
+#define MAX_PSTATE_ENTRIES	5
+#define MAX_CSTATE_ENTRIES	3
+
+static int cpu_opps[] = { 208000, 432000, 729000, 960000, 1200000 };
+
+/*
+ * my_map structure is used to record cstate and pstate index and
+ * timestamp (Idx, Ts); when a new event comes in we need to update
+ * the combination with the new state index and timestamp (Idx`, Ts`).
+ *
+ * Based on (Idx, Ts) and (Idx`, Ts`) we can calculate the time
+ * interval for the previous state: Duration(Idx) = Ts` - Ts.
+ *
+ * Every CPU has one array as below for recording state index and
+ * timestamp, with cstate and pstate recorded separately:
+ *
+ * +--------------------------+
+ * | cstate timestamp         |
+ * +--------------------------+
+ * | cstate index             |
+ * +--------------------------+
+ * | pstate timestamp         |
+ * +--------------------------+
+ * | pstate index             |
+ * +--------------------------+
+ */
+#define MAP_OFF_CSTATE_TIME	0
+#define MAP_OFF_CSTATE_IDX	1
+#define MAP_OFF_PSTATE_TIME	2
+#define MAP_OFF_PSTATE_IDX	3
+#define MAP_OFF_NUM		4
+
+struct bpf_map_def SEC("maps") my_map = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(u32),
+	.value_size = sizeof(u64),
+	.max_entries = MAX_CPU * MAP_OFF_NUM,
+};
+
+/* cstate_duration records duration time for every idle state per CPU */
+struct bpf_map_def SEC("maps") cstate_duration = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(u32),
+	.value_size = sizeof(u64),
+	.max_entries = MAX_CPU * MAX_CSTATE_ENTRIES,
+};
+
+/* pstate_duration records duration time for every operating point per CPU */
+struct bpf_map_def SEC("maps") pstate_duration = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(u32),
+	.value_size = sizeof(u64),
+	.max_entries = MAX_CPU * MAX_PSTATE_ENTRIES,
+};
+
+/*
+ * The trace events for cpu_idle and cpu_frequency are taken from:
+ * /sys/kernel/debug/tracing/events/power/cpu_idle/format
+ * /sys/kernel/debug/tracing/events/power/cpu_frequency/format
+ *
+ * These two events have the same format, so define one common structure.
+ */
+struct cpu_args {
+	u64 pad;
+	u32 state;
+	u32 cpu_id;
+};
+
+/* calculate pstate index, returns MAX_PSTATE_ENTRIES for failure */
+static u32 find_cpu_pstate_idx(u32 frequency)
+{
+	u32 i;
+
+	for (i = 0; i < sizeof(cpu_opps) / sizeof(u32); i++) {
+		if (frequency == cpu_opps[i])
+			return i;
+	}
+
+	return i;
+}
+
+SEC("tracepoint/power/cpu_idle")
+int bpf_prog1(struct cpu_args *ctx)
+{
+	u64 *cts, *pts, *cstate, *pstate, prev_state, cur_ts, delta;
+	u32 key, cpu, pstate_idx;
+	u64 *val;
+
+	if (ctx->cpu_id > MAX_CPU)
+		return 0;
+
+	cpu = ctx->cpu_id;
+
+	key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_TIME;
+	cts = bpf_map_lookup_elem(&my_map, &key);
+	if (!cts)
+		return 0;
+
+	key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_IDX;
+	cstate = bpf_map_lookup_elem(&my_map, &key);
+	if (!cstate)
+		return 0;
+
+	key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_TIME;
+	pts = bpf_map_lookup_elem(&my_map, &key);
+	if (!pts)
+		return 0;
+
+	key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_IDX;
+	pstate = bpf_map_lookup_elem(&my_map, &key);
+	if (!pstate)
+		return 0;
+
+	prev_state = *cstate;
+	*cstate = ctx->state;
+
+	if (!*cts) {
+		*cts = bpf_ktime_get_ns();
+		return 0;
+	}
+
+	cur_ts = bpf_ktime_get_ns();
+	delta = cur_ts - *cts;
+	*cts = cur_ts;
+
+	/*
+	 * When state is not equal to (u32)-1, the cpu is entering
+	 * an idle state; in this case we need to record the interval
+	 * for the pstate.
+	 *
+	 *                 OPP2
+	 *            +---------------------+
+	 *     OPP1   |                     |
+	 *   ---------+                     |
+	 *                                  |  Idle state
+	 *                                  +---------------
+	 *
+	 *            |<- pstate duration ->|
+	 *            ^                     ^
+	 *           pts                  cur_ts
+	 */
+	if (ctx->state != (u32)-1) {
+
+		/* only record pstate after the first cpu_frequency event */
+		if (!*pts)
+			return 0;
+
+		delta = cur_ts - *pts;
+
+		pstate_idx = find_cpu_pstate_idx(*pstate);
+		if (pstate_idx >= MAX_PSTATE_ENTRIES)
+			return 0;
+
+		key = cpu * MAX_PSTATE_ENTRIES + pstate_idx;
+		val = bpf_map_lookup_elem(&pstate_duration, &key);
+		if (val)
+			__sync_fetch_and_add((long *)val, delta);
+
+	/*
+	 * When state is equal to (u32)-1, the cpu has just exited from
+	 * one specific idle state; in this case we need to record the
+	 * interval for the cstate.
+	 *
+	 *       OPP2
+	 *   -----------+
+	 *              |                          OPP1
+	 *              |                     +-----------
+	 *              |     Idle state      |
+	 *              +---------------------+
+	 *
+	 *              |<- cstate duration ->|
+	 *              ^                     ^
+	 *             cts                  cur_ts
+	 */
+	} else {
+
+		key = cpu * MAX_CSTATE_ENTRIES + prev_state;
+		val = bpf_map_lookup_elem(&cstate_duration, &key);
+		if (val)
+			__sync_fetch_and_add((long *)val, delta);
+	}
+
+	/* Update timestamp for pstate as new start time */
+	if (*pts)
+		*pts = cur_ts;
+
+	return 0;
+}
+
+SEC("tracepoint/power/cpu_frequency")
+int bpf_prog2(struct cpu_args *ctx)
+{
+	u64 *pts, *cstate, *pstate, prev_state, cur_ts, delta;
+	u32 key, cpu, pstate_idx;
+	u64 *val;
+
+	cpu = ctx->cpu_id;
+
+	key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_TIME;
+	pts = bpf_map_lookup_elem(&my_map, &key);
+	if (!pts)
+		return 0;
+
+	key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_IDX;
+	pstate = bpf_map_lookup_elem(&my_map, &key);
+	if (!pstate)
+		return 0;
+
+	key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_IDX;
+	cstate = bpf_map_lookup_elem(&my_map, &key);
+	if (!cstate)
+		return 0;
+
+	prev_state = *pstate;
+	*pstate = ctx->state;
+
+	if (!*pts) {
+		*pts = bpf_ktime_get_ns();
+		return 0;
+	}
+
+	cur_ts = bpf_ktime_get_ns();
+	delta = cur_ts - *pts;
+	*pts = cur_ts;
+
+	/* When CPU is in idle, bail out to skip pstate statistics */
+	if (*cstate != (u32)(-1))
+		return 0;
+
+	/*
+	 * The cpu changes to a different OPP (in the diagram below the
+	 * frequency changes from OPP3 to OPP1); we need to record the
+	 * interval for the previous frequency OPP3 and update the
+	 * timestamp as the start time for the new frequency OPP1.
+	 *
+	 *                 OPP3
+	 *            +---------------------+
+	 *     OPP2   |                     |
+	 *   ---------+                     |
+	 *                                  |    OPP1
+	 *                                  +---------------
+	 *
+	 *            |<- pstate duration ->|
+	 *            ^                     ^
+	 *           pts                  cur_ts
+	 */
+	pstate_idx = find_cpu_pstate_idx(*pstate);
+	if (pstate_idx >= MAX_PSTATE_ENTRIES)
+		return 0;
+
+	key = cpu * MAX_PSTATE_ENTRIES + pstate_idx;
+	val = bpf_map_lookup_elem(&pstate_duration, &key);
+	if (val)
+		__sync_fetch_and_add((long *)val, delta);
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/cpustat_user.c b/samples/bpf/cpustat_user.c
new file mode 100644
index 000000000000..2b4cd1ae57c5
--- /dev/null
+++ b/samples/bpf/cpustat_user.c
@@ -0,0 +1,219 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <sched.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <linux/bpf.h>
+#include <locale.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <sys/wait.h>
+
+#include "libbpf.h"
+#include "bpf_load.h"
+
+#define MAX_CPU			8
+#define MAX_PSTATE_ENTRIES	5
+#define MAX_CSTATE_ENTRIES	3
+#define MAX_STARS		40
+
+#define CPUFREQ_MAX_SYSFS_PATH	"/sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq"
+#define CPUFREQ_LOWEST_FREQ	"208000"
+#define CPUFREQ_HIGHEST_FREQ	"12000000"
+
+struct cpu_stat_data {
+	unsigned long cstate[MAX_CSTATE_ENTRIES];
+	unsigned long pstate[MAX_PSTATE_ENTRIES];
+};
+
+static struct cpu_stat_data stat_data[MAX_CPU];
+
+static void cpu_stat_print(void)
+{
+	int i, j;
+	char state_str[sizeof("cstate-9")];
+	struct cpu_stat_data *data;
+
+	/* Clear screen */
+	printf("\033[2J");
+
+	/* Header */
+	printf("\nCPU states statistics:\n");
+	printf("%-10s ", "state(ms)");
+
+	for (i = 0; i < MAX_CSTATE_ENTRIES; i++) {
+		sprintf(state_str, "cstate-%d", i);
+		printf("%-11s ", state_str);
+	}
+
+	for (i = 0; i < MAX_PSTATE_ENTRIES; i++) {
+		sprintf(state_str, "pstate-%d", i);
+		printf("%-11s ", state_str);
+	}
+
+	printf("\n");
+
+	for (j = 0; j < MAX_CPU; j++) {
+		data = &stat_data[j];
+
+		printf("CPU-%-6d ", j);
+		for (i = 0; i < MAX_CSTATE_ENTRIES; i++)
+			printf("%-11ld ", data->cstate[i] / 1000000);
+
+		for (i = 0; i < MAX_PSTATE_ENTRIES; i++)
+			printf("%-11ld ", data->pstate[i] / 1000000);
+
+		printf("\n");
+	}
+}
+
+static void cpu_stat_update(int cstate_fd, int pstate_fd)
+{
+	unsigned long key, value;
+	int c, i;
+
+	for (c = 0; c < MAX_CPU; c++) {
+		for (i = 0; i < MAX_CSTATE_ENTRIES; i++) {
+			key = c * MAX_CSTATE_ENTRIES + i;
+			bpf_map_lookup_elem(cstate_fd, &key, &value);
+			stat_data[c].cstate[i] = value;
+		}
+
+		for (i = 0; i < MAX_PSTATE_ENTRIES; i++) {
+			key = c * MAX_PSTATE_ENTRIES + i;
+			bpf_map_lookup_elem(pstate_fd, &key, &value);
+			stat_data[c].pstate[i] = value;
+		}
+	}
+}
+
+/*
+ * This function is copied from 'idlestat' tool function
+ * idlestat_wake_all() in idlestate.c.
+ *
+ * It sets the affinity of the running task to each cpu one by one so as to
+ * wake up that specific CPU to handle scheduling; as a result every cpu is
+ * woken up once and produces the ftrace event 'trace_cpu_idle'.
+ */
+static int cpu_stat_inject_cpu_idle_event(void)
+{
+	int rcpu, i, ret;
+	cpu_set_t cpumask;
+	cpu_set_t original_cpumask;
+
+	ret = sysconf(_SC_NPROCESSORS_CONF);
+	if (ret < 0)
+		return -1;
+
+	rcpu = sched_getcpu();
+	if (rcpu < 0)
+		return -1;
+
+	/* Keep track of the CPUs we will run on */
+	sched_getaffinity(0, sizeof(original_cpumask), &original_cpumask);
+
+	for (i = 0; i < ret; i++) {
+
+		/* Pointless to wake up ourself */
+		if (i == rcpu)
+			continue;
+
+		/* Pointless to wake CPUs we will not run on */
+		if (!CPU_ISSET(i, &original_cpumask))
+			continue;
+
+		CPU_ZERO(&cpumask);
+		CPU_SET(i, &cpumask);
+
+		sched_setaffinity(0, sizeof(cpumask), &cpumask);
+	}
+
+	/* Enable all the CPUs of the original mask */
+	sched_setaffinity(0, sizeof(original_cpumask), &original_cpumask);
+	return 0;
+}
+
+/*
+ * It's possible that there is no frequency change for a long time, so no
+ * ftrace event 'trace_cpu_frequency' is seen for a long period; this
+ * introduces a big deviation in the pstate statistics.
+ *
+ * To solve this issue, the code below forces 'scaling_max_freq' to 208MHz
+ * to trigger the ftrace event 'trace_cpu_frequency' and then restores it to
+ * the maximum frequency value 1.2GHz.
+ */
+static int cpu_stat_inject_cpu_frequency_event(void)
+{
+	int len, fd;
+
+	fd = open(CPUFREQ_MAX_SYSFS_PATH, O_WRONLY);
+	if (fd < 0) {
+		printf("failed to open scaling_max_freq, errno=%d\n", errno);
+		return fd;
+	}
+
+	len = write(fd, CPUFREQ_LOWEST_FREQ, strlen(CPUFREQ_LOWEST_FREQ));
+	if (len < 0) {
+		printf("failed to open scaling_max_freq, errno=%d\n", errno);
+		goto err;
+	}
+
+	len = write(fd, CPUFREQ_HIGHEST_FREQ, strlen(CPUFREQ_HIGHEST_FREQ));
+	if (len < 0) {
+		printf("failed to open scaling_max_freq, errno=%d\n", errno);
+		goto err;
+	}
+
+err:
+	close(fd);
+	return len;
+}
+
+static void int_exit(int sig)
+{
+	cpu_stat_inject_cpu_idle_event();
+	cpu_stat_inject_cpu_frequency_event();
+	cpu_stat_update(map_fd[1], map_fd[2]);
+	cpu_stat_print();
+	exit(0);
+}
+
+int main(int argc, char **argv)
+{
+	char filename[256];
+	int ret;
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	ret = cpu_stat_inject_cpu_idle_event();
+	if (ret < 0)
+		return 1;
+
+	ret = cpu_stat_inject_cpu_frequency_event();
+	if (ret < 0)
+		return 1;
+
+	signal(SIGINT, int_exit);
+	signal(SIGTERM, int_exit);
+
+	while (1) {
+		cpu_stat_update(map_fd[1], map_fd[2]);
+		cpu_stat_print();
+		sleep(5);
+	}
+
+	return 0;
+}