From 0e53d9e5e82056555020c47ee0c7a087147be084 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 27 Mar 2020 16:58:50 +0100
Subject: [PATCH 1/7] bpf: Enable retrieval of socket cookie for bind/post-bind
 hook

We currently make heavy use of the socket cookie in BPF's connect(),
sendmsg() and recvmsg() hooks for load-balancing decisions. However,
it is currently not enabled/implemented in BPF {post-}bind hooks
where it can later be used in combination for correlation in the tc
egress path, for example.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/e9d71f310715332f12d238cc650c1edc5be55119.1585323121.git.daniel@iogearbox.net
---
 net/core/filter.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/net/core/filter.c b/net/core/filter.c
index 96350a743539..0b6682517d45 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4117,6 +4117,18 @@ static const struct bpf_func_proto bpf_get_socket_cookie_sock_addr_proto = {
 	.arg1_type	= ARG_PTR_TO_CTX,
 };
 
+BPF_CALL_1(bpf_get_socket_cookie_sock, struct sock *, ctx)
+{
+	return sock_gen_cookie(ctx);
+}
+
+static const struct bpf_func_proto bpf_get_socket_cookie_sock_proto = {
+	.func		= bpf_get_socket_cookie_sock,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+};
+
 BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx)
 {
 	return sock_gen_cookie(ctx->sk);
@@ -5954,6 +5966,8 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_current_uid_gid_proto;
 	case BPF_FUNC_get_local_storage:
 		return &bpf_get_local_storage_proto;
+	case BPF_FUNC_get_socket_cookie:
+		return &bpf_get_socket_cookie_sock_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}

From fcf752ea8c9493acc33280e72112ea6108a0d104 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 27 Mar 2020 16:58:51 +0100
Subject: [PATCH 2/7] bpf: Enable perf event rb output for bpf cgroup progs

Currently, connect(), sendmsg(), recvmsg() and bind-related hooks
are all lacking perf event rb output in order to push notifications
or monitoring events up to user space. Back in commit a5a3a828cd00
("bpf: add perf event notificaton support for sock_ops"), I've worked
with Sowmini to enable them for sock_ops where the context part is
not used (as opposed to skbs for example where the packet data can
be appended). Make the bpf_sockopt_event_output() helper generic and
enable it for mentioned hooks.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/69c39daf87e076b31e52473c902e9bfd37559124.1585323121.git.daniel@iogearbox.net
---
 net/core/filter.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 0b6682517d45..6cb7e0e24473 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4159,8 +4159,8 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = {
 	.arg1_type      = ARG_PTR_TO_CTX,
 };
 
-BPF_CALL_5(bpf_sockopt_event_output, struct bpf_sock_ops_kern *, bpf_sock,
-	   struct bpf_map *, map, u64, flags, void *, data, u64, size)
+BPF_CALL_5(bpf_event_output_data, void *, ctx, struct bpf_map *, map, u64, flags,
+	   void *, data, u64, size)
 {
 	if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
 		return -EINVAL;
@@ -4168,8 +4168,8 @@ BPF_CALL_5(bpf_sockopt_event_output, struct bpf_sock_ops_kern *, bpf_sock,
 	return bpf_event_output(map, flags, data, size, NULL, 0, NULL);
 }
 
-static const struct bpf_func_proto bpf_sockopt_event_output_proto =  {
-	.func		= bpf_sockopt_event_output,
+static const struct bpf_func_proto bpf_event_output_data_proto =  {
+	.func		= bpf_event_output_data,
 	.gpl_only       = true,
 	.ret_type       = RET_INTEGER,
 	.arg1_type      = ARG_PTR_TO_CTX,
@@ -5968,6 +5968,8 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_local_storage_proto;
 	case BPF_FUNC_get_socket_cookie:
 		return &bpf_get_socket_cookie_sock_proto;
+	case BPF_FUNC_perf_event_output:
+		return &bpf_event_output_data_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -5994,6 +5996,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_socket_cookie_sock_addr_proto;
 	case BPF_FUNC_get_local_storage:
 		return &bpf_get_local_storage_proto;
+	case BPF_FUNC_perf_event_output:
+		return &bpf_event_output_data_proto;
 #ifdef CONFIG_INET
 	case BPF_FUNC_sk_lookup_tcp:
 		return &bpf_sock_addr_sk_lookup_tcp_proto;
@@ -6236,7 +6240,7 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_get_local_storage:
 		return &bpf_get_local_storage_proto;
 	case BPF_FUNC_perf_event_output:
-		return &bpf_sockopt_event_output_proto;
+		return &bpf_event_output_data_proto;
 	case BPF_FUNC_sk_storage_get:
 		return &bpf_sk_storage_get_proto;
 	case BPF_FUNC_sk_storage_delete:

From f318903c0bf42448b4c884732df2bbb0ef7a2284 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 27 Mar 2020 16:58:52 +0100
Subject: [PATCH 3/7] bpf: Add netns cookie and enable it for bpf cgroup hooks

In Cilium we're mainly using BPF cgroup hooks today in order to implement
kube-proxy free Kubernetes service translation for ClusterIP, NodePort (*),
ExternalIP, and LoadBalancer as well as HostPort mapping [0] for all traffic
between Cilium managed nodes. While this works in its current shape and avoids
packet-level NAT for inter Cilium managed node traffic, there is one major
limitation we're facing today, that is, lack of netns awareness.

In Kubernetes, the concept of Pods (which hold one or multiple containers)
has been built around network namespaces, so while we can use the global scope
of attaching to root BPF cgroup hooks also to our advantage (e.g. for exposing
NodePort ports on loopback addresses), we also have the need to differentiate
between initial network namespaces and non-initial one. For example, ExternalIP
services mandate that non-local service IPs are not to be translated from the
host (initial) network namespace as one example. Right now, we have an ugly
work-around in place where non-local service IPs for ExternalIP services are
not xlated from connect() and friends BPF hooks but instead via less efficient
packet-level NAT on the veth tc ingress hook for Pod traffic.

On top of determining whether we're in initial or non-initial network namespace
we also have a need for a socket-cookie like mechanism for network namespaces
scope. Socket cookies have the nice property that they can be combined as part
of the key structure e.g. for BPF LRU maps without having to worry that the
cookie could be recycled. We are planning to use this for our sessionAffinity
implementation for services. Therefore, add a new bpf_get_netns_cookie() helper
which would resolve both use cases at once: bpf_get_netns_cookie(NULL) would
provide the cookie for the initial network namespace while passing the context
instead of NULL would provide the cookie from the application's network namespace.
We're using a hole, so no size increase; the assignment happens only once.
Therefore this allows for a comparison on initial namespace as well as regular
cookie usage as we have today with socket cookies. We could later on enable
this helper for other program types as well as we would see need.

  (*) Both externalTrafficPolicy={Local|Cluster} types
  [0] https://github.com/cilium/cilium/blob/master/bpf/bpf_sock.c

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/c47d2346982693a9cf9da0e12690453aded4c788.1585323121.git.daniel@iogearbox.net
---
 include/linux/bpf.h            |  1 +
 include/net/net_namespace.h    | 10 +++++++++
 include/uapi/linux/bpf.h       | 16 ++++++++++++++-
 kernel/bpf/verifier.c          | 16 +++++++++------
 net/core/filter.c              | 37 ++++++++++++++++++++++++++++++++++
 net/core/net_namespace.c       | 15 ++++++++++++++
 tools/include/uapi/linux/bpf.h | 16 ++++++++++++++-
 7 files changed, 103 insertions(+), 8 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index bdb981c204fa..78046c570596 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -233,6 +233,7 @@ enum bpf_arg_type {
 	ARG_CONST_SIZE_OR_ZERO,	/* number of bytes accessed from memory or 0 */
 
 	ARG_PTR_TO_CTX,		/* pointer to context */
+	ARG_PTR_TO_CTX_OR_NULL,	/* pointer to context or NULL */
 	ARG_ANYTHING,		/* any (initialized) argument is ok */
 	ARG_PTR_TO_SPIN_LOCK,	/* pointer to bpf_spin_lock */
 	ARG_PTR_TO_SOCK_COMMON,	/* pointer to sock_common */
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 854d39ef1ca3..1c6edfdb9a2c 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -168,6 +168,9 @@ struct net {
 #ifdef CONFIG_XFRM
 	struct netns_xfrm	xfrm;
 #endif
+
+	atomic64_t		net_cookie; /* written once */
+
 #if IS_ENABLED(CONFIG_IP_VS)
 	struct netns_ipvs	*ipvs;
 #endif
@@ -273,6 +276,8 @@ static inline int check_net(const struct net *net)
 
 void net_drop_ns(void *);
 
+u64 net_gen_cookie(struct net *net);
+
 #else
 
 static inline struct net *get_net(struct net *net)
@@ -300,6 +305,11 @@ static inline int check_net(const struct net *net)
 	return 1;
 }
 
+static inline u64 net_gen_cookie(struct net *net)
+{
+	return 0;
+}
+
 #define net_drop_ns NULL
 #endif
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 5d01c5c7e598..bd81c4555206 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2950,6 +2950,19 @@ union bpf_attr {
  *		restricted to raw_tracepoint bpf programs.
  *	Return
  *		0 on success, or a negative error in case of failure.
+ *
+ * u64 bpf_get_netns_cookie(void *ctx)
+ * 	Description
+ * 		Retrieve the cookie (generated by the kernel) of the network
+ * 		namespace the input *ctx* is associated with. The network
+ * 		namespace cookie remains stable for its lifetime and provides
+ * 		a global identifier that can be assumed unique. If *ctx* is
+ * 		NULL, then the helper returns the cookie for the initial
+ * 		network namespace. The cookie itself is very similar to that
+ * 		of bpf_get_socket_cookie() helper, but for network namespaces
+ * 		instead of sockets.
+ * 	Return
+ * 		A 8-byte long opaque number.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -3073,7 +3086,8 @@ union bpf_attr {
 	FN(jiffies64),			\
 	FN(read_branch_records),	\
 	FN(get_ns_current_pid_tgid),	\
-	FN(xdp_output),
+	FN(xdp_output),			\
+	FN(get_netns_cookie),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2ea2a868324e..46ba86c540e2 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3461,13 +3461,17 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		expected_type = CONST_PTR_TO_MAP;
 		if (type != expected_type)
 			goto err_type;
-	} else if (arg_type == ARG_PTR_TO_CTX) {
+	} else if (arg_type == ARG_PTR_TO_CTX ||
+		   arg_type == ARG_PTR_TO_CTX_OR_NULL) {
 		expected_type = PTR_TO_CTX;
-		if (type != expected_type)
-			goto err_type;
-		err = check_ctx_reg(env, reg, regno);
-		if (err < 0)
-			return err;
+		if (!(register_is_null(reg) &&
+		      arg_type == ARG_PTR_TO_CTX_OR_NULL)) {
+			if (type != expected_type)
+				goto err_type;
+			err = check_ctx_reg(env, reg, regno);
+			if (err < 0)
+				return err;
+		}
 	} else if (arg_type == ARG_PTR_TO_SOCK_COMMON) {
 		expected_type = PTR_TO_SOCK_COMMON;
 		/* Any sk pointer can be ARG_PTR_TO_SOCK_COMMON */
diff --git a/net/core/filter.c b/net/core/filter.c
index 6cb7e0e24473..e249a499cbe5 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4141,6 +4141,39 @@ static const struct bpf_func_proto bpf_get_socket_cookie_sock_ops_proto = {
 	.arg1_type	= ARG_PTR_TO_CTX,
 };
 
+static u64 __bpf_get_netns_cookie(struct sock *sk)
+{
+#ifdef CONFIG_NET_NS
+	return net_gen_cookie(sk ? sk->sk_net.net : &init_net);
+#else
+	return 0;
+#endif
+}
+
+BPF_CALL_1(bpf_get_netns_cookie_sock, struct sock *, ctx)
+{
+	return __bpf_get_netns_cookie(ctx);
+}
+
+static const struct bpf_func_proto bpf_get_netns_cookie_sock_proto = {
+	.func		= bpf_get_netns_cookie_sock,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX_OR_NULL,
+};
+
+BPF_CALL_1(bpf_get_netns_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx)
+{
+	return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL);
+}
+
+static const struct bpf_func_proto bpf_get_netns_cookie_sock_addr_proto = {
+	.func		= bpf_get_netns_cookie_sock_addr,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX_OR_NULL,
+};
+
 BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb)
 {
 	struct sock *sk = sk_to_full_sk(skb->sk);
@@ -5968,6 +6001,8 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_local_storage_proto;
 	case BPF_FUNC_get_socket_cookie:
 		return &bpf_get_socket_cookie_sock_proto;
+	case BPF_FUNC_get_netns_cookie:
+		return &bpf_get_netns_cookie_sock_proto;
 	case BPF_FUNC_perf_event_output:
 		return &bpf_event_output_data_proto;
 	default:
@@ -5994,6 +6029,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		}
 	case BPF_FUNC_get_socket_cookie:
 		return &bpf_get_socket_cookie_sock_addr_proto;
+	case BPF_FUNC_get_netns_cookie:
+		return &bpf_get_netns_cookie_sock_addr_proto;
 	case BPF_FUNC_get_local_storage:
 		return &bpf_get_local_storage_proto;
 	case BPF_FUNC_perf_event_output:
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 757cc1d084e7..190ca66a383b 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -69,6 +69,20 @@ EXPORT_SYMBOL_GPL(pernet_ops_rwsem);
 
 static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;
 
+static atomic64_t cookie_gen;
+
+u64 net_gen_cookie(struct net *net)
+{
+	while (1) {
+		u64 res = atomic64_read(&net->net_cookie);
+
+		if (res)
+			return res;
+		res = atomic64_inc_return(&cookie_gen);
+		atomic64_cmpxchg(&net->net_cookie, 0, res);
+	}
+}
+
 static struct net_generic *net_alloc_generic(void)
 {
 	struct net_generic *ng;
@@ -1087,6 +1101,7 @@ static int __init net_ns_init(void)
 		panic("Could not allocate generic netns");
 
 	rcu_assign_pointer(init_net.gen, ng);
+	net_gen_cookie(&init_net);
 
 	down_write(&pernet_ops_rwsem);
 	if (setup_net(&init_net, &init_user_ns))
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 5d01c5c7e598..bd81c4555206 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2950,6 +2950,19 @@ union bpf_attr {
  *		restricted to raw_tracepoint bpf programs.
  *	Return
  *		0 on success, or a negative error in case of failure.
+ *
+ * u64 bpf_get_netns_cookie(void *ctx)
+ * 	Description
+ * 		Retrieve the cookie (generated by the kernel) of the network
+ * 		namespace the input *ctx* is associated with. The network
+ * 		namespace cookie remains stable for its lifetime and provides
+ * 		a global identifier that can be assumed unique. If *ctx* is
+ * 		NULL, then the helper returns the cookie for the initial
+ * 		network namespace. The cookie itself is very similar to that
+ * 		of bpf_get_socket_cookie() helper, but for network namespaces
+ * 		instead of sockets.
+ * 	Return
+ * 		A 8-byte long opaque number.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -3073,7 +3086,8 @@ union bpf_attr {
 	FN(jiffies64),			\
 	FN(read_branch_records),	\
 	FN(get_ns_current_pid_tgid),	\
-	FN(xdp_output),
+	FN(xdp_output),			\
+	FN(get_netns_cookie),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call

From 5a52ae4e32a61ad06ef67f0b3123adbdbac4fb83 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 27 Mar 2020 16:58:53 +0100
Subject: [PATCH 4/7] bpf: Allow to retrieve cgroup v1 classid from v2 hooks

Today, Kubernetes is still operating on cgroups v1, however, it is
possible to retrieve the task's classid based on 'current' out of
connect(), sendmsg(), recvmsg() and bind-related hooks for orchestrators
which attach to the root cgroup v2 hook in a mixed env like in case
of Cilium, for example, in order to then correlate certain pod traffic
and use it as part of the key for BPF map lookups.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/555e1c69db7376c0947007b4951c260e1074efc3.1585323121.git.daniel@iogearbox.net
---
 include/net/cls_cgroup.h |  7 ++++++-
 net/core/filter.c        | 21 +++++++++++++++++++++
 2 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h
index 4295de3e6a4b..7e78e7d6f015 100644
--- a/include/net/cls_cgroup.h
+++ b/include/net/cls_cgroup.h
@@ -45,9 +45,14 @@ static inline void sock_update_classid(struct sock_cgroup_data *skcd)
 	sock_cgroup_set_classid(skcd, classid);
 }
 
+static inline u32 __task_get_classid(struct task_struct *task)
+{
+	return task_cls_state(task)->classid;
+}
+
 static inline u32 task_get_classid(const struct sk_buff *skb)
 {
-	u32 classid = task_cls_state(current)->classid;
+	u32 classid = __task_get_classid(current);
 
 	/* Due to the nature of the classifier it is required to ignore all
 	 * packets originating from softirq context as accessing `current'
diff --git a/net/core/filter.c b/net/core/filter.c
index e249a499cbe5..3083c7746ee0 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2642,6 +2642,19 @@ static const struct bpf_func_proto bpf_msg_pop_data_proto = {
 	.arg4_type	= ARG_ANYTHING,
 };
 
+#ifdef CONFIG_CGROUP_NET_CLASSID
+BPF_CALL_0(bpf_get_cgroup_classid_curr)
+{
+	return __task_get_classid(current);
+}
+
+static const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto = {
+	.func		= bpf_get_cgroup_classid_curr,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+};
+#endif
+
 BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
 {
 	return task_get_classid(skb);
@@ -6005,6 +6018,10 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_netns_cookie_sock_proto;
 	case BPF_FUNC_perf_event_output:
 		return &bpf_event_output_data_proto;
+#ifdef CONFIG_CGROUP_NET_CLASSID
+	case BPF_FUNC_get_cgroup_classid:
+		return &bpf_get_cgroup_classid_curr_proto;
+#endif
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -6035,6 +6052,10 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_local_storage_proto;
 	case BPF_FUNC_perf_event_output:
 		return &bpf_event_output_data_proto;
+#ifdef CONFIG_CGROUP_NET_CLASSID
+	case BPF_FUNC_get_cgroup_classid:
+		return &bpf_get_cgroup_classid_curr_proto;
+#endif
 #ifdef CONFIG_INET
 	case BPF_FUNC_sk_lookup_tcp:
 		return &bpf_sock_addr_sk_lookup_tcp_proto;

From 0f09abd105da6c37713d2b253730a86cb45e127a Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 27 Mar 2020 16:58:54 +0100
Subject: [PATCH 5/7] bpf: Enable bpf cgroup hooks to retrieve cgroup v2 and
 ancestor id

Enable the bpf_get_current_cgroup_id() helper for connect(), sendmsg(),
recvmsg() and bind-related hooks in order to retrieve the cgroup v2
context which can then be used as part of the key for BPF map lookups,
for example. Given these hooks operate in process context 'current' is
always valid and pointing to the app that is performing mentioned
syscalls if it's subject to a v2 cgroup. Also with same motivation of
commit 7723628101aa ("bpf: Introduce bpf_skb_ancestor_cgroup_id helper")
enable retrieval of ancestor from current so the cgroup id can be used
for policy lookups which can then forbid connect() / bind(), for example.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/d2a7ef42530ad299e3cbb245e6c12374b72145ef.1585323121.git.daniel@iogearbox.net
---
 include/linux/bpf.h            |  1 +
 include/uapi/linux/bpf.h       | 21 ++++++++++++++++++++-
 kernel/bpf/core.c              |  1 +
 kernel/bpf/helpers.c           | 18 ++++++++++++++++++
 net/core/filter.c              | 12 ++++++++++++
 tools/include/uapi/linux/bpf.h | 21 ++++++++++++++++++++-
 6 files changed, 72 insertions(+), 2 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 78046c570596..372708eeaecd 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1501,6 +1501,7 @@ extern const struct bpf_func_proto bpf_get_stack_proto;
 extern const struct bpf_func_proto bpf_sock_map_update_proto;
 extern const struct bpf_func_proto bpf_sock_hash_update_proto;
 extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto;
+extern const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto;
 extern const struct bpf_func_proto bpf_msg_redirect_hash_proto;
 extern const struct bpf_func_proto bpf_msg_redirect_map_proto;
 extern const struct bpf_func_proto bpf_sk_redirect_hash_proto;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index bd81c4555206..222ba11966e3 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2963,6 +2963,24 @@ union bpf_attr {
  * 		instead of sockets.
  * 	Return
  * 		A 8-byte long opaque number.
+ *
+ * u64 bpf_get_current_ancestor_cgroup_id(int ancestor_level)
+ * 	Description
+ * 		Return id of cgroup v2 that is ancestor of the cgroup associated
+ * 		with the current task at the *ancestor_level*. The root cgroup
+ * 		is at *ancestor_level* zero and each step down the hierarchy
+ * 		increments the level. If *ancestor_level* == level of cgroup
+ * 		associated with the current task, then return value will be the
+ * 		same as that of **bpf_get_current_cgroup_id**\ ().
+ *
+ * 		The helper is useful to implement policies based on cgroups
+ * 		that are upper in hierarchy than immediate cgroup associated
+ * 		with the current task.
+ *
+ * 		The format of returned id and helper limitations are same as in
+ * 		**bpf_get_current_cgroup_id**\ ().
+ * 	Return
+ * 		The id is returned or 0 in case the id could not be retrieved.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -3087,7 +3105,8 @@ union bpf_attr {
 	FN(read_branch_records),	\
 	FN(get_ns_current_pid_tgid),	\
 	FN(xdp_output),			\
-	FN(get_netns_cookie),
+	FN(get_netns_cookie),		\
+	FN(get_current_ancestor_cgroup_id),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 914f3463aa41..916f5132a984 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2156,6 +2156,7 @@ const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
 const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
 const struct bpf_func_proto bpf_get_current_comm_proto __weak;
 const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak;
+const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto __weak;
 const struct bpf_func_proto bpf_get_local_storage_proto __weak;
 const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto __weak;
 
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 01878db15eaf..bafc53ddd350 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -340,6 +340,24 @@ const struct bpf_func_proto bpf_get_current_cgroup_id_proto = {
 	.ret_type	= RET_INTEGER,
 };
 
+BPF_CALL_1(bpf_get_current_ancestor_cgroup_id, int, ancestor_level)
+{
+	struct cgroup *cgrp = task_dfl_cgroup(current);
+	struct cgroup *ancestor;
+
+	ancestor = cgroup_ancestor(cgrp, ancestor_level);
+	if (!ancestor)
+		return 0;
+	return cgroup_id(ancestor);
+}
+
+const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = {
+	.func		= bpf_get_current_ancestor_cgroup_id,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_ANYTHING,
+};
+
 #ifdef CONFIG_CGROUP_BPF
 DECLARE_PER_CPU(struct bpf_cgroup_storage*,
 		bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);
diff --git a/net/core/filter.c b/net/core/filter.c
index 3083c7746ee0..5cec3ac9e3dd 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -6018,6 +6018,12 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_netns_cookie_sock_proto;
 	case BPF_FUNC_perf_event_output:
 		return &bpf_event_output_data_proto;
+#ifdef CONFIG_CGROUPS
+	case BPF_FUNC_get_current_cgroup_id:
+		return &bpf_get_current_cgroup_id_proto;
+	case BPF_FUNC_get_current_ancestor_cgroup_id:
+		return &bpf_get_current_ancestor_cgroup_id_proto;
+#endif
 #ifdef CONFIG_CGROUP_NET_CLASSID
 	case BPF_FUNC_get_cgroup_classid:
 		return &bpf_get_cgroup_classid_curr_proto;
@@ -6052,6 +6058,12 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_local_storage_proto;
 	case BPF_FUNC_perf_event_output:
 		return &bpf_event_output_data_proto;
+#ifdef CONFIG_CGROUPS
+	case BPF_FUNC_get_current_cgroup_id:
+		return &bpf_get_current_cgroup_id_proto;
+	case BPF_FUNC_get_current_ancestor_cgroup_id:
+		return &bpf_get_current_ancestor_cgroup_id_proto;
+#endif
 #ifdef CONFIG_CGROUP_NET_CLASSID
 	case BPF_FUNC_get_cgroup_classid:
 		return &bpf_get_cgroup_classid_curr_proto;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index bd81c4555206..222ba11966e3 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2963,6 +2963,24 @@ union bpf_attr {
  * 		instead of sockets.
  * 	Return
  * 		A 8-byte long opaque number.
+ *
+ * u64 bpf_get_current_ancestor_cgroup_id(int ancestor_level)
+ * 	Description
+ * 		Return id of cgroup v2 that is ancestor of the cgroup associated
+ * 		with the current task at the *ancestor_level*. The root cgroup
+ * 		is at *ancestor_level* zero and each step down the hierarchy
+ * 		increments the level. If *ancestor_level* == level of cgroup
+ * 		associated with the current task, then return value will be the
+ * 		same as that of **bpf_get_current_cgroup_id**\ ().
+ *
+ * 		The helper is useful to implement policies based on cgroups
+ * 		that are upper in hierarchy than immediate cgroup associated
+ * 		with the current task.
+ *
+ * 		The format of returned id and helper limitations are same as in
+ * 		**bpf_get_current_cgroup_id**\ ().
+ * 	Return
+ * 		The id is returned or 0 in case the id could not be retrieved.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -3087,7 +3105,8 @@ union bpf_attr {
 	FN(read_branch_records),	\
 	FN(get_ns_current_pid_tgid),	\
 	FN(xdp_output),			\
-	FN(get_netns_cookie),
+	FN(get_netns_cookie),		\
+	FN(get_current_ancestor_cgroup_id),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call

From 834ebca8456c5f7a03d3351227b7c59318ccab62 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 27 Mar 2020 16:58:55 +0100
Subject: [PATCH 6/7] bpf: Enable retrival of pid/tgid/comm from bpf cgroup
 hooks

We already have the bpf_get_current_uid_gid() helper enabled, and
given we now have perf event RB output available for connect(),
sendmsg(), recvmsg() and bind-related hooks, add a trivial change
to enable bpf_get_current_pid_tgid() and bpf_get_current_comm()
as well.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/18744744ed93c06343be8b41edcfd858706f39d7.1585323121.git.daniel@iogearbox.net
---
 net/core/filter.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/net/core/filter.c b/net/core/filter.c
index 5cec3ac9e3dd..bb4a196c8809 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -6018,6 +6018,10 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_netns_cookie_sock_proto;
 	case BPF_FUNC_perf_event_output:
 		return &bpf_event_output_data_proto;
+	case BPF_FUNC_get_current_pid_tgid:
+		return &bpf_get_current_pid_tgid_proto;
+	case BPF_FUNC_get_current_comm:
+		return &bpf_get_current_comm_proto;
 #ifdef CONFIG_CGROUPS
 	case BPF_FUNC_get_current_cgroup_id:
 		return &bpf_get_current_cgroup_id_proto;
@@ -6058,6 +6062,10 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_local_storage_proto;
 	case BPF_FUNC_perf_event_output:
 		return &bpf_event_output_data_proto;
+	case BPF_FUNC_get_current_pid_tgid:
+		return &bpf_get_current_pid_tgid_proto;
+	case BPF_FUNC_get_current_comm:
+		return &bpf_get_current_comm_proto;
 #ifdef CONFIG_CGROUPS
 	case BPF_FUNC_get_current_cgroup_id:
 		return &bpf_get_current_cgroup_id_proto;

From 23599ada0e9173e92d3cc26254b1e18e542c469d Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 27 Mar 2020 16:58:56 +0100
Subject: [PATCH 7/7] bpf: Add selftest cases for ctx_or_null argument type

Add various tests to make sure the verifier keeps catching them:

  # ./test_verifier
  [...]
  #230/p pass ctx or null check, 1: ctx OK
  #231/p pass ctx or null check, 2: null OK
  #232/p pass ctx or null check, 3: 1 OK
  #233/p pass ctx or null check, 4: ctx - const OK
  #234/p pass ctx or null check, 5: null (connect) OK
  #235/p pass ctx or null check, 6: null (bind) OK
  #236/p pass ctx or null check, 7: ctx (bind) OK
  #237/p pass ctx or null check, 8: null (bind) OK
  [...]
  Summary: 1595 PASSED, 0 SKIPPED, 0 FAILED

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/c74758d07b1b678036465ef7f068a49e9efd3548.1585323121.git.daniel@iogearbox.net
---
 tools/testing/selftests/bpf/verifier/ctx.c | 105 +++++++++++++++++++++
 1 file changed, 105 insertions(+)

diff --git a/tools/testing/selftests/bpf/verifier/ctx.c b/tools/testing/selftests/bpf/verifier/ctx.c
index 92762c08f5e3..93d6b1641481 100644
--- a/tools/testing/selftests/bpf/verifier/ctx.c
+++ b/tools/testing/selftests/bpf/verifier/ctx.c
@@ -91,3 +91,108 @@
 	.result = REJECT,
 	.errstr = "variable ctx access var_off=(0x0; 0x4)",
 },
+{
+	"pass ctx or null check, 1: ctx",
+	.insns = {
+		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+			     BPF_FUNC_get_netns_cookie),
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	},
+	.prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+	.expected_attach_type = BPF_CGROUP_UDP6_SENDMSG,
+	.result = ACCEPT,
+},
+{
+	"pass ctx or null check, 2: null",
+	.insns = {
+		BPF_MOV64_IMM(BPF_REG_1, 0),
+		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+			     BPF_FUNC_get_netns_cookie),
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	},
+	.prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+	.expected_attach_type = BPF_CGROUP_UDP6_SENDMSG,
+	.result = ACCEPT,
+},
+{
+	"pass ctx or null check, 3: 1",
+	.insns = {
+		BPF_MOV64_IMM(BPF_REG_1, 1),
+		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+			     BPF_FUNC_get_netns_cookie),
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	},
+	.prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+	.expected_attach_type = BPF_CGROUP_UDP6_SENDMSG,
+	.result = REJECT,
+	.errstr = "R1 type=inv expected=ctx",
+},
+{
+	"pass ctx or null check, 4: ctx - const",
+	.insns = {
+		BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -612),
+		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+			     BPF_FUNC_get_netns_cookie),
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	},
+	.prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+	.expected_attach_type = BPF_CGROUP_UDP6_SENDMSG,
+	.result = REJECT,
+	.errstr = "dereference of modified ctx ptr",
+},
+{
+	"pass ctx or null check, 5: null (connect)",
+	.insns = {
+		BPF_MOV64_IMM(BPF_REG_1, 0),
+		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+			     BPF_FUNC_get_netns_cookie),
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	},
+	.prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+	.expected_attach_type = BPF_CGROUP_INET4_CONNECT,
+	.result = ACCEPT,
+},
+{
+	"pass ctx or null check, 6: null (bind)",
+	.insns = {
+		BPF_MOV64_IMM(BPF_REG_1, 0),
+		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+			     BPF_FUNC_get_netns_cookie),
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	},
+	.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+	.expected_attach_type = BPF_CGROUP_INET4_POST_BIND,
+	.result = ACCEPT,
+},
+{
+	"pass ctx or null check, 7: ctx (bind)",
+	.insns = {
+		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+			     BPF_FUNC_get_socket_cookie),
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	},
+	.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+	.expected_attach_type = BPF_CGROUP_INET4_POST_BIND,
+	.result = ACCEPT,
+},
+{
+	"pass ctx or null check, 8: null (bind)",
+	.insns = {
+		BPF_MOV64_IMM(BPF_REG_1, 0),
+		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+			     BPF_FUNC_get_socket_cookie),
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	},
+	.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+	.expected_attach_type = BPF_CGROUP_INET4_POST_BIND,
+	.result = REJECT,
+	.errstr = "R1 type=inv expected=ctx",
+},