// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2019 Facebook */

#include <linux/init.h>
#include <linux/types.h>
#include <linux/bpf_verifier.h>
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/btf_ids.h>
#include <linux/filter.h>
#include <net/tcp.h>
#include <net/bpf_sk_storage.h>
/* "extern" is to avoid sparse warning. It is only used in bpf_struct_ops.c. */
2024-01-20 01:50:02 +03:00
static struct bpf_struct_ops bpf_tcp_congestion_ops ;
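
/* tcp_congestion_ops members that a bpf-tcp-cc is not allowed to implement */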
static u32 unsupported_ops[] = {
	offsetof(struct tcp_congestion_ops, get_info),
};

static const struct btf_type *tcp_sock_type;
static u32 tcp_sock_id, sock_id;
static const struct btf_type *tcp_congestion_ops_type;

static int bpf_tcp_ca_init(struct btf *btf)
{
	s32 type_id;

	type_id = btf_find_by_name_kind(btf, "sock", BTF_KIND_STRUCT);
	if (type_id < 0)
		return -EINVAL;
	sock_id = type_id;

	type_id = btf_find_by_name_kind(btf, "tcp_sock", BTF_KIND_STRUCT);
	if (type_id < 0)
		return -EINVAL;
	tcp_sock_id = type_id;
	tcp_sock_type = btf_type_by_id(btf, tcp_sock_id);

	type_id = btf_find_by_name_kind(btf, "tcp_congestion_ops", BTF_KIND_STRUCT);
	if (type_id < 0)
		return -EINVAL;
	tcp_congestion_ops_type = btf_type_by_id(btf, type_id);

	return 0;
}

static bool is_unsupported(u32 member_offset)
{
	unsigned int i;

	for (i = 0; i < ARRAY_SIZE(unsupported_ops); i++) {
		if (member_offset == unsupported_ops[i])
			return true;
	}

	return false;
}
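
/* The ctx of a bpf-tcp-cc prog is a set of PTR_TO_BTF_ID args.  A
 * "struct sock *" argument is promoted to "struct tcp_sock *" so the
 * prog can read tcp_sock members directly.
 */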
static bool bpf_tcp_ca_is_valid_access(int off, int size,
				       enum bpf_access_type type,
				       const struct bpf_prog *prog,
				       struct bpf_insn_access_aux *info)
{
	if (!bpf_tracing_btf_ctx_access(off, size, type, prog, info))
		return false;

	if (base_type(info->reg_type) == PTR_TO_BTF_ID &&
	    !bpf_type_has_unsafe_modifiers(info->reg_type) &&
	    info->btf_id == sock_id)
		/* promote it to tcp_sock */
		info->btf_id = tcp_sock_id;

	return true;
}
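
/* Only consulted for writes: allow a bpf-tcp-cc to write only to the
 * sock/icsk/tcp_sock members listed below; everything else is read-only.
 */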
static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log,
					const struct bpf_reg_state *reg,
					int off, int size)
{
	const struct btf_type *t;
	size_t end;

	t = btf_type_by_id(reg->btf, reg->btf_id);
	if (t != tcp_sock_type) {
		bpf_log(log, "only read is supported\n");
		return -EACCES;
	}

	switch (off) {
	case offsetof(struct sock, sk_pacing_rate):
		end = offsetofend(struct sock, sk_pacing_rate);
		break;
	case offsetof(struct sock, sk_pacing_status):
		end = offsetofend(struct sock, sk_pacing_status);
		break;
	case bpf_ctx_range(struct inet_connection_sock, icsk_ca_priv):
		end = offsetofend(struct inet_connection_sock, icsk_ca_priv);
		break;
	case offsetof(struct inet_connection_sock, icsk_ack.pending):
		end = offsetofend(struct inet_connection_sock,
				  icsk_ack.pending);
		break;
	case offsetof(struct tcp_sock, snd_cwnd):
		end = offsetofend(struct tcp_sock, snd_cwnd);
		break;
	case offsetof(struct tcp_sock, snd_cwnd_cnt):
		end = offsetofend(struct tcp_sock, snd_cwnd_cnt);
		break;
	case offsetof(struct tcp_sock, snd_ssthresh):
		end = offsetofend(struct tcp_sock, snd_ssthresh);
		break;
	case offsetof(struct tcp_sock, ecn_flags):
		end = offsetofend(struct tcp_sock, ecn_flags);
		break;
	case offsetof(struct tcp_sock, app_limited):
		end = offsetofend(struct tcp_sock, app_limited);
		break;
	default:
		bpf_log(log, "no write support to tcp_sock at off %d\n", off);
		return -EACCES;
	}

	if (off + size > end) {
		bpf_log(log,
			"write access at off %d with size %d beyond the member of tcp_sock ended at %zu\n",
			off, size, end);
		return -EACCES;
	}

	return 0;
}
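
/* Send an ACK carrying the given rcv_nxt on behalf of a bpf-tcp-cc */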
BPF_CALL_2(bpf_tcp_send_ack, struct tcp_sock *, tp, u32, rcv_nxt)
{
	/* bpf_tcp_ca prog cannot have NULL tp */
	__tcp_send_ack((struct sock *)tp, rcv_nxt);
	return 0;
}

static const struct bpf_func_proto bpf_tcp_send_ack_proto = {
	.func		= bpf_tcp_send_ack,
	.gpl_only	= false,
	/* In case we want to report error later */
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_BTF_ID,
	.arg1_btf_id	= &tcp_sock_id,
	.arg2_type	= ARG_ANYTHING,
};
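
/* Map the prog's expected_attach_type (the struct_ops member index) back to
 * the byte offset of the tcp_congestion_ops member it implements.
 */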
static u32 prog_ops_moff(const struct bpf_prog *prog)
{
	const struct btf_member *m;
	const struct btf_type *t;
	u32 midx;

	midx = prog->expected_attach_type;
	t = tcp_congestion_ops_type;
	m = &btf_type_member(t)[midx];

	return __btf_member_bit_offset(t, m) / 8;
}

static const struct bpf_func_proto *
bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id,
			  const struct bpf_prog *prog)
{
	switch (func_id) {
	case BPF_FUNC_tcp_send_ack:
		return &bpf_tcp_send_ack_proto;
	case BPF_FUNC_sk_storage_get:
		return &bpf_sk_storage_get_proto;
	case BPF_FUNC_sk_storage_delete:
		return &bpf_sk_storage_delete_proto;
	case BPF_FUNC_setsockopt:
		/* Does not allow release() to call setsockopt.
		 * release() is called when the current bpf-tcp-cc
		 * is retiring.  It is not allowed to call
		 * setsockopt() to make further changes which
		 * may potentially allocate new resources.
		 */
		if (prog_ops_moff(prog) !=
		    offsetof(struct tcp_congestion_ops, release))
			return &bpf_sk_setsockopt_proto;
		return NULL;
	case BPF_FUNC_getsockopt:
		/* Since get/setsockopt is usually expected to
		 * be available together, disable getsockopt for
		 * release also to avoid usage surprise.
		 * The bpf-tcp-cc already has a more powerful way
		 * to read tcp_sock from the PTR_TO_BTF_ID.
		 */
		if (prog_ops_moff(prog) !=
		    offsetof(struct tcp_congestion_ops, release))
			return &bpf_sk_getsockopt_proto;
		return NULL;
	case BPF_FUNC_ktime_get_coarse_ns:
		return &bpf_ktime_get_coarse_ns_proto;
	default:
		return bpf_base_func_proto(func_id, prog);
	}
}
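
/* Kernel congestion-control helpers that a bpf-tcp-cc may call as kfuncs */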
BTF_KFUNCS_START(bpf_tcp_ca_check_kfunc_ids)
BTF_ID_FLAGS(func, tcp_reno_ssthresh)
BTF_ID_FLAGS(func, tcp_reno_cong_avoid)
BTF_ID_FLAGS(func, tcp_reno_undo_cwnd)
BTF_ID_FLAGS(func, tcp_slow_start)
BTF_ID_FLAGS(func, tcp_cong_avoid_ai)
BTF_KFUNCS_END(bpf_tcp_ca_check_kfunc_ids)

static const struct btf_kfunc_id_set bpf_tcp_ca_kfunc_set = {
	.owner = THIS_MODULE,
	.set   = &bpf_tcp_ca_check_kfunc_ids,
};

static const struct bpf_verifier_ops bpf_tcp_ca_verifier_ops = {
	.get_func_proto		= bpf_tcp_ca_get_func_proto,
	.is_valid_access	= bpf_tcp_ca_is_valid_access,
	.btf_struct_access	= bpf_tcp_ca_btf_struct_access,
};
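
/* Initialize the non-func members (flags, name) from the user-provided
 * struct_ops value.  Returning 1 tells struct_ops the member was handled here.
 */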
static int bpf_tcp_ca_init_member(const struct btf_type *t,
				  const struct btf_member *member,
				  void *kdata, const void *udata)
{
	const struct tcp_congestion_ops *utcp_ca;
	struct tcp_congestion_ops *tcp_ca;
	u32 moff;

	utcp_ca = (const struct tcp_congestion_ops *)udata;
	tcp_ca = (struct tcp_congestion_ops *)kdata;

	moff = __btf_member_bit_offset(t, member) / 8;
	switch (moff) {
	case offsetof(struct tcp_congestion_ops, flags):
		if (utcp_ca->flags & ~TCP_CONG_MASK)
			return -EINVAL;
		tcp_ca->flags = utcp_ca->flags;
		return 1;
	case offsetof(struct tcp_congestion_ops, name):
		if (bpf_obj_name_cpy(tcp_ca->name, utcp_ca->name,
				     sizeof(tcp_ca->name)) <= 0)
			return -EINVAL;
		return 1;
	}

	return 0;
}

static int bpf_tcp_ca_check_member(const struct btf_type *t,
				   const struct btf_member *member,
				   const struct bpf_prog *prog)
{
	if (is_unsupported(__btf_member_bit_offset(t, member) / 8))
		return -ENOTSUPP;
	return 0;
}
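
/* Thin wrappers that plug the struct_ops map into the kernel's
 * congestion-control registration API.
 */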
static int bpf_tcp_ca_reg(void *kdata)
{
	return tcp_register_congestion_control(kdata);
}

static void bpf_tcp_ca_unreg(void *kdata)
{
	tcp_unregister_congestion_control(kdata);
}

static int bpf_tcp_ca_update(void *kdata, void *old_kdata)
{
	return tcp_update_congestion_control(kdata, old_kdata);
}

static int bpf_tcp_ca_validate(void *kdata)
{
	return tcp_validate_congestion_control(kdata);
}
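
/* No-op stubs for every op.  They are referenced via cfi_stubs below so that
 * indirect calls into the BPF-provided callbacks have correctly typed (kCFI)
 * targets.
 */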
static u32 bpf_tcp_ca_ssthresh(struct sock *sk)
{
	return 0;
}

static void bpf_tcp_ca_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
}

static void bpf_tcp_ca_set_state(struct sock *sk, u8 new_state)
{
}

static void bpf_tcp_ca_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
{
}

static void bpf_tcp_ca_in_ack_event(struct sock *sk, u32 flags)
{
}

static void bpf_tcp_ca_pkts_acked(struct sock *sk, const struct ack_sample *sample)
{
}

static u32 bpf_tcp_ca_min_tso_segs(struct sock *sk)
{
	return 0;
}

static void bpf_tcp_ca_cong_control(struct sock *sk, const struct rate_sample *rs)
{
}

static u32 bpf_tcp_ca_undo_cwnd(struct sock *sk)
{
	return 0;
}

static u32 bpf_tcp_ca_sndbuf_expand(struct sock *sk)
{
	return 0;
}

static void __bpf_tcp_ca_init(struct sock *sk)
{
}

static void __bpf_tcp_ca_release(struct sock *sk)
{
}

static struct tcp_congestion_ops __bpf_ops_tcp_congestion_ops = {
	.ssthresh = bpf_tcp_ca_ssthresh,
	.cong_avoid = bpf_tcp_ca_cong_avoid,
	.set_state = bpf_tcp_ca_set_state,
	.cwnd_event = bpf_tcp_ca_cwnd_event,
	.in_ack_event = bpf_tcp_ca_in_ack_event,
	.pkts_acked = bpf_tcp_ca_pkts_acked,
	.min_tso_segs = bpf_tcp_ca_min_tso_segs,
	.cong_control = bpf_tcp_ca_cong_control,
	.undo_cwnd = bpf_tcp_ca_undo_cwnd,
	.sndbuf_expand = bpf_tcp_ca_sndbuf_expand,
	.init = __bpf_tcp_ca_init,
	.release = __bpf_tcp_ca_release,
};

static struct bpf_struct_ops bpf_tcp_congestion_ops = {
	.verifier_ops = &bpf_tcp_ca_verifier_ops,
	.reg = bpf_tcp_ca_reg,
	.unreg = bpf_tcp_ca_unreg,
	.update = bpf_tcp_ca_update,
	.check_member = bpf_tcp_ca_check_member,
	.init_member = bpf_tcp_ca_init_member,
	.init = bpf_tcp_ca_init,
	.validate = bpf_tcp_ca_validate,
	.name = "tcp_congestion_ops",
	.cfi_stubs = &__bpf_ops_tcp_congestion_ops,
	.owner = THIS_MODULE,
};

static int __init bpf_tcp_ca_kfunc_init(void)
{
	int ret;

	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &bpf_tcp_ca_kfunc_set);
	ret = ret ?: register_bpf_struct_ops(&bpf_tcp_congestion_ops, tcp_congestion_ops);

	return ret;
}
late_initcall(bpf_tcp_ca_kfunc_init);