833bac7ec3
Commit 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable") introduced the net.smc.rmem and net.smc.wmem sysctls to specify the size of buffers to be used for SMC type connections. This created a regression for users that specified the buffer size via setsockopt() as the effective buffer size was now doubled. Re-introduce the division by 2 in the SMC buffer create code and level this out by duplicating the net.smc.[rw]mem values used for initializing sk_rcvbuf/sk_sndbuf at socket creation time. This gives users of both methods (setsockopt or sysctl) the effective buffer size that they expect. Initialize net.smc.[rw]mem from its own constant of 64kB, respectively. Internal performance tests show that this value is a good compromise between throughput/latency and memory consumption. Also, this decouples it from any tuning that was done to net.ipv4.tcp_[rw]mem[1] before the module for SMC protocol was loaded. Check that no more than INT_MAX / 2 is assigned to net.smc.[rw]mem, in order to avoid any overflow condition when that is doubled for use in sk_sndbuf or sk_rcvbuf. While at it, drop the confusing sk_buf_size variable from __smc_buf_create and name "compressed" buffer size variables more consistently. Background: Before the commit mentioned above, SMC's buffer allocator in __smc_buf_create() always used half of the sockets' sk_rcvbuf/sk_sndbuf value as initial value to search for appropriate buffers. If the search resorted to using a bigger buffer when all buffers of the specified size were busy, the duplicate of the used effective buffer size is stored back to sk_rcvbuf/sk_sndbuf. When available, buffers of exactly the size that a user had specified as input to setsockopt() were used, despite setsockopt()'s documentation in "man 7 socket" talking of a mandatory duplication: [...] SO_SNDBUF Sets or gets the maximum socket send buffer in bytes. The kernel doubles this value (to allow space for book‐ keeping overhead) when it is set using setsockopt(2), and this doubled value is returned by getsockopt(2). The default value is set by the /proc/sys/net/core/wmem_default file and the maximum allowed value is set by the /proc/sys/net/core/wmem_max file. The minimum (doubled) value for this option is 2048. [...] Fixes: 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable") Co-developed-by: Jan Karcher <jaka@linux.ibm.com> Signed-off-by: Jan Karcher <jaka@linux.ibm.com> Reviewed-by: Wenjia Zhang <wenjia@linux.ibm.com> Reviewed-by: Tony Lu <tonylu@linux.alibaba.com> Signed-off-by: Gerd Bayer <gbayer@linux.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
118 lines
2.7 KiB
C
118 lines
2.7 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
/*
|
|
* Shared Memory Communications over RDMA (SMC-R) and RoCE
|
|
*
|
|
* smc_sysctl.c: sysctl interface to SMC subsystem.
|
|
*
|
|
* Copyright (c) 2022, Alibaba Inc.
|
|
*
|
|
* Author: Tony Lu <tonylu@linux.alibaba.com>
|
|
*
|
|
*/
|
|
|
|
#include <linux/init.h>
|
|
#include <linux/sysctl.h>
|
|
#include <net/net_namespace.h>
|
|
|
|
#include "smc.h"
|
|
#include "smc_core.h"
|
|
#include "smc_llc.h"
|
|
#include "smc_sysctl.h"
|
|
|
|
static int min_sndbuf = SMC_BUF_MIN_SIZE;
|
|
static int min_rcvbuf = SMC_BUF_MIN_SIZE;
|
|
static int max_sndbuf = INT_MAX / 2;
|
|
static int max_rcvbuf = INT_MAX / 2;
|
|
static const int net_smc_wmem_init = (64 * 1024);
|
|
static const int net_smc_rmem_init = (64 * 1024);
|
|
|
|
static struct ctl_table smc_table[] = {
|
|
{
|
|
.procname = "autocorking_size",
|
|
.data = &init_net.smc.sysctl_autocorking_size,
|
|
.maxlen = sizeof(unsigned int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_douintvec,
|
|
},
|
|
{
|
|
.procname = "smcr_buf_type",
|
|
.data = &init_net.smc.sysctl_smcr_buf_type,
|
|
.maxlen = sizeof(unsigned int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_douintvec_minmax,
|
|
.extra1 = SYSCTL_ZERO,
|
|
.extra2 = SYSCTL_TWO,
|
|
},
|
|
{
|
|
.procname = "smcr_testlink_time",
|
|
.data = &init_net.smc.sysctl_smcr_testlink_time,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec_jiffies,
|
|
},
|
|
{
|
|
.procname = "wmem",
|
|
.data = &init_net.smc.sysctl_wmem,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec_minmax,
|
|
.extra1 = &min_sndbuf,
|
|
.extra2 = &max_sndbuf,
|
|
},
|
|
{
|
|
.procname = "rmem",
|
|
.data = &init_net.smc.sysctl_rmem,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec_minmax,
|
|
.extra1 = &min_rcvbuf,
|
|
.extra2 = &max_rcvbuf,
|
|
},
|
|
{ }
|
|
};
|
|
|
|
int __net_init smc_sysctl_net_init(struct net *net)
|
|
{
|
|
struct ctl_table *table;
|
|
|
|
table = smc_table;
|
|
if (!net_eq(net, &init_net)) {
|
|
int i;
|
|
|
|
table = kmemdup(table, sizeof(smc_table), GFP_KERNEL);
|
|
if (!table)
|
|
goto err_alloc;
|
|
|
|
for (i = 0; i < ARRAY_SIZE(smc_table) - 1; i++)
|
|
table[i].data += (void *)net - (void *)&init_net;
|
|
}
|
|
|
|
net->smc.smc_hdr = register_net_sysctl(net, "net/smc", table);
|
|
if (!net->smc.smc_hdr)
|
|
goto err_reg;
|
|
|
|
net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE;
|
|
net->smc.sysctl_smcr_buf_type = SMCR_PHYS_CONT_BUFS;
|
|
net->smc.sysctl_smcr_testlink_time = SMC_LLC_TESTLINK_DEFAULT_TIME;
|
|
WRITE_ONCE(net->smc.sysctl_wmem, net_smc_wmem_init);
|
|
WRITE_ONCE(net->smc.sysctl_rmem, net_smc_rmem_init);
|
|
|
|
return 0;
|
|
|
|
err_reg:
|
|
if (!net_eq(net, &init_net))
|
|
kfree(table);
|
|
err_alloc:
|
|
return -ENOMEM;
|
|
}
|
|
|
|
void __net_exit smc_sysctl_net_exit(struct net *net)
|
|
{
|
|
struct ctl_table *table;
|
|
|
|
table = net->smc.smc_hdr->ctl_table_arg;
|
|
unregister_net_sysctl_table(net->smc.smc_hdr);
|
|
if (!net_eq(net, &init_net))
|
|
kfree(table);
|
|
}
|