Merge branch 'net-smc-smc-intra-os-shortcut-with-loopback-ism'

Wen Gu says:

====================
net/smc: SMC intra-OS shortcut with loopback-ism

This patch set acts as the second part of the new version of [1] (The first
part can be referred from [2]), the updated things of this version are listed
at the end.

- Background

SMC-D is now used on IBM Z with the ISM function to optimize the network
interconnect for intra-CPC communications. Inspired by this, we try to make SMC-D
available on non-s390 architectures through a software-implemented Emulated-ISM
device, that is the loopback-ism device here, to accelerate inter-process or
inter-container communication within the same OS instance.

- Design

This patch set includes 3 parts:

 - Patch #1: some prepare work for loopback-ism.
 - Patch #2-#7: implement loopback-ism device and adapt SMC-D for it.
   loopback-ism now serves only SMC and exposes no userspace interfaces.
 - Patch #8-#11: memory copy optimization for intra-OS scenario.

The loopback-ism device is designed as an ISMv2 device and is not limited to
a specific net namespace. Both ends of an inter-process connection (1/1' in the
diagram below) or an inter-container connection (2/2' in the diagram below) can
find the same available loopback-ism device and choose it during the CLC handshake.

 Container 1 (ns1)                              Container 2 (ns2)
 +-----------------------------------------+    +-------------------------+
 | +-------+      +-------+      +-------+ |    |        +-------+        |
 | | App A |      | App B |      | App C | |    |        | App D |<-+     |
 | +-------+      +---^---+      +-------+ |    |        +-------+  |(2') |
 |     |127.0.0.1 (1')|             |192.168.0.11       192.168.0.12|     |
 |  (1)|   +--------+ | +--------+  |(2)   |    | +--------+   +--------+ |
 |     `-->|   lo   |-` |  eth0  |<-`      |    | |   lo   |   |  eth0  | |
 +---------+--|---^-+---+-----|--+---------+    +-+--------+---+-^------+-+
              |   |           |                                  |
 Kernel       |   |           |                                  |
 +----+-------v---+-----------v----------------------------------+---+----+
 |    |                            TCP                               |    |
 |    |                                                              |    |
 |    +--------------------------------------------------------------+    |
 |                                                                        |
 |                           +--------------+                             |
 |                           | smc loopback |                             |
 +---------------------------+--------------+-----------------------------+

The loopback-ism device creates DMBs (shared memory) for each connection peer.
Since data transfer occurs within the same kernel, the sndbuf of each peer
is only a descriptor that points to the same memory region as the peer DMB, so
that the data copy from sndbuf to peer DMB can be avoided in the loopback-ism case.

 Container 1 (ns1)                              Container 2 (ns2)
 +-----------------------------------------+    +-------------------------+
 | +-------+                               |    |        +-------+        |
 | | App C |-----+                         |    |        | App D |        |
 | +-------+     |                         |    |        +-^-----+        |
 |               |                         |    |          |              |
 |           (2) |                         |    |     (2') |              |
 |               |                         |    |          |              |
 +---------------|-------------------------+    +----------|--------------+
                 |                                         |
 Kernel          |                                         |
 +---------------|-----------------------------------------|--------------+
 | +--------+ +--v-----+                           +--------+ +--------+  |
 | |dmb_desc| |snd_desc|                           |dmb_desc| |snd_desc|  |
 | +-----|--+ +--|-----+                           +-----|--+ +--------+  |
 | +-----|--+    |                                 +-----|--+             |
 | | DMB C  |    +---------------------------------| DMB D  |             |
 | +--------+                                      +--------+             |
 |                                                                        |
 |                           +--------------+                             |
 |                           | smc loopback |                             |
 +---------------------------+--------------+-----------------------------+

- Benchmark Test

 * Test environments:
      - VM with Intel Xeon Platinum 8 core 2.50GHz, 16 GiB mem.
      - SMC sndbuf/DMB size 1MB.

 * Test object:
      - TCP: run on TCP loopback.
      - SMC lo: run on SMC loopback-ism.

1. ipc-benchmark (see [3])

 - ./<foo> -c 1000000 -s 100

                            TCP                  SMC-lo
Message
rate (msg/s)              84991                  151293(+78.01%)

2. sockperf

 - serv: <smc_run> sockperf sr --tcp
 - clnt: <smc_run> sockperf { tp | pp } --tcp --msg-size={ 64000 for tp | 14 for pp } -i 127.0.0.1 -t 30

                            TCP                  SMC-lo
Bandwidth(MBps)        5033.569                7987.732(+58.69%)
Latency(us)               5.986                   3.398(-43.23%)

3. nginx/wrk

 - serv: <smc_run> nginx
 - clnt: <smc_run> wrk -t 8 -c 1000 -d 30 http://127.0.0.1:80

                           TCP                   SMC-lo
Requests/s           187951.76                267107.90(+42.12%)

4. redis-benchmark

 - serv: <smc_run> redis-server
 - clnt: <smc_run> redis-benchmark -h 127.0.0.1 -q -t set,get -n 400000 -c 200 -d 1024

                           TCP                   SMC-lo
GET(Requests/s)       86132.64                118133.49(+37.15%)
SET(Requests/s)       87374.40                122887.86(+40.65%)

Change log:
v7->v6
- Patch #2: minor: remove unnecessary 'return' of inline smc_loopback_exit().
- Patch #10: minor: directly return 0 instead of 'rc' in smcd_cdc_msg_send().
- all: collect the Reviewed-by tags.

v6->RFC v5
Link: https://lore.kernel.org/netdev/20240414040304.54255-1-guwen@linux.alibaba.com/
- Patch #2: make the use of CONFIG_SMC_LO cleaner.
- Patch #5: mark some smcd_ops that loopback-ism doesn't support as
  optional and check for the support when they are called.
- Patch #7: keep loopback-ism at the beginning of the SMC-D device list.
- Some expression changes in commit logs and comments.

RFC v5->RFC v4:
Link: https://lore.kernel.org/netdev/20240324135522.108564-1-guwen@linux.alibaba.com/
- Patch #2: minor changes in description of config SMC_LO and comments.
- Patch #10: minor changes in comments and if(smc_ism_support_dmb_nocopy())
  check in smcd_cdc_msg_send().
- Patch #3: change smc_lo_generate_id() to smc_lo_generate_ids() and SMC_LO_CHID
  to SMC_LO_RESERVED_CHID.
- Patch #5: memcpy while holding the ldev->dmb_ht_lock.
- Some expression changes in commit logs.

RFC v4->v3:
Link: https://lore.kernel.org/netdev/20240317100545.96663-1-guwen@linux.alibaba.com/
- The merge window of v6.9 is open, so post this series as an RFC.
- Patch #6: since some information fed back by smc_nl_handle_smcd_dev() does
  not apply to Emulated-ISM (including loopback-ism here), loopback-ism is
  not exposed through smc netlink for the time being. We may refactor this
  part when the smc netlink interface is updated.

v3->v2:
Link: https://lore.kernel.org/netdev/20240312142743.41406-1-guwen@linux.alibaba.com/
- Patch #11: use tasklet_schedule(&conn->rx_tsklet) instead of smcd_cdc_rx_handler()
  to avoid possible recursive locking of conn->send_lock and use {read|write}_lock_bh()
  to acquire dmb_ht_lock.

v2->v1:
Link: https://lore.kernel.org/netdev/20240307095536.29648-1-guwen@linux.alibaba.com/
- All the patches: changed the term virtual-ISM to Emulated-ISM as defined by SMCv2.1.
- Patch #3: optimized the description of SMC_LO config. Avoid exposing loopback-ism
  to sysfs and remove all the knobs until future definition clear.
- Patch #3: try to make lockdep happy by using read_lock_bh() in smc_lo_move_data().
- Patch #6: use physically contiguous DMB buffers by default.
- Patch #11: enable DMB no-copy for loopback-ism by default and free the DMB in
  unregister_dmb or detach_dmb when dmb_node->refcnt reaches 0, instead of using
  wait_event to keep waiting in unregister_dmb.

v1->RFC:
Link: https://lore.kernel.org/netdev/20240111120036.109903-1-guwen@linux.alibaba.com/
- Patch #9: merge rx_bytes and tx_bytes as xfer_bytes statistics:
  /sys/devices/virtual/smc/loopback-ism/xfer_bytes
- Patch #10: add support_dmb_nocopy operation to check if SMC-D device supports
  merging sndbuf with peer DMB.
- Patch #13 & #14: introduce loopback-ism device control of DMB memory type and
  control of whether to merge sndbuf and DMB. They can be respectively set by:
  /sys/devices/virtual/smc/loopback-ism/dmb_type
  /sys/devices/virtual/smc/loopback-ism/dmb_copy
  The motivation for these two controls is that a performance bottleneck was
  found when a vzalloc'ed DMB is used, the sndbuf is merged with the DMB, there
  are many CPUs and CONFIG_HARDENED_USERCOPY is set [4]. The bottleneck is caused
  by the lock contention in vmap_area_lock [5] which is involved in memcpy_from_msg()
  or memcpy_to_msg(). Currently, Uladzislau Rezki is working on mitigating the
  vmap lock contention [6]. It has significant effects, but using virtual memory
  still has additional overhead compared to using physical memory.
  So this new version provides controls of dmb_type and dmb_copy to suit
  different scenarios.
- Some minor changes and comments improvements.

RFC->old version([1]):
Link: https://lore.kernel.org/netdev/1702214654-32069-1-git-send-email-guwen@linux.alibaba.com/
- Patch #1: improve the loopback-ism dump, it shows as follows now:
  # smcd d
  FID  Type  PCI-ID        PCHID  InUse  #LGs  PNET-ID
  0000 0     loopback-ism  ffff   No        0
- Patch #3: introduce the smc_ism_set_v2_capable() helper and set
  smc_ism_v2_capable when ISMv2 or virtual ISM is registered,
  regardless of whether there is already a device in smcd device list.
- Patch #3: loopback-ism will be added into /sys/devices/virtual/smc/loopback-ism/.
- Patch #8: introduce the runtime switch /sys/devices/virtual/smc/loopback-ism/active
  to activate or deactivate the loopback-ism.
- Patch #9: introduce the statistics of loopback-ism via
  /sys/devices/virtual/smc/loopback-ism/{{tx|rx}_bytes|dmbs_cnt}.
- Some minor changes and comments improvements.

[1] https://lore.kernel.org/netdev/1695568613-125057-1-git-send-email-guwen@linux.alibaba.com/
[2] https://lore.kernel.org/netdev/20231219142616.80697-1-guwen@linux.alibaba.com/
[3] https://github.com/goldsborough/ipc-bench
[4] https://lore.kernel.org/all/3189e342-c38f-6076-b730-19a6efd732a5@linux.alibaba.com/
[5] https://lore.kernel.org/all/238e63cd-e0e8-4fbf-852f-bc4d5bc35d5a@linux.alibaba.com/
[6] https://lore.kernel.org/all/20240102184633.748113-1-urezki@gmail.com/
====================

Link: https://lore.kernel.org/r/20240428060738.60843-1-guwen@linux.alibaba.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
This commit is contained in:
Paolo Abeni 2024-04-30 13:24:50 +02:00
commit e458a9addf
12 changed files with 721 additions and 28 deletions

View File

@ -745,7 +745,7 @@ static int smcd_query_rgid(struct smcd_dev *smcd, struct smcd_gid *rgid,
}
static int smcd_register_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb,
struct ism_client *client)
void *client)
{
return ism_register_dmb(smcd->priv, (struct ism_dmb *)dmb, client);
}

View File

@ -47,7 +47,6 @@ struct smcd_dmb {
#define ISM_ERROR 0xFFFF
struct smcd_dev;
struct ism_client;
struct smcd_gid {
u64 gid;
@ -58,14 +57,8 @@ struct smcd_ops {
int (*query_remote_gid)(struct smcd_dev *dev, struct smcd_gid *rgid,
u32 vid_valid, u32 vid);
int (*register_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb,
struct ism_client *client);
void *client);
int (*unregister_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb);
int (*add_vlan_id)(struct smcd_dev *dev, u64 vlan_id);
int (*del_vlan_id)(struct smcd_dev *dev, u64 vlan_id);
int (*set_vlan_required)(struct smcd_dev *dev);
int (*reset_vlan_required)(struct smcd_dev *dev);
int (*signal_event)(struct smcd_dev *dev, struct smcd_gid *rgid,
u32 trigger_irq, u32 event_code, u64 info);
int (*move_data)(struct smcd_dev *dev, u64 dmb_tok, unsigned int idx,
bool sf, unsigned int offset, void *data,
unsigned int size);
@ -73,11 +66,23 @@ struct smcd_ops {
void (*get_local_gid)(struct smcd_dev *dev, struct smcd_gid *gid);
u16 (*get_chid)(struct smcd_dev *dev);
struct device* (*get_dev)(struct smcd_dev *dev);
/* optional operations */
int (*add_vlan_id)(struct smcd_dev *dev, u64 vlan_id);
int (*del_vlan_id)(struct smcd_dev *dev, u64 vlan_id);
int (*set_vlan_required)(struct smcd_dev *dev);
int (*reset_vlan_required)(struct smcd_dev *dev);
int (*signal_event)(struct smcd_dev *dev, struct smcd_gid *rgid,
u32 trigger_irq, u32 event_code, u64 info);
int (*support_dmb_nocopy)(struct smcd_dev *dev);
int (*attach_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb);
int (*detach_dmb)(struct smcd_dev *dev, u64 token);
};
struct smcd_dev {
const struct smcd_ops *ops;
void *priv;
void *client;
struct list_head list;
spinlock_t lock;
struct smc_connection **conn;

View File

@ -20,3 +20,16 @@ config SMC_DIAG
smcss.
if unsure, say Y.
config SMC_LO
bool "SMC intra-OS shortcut with loopback-ism"
depends on SMC
default n
help
SMC_LO enables the creation of an Emulated-ISM device named
loopback-ism in SMC and makes use of it for transferring data
when communication occurs within the same OS. This helps in
convenient testing of SMC-D since loopback-ism is independent
of architecture or hardware.
if unsure, say N.

View File

@ -6,3 +6,4 @@ smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o
smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o
smc-y += smc_tracepoint.o
smc-$(CONFIG_SYSCTL) += smc_sysctl.o
smc-$(CONFIG_SMC_LO) += smc_loopback.o

View File

@ -53,6 +53,7 @@
#include "smc_stats.h"
#include "smc_tracepoint.h"
#include "smc_sysctl.h"
#include "smc_loopback.h"
static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group
* creation on server
@ -1435,6 +1436,14 @@ static int smc_connect_ism(struct smc_sock *smc,
}
smc_conn_save_peer_info(smc, aclc);
if (smc_ism_support_dmb_nocopy(smc->conn.lgr->smcd)) {
rc = smcd_buf_attach(smc);
if (rc) {
rc = SMC_CLC_DECL_MEM; /* try to fallback */
goto connect_abort;
}
}
smc_close_init(smc);
smc_rx_init(smc);
smc_tx_init(smc);
@ -2539,6 +2548,14 @@ static void smc_listen_work(struct work_struct *work)
mutex_unlock(&smc_server_lgr_pending);
}
smc_conn_save_peer_info(new_smc, cclc);
if (ini->is_smcd &&
smc_ism_support_dmb_nocopy(new_smc->conn.lgr->smcd)) {
rc = smcd_buf_attach(new_smc);
if (rc)
goto out_decl;
}
smc_listen_out_connected(new_smc);
SMC_STAT_SERV_SUCC_INC(sock_net(newclcsock->sk), ini);
goto out_free;
@ -3555,15 +3572,23 @@ static int __init smc_init(void)
goto out_sock;
}
rc = smc_loopback_init();
if (rc) {
pr_err("%s: smc_loopback_init fails with %d\n", __func__, rc);
goto out_ib;
}
rc = tcp_register_ulp(&smc_ulp_ops);
if (rc) {
pr_err("%s: tcp_ulp_register fails with %d\n", __func__, rc);
goto out_ib;
goto out_lo;
}
static_branch_enable(&tcp_have_smc);
return 0;
out_lo:
smc_loopback_exit();
out_ib:
smc_ib_unregister_client();
out_sock:
@ -3601,6 +3626,7 @@ static void __exit smc_exit(void)
tcp_unregister_ulp(&smc_ulp_ops);
sock_unregister(PF_SMC);
smc_core_exit();
smc_loopback_exit();
smc_ib_unregister_client();
smc_ism_exit();
destroy_workqueue(smc_close_wq);

View File

@ -18,6 +18,7 @@
#include "smc_tx.h"
#include "smc_rx.h"
#include "smc_close.h"
#include "smc_ism.h"
/********************************** send *************************************/
@ -255,6 +256,14 @@ int smcd_cdc_msg_send(struct smc_connection *conn)
return rc;
smc_curs_copy(&conn->rx_curs_confirmed, &curs, conn);
conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0;
if (smc_ism_support_dmb_nocopy(conn->lgr->smcd))
/* if local sndbuf shares the same memory region with
* peer DMB, then don't update the tx_curs_fin
* and sndbuf_space until peer has consumed the data.
*/
return 0;
/* Calculate transmitted data and increment free send buffer space */
diff = smc_curs_diff(conn->sndbuf_desc->len, &conn->tx_curs_fin,
&conn->tx_curs_sent);
@ -266,7 +275,7 @@ int smcd_cdc_msg_send(struct smc_connection *conn)
smc_curs_copy(&conn->tx_curs_fin, &conn->tx_curs_sent, conn);
smc_tx_sndbuf_nonfull(smc);
return rc;
return 0;
}
/********************************* receive ***********************************/
@ -323,7 +332,7 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
{
union smc_host_cursor cons_old, prod_old;
struct smc_connection *conn = &smc->conn;
int diff_cons, diff_prod;
int diff_cons, diff_prod, diff_tx;
smc_curs_copy(&prod_old, &conn->local_rx_ctrl.prod, conn);
smc_curs_copy(&cons_old, &conn->local_rx_ctrl.cons, conn);
@ -339,6 +348,29 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
atomic_add(diff_cons, &conn->peer_rmbe_space);
/* guarantee 0 <= peer_rmbe_space <= peer_rmbe_size */
smp_mb__after_atomic();
/* if local sndbuf shares the same memory region with
* peer RMB, then update tx_curs_fin and sndbuf_space
* here since peer has already consumed the data.
*/
if (conn->lgr->is_smcd &&
smc_ism_support_dmb_nocopy(conn->lgr->smcd)) {
/* Calculate consumed data and
* increment free send buffer space.
*/
diff_tx = smc_curs_diff(conn->sndbuf_desc->len,
&conn->tx_curs_fin,
&conn->local_rx_ctrl.cons);
/* increase local sndbuf space and fin_curs */
smp_mb__before_atomic();
atomic_add(diff_tx, &conn->sndbuf_space);
/* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */
smp_mb__after_atomic();
smc_curs_copy(&conn->tx_curs_fin,
&conn->local_rx_ctrl.cons, conn);
smc_tx_sndbuf_nonfull(smc);
}
}
diff_prod = smc_curs_diff(conn->rmb_desc->len, &prod_old,

View File

@ -1149,6 +1149,20 @@ static void smcr_buf_unuse(struct smc_buf_desc *buf_desc, bool is_rmb,
}
}
/* Undo smcd_buf_attach() for @conn: detach from the peer DMB identified by
 * conn->peer_token and free the connection's sndbuf descriptor.
 * Does nothing when the connection has no sndbuf descriptor, so it is safe
 * to call more than once for the same connection.
 */
static void smcd_buf_detach(struct smc_connection *conn)
{
	struct smcd_dev *smcd = conn->lgr->smcd;
	struct smc_buf_desc *ghost = conn->sndbuf_desc;

	if (!ghost)
		return;

	smc_ism_detach_dmb(smcd, conn->peer_token);
	kfree(ghost);
	/* mark as released so a repeated call becomes a no-op */
	conn->sndbuf_desc = NULL;
}
static void smc_buf_unuse(struct smc_connection *conn,
struct smc_link_group *lgr)
{
@ -1192,6 +1206,8 @@ void smc_conn_free(struct smc_connection *conn)
if (lgr->is_smcd) {
if (!list_empty(&lgr->list))
smc_ism_unset_conn(conn);
if (smc_ism_support_dmb_nocopy(lgr->smcd))
smcd_buf_detach(conn);
tasklet_kill(&conn->rx_tsklet);
} else {
smc_cdc_wait_pend_tx_wr(conn);
@ -1445,6 +1461,8 @@ static void smc_conn_kill(struct smc_connection *conn, bool soft)
smc_sk_wake_ups(smc);
if (conn->lgr->is_smcd) {
smc_ism_unset_conn(conn);
if (smc_ism_support_dmb_nocopy(conn->lgr->smcd))
smcd_buf_detach(conn);
if (soft)
tasklet_kill(&conn->rx_tsklet);
else
@ -2464,12 +2482,18 @@ int smc_buf_create(struct smc_sock *smc, bool is_smcd)
int rc;
/* create send buffer */
if (is_smcd &&
smc_ism_support_dmb_nocopy(smc->conn.lgr->smcd))
goto create_rmb;
rc = __smc_buf_create(smc, is_smcd, false);
if (rc)
return rc;
create_rmb:
/* create rmb */
rc = __smc_buf_create(smc, is_smcd, true);
if (rc) {
if (rc && smc->conn.sndbuf_desc) {
down_write(&smc->conn.lgr->sndbufs_lock);
list_del(&smc->conn.sndbuf_desc->list);
up_write(&smc->conn.lgr->sndbufs_lock);
@ -2479,6 +2503,41 @@ int smc_buf_create(struct smc_sock *smc, bool is_smcd)
return rc;
}
/* Create a "ghost" sndbuf descriptor for @smc that refers to the peer's DMB
 * (looked up by conn->peer_token) instead of owning memory of its own.
 *
 * Returns 0 on success, -ENOMEM if the descriptor cannot be allocated, or
 * the error reported by smc_ism_attach_dmb().
 */
int smcd_buf_attach(struct smc_sock *smc)
{
	struct smc_connection *conn = &smc->conn;
	struct smc_buf_desc *ghost;
	int rc;

	ghost = kzalloc(sizeof(*ghost), GFP_KERNEL);
	if (!ghost)
		return -ENOMEM;

	/* The ghost sndbuf_desc describes the same memory region as
	 * peer RMB. Its lifecycle is consistent with the connection's
	 * and it will be freed with the connections instead of the
	 * link group.
	 */
	rc = smc_ism_attach_dmb(conn->lgr->smcd, conn->peer_token, ghost);
	if (rc) {
		kfree(ghost);
		return rc;
	}

	/* sk_sndbuf takes the full attached length, before the adjustment */
	smc->sk.sk_sndbuf = ghost->len;

	/* usable area starts sizeof(struct smcd_cdc_msg) bytes into the
	 * region (presumably the space reserved for the CDC message -
	 * confirm against the DMB layout)
	 */
	ghost->cpu_addr = (u8 *)ghost->cpu_addr + sizeof(struct smcd_cdc_msg);
	ghost->len -= sizeof(struct smcd_cdc_msg);

	conn->sndbuf_desc = ghost;
	conn->sndbuf_desc->used = 1;
	atomic_set(&conn->sndbuf_space, conn->sndbuf_desc->len);
	return 0;
}
static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr)
{
int i;

View File

@ -557,6 +557,7 @@ void smc_smcd_terminate(struct smcd_dev *dev, struct smcd_gid *peer_gid,
void smc_smcd_terminate_all(struct smcd_dev *dev);
void smc_smcr_terminate_all(struct smc_ib_device *smcibdev);
int smc_buf_create(struct smc_sock *smc, bool is_smcd);
int smcd_buf_attach(struct smc_sock *smc);
int smc_uncompress_bufsize(u8 compressed);
int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_link *link,
struct smc_clc_msg_accept_confirm *clc);

View File

@ -91,6 +91,11 @@ bool smc_ism_is_v2_capable(void)
return smc_ism_v2_capable;
}
/* Record that an ISMv2-capable device is present by setting the
 * smc_ism_v2_capable flag. The flag is only ever set here, never cleared.
 */
void smc_ism_set_v2_capable(void)
{
smc_ism_v2_capable = true;
}
/* Set a connection using this DMBE. */
void smc_ism_set_conn(struct smc_connection *conn)
{
@ -126,6 +131,8 @@ int smc_ism_get_vlan(struct smcd_dev *smcd, unsigned short vlanid)
if (!vlanid) /* No valid vlan id */
return -EINVAL;
if (!smcd->ops->add_vlan_id)
return -EOPNOTSUPP;
/* create new vlan entry, in case we need it */
new_vlan = kzalloc(sizeof(*new_vlan), GFP_KERNEL);
@ -171,6 +178,8 @@ int smc_ism_put_vlan(struct smcd_dev *smcd, unsigned short vlanid)
if (!vlanid) /* No valid vlan id */
return -EINVAL;
if (!smcd->ops->del_vlan_id)
return -EOPNOTSUPP;
spin_lock_irqsave(&smcd->lock, flags);
list_for_each_entry(vlan, &smcd->vlan, list) {
@ -222,7 +231,6 @@ int smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc)
int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len,
struct smc_buf_desc *dmb_desc)
{
#if IS_ENABLED(CONFIG_ISM)
struct smcd_dmb dmb;
int rc;
@ -231,7 +239,7 @@ int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len,
dmb.sba_idx = dmb_desc->sba_idx;
dmb.vlan_id = lgr->vlan_id;
dmb.rgid = lgr->peer_gid.gid;
rc = lgr->smcd->ops->register_dmb(lgr->smcd, &dmb, &smc_ism_client);
rc = lgr->smcd->ops->register_dmb(lgr->smcd, &dmb, lgr->smcd->client);
if (!rc) {
dmb_desc->sba_idx = dmb.sba_idx;
dmb_desc->token = dmb.dmb_tok;
@ -240,9 +248,46 @@ int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len,
dmb_desc->len = dmb.dmb_len;
}
return rc;
#else
return 0;
#endif
}
bool smc_ism_support_dmb_nocopy(struct smcd_dev *smcd)
{
/* for now only loopback-ism supports
* merging sndbuf with peer DMB to avoid
* data copies between them.
*/
return (smcd->ops->support_dmb_nocopy &&
smcd->ops->support_dmb_nocopy(smcd));
}
/* Attach to the DMB identified by @token on device @dev and, on success,
 * copy its description (index, token, addresses, length) into @dmb_desc.
 *
 * Returns 0 on success, -EINVAL if the device has no attach_dmb operation,
 * or the error returned by the device's attach_dmb operation. @dmb_desc is
 * left untouched on failure.
 */
int smc_ism_attach_dmb(struct smcd_dev *dev, u64 token,
		       struct smc_buf_desc *dmb_desc)
{
	struct smcd_dmb dmb;
	int rc;

	/* attach_dmb is an optional device operation */
	if (!dev->ops->attach_dmb)
		return -EINVAL;

	memset(&dmb, 0, sizeof(dmb));
	dmb.dmb_tok = token;

	rc = dev->ops->attach_dmb(dev, &dmb);
	if (rc)
		return rc;

	dmb_desc->sba_idx = dmb.sba_idx;
	dmb_desc->token = dmb.dmb_tok;
	dmb_desc->cpu_addr = dmb.cpu_addr;
	dmb_desc->dma_addr = dmb.dma_addr;
	dmb_desc->len = dmb.dmb_len;
	return 0;
}
/* Detach from the DMB identified by @token on device @dev.
 *
 * Returns -EINVAL if the device has no detach_dmb operation, otherwise
 * the result of the device's detach_dmb operation.
 */
int smc_ism_detach_dmb(struct smcd_dev *dev, u64 token)
{
	const struct smcd_ops *ops = dev->ops;

	return ops->detach_dmb ? ops->detach_dmb(dev, token) : -EINVAL;
}
static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd,
@ -322,6 +367,8 @@ static void smc_nl_prep_smcd_dev(struct smcd_dev_list *dev_list,
list_for_each_entry(smcd, &dev_list->list, list) {
if (num < snum)
goto next;
if (smc_ism_is_loopback(smcd))
goto next;
if (smc_nl_handle_smcd_dev(smcd, skb, cb))
goto errout;
next:
@ -372,7 +419,8 @@ static void smcd_handle_sw_event(struct smc_ism_event_work *wrk)
smc_smcd_terminate(wrk->smcd, &peer_gid, ev_info.vlan_id);
break;
case ISM_EVENT_CODE_TESTLINK: /* Activity timer */
if (ev_info.code == ISM_EVENT_REQUEST) {
if (ev_info.code == ISM_EVENT_REQUEST &&
wrk->smcd->ops->signal_event) {
ev_info.code = ISM_EVENT_RESPONSE;
wrk->smcd->ops->signal_event(wrk->smcd,
&peer_gid,
@ -436,7 +484,7 @@ static struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name,
static void smcd_register_dev(struct ism_dev *ism)
{
const struct smcd_ops *ops = ism_get_smcd_ops();
struct smcd_dev *smcd;
struct smcd_dev *smcd, *fentry;
if (!ops)
return;
@ -446,20 +494,28 @@ static void smcd_register_dev(struct ism_dev *ism)
if (!smcd)
return;
smcd->priv = ism;
smcd->client = &smc_ism_client;
ism_set_priv(ism, &smc_ism_client, smcd);
if (smc_pnetid_by_dev_port(&ism->pdev->dev, 0, smcd->pnetid))
smc_pnetid_by_table_smcd(smcd);
if (smcd->ops->supports_v2())
smc_ism_set_v2_capable();
mutex_lock(&smcd_dev_list.mutex);
if (list_empty(&smcd_dev_list.list)) {
if (smcd->ops->supports_v2())
smc_ism_v2_capable = true;
}
/* sort list: devices without pnetid before devices with pnetid */
if (smcd->pnetid[0])
/* sort list:
* - devices without pnetid before devices with pnetid;
* - loopback-ism always at the very beginning;
*/
if (!smcd->pnetid[0]) {
fentry = list_first_entry_or_null(&smcd_dev_list.list,
struct smcd_dev, list);
if (fentry && smc_ism_is_loopback(fentry))
list_add(&smcd->list, &fentry->list);
else
list_add(&smcd->list, &smcd_dev_list.list);
} else {
list_add_tail(&smcd->list, &smcd_dev_list.list);
else
list_add(&smcd->list, &smcd_dev_list.list);
}
mutex_unlock(&smcd_dev_list.mutex);
pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n",
@ -541,6 +597,8 @@ int smc_ism_signal_shutdown(struct smc_link_group *lgr)
if (lgr->peer_shutdown)
return 0;
if (!lgr->smcd->ops->signal_event)
return 0;
memcpy(ev_info.uid, lgr->id, SMC_LGR_ID_SIZE);
ev_info.vlan_id = lgr->vlan_id;

View File

@ -48,10 +48,15 @@ int smc_ism_put_vlan(struct smcd_dev *dev, unsigned short vlan_id);
int smc_ism_register_dmb(struct smc_link_group *lgr, int buf_size,
struct smc_buf_desc *dmb_desc);
int smc_ism_unregister_dmb(struct smcd_dev *dev, struct smc_buf_desc *dmb_desc);
bool smc_ism_support_dmb_nocopy(struct smcd_dev *smcd);
int smc_ism_attach_dmb(struct smcd_dev *dev, u64 token,
struct smc_buf_desc *dmb_desc);
int smc_ism_detach_dmb(struct smcd_dev *dev, u64 token);
int smc_ism_signal_shutdown(struct smc_link_group *lgr);
void smc_ism_get_system_eid(u8 **eid);
u16 smc_ism_get_chid(struct smcd_dev *dev);
bool smc_ism_is_v2_capable(void);
void smc_ism_set_v2_capable(void);
int smc_ism_init(void);
void smc_ism_exit(void);
int smcd_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb);
@ -84,4 +89,9 @@ static inline bool smc_ism_is_emulated(struct smcd_dev *smcd)
return __smc_ism_is_emulated(chid);
}
static inline bool smc_ism_is_loopback(struct smcd_dev *smcd)
{
return (smcd->ops->get_chid(smcd) == 0xFFFF);
}
#endif

427
net/smc/smc_loopback.c Normal file
View File

@ -0,0 +1,427 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Shared Memory Communications Direct over loopback-ism device.
*
* Functions for loopback-ism device.
*
* Copyright (c) 2024, Alibaba Inc.
*
* Author: Wen Gu <guwen@linux.alibaba.com>
* Tony Lu <tonylu@linux.alibaba.com>
*
*/
#include <linux/device.h>
#include <linux/types.h>
#include <net/smc.h>
#include "smc_cdc.h"
#include "smc_ism.h"
#include "smc_loopback.h"
#define SMC_LO_V2_CAPABLE 0x1 /* loopback-ism acts as ISMv2 */
#define SMC_LO_SUPPORT_NOCOPY 0x1
#define SMC_DMA_ADDR_INVALID (~(dma_addr_t)0)
static const char smc_lo_dev_name[] = "loopback-ism";
static struct smc_lo_dev *lo_dev;
static void smc_lo_generate_ids(struct smc_lo_dev *ldev)
{
struct smcd_gid *lgid = &ldev->local_gid;
uuid_t uuid;
uuid_gen(&uuid);
memcpy(&lgid->gid, &uuid, sizeof(lgid->gid));
memcpy(&lgid->gid_ext, (u8 *)&uuid + sizeof(lgid->gid),
sizeof(lgid->gid_ext));
ldev->chid = SMC_LO_RESERVED_CHID;
}
/* Check whether the remote GID @rgid is reachable through loopback-ism,
 * which is only the case when it equals the device's own local GID.
 * @vid_valid and @vid are not used.
 *
 * Returns 0 if @rgid matches, -ENETUNREACH otherwise (including when the
 * device has no private data).
 */
static int smc_lo_query_rgid(struct smcd_dev *smcd, struct smcd_gid *rgid,
			     u32 vid_valid, u32 vid)
{
	struct smc_lo_dev *ldev = smcd->priv;

	if (!ldev)
		return -ENETUNREACH;

	/* rgid should be the same as lgid */
	if (rgid->gid != ldev->local_gid.gid)
		return -ENETUNREACH;
	if (rgid->gid_ext != ldev->local_gid.gid_ext)
		return -ENETUNREACH;

	return 0;
}
/* Create a new DMB on the loopback-ism device and publish it in the
 * device's token hash table.
 *
 * On entry dmb->dmb_len holds the requested buffer length; on success the
 * remaining fields of @dmb (sba_idx, dmb_tok, cpu_addr, dma_addr, dmb_len)
 * are filled in from the new node. @client_priv is not used.
 *
 * Returns 0 on success, -ENOSPC when no free index is left in
 * sba_idx_mask, or -ENOMEM when the node or its buffer cannot be allocated.
 */
static int smc_lo_register_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb,
void *client_priv)
{
struct smc_lo_dmb_node *dmb_node, *tmp_node;
struct smc_lo_dev *ldev = smcd->priv;
int sba_idx, rc;
/* check space for new dmb */
/* test_and_set_bit() re-checks each candidate atomically, so an index
 * claimed concurrently by someone else is skipped rather than shared
 */
for_each_clear_bit(sba_idx, ldev->sba_idx_mask, SMC_LO_MAX_DMBS) {
if (!test_and_set_bit(sba_idx, ldev->sba_idx_mask))
break;
}
/* loop ran to completion without claiming an index */
if (sba_idx == SMC_LO_MAX_DMBS)
return -ENOSPC;
dmb_node = kzalloc(sizeof(*dmb_node), GFP_KERNEL);
if (!dmb_node) {
rc = -ENOMEM;
goto err_bit;
}
dmb_node->sba_idx = sba_idx;
dmb_node->len = dmb->dmb_len;
/* zeroed buffer; the extra GFP flags make a failing allocation
 * return NULL quietly instead of retrying hard or warning
 */
dmb_node->cpu_addr = kzalloc(dmb_node->len, GFP_KERNEL |
__GFP_NOWARN | __GFP_NORETRY |
__GFP_NOMEMALLOC);
if (!dmb_node->cpu_addr) {
rc = -ENOMEM;
goto err_node;
}
/* no DMA address is set up for this buffer */
dmb_node->dma_addr = SMC_DMA_ADDR_INVALID;
/* initial reference; dropped again by smc_lo_unregister_dmb() */
refcount_set(&dmb_node->refcnt, 1);
again:
/* add new dmb into hash table */
/* the token is random; draw a new one if it collides with an
 * existing entry
 */
get_random_bytes(&dmb_node->token, sizeof(dmb_node->token));
write_lock_bh(&ldev->dmb_ht_lock);
hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb_node->token) {
if (tmp_node->token == dmb_node->token) {
write_unlock_bh(&ldev->dmb_ht_lock);
goto again;
}
}
hash_add(ldev->dmb_ht, &dmb_node->list, dmb_node->token);
write_unlock_bh(&ldev->dmb_ht_lock);
/* counterpart of the atomic_dec_and_test() in __smc_lo_unregister_dmb() */
atomic_inc(&ldev->dmb_cnt);
/* report the new DMB back to the caller */
dmb->sba_idx = dmb_node->sba_idx;
dmb->dmb_tok = dmb_node->token;
dmb->cpu_addr = dmb_node->cpu_addr;
dmb->dma_addr = dmb_node->dma_addr;
dmb->dmb_len = dmb_node->len;
return 0;
/* unwind in reverse order of acquisition */
err_node:
kfree(dmb_node);
err_bit:
clear_bit(sba_idx, ldev->sba_idx_mask);
return rc;
}
/* Tear down @dmb_node: unlink it from the token hash table, release its
 * sba index, free its buffer and the node itself, and wake up waiters on
 * ldev->ldev_release once the last DMB of the device is gone.
 *
 * Called from the unregister/detach paths after the node's refcount has
 * dropped to zero.
 */
static void __smc_lo_unregister_dmb(struct smc_lo_dev *ldev,
struct smc_lo_dmb_node *dmb_node)
{
/* remove dmb from hash table */
/* unlinking first ensures no new lookup can find the node before it
 * is freed below
 */
write_lock_bh(&ldev->dmb_ht_lock);
hash_del(&dmb_node->list);
write_unlock_bh(&ldev->dmb_ht_lock);
clear_bit(dmb_node->sba_idx, ldev->sba_idx_mask);
kvfree(dmb_node->cpu_addr);
kfree(dmb_node);
/* dmb_cnt reaching zero means no DMB is left on this device */
if (atomic_dec_and_test(&ldev->dmb_cnt))
wake_up(&ldev->ldev_release);
}
static int smc_lo_unregister_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb)
{
struct smc_lo_dmb_node *dmb_node = NULL, *tmp_node;
struct smc_lo_dev *ldev = smcd->priv;
/* find dmb from hash table */
read_lock_bh(&ldev->dmb_ht_lock);
hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb->dmb_tok) {
if (tmp_node->token == dmb->dmb_tok) {
dmb_node = tmp_node;
break;
}
}
if (!dmb_node) {
read_unlock_bh(&ldev->dmb_ht_lock);
return -EINVAL;
}
read_unlock_bh(&ldev->dmb_ht_lock);
if (refcount_dec_and_test(&dmb_node->refcnt))
__smc_lo_unregister_dmb(ldev, dmb_node);
return 0;
}
/* support_dmb_nocopy operation of loopback-ism: always reports
 * SMC_LO_SUPPORT_NOCOPY; @smcd is not used.
 */
static int smc_lo_support_dmb_nocopy(struct smcd_dev *smcd)
{
return SMC_LO_SUPPORT_NOCOPY;
}
/* Attach to an existing DMB: look up the node whose token equals
 * dmb->dmb_tok, take an additional reference on it and report its
 * properties (sba_idx, token, addresses, length) back through @dmb.
 *
 * Returns 0 on success, or -EINVAL if no DMB with that token exists or
 * the DMB is already being unregistered (its refcount has reached zero).
 * The reference taken here is dropped by smc_lo_detach_dmb().
 */
static int smc_lo_attach_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb)
{
	struct smc_lo_dmb_node *dmb_node = NULL, *tmp_node;
	struct smc_lo_dev *ldev = smcd->priv;

	/* find dmb_node according to dmb->dmb_tok */
	read_lock_bh(&ldev->dmb_ht_lock);
	hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb->dmb_tok) {
		if (tmp_node->token == dmb->dmb_tok) {
			dmb_node = tmp_node;
			break;
		}
	}

	/* Take the reference while dmb_ht_lock is still held. A node is
	 * only freed after __smc_lo_unregister_dmb() has removed it from
	 * the hash table under the write lock, so as long as we hold the
	 * read lock a node we found cannot be freed underneath us.
	 * Taking the reference after unlocking would leave a window in
	 * which the node could be freed before refcnt is touched.
	 *
	 * refcount_inc_not_zero() fails when the dmb is being
	 * unregistered but has not been removed from the hash table yet.
	 */
	if (!dmb_node || !refcount_inc_not_zero(&dmb_node->refcnt)) {
		read_unlock_bh(&ldev->dmb_ht_lock);
		return -EINVAL;
	}
	read_unlock_bh(&ldev->dmb_ht_lock);

	/* provide dmb information; safe without the lock because we now
	 * hold our own reference
	 */
	dmb->sba_idx = dmb_node->sba_idx;
	dmb->dmb_tok = dmb_node->token;
	dmb->cpu_addr = dmb_node->cpu_addr;
	dmb->dma_addr = dmb_node->dma_addr;
	dmb->dmb_len = dmb_node->len;
	return 0;
}
/* Detach from the DMB identified by @token: drop one reference on it and
 * tear the DMB down if that was the last reference.
 *
 * Returns 0 on success or -EINVAL if no DMB with that token exists.
 */
static int smc_lo_detach_dmb(struct smcd_dev *smcd, u64 token)
{
	struct smc_lo_dmb_node *cur, *found = NULL;
	struct smc_lo_dev *ldev = smcd->priv;

	/* find the node whose token matches */
	read_lock_bh(&ldev->dmb_ht_lock);
	hash_for_each_possible(ldev->dmb_ht, cur, list, token) {
		if (cur->token != token)
			continue;
		found = cur;
		break;
	}
	read_unlock_bh(&ldev->dmb_ht_lock);

	if (!found)
		return -EINVAL;

	if (refcount_dec_and_test(&found->refcnt))
		__smc_lo_unregister_dmb(ldev, found);
	return 0;
}
/*
 * Copy @size bytes from @data into the DMB identified by @dmb_tok at
 * @offset and schedule the rx tasklet of the connection bound to that DMB.
 *
 * Nothing is done (and 0 is returned) when @sf is false. @idx is not used
 * by this implementation.
 *
 * Returns 0 on success, -EINVAL if the token is unknown, or -EPIPE if no
 * live connection is bound to the DMB's sba_idx.
 */
static int smc_lo_move_data(struct smcd_dev *smcd, u64 dmb_tok,
			    unsigned int idx, bool sf, unsigned int offset,
			    void *data, unsigned int size)
{
	struct smc_lo_dmb_node *rmb_node = NULL, *tmp_node;
	struct smc_lo_dev *ldev = smcd->priv;
	struct smc_connection *conn;
	u32 sba_idx;

	if (!sf) {
		/* since sndbuf is merged with peer DMB, there is
		 * no need to copy data from sndbuf to peer DMB.
		 */
		return 0;
	}

	read_lock_bh(&ldev->dmb_ht_lock);
	hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb_tok) {
		if (tmp_node->token == dmb_tok) {
			rmb_node = tmp_node;
			break;
		}
	}
	if (!rmb_node) {
		read_unlock_bh(&ldev->dmb_ht_lock);
		return -EINVAL;
	}
	/* NOTE(review): offset + size is not checked against rmb_node->len;
	 * presumably the caller guarantees it - confirm.
	 */
	memcpy((char *)rmb_node->cpu_addr + offset, data, size);
	/* Snapshot the index while the lock still pins the node in the hash
	 * table: once the lock is dropped the node may be unlinked and
	 * freed, so it must not be dereferenced any more.
	 */
	sba_idx = rmb_node->sba_idx;
	read_unlock_bh(&ldev->dmb_ht_lock);

	conn = smcd->conn[sba_idx];
	if (!conn || conn->killed)
		return -EPIPE;
	tasklet_schedule(&conn->rx_tsklet);

	return 0;
}
/* .supports_v2 callback: returns the constant SMC_LO_V2_CAPABLE. */
static int smc_lo_supports_v2(void)
{
	return SMC_LO_V2_CAPABLE;
}
static void smc_lo_get_local_gid(struct smcd_dev *smcd,
struct smcd_gid *smcd_gid)
{
struct smc_lo_dev *ldev = smcd->priv;
smcd_gid->gid = ldev->local_gid.gid;
smcd_gid->gid_ext = ldev->local_gid.gid_ext;
}
static u16 smc_lo_get_chid(struct smcd_dev *smcd)
{
return ((struct smc_lo_dev *)smcd->priv)->chid;
}
static struct device *smc_lo_get_dev(struct smcd_dev *smcd)
{
return &((struct smc_lo_dev *)smcd->priv)->dev;
}
/* SMC-D device operations provided by the loopback-ism device. */
static const struct smcd_ops lo_ops = {
	/* device identity and peer query */
	.query_remote_gid	= smc_lo_query_rgid,
	.supports_v2		= smc_lo_supports_v2,
	.get_local_gid		= smc_lo_get_local_gid,
	.get_chid		= smc_lo_get_chid,
	.get_dev		= smc_lo_get_dev,

	/* DMB management */
	.register_dmb		= smc_lo_register_dmb,
	.unregister_dmb		= smc_lo_unregister_dmb,
	.support_dmb_nocopy	= smc_lo_support_dmb_nocopy,
	.attach_dmb		= smc_lo_attach_dmb,
	.detach_dmb		= smc_lo_detach_dmb,

	/* data path */
	.move_data		= smc_lo_move_data,

	/* callbacks this device does not provide */
	.add_vlan_id		= NULL,
	.del_vlan_id		= NULL,
	.set_vlan_required	= NULL,
	.reset_vlan_required	= NULL,
	.signal_event		= NULL,
};
/*
 * Allocate a zeroed smcd_dev together with a connection table of @max_dmbs
 * entries, attach @ops and initialise its locks, lists and wait queue.
 *
 * Returns the new device, or NULL if either allocation fails.
 */
static struct smcd_dev *smcd_lo_alloc_dev(const struct smcd_ops *ops,
					  int max_dmbs)
{
	struct smcd_dev *dev = kzalloc(sizeof(*dev), GFP_KERNEL);

	if (!dev)
		return NULL;

	dev->conn = kcalloc(max_dmbs, sizeof(struct smc_connection *),
			    GFP_KERNEL);
	if (!dev->conn) {
		/* undo the first allocation */
		kfree(dev);
		return NULL;
	}

	dev->ops = ops;
	spin_lock_init(&dev->lock);
	spin_lock_init(&dev->lgr_lock);
	INIT_LIST_HEAD(&dev->vlan);
	INIT_LIST_HEAD(&dev->lgr_list);
	init_waitqueue_head(&dev->lgrs_deleted);

	return dev;
}
/*
 * Create the smcd_dev for @ldev, link the two structures to each other and
 * add the smcd_dev to the global smcd_dev_list.
 *
 * Returns 0 on success, or -ENOMEM if the smcd_dev cannot be allocated.
 */
static int smcd_lo_register_dev(struct smc_lo_dev *ldev)
{
	struct smcd_dev *dev = smcd_lo_alloc_dev(&lo_ops, SMC_LO_MAX_DMBS);

	if (!dev)
		return -ENOMEM;

	/* cross-link loopback device and smcd device */
	dev->priv = ldev;
	ldev->smcd = dev;

	smc_ism_set_v2_capable();

	mutex_lock(&smcd_dev_list.mutex);
	list_add(&dev->list, &smcd_dev_list.list);
	mutex_unlock(&smcd_dev_list.mutex);

	pr_warn_ratelimited("smc: adding smcd device %s\n",
			    dev_name(&ldev->dev));

	return 0;
}
/*
 * Undo smcd_lo_register_dev(): mark the smcd_dev as going away, terminate
 * everything still using it, take it off the global smcd_dev_list and free
 * it together with its connection table.
 */
static void smcd_lo_unregister_dev(struct smc_lo_dev *ldev)
{
	struct smcd_dev *dev = ldev->smcd;

	pr_warn_ratelimited("smc: removing smcd device %s\n",
			    dev_name(&ldev->dev));

	/* set the flag before terminating */
	dev->going_away = 1;
	smc_smcd_terminate_all(dev);

	mutex_lock(&smcd_dev_list.mutex);
	list_del_init(&dev->list);
	mutex_unlock(&smcd_dev_list.mutex);

	kfree(dev->conn);
	kfree(dev);
}
/*
 * Initialise the loopback device state (ids, DMB hash table and its lock,
 * DMB counter, release wait queue) and register the matching smcd_dev.
 *
 * Returns the result of smcd_lo_register_dev().
 */
static int smc_lo_dev_init(struct smc_lo_dev *ldev)
{
	int rc;

	smc_lo_generate_ids(ldev);

	/* DMB bookkeeping starts out empty */
	rwlock_init(&ldev->dmb_ht_lock);
	hash_init(ldev->dmb_ht);
	atomic_set(&ldev->dmb_cnt, 0);
	init_waitqueue_head(&ldev->ldev_release);

	rc = smcd_lo_register_dev(ldev);
	return rc;
}
/*
 * Unregister the smcd_dev of @ldev, then block until dmb_cnt has dropped
 * to zero.
 */
static void smc_lo_dev_exit(struct smc_lo_dev *ldev)
{
	smcd_lo_unregister_dev(ldev);

	/* nothing to wait for when no DMB is left */
	if (!atomic_read(&ldev->dmb_cnt))
		return;

	wait_event(ldev->ldev_release, !atomic_read(&ldev->dmb_cnt));
}
/*
 * Release callback of the embedded struct device: free the smc_lo_dev that
 * contains @dev.
 */
static void smc_lo_dev_release(struct device *dev)
{
	kfree(container_of(dev, struct smc_lo_dev, dev));
}
/*
 * Allocate and initialise the loopback device and publish it in the
 * global lo_dev pointer.
 *
 * Returns 0 on success, -ENOMEM if the device cannot be allocated, or the
 * error reported by dev_set_name() or smc_lo_dev_init().
 */
static int smc_lo_dev_probe(void)
{
	struct smc_lo_dev *ldev;
	int ret;

	ldev = kzalloc(sizeof(*ldev), GFP_KERNEL);
	if (!ldev)
		return -ENOMEM;

	ldev->dev.parent = NULL;
	ldev->dev.release = smc_lo_dev_release;
	device_initialize(&ldev->dev);

	/* dev_set_name() takes a printf-style format and can fail: pass the
	 * name as an argument rather than as the format string, and do not
	 * carry on with an unnamed device.
	 */
	ret = dev_set_name(&ldev->dev, "%s", smc_lo_dev_name);
	if (ret)
		goto free_dev;

	ret = smc_lo_dev_init(ldev);
	if (ret)
		goto free_dev;

	lo_dev = ldev; /* global loopback device */
	return 0;

free_dev:
	/* dropping the reference taken by device_initialize() frees ldev
	 * through smc_lo_dev_release()
	 */
	put_device(&ldev->dev);
	return ret;
}
static void smc_lo_dev_remove(void)
{
if (!lo_dev)
return;
smc_lo_dev_exit(lo_dev);
put_device(&lo_dev->dev); /* device_initialize in smc_lo_dev_probe */
}
/*
 * Entry point for setting up the loopback device.
 *
 * Returns the result of smc_lo_dev_probe().
 */
int smc_loopback_init(void)
{
	int rc = smc_lo_dev_probe();

	return rc;
}
/* Entry point for tearing down the loopback device. */
void smc_loopback_exit(void)
{
	smc_lo_dev_remove();
}

61
net/smc/smc_loopback.h Normal file
View File

@ -0,0 +1,61 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Shared Memory Communications Direct over loopback-ism device.
*
* SMC-D loopback-ism device structure definitions.
*
* Copyright (c) 2024, Alibaba Inc.
*
* Author: Wen Gu <guwen@linux.alibaba.com>
* Tony Lu <tonylu@linux.alibaba.com>
*
*/
#ifndef _SMC_LOOPBACK_H
#define _SMC_LOOPBACK_H
#include <linux/device.h>
#include <linux/err.h>
#include <net/smc.h>
#if IS_ENABLED(CONFIG_SMC_LO)
#define SMC_LO_MAX_DMBS 5000
#define SMC_LO_DMBS_HASH_BITS 12
#define SMC_LO_RESERVED_CHID 0xFFFF
/**
 * struct smc_lo_dmb_node - one DMB tracked by the loopback-ism device
 * @list: linkage in smc_lo_dev.dmb_ht; added and removed under dmb_ht_lock
 * @token: key the node is hashed and looked up by (compared with dmb_tok)
 * @len: buffer length, reported as smcd_dmb.dmb_len on attach
 * @sba_idx: bit owned in smc_lo_dev.sba_idx_mask; also used as index into
 *           smcd_dev.conn when data is moved into this DMB
 * @cpu_addr: start of the buffer; destination of move_data copies, freed
 *            with kvfree() on teardown
 * @dma_addr: reported as smcd_dmb.dma_addr on attach
 * @refcnt: reference count; the node is torn down when it drops to zero
 */
struct smc_lo_dmb_node {
struct hlist_node list;
u64 token;
u32 len;
u32 sba_idx;
void *cpu_addr;
dma_addr_t dma_addr;
refcount_t refcnt;
};
/**
 * struct smc_lo_dev - state of the loopback-ism device
 * @smcd: smcd_dev created for this device; its priv points back here
 * @dev: embedded device; its release callback frees this structure
 * @chid: value returned by the get_chid operation
 *        (presumably filled in by smc_lo_generate_ids() - confirm)
 * @local_gid: gid/gid_ext returned by the get_local_gid operation
 *             (presumably filled in by smc_lo_generate_ids() - confirm)
 * @dmb_cnt: DMB counter; decremented on DMB teardown, device exit waits
 *           for it to reach zero
 * @dmb_ht_lock: rwlock taken around lookups in and removals from @dmb_ht
 * @sba_idx_mask: bitmap of SMC_LO_MAX_DMBS bits; a DMB's bit is cleared on
 *                teardown
 * @dmb_ht: hash table of smc_lo_dmb_node, keyed by token
 * @ldev_release: wait queue woken when @dmb_cnt drops to zero
 */
struct smc_lo_dev {
struct smcd_dev *smcd;
struct device dev;
u16 chid;
struct smcd_gid local_gid;
atomic_t dmb_cnt;
rwlock_t dmb_ht_lock;
DECLARE_BITMAP(sba_idx_mask, SMC_LO_MAX_DMBS);
DECLARE_HASHTABLE(dmb_ht, SMC_LO_DMBS_HASH_BITS);
wait_queue_head_t ldev_release;
};
int smc_loopback_init(void);
void smc_loopback_exit(void);
#else
/* Stub used when CONFIG_SMC_LO is disabled: nothing to set up, report success. */
static inline int smc_loopback_init(void)
{
	return 0;
}
/* Stub used when CONFIG_SMC_LO is disabled: nothing to tear down. */
static inline void smc_loopback_exit(void)
{
}
#endif
#endif /* _SMC_LOOPBACK_H */