linux-stable/net/smc/smc_clc.h
Wen Gu b8d199451c net/smc: Allow virtually contiguous sndbufs or RMBs for SMC-R
On long-running enterprise production servers, high-order contiguous
memory pages are usually scarce, and in most cases only fragmented
pages are available.

When replacing TCP with SMC-R in such production scenarios, attempting
to allocate high-order physically contiguous sndbufs and RMBs may trigger
frequent memory compaction, which can cause unexpected hangs and poses
further stability risks.

So this patch allows SMC-R link groups to use virtually contiguous
sndbufs and RMBs to avoid the potential issues mentioned above. Whether
physically or virtually contiguous buffers are used can be selected via
the sysctl smcr_buf_type.
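
For example, assuming the sysctl is exposed as net.smc.smcr_buf_type
(with 0 selecting physically contiguous buffers, 1 virtually contiguous
buffers, and 2 a mixed mode; the name and values here are illustrative),
virtually contiguous buffers could be enabled like this:

  # prefer virtually contiguous sndbufs and RMBs for new link groups
  sysctl -w net.smc.smcr_buf_type=1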

Note that using virtually contiguous buffers brings an acceptable
performance regression, which falls into two main parts:

1) regression in the data path, caused by the additional address
   translation of the sndbuf performed by the RNIC in Tx. In general,
   however, translating addresses through the MTT is fast.

   Taking a 256KB sndbuf and RMB as an example, the qperf latency and
   bandwidth comparisons between physically and virtually contiguous
   buffers are as follows:

   - client:
     smc_run taskset -c <cpu> qperf <server> -oo msg_size:1:64K:*2 \
     -t 5 -vu tcp_{bw|lat}
   - server:
     smc_run taskset -c <cpu> qperf

   [latency]
   msgsize              tcp            smcr        smcr-use-virt-buf
   1               11.17 us         7.56 us         7.51 us (-0.67%)
   2               10.65 us         7.74 us         7.56 us (-2.31%)
   4               11.11 us         7.52 us         7.59 us ( 0.84%)
   8               10.83 us         7.55 us         7.51 us (-0.48%)
   16              11.21 us         7.46 us         7.51 us ( 0.71%)
   32              10.65 us         7.53 us         7.58 us ( 0.61%)
   64              10.95 us         7.74 us         7.80 us ( 0.76%)
   128             11.14 us         7.83 us         7.87 us ( 0.47%)
   256             10.97 us         7.94 us         7.92 us (-0.28%)
   512             11.23 us         7.94 us         8.20 us ( 3.25%)
   1024            11.60 us         8.12 us         8.20 us ( 0.96%)
   2048            14.04 us         8.30 us         8.51 us ( 2.49%)
   4096            16.88 us         9.13 us         9.07 us (-0.64%)
   8192            22.50 us        10.56 us        11.22 us ( 6.26%)
   16384           28.99 us        12.88 us        13.83 us ( 7.37%)
   32768           40.13 us        16.76 us        16.95 us ( 1.16%)
   65536           68.70 us        24.68 us        24.85 us ( 0.68%)

   [bandwidth]
   msgsize                tcp              smcr          smcr-use-virt-buf
   1                1.65 MB/s         1.59 MB/s         1.53 MB/s (-3.88%)
   2                3.32 MB/s         3.17 MB/s         3.08 MB/s (-2.67%)
   4                6.66 MB/s         6.33 MB/s         6.09 MB/s (-3.85%)
   8               13.67 MB/s        13.45 MB/s        11.97 MB/s (-10.99%)
   16              25.36 MB/s        27.15 MB/s        24.16 MB/s (-11.01%)
   32              48.22 MB/s        54.24 MB/s        49.41 MB/s (-8.89%)
   64             106.79 MB/s       107.32 MB/s        99.05 MB/s (-7.71%)
   128            210.21 MB/s       202.46 MB/s       201.02 MB/s (-0.71%)
   256            400.81 MB/s       416.81 MB/s       393.52 MB/s (-5.59%)
   512            746.49 MB/s       834.12 MB/s       809.99 MB/s (-2.89%)
   1024          1292.33 MB/s      1641.96 MB/s      1571.82 MB/s (-4.27%)
   2048          2007.64 MB/s      2760.44 MB/s      2717.68 MB/s (-1.55%)
   4096          2665.17 MB/s      4157.44 MB/s      4070.76 MB/s (-2.09%)
   8192          3159.72 MB/s      4361.57 MB/s      4270.65 MB/s (-2.08%)
   16384         4186.70 MB/s      4574.13 MB/s      4501.17 MB/s (-1.60%)
   32768         4093.21 MB/s      4487.42 MB/s      4322.43 MB/s (-3.68%)
   65536         4057.14 MB/s      4735.61 MB/s      4555.17 MB/s (-3.81%)

2) regression in the buffer initialization and destruction path, caused
   by the additional MR operations on sndbufs. Thanks to the link group
   buffer reuse mechanism, the impact of this regression decreases as
   buffers are reused more often.

   Taking a 256KB sndbuf and RMB as an example, the latencies of some key
   SMC-R buffer-related functions, measured with bpftrace, are as follows:

   Function                         Phys-bufs           Virt-bufs
   smcr_new_buf_create()             67154 ns            79164 ns
   smc_ib_buf_map_sg()                 525 ns              928 ns
   smc_ib_get_memory_region()       162294 ns           161191 ns
   smc_wr_reg_send()                  9957 ns             9635 ns
   smc_ib_put_memory_region()       203548 ns           198374 ns
   smc_ib_buf_unmap_sg()               508 ns             1158 ns

------------
Test environment notes:
1. The above tests were run on 2 VMs within the same host.
2. The NIC is a ConnectX-4 Lx, using SR-IOV to pass 2 VFs through,
   one to each VM.
3. The VMs' vCPUs are bound to different physical CPUs, and the bound
   physical CPUs are isolated via the `isolcpus=xxx` cmdline.
4. The NICs' queue number is set to 1.

Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-07-18 11:19:17 +01:00

/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 * CLC (connection layer control) handshake over initial TCP socket to
 * prepare for RDMA traffic
 *
 * Copyright IBM Corp. 2016
 *
 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
 */
#ifndef _SMC_CLC_H
#define _SMC_CLC_H
#include <rdma/ib_verbs.h>
#include <linux/smc.h>
#include "smc.h"
#include "smc_netlink.h"
#define SMC_CLC_PROPOSAL 0x01
#define SMC_CLC_ACCEPT 0x02
#define SMC_CLC_CONFIRM 0x03
#define SMC_CLC_DECLINE 0x04
#define SMC_TYPE_R 0 /* SMC-R only */
#define SMC_TYPE_D 1 /* SMC-D only */
#define SMC_TYPE_N 2 /* neither SMC-R nor SMC-D */
#define SMC_TYPE_B 3 /* SMC-R and SMC-D */
#define CLC_WAIT_TIME (6 * HZ) /* max. wait time on clcsock */
#define CLC_WAIT_TIME_SHORT HZ /* short wait time on clcsock */
#define SMC_CLC_DECL_MEM 0x01010000 /* insufficient memory resources */
#define SMC_CLC_DECL_TIMEOUT_CL 0x02010000 /* timeout w4 QP confirm link */
#define SMC_CLC_DECL_TIMEOUT_AL 0x02020000 /* timeout w4 QP add link */
#define SMC_CLC_DECL_CNFERR 0x03000000 /* configuration error */
#define SMC_CLC_DECL_PEERNOSMC 0x03010000 /* peer did not indicate SMC */
#define SMC_CLC_DECL_IPSEC 0x03020000 /* IPsec usage */
#define SMC_CLC_DECL_NOSMCDEV 0x03030000 /* no SMC device found (R or D) */
#define SMC_CLC_DECL_NOSMCDDEV 0x03030001 /* no SMC-D device found */
#define SMC_CLC_DECL_NOSMCRDEV 0x03030002 /* no SMC-R device found */
#define SMC_CLC_DECL_NOISM2SUPP 0x03030003 /* hardware has no ISMv2 support */
#define SMC_CLC_DECL_NOV2EXT 0x03030004 /* peer sent no clc v2 extension */
#define SMC_CLC_DECL_NOV2DEXT 0x03030005 /* peer sent no clc SMC-Dv2 ext. */
#define SMC_CLC_DECL_NOSEID 0x03030006 /* peer sent no SEID */
#define SMC_CLC_DECL_NOSMCD2DEV 0x03030007 /* no SMC-Dv2 device found */
#define SMC_CLC_DECL_NOUEID 0x03030008 /* peer sent no UEID */
#define SMC_CLC_DECL_MODEUNSUPP 0x03040000 /* smc modes do not match (R or D)*/
#define SMC_CLC_DECL_RMBE_EC 0x03050000 /* peer has eyecatcher in RMBE */
#define SMC_CLC_DECL_OPTUNSUPP 0x03060000 /* fastopen sockopt not supported */
#define SMC_CLC_DECL_DIFFPREFIX 0x03070000 /* IP prefix / subnet mismatch */
#define SMC_CLC_DECL_GETVLANERR 0x03080000 /* err to get vlan id of ip device*/
#define SMC_CLC_DECL_ISMVLANERR 0x03090000 /* err to reg vlan id on ism dev */
#define SMC_CLC_DECL_NOACTLINK 0x030a0000 /* no active smc-r link in lgr */
#define SMC_CLC_DECL_NOSRVLINK 0x030b0000 /* SMC-R link from srv not found */
#define SMC_CLC_DECL_VERSMISMAT 0x030c0000 /* SMC version mismatch */
#define SMC_CLC_DECL_MAX_DMB 0x030d0000 /* SMC-D DMB limit exceeded */
#define SMC_CLC_DECL_NOROUTE 0x030e0000 /* SMC-Rv2 conn. no route to peer */
#define SMC_CLC_DECL_NOINDIRECT 0x030f0000 /* SMC-Rv2 conn. indirect mismatch*/
#define SMC_CLC_DECL_SYNCERR 0x04000000 /* synchronization error */
#define SMC_CLC_DECL_PEERDECL 0x05000000 /* peer declined during handshake */
#define SMC_CLC_DECL_INTERR 0x09990000 /* internal error */
#define SMC_CLC_DECL_ERR_RTOK 0x09990001 /* rtoken handling failed */
#define SMC_CLC_DECL_ERR_RDYLNK 0x09990002 /* ib ready link failed */
#define SMC_CLC_DECL_ERR_REGBUF 0x09990003 /* reg rdma bufs failed */
#define SMC_FIRST_CONTACT_MASK 0b10 /* first contact bit within typev2 */
struct smc_clc_msg_hdr { /* header1 of clc messages */
        u8 eyecatcher[4]; /* eye catcher */
        u8 type; /* proposal / accept / confirm / decline */
        __be16 length;
#if defined(__BIG_ENDIAN_BITFIELD)
        u8 version : 4,
           typev2 : 2,
           typev1 : 2;
#elif defined(__LITTLE_ENDIAN_BITFIELD)
        u8 typev1 : 2,
           typev2 : 2,
           version : 4;
#endif
} __packed; /* format defined in RFC7609 */
struct smc_clc_msg_trail { /* trailer of clc messages */
        u8 eyecatcher[4];
};
struct smc_clc_msg_local { /* header2 of clc messages */
        u8 id_for_peer[SMC_SYSTEMID_LEN]; /* unique system id */
        u8 gid[16]; /* gid of ib_device port */
        u8 mac[6]; /* mac of ib_device port */
};
/* Struct would be 4 byte aligned, but it is used in an array that is sent
 * to peers and must conform to RFC7609, hence we need to use packed here.
 */
struct smc_clc_ipv6_prefix {
        struct in6_addr prefix;
        u8 prefix_len;
} __packed; /* format defined in RFC7609 */
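/* CLC v2 flags: CLC release number and SEID indicator */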
#if defined(__BIG_ENDIAN_BITFIELD)
struct smc_clc_v2_flag {
        u8 release : 4,
           rsvd : 3,
           seid : 1;
};
#elif defined(__LITTLE_ENDIAN_BITFIELD)
struct smc_clc_v2_flag {
        u8 seid : 1,
           rsvd : 3,
           release : 4;
};
#endif
struct smc_clnt_opts_area_hdr {
        u8 eid_cnt; /* number of user defined EIDs */
        u8 ism_gid_cnt; /* number of ISMv2 GIDs */
        u8 reserved1;
        struct smc_clc_v2_flag flag;
        u8 reserved2[2];
        __be16 smcd_v2_ext_offset; /* SMC-Dv2 Extension Offset */
};
struct smc_clc_smcd_gid_chid {
        __be64 gid; /* ISM GID */
        __be16 chid; /* ISMv2 CHID */
} __packed; /* format defined in
             * IBM Shared Memory Communications Version 2
             * (https://www.ibm.com/support/pages/node/6326337)
             */
struct smc_clc_v2_extension {
        struct smc_clnt_opts_area_hdr hdr;
        u8 roce[16]; /* RoCEv2 GID */
        u8 reserved[16];
        u8 user_eids[][SMC_MAX_EID_LEN];
};
struct smc_clc_msg_proposal_prefix { /* prefix part of clc proposal message*/
        __be32 outgoing_subnet; /* subnet mask */
        u8 prefix_len; /* number of significant bits in mask */
        u8 reserved[2];
        u8 ipv6_prefixes_cnt; /* number of IPv6 prefixes in prefix array */
} __aligned(4);
struct smc_clc_msg_smcd { /* SMC-D GID information */
        struct smc_clc_smcd_gid_chid ism; /* ISM native GID+CHID of requestor */
        __be16 v2_ext_offset; /* SMC Version 2 Extension Offset */
        u8 reserved[28];
};
struct smc_clc_smcd_v2_extension {
        u8 system_eid[SMC_MAX_EID_LEN];
        u8 reserved[16];
        struct smc_clc_smcd_gid_chid gidchid[];
};
struct smc_clc_msg_proposal { /* clc proposal message sent by Linux */
        struct smc_clc_msg_hdr hdr;
        struct smc_clc_msg_local lcl;
        __be16 iparea_offset; /* offset to IP address information area */
} __aligned(4);
#define SMC_CLC_MAX_V6_PREFIX 8
#define SMC_CLC_MAX_UEID 8
struct smc_clc_msg_proposal_area {
        struct smc_clc_msg_proposal pclc_base;
        struct smc_clc_msg_smcd pclc_smcd;
        struct smc_clc_msg_proposal_prefix pclc_prfx;
        struct smc_clc_ipv6_prefix pclc_prfx_ipv6[SMC_CLC_MAX_V6_PREFIX];
        struct smc_clc_v2_extension pclc_v2_ext;
        u8 user_eids[SMC_CLC_MAX_UEID][SMC_MAX_EID_LEN];
        struct smc_clc_smcd_v2_extension pclc_smcd_v2_ext;
        struct smc_clc_smcd_gid_chid pclc_gidchids[SMC_MAX_ISM_DEVS];
        struct smc_clc_msg_trail pclc_trl;
};
struct smcr_clc_msg_accept_confirm { /* SMCR accept/confirm */
        struct smc_clc_msg_local lcl;
        u8 qpn[3]; /* QP number */
        __be32 rmb_rkey; /* RMB rkey */
        u8 rmbe_idx; /* Index of RMBE in RMB */
        __be32 rmbe_alert_token; /* unique connection id */
#if defined(__BIG_ENDIAN_BITFIELD)
        u8 rmbe_size : 4, /* buf size (compressed) */
           qp_mtu : 4; /* QP mtu */
#elif defined(__LITTLE_ENDIAN_BITFIELD)
        u8 qp_mtu : 4,
           rmbe_size : 4;
#endif
        u8 reserved;
        __be64 rmb_dma_addr; /* RMB virtual address */
        u8 reserved2;
        u8 psn[3]; /* packet sequence number */
} __packed;
struct smcd_clc_msg_accept_confirm_common { /* SMCD accept/confirm */
        u64 gid; /* Sender GID */
        u64 token; /* DMB token */
        u8 dmbe_idx; /* DMBE index */
#if defined(__BIG_ENDIAN_BITFIELD)
        u8 dmbe_size : 4, /* buf size (compressed) */
           reserved3 : 4;
#elif defined(__LITTLE_ENDIAN_BITFIELD)
        u8 reserved3 : 4,
           dmbe_size : 4;
#endif
        u16 reserved4;
        __be32 linkid; /* Link identifier */
} __packed;
#define SMC_CLC_OS_ZOS 1
#define SMC_CLC_OS_LINUX 2
#define SMC_CLC_OS_AIX 3
struct smc_clc_first_contact_ext {
#if defined(__BIG_ENDIAN_BITFIELD)
        u8 v2_direct : 1,
           reserved : 7;
        u8 os_type : 4,
           release : 4;
#elif defined(__LITTLE_ENDIAN_BITFIELD)
        u8 reserved : 7,
           v2_direct : 1;
        u8 release : 4,
           os_type : 4;
#endif
        u8 reserved2[2];
        u8 hostname[SMC_MAX_HOSTNAME_LEN];
};
struct smc_clc_fce_gid_ext {
        u8 reserved[16];
        u8 gid_cnt;
        u8 reserved2[3];
        u8 gid[][SMC_GID_SIZE];
};
struct smc_clc_msg_accept_confirm { /* clc accept / confirm message */
        struct smc_clc_msg_hdr hdr;
        union {
                struct smcr_clc_msg_accept_confirm r0; /* SMC-R */
                struct { /* SMC-D */
                        struct smcd_clc_msg_accept_confirm_common d0;
                        u32 reserved5[3];
                };
        };
} __packed; /* format defined in RFC7609 */
struct smc_clc_msg_accept_confirm_v2 { /* clc accept / confirm message */
        struct smc_clc_msg_hdr hdr;
        union {
                struct { /* SMC-R */
                        struct smcr_clc_msg_accept_confirm r0;
                        u8 eid[SMC_MAX_EID_LEN];
                        u8 reserved6[8];
                } r1;
                struct { /* SMC-D */
                        struct smcd_clc_msg_accept_confirm_common d0;
                        __be16 chid;
                        u8 eid[SMC_MAX_EID_LEN];
                        u8 reserved5[8];
                } d1;
        };
};
struct smc_clc_msg_decline { /* clc decline message */
        struct smc_clc_msg_hdr hdr;
        u8 id_for_peer[SMC_SYSTEMID_LEN]; /* sender peer_id */
        __be32 peer_diagnosis; /* diagnosis information */
#if defined(__BIG_ENDIAN_BITFIELD)
        u8 os_type : 4,
           reserved : 4;
#elif defined(__LITTLE_ENDIAN_BITFIELD)
        u8 reserved : 4,
           os_type : 4;
#endif
        u8 reserved2[3];
        struct smc_clc_msg_trail trl; /* eye catcher "SMCD" or "SMCR" EBCDIC */
} __aligned(4);
#define SMC_DECL_DIAG_COUNT_V2 4 /* no. of additional peer diagnosis codes */
struct smc_clc_msg_decline_v2 { /* clc decline message */
        struct smc_clc_msg_hdr hdr;
        u8 id_for_peer[SMC_SYSTEMID_LEN]; /* sender peer_id */
        __be32 peer_diagnosis; /* diagnosis information */
#if defined(__BIG_ENDIAN_BITFIELD)
        u8 os_type : 4,
           reserved : 4;
#elif defined(__LITTLE_ENDIAN_BITFIELD)
        u8 reserved : 4,
           os_type : 4;
#endif
        u8 reserved2[3];
        __be32 peer_diagnosis_v2[SMC_DECL_DIAG_COUNT_V2];
        struct smc_clc_msg_trail trl; /* eye catcher "SMCD" or "SMCR" EBCDIC */
} __aligned(4);
/* determine start of the prefix area within the proposal message */
static inline struct smc_clc_msg_proposal_prefix *
smc_clc_proposal_get_prefix(struct smc_clc_msg_proposal *pclc)
{
        return (struct smc_clc_msg_proposal_prefix *)
               ((u8 *)pclc + sizeof(*pclc) + ntohs(pclc->iparea_offset));
}
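/* check which SMC variants a CLC type value indicates */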
static inline bool smcr_indicated(int smc_type)
{
        return smc_type == SMC_TYPE_R || smc_type == SMC_TYPE_B;
}
static inline bool smcd_indicated(int smc_type)
{
        return smc_type == SMC_TYPE_D || smc_type == SMC_TYPE_B;
}
static inline u8 smc_indicated_type(int is_smcd, int is_smcr)
{
        if (is_smcd && is_smcr)
                return SMC_TYPE_B;
        if (is_smcd)
                return SMC_TYPE_D;
        if (is_smcr)
                return SMC_TYPE_R;
        return SMC_TYPE_N;
}
/* get SMC-D info from proposal message */
static inline struct smc_clc_msg_smcd *
smc_get_clc_msg_smcd(struct smc_clc_msg_proposal *prop)
{
        if (smcd_indicated(prop->hdr.typev1) &&
            ntohs(prop->iparea_offset) != sizeof(struct smc_clc_msg_smcd))
                return NULL;
        return (struct smc_clc_msg_smcd *)(prop + 1);
}
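/* get the CLC v2 extension from a proposal message; the extension offset
 * is counted from the end of the v2_ext_offset field
 */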
static inline struct smc_clc_v2_extension *
smc_get_clc_v2_ext(struct smc_clc_msg_proposal *prop)
{
        struct smc_clc_msg_smcd *prop_smcd = smc_get_clc_msg_smcd(prop);

        if (!prop_smcd || !ntohs(prop_smcd->v2_ext_offset))
                return NULL;
        return (struct smc_clc_v2_extension *)
                ((u8 *)prop_smcd +
                 offsetof(struct smc_clc_msg_smcd, v2_ext_offset) +
                 sizeof(prop_smcd->v2_ext_offset) +
                 ntohs(prop_smcd->v2_ext_offset));
}
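/* get the SMC-Dv2 extension from the CLC v2 extension; the extension offset
 * is counted from the end of the smcd_v2_ext_offset field
 */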
static inline struct smc_clc_smcd_v2_extension *
smc_get_clc_smcd_v2_ext(struct smc_clc_v2_extension *prop_v2ext)
{
        if (!prop_v2ext)
                return NULL;
        if (!ntohs(prop_v2ext->hdr.smcd_v2_ext_offset))
                return NULL;

        return (struct smc_clc_smcd_v2_extension *)
                ((u8 *)prop_v2ext +
                 offsetof(struct smc_clc_v2_extension, hdr) +
                 offsetof(struct smc_clnt_opts_area_hdr, smcd_v2_ext_offset) +
                 sizeof(prop_v2ext->hdr.smcd_v2_ext_offset) +
                 ntohs(prop_v2ext->hdr.smcd_v2_ext_offset));
}
struct smcd_dev;
struct smc_init_info;
int smc_clc_prfx_match(struct socket *clcsock,
                       struct smc_clc_msg_proposal_prefix *prop);
int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
                     u8 expected_type, unsigned long timeout);
int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, u8 version);
int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini);
int smc_clc_send_confirm(struct smc_sock *smc, bool clnt_first_contact,
                         u8 version, u8 *eid, struct smc_init_info *ini);
int smc_clc_send_accept(struct smc_sock *smc, bool srv_first_contact,
                        u8 version, u8 *negotiated_eid);
void smc_clc_init(void) __init;
void smc_clc_exit(void);
void smc_clc_get_hostname(u8 **host);
bool smc_clc_match_eid(u8 *negotiated_eid,
                       struct smc_clc_v2_extension *smc_v2_ext,
                       u8 *peer_eid, u8 *local_eid);
int smc_clc_ueid_count(void);
int smc_nl_dump_ueid(struct sk_buff *skb, struct netlink_callback *cb);
int smc_nl_add_ueid(struct sk_buff *skb, struct genl_info *info);
int smc_nl_remove_ueid(struct sk_buff *skb, struct genl_info *info);
int smc_nl_flush_ueid(struct sk_buff *skb, struct genl_info *info);
int smc_nl_dump_seid(struct sk_buff *skb, struct netlink_callback *cb);
int smc_nl_enable_seid(struct sk_buff *skb, struct genl_info *info);
int smc_nl_disable_seid(struct sk_buff *skb, struct genl_info *info);
#endif