diff --git a/include/linux/socket.h b/include/linux/socket.h index 8ef26d89ef49..6f85f5d957ef 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -366,6 +366,7 @@ struct ucred { #define SOL_XDP 283 #define SOL_MPTCP 284 #define SOL_MCTP 285 +#define SOL_SMC 286 /* IPX options */ #define IPX_TYPE 1 diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 78b91bb92f0d..1168302b7927 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -394,6 +394,7 @@ struct tcp_sock { bool is_mptcp; #endif #if IS_ENABLED(CONFIG_SMC) + bool (*smc_hs_congested)(const struct sock *sk); bool syn_smc; /* SYN includes SMC */ #endif diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index ea8a9cf2619b..47b166684fd8 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -12,5 +12,7 @@ struct netns_smc { /* protect fback_rsn */ struct mutex mutex_fback_rsn; struct smc_stats_rsn *fback_rsn; + + bool limit_smc_hs; /* constraint on handshake */ }; #endif diff --git a/include/uapi/linux/smc.h b/include/uapi/linux/smc.h index 6c2874fd2c00..693f549f6966 100644 --- a/include/uapi/linux/smc.h +++ b/include/uapi/linux/smc.h @@ -59,6 +59,9 @@ enum { SMC_NETLINK_DUMP_SEID, SMC_NETLINK_ENABLE_SEID, SMC_NETLINK_DISABLE_SEID, + SMC_NETLINK_DUMP_HS_LIMITATION, + SMC_NETLINK_ENABLE_HS_LIMITATION, + SMC_NETLINK_DISABLE_HS_LIMITATION, }; /* SMC_GENL_FAMILY top level attributes */ @@ -284,4 +287,16 @@ enum { __SMC_NLA_SEID_TABLE_MAX, SMC_NLA_SEID_TABLE_MAX = __SMC_NLA_SEID_TABLE_MAX - 1 }; + +/* SMC_NETLINK_HS_LIMITATION attributes */ +enum { + SMC_NLA_HS_LIMITATION_UNSPEC, + SMC_NLA_HS_LIMITATION_ENABLED, /* u8 */ + __SMC_NLA_HS_LIMITATION_MAX, + SMC_NLA_HS_LIMITATION_MAX = __SMC_NLA_HS_LIMITATION_MAX - 1 +}; + +/* SMC socket options */ +#define SMC_LIMIT_HS 1 /* constraint on smc handshake */ + #endif /* _UAPI_LINUX_SMC_H */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index af94a6d22a9d..92e65d56dc2c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6703,7 +6703,8 @@ static void tcp_openreq_init(struct request_sock *req, ireq->ir_num = ntohs(tcp_hdr(skb)->dest); ireq->ir_mark = inet_request_mark(sk, skb); #if IS_ENABLED(CONFIG_SMC) - ireq->smc_ok = rx_opt->smc_ok; + ireq->smc_ok = rx_opt->smc_ok && !(tcp_sk(sk)->smc_hs_congested && + tcp_sk(sk)->smc_hs_congested(sk)); #endif } diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 00b2e9deabb0..246c874de629 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -59,12 +59,52 @@ static DEFINE_MUTEX(smc_client_lgr_pending); /* serialize link group * creation on client */ +static struct workqueue_struct *smc_tcp_ls_wq; /* wq for tcp listen work */ struct workqueue_struct *smc_hs_wq; /* wq for handshake work */ struct workqueue_struct *smc_close_wq; /* wq for close work */ static void smc_tcp_listen_work(struct work_struct *); static void smc_connect_work(struct work_struct *); +int smc_nl_dump_hs_limitation(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + void *hdr; + + if (cb_ctx->pos[0]) + goto out; + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &smc_gen_nl_family, NLM_F_MULTI, + SMC_NETLINK_DUMP_HS_LIMITATION); + if (!hdr) + return -ENOMEM; + + if (nla_put_u8(skb, SMC_NLA_HS_LIMITATION_ENABLED, + sock_net(skb->sk)->smc.limit_smc_hs)) + goto err; + + genlmsg_end(skb, hdr); + cb_ctx->pos[0] = 1; +out: + return skb->len; +err: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +int smc_nl_enable_hs_limitation(struct sk_buff *skb, struct genl_info *info) +{ + sock_net(skb->sk)->smc.limit_smc_hs = true; + return 0; +} + +int smc_nl_disable_hs_limitation(struct sk_buff *skb, struct genl_info *info) +{ + sock_net(skb->sk)->smc.limit_smc_hs = false; + return 0; +} + static void smc_set_keepalive(struct sock *sk, int val) { struct smc_sock *smc = smc_sk(sk); @@ -72,6 +112,51 @@ static void smc_set_keepalive(struct sock *sk, int val) smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val); } +static struct sock *smc_tcp_syn_recv_sock(const struct sock *sk, + struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst, + struct request_sock *req_unhash, + bool *own_req) +{ + struct smc_sock *smc; + + smc = smc_clcsock_user_data(sk); + + if (READ_ONCE(sk->sk_ack_backlog) + atomic_read(&smc->queued_smc_hs) > + sk->sk_max_ack_backlog) + goto drop; + + if (sk_acceptq_is_full(&smc->sk)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); + goto drop; + } + + /* passthrough to original syn recv sock fct */ + return smc->ori_af_ops->syn_recv_sock(sk, skb, req, dst, req_unhash, + own_req); + +drop: + dst_release(dst); + tcp_listendrop(sk); + return NULL; +} + +static bool smc_hs_congested(const struct sock *sk) +{ + const struct smc_sock *smc; + + smc = smc_clcsock_user_data(sk); + + if (!smc) + return true; + + if (workqueue_congested(WORK_CPU_UNBOUND, smc_hs_wq)) + return true; + + return false; +} + static struct smc_hashinfo smc_v4_hashinfo = { .lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock), }; @@ -1594,6 +1679,9 @@ static void smc_listen_out(struct smc_sock *new_smc) struct smc_sock *lsmc = new_smc->listen_smc; struct sock *newsmcsk = &new_smc->sk; + if (tcp_sk(new_smc->clcsock->sk)->syn_smc) + atomic_dec(&lsmc->queued_smc_hs); + if (lsmc->sk.sk_state == SMC_LISTEN) { lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING); smc_accept_enqueue(&lsmc->sk, newsmcsk); @@ -2199,6 +2287,9 @@ static void smc_tcp_listen_work(struct work_struct *work) if (!new_smc) continue; + if (tcp_sk(new_smc->clcsock->sk)->syn_smc) + atomic_inc(&lsmc->queued_smc_hs); + new_smc->listen_smc = lsmc; new_smc->use_fallback = lsmc->use_fallback; new_smc->fallback_rsn = lsmc->fallback_rsn; @@ -2227,7 +2318,7 @@ static void smc_clcsock_data_ready(struct sock *listen_clcsock) lsmc->clcsk_data_ready(listen_clcsock); if (lsmc->sk.sk_state == SMC_LISTEN) { sock_hold(&lsmc->sk); /* sock_put in smc_tcp_listen_work() */ - if (!queue_work(smc_hs_wq, &lsmc->tcp_listen_work)) + if (!queue_work(smc_tcp_ls_wq, &lsmc->tcp_listen_work)) sock_put(&lsmc->sk); } } @@ -2265,6 +2356,18 @@ static int smc_listen(struct socket *sock, int backlog) smc->clcsock->sk->sk_data_ready = smc_clcsock_data_ready; smc->clcsock->sk->sk_user_data = (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); + + /* save original ops */ + smc->ori_af_ops = inet_csk(smc->clcsock->sk)->icsk_af_ops; + + smc->af_ops = *smc->ori_af_ops; + smc->af_ops.syn_recv_sock = smc_tcp_syn_recv_sock; + + inet_csk(smc->clcsock->sk)->icsk_af_ops = &smc->af_ops; + + if (smc->limit_smc_hs) + tcp_sk(smc->clcsock->sk)->smc_hs_congested = smc_hs_congested; + rc = kernel_listen(smc->clcsock, backlog); if (rc) { smc->clcsock->sk->sk_data_ready = smc->clcsk_data_ready; @@ -2558,6 +2661,67 @@ static int smc_shutdown(struct socket *sock, int how) return rc ? rc : rc1; } +static int __smc_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct smc_sock *smc; + int val, len; + + smc = smc_sk(sock->sk); + + if (get_user(len, optlen)) + return -EFAULT; + + len = min_t(int, len, sizeof(int)); + + if (len < 0) + return -EINVAL; + + switch (optname) { + case SMC_LIMIT_HS: + val = smc->limit_smc_hs; + break; + default: + return -EOPNOTSUPP; + } + + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &val, len)) + return -EFAULT; + + return 0; +} + +static int __smc_setsockopt(struct socket *sock, int level, int optname, + sockptr_t optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + struct smc_sock *smc; + int val, rc; + + smc = smc_sk(sk); + + lock_sock(sk); + switch (optname) { + case SMC_LIMIT_HS: + if (optlen < sizeof(int)) + return -EINVAL; + if (copy_from_sockptr(&val, optval, sizeof(int))) + return -EFAULT; + + smc->limit_smc_hs = !!val; + rc = 0; + break; + default: + rc = -EOPNOTSUPP; + break; + } + release_sock(sk); + + return rc; +} + static int smc_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { @@ -2567,6 +2731,8 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, if (level == SOL_TCP && optname == TCP_ULP) return -EOPNOTSUPP; + else if (level == SOL_SMC) + return __smc_setsockopt(sock, level, optname, optval, optlen); smc = smc_sk(sk); @@ -2649,6 +2815,9 @@ static int smc_getsockopt(struct socket *sock, int level, int optname, struct smc_sock *smc; int rc; + if (level == SOL_SMC) + return __smc_getsockopt(sock, level, optname, optval, optlen); + smc = smc_sk(sock->sk); mutex_lock(&smc->clcsock_release_lock); if (!smc->clcsock) { @@ -2877,6 +3046,9 @@ static int __smc_create(struct net *net, struct socket *sock, int protocol, smc->use_fallback = false; /* assume rdma capability first */ smc->fallback_rsn = 0; + /* default behavior from limit_smc_hs in every net namespace */ + smc->limit_smc_hs = net->smc.limit_smc_hs; + rc = 0; if (!clcsock) { rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP, @@ -3024,9 +3196,14 @@ static int __init smc_init(void) goto out_nl; rc = -ENOMEM; + + smc_tcp_ls_wq = alloc_workqueue("smc_tcp_ls_wq", 0, 0); + if (!smc_tcp_ls_wq) + goto out_pnet; + smc_hs_wq = alloc_workqueue("smc_hs_wq", 0, 0); if (!smc_hs_wq) - goto out_pnet; + goto out_alloc_tcp_ls_wq; smc_close_wq = alloc_workqueue("smc_close_wq", 0, 0); if (!smc_close_wq) @@ -3097,6 +3274,8 @@ static int __init smc_init(void) destroy_workqueue(smc_close_wq); out_alloc_hs_wq: destroy_workqueue(smc_hs_wq); +out_alloc_tcp_ls_wq: + destroy_workqueue(smc_tcp_ls_wq); out_pnet: smc_pnet_exit(); out_nl: @@ -3115,6 +3294,7 @@ static void __exit smc_exit(void) smc_core_exit(); smc_ib_unregister_client(); destroy_workqueue(smc_close_wq); + destroy_workqueue(smc_tcp_ls_wq); destroy_workqueue(smc_hs_wq); proto_unregister(&smc_proto6); proto_unregister(&smc_proto); diff --git a/net/smc/smc.h b/net/smc/smc.h index 37b2001a0255..a096d8af21a0 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -14,6 +14,7 @@ #include #include #include /* __aligned */ +#include #include #include "smc_ib.h" @@ -249,9 +250,14 @@ struct smc_sock { /* smc sock container */ struct work_struct smc_listen_work;/* prepare new accept socket */ struct list_head accept_q; /* sockets to be accepted */ spinlock_t accept_q_lock; /* protects accept_q */ + bool limit_smc_hs; /* put constraint on handshake */ bool use_fallback; /* fallback to tcp */ int fallback_rsn; /* reason for fallback */ u32 peer_diagnosis; /* decline reason from peer */ + atomic_t queued_smc_hs; /* queued smc handshakes */ + struct inet_connection_sock_af_ops af_ops; + const struct inet_connection_sock_af_ops *ori_af_ops; + /* original af ops */ int sockopt_defer_accept; /* sockopt TCP_DEFER_ACCEPT * value @@ -276,7 +282,7 @@ static inline struct smc_sock *smc_sk(const struct sock *sk) return (struct smc_sock *)sk; } -static inline struct smc_sock *smc_clcsock_user_data(struct sock *clcsk) +static inline struct smc_sock *smc_clcsock_user_data(const struct sock *clcsk) { return (struct smc_sock *) ((uintptr_t)clcsk->sk_user_data & ~SK_USER_DATA_NOCOPY); @@ -331,4 +337,9 @@ void smc_fill_gid_list(struct smc_link_group *lgr, struct smc_gidlist *gidlist, struct smc_ib_device *known_dev, u8 *known_gid); +/* smc handshake limitation interface for netlink */ +int smc_nl_dump_hs_limitation(struct sk_buff *skb, struct netlink_callback *cb); +int smc_nl_enable_hs_limitation(struct sk_buff *skb, struct genl_info *info); +int smc_nl_disable_hs_limitation(struct sk_buff *skb, struct genl_info *info); + #endif /* __SMC_H */ diff --git a/net/smc/smc_netlink.c b/net/smc/smc_netlink.c index f13ab0661ed5..c5a62f6f52ba 100644 --- a/net/smc/smc_netlink.c +++ b/net/smc/smc_netlink.c @@ -111,6 +111,21 @@ static const struct genl_ops smc_gen_nl_ops[] = { .flags = GENL_ADMIN_PERM, .doit = smc_nl_disable_seid, }, + { + .cmd = SMC_NETLINK_DUMP_HS_LIMITATION, + /* can be retrieved by unprivileged users */ + .dumpit = smc_nl_dump_hs_limitation, + }, + { + .cmd = SMC_NETLINK_ENABLE_HS_LIMITATION, + .flags = GENL_ADMIN_PERM, + .doit = smc_nl_enable_hs_limitation, + }, + { + .cmd = SMC_NETLINK_DISABLE_HS_LIMITATION, + .flags = GENL_ADMIN_PERM, + .doit = smc_nl_disable_hs_limitation, + }, }; static const struct nla_policy smc_gen_nl_policy[2] = { diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 0599246c0376..ff61b7b95875 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -870,6 +870,9 @@ int smc_pnet_net_init(struct net *net) smc_pnet_create_pnetids_list(net); + /* disable handshake limitation by default */ + net->smc.limit_smc_hs = 0; + return 0; }