linux-stable/net/smc/af_smc.c

3654 lines
92 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0-only
/*
* Shared Memory Communications over RDMA (SMC-R) and RoCE
*
* AF_SMC protocol family socket handler keeping the AF_INET sock address type
* applies to SOCK_STREAM sockets only
* offers an alternative communication option for TCP-protocol sockets
* applicable with RoCE-cards only
*
* Initial restrictions:
* - support for alternate links postponed
*
* Copyright IBM Corp. 2016, 2018
*
* Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
* based on prototype from Frank Blaschka
*/
#define KMSG_COMPONENT "smc"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
#include <linux/module.h>
#include <linux/socket.h>
#include <linux/workqueue.h>
#include <linux/in.h>
sched/headers: Move task_struct::signal and task_struct::sighand types and accessors into <linux/sched/signal.h> task_struct::signal and task_struct::sighand are pointers, which would normally make it straightforward to not define those types in sched.h. That is not so, because the types are accompanied by a myriad of APIs (macros and inline functions) that dereference them. Split the types and the APIs out of sched.h and move them into a new header, <linux/sched/signal.h>. With this change sched.h does not know about 'struct signal' and 'struct sighand' anymore, trying to put accessors into sched.h as a test fails the following way: ./include/linux/sched.h: In function ‘test_signal_types’: ./include/linux/sched.h:2461:18: error: dereferencing pointer to incomplete type ‘struct signal_struct’ ^ This reduces the size and complexity of sched.h significantly. Update all headers and .c code that relied on getting the signal handling functionality from <linux/sched.h> to include <linux/sched/signal.h>. The list of affected files in the preparatory patch was partly generated by grepping for the APIs, and partly by doing coverage build testing, both all[yes|mod|def|no]config builds on 64-bit and 32-bit x86, and an array of cross-architecture builds. Nevertheless some (trivial) build breakage is still expected related to rare Kconfig combinations and in-flight patches to various kernel code, but most of it should be handled by this patch. Acked-by: Linus Torvalds <torvalds@linux-foundation.org> Cc: Mike Galbraith <efault@gmx.de> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-02-02 07:35:14 +00:00
#include <linux/sched/signal.h>
#include <linux/if_vlan.h>
#include <linux/rcupdate_wait.h>
#include <linux/ctype.h>
#include <linux/splice.h>
sched/headers: Move task_struct::signal and task_struct::sighand types and accessors into <linux/sched/signal.h> task_struct::signal and task_struct::sighand are pointers, which would normally make it straightforward to not define those types in sched.h. That is not so, because the types are accompanied by a myriad of APIs (macros and inline functions) that dereference them. Split the types and the APIs out of sched.h and move them into a new header, <linux/sched/signal.h>. With this change sched.h does not know about 'struct signal' and 'struct sighand' anymore, trying to put accessors into sched.h as a test fails the following way: ./include/linux/sched.h: In function ‘test_signal_types’: ./include/linux/sched.h:2461:18: error: dereferencing pointer to incomplete type ‘struct signal_struct’ ^ This reduces the size and complexity of sched.h significantly. Update all headers and .c code that relied on getting the signal handling functionality from <linux/sched.h> to include <linux/sched/signal.h>. The list of affected files in the preparatory patch was partly generated by grepping for the APIs, and partly by doing coverage build testing, both all[yes|mod|def|no]config builds on 64-bit and 32-bit x86, and an array of cross-architecture builds. Nevertheless some (trivial) build breakage is still expected related to rare Kconfig combinations and in-flight patches to various kernel code, but most of it should be handled by this patch. Acked-by: Linus Torvalds <torvalds@linux-foundation.org> Cc: Mike Galbraith <efault@gmx.de> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-02-02 07:35:14 +00:00
#include <net/sock.h>
#include <net/tcp.h>
#include <net/smc.h>
#include <asm/ioctls.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include "smc_netns.h"
#include "smc.h"
#include "smc_clc.h"
#include "smc_llc.h"
#include "smc_cdc.h"
#include "smc_core.h"
#include "smc_ib.h"
#include "smc_ism.h"
#include "smc_pnet.h"
#include "smc_netlink.h"
#include "smc_tx.h"
#include "smc_rx.h"
#include "smc_close.h"
#include "smc_stats.h"
#include "smc_tracepoint.h"
#include "smc_sysctl.h"
#include "smc_loopback.h"
static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group
* creation on server
*/
static DEFINE_MUTEX(smc_client_lgr_pending); /* serialize link group
* creation on client
*/
static struct workqueue_struct *smc_tcp_ls_wq; /* wq for tcp listen work */
struct workqueue_struct *smc_hs_wq; /* wq for handshake work */
struct workqueue_struct *smc_close_wq; /* wq for close work */
static void smc_tcp_listen_work(struct work_struct *);
static void smc_connect_work(struct work_struct *);
int smc_nl_dump_hs_limitation(struct sk_buff *skb, struct netlink_callback *cb)
{
struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb);
void *hdr;
if (cb_ctx->pos[0])
goto out;
hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
&smc_gen_nl_family, NLM_F_MULTI,
SMC_NETLINK_DUMP_HS_LIMITATION);
if (!hdr)
return -ENOMEM;
if (nla_put_u8(skb, SMC_NLA_HS_LIMITATION_ENABLED,
sock_net(skb->sk)->smc.limit_smc_hs))
goto err;
genlmsg_end(skb, hdr);
cb_ctx->pos[0] = 1;
out:
return skb->len;
err:
genlmsg_cancel(skb, hdr);
return -EMSGSIZE;
}
int smc_nl_enable_hs_limitation(struct sk_buff *skb, struct genl_info *info)
{
sock_net(skb->sk)->smc.limit_smc_hs = true;
return 0;
}
int smc_nl_disable_hs_limitation(struct sk_buff *skb, struct genl_info *info)
{
sock_net(skb->sk)->smc.limit_smc_hs = false;
return 0;
}
static void smc_set_keepalive(struct sock *sk, int val)
{
struct smc_sock *smc = smc_sk(sk);
smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
}
net/smc: Limit backlog connections Current implementation does not handling backlog semantics, one potential risk is that server will be flooded by infinite amount connections, even if client was SMC-incapable. This patch works to put a limit on backlog connections, referring to the TCP implementation, we divides SMC connections into two categories: 1. Half SMC connection, which includes all TCP established while SMC not connections. 2. Full SMC connection, which includes all SMC established connections. For half SMC connection, since all half SMC connections starts with TCP established, we can achieve our goal by put a limit before TCP established. Refer to the implementation of TCP, this limits will based on not only the half SMC connections but also the full connections, which is also a constraint on full SMC connections. For full SMC connections, although we know exactly where it starts, it's quite hard to put a limit before it. The easiest way is to block wait before receive SMC confirm CLC message, while it's under protection by smc_server_lgr_pending, a global lock, which leads this limit to the entire host instead of a single listen socket. Another way is to drop the full connections, but considering the cast of SMC connections, we prefer to keep full SMC connections. Even so, the limits of full SMC connections still exists, see commits about half SMC connection below. After this patch, the limits of backend connection shows like: For SMC: 1. Client with SMC-capability can makes 2 * backlog full SMC connections or 1 * backlog half SMC connections and 1 * backlog full SMC connections at most. 2. Client without SMC-capability can only makes 1 * backlog half TCP connections and 1 * backlog full TCP connections. Signed-off-by: D. Wythe <alibuda@linux.alibaba.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-10 09:11:35 +00:00
static struct sock *smc_tcp_syn_recv_sock(const struct sock *sk,
struct sk_buff *skb,
struct request_sock *req,
struct dst_entry *dst,
struct request_sock *req_unhash,
bool *own_req)
{
struct smc_sock *smc;
struct sock *child;
net/smc: Limit backlog connections Current implementation does not handling backlog semantics, one potential risk is that server will be flooded by infinite amount connections, even if client was SMC-incapable. This patch works to put a limit on backlog connections, referring to the TCP implementation, we divides SMC connections into two categories: 1. Half SMC connection, which includes all TCP established while SMC not connections. 2. Full SMC connection, which includes all SMC established connections. For half SMC connection, since all half SMC connections starts with TCP established, we can achieve our goal by put a limit before TCP established. Refer to the implementation of TCP, this limits will based on not only the half SMC connections but also the full connections, which is also a constraint on full SMC connections. For full SMC connections, although we know exactly where it starts, it's quite hard to put a limit before it. The easiest way is to block wait before receive SMC confirm CLC message, while it's under protection by smc_server_lgr_pending, a global lock, which leads this limit to the entire host instead of a single listen socket. Another way is to drop the full connections, but considering the cast of SMC connections, we prefer to keep full SMC connections. Even so, the limits of full SMC connections still exists, see commits about half SMC connection below. After this patch, the limits of backend connection shows like: For SMC: 1. Client with SMC-capability can makes 2 * backlog full SMC connections or 1 * backlog half SMC connections and 1 * backlog full SMC connections at most. 2. Client without SMC-capability can only makes 1 * backlog half TCP connections and 1 * backlog full TCP connections. Signed-off-by: D. Wythe <alibuda@linux.alibaba.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-10 09:11:35 +00:00
smc = smc_clcsock_user_data(sk);
if (READ_ONCE(sk->sk_ack_backlog) + atomic_read(&smc->queued_smc_hs) >
sk->sk_max_ack_backlog)
goto drop;
if (sk_acceptq_is_full(&smc->sk)) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
goto drop;
}
/* passthrough to original syn recv sock fct */
child = smc->ori_af_ops->syn_recv_sock(sk, skb, req, dst, req_unhash,
own_req);
/* child must not inherit smc or its ops */
if (child) {
rcu_assign_sk_user_data(child, NULL);
/* v4-mapped sockets don't inherit parent ops. Don't restore. */
if (inet_csk(child)->icsk_af_ops == inet_csk(sk)->icsk_af_ops)
inet_csk(child)->icsk_af_ops = smc->ori_af_ops;
}
return child;
net/smc: Limit backlog connections Current implementation does not handling backlog semantics, one potential risk is that server will be flooded by infinite amount connections, even if client was SMC-incapable. This patch works to put a limit on backlog connections, referring to the TCP implementation, we divides SMC connections into two categories: 1. Half SMC connection, which includes all TCP established while SMC not connections. 2. Full SMC connection, which includes all SMC established connections. For half SMC connection, since all half SMC connections starts with TCP established, we can achieve our goal by put a limit before TCP established. Refer to the implementation of TCP, this limits will based on not only the half SMC connections but also the full connections, which is also a constraint on full SMC connections. For full SMC connections, although we know exactly where it starts, it's quite hard to put a limit before it. The easiest way is to block wait before receive SMC confirm CLC message, while it's under protection by smc_server_lgr_pending, a global lock, which leads this limit to the entire host instead of a single listen socket. Another way is to drop the full connections, but considering the cast of SMC connections, we prefer to keep full SMC connections. Even so, the limits of full SMC connections still exists, see commits about half SMC connection below. After this patch, the limits of backend connection shows like: For SMC: 1. Client with SMC-capability can makes 2 * backlog full SMC connections or 1 * backlog half SMC connections and 1 * backlog full SMC connections at most. 2. Client without SMC-capability can only makes 1 * backlog half TCP connections and 1 * backlog full TCP connections. Signed-off-by: D. Wythe <alibuda@linux.alibaba.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-10 09:11:35 +00:00
drop:
dst_release(dst);
tcp_listendrop(sk);
return NULL;
}
2022-02-10 09:11:36 +00:00
static bool smc_hs_congested(const struct sock *sk)
{
const struct smc_sock *smc;
smc = smc_clcsock_user_data(sk);
if (!smc)
return true;
if (workqueue_congested(WORK_CPU_UNBOUND, smc_hs_wq))
return true;
return false;
}
static struct smc_hashinfo smc_v4_hashinfo = {
.lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
};
static struct smc_hashinfo smc_v6_hashinfo = {
.lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock),
};
static int smc_hash_sk(struct sock *sk)
{
struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
struct hlist_head *head;
head = &h->ht;
write_lock_bh(&h->lock);
sk_add_node(sk, head);
write_unlock_bh(&h->lock);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
return 0;
}
static void smc_unhash_sk(struct sock *sk)
{
struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
write_lock_bh(&h->lock);
if (sk_del_node_init(sk))
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
write_unlock_bh(&h->lock);
}
/* This will be called before user really release sock_lock. So do the
* work which we didn't do because of user hold the sock_lock in the
* BH context
*/
static void smc_release_cb(struct sock *sk)
{
struct smc_sock *smc = smc_sk(sk);
if (smc->conn.tx_in_release_sock) {
smc_tx_pending(&smc->conn);
smc->conn.tx_in_release_sock = false;
}
}
struct proto smc_proto = {
.name = "SMC",
.owner = THIS_MODULE,
.keepalive = smc_set_keepalive,
.hash = smc_hash_sk,
.unhash = smc_unhash_sk,
.release_cb = smc_release_cb,
.obj_size = sizeof(struct smc_sock),
.h.smc_hash = &smc_v4_hashinfo,
.slab_flags = SLAB_TYPESAFE_BY_RCU,
};
EXPORT_SYMBOL_GPL(smc_proto);
struct proto smc_proto6 = {
.name = "SMC6",
.owner = THIS_MODULE,
.keepalive = smc_set_keepalive,
.hash = smc_hash_sk,
.unhash = smc_unhash_sk,
.release_cb = smc_release_cb,
.obj_size = sizeof(struct smc_sock),
.h.smc_hash = &smc_v6_hashinfo,
.slab_flags = SLAB_TYPESAFE_BY_RCU,
};
EXPORT_SYMBOL_GPL(smc_proto6);
static void smc_fback_restore_callbacks(struct smc_sock *smc)
{
struct sock *clcsk = smc->clcsock->sk;
write_lock_bh(&clcsk->sk_callback_lock);
clcsk->sk_user_data = NULL;
smc_clcsock_restore_cb(&clcsk->sk_state_change, &smc->clcsk_state_change);
smc_clcsock_restore_cb(&clcsk->sk_data_ready, &smc->clcsk_data_ready);
smc_clcsock_restore_cb(&clcsk->sk_write_space, &smc->clcsk_write_space);
smc_clcsock_restore_cb(&clcsk->sk_error_report, &smc->clcsk_error_report);
write_unlock_bh(&clcsk->sk_callback_lock);
}
static void smc_restore_fallback_changes(struct smc_sock *smc)
{
if (smc->clcsock->file) { /* non-accepted sockets have no file yet */
smc->clcsock->file->private_data = smc->sk.sk_socket;
smc->clcsock->file = NULL;
smc_fback_restore_callbacks(smc);
}
}
static int __smc_release(struct smc_sock *smc)
{
struct sock *sk = &smc->sk;
int rc = 0;
if (!smc->use_fallback) {
rc = smc_close_active(smc);
smc_sock_set_flag(sk, SOCK_DEAD);
sk->sk_shutdown |= SHUTDOWN_MASK;
} else {
net/smc: fix sk_refcnt underflow on linkdown and fallback We got the following WARNING when running ab/nginx test with RDMA link flapping (up-down-up). The reason is when smc_sock fallback and at linkdown happens simultaneously, we may got the following situation: __smc_lgr_terminate() --> smc_conn_kill() --> smc_close_active_abort() smc_sock->sk_state = SMC_CLOSED sock_put(smc_sock) smc_sock was set to SMC_CLOSED and sock_put() been called when terminate the link group. But later application call close() on the socket, then we got: __smc_release(): if (smc_sock->fallback) smc_sock->sk_state = SMC_CLOSED sock_put(smc_sock) Again we set the smc_sock to CLOSED through it's already in CLOSED state, and double put the refcnt, so the following warning happens: refcount_t: underflow; use-after-free. WARNING: CPU: 5 PID: 860 at lib/refcount.c:28 refcount_warn_saturate+0x8d/0xf0 Modules linked in: CPU: 5 PID: 860 Comm: nginx Not tainted 5.10.46+ #403 Hardware name: Alibaba Cloud Alibaba Cloud ECS, BIOS 8c24b4c 04/01/2014 RIP: 0010:refcount_warn_saturate+0x8d/0xf0 Code: 05 5c 1e b5 01 01 e8 52 25 bc ff 0f 0b c3 80 3d 4f 1e b5 01 00 75 ad 48 RSP: 0018:ffffc90000527e50 EFLAGS: 00010286 RAX: 0000000000000026 RBX: ffff8881300df2c0 RCX: 0000000000000027 RDX: 0000000000000000 RSI: ffff88813bd58040 RDI: ffff88813bd58048 RBP: 0000000000000000 R08: 0000000000000003 R09: 0000000000000001 R10: ffff8881300df2c0 R11: ffffc90000527c78 R12: ffff8881300df340 R13: ffff8881300df930 R14: ffff88810b3dad80 R15: ffff8881300df4f8 FS: 00007f739de8fb80(0000) GS:ffff88813bd40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000000a01b008 CR3: 0000000111b64003 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: smc_release+0x353/0x3f0 __sock_release+0x3d/0xb0 sock_close+0x11/0x20 __fput+0x93/0x230 task_work_run+0x65/0xa0 exit_to_user_mode_prepare+0xf9/0x100 syscall_exit_to_user_mode+0x27/0x190 entry_SYSCALL_64_after_hwframe+0x44/0xa9 This patch adds check in __smc_release() to make sure we won't do an extra sock_put() and set the socket to CLOSED when its already in CLOSED state. Fixes: 51f1de79ad8e (net/smc: replace sock_put worker by socket refcounting) Signed-off-by: Dust Li <dust.li@linux.alibaba.com> Reviewed-by: Tony Lu <tonylu@linux.alibaba.com> Signed-off-by: Dust Li <dust.li@linux.alibaba.com> Acked-by: Karsten Graul <kgraul@linux.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-11-10 07:02:34 +00:00
if (sk->sk_state != SMC_CLOSED) {
if (sk->sk_state != SMC_LISTEN &&
sk->sk_state != SMC_INIT)
sock_put(sk); /* passive closing */
if (sk->sk_state == SMC_LISTEN) {
/* wake up clcsock accept */
rc = kernel_sock_shutdown(smc->clcsock,
SHUT_RDWR);
}
sk->sk_state = SMC_CLOSED;
sk->sk_state_change(sk);
}
smc_restore_fallback_changes(smc);
}
sk->sk_prot->unhash(sk);
if (sk->sk_state == SMC_CLOSED) {
if (smc->clcsock) {
release_sock(sk);
smc_clcsock_release(smc);
lock_sock(sk);
}
if (!smc->use_fallback)
smc_conn_free(&smc->conn);
}
return rc;
}
static int smc_release(struct socket *sock)
{
struct sock *sk = sock->sk;
struct smc_sock *smc;
int old_state, rc = 0;
if (!sk)
goto out;
sock_hold(sk); /* sock_put below */
smc = smc_sk(sk);
old_state = sk->sk_state;
/* cleanup for a dangling non-blocking connect */
if (smc->connect_nonblock && old_state == SMC_INIT)
tcp_abort(smc->clcsock->sk, ECONNABORTED);
net/smc: Prevent smc_release() from long blocking In nginx/wrk benchmark, there's a hung problem with high probability on case likes that: (client will last several minutes to exit) server: smc_run nginx client: smc_run wrk -c 10000 -t 1 http://server Client hangs with the following backtrace: 0 [ffffa7ce8Of3bbf8] __schedule at ffffffff9f9eOd5f 1 [ffffa7ce8Of3bc88] schedule at ffffffff9f9eløe6 2 [ffffa7ce8Of3bcaO] schedule_timeout at ffffffff9f9e3f3c 3 [ffffa7ce8Of3bd2O] wait_for_common at ffffffff9f9el9de 4 [ffffa7ce8Of3bd8O] __flush_work at ffffffff9fOfeOl3 5 [ffffa7ce8øf3bdfO] smc_release at ffffffffcO697d24 [smc] 6 [ffffa7ce8Of3be2O] __sock_release at ffffffff9f8O2e2d 7 [ffffa7ce8Of3be4ø] sock_close at ffffffff9f8ø2ebl 8 [ffffa7ce8øf3be48] __fput at ffffffff9f334f93 9 [ffffa7ce8Of3be78] task_work_run at ffffffff9flOlff5 10 [ffffa7ce8Of3beaO] do_exit at ffffffff9fOe5Ol2 11 [ffffa7ce8Of3bflO] do_group_exit at ffffffff9fOe592a 12 [ffffa7ce8Of3bf38] __x64_sys_exit_group at ffffffff9fOe5994 13 [ffffa7ce8Of3bf4O] do_syscall_64 at ffffffff9f9d4373 14 [ffffa7ce8Of3bfsO] entry_SYSCALL_64_after_hwframe at ffffffff9fa0007c This issue dues to flush_work(), which is used to wait for smc_connect_work() to finish in smc_release(). Once lots of smc_connect_work() was pending or all executing work dangling, smc_release() has to block until one worker comes to free, which is equivalent to wait another smc_connnect_work() to finish. In order to fix this, There are two changes: 1. For those idle smc_connect_work(), cancel it from the workqueue; for executing smc_connect_work(), waiting for it to finish. For that purpose, replace flush_work() with cancel_work_sync(). 2. Since smc_connect() hold a reference for passive closing, if smc_connect_work() has been cancelled, release the reference. Fixes: 24ac3a08e658 ("net/smc: rebuild nonblocking connect") Reported-by: Tony Lu <tonylu@linux.alibaba.com> Tested-by: Dust Li <dust.li@linux.alibaba.com> Reviewed-by: Dust Li <dust.li@linux.alibaba.com> Reviewed-by: Tony Lu <tonylu@linux.alibaba.com> Signed-off-by: D. Wythe <alibuda@linux.alibaba.com> Acked-by: Karsten Graul <kgraul@linux.ibm.com> Link: https://lore.kernel.org/r/1639571361-101128-1-git-send-email-alibuda@linux.alibaba.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-12-15 12:29:21 +00:00
if (cancel_work_sync(&smc->connect_work))
sock_put(&smc->sk); /* sock_hold in smc_connect for passive closing */
if (sk->sk_state == SMC_LISTEN)
/* smc_close_non_accepted() is called and acquires
* sock lock for child sockets again
*/
lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
else
lock_sock(sk);
if (old_state == SMC_INIT && sk->sk_state == SMC_ACTIVE &&
!smc->use_fallback)
smc_close_active_abort(smc);
rc = __smc_release(smc);
/* detach socket */
sock_orphan(sk);
sock->sk = NULL;
release_sock(sk);
sock_put(sk); /* sock_hold above */
sock_put(sk); /* final sock_put */
out:
return rc;
}
static void smc_destruct(struct sock *sk)
{
if (sk->sk_state != SMC_CLOSED)
return;
if (!sock_flag(sk, SOCK_DEAD))
return;
}
static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
int protocol)
{
struct smc_sock *smc;
struct proto *prot;
struct sock *sk;
prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto;
sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0);
if (!sk)
return NULL;
sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
sk->sk_state = SMC_INIT;
sk->sk_destruct = smc_destruct;
sk->sk_protocol = protocol;
net/smc: Fix setsockopt and sysctl to specify same buffer size again Commit 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable") introduced the net.smc.rmem and net.smc.wmem sysctls to specify the size of buffers to be used for SMC type connections. This created a regression for users that specified the buffer size via setsockopt() as the effective buffer size was now doubled. Re-introduce the division by 2 in the SMC buffer create code and level this out by duplicating the net.smc.[rw]mem values used for initializing sk_rcvbuf/sk_sndbuf at socket creation time. This gives users of both methods (setsockopt or sysctl) the effective buffer size that they expect. Initialize net.smc.[rw]mem from its own constant of 64kB, respectively. Internal performance tests show that this value is a good compromise between throughput/latency and memory consumption. Also, this decouples it from any tuning that was done to net.ipv4.tcp_[rw]mem[1] before the module for SMC protocol was loaded. Check that no more than INT_MAX / 2 is assigned to net.smc.[rw]mem, in order to avoid any overflow condition when that is doubled for use in sk_sndbuf or sk_rcvbuf. While at it, drop the confusing sk_buf_size variable from __smc_buf_create and name "compressed" buffer size variables more consistently. Background: Before the commit mentioned above, SMC's buffer allocator in __smc_buf_create() always used half of the sockets' sk_rcvbuf/sk_sndbuf value as initial value to search for appropriate buffers. If the search resorted to using a bigger buffer when all buffers of the specified size were busy, the duplicate of the used effective buffer size is stored back to sk_rcvbuf/sk_sndbuf. When available, buffers of exactly the size that a user had specified as input to setsockopt() were used, despite setsockopt()'s documentation in "man 7 socket" talking of a mandatory duplication: [...] SO_SNDBUF Sets or gets the maximum socket send buffer in bytes. The kernel doubles this value (to allow space for book‐ keeping overhead) when it is set using setsockopt(2), and this doubled value is returned by getsockopt(2). The default value is set by the /proc/sys/net/core/wmem_default file and the maximum allowed value is set by the /proc/sys/net/core/wmem_max file. The minimum (doubled) value for this option is 2048. [...] Fixes: 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable") Co-developed-by: Jan Karcher <jaka@linux.ibm.com> Signed-off-by: Jan Karcher <jaka@linux.ibm.com> Reviewed-by: Wenjia Zhang <wenjia@linux.ibm.com> Reviewed-by: Tony Lu <tonylu@linux.alibaba.com> Signed-off-by: Gerd Bayer <gbayer@linux.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2023-08-04 17:06:23 +00:00
WRITE_ONCE(sk->sk_sndbuf, 2 * READ_ONCE(net->smc.sysctl_wmem));
WRITE_ONCE(sk->sk_rcvbuf, 2 * READ_ONCE(net->smc.sysctl_rmem));
smc = smc_sk(sk);
INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
INIT_WORK(&smc->connect_work, smc_connect_work);
net/smc: init conn.tx_work & conn.send_lock sooner syzkaller found that following program crashes the host : { int fd = socket(AF_SMC, SOCK_STREAM, 0); int val = 1; listen(fd, 0); shutdown(fd, SHUT_RDWR); setsockopt(fd, 6, TCP_NODELAY, &val, 4); } Simply initialize conn.tx_work & conn.send_lock at socket creation, rather than deeper in the stack. ODEBUG: assert_init not available (active state 0) object type: timer_list hint: (null) WARNING: CPU: 1 PID: 13988 at lib/debugobjects.c:329 debug_print_object+0x16a/0x210 lib/debugobjects.c:326 Kernel panic - not syncing: panic_on_warn set ... CPU: 1 PID: 13988 Comm: syz-executor0 Not tainted 4.17.0-rc4+ #46 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1b9/0x294 lib/dump_stack.c:113 panic+0x22f/0x4de kernel/panic.c:184 __warn.cold.8+0x163/0x1b3 kernel/panic.c:536 report_bug+0x252/0x2d0 lib/bug.c:186 fixup_bug arch/x86/kernel/traps.c:178 [inline] do_error_trap+0x1de/0x490 arch/x86/kernel/traps.c:296 do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:315 invalid_op+0x14/0x20 arch/x86/entry/entry_64.S:992 RIP: 0010:debug_print_object+0x16a/0x210 lib/debugobjects.c:326 RSP: 0018:ffff880197a37880 EFLAGS: 00010086 RAX: 0000000000000061 RBX: 0000000000000005 RCX: ffffc90001ed0000 RDX: 0000000000004aaf RSI: ffffffff8160f6f1 RDI: 0000000000000001 RBP: ffff880197a378c0 R08: ffff8801aa7a0080 R09: ffffed003b5e3eb2 R10: ffffed003b5e3eb2 R11: ffff8801daf1f597 R12: 0000000000000001 R13: ffffffff88d96980 R14: ffffffff87fa19a0 R15: ffffffff81666ec0 debug_object_assert_init+0x309/0x500 lib/debugobjects.c:692 debug_timer_assert_init kernel/time/timer.c:724 [inline] debug_assert_init kernel/time/timer.c:776 [inline] del_timer+0x74/0x140 kernel/time/timer.c:1198 try_to_grab_pending+0x439/0x9a0 kernel/workqueue.c:1223 mod_delayed_work_on+0x91/0x250 kernel/workqueue.c:1592 mod_delayed_work include/linux/workqueue.h:541 [inline] smc_setsockopt+0x387/0x6d0 net/smc/af_smc.c:1367 __sys_setsockopt+0x1bd/0x390 net/socket.c:1903 __do_sys_setsockopt net/socket.c:1914 [inline] __se_sys_setsockopt net/socket.c:1911 [inline] __x64_sys_setsockopt+0xbe/0x150 net/socket.c:1911 do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x49/0xbe Fixes: 01d2f7e2cdd3 ("net/smc: sockopts TCP_NODELAY and TCP_CORK") Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Ursula Braun <ubraun@linux.ibm.com> Cc: linux-s390@vger.kernel.org Reported-by: syzbot <syzkaller@googlegroups.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2018-05-17 10:54:21 +00:00
INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work);
INIT_LIST_HEAD(&smc->accept_q);
spin_lock_init(&smc->accept_q_lock);
net/smc: init conn.tx_work & conn.send_lock sooner syzkaller found that following program crashes the host : { int fd = socket(AF_SMC, SOCK_STREAM, 0); int val = 1; listen(fd, 0); shutdown(fd, SHUT_RDWR); setsockopt(fd, 6, TCP_NODELAY, &val, 4); } Simply initialize conn.tx_work & conn.send_lock at socket creation, rather than deeper in the stack. ODEBUG: assert_init not available (active state 0) object type: timer_list hint: (null) WARNING: CPU: 1 PID: 13988 at lib/debugobjects.c:329 debug_print_object+0x16a/0x210 lib/debugobjects.c:326 Kernel panic - not syncing: panic_on_warn set ... CPU: 1 PID: 13988 Comm: syz-executor0 Not tainted 4.17.0-rc4+ #46 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1b9/0x294 lib/dump_stack.c:113 panic+0x22f/0x4de kernel/panic.c:184 __warn.cold.8+0x163/0x1b3 kernel/panic.c:536 report_bug+0x252/0x2d0 lib/bug.c:186 fixup_bug arch/x86/kernel/traps.c:178 [inline] do_error_trap+0x1de/0x490 arch/x86/kernel/traps.c:296 do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:315 invalid_op+0x14/0x20 arch/x86/entry/entry_64.S:992 RIP: 0010:debug_print_object+0x16a/0x210 lib/debugobjects.c:326 RSP: 0018:ffff880197a37880 EFLAGS: 00010086 RAX: 0000000000000061 RBX: 0000000000000005 RCX: ffffc90001ed0000 RDX: 0000000000004aaf RSI: ffffffff8160f6f1 RDI: 0000000000000001 RBP: ffff880197a378c0 R08: ffff8801aa7a0080 R09: ffffed003b5e3eb2 R10: ffffed003b5e3eb2 R11: ffff8801daf1f597 R12: 0000000000000001 R13: ffffffff88d96980 R14: ffffffff87fa19a0 R15: ffffffff81666ec0 debug_object_assert_init+0x309/0x500 lib/debugobjects.c:692 debug_timer_assert_init kernel/time/timer.c:724 [inline] debug_assert_init kernel/time/timer.c:776 [inline] del_timer+0x74/0x140 kernel/time/timer.c:1198 try_to_grab_pending+0x439/0x9a0 kernel/workqueue.c:1223 mod_delayed_work_on+0x91/0x250 kernel/workqueue.c:1592 mod_delayed_work include/linux/workqueue.h:541 [inline] smc_setsockopt+0x387/0x6d0 net/smc/af_smc.c:1367 __sys_setsockopt+0x1bd/0x390 net/socket.c:1903 __do_sys_setsockopt net/socket.c:1914 [inline] __se_sys_setsockopt net/socket.c:1911 [inline] __x64_sys_setsockopt+0xbe/0x150 net/socket.c:1911 do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x49/0xbe Fixes: 01d2f7e2cdd3 ("net/smc: sockopts TCP_NODELAY and TCP_CORK") Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Ursula Braun <ubraun@linux.ibm.com> Cc: linux-s390@vger.kernel.org Reported-by: syzbot <syzkaller@googlegroups.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2018-05-17 10:54:21 +00:00
spin_lock_init(&smc->conn.send_lock);
sk->sk_prot->hash(sk);
mutex_init(&smc->clcsock_release_lock);
smc_init_saved_callbacks(smc);
return sk;
}
static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
int addr_len)
{
struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
struct sock *sk = sock->sk;
struct smc_sock *smc;
int rc;
smc = smc_sk(sk);
/* replicate tests from inet_bind(), to be safe wrt. future changes */
rc = -EINVAL;
if (addr_len < sizeof(struct sockaddr_in))
goto out;
rc = -EAFNOSUPPORT;
if (addr->sin_family != AF_INET &&
addr->sin_family != AF_INET6 &&
addr->sin_family != AF_UNSPEC)
goto out;
/* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
if (addr->sin_family == AF_UNSPEC &&
addr->sin_addr.s_addr != htonl(INADDR_ANY))
goto out;
lock_sock(sk);
/* Check if socket is already active */
rc = -EINVAL;
if (sk->sk_state != SMC_INIT || smc->connect_nonblock)
goto out_rel;
smc->clcsock->sk->sk_reuse = sk->sk_reuse;
net/smc: Support SO_REUSEPORT This enables SO_REUSEPORT [1] for clcsock when it is set on smc socket, so that some applications which uses it can be transparently replaced with SMC. Also, this helps improve load distribution. Here is a simple test of NGINX + wrk with SMC. The CPU usage is collected on NGINX (server) side as below. Disable SO_REUSEPORT: 05:15:33 PM CPU %usr %nice %sys %iowait %irq %soft %steal %guest %gnice %idle 05:15:34 PM all 7.02 0.00 11.86 0.00 2.04 8.93 0.00 0.00 0.00 70.15 05:15:34 PM 0 0.00 0.00 0.00 0.00 16.00 70.00 0.00 0.00 0.00 14.00 05:15:34 PM 1 11.58 0.00 22.11 0.00 0.00 0.00 0.00 0.00 0.00 66.32 05:15:34 PM 2 1.00 0.00 1.00 0.00 0.00 0.00 0.00 0.00 0.00 98.00 05:15:34 PM 3 16.84 0.00 30.53 0.00 0.00 0.00 0.00 0.00 0.00 52.63 05:15:34 PM 4 28.72 0.00 44.68 0.00 0.00 0.00 0.00 0.00 0.00 26.60 05:15:34 PM 5 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 100.00 05:15:34 PM 6 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 100.00 05:15:34 PM 7 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 100.00 Enable SO_REUSEPORT: 05:15:20 PM CPU %usr %nice %sys %iowait %irq %soft %steal %guest %gnice %idle 05:15:21 PM all 8.56 0.00 14.40 0.00 2.20 9.86 0.00 0.00 0.00 64.98 05:15:21 PM 0 0.00 0.00 4.08 0.00 14.29 76.53 0.00 0.00 0.00 5.10 05:15:21 PM 1 9.09 0.00 16.16 0.00 1.01 0.00 0.00 0.00 0.00 73.74 05:15:21 PM 2 9.38 0.00 16.67 0.00 1.04 0.00 0.00 0.00 0.00 72.92 05:15:21 PM 3 10.42 0.00 17.71 0.00 1.04 0.00 0.00 0.00 0.00 70.83 05:15:21 PM 4 9.57 0.00 15.96 0.00 0.00 0.00 0.00 0.00 0.00 74.47 05:15:21 PM 5 9.18 0.00 15.31 0.00 0.00 1.02 0.00 0.00 0.00 74.49 05:15:21 PM 6 8.60 0.00 15.05 0.00 0.00 0.00 0.00 0.00 0.00 76.34 05:15:21 PM 7 12.37 0.00 14.43 0.00 0.00 0.00 0.00 0.00 0.00 73.20 Using SO_REUSEPORT helps the load distribution of NGINX be more balanced. [1] https://man7.org/linux/man-pages/man7/socket.7.html Signed-off-by: Tony Lu <tonylu@linux.alibaba.com> Acked-by: Wenjia Zhang <wenjia@linux.ibm.com> Link: https://lore.kernel.org/r/20220922121906.72406-1-tonylu@linux.alibaba.com Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2022-09-22 12:19:07 +00:00
smc->clcsock->sk->sk_reuseport = sk->sk_reuseport;
rc = kernel_bind(smc->clcsock, uaddr, addr_len);
out_rel:
release_sock(sk);
out:
return rc;
}
/* copy only relevant settings and flags of SOL_SOCKET level from smc to
* clc socket (since smc is not called for these options from net/core)
*/
#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
(1UL << SOCK_KEEPOPEN) | \
(1UL << SOCK_LINGER) | \
(1UL << SOCK_BROADCAST) | \
(1UL << SOCK_TIMESTAMP) | \
(1UL << SOCK_DBG) | \
(1UL << SOCK_RCVTSTAMP) | \
(1UL << SOCK_RCVTSTAMPNS) | \
(1UL << SOCK_LOCALROUTE) | \
(1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
(1UL << SOCK_RXQ_OVFL) | \
(1UL << SOCK_WIFI_STATUS) | \
(1UL << SOCK_NOFCS) | \
(1UL << SOCK_FILTER_LOCKED) | \
(1UL << SOCK_TSTAMP_NEW))
/* if set, use value set by setsockopt() - else use IPv4 or SMC sysctl value */
static void smc_adjust_sock_bufsizes(struct sock *nsk, struct sock *osk,
unsigned long mask)
{
struct net *nnet = sock_net(nsk);
nsk->sk_userlocks = osk->sk_userlocks;
if (osk->sk_userlocks & SOCK_SNDBUF_LOCK) {
nsk->sk_sndbuf = osk->sk_sndbuf;
} else {
if (mask == SK_FLAGS_SMC_TO_CLC)
WRITE_ONCE(nsk->sk_sndbuf,
READ_ONCE(nnet->ipv4.sysctl_tcp_wmem[1]));
else
WRITE_ONCE(nsk->sk_sndbuf,
2 * READ_ONCE(nnet->smc.sysctl_wmem));
}
if (osk->sk_userlocks & SOCK_RCVBUF_LOCK) {
nsk->sk_rcvbuf = osk->sk_rcvbuf;
} else {
if (mask == SK_FLAGS_SMC_TO_CLC)
WRITE_ONCE(nsk->sk_rcvbuf,
READ_ONCE(nnet->ipv4.sysctl_tcp_rmem[1]));
else
WRITE_ONCE(nsk->sk_rcvbuf,
2 * READ_ONCE(nnet->smc.sysctl_rmem));
}
}
static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
unsigned long mask)
{
/* options we don't get control via setsockopt for */
nsk->sk_type = osk->sk_type;
nsk->sk_sndtimeo = osk->sk_sndtimeo;
nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
nsk->sk_mark = READ_ONCE(osk->sk_mark);
nsk->sk_priority = READ_ONCE(osk->sk_priority);
nsk->sk_rcvlowat = osk->sk_rcvlowat;
nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
nsk->sk_err = osk->sk_err;
nsk->sk_flags &= ~mask;
nsk->sk_flags |= osk->sk_flags & mask;
smc_adjust_sock_bufsizes(nsk, osk, mask);
}
static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
{
smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
}
#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
(1UL << SOCK_KEEPOPEN) | \
(1UL << SOCK_LINGER) | \
(1UL << SOCK_DBG))
/* copy only settings and flags relevant for smc from clc to smc socket */
static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
{
smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
}
net/smc: Allow virtually contiguous sndbufs or RMBs for SMC-R On long-running enterprise production servers, high-order contiguous memory pages are usually very rare and in most cases we can only get fragmented pages. When replacing TCP with SMC-R in such production scenarios, attempting to allocate high-order physically contiguous sndbufs and RMBs may result in frequent memory compaction, which will cause unexpected hung issue and further stability risks. So this patch is aimed to allow SMC-R link group to use virtually contiguous sndbufs and RMBs to avoid potential issues mentioned above. Whether to use physically or virtually contiguous buffers can be set by sysctl smcr_buf_type. Note that using virtually contiguous buffers will bring an acceptable performance regression, which can be mainly divided into two parts: 1) regression in data path, which is brought by additional address translation of sndbuf by RNIC in Tx. But in general, translating address through MTT is fast. Taking 256KB sndbuf and RMB as an example, the comparisons in qperf latency and bandwidth test with physically and virtually contiguous buffers are as follows: - client: smc_run taskset -c <cpu> qperf <server> -oo msg_size:1:64K:*2\ -t 5 -vu tcp_{bw|lat} - server: smc_run taskset -c <cpu> qperf [latency] msgsize tcp smcr smcr-use-virt-buf 1 11.17 us 7.56 us 7.51 us (-0.67%) 2 10.65 us 7.74 us 7.56 us (-2.31%) 4 11.11 us 7.52 us 7.59 us ( 0.84%) 8 10.83 us 7.55 us 7.51 us (-0.48%) 16 11.21 us 7.46 us 7.51 us ( 0.71%) 32 10.65 us 7.53 us 7.58 us ( 0.61%) 64 10.95 us 7.74 us 7.80 us ( 0.76%) 128 11.14 us 7.83 us 7.87 us ( 0.47%) 256 10.97 us 7.94 us 7.92 us (-0.28%) 512 11.23 us 7.94 us 8.20 us ( 3.25%) 1024 11.60 us 8.12 us 8.20 us ( 0.96%) 2048 14.04 us 8.30 us 8.51 us ( 2.49%) 4096 16.88 us 9.13 us 9.07 us (-0.64%) 8192 22.50 us 10.56 us 11.22 us ( 6.26%) 16384 28.99 us 12.88 us 13.83 us ( 7.37%) 32768 40.13 us 16.76 us 16.95 us ( 1.16%) 65536 68.70 us 24.68 us 24.85 us ( 0.68%) [bandwidth] msgsize tcp smcr smcr-use-virt-buf 1 1.65 MB/s 1.59 MB/s 1.53 MB/s (-3.88%) 2 3.32 MB/s 3.17 MB/s 3.08 MB/s (-2.67%) 4 6.66 MB/s 6.33 MB/s 6.09 MB/s (-3.85%) 8 13.67 MB/s 13.45 MB/s 11.97 MB/s (-10.99%) 16 25.36 MB/s 27.15 MB/s 24.16 MB/s (-11.01%) 32 48.22 MB/s 54.24 MB/s 49.41 MB/s (-8.89%) 64 106.79 MB/s 107.32 MB/s 99.05 MB/s (-7.71%) 128 210.21 MB/s 202.46 MB/s 201.02 MB/s (-0.71%) 256 400.81 MB/s 416.81 MB/s 393.52 MB/s (-5.59%) 512 746.49 MB/s 834.12 MB/s 809.99 MB/s (-2.89%) 1024 1292.33 MB/s 1641.96 MB/s 1571.82 MB/s (-4.27%) 2048 2007.64 MB/s 2760.44 MB/s 2717.68 MB/s (-1.55%) 4096 2665.17 MB/s 4157.44 MB/s 4070.76 MB/s (-2.09%) 8192 3159.72 MB/s 4361.57 MB/s 4270.65 MB/s (-2.08%) 16384 4186.70 MB/s 4574.13 MB/s 4501.17 MB/s (-1.60%) 32768 4093.21 MB/s 4487.42 MB/s 4322.43 MB/s (-3.68%) 65536 4057.14 MB/s 4735.61 MB/s 4555.17 MB/s (-3.81%) 2) regression in buffer initialization and destruction path, which is brought by additional MR operations of sndbufs. But thanks to link group buffer reuse mechanism, the impact of this kind of regression decreases as times of buffer reuse increases. Taking 256KB sndbuf and RMB as an example, latency of some key SMC-R buffer-related function obtained by bpftrace are as follows: Function Phys-bufs Virt-bufs smcr_new_buf_create() 67154 ns 79164 ns smc_ib_buf_map_sg() 525 ns 928 ns smc_ib_get_memory_region() 162294 ns 161191 ns smc_wr_reg_send() 9957 ns 9635 ns smc_ib_put_memory_region() 203548 ns 198374 ns smc_ib_buf_unmap_sg() 508 ns 1158 ns ------------ Test environment notes: 1. Above tests run on 2 VMs within the same Host. 2. The NIC is ConnectX-4Lx, using SRIOV and passing through 2 VFs to the each VM respectively. 3. VMs' vCPUs are binded to different physical CPUs, and the binded physical CPUs are isolated by `isolcpus=xxx` cmdline. 4. NICs' queue number are set to 1. Signed-off-by: Wen Gu <guwen@linux.alibaba.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-07-14 09:44:04 +00:00
/* register the new vzalloced sndbuf on all links */
static int smcr_lgr_reg_sndbufs(struct smc_link *link,
struct smc_buf_desc *snd_desc)
{
struct smc_link_group *lgr = link->lgr;
int i, rc = 0;
if (!snd_desc->is_vm)
return -EINVAL;
/* protect against parallel smcr_link_reg_buf() */
down_write(&lgr->llc_conf_mutex);
net/smc: Allow virtually contiguous sndbufs or RMBs for SMC-R On long-running enterprise production servers, high-order contiguous memory pages are usually very rare and in most cases we can only get fragmented pages. When replacing TCP with SMC-R in such production scenarios, attempting to allocate high-order physically contiguous sndbufs and RMBs may result in frequent memory compaction, which will cause unexpected hung issue and further stability risks. So this patch is aimed to allow SMC-R link group to use virtually contiguous sndbufs and RMBs to avoid potential issues mentioned above. Whether to use physically or virtually contiguous buffers can be set by sysctl smcr_buf_type. Note that using virtually contiguous buffers will bring an acceptable performance regression, which can be mainly divided into two parts: 1) regression in data path, which is brought by additional address translation of sndbuf by RNIC in Tx. But in general, translating address through MTT is fast. Taking 256KB sndbuf and RMB as an example, the comparisons in qperf latency and bandwidth test with physically and virtually contiguous buffers are as follows: - client: smc_run taskset -c <cpu> qperf <server> -oo msg_size:1:64K:*2\ -t 5 -vu tcp_{bw|lat} - server: smc_run taskset -c <cpu> qperf [latency] msgsize tcp smcr smcr-use-virt-buf 1 11.17 us 7.56 us 7.51 us (-0.67%) 2 10.65 us 7.74 us 7.56 us (-2.31%) 4 11.11 us 7.52 us 7.59 us ( 0.84%) 8 10.83 us 7.55 us 7.51 us (-0.48%) 16 11.21 us 7.46 us 7.51 us ( 0.71%) 32 10.65 us 7.53 us 7.58 us ( 0.61%) 64 10.95 us 7.74 us 7.80 us ( 0.76%) 128 11.14 us 7.83 us 7.87 us ( 0.47%) 256 10.97 us 7.94 us 7.92 us (-0.28%) 512 11.23 us 7.94 us 8.20 us ( 3.25%) 1024 11.60 us 8.12 us 8.20 us ( 0.96%) 2048 14.04 us 8.30 us 8.51 us ( 2.49%) 4096 16.88 us 9.13 us 9.07 us (-0.64%) 8192 22.50 us 10.56 us 11.22 us ( 6.26%) 16384 28.99 us 12.88 us 13.83 us ( 7.37%) 32768 40.13 us 16.76 us 16.95 us ( 1.16%) 65536 68.70 us 24.68 us 24.85 us ( 0.68%) [bandwidth] msgsize tcp smcr smcr-use-virt-buf 1 1.65 MB/s 1.59 MB/s 1.53 MB/s (-3.88%) 2 3.32 MB/s 3.17 MB/s 3.08 MB/s (-2.67%) 4 6.66 MB/s 6.33 MB/s 6.09 MB/s (-3.85%) 8 13.67 MB/s 13.45 MB/s 11.97 MB/s (-10.99%) 16 25.36 MB/s 27.15 MB/s 24.16 MB/s (-11.01%) 32 48.22 MB/s 54.24 MB/s 49.41 MB/s (-8.89%) 64 106.79 MB/s 107.32 MB/s 99.05 MB/s (-7.71%) 128 210.21 MB/s 202.46 MB/s 201.02 MB/s (-0.71%) 256 400.81 MB/s 416.81 MB/s 393.52 MB/s (-5.59%) 512 746.49 MB/s 834.12 MB/s 809.99 MB/s (-2.89%) 1024 1292.33 MB/s 1641.96 MB/s 1571.82 MB/s (-4.27%) 2048 2007.64 MB/s 2760.44 MB/s 2717.68 MB/s (-1.55%) 4096 2665.17 MB/s 4157.44 MB/s 4070.76 MB/s (-2.09%) 8192 3159.72 MB/s 4361.57 MB/s 4270.65 MB/s (-2.08%) 16384 4186.70 MB/s 4574.13 MB/s 4501.17 MB/s (-1.60%) 32768 4093.21 MB/s 4487.42 MB/s 4322.43 MB/s (-3.68%) 65536 4057.14 MB/s 4735.61 MB/s 4555.17 MB/s (-3.81%) 2) regression in buffer initialization and destruction path, which is brought by additional MR operations of sndbufs. But thanks to link group buffer reuse mechanism, the impact of this kind of regression decreases as times of buffer reuse increases. Taking 256KB sndbuf and RMB as an example, latency of some key SMC-R buffer-related function obtained by bpftrace are as follows: Function Phys-bufs Virt-bufs smcr_new_buf_create() 67154 ns 79164 ns smc_ib_buf_map_sg() 525 ns 928 ns smc_ib_get_memory_region() 162294 ns 161191 ns smc_wr_reg_send() 9957 ns 9635 ns smc_ib_put_memory_region() 203548 ns 198374 ns smc_ib_buf_unmap_sg() 508 ns 1158 ns ------------ Test environment notes: 1. Above tests run on 2 VMs within the same Host. 2. The NIC is ConnectX-4Lx, using SRIOV and passing through 2 VFs to the each VM respectively. 3. VMs' vCPUs are binded to different physical CPUs, and the binded physical CPUs are isolated by `isolcpus=xxx` cmdline. 4. NICs' queue number are set to 1. Signed-off-by: Wen Gu <guwen@linux.alibaba.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-07-14 09:44:04 +00:00
for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
if (!smc_link_active(&lgr->lnk[i]))
continue;
rc = smcr_link_reg_buf(&lgr->lnk[i], snd_desc);
if (rc)
break;
}
up_write(&lgr->llc_conf_mutex);
net/smc: Allow virtually contiguous sndbufs or RMBs for SMC-R On long-running enterprise production servers, high-order contiguous memory pages are usually very rare and in most cases we can only get fragmented pages. When replacing TCP with SMC-R in such production scenarios, attempting to allocate high-order physically contiguous sndbufs and RMBs may result in frequent memory compaction, which will cause unexpected hung issue and further stability risks. So this patch is aimed to allow SMC-R link group to use virtually contiguous sndbufs and RMBs to avoid potential issues mentioned above. Whether to use physically or virtually contiguous buffers can be set by sysctl smcr_buf_type. Note that using virtually contiguous buffers will bring an acceptable performance regression, which can be mainly divided into two parts: 1) regression in data path, which is brought by additional address translation of sndbuf by RNIC in Tx. But in general, translating address through MTT is fast. Taking 256KB sndbuf and RMB as an example, the comparisons in qperf latency and bandwidth test with physically and virtually contiguous buffers are as follows: - client: smc_run taskset -c <cpu> qperf <server> -oo msg_size:1:64K:*2\ -t 5 -vu tcp_{bw|lat} - server: smc_run taskset -c <cpu> qperf [latency] msgsize tcp smcr smcr-use-virt-buf 1 11.17 us 7.56 us 7.51 us (-0.67%) 2 10.65 us 7.74 us 7.56 us (-2.31%) 4 11.11 us 7.52 us 7.59 us ( 0.84%) 8 10.83 us 7.55 us 7.51 us (-0.48%) 16 11.21 us 7.46 us 7.51 us ( 0.71%) 32 10.65 us 7.53 us 7.58 us ( 0.61%) 64 10.95 us 7.74 us 7.80 us ( 0.76%) 128 11.14 us 7.83 us 7.87 us ( 0.47%) 256 10.97 us 7.94 us 7.92 us (-0.28%) 512 11.23 us 7.94 us 8.20 us ( 3.25%) 1024 11.60 us 8.12 us 8.20 us ( 0.96%) 2048 14.04 us 8.30 us 8.51 us ( 2.49%) 4096 16.88 us 9.13 us 9.07 us (-0.64%) 8192 22.50 us 10.56 us 11.22 us ( 6.26%) 16384 28.99 us 12.88 us 13.83 us ( 7.37%) 32768 40.13 us 16.76 us 16.95 us ( 1.16%) 65536 68.70 us 24.68 us 24.85 us ( 0.68%) [bandwidth] msgsize tcp smcr smcr-use-virt-buf 1 1.65 MB/s 1.59 MB/s 1.53 MB/s (-3.88%) 2 3.32 MB/s 3.17 MB/s 3.08 MB/s (-2.67%) 4 6.66 MB/s 6.33 MB/s 6.09 MB/s (-3.85%) 8 13.67 MB/s 13.45 MB/s 11.97 MB/s (-10.99%) 16 25.36 MB/s 27.15 MB/s 24.16 MB/s (-11.01%) 32 48.22 MB/s 54.24 MB/s 49.41 MB/s (-8.89%) 64 106.79 MB/s 107.32 MB/s 99.05 MB/s (-7.71%) 128 210.21 MB/s 202.46 MB/s 201.02 MB/s (-0.71%) 256 400.81 MB/s 416.81 MB/s 393.52 MB/s (-5.59%) 512 746.49 MB/s 834.12 MB/s 809.99 MB/s (-2.89%) 1024 1292.33 MB/s 1641.96 MB/s 1571.82 MB/s (-4.27%) 2048 2007.64 MB/s 2760.44 MB/s 2717.68 MB/s (-1.55%) 4096 2665.17 MB/s 4157.44 MB/s 4070.76 MB/s (-2.09%) 8192 3159.72 MB/s 4361.57 MB/s 4270.65 MB/s (-2.08%) 16384 4186.70 MB/s 4574.13 MB/s 4501.17 MB/s (-1.60%) 32768 4093.21 MB/s 4487.42 MB/s 4322.43 MB/s (-3.68%) 65536 4057.14 MB/s 4735.61 MB/s 4555.17 MB/s (-3.81%) 2) regression in buffer initialization and destruction path, which is brought by additional MR operations of sndbufs. But thanks to link group buffer reuse mechanism, the impact of this kind of regression decreases as times of buffer reuse increases. Taking 256KB sndbuf and RMB as an example, latency of some key SMC-R buffer-related function obtained by bpftrace are as follows: Function Phys-bufs Virt-bufs smcr_new_buf_create() 67154 ns 79164 ns smc_ib_buf_map_sg() 525 ns 928 ns smc_ib_get_memory_region() 162294 ns 161191 ns smc_wr_reg_send() 9957 ns 9635 ns smc_ib_put_memory_region() 203548 ns 198374 ns smc_ib_buf_unmap_sg() 508 ns 1158 ns ------------ Test environment notes: 1. Above tests run on 2 VMs within the same Host. 2. The NIC is ConnectX-4Lx, using SRIOV and passing through 2 VFs to the each VM respectively. 3. VMs' vCPUs are binded to different physical CPUs, and the binded physical CPUs are isolated by `isolcpus=xxx` cmdline. 4. NICs' queue number are set to 1. Signed-off-by: Wen Gu <guwen@linux.alibaba.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-07-14 09:44:04 +00:00
return rc;
}
/* register the new rmb on all links */
static int smcr_lgr_reg_rmbs(struct smc_link *link,
struct smc_buf_desc *rmb_desc)
{
struct smc_link_group *lgr = link->lgr;
bool do_slow = false;
int i, rc = 0;
rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY);
if (rc)
return rc;
down_read(&lgr->llc_conf_mutex);
for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
if (!smc_link_active(&lgr->lnk[i]))
continue;
if (!rmb_desc->is_reg_mr[link->link_idx]) {
up_read(&lgr->llc_conf_mutex);
goto slow_path;
}
}
/* mr register already */
goto fast_path;
slow_path:
do_slow = true;
/* protect against parallel smc_llc_cli_rkey_exchange() and
net/smc: Allow virtually contiguous sndbufs or RMBs for SMC-R On long-running enterprise production servers, high-order contiguous memory pages are usually very rare and in most cases we can only get fragmented pages. When replacing TCP with SMC-R in such production scenarios, attempting to allocate high-order physically contiguous sndbufs and RMBs may result in frequent memory compaction, which will cause unexpected hung issue and further stability risks. So this patch is aimed to allow SMC-R link group to use virtually contiguous sndbufs and RMBs to avoid potential issues mentioned above. Whether to use physically or virtually contiguous buffers can be set by sysctl smcr_buf_type. Note that using virtually contiguous buffers will bring an acceptable performance regression, which can be mainly divided into two parts: 1) regression in data path, which is brought by additional address translation of sndbuf by RNIC in Tx. But in general, translating address through MTT is fast. Taking 256KB sndbuf and RMB as an example, the comparisons in qperf latency and bandwidth test with physically and virtually contiguous buffers are as follows: - client: smc_run taskset -c <cpu> qperf <server> -oo msg_size:1:64K:*2\ -t 5 -vu tcp_{bw|lat} - server: smc_run taskset -c <cpu> qperf [latency] msgsize tcp smcr smcr-use-virt-buf 1 11.17 us 7.56 us 7.51 us (-0.67%) 2 10.65 us 7.74 us 7.56 us (-2.31%) 4 11.11 us 7.52 us 7.59 us ( 0.84%) 8 10.83 us 7.55 us 7.51 us (-0.48%) 16 11.21 us 7.46 us 7.51 us ( 0.71%) 32 10.65 us 7.53 us 7.58 us ( 0.61%) 64 10.95 us 7.74 us 7.80 us ( 0.76%) 128 11.14 us 7.83 us 7.87 us ( 0.47%) 256 10.97 us 7.94 us 7.92 us (-0.28%) 512 11.23 us 7.94 us 8.20 us ( 3.25%) 1024 11.60 us 8.12 us 8.20 us ( 0.96%) 2048 14.04 us 8.30 us 8.51 us ( 2.49%) 4096 16.88 us 9.13 us 9.07 us (-0.64%) 8192 22.50 us 10.56 us 11.22 us ( 6.26%) 16384 28.99 us 12.88 us 13.83 us ( 7.37%) 32768 40.13 us 16.76 us 16.95 us ( 1.16%) 65536 68.70 us 24.68 us 24.85 us ( 0.68%) [bandwidth] msgsize tcp smcr smcr-use-virt-buf 1 1.65 MB/s 1.59 MB/s 1.53 MB/s (-3.88%) 2 3.32 MB/s 3.17 MB/s 3.08 MB/s (-2.67%) 4 6.66 MB/s 6.33 MB/s 6.09 MB/s (-3.85%) 8 13.67 MB/s 13.45 MB/s 11.97 MB/s (-10.99%) 16 25.36 MB/s 27.15 MB/s 24.16 MB/s (-11.01%) 32 48.22 MB/s 54.24 MB/s 49.41 MB/s (-8.89%) 64 106.79 MB/s 107.32 MB/s 99.05 MB/s (-7.71%) 128 210.21 MB/s 202.46 MB/s 201.02 MB/s (-0.71%) 256 400.81 MB/s 416.81 MB/s 393.52 MB/s (-5.59%) 512 746.49 MB/s 834.12 MB/s 809.99 MB/s (-2.89%) 1024 1292.33 MB/s 1641.96 MB/s 1571.82 MB/s (-4.27%) 2048 2007.64 MB/s 2760.44 MB/s 2717.68 MB/s (-1.55%) 4096 2665.17 MB/s 4157.44 MB/s 4070.76 MB/s (-2.09%) 8192 3159.72 MB/s 4361.57 MB/s 4270.65 MB/s (-2.08%) 16384 4186.70 MB/s 4574.13 MB/s 4501.17 MB/s (-1.60%) 32768 4093.21 MB/s 4487.42 MB/s 4322.43 MB/s (-3.68%) 65536 4057.14 MB/s 4735.61 MB/s 4555.17 MB/s (-3.81%) 2) regression in buffer initialization and destruction path, which is brought by additional MR operations of sndbufs. But thanks to link group buffer reuse mechanism, the impact of this kind of regression decreases as times of buffer reuse increases. Taking 256KB sndbuf and RMB as an example, latency of some key SMC-R buffer-related function obtained by bpftrace are as follows: Function Phys-bufs Virt-bufs smcr_new_buf_create() 67154 ns 79164 ns smc_ib_buf_map_sg() 525 ns 928 ns smc_ib_get_memory_region() 162294 ns 161191 ns smc_wr_reg_send() 9957 ns 9635 ns smc_ib_put_memory_region() 203548 ns 198374 ns smc_ib_buf_unmap_sg() 508 ns 1158 ns ------------ Test environment notes: 1. Above tests run on 2 VMs within the same Host. 2. The NIC is ConnectX-4Lx, using SRIOV and passing through 2 VFs to the each VM respectively. 3. VMs' vCPUs are binded to different physical CPUs, and the binded physical CPUs are isolated by `isolcpus=xxx` cmdline. 4. NICs' queue number are set to 1. Signed-off-by: Wen Gu <guwen@linux.alibaba.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-07-14 09:44:04 +00:00
* parallel smcr_link_reg_buf()
*/
down_write(&lgr->llc_conf_mutex);
for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
if (!smc_link_active(&lgr->lnk[i]))
continue;
net/smc: Allow virtually contiguous sndbufs or RMBs for SMC-R On long-running enterprise production servers, high-order contiguous memory pages are usually very rare and in most cases we can only get fragmented pages. When replacing TCP with SMC-R in such production scenarios, attempting to allocate high-order physically contiguous sndbufs and RMBs may result in frequent memory compaction, which will cause unexpected hung issue and further stability risks. So this patch is aimed to allow SMC-R link group to use virtually contiguous sndbufs and RMBs to avoid potential issues mentioned above. Whether to use physically or virtually contiguous buffers can be set by sysctl smcr_buf_type. Note that using virtually contiguous buffers will bring an acceptable performance regression, which can be mainly divided into two parts: 1) regression in data path, which is brought by additional address translation of sndbuf by RNIC in Tx. But in general, translating address through MTT is fast. Taking 256KB sndbuf and RMB as an example, the comparisons in qperf latency and bandwidth test with physically and virtually contiguous buffers are as follows: - client: smc_run taskset -c <cpu> qperf <server> -oo msg_size:1:64K:*2\ -t 5 -vu tcp_{bw|lat} - server: smc_run taskset -c <cpu> qperf [latency] msgsize tcp smcr smcr-use-virt-buf 1 11.17 us 7.56 us 7.51 us (-0.67%) 2 10.65 us 7.74 us 7.56 us (-2.31%) 4 11.11 us 7.52 us 7.59 us ( 0.84%) 8 10.83 us 7.55 us 7.51 us (-0.48%) 16 11.21 us 7.46 us 7.51 us ( 0.71%) 32 10.65 us 7.53 us 7.58 us ( 0.61%) 64 10.95 us 7.74 us 7.80 us ( 0.76%) 128 11.14 us 7.83 us 7.87 us ( 0.47%) 256 10.97 us 7.94 us 7.92 us (-0.28%) 512 11.23 us 7.94 us 8.20 us ( 3.25%) 1024 11.60 us 8.12 us 8.20 us ( 0.96%) 2048 14.04 us 8.30 us 8.51 us ( 2.49%) 4096 16.88 us 9.13 us 9.07 us (-0.64%) 8192 22.50 us 10.56 us 11.22 us ( 6.26%) 16384 28.99 us 12.88 us 13.83 us ( 7.37%) 32768 40.13 us 16.76 us 16.95 us ( 1.16%) 65536 68.70 us 24.68 us 24.85 us ( 0.68%) [bandwidth] msgsize tcp smcr smcr-use-virt-buf 1 1.65 MB/s 1.59 MB/s 1.53 MB/s (-3.88%) 2 3.32 MB/s 3.17 MB/s 3.08 MB/s (-2.67%) 4 6.66 MB/s 6.33 MB/s 6.09 MB/s (-3.85%) 8 13.67 MB/s 13.45 MB/s 11.97 MB/s (-10.99%) 16 25.36 MB/s 27.15 MB/s 24.16 MB/s (-11.01%) 32 48.22 MB/s 54.24 MB/s 49.41 MB/s (-8.89%) 64 106.79 MB/s 107.32 MB/s 99.05 MB/s (-7.71%) 128 210.21 MB/s 202.46 MB/s 201.02 MB/s (-0.71%) 256 400.81 MB/s 416.81 MB/s 393.52 MB/s (-5.59%) 512 746.49 MB/s 834.12 MB/s 809.99 MB/s (-2.89%) 1024 1292.33 MB/s 1641.96 MB/s 1571.82 MB/s (-4.27%) 2048 2007.64 MB/s 2760.44 MB/s 2717.68 MB/s (-1.55%) 4096 2665.17 MB/s 4157.44 MB/s 4070.76 MB/s (-2.09%) 8192 3159.72 MB/s 4361.57 MB/s 4270.65 MB/s (-2.08%) 16384 4186.70 MB/s 4574.13 MB/s 4501.17 MB/s (-1.60%) 32768 4093.21 MB/s 4487.42 MB/s 4322.43 MB/s (-3.68%) 65536 4057.14 MB/s 4735.61 MB/s 4555.17 MB/s (-3.81%) 2) regression in buffer initialization and destruction path, which is brought by additional MR operations of sndbufs. But thanks to link group buffer reuse mechanism, the impact of this kind of regression decreases as times of buffer reuse increases. Taking 256KB sndbuf and RMB as an example, latency of some key SMC-R buffer-related function obtained by bpftrace are as follows: Function Phys-bufs Virt-bufs smcr_new_buf_create() 67154 ns 79164 ns smc_ib_buf_map_sg() 525 ns 928 ns smc_ib_get_memory_region() 162294 ns 161191 ns smc_wr_reg_send() 9957 ns 9635 ns smc_ib_put_memory_region() 203548 ns 198374 ns smc_ib_buf_unmap_sg() 508 ns 1158 ns ------------ Test environment notes: 1. Above tests run on 2 VMs within the same Host. 2. The NIC is ConnectX-4Lx, using SRIOV and passing through 2 VFs to the each VM respectively. 3. VMs' vCPUs are binded to different physical CPUs, and the binded physical CPUs are isolated by `isolcpus=xxx` cmdline. 4. NICs' queue number are set to 1. Signed-off-by: Wen Gu <guwen@linux.alibaba.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-07-14 09:44:04 +00:00
rc = smcr_link_reg_buf(&lgr->lnk[i], rmb_desc);
if (rc)
goto out;
}
fast_path:
/* exchange confirm_rkey msg with peer */
rc = smc_llc_do_confirm_rkey(link, rmb_desc);
if (rc) {
rc = -EFAULT;
goto out;
}
rmb_desc->is_conf_rkey = true;
out:
do_slow ? up_write(&lgr->llc_conf_mutex) : up_read(&lgr->llc_conf_mutex);
smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl);
return rc;
}
static int smcr_clnt_conf_first_link(struct smc_sock *smc)
{
struct smc_link *link = smc->conn.lnk;
struct smc_llc_qentry *qentry;
int rc;
net/smc: avoid data corruption caused by decline We found a data corruption issue during testing of SMC-R on Redis applications. The benchmark has a low probability of reporting a strange error as shown below. "Error: Protocol error, got "\xe2" as reply type byte" Finally, we found that the retrieved error data was as follows: 0xE2 0xD4 0xC3 0xD9 0x04 0x00 0x2C 0x20 0xA6 0x56 0x00 0x16 0x3E 0x0C 0xCB 0x04 0x02 0x01 0x00 0x00 0x20 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0xE2 It is quite obvious that this is a SMC DECLINE message, which means that the applications received SMC protocol message. We found that this was caused by the following situations: client server ¦ clc proposal -------------> ¦ clc accept <------------- ¦ clc confirm -------------> wait llc confirm send llc confirm ¦failed llc confirm ¦ x------ (after 2s)timeout wait llc confirm rsp wait decline (after 1s) timeout (after 2s) timeout ¦ decline --------------> ¦ decline <-------------- As a result, a decline message was sent in the implementation, and this message was read from TCP by the already-fallback connection. This patch double the client timeout as 2x of the server value, With this simple change, the Decline messages should never cross or collide (during Confirm link timeout). This issue requires an immediate solution, since the protocol updates involve a more long-term solution. Fixes: 0fb0b02bd6fd ("net/smc: adapt SMC client code to use the LLC flow") Signed-off-by: D. Wythe <alibuda@linux.alibaba.com> Reviewed-by: Wen Gu <guwen@linux.alibaba.com> Reviewed-by: Wenjia Zhang <wenjia@linux.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2023-11-22 02:37:05 +00:00
/* Receive CONFIRM LINK request from server over RoCE fabric.
* Increasing the client's timeout by twice as much as the server's
* timeout by default can temporarily avoid decline messages of
* both sides crossing or colliding
*/
qentry = smc_llc_wait(link->lgr, NULL, 2 * SMC_LLC_WAIT_TIME,
SMC_LLC_CONFIRM_LINK);
if (!qentry) {
struct smc_clc_msg_decline dclc;
rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
}
smc_llc_save_peer_uid(qentry);
rc = smc_llc_eval_conf_link(qentry, SMC_LLC_REQ);
smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl);
if (rc)
return SMC_CLC_DECL_RMBE_EC;
rc = smc_ib_modify_qp_rts(link);
if (rc)
return SMC_CLC_DECL_ERR_RDYLNK;
smc_wr_remember_qp_attr(link);
net/smc: Allow virtually contiguous sndbufs or RMBs for SMC-R On long-running enterprise production servers, high-order contiguous memory pages are usually very rare and in most cases we can only get fragmented pages. When replacing TCP with SMC-R in such production scenarios, attempting to allocate high-order physically contiguous sndbufs and RMBs may result in frequent memory compaction, which will cause unexpected hung issue and further stability risks. So this patch is aimed to allow SMC-R link group to use virtually contiguous sndbufs and RMBs to avoid potential issues mentioned above. Whether to use physically or virtually contiguous buffers can be set by sysctl smcr_buf_type. Note that using virtually contiguous buffers will bring an acceptable performance regression, which can be mainly divided into two parts: 1) regression in data path, which is brought by additional address translation of sndbuf by RNIC in Tx. But in general, translating address through MTT is fast. Taking 256KB sndbuf and RMB as an example, the comparisons in qperf latency and bandwidth test with physically and virtually contiguous buffers are as follows: - client: smc_run taskset -c <cpu> qperf <server> -oo msg_size:1:64K:*2\ -t 5 -vu tcp_{bw|lat} - server: smc_run taskset -c <cpu> qperf [latency] msgsize tcp smcr smcr-use-virt-buf 1 11.17 us 7.56 us 7.51 us (-0.67%) 2 10.65 us 7.74 us 7.56 us (-2.31%) 4 11.11 us 7.52 us 7.59 us ( 0.84%) 8 10.83 us 7.55 us 7.51 us (-0.48%) 16 11.21 us 7.46 us 7.51 us ( 0.71%) 32 10.65 us 7.53 us 7.58 us ( 0.61%) 64 10.95 us 7.74 us 7.80 us ( 0.76%) 128 11.14 us 7.83 us 7.87 us ( 0.47%) 256 10.97 us 7.94 us 7.92 us (-0.28%) 512 11.23 us 7.94 us 8.20 us ( 3.25%) 1024 11.60 us 8.12 us 8.20 us ( 0.96%) 2048 14.04 us 8.30 us 8.51 us ( 2.49%) 4096 16.88 us 9.13 us 9.07 us (-0.64%) 8192 22.50 us 10.56 us 11.22 us ( 6.26%) 16384 28.99 us 12.88 us 13.83 us ( 7.37%) 32768 40.13 us 16.76 us 16.95 us ( 1.16%) 65536 68.70 us 24.68 us 24.85 us ( 0.68%) [bandwidth] msgsize tcp smcr smcr-use-virt-buf 1 1.65 MB/s 1.59 MB/s 1.53 MB/s (-3.88%) 2 3.32 MB/s 3.17 MB/s 3.08 MB/s (-2.67%) 4 6.66 MB/s 6.33 MB/s 6.09 MB/s (-3.85%) 8 13.67 MB/s 13.45 MB/s 11.97 MB/s (-10.99%) 16 25.36 MB/s 27.15 MB/s 24.16 MB/s (-11.01%) 32 48.22 MB/s 54.24 MB/s 49.41 MB/s (-8.89%) 64 106.79 MB/s 107.32 MB/s 99.05 MB/s (-7.71%) 128 210.21 MB/s 202.46 MB/s 201.02 MB/s (-0.71%) 256 400.81 MB/s 416.81 MB/s 393.52 MB/s (-5.59%) 512 746.49 MB/s 834.12 MB/s 809.99 MB/s (-2.89%) 1024 1292.33 MB/s 1641.96 MB/s 1571.82 MB/s (-4.27%) 2048 2007.64 MB/s 2760.44 MB/s 2717.68 MB/s (-1.55%) 4096 2665.17 MB/s 4157.44 MB/s 4070.76 MB/s (-2.09%) 8192 3159.72 MB/s 4361.57 MB/s 4270.65 MB/s (-2.08%) 16384 4186.70 MB/s 4574.13 MB/s 4501.17 MB/s (-1.60%) 32768 4093.21 MB/s 4487.42 MB/s 4322.43 MB/s (-3.68%) 65536 4057.14 MB/s 4735.61 MB/s 4555.17 MB/s (-3.81%) 2) regression in buffer initialization and destruction path, which is brought by additional MR operations of sndbufs. But thanks to link group buffer reuse mechanism, the impact of this kind of regression decreases as times of buffer reuse increases. Taking 256KB sndbuf and RMB as an example, latency of some key SMC-R buffer-related function obtained by bpftrace are as follows: Function Phys-bufs Virt-bufs smcr_new_buf_create() 67154 ns 79164 ns smc_ib_buf_map_sg() 525 ns 928 ns smc_ib_get_memory_region() 162294 ns 161191 ns smc_wr_reg_send() 9957 ns 9635 ns smc_ib_put_memory_region() 203548 ns 198374 ns smc_ib_buf_unmap_sg() 508 ns 1158 ns ------------ Test environment notes: 1. Above tests run on 2 VMs within the same Host. 2. The NIC is ConnectX-4Lx, using SRIOV and passing through 2 VFs to the each VM respectively. 3. VMs' vCPUs are binded to different physical CPUs, and the binded physical CPUs are isolated by `isolcpus=xxx` cmdline. 4. NICs' queue number are set to 1. Signed-off-by: Wen Gu <guwen@linux.alibaba.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-07-14 09:44:04 +00:00
/* reg the sndbuf if it was vzalloced */
if (smc->conn.sndbuf_desc->is_vm) {
if (smcr_link_reg_buf(link, smc->conn.sndbuf_desc))
return SMC_CLC_DECL_ERR_REGBUF;
}
/* reg the rmb */
if (smcr_link_reg_buf(link, smc->conn.rmb_desc))
return SMC_CLC_DECL_ERR_REGBUF;
/* confirm_rkey is implicit on 1st contact */
smc->conn.rmb_desc->is_conf_rkey = true;
/* send CONFIRM LINK response over RoCE fabric */
rc = smc_llc_send_confirm_link(link, SMC_LLC_RESP);
if (rc < 0)
return SMC_CLC_DECL_TIMEOUT_CL;
smc_llc_link_active(link);
smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE);
if (link->lgr->max_links > 1) {
/* optional 2nd link, receive ADD LINK request from server */
qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME,
SMC_LLC_ADD_LINK);
if (!qentry) {
struct smc_clc_msg_decline dclc;
rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
if (rc == -EAGAIN)
rc = 0; /* no DECLINE received, go with one link */
return rc;
}
smc_llc_flow_qentry_clr(&link->lgr->llc_flow_lcl);
smc_llc_cli_add_link(link, qentry);
}
return 0;
}
static bool smc_isascii(char *hostname)
{
int i;
for (i = 0; i < SMC_MAX_HOSTNAME_LEN; i++)
if (!isascii(hostname[i]))
return false;
return true;
}
static void smc_conn_save_peer_info_fce(struct smc_sock *smc,
struct smc_clc_msg_accept_confirm *clc)
{
struct smc_clc_first_contact_ext *fce;
int clc_v2_len;
if (clc->hdr.version == SMC_V1 ||
!(clc->hdr.typev2 & SMC_FIRST_CONTACT_MASK))
return;
if (smc->conn.lgr->is_smcd) {
memcpy(smc->conn.lgr->negotiated_eid, clc->d1.eid,
SMC_MAX_EID_LEN);
clc_v2_len = offsetofend(struct smc_clc_msg_accept_confirm, d1);
} else {
memcpy(smc->conn.lgr->negotiated_eid, clc->r1.eid,
SMC_MAX_EID_LEN);
clc_v2_len = offsetofend(struct smc_clc_msg_accept_confirm, r1);
}
fce = (struct smc_clc_first_contact_ext *)(((u8 *)clc) + clc_v2_len);
smc->conn.lgr->peer_os = fce->os_type;
smc->conn.lgr->peer_smc_release = fce->release;
if (smc_isascii(fce->hostname))
memcpy(smc->conn.lgr->peer_hostname, fce->hostname,
SMC_MAX_HOSTNAME_LEN);
}
static void smcr_conn_save_peer_info(struct smc_sock *smc,
struct smc_clc_msg_accept_confirm *clc)
{
int bufsize = smc_uncompress_bufsize(clc->r0.rmbe_size);
smc->conn.peer_rmbe_idx = clc->r0.rmbe_idx;
smc->conn.local_tx_ctrl.token = ntohl(clc->r0.rmbe_alert_token);
smc->conn.peer_rmbe_size = bufsize;
atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1);
}
static void smcd_conn_save_peer_info(struct smc_sock *smc,
struct smc_clc_msg_accept_confirm *clc)
{
int bufsize = smc_uncompress_bufsize(clc->d0.dmbe_size);
smc->conn.peer_rmbe_idx = clc->d0.dmbe_idx;
smc->conn.peer_token = ntohll(clc->d0.token);
/* msg header takes up space in the buffer */
smc->conn.peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg);
atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
smc->conn.tx_off = bufsize * smc->conn.peer_rmbe_idx;
}
static void smc_conn_save_peer_info(struct smc_sock *smc,
struct smc_clc_msg_accept_confirm *clc)
{
if (smc->conn.lgr->is_smcd)
smcd_conn_save_peer_info(smc, clc);
else
smcr_conn_save_peer_info(smc, clc);
smc_conn_save_peer_info_fce(smc, clc);
}
static void smc_link_save_peer_info(struct smc_link *link,
struct smc_clc_msg_accept_confirm *clc,
struct smc_init_info *ini)
{
link->peer_qpn = ntoh24(clc->r0.qpn);
memcpy(link->peer_gid, ini->peer_gid, SMC_GID_SIZE);
memcpy(link->peer_mac, ini->peer_mac, sizeof(link->peer_mac));
link->peer_psn = ntoh24(clc->r0.psn);
link->peer_mtu = clc->r0.qp_mtu;
}
static void smc_stat_inc_fback_rsn_cnt(struct smc_sock *smc,
struct smc_stats_fback *fback_arr)
{
int cnt;
for (cnt = 0; cnt < SMC_MAX_FBACK_RSN_CNT; cnt++) {
if (fback_arr[cnt].fback_code == smc->fallback_rsn) {
fback_arr[cnt].count++;
break;
}
if (!fback_arr[cnt].fback_code) {
fback_arr[cnt].fback_code = smc->fallback_rsn;
fback_arr[cnt].count++;
break;
}
}
}
static void smc_stat_fallback(struct smc_sock *smc)
{
struct net *net = sock_net(&smc->sk);
mutex_lock(&net->smc.mutex_fback_rsn);
if (smc->listen_smc) {
smc_stat_inc_fback_rsn_cnt(smc, net->smc.fback_rsn->srv);
net->smc.fback_rsn->srv_fback_cnt++;
} else {
smc_stat_inc_fback_rsn_cnt(smc, net->smc.fback_rsn->clnt);
net->smc.fback_rsn->clnt_fback_cnt++;
}
mutex_unlock(&net->smc.mutex_fback_rsn);
}
net/smc: Forward wakeup to smc socket waitqueue after fallback When we replace TCP with SMC and a fallback occurs, there may be some socket waitqueue entries remaining in smc socket->wq, such as eppoll_entries inserted by userspace applications. After the fallback, data flows over TCP/IP and only clcsocket->wq will be woken up. Applications can't be notified by the entries which were inserted in smc socket->wq before fallback. So we need a mechanism to wake up smc socket->wq at the same time if some entries remaining in it. The current workaround is to transfer the entries from smc socket->wq to clcsock->wq during the fallback. But this may cause a crash like this: general protection fault, probably for non-canonical address 0xdead000000000100: 0000 [#1] PREEMPT SMP PTI CPU: 3 PID: 0 Comm: swapper/3 Kdump: loaded Tainted: G E 5.16.0+ #107 RIP: 0010:__wake_up_common+0x65/0x170 Call Trace: <IRQ> __wake_up_common_lock+0x7a/0xc0 sock_def_readable+0x3c/0x70 tcp_data_queue+0x4a7/0xc40 tcp_rcv_established+0x32f/0x660 ? sk_filter_trim_cap+0xcb/0x2e0 tcp_v4_do_rcv+0x10b/0x260 tcp_v4_rcv+0xd2a/0xde0 ip_protocol_deliver_rcu+0x3b/0x1d0 ip_local_deliver_finish+0x54/0x60 ip_local_deliver+0x6a/0x110 ? tcp_v4_early_demux+0xa2/0x140 ? tcp_v4_early_demux+0x10d/0x140 ip_sublist_rcv_finish+0x49/0x60 ip_sublist_rcv+0x19d/0x230 ip_list_rcv+0x13e/0x170 __netif_receive_skb_list_core+0x1c2/0x240 netif_receive_skb_list_internal+0x1e6/0x320 napi_complete_done+0x11d/0x190 mlx5e_napi_poll+0x163/0x6b0 [mlx5_core] __napi_poll+0x3c/0x1b0 net_rx_action+0x27c/0x300 __do_softirq+0x114/0x2d2 irq_exit_rcu+0xb4/0xe0 common_interrupt+0xba/0xe0 </IRQ> <TASK> The crash is caused by privately transferring waitqueue entries from smc socket->wq to clcsock->wq. The owners of these entries, such as epoll, have no idea that the entries have been transferred to a different socket wait queue and still use original waitqueue spinlock (smc socket->wq.wait.lock) to make the entries operation exclusive, but it doesn't work. The operations to the entries, such as removing from the waitqueue (now is clcsock->wq after fallback), may cause a crash when clcsock waitqueue is being iterated over at the moment. This patch tries to fix this by no longer transferring wait queue entries privately, but introducing own implementations of clcsock's callback functions in fallback situation. The callback functions will forward the wakeup to smc socket->wq if clcsock->wq is actually woken up and smc socket->wq has remaining entries. Fixes: 2153bd1e3d3d ("net/smc: Transfer remaining wait queue entries during fallback") Suggested-by: Karsten Graul <kgraul@linux.ibm.com> Signed-off-by: Wen Gu <guwen@linux.alibaba.com> Acked-by: Karsten Graul <kgraul@linux.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-01-26 15:33:04 +00:00
/* must be called under rcu read lock */
static void smc_fback_wakeup_waitqueue(struct smc_sock *smc, void *key)
{
struct socket_wq *wq;
__poll_t flags;
wq = rcu_dereference(smc->sk.sk_wq);
if (!skwq_has_sleeper(wq))
return;
/* wake up smc sk->sk_wq */
if (!key) {
/* sk_state_change */
wake_up_interruptible_all(&wq->wait);
} else {
flags = key_to_poll(key);
if (flags & (EPOLLIN | EPOLLOUT))
/* sk_data_ready or sk_write_space */
wake_up_interruptible_sync_poll(&wq->wait, flags);
else if (flags & EPOLLERR)
/* sk_error_report */
wake_up_interruptible_poll(&wq->wait, flags);
}
}
static int smc_fback_mark_woken(wait_queue_entry_t *wait,
unsigned int mode, int sync, void *key)
{
struct smc_mark_woken *mark =
container_of(wait, struct smc_mark_woken, wait_entry);
mark->woken = true;
mark->key = key;
return 0;
}
static void smc_fback_forward_wakeup(struct smc_sock *smc, struct sock *clcsk,
void (*clcsock_callback)(struct sock *sk))
{
struct smc_mark_woken mark = { .woken = false };
struct socket_wq *wq;
init_waitqueue_func_entry(&mark.wait_entry,
smc_fback_mark_woken);
rcu_read_lock();
wq = rcu_dereference(clcsk->sk_wq);
if (!wq)
goto out;
add_wait_queue(sk_sleep(clcsk), &mark.wait_entry);
clcsock_callback(clcsk);
remove_wait_queue(sk_sleep(clcsk), &mark.wait_entry);
if (mark.woken)
smc_fback_wakeup_waitqueue(smc, mark.key);
out:
rcu_read_unlock();
}
static void smc_fback_state_change(struct sock *clcsk)
{
struct smc_sock *smc;
net/smc: Forward wakeup to smc socket waitqueue after fallback When we replace TCP with SMC and a fallback occurs, there may be some socket waitqueue entries remaining in smc socket->wq, such as eppoll_entries inserted by userspace applications. After the fallback, data flows over TCP/IP and only clcsocket->wq will be woken up. Applications can't be notified by the entries which were inserted in smc socket->wq before fallback. So we need a mechanism to wake up smc socket->wq at the same time if some entries remaining in it. The current workaround is to transfer the entries from smc socket->wq to clcsock->wq during the fallback. But this may cause a crash like this: general protection fault, probably for non-canonical address 0xdead000000000100: 0000 [#1] PREEMPT SMP PTI CPU: 3 PID: 0 Comm: swapper/3 Kdump: loaded Tainted: G E 5.16.0+ #107 RIP: 0010:__wake_up_common+0x65/0x170 Call Trace: <IRQ> __wake_up_common_lock+0x7a/0xc0 sock_def_readable+0x3c/0x70 tcp_data_queue+0x4a7/0xc40 tcp_rcv_established+0x32f/0x660 ? sk_filter_trim_cap+0xcb/0x2e0 tcp_v4_do_rcv+0x10b/0x260 tcp_v4_rcv+0xd2a/0xde0 ip_protocol_deliver_rcu+0x3b/0x1d0 ip_local_deliver_finish+0x54/0x60 ip_local_deliver+0x6a/0x110 ? tcp_v4_early_demux+0xa2/0x140 ? tcp_v4_early_demux+0x10d/0x140 ip_sublist_rcv_finish+0x49/0x60 ip_sublist_rcv+0x19d/0x230 ip_list_rcv+0x13e/0x170 __netif_receive_skb_list_core+0x1c2/0x240 netif_receive_skb_list_internal+0x1e6/0x320 napi_complete_done+0x11d/0x190 mlx5e_napi_poll+0x163/0x6b0 [mlx5_core] __napi_poll+0x3c/0x1b0 net_rx_action+0x27c/0x300 __do_softirq+0x114/0x2d2 irq_exit_rcu+0xb4/0xe0 common_interrupt+0xba/0xe0 </IRQ> <TASK> The crash is caused by privately transferring waitqueue entries from smc socket->wq to clcsock->wq. The owners of these entries, such as epoll, have no idea that the entries have been transferred to a different socket wait queue and still use original waitqueue spinlock (smc socket->wq.wait.lock) to make the entries operation exclusive, but it doesn't work. The operations to the entries, such as removing from the waitqueue (now is clcsock->wq after fallback), may cause a crash when clcsock waitqueue is being iterated over at the moment. This patch tries to fix this by no longer transferring wait queue entries privately, but introducing own implementations of clcsock's callback functions in fallback situation. The callback functions will forward the wakeup to smc socket->wq if clcsock->wq is actually woken up and smc socket->wq has remaining entries. Fixes: 2153bd1e3d3d ("net/smc: Transfer remaining wait queue entries during fallback") Suggested-by: Karsten Graul <kgraul@linux.ibm.com> Signed-off-by: Wen Gu <guwen@linux.alibaba.com> Acked-by: Karsten Graul <kgraul@linux.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-01-26 15:33:04 +00:00
read_lock_bh(&clcsk->sk_callback_lock);
smc = smc_clcsock_user_data(clcsk);
if (smc)
smc_fback_forward_wakeup(smc, clcsk,
smc->clcsk_state_change);
read_unlock_bh(&clcsk->sk_callback_lock);
net/smc: Forward wakeup to smc socket waitqueue after fallback When we replace TCP with SMC and a fallback occurs, there may be some socket waitqueue entries remaining in smc socket->wq, such as eppoll_entries inserted by userspace applications. After the fallback, data flows over TCP/IP and only clcsocket->wq will be woken up. Applications can't be notified by the entries which were inserted in smc socket->wq before fallback. So we need a mechanism to wake up smc socket->wq at the same time if some entries remaining in it. The current workaround is to transfer the entries from smc socket->wq to clcsock->wq during the fallback. But this may cause a crash like this: general protection fault, probably for non-canonical address 0xdead000000000100: 0000 [#1] PREEMPT SMP PTI CPU: 3 PID: 0 Comm: swapper/3 Kdump: loaded Tainted: G E 5.16.0+ #107 RIP: 0010:__wake_up_common+0x65/0x170 Call Trace: <IRQ> __wake_up_common_lock+0x7a/0xc0 sock_def_readable+0x3c/0x70 tcp_data_queue+0x4a7/0xc40 tcp_rcv_established+0x32f/0x660 ? sk_filter_trim_cap+0xcb/0x2e0 tcp_v4_do_rcv+0x10b/0x260 tcp_v4_rcv+0xd2a/0xde0 ip_protocol_deliver_rcu+0x3b/0x1d0 ip_local_deliver_finish+0x54/0x60 ip_local_deliver+0x6a/0x110 ? tcp_v4_early_demux+0xa2/0x140 ? tcp_v4_early_demux+0x10d/0x140 ip_sublist_rcv_finish+0x49/0x60 ip_sublist_rcv+0x19d/0x230 ip_list_rcv+0x13e/0x170 __netif_receive_skb_list_core+0x1c2/0x240 netif_receive_skb_list_internal+0x1e6/0x320 napi_complete_done+0x11d/0x190 mlx5e_napi_poll+0x163/0x6b0 [mlx5_core] __napi_poll+0x3c/0x1b0 net_rx_action+0x27c/0x300 __do_softirq+0x114/0x2d2 irq_exit_rcu+0xb4/0xe0 common_interrupt+0xba/0xe0 </IRQ> <TASK> The crash is caused by privately transferring waitqueue entries from smc socket->wq to clcsock->wq. The owners of these entries, such as epoll, have no idea that the entries have been transferred to a different socket wait queue and still use original waitqueue spinlock (smc socket->wq.wait.lock) to make the entries operation exclusive, but it doesn't work. The operations to the entries, such as removing from the waitqueue (now is clcsock->wq after fallback), may cause a crash when clcsock waitqueue is being iterated over at the moment. This patch tries to fix this by no longer transferring wait queue entries privately, but introducing own implementations of clcsock's callback functions in fallback situation. The callback functions will forward the wakeup to smc socket->wq if clcsock->wq is actually woken up and smc socket->wq has remaining entries. Fixes: 2153bd1e3d3d ("net/smc: Transfer remaining wait queue entries during fallback") Suggested-by: Karsten Graul <kgraul@linux.ibm.com> Signed-off-by: Wen Gu <guwen@linux.alibaba.com> Acked-by: Karsten Graul <kgraul@linux.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-01-26 15:33:04 +00:00
}
static void smc_fback_data_ready(struct sock *clcsk)
{
struct smc_sock *smc;
net/smc: Forward wakeup to smc socket waitqueue after fallback When we replace TCP with SMC and a fallback occurs, there may be some socket waitqueue entries remaining in smc socket->wq, such as eppoll_entries inserted by userspace applications. After the fallback, data flows over TCP/IP and only clcsocket->wq will be woken up. Applications can't be notified by the entries which were inserted in smc socket->wq before fallback. So we need a mechanism to wake up smc socket->wq at the same time if some entries remaining in it. The current workaround is to transfer the entries from smc socket->wq to clcsock->wq during the fallback. But this may cause a crash like this: general protection fault, probably for non-canonical address 0xdead000000000100: 0000 [#1] PREEMPT SMP PTI CPU: 3 PID: 0 Comm: swapper/3 Kdump: loaded Tainted: G E 5.16.0+ #107 RIP: 0010:__wake_up_common+0x65/0x170 Call Trace: <IRQ> __wake_up_common_lock+0x7a/0xc0 sock_def_readable+0x3c/0x70 tcp_data_queue+0x4a7/0xc40 tcp_rcv_established+0x32f/0x660 ? sk_filter_trim_cap+0xcb/0x2e0 tcp_v4_do_rcv+0x10b/0x260 tcp_v4_rcv+0xd2a/0xde0 ip_protocol_deliver_rcu+0x3b/0x1d0 ip_local_deliver_finish+0x54/0x60 ip_local_deliver+0x6a/0x110 ? tcp_v4_early_demux+0xa2/0x140 ? tcp_v4_early_demux+0x10d/0x140 ip_sublist_rcv_finish+0x49/0x60 ip_sublist_rcv+0x19d/0x230 ip_list_rcv+0x13e/0x170 __netif_receive_skb_list_core+0x1c2/0x240 netif_receive_skb_list_internal+0x1e6/0x320 napi_complete_done+0x11d/0x190 mlx5e_napi_poll+0x163/0x6b0 [mlx5_core] __napi_poll+0x3c/0x1b0 net_rx_action+0x27c/0x300 __do_softirq+0x114/0x2d2 irq_exit_rcu+0xb4/0xe0 common_interrupt+0xba/0xe0 </IRQ> <TASK> The crash is caused by privately transferring waitqueue entries from smc socket->wq to clcsock->wq. The owners of these entries, such as epoll, have no idea that the entries have been transferred to a different socket wait queue and still use original waitqueue spinlock (smc socket->wq.wait.lock) to make the entries operation exclusive, but it doesn't work. The operations to the entries, such as removing from the waitqueue (now is clcsock->wq after fallback), may cause a crash when clcsock waitqueue is being iterated over at the moment. This patch tries to fix this by no longer transferring wait queue entries privately, but introducing own implementations of clcsock's callback functions in fallback situation. The callback functions will forward the wakeup to smc socket->wq if clcsock->wq is actually woken up and smc socket->wq has remaining entries. Fixes: 2153bd1e3d3d ("net/smc: Transfer remaining wait queue entries during fallback") Suggested-by: Karsten Graul <kgraul@linux.ibm.com> Signed-off-by: Wen Gu <guwen@linux.alibaba.com> Acked-by: Karsten Graul <kgraul@linux.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-01-26 15:33:04 +00:00
read_lock_bh(&clcsk->sk_callback_lock);
smc = smc_clcsock_user_data(clcsk);
if (smc)
smc_fback_forward_wakeup(smc, clcsk,
smc->clcsk_data_ready);
read_unlock_bh(&clcsk->sk_callback_lock);
net/smc: Forward wakeup to smc socket waitqueue after fallback When we replace TCP with SMC and a fallback occurs, there may be some socket waitqueue entries remaining in smc socket->wq, such as eppoll_entries inserted by userspace applications. After the fallback, data flows over TCP/IP and only clcsocket->wq will be woken up. Applications can't be notified by the entries which were inserted in smc socket->wq before fallback. So we need a mechanism to wake up smc socket->wq at the same time if some entries remaining in it. The current workaround is to transfer the entries from smc socket->wq to clcsock->wq during the fallback. But this may cause a crash like this: general protection fault, probably for non-canonical address 0xdead000000000100: 0000 [#1] PREEMPT SMP PTI CPU: 3 PID: 0 Comm: swapper/3 Kdump: loaded Tainted: G E 5.16.0+ #107 RIP: 0010:__wake_up_common+0x65/0x170 Call Trace: <IRQ> __wake_up_common_lock+0x7a/0xc0 sock_def_readable+0x3c/0x70 tcp_data_queue+0x4a7/0xc40 tcp_rcv_established+0x32f/0x660 ? sk_filter_trim_cap+0xcb/0x2e0 tcp_v4_do_rcv+0x10b/0x260 tcp_v4_rcv+0xd2a/0xde0 ip_protocol_deliver_rcu+0x3b/0x1d0 ip_local_deliver_finish+0x54/0x60 ip_local_deliver+0x6a/0x110 ? tcp_v4_early_demux+0xa2/0x140 ? tcp_v4_early_demux+0x10d/0x140 ip_sublist_rcv_finish+0x49/0x60 ip_sublist_rcv+0x19d/0x230 ip_list_rcv+0x13e/0x170 __netif_receive_skb_list_core+0x1c2/0x240 netif_receive_skb_list_internal+0x1e6/0x320 napi_complete_done+0x11d/0x190 mlx5e_napi_poll+0x163/0x6b0 [mlx5_core] __napi_poll+0x3c/0x1b0 net_rx_action+0x27c/0x300 __do_softirq+0x114/0x2d2 irq_exit_rcu+0xb4/0xe0 common_interrupt+0xba/0xe0 </IRQ> <TASK> The crash is caused by privately transferring waitqueue entries from smc socket->wq to clcsock->wq. The owners of these entries, such as epoll, have no idea that the entries have been transferred to a different socket wait queue and still use original waitqueue spinlock (smc socket->wq.wait.lock) to make the entries operation exclusive, but it doesn't work. The operations to the entries, such as removing from the waitqueue (now is clcsock->wq after fallback), may cause a crash when clcsock waitqueue is being iterated over at the moment. This patch tries to fix this by no longer transferring wait queue entries privately, but introducing own implementations of clcsock's callback functions in fallback situation. The callback functions will forward the wakeup to smc socket->wq if clcsock->wq is actually woken up and smc socket->wq has remaining entries. Fixes: 2153bd1e3d3d ("net/smc: Transfer remaining wait queue entries during fallback") Suggested-by: Karsten Graul <kgraul@linux.ibm.com> Signed-off-by: Wen Gu <guwen@linux.alibaba.com> Acked-by: Karsten Graul <kgraul@linux.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-01-26 15:33:04 +00:00
}
static void smc_fback_write_space(struct sock *clcsk)
{
struct smc_sock *smc;
net/smc: Forward wakeup to smc socket waitqueue after fallback When we replace TCP with SMC and a fallback occurs, there may be some socket waitqueue entries remaining in smc socket->wq, such as eppoll_entries inserted by userspace applications. After the fallback, data flows over TCP/IP and only clcsocket->wq will be woken up. Applications can't be notified by the entries which were inserted in smc socket->wq before fallback. So we need a mechanism to wake up smc socket->wq at the same time if some entries remaining in it. The current workaround is to transfer the entries from smc socket->wq to clcsock->wq during the fallback. But this may cause a crash like this: general protection fault, probably for non-canonical address 0xdead000000000100: 0000 [#1] PREEMPT SMP PTI CPU: 3 PID: 0 Comm: swapper/3 Kdump: loaded Tainted: G E 5.16.0+ #107 RIP: 0010:__wake_up_common+0x65/0x170 Call Trace: <IRQ> __wake_up_common_lock+0x7a/0xc0 sock_def_readable+0x3c/0x70 tcp_data_queue+0x4a7/0xc40 tcp_rcv_established+0x32f/0x660 ? sk_filter_trim_cap+0xcb/0x2e0 tcp_v4_do_rcv+0x10b/0x260 tcp_v4_rcv+0xd2a/0xde0 ip_protocol_deliver_rcu+0x3b/0x1d0 ip_local_deliver_finish+0x54/0x60 ip_local_deliver+0x6a/0x110 ? tcp_v4_early_demux+0xa2/0x140 ? tcp_v4_early_demux+0x10d/0x140 ip_sublist_rcv_finish+0x49/0x60 ip_sublist_rcv+0x19d/0x230 ip_list_rcv+0x13e/0x170 __netif_receive_skb_list_core+0x1c2/0x240 netif_receive_skb_list_internal+0x1e6/0x320 napi_complete_done+0x11d/0x190 mlx5e_napi_poll+0x163/0x6b0 [mlx5_core] __napi_poll+0x3c/0x1b0 net_rx_action+0x27c/0x300 __do_softirq+0x114/0x2d2 irq_exit_rcu+0xb4/0xe0 common_interrupt+0xba/0xe0 </IRQ> <TASK> The crash is caused by privately transferring waitqueue entries from smc socket->wq to clcsock->wq. The owners of these entries, such as epoll, have no idea that the entries have been transferred to a different socket wait queue and still use original waitqueue spinlock (smc socket->wq.wait.lock) to make the entries operation exclusive, but it doesn't work. The operations to the entries, such as removing from the waitqueue (now is clcsock->wq after fallback), may cause a crash when clcsock waitqueue is being iterated over at the moment. This patch tries to fix this by no longer transferring wait queue entries privately, but introducing own implementations of clcsock's callback functions in fallback situation. The callback functions will forward the wakeup to smc socket->wq if clcsock->wq is actually woken up and smc socket->wq has remaining entries. Fixes: 2153bd1e3d3d ("net/smc: Transfer remaining wait queue entries during fallback") Suggested-by: Karsten Graul <kgraul@linux.ibm.com> Signed-off-by: Wen Gu <guwen@linux.alibaba.com> Acked-by: Karsten Graul <kgraul@linux.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-01-26 15:33:04 +00:00
read_lock_bh(&clcsk->sk_callback_lock);
smc = smc_clcsock_user_data(clcsk);
if (smc)
smc_fback_forward_wakeup(smc, clcsk,
smc->clcsk_write_space);
read_unlock_bh(&clcsk->sk_callback_lock);
net/smc: Forward wakeup to smc socket waitqueue after fallback When we replace TCP with SMC and a fallback occurs, there may be some socket waitqueue entries remaining in smc socket->wq, such as eppoll_entries inserted by userspace applications. After the fallback, data flows over TCP/IP and only clcsocket->wq will be woken up. Applications can't be notified by the entries which were inserted in smc socket->wq before fallback. So we need a mechanism to wake up smc socket->wq at the same time if some entries remaining in it. The current workaround is to transfer the entries from smc socket->wq to clcsock->wq during the fallback. But this may cause a crash like this: general protection fault, probably for non-canonical address 0xdead000000000100: 0000 [#1] PREEMPT SMP PTI CPU: 3 PID: 0 Comm: swapper/3 Kdump: loaded Tainted: G E 5.16.0+ #107 RIP: 0010:__wake_up_common+0x65/0x170 Call Trace: <IRQ> __wake_up_common_lock+0x7a/0xc0 sock_def_readable+0x3c/0x70 tcp_data_queue+0x4a7/0xc40 tcp_rcv_established+0x32f/0x660 ? sk_filter_trim_cap+0xcb/0x2e0 tcp_v4_do_rcv+0x10b/0x260 tcp_v4_rcv+0xd2a/0xde0 ip_protocol_deliver_rcu+0x3b/0x1d0 ip_local_deliver_finish+0x54/0x60 ip_local_deliver+0x6a/0x110 ? tcp_v4_early_demux+0xa2/0x140 ? tcp_v4_early_demux+0x10d/0x140 ip_sublist_rcv_finish+0x49/0x60 ip_sublist_rcv+0x19d/0x230 ip_list_rcv+0x13e/0x170 __netif_receive_skb_list_core+0x1c2/0x240 netif_receive_skb_list_internal+0x1e6/0x320 napi_complete_done+0x11d/0x190 mlx5e_napi_poll+0x163/0x6b0 [mlx5_core] __napi_poll+0x3c/0x1b0 net_rx_action+0x27c/0x300 __do_softirq+0x114/0x2d2 irq_exit_rcu+0xb4/0xe0 common_interrupt+0xba/0xe0 </IRQ> <TASK> The crash is caused by privately transferring waitqueue entries from smc socket->wq to clcsock->wq. The owners of these entries, such as epoll, have no idea that the entries have been transferred to a different socket wait queue and still use original waitqueue spinlock (smc socket->wq.wait.lock) to make the entries operation exclusive, but it doesn't work. The operations to the entries, such as removing from the waitqueue (now is clcsock->wq after fallback), may cause a crash when clcsock waitqueue is being iterated over at the moment. This patch tries to fix this by no longer transferring wait queue entries privately, but introducing own implementations of clcsock's callback functions in fallback situation. The callback functions will forward the wakeup to smc socket->wq if clcsock->wq is actually woken up and smc socket->wq has remaining entries. Fixes: 2153bd1e3d3d ("net/smc: Transfer remaining wait queue entries during fallback") Suggested-by: Karsten Graul <kgraul@linux.ibm.com> Signed-off-by: Wen Gu <guwen@linux.alibaba.com> Acked-by: Karsten Graul <kgraul@linux.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-01-26 15:33:04 +00:00
}
static void smc_fback_error_report(struct sock *clcsk)
{
struct smc_sock *smc;
net/smc: Forward wakeup to smc socket waitqueue after fallback When we replace TCP with SMC and a fallback occurs, there may be some socket waitqueue entries remaining in smc socket->wq, such as eppoll_entries inserted by userspace applications. After the fallback, data flows over TCP/IP and only clcsocket->wq will be woken up. Applications can't be notified by the entries which were inserted in smc socket->wq before fallback. So we need a mechanism to wake up smc socket->wq at the same time if some entries remaining in it. The current workaround is to transfer the entries from smc socket->wq to clcsock->wq during the fallback. But this may cause a crash like this: general protection fault, probably for non-canonical address 0xdead000000000100: 0000 [#1] PREEMPT SMP PTI CPU: 3 PID: 0 Comm: swapper/3 Kdump: loaded Tainted: G E 5.16.0+ #107 RIP: 0010:__wake_up_common+0x65/0x170 Call Trace: <IRQ> __wake_up_common_lock+0x7a/0xc0 sock_def_readable+0x3c/0x70 tcp_data_queue+0x4a7/0xc40 tcp_rcv_established+0x32f/0x660 ? sk_filter_trim_cap+0xcb/0x2e0 tcp_v4_do_rcv+0x10b/0x260 tcp_v4_rcv+0xd2a/0xde0 ip_protocol_deliver_rcu+0x3b/0x1d0 ip_local_deliver_finish+0x54/0x60 ip_local_deliver+0x6a/0x110 ? tcp_v4_early_demux+0xa2/0x140 ? tcp_v4_early_demux+0x10d/0x140 ip_sublist_rcv_finish+0x49/0x60 ip_sublist_rcv+0x19d/0x230 ip_list_rcv+0x13e/0x170 __netif_receive_skb_list_core+0x1c2/0x240 netif_receive_skb_list_internal+0x1e6/0x320 napi_complete_done+0x11d/0x190 mlx5e_napi_poll+0x163/0x6b0 [mlx5_core] __napi_poll+0x3c/0x1b0 net_rx_action+0x27c/0x300 __do_softirq+0x114/0x2d2 irq_exit_rcu+0xb4/0xe0 common_interrupt+0xba/0xe0 </IRQ> <TASK> The crash is caused by privately transferring waitqueue entries from smc socket->wq to clcsock->wq. The owners of these entries, such as epoll, have no idea that the entries have been transferred to a different socket wait queue and still use original waitqueue spinlock (smc socket->wq.wait.lock) to make the entries operation exclusive, but it doesn't work. The operations to the entries, such as removing from the waitqueue (now is clcsock->wq after fallback), may cause a crash when clcsock waitqueue is being iterated over at the moment. This patch tries to fix this by no longer transferring wait queue entries privately, but introducing own implementations of clcsock's callback functions in fallback situation. The callback functions will forward the wakeup to smc socket->wq if clcsock->wq is actually woken up and smc socket->wq has remaining entries. Fixes: 2153bd1e3d3d ("net/smc: Transfer remaining wait queue entries during fallback") Suggested-by: Karsten Graul <kgraul@linux.ibm.com> Signed-off-by: Wen Gu <guwen@linux.alibaba.com> Acked-by: Karsten Graul <kgraul@linux.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-01-26 15:33:04 +00:00
read_lock_bh(&clcsk->sk_callback_lock);
smc = smc_clcsock_user_data(clcsk);
if (smc)
smc_fback_forward_wakeup(smc, clcsk,
smc->clcsk_error_report);
read_unlock_bh(&clcsk->sk_callback_lock);
net/smc: Forward wakeup to smc socket waitqueue after fallback When we replace TCP with SMC and a fallback occurs, there may be some socket waitqueue entries remaining in smc socket->wq, such as eppoll_entries inserted by userspace applications. After the fallback, data flows over TCP/IP and only clcsocket->wq will be woken up. Applications can't be notified by the entries which were inserted in smc socket->wq before fallback. So we need a mechanism to wake up smc socket->wq at the same time if some entries remaining in it. The current workaround is to transfer the entries from smc socket->wq to clcsock->wq during the fallback. But this may cause a crash like this: general protection fault, probably for non-canonical address 0xdead000000000100: 0000 [#1] PREEMPT SMP PTI CPU: 3 PID: 0 Comm: swapper/3 Kdump: loaded Tainted: G E 5.16.0+ #107 RIP: 0010:__wake_up_common+0x65/0x170 Call Trace: <IRQ> __wake_up_common_lock+0x7a/0xc0 sock_def_readable+0x3c/0x70 tcp_data_queue+0x4a7/0xc40 tcp_rcv_established+0x32f/0x660 ? sk_filter_trim_cap+0xcb/0x2e0 tcp_v4_do_rcv+0x10b/0x260 tcp_v4_rcv+0xd2a/0xde0 ip_protocol_deliver_rcu+0x3b/0x1d0 ip_local_deliver_finish+0x54/0x60 ip_local_deliver+0x6a/0x110 ? tcp_v4_early_demux+0xa2/0x140 ? tcp_v4_early_demux+0x10d/0x140 ip_sublist_rcv_finish+0x49/0x60 ip_sublist_rcv+0x19d/0x230 ip_list_rcv+0x13e/0x170 __netif_receive_skb_list_core+0x1c2/0x240 netif_receive_skb_list_internal+0x1e6/0x320 napi_complete_done+0x11d/0x190 mlx5e_napi_poll+0x163/0x6b0 [mlx5_core] __napi_poll+0x3c/0x1b0 net_rx_action+0x27c/0x300 __do_softirq+0x114/0x2d2 irq_exit_rcu+0xb4/0xe0 common_interrupt+0xba/0xe0 </IRQ> <TASK> The crash is caused by privately transferring waitqueue entries from smc socket->wq to clcsock->wq. The owners of these entries, such as epoll, have no idea that the entries have been transferred to a different socket wait queue and still use original waitqueue spinlock (smc socket->wq.wait.lock) to make the entries operation exclusive, but it doesn't work. The operations to the entries, such as removing from the waitqueue (now is clcsock->wq after fallback), may cause a crash when clcsock waitqueue is being iterated over at the moment. This patch tries to fix this by no longer transferring wait queue entries privately, but introducing own implementations of clcsock's callback functions in fallback situation. The callback functions will forward the wakeup to smc socket->wq if clcsock->wq is actually woken up and smc socket->wq has remaining entries. Fixes: 2153bd1e3d3d ("net/smc: Transfer remaining wait queue entries during fallback") Suggested-by: Karsten Graul <kgraul@linux.ibm.com> Signed-off-by: Wen Gu <guwen@linux.alibaba.com> Acked-by: Karsten Graul <kgraul@linux.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-01-26 15:33:04 +00:00
}
static void smc_fback_replace_callbacks(struct smc_sock *smc)
{
struct sock *clcsk = smc->clcsock->sk;
write_lock_bh(&clcsk->sk_callback_lock);
clcsk->sk_user_data = (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY);
smc_clcsock_replace_cb(&clcsk->sk_state_change, smc_fback_state_change,
&smc->clcsk_state_change);
smc_clcsock_replace_cb(&clcsk->sk_data_ready, smc_fback_data_ready,
&smc->clcsk_data_ready);
smc_clcsock_replace_cb(&clcsk->sk_write_space, smc_fback_write_space,
&smc->clcsk_write_space);
smc_clcsock_replace_cb(&clcsk->sk_error_report, smc_fback_error_report,
&smc->clcsk_error_report);
write_unlock_bh(&clcsk->sk_callback_lock);
}
static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code)
{
int rc = 0;
mutex_lock(&smc->clcsock_release_lock);
if (!smc->clcsock) {
rc = -EBADF;
goto out;
}
net/smc: Forward wakeup to smc socket waitqueue after fallback When we replace TCP with SMC and a fallback occurs, there may be some socket waitqueue entries remaining in smc socket->wq, such as eppoll_entries inserted by userspace applications. After the fallback, data flows over TCP/IP and only clcsocket->wq will be woken up. Applications can't be notified by the entries which were inserted in smc socket->wq before fallback. So we need a mechanism to wake up smc socket->wq at the same time if some entries remaining in it. The current workaround is to transfer the entries from smc socket->wq to clcsock->wq during the fallback. But this may cause a crash like this: general protection fault, probably for non-canonical address 0xdead000000000100: 0000 [#1] PREEMPT SMP PTI CPU: 3 PID: 0 Comm: swapper/3 Kdump: loaded Tainted: G E 5.16.0+ #107 RIP: 0010:__wake_up_common+0x65/0x170 Call Trace: <IRQ> __wake_up_common_lock+0x7a/0xc0 sock_def_readable+0x3c/0x70 tcp_data_queue+0x4a7/0xc40 tcp_rcv_established+0x32f/0x660 ? sk_filter_trim_cap+0xcb/0x2e0 tcp_v4_do_rcv+0x10b/0x260 tcp_v4_rcv+0xd2a/0xde0 ip_protocol_deliver_rcu+0x3b/0x1d0 ip_local_deliver_finish+0x54/0x60 ip_local_deliver+0x6a/0x110 ? tcp_v4_early_demux+0xa2/0x140 ? tcp_v4_early_demux+0x10d/0x140 ip_sublist_rcv_finish+0x49/0x60 ip_sublist_rcv+0x19d/0x230 ip_list_rcv+0x13e/0x170 __netif_receive_skb_list_core+0x1c2/0x240 netif_receive_skb_list_internal+0x1e6/0x320 napi_complete_done+0x11d/0x190 mlx5e_napi_poll+0x163/0x6b0 [mlx5_core] __napi_poll+0x3c/0x1b0 net_rx_action+0x27c/0x300 __do_softirq+0x114/0x2d2 irq_exit_rcu+0xb4/0xe0 common_interrupt+0xba/0xe0 </IRQ> <TASK> The crash is caused by privately transferring waitqueue entries from smc socket->wq to clcsock->wq. The owners of these entries, such as epoll, have no idea that the entries have been transferred to a different socket wait queue and still use original waitqueue spinlock (smc socket->wq.wait.lock) to make the entries operation exclusive, but it doesn't work. The operations to the entries, such as removing from the waitqueue (now is clcsock->wq after fallback), may cause a crash when clcsock waitqueue is being iterated over at the moment. This patch tries to fix this by no longer transferring wait queue entries privately, but introducing own implementations of clcsock's callback functions in fallback situation. The callback functions will forward the wakeup to smc socket->wq if clcsock->wq is actually woken up and smc socket->wq has remaining entries. Fixes: 2153bd1e3d3d ("net/smc: Transfer remaining wait queue entries during fallback") Suggested-by: Karsten Graul <kgraul@linux.ibm.com> Signed-off-by: Wen Gu <guwen@linux.alibaba.com> Acked-by: Karsten Graul <kgraul@linux.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-01-26 15:33:04 +00:00
smc->use_fallback = true;
smc->fallback_rsn = reason_code;
smc_stat_fallback(smc);
trace_smc_switch_to_fallback(smc, reason_code);
if (smc->sk.sk_socket && smc->sk.sk_socket->file) {
smc->clcsock->file = smc->sk.sk_socket->file;
smc->clcsock->file->private_data = smc->clcsock;
smc->clcsock->wq.fasync_list =
smc->sk.sk_socket->wq.fasync_list;
smc->sk.sk_socket->wq.fasync_list = NULL;
net/smc: Forward wakeup to smc socket waitqueue after fallback When we replace TCP with SMC and a fallback occurs, there may be some socket waitqueue entries remaining in smc socket->wq, such as eppoll_entries inserted by userspace applications. After the fallback, data flows over TCP/IP and only clcsocket->wq will be woken up. Applications can't be notified by the entries which were inserted in smc socket->wq before fallback. So we need a mechanism to wake up smc socket->wq at the same time if some entries remaining in it. The current workaround is to transfer the entries from smc socket->wq to clcsock->wq during the fallback. But this may cause a crash like this: general protection fault, probably for non-canonical address 0xdead000000000100: 0000 [#1] PREEMPT SMP PTI CPU: 3 PID: 0 Comm: swapper/3 Kdump: loaded Tainted: G E 5.16.0+ #107 RIP: 0010:__wake_up_common+0x65/0x170 Call Trace: <IRQ> __wake_up_common_lock+0x7a/0xc0 sock_def_readable+0x3c/0x70 tcp_data_queue+0x4a7/0xc40 tcp_rcv_established+0x32f/0x660 ? sk_filter_trim_cap+0xcb/0x2e0 tcp_v4_do_rcv+0x10b/0x260 tcp_v4_rcv+0xd2a/0xde0 ip_protocol_deliver_rcu+0x3b/0x1d0 ip_local_deliver_finish+0x54/0x60 ip_local_deliver+0x6a/0x110 ? tcp_v4_early_demux+0xa2/0x140 ? tcp_v4_early_demux+0x10d/0x140 ip_sublist_rcv_finish+0x49/0x60 ip_sublist_rcv+0x19d/0x230 ip_list_rcv+0x13e/0x170 __netif_receive_skb_list_core+0x1c2/0x240 netif_receive_skb_list_internal+0x1e6/0x320 napi_complete_done+0x11d/0x190 mlx5e_napi_poll+0x163/0x6b0 [mlx5_core] __napi_poll+0x3c/0x1b0 net_rx_action+0x27c/0x300 __do_softirq+0x114/0x2d2 irq_exit_rcu+0xb4/0xe0 common_interrupt+0xba/0xe0 </IRQ> <TASK> The crash is caused by privately transferring waitqueue entries from smc socket->wq to clcsock->wq. The owners of these entries, such as epoll, have no idea that the entries have been transferred to a different socket wait queue and still use original waitqueue spinlock (smc socket->wq.wait.lock) to make the entries operation exclusive, but it doesn't work. The operations to the entries, such as removing from the waitqueue (now is clcsock->wq after fallback), may cause a crash when clcsock waitqueue is being iterated over at the moment. This patch tries to fix this by no longer transferring wait queue entries privately, but introducing own implementations of clcsock's callback functions in fallback situation. The callback functions will forward the wakeup to smc socket->wq if clcsock->wq is actually woken up and smc socket->wq has remaining entries. Fixes: 2153bd1e3d3d ("net/smc: Transfer remaining wait queue entries during fallback") Suggested-by: Karsten Graul <kgraul@linux.ibm.com> Signed-off-by: Wen Gu <guwen@linux.alibaba.com> Acked-by: Karsten Graul <kgraul@linux.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-01-26 15:33:04 +00:00
/* There might be some wait entries remaining
* in smc sk->sk_wq and they should be woken up
* as clcsock's wait queue is woken up.
*/
smc_fback_replace_callbacks(smc);
}
out:
mutex_unlock(&smc->clcsock_release_lock);
return rc;
}
/* fall back during connect */
static int smc_connect_fallback(struct smc_sock *smc, int reason_code)
{
struct net *net = sock_net(&smc->sk);
int rc = 0;
rc = smc_switch_to_fallback(smc, reason_code);
if (rc) { /* fallback fails */
this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt);
if (smc->sk.sk_state == SMC_INIT)
sock_put(&smc->sk); /* passive closing */
return rc;
}
smc_copy_sock_settings_to_clc(smc);
smc->connect_nonblock = 0;
if (smc->sk.sk_state == SMC_INIT)
smc->sk.sk_state = SMC_ACTIVE;
return 0;
}
/* decline and fall back during connect */
static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code,
u8 version)
{
struct net *net = sock_net(&smc->sk);
int rc;
if (reason_code < 0) { /* error, fallback is not possible */
this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt);
if (smc->sk.sk_state == SMC_INIT)
sock_put(&smc->sk); /* passive closing */
return reason_code;
}
if (reason_code != SMC_CLC_DECL_PEERDECL) {
rc = smc_clc_send_decline(smc, reason_code, version);
if (rc < 0) {
this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt);
if (smc->sk.sk_state == SMC_INIT)
sock_put(&smc->sk); /* passive closing */
return rc;
}
}
return smc_connect_fallback(smc, reason_code);
}
static void smc_conn_abort(struct smc_sock *smc, int local_first)
{
struct smc_connection *conn = &smc->conn;
struct smc_link_group *lgr = conn->lgr;
bool lgr_valid = false;
if (smc_conn_lgr_valid(conn))
lgr_valid = true;
smc_conn_free(conn);
if (local_first && lgr_valid)
smc_lgr_cleanup_early(lgr);
}
/* check if there is a rdma device available for this connection. */
/* called for connect and listen */
static int smc_find_rdma_device(struct smc_sock *smc, struct smc_init_info *ini)
{
/* PNET table look up: search active ib_device and port
* within same PNETID that also contains the ethernet device
* used for the internal TCP socket
*/
smc_pnet_find_roce_resource(smc->clcsock->sk, ini);
if (!ini->check_smcrv2 && !ini->ib_dev)
return SMC_CLC_DECL_NOSMCRDEV;
if (ini->check_smcrv2 && !ini->smcrv2.ib_dev_v2)
return SMC_CLC_DECL_NOSMCRDEV;
return 0;
}
/* check if there is an ISM device available for this connection. */
/* called for connect and listen */
static int smc_find_ism_device(struct smc_sock *smc, struct smc_init_info *ini)
{
/* Find ISM device with same PNETID as connecting interface */
smc_pnet_find_ism_resource(smc->clcsock->sk, ini);
if (!ini->ism_dev[0])
return SMC_CLC_DECL_NOSMCDDEV;
else
ini->ism_chid[0] = smc_ism_get_chid(ini->ism_dev[0]);
return 0;
}
/* is chid unique for the ism devices that are already determined? */
static bool smc_find_ism_v2_is_unique_chid(u16 chid, struct smc_init_info *ini,
int cnt)
{
int i = (!ini->ism_dev[0]) ? 1 : 0;
for (; i < cnt; i++)
if (ini->ism_chid[i] == chid)
return false;
return true;
}
/* determine possible V2 ISM devices (either without PNETID or with PNETID plus
* PNETID matching net_device)
*/
static int smc_find_ism_v2_device_clnt(struct smc_sock *smc,
struct smc_init_info *ini)
{
int rc = SMC_CLC_DECL_NOSMCDDEV;
struct smcd_dev *smcd;
int i = 1, entry = 1;
bool is_emulated;
u16 chid;
if (smcd_indicated(ini->smc_type_v1))
rc = 0; /* already initialized for V1 */
mutex_lock(&smcd_dev_list.mutex);
list_for_each_entry(smcd, &smcd_dev_list.list, list) {
if (smcd->going_away || smcd == ini->ism_dev[0])
continue;
chid = smc_ism_get_chid(smcd);
if (!smc_find_ism_v2_is_unique_chid(chid, ini, i))
continue;
is_emulated = __smc_ism_is_emulated(chid);
if (!smc_pnet_is_pnetid_set(smcd->pnetid) ||
smc_pnet_is_ndev_pnetid(sock_net(&smc->sk), smcd->pnetid)) {
if (is_emulated && entry == SMCD_CLC_MAX_V2_GID_ENTRIES)
/* It's the last GID-CHID entry left in CLC
* Proposal SMC-Dv2 extension, but an Emulated-
* ISM device will take two entries. So give
* up it and try the next potential ISM device.
*/
continue;
ini->ism_dev[i] = smcd;
ini->ism_chid[i] = chid;
ini->is_smcd = true;
rc = 0;
i++;
entry = is_emulated ? entry + 2 : entry + 1;
if (entry > SMCD_CLC_MAX_V2_GID_ENTRIES)
break;
}
}
mutex_unlock(&smcd_dev_list.mutex);
ini->ism_offered_cnt = i - 1;
if (!ini->ism_dev[0] && !ini->ism_dev[1])
ini->smcd_version = 0;
return rc;
}
/* Check for VLAN ID and register it on ISM device just for CLC handshake */
static int smc_connect_ism_vlan_setup(struct smc_sock *smc,
struct smc_init_info *ini)
{
if (ini->vlan_id && smc_ism_get_vlan(ini->ism_dev[0], ini->vlan_id))
return SMC_CLC_DECL_ISMVLANERR;
return 0;
}
static int smc_find_proposal_devices(struct smc_sock *smc,
struct smc_init_info *ini)
{
int rc = 0;
/* check if there is an ism device available */
if (!(ini->smcd_version & SMC_V1) ||
smc_find_ism_device(smc, ini) ||
smc_connect_ism_vlan_setup(smc, ini))
ini->smcd_version &= ~SMC_V1;
/* else ISM V1 is supported for this connection */
/* check if there is an rdma device available */
if (!(ini->smcr_version & SMC_V1) ||
smc_find_rdma_device(smc, ini))
ini->smcr_version &= ~SMC_V1;
/* else RDMA is supported for this connection */
ini->smc_type_v1 = smc_indicated_type(ini->smcd_version & SMC_V1,
ini->smcr_version & SMC_V1);
/* check if there is an ism v2 device available */
if (!(ini->smcd_version & SMC_V2) ||
!smc_ism_is_v2_capable() ||
smc_find_ism_v2_device_clnt(smc, ini))
ini->smcd_version &= ~SMC_V2;
/* check if there is an rdma v2 device available */
ini->check_smcrv2 = true;
ini->smcrv2.saddr = smc->clcsock->sk->sk_rcv_saddr;
if (!(ini->smcr_version & SMC_V2) ||
smc->clcsock->sk->sk_family != AF_INET ||
!smc_clc_ueid_count() ||
smc_find_rdma_device(smc, ini))
ini->smcr_version &= ~SMC_V2;
ini->check_smcrv2 = false;
ini->smc_type_v2 = smc_indicated_type(ini->smcd_version & SMC_V2,
ini->smcr_version & SMC_V2);
/* if neither ISM nor RDMA are supported, fallback */
if (ini->smc_type_v1 == SMC_TYPE_N && ini->smc_type_v2 == SMC_TYPE_N)
rc = SMC_CLC_DECL_NOSMCDEV;
return rc;
}
/* cleanup temporary VLAN ID registration used for CLC handshake. If ISM is
* used, the VLAN ID will be registered again during the connection setup.
*/
static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc,
struct smc_init_info *ini)
{
if (!smcd_indicated(ini->smc_type_v1))
return 0;
if (ini->vlan_id && smc_ism_put_vlan(ini->ism_dev[0], ini->vlan_id))
return SMC_CLC_DECL_CNFERR;
return 0;
}
#define SMC_CLC_MAX_ACCEPT_LEN \
(sizeof(struct smc_clc_msg_accept_confirm) + \
net/smc: add vendor unique experimental options area in clc handshake Add vendor unique experimental options area in clc handshake. In clc accept and confirm msg, vendor unique experimental options use the 16-Bytes reserved field, which defined in struct smc_clc_fce_gid_ext in previous version. Because of the struct smc_clc_first_contact_ext is widely used and limit the scope of modification, this patch moves the 16-Bytes reserved field out of struct smc_clc_fce_gid_ext, and followed with the struct smc_clc_first_contact_ext in a new struct names struct smc_clc_first_contact_ext_v2x. For SMC-R first connection, in previous version, the struct smc_clc_ first_contact_ext and the 16-Bytes reserved field has already been included in clc accept and confirm msg. Thus, this patch use struct smc_clc_first_contact_ext_v2x instead of the struct smc_clc_first_ contact_ext and the 16-Bytes reserved field in SMC-R clc accept and confirm msg is compatible with previous version. For SMC-D first connection, in previous version, only the struct smc_ clc_first_contact_ext is included in clc accept and confirm msg, and the 16-Bytes reserved field is not included. Thus, when the negotiated smc release version is the version before v2.1, we still use struct smc_clc_first_contact_ext for compatible consideration. If the negotiated smc release version is v2.1 or later, use struct smc_clc_first_contact_ ext_v2x instead. Signed-off-by: Guangguan Wang <guangguan.wang@linux.alibaba.com> Reviewed-by: Tony Lu <tonylu@linux.alibaba.com> Reviewed-by: Jan Karcher <jaka@linux.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2023-08-17 13:20:28 +00:00
sizeof(struct smc_clc_first_contact_ext_v2x) + \
sizeof(struct smc_clc_msg_trail))
/* CLC handshake during connect */
static int smc_connect_clc(struct smc_sock *smc,
struct smc_clc_msg_accept_confirm *aclc,
struct smc_init_info *ini)
{
int rc = 0;
/* do inband token exchange */
rc = smc_clc_send_proposal(smc, ini);
if (rc)
return rc;
/* receive SMC Accept CLC message */
return smc_clc_wait_msg(smc, aclc, SMC_CLC_MAX_ACCEPT_LEN,
SMC_CLC_ACCEPT, CLC_WAIT_TIME);
}
void smc_fill_gid_list(struct smc_link_group *lgr,
struct smc_gidlist *gidlist,
struct smc_ib_device *known_dev, u8 *known_gid)
{
struct smc_init_info *alt_ini = NULL;
memset(gidlist, 0, sizeof(*gidlist));
memcpy(gidlist->list[gidlist->len++], known_gid, SMC_GID_SIZE);
alt_ini = kzalloc(sizeof(*alt_ini), GFP_KERNEL);
if (!alt_ini)
goto out;
alt_ini->vlan_id = lgr->vlan_id;
alt_ini->check_smcrv2 = true;
alt_ini->smcrv2.saddr = lgr->saddr;
smc_pnet_find_alt_roce(lgr, alt_ini, known_dev);
if (!alt_ini->smcrv2.ib_dev_v2)
goto out;
memcpy(gidlist->list[gidlist->len++], alt_ini->smcrv2.ib_gid_v2,
SMC_GID_SIZE);
out:
kfree(alt_ini);
}
static int smc_connect_rdma_v2_prepare(struct smc_sock *smc,
struct smc_clc_msg_accept_confirm *aclc,
struct smc_init_info *ini)
{
struct smc_clc_first_contact_ext *fce =
smc_get_clc_first_contact_ext(aclc, false);
struct net *net = sock_net(&smc->sk);
int rc;
if (!ini->first_contact_peer || aclc->hdr.version == SMC_V1)
return 0;
if (fce->v2_direct) {
memcpy(ini->smcrv2.nexthop_mac, &aclc->r0.lcl.mac, ETH_ALEN);
ini->smcrv2.uses_gateway = false;
} else {
if (smc_ib_find_route(net, smc->clcsock->sk->sk_rcv_saddr,
smc_ib_gid_to_ipv4(aclc->r0.lcl.gid),
ini->smcrv2.nexthop_mac,
&ini->smcrv2.uses_gateway))
return SMC_CLC_DECL_NOROUTE;
if (!ini->smcrv2.uses_gateway) {
/* mismatch: peer claims indirect, but its direct */
return SMC_CLC_DECL_NOINDIRECT;
}
}
ini->release_nr = fce->release;
rc = smc_clc_clnt_v2x_features_validate(fce, ini);
if (rc)
return rc;
return 0;
}
/* setup for RDMA connection of client */
static int smc_connect_rdma(struct smc_sock *smc,
struct smc_clc_msg_accept_confirm *aclc,
struct smc_init_info *ini)
{
int i, reason_code = 0;
struct smc_link *link;
u8 *eid = NULL;
ini->is_smcd = false;
ini->ib_clcqpn = ntoh24(aclc->r0.qpn);
ini->first_contact_peer = aclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK;
memcpy(ini->peer_systemid, aclc->r0.lcl.id_for_peer, SMC_SYSTEMID_LEN);
memcpy(ini->peer_gid, aclc->r0.lcl.gid, SMC_GID_SIZE);
memcpy(ini->peer_mac, aclc->r0.lcl.mac, ETH_ALEN);
ini->max_conns = SMC_CONN_PER_LGR_MAX;
ini->max_links = SMC_LINKS_ADD_LNK_MAX;
reason_code = smc_connect_rdma_v2_prepare(smc, aclc, ini);
if (reason_code)
return reason_code;
mutex_lock(&smc_client_lgr_pending);
reason_code = smc_conn_create(smc, ini);
if (reason_code) {
mutex_unlock(&smc_client_lgr_pending);
return reason_code;
}
smc_conn_save_peer_info(smc, aclc);
if (ini->first_contact_local) {
link = smc->conn.lnk;
} else {
/* set link that was assigned by server */
link = NULL;
for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
struct smc_link *l = &smc->conn.lgr->lnk[i];
if (l->peer_qpn == ntoh24(aclc->r0.qpn) &&
!memcmp(l->peer_gid, &aclc->r0.lcl.gid,
SMC_GID_SIZE) &&
(aclc->hdr.version > SMC_V1 ||
!memcmp(l->peer_mac, &aclc->r0.lcl.mac,
sizeof(l->peer_mac)))) {
link = l;
break;
}
}
if (!link) {
reason_code = SMC_CLC_DECL_NOSRVLINK;
goto connect_abort;
}
smc_switch_link_and_count(&smc->conn, link);
}
/* create send buffer and rmb */
if (smc_buf_create(smc, false)) {
reason_code = SMC_CLC_DECL_MEM;
goto connect_abort;
}
if (ini->first_contact_local)
smc_link_save_peer_info(link, aclc, ini);
if (smc_rmb_rtoken_handling(&smc->conn, link, aclc)) {
reason_code = SMC_CLC_DECL_ERR_RTOK;
goto connect_abort;
}
smc_close_init(smc);
smc_rx_init(smc);
if (ini->first_contact_local) {
if (smc_ib_ready_link(link)) {
reason_code = SMC_CLC_DECL_ERR_RDYLNK;
goto connect_abort;
}
} else {
net/smc: Allow virtually contiguous sndbufs or RMBs for SMC-R On long-running enterprise production servers, high-order contiguous memory pages are usually very rare and in most cases we can only get fragmented pages. When replacing TCP with SMC-R in such production scenarios, attempting to allocate high-order physically contiguous sndbufs and RMBs may result in frequent memory compaction, which will cause unexpected hung issue and further stability risks. So this patch is aimed to allow SMC-R link group to use virtually contiguous sndbufs and RMBs to avoid potential issues mentioned above. Whether to use physically or virtually contiguous buffers can be set by sysctl smcr_buf_type. Note that using virtually contiguous buffers will bring an acceptable performance regression, which can be mainly divided into two parts: 1) regression in data path, which is brought by additional address translation of sndbuf by RNIC in Tx. But in general, translating address through MTT is fast. Taking 256KB sndbuf and RMB as an example, the comparisons in qperf latency and bandwidth test with physically and virtually contiguous buffers are as follows: - client: smc_run taskset -c <cpu> qperf <server> -oo msg_size:1:64K:*2\ -t 5 -vu tcp_{bw|lat} - server: smc_run taskset -c <cpu> qperf [latency] msgsize tcp smcr smcr-use-virt-buf 1 11.17 us 7.56 us 7.51 us (-0.67%) 2 10.65 us 7.74 us 7.56 us (-2.31%) 4 11.11 us 7.52 us 7.59 us ( 0.84%) 8 10.83 us 7.55 us 7.51 us (-0.48%) 16 11.21 us 7.46 us 7.51 us ( 0.71%) 32 10.65 us 7.53 us 7.58 us ( 0.61%) 64 10.95 us 7.74 us 7.80 us ( 0.76%) 128 11.14 us 7.83 us 7.87 us ( 0.47%) 256 10.97 us 7.94 us 7.92 us (-0.28%) 512 11.23 us 7.94 us 8.20 us ( 3.25%) 1024 11.60 us 8.12 us 8.20 us ( 0.96%) 2048 14.04 us 8.30 us 8.51 us ( 2.49%) 4096 16.88 us 9.13 us 9.07 us (-0.64%) 8192 22.50 us 10.56 us 11.22 us ( 6.26%) 16384 28.99 us 12.88 us 13.83 us ( 7.37%) 32768 40.13 us 16.76 us 16.95 us ( 1.16%) 65536 68.70 us 24.68 us 24.85 us ( 0.68%) [bandwidth] msgsize tcp smcr smcr-use-virt-buf 1 1.65 MB/s 1.59 MB/s 1.53 MB/s (-3.88%) 2 3.32 MB/s 3.17 MB/s 3.08 MB/s (-2.67%) 4 6.66 MB/s 6.33 MB/s 6.09 MB/s (-3.85%) 8 13.67 MB/s 13.45 MB/s 11.97 MB/s (-10.99%) 16 25.36 MB/s 27.15 MB/s 24.16 MB/s (-11.01%) 32 48.22 MB/s 54.24 MB/s 49.41 MB/s (-8.89%) 64 106.79 MB/s 107.32 MB/s 99.05 MB/s (-7.71%) 128 210.21 MB/s 202.46 MB/s 201.02 MB/s (-0.71%) 256 400.81 MB/s 416.81 MB/s 393.52 MB/s (-5.59%) 512 746.49 MB/s 834.12 MB/s 809.99 MB/s (-2.89%) 1024 1292.33 MB/s 1641.96 MB/s 1571.82 MB/s (-4.27%) 2048 2007.64 MB/s 2760.44 MB/s 2717.68 MB/s (-1.55%) 4096 2665.17 MB/s 4157.44 MB/s 4070.76 MB/s (-2.09%) 8192 3159.72 MB/s 4361.57 MB/s 4270.65 MB/s (-2.08%) 16384 4186.70 MB/s 4574.13 MB/s 4501.17 MB/s (-1.60%) 32768 4093.21 MB/s 4487.42 MB/s 4322.43 MB/s (-3.68%) 65536 4057.14 MB/s 4735.61 MB/s 4555.17 MB/s (-3.81%) 2) regression in buffer initialization and destruction path, which is brought by additional MR operations of sndbufs. But thanks to link group buffer reuse mechanism, the impact of this kind of regression decreases as times of buffer reuse increases. Taking 256KB sndbuf and RMB as an example, latency of some key SMC-R buffer-related function obtained by bpftrace are as follows: Function Phys-bufs Virt-bufs smcr_new_buf_create() 67154 ns 79164 ns smc_ib_buf_map_sg() 525 ns 928 ns smc_ib_get_memory_region() 162294 ns 161191 ns smc_wr_reg_send() 9957 ns 9635 ns smc_ib_put_memory_region() 203548 ns 198374 ns smc_ib_buf_unmap_sg() 508 ns 1158 ns ------------ Test environment notes: 1. Above tests run on 2 VMs within the same Host. 2. The NIC is ConnectX-4Lx, using SRIOV and passing through 2 VFs to the each VM respectively. 3. VMs' vCPUs are binded to different physical CPUs, and the binded physical CPUs are isolated by `isolcpus=xxx` cmdline. 4. NICs' queue number are set to 1. Signed-off-by: Wen Gu <guwen@linux.alibaba.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-07-14 09:44:04 +00:00
/* reg sendbufs if they were vzalloced */
if (smc->conn.sndbuf_desc->is_vm) {
if (smcr_lgr_reg_sndbufs(link, smc->conn.sndbuf_desc)) {
reason_code = SMC_CLC_DECL_ERR_REGBUF;
goto connect_abort;
}
}
if (smcr_lgr_reg_rmbs(link, smc->conn.rmb_desc)) {
net/smc: Allow virtually contiguous sndbufs or RMBs for SMC-R On long-running enterprise production servers, high-order contiguous memory pages are usually very rare and in most cases we can only get fragmented pages. When replacing TCP with SMC-R in such production scenarios, attempting to allocate high-order physically contiguous sndbufs and RMBs may result in frequent memory compaction, which will cause unexpected hung issue and further stability risks. So this patch is aimed to allow SMC-R link group to use virtually contiguous sndbufs and RMBs to avoid potential issues mentioned above. Whether to use physically or virtually contiguous buffers can be set by sysctl smcr_buf_type. Note that using virtually contiguous buffers will bring an acceptable performance regression, which can be mainly divided into two parts: 1) regression in data path, which is brought by additional address translation of sndbuf by RNIC in Tx. But in general, translating address through MTT is fast. Taking 256KB sndbuf and RMB as an example, the comparisons in qperf latency and bandwidth test with physically and virtually contiguous buffers are as follows: - client: smc_run taskset -c <cpu> qperf <server> -oo msg_size:1:64K:*2\ -t 5 -vu tcp_{bw|lat} - server: smc_run taskset -c <cpu> qperf [latency] msgsize tcp smcr smcr-use-virt-buf 1 11.17 us 7.56 us 7.51 us (-0.67%) 2 10.65 us 7.74 us 7.56 us (-2.31%) 4 11.11 us 7.52 us 7.59 us ( 0.84%) 8 10.83 us 7.55 us 7.51 us (-0.48%) 16 11.21 us 7.46 us 7.51 us ( 0.71%) 32 10.65 us 7.53 us 7.58 us ( 0.61%) 64 10.95 us 7.74 us 7.80 us ( 0.76%) 128 11.14 us 7.83 us 7.87 us ( 0.47%) 256 10.97 us 7.94 us 7.92 us (-0.28%) 512 11.23 us 7.94 us 8.20 us ( 3.25%) 1024 11.60 us 8.12 us 8.20 us ( 0.96%) 2048 14.04 us 8.30 us 8.51 us ( 2.49%) 4096 16.88 us 9.13 us 9.07 us (-0.64%) 8192 22.50 us 10.56 us 11.22 us ( 6.26%) 16384 28.99 us 12.88 us 13.83 us ( 7.37%) 32768 40.13 us 16.76 us 16.95 us ( 1.16%) 65536 68.70 us 24.68 us 24.85 us ( 0.68%) [bandwidth] msgsize tcp smcr smcr-use-virt-buf 1 1.65 MB/s 1.59 MB/s 1.53 MB/s (-3.88%) 2 3.32 MB/s 3.17 MB/s 3.08 MB/s (-2.67%) 4 6.66 MB/s 6.33 MB/s 6.09 MB/s (-3.85%) 8 13.67 MB/s 13.45 MB/s 11.97 MB/s (-10.99%) 16 25.36 MB/s 27.15 MB/s 24.16 MB/s (-11.01%) 32 48.22 MB/s 54.24 MB/s 49.41 MB/s (-8.89%) 64 106.79 MB/s 107.32 MB/s 99.05 MB/s (-7.71%) 128 210.21 MB/s 202.46 MB/s 201.02 MB/s (-0.71%) 256 400.81 MB/s 416.81 MB/s 393.52 MB/s (-5.59%) 512 746.49 MB/s 834.12 MB/s 809.99 MB/s (-2.89%) 1024 1292.33 MB/s 1641.96 MB/s 1571.82 MB/s (-4.27%) 2048 2007.64 MB/s 2760.44 MB/s 2717.68 MB/s (-1.55%) 4096 2665.17 MB/s 4157.44 MB/s 4070.76 MB/s (-2.09%) 8192 3159.72 MB/s 4361.57 MB/s 4270.65 MB/s (-2.08%) 16384 4186.70 MB/s 4574.13 MB/s 4501.17 MB/s (-1.60%) 32768 4093.21 MB/s 4487.42 MB/s 4322.43 MB/s (-3.68%) 65536 4057.14 MB/s 4735.61 MB/s 4555.17 MB/s (-3.81%) 2) regression in buffer initialization and destruction path, which is brought by additional MR operations of sndbufs. But thanks to link group buffer reuse mechanism, the impact of this kind of regression decreases as times of buffer reuse increases. Taking 256KB sndbuf and RMB as an example, latency of some key SMC-R buffer-related function obtained by bpftrace are as follows: Function Phys-bufs Virt-bufs smcr_new_buf_create() 67154 ns 79164 ns smc_ib_buf_map_sg() 525 ns 928 ns smc_ib_get_memory_region() 162294 ns 161191 ns smc_wr_reg_send() 9957 ns 9635 ns smc_ib_put_memory_region() 203548 ns 198374 ns smc_ib_buf_unmap_sg() 508 ns 1158 ns ------------ Test environment notes: 1. Above tests run on 2 VMs within the same Host. 2. The NIC is ConnectX-4Lx, using SRIOV and passing through 2 VFs to the each VM respectively. 3. VMs' vCPUs are binded to different physical CPUs, and the binded physical CPUs are isolated by `isolcpus=xxx` cmdline. 4. NICs' queue number are set to 1. Signed-off-by: Wen Gu <guwen@linux.alibaba.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-07-14 09:44:04 +00:00
reason_code = SMC_CLC_DECL_ERR_REGBUF;
goto connect_abort;
}
}
if (aclc->hdr.version > SMC_V1) {
eid = aclc->r1.eid;
if (ini->first_contact_local)
smc_fill_gid_list(link->lgr, &ini->smcrv2.gidlist,
link->smcibdev, link->gid);
}
reason_code = smc_clc_send_confirm(smc, ini->first_contact_local,
aclc->hdr.version, eid, ini);
if (reason_code)
goto connect_abort;
smc_tx_init(smc);
if (ini->first_contact_local) {
/* QP confirmation over RoCE fabric */
smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK);
reason_code = smcr_clnt_conf_first_link(smc);
smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl);
if (reason_code)
goto connect_abort;
}
mutex_unlock(&smc_client_lgr_pending);
smc_copy_sock_settings_to_clc(smc);
smc->connect_nonblock = 0;
if (smc->sk.sk_state == SMC_INIT)
smc->sk.sk_state = SMC_ACTIVE;
return 0;
connect_abort:
smc_conn_abort(smc, ini->first_contact_local);
mutex_unlock(&smc_client_lgr_pending);
smc->connect_nonblock = 0;
return reason_code;
}
/* The server has chosen one of the proposed ISM devices for the communication.
* Determine from the CHID of the received CLC ACCEPT the ISM device chosen.
*/
static int
smc_v2_determine_accepted_chid(struct smc_clc_msg_accept_confirm *aclc,
struct smc_init_info *ini)
{
int i;
for (i = 0; i < ini->ism_offered_cnt + 1; i++) {
if (ini->ism_chid[i] == ntohs(aclc->d1.chid)) {
ini->ism_selected = i;
return 0;
}
}
return -EPROTO;
}
/* setup for ISM connection of client */
static int smc_connect_ism(struct smc_sock *smc,
struct smc_clc_msg_accept_confirm *aclc,
struct smc_init_info *ini)
{
u8 *eid = NULL;
int rc = 0;
ini->is_smcd = true;
ini->first_contact_peer = aclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK;
if (aclc->hdr.version == SMC_V2) {
if (ini->first_contact_peer) {
struct smc_clc_first_contact_ext *fce =
smc_get_clc_first_contact_ext(aclc, true);
ini->release_nr = fce->release;
rc = smc_clc_clnt_v2x_features_validate(fce, ini);
if (rc)
return rc;
}
rc = smc_v2_determine_accepted_chid(aclc, ini);
if (rc)
return rc;
if (__smc_ism_is_emulated(ini->ism_chid[ini->ism_selected]))
ini->ism_peer_gid[ini->ism_selected].gid_ext =
ntohll(aclc->d1.gid_ext);
/* for non-Emulated-ISM devices, peer gid_ext remains 0. */
}
ini->ism_peer_gid[ini->ism_selected].gid = ntohll(aclc->d0.gid);
/* there is only one lgr role for SMC-D; use server lock */
mutex_lock(&smc_server_lgr_pending);
rc = smc_conn_create(smc, ini);
if (rc) {
mutex_unlock(&smc_server_lgr_pending);
return rc;
}
/* Create send and receive buffers */
rc = smc_buf_create(smc, true);
if (rc) {
rc = (rc == -ENOSPC) ? SMC_CLC_DECL_MAX_DMB : SMC_CLC_DECL_MEM;
goto connect_abort;
}
smc_conn_save_peer_info(smc, aclc);
if (smc_ism_support_dmb_nocopy(smc->conn.lgr->smcd)) {
rc = smcd_buf_attach(smc);
if (rc) {
rc = SMC_CLC_DECL_MEM; /* try to fallback */
goto connect_abort;
}
}
smc_close_init(smc);
smc_rx_init(smc);
smc_tx_init(smc);
if (aclc->hdr.version > SMC_V1)
eid = aclc->d1.eid;
rc = smc_clc_send_confirm(smc, ini->first_contact_local,
aclc->hdr.version, eid, ini);
if (rc)
goto connect_abort;
mutex_unlock(&smc_server_lgr_pending);
smc_copy_sock_settings_to_clc(smc);
smc->connect_nonblock = 0;
if (smc->sk.sk_state == SMC_INIT)
smc->sk.sk_state = SMC_ACTIVE;
return 0;
connect_abort:
smc_conn_abort(smc, ini->first_contact_local);
mutex_unlock(&smc_server_lgr_pending);
smc->connect_nonblock = 0;
return rc;
}
/* check if received accept type and version matches a proposed one */
static int smc_connect_check_aclc(struct smc_init_info *ini,
struct smc_clc_msg_accept_confirm *aclc)
{
if (aclc->hdr.typev1 != SMC_TYPE_R &&
aclc->hdr.typev1 != SMC_TYPE_D)
return SMC_CLC_DECL_MODEUNSUPP;
if (aclc->hdr.version >= SMC_V2) {
if ((aclc->hdr.typev1 == SMC_TYPE_R &&
!smcr_indicated(ini->smc_type_v2)) ||
(aclc->hdr.typev1 == SMC_TYPE_D &&
!smcd_indicated(ini->smc_type_v2)))
return SMC_CLC_DECL_MODEUNSUPP;
} else {
if ((aclc->hdr.typev1 == SMC_TYPE_R &&
!smcr_indicated(ini->smc_type_v1)) ||
(aclc->hdr.typev1 == SMC_TYPE_D &&
!smcd_indicated(ini->smc_type_v1)))
return SMC_CLC_DECL_MODEUNSUPP;
}
return 0;
}
/* perform steps before actually connecting */
static int __smc_connect(struct smc_sock *smc)
{
u8 version = smc_ism_is_v2_capable() ? SMC_V2 : SMC_V1;
struct smc_clc_msg_accept_confirm *aclc;
struct smc_init_info *ini = NULL;
u8 *buf = NULL;
int rc = 0;
if (smc->use_fallback)
return smc_connect_fallback(smc, smc->fallback_rsn);
/* if peer has not signalled SMC-capability, fall back */
if (!tcp_sk(smc->clcsock->sk)->syn_smc)
return smc_connect_fallback(smc, SMC_CLC_DECL_PEERNOSMC);
/* IPSec connections opt out of SMC optimizations */
if (using_ipsec(smc))
return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC,
version);
ini = kzalloc(sizeof(*ini), GFP_KERNEL);
if (!ini)
return smc_connect_decline_fallback(smc, SMC_CLC_DECL_MEM,
version);
ini->smcd_version = SMC_V1 | SMC_V2;
ini->smcr_version = SMC_V1 | SMC_V2;
ini->smc_type_v1 = SMC_TYPE_B;
ini->smc_type_v2 = SMC_TYPE_B;
/* get vlan id from IP device */
if (smc_vlan_by_tcpsk(smc->clcsock, ini)) {
ini->smcd_version &= ~SMC_V1;
ini->smcr_version = 0;
ini->smc_type_v1 = SMC_TYPE_N;
if (!ini->smcd_version) {
rc = SMC_CLC_DECL_GETVLANERR;
goto fallback;
}
}
rc = smc_find_proposal_devices(smc, ini);
if (rc)
goto fallback;
buf = kzalloc(SMC_CLC_MAX_ACCEPT_LEN, GFP_KERNEL);
if (!buf) {
rc = SMC_CLC_DECL_MEM;
goto fallback;
}
aclc = (struct smc_clc_msg_accept_confirm *)buf;
/* perform CLC handshake */
rc = smc_connect_clc(smc, aclc, ini);
if (rc) {
/* -EAGAIN on timeout, see tcp_recvmsg() */
if (rc == -EAGAIN) {
rc = -ETIMEDOUT;
smc->sk.sk_err = ETIMEDOUT;
}
goto vlan_cleanup;
}
/* check if smc modes and versions of CLC proposal and accept match */
rc = smc_connect_check_aclc(ini, aclc);
version = aclc->hdr.version == SMC_V1 ? SMC_V1 : SMC_V2;
if (rc)
goto vlan_cleanup;
/* depending on previous steps, connect using rdma or ism */
if (aclc->hdr.typev1 == SMC_TYPE_R) {
ini->smcr_version = version;
rc = smc_connect_rdma(smc, aclc, ini);
} else if (aclc->hdr.typev1 == SMC_TYPE_D) {
ini->smcd_version = version;
rc = smc_connect_ism(smc, aclc, ini);
}
if (rc)
goto vlan_cleanup;
SMC_STAT_CLNT_SUCC_INC(sock_net(smc->clcsock->sk), aclc);
smc_connect_ism_vlan_cleanup(smc, ini);
kfree(buf);
kfree(ini);
return 0;
vlan_cleanup:
smc_connect_ism_vlan_cleanup(smc, ini);
kfree(buf);
fallback:
kfree(ini);
return smc_connect_decline_fallback(smc, rc, version);
}
static void smc_connect_work(struct work_struct *work)
{
struct smc_sock *smc = container_of(work, struct smc_sock,
connect_work);
long timeo = smc->sk.sk_sndtimeo;
int rc = 0;
if (!timeo)
timeo = MAX_SCHEDULE_TIMEOUT;
lock_sock(smc->clcsock->sk);
if (smc->clcsock->sk->sk_err) {
smc->sk.sk_err = smc->clcsock->sk->sk_err;
} else if ((1 << smc->clcsock->sk->sk_state) &
(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
rc = sk_stream_wait_connect(smc->clcsock->sk, &timeo);
if ((rc == -EPIPE) &&
((1 << smc->clcsock->sk->sk_state) &
(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)))
rc = 0;
}
release_sock(smc->clcsock->sk);
lock_sock(&smc->sk);
if (rc != 0 || smc->sk.sk_err) {
smc->sk.sk_state = SMC_CLOSED;
if (rc == -EPIPE || rc == -EAGAIN)
smc->sk.sk_err = EPIPE;
else if (rc == -ECONNREFUSED)
smc->sk.sk_err = ECONNREFUSED;
else if (signal_pending(current))
smc->sk.sk_err = -sock_intr_errno(timeo);
sock_put(&smc->sk); /* passive closing */
goto out;
}
rc = __smc_connect(smc);
if (rc < 0)
smc->sk.sk_err = -rc;
out:
if (!sock_flag(&smc->sk, SOCK_DEAD)) {
if (smc->sk.sk_err) {
smc->sk.sk_state_change(&smc->sk);
} else { /* allow polling before and after fallback decision */
smc->clcsock->sk->sk_write_space(smc->clcsock->sk);
smc->sk.sk_write_space(&smc->sk);
}
}
release_sock(&smc->sk);
}
static int smc_connect(struct socket *sock, struct sockaddr *addr,
int alen, int flags)
{
struct sock *sk = sock->sk;
struct smc_sock *smc;
int rc = -EINVAL;
smc = smc_sk(sk);
/* separate smc parameter checking to be safe */
if (alen < sizeof(addr->sa_family))
goto out_err;
if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6)
goto out_err;
lock_sock(sk);
switch (sock->state) {
default:
rc = -EINVAL;
goto out;
case SS_CONNECTED:
rc = sk->sk_state == SMC_ACTIVE ? -EISCONN : -EINVAL;
goto out;
case SS_CONNECTING:
if (sk->sk_state == SMC_ACTIVE)
goto connected;
break;
case SS_UNCONNECTED:
sock->state = SS_CONNECTING;
break;
}
switch (sk->sk_state) {
default:
goto out;
case SMC_CLOSED:
rc = sock_error(sk) ? : -ECONNABORTED;
sock->state = SS_UNCONNECTED;
goto out;
case SMC_ACTIVE:
rc = -EISCONN;
goto out;
case SMC_INIT:
break;
}
smc_copy_sock_settings_to_clc(smc);
tcp_sk(smc->clcsock->sk)->syn_smc = 1;
if (smc->connect_nonblock) {
rc = -EALREADY;
goto out;
}
rc = kernel_connect(smc->clcsock, addr, alen, flags);
if (rc && rc != -EINPROGRESS)
goto out;
if (smc->use_fallback) {
sock->state = rc ? SS_CONNECTING : SS_CONNECTED;
goto out;
}
sock_hold(&smc->sk); /* sock put in passive closing */
if (flags & O_NONBLOCK) {
if (queue_work(smc_hs_wq, &smc->connect_work))
smc->connect_nonblock = 1;
rc = -EINPROGRESS;
goto out;
} else {
rc = __smc_connect(smc);
if (rc < 0)
goto out;
}
connected:
rc = 0;
sock->state = SS_CONNECTED;
out:
release_sock(sk);
out_err:
return rc;
}
static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
{
struct socket *new_clcsock = NULL;
struct sock *lsk = &lsmc->sk;
struct sock *new_sk;
int rc = -EINVAL;
release_sock(lsk);
new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol);
if (!new_sk) {
rc = -ENOMEM;
lsk->sk_err = ENOMEM;
*new_smc = NULL;
lock_sock(lsk);
goto out;
}
*new_smc = smc_sk(new_sk);
mutex_lock(&lsmc->clcsock_release_lock);
if (lsmc->clcsock)
rc = kernel_accept(lsmc->clcsock, &new_clcsock, SOCK_NONBLOCK);
mutex_unlock(&lsmc->clcsock_release_lock);
lock_sock(lsk);
if (rc < 0 && rc != -EAGAIN)
lsk->sk_err = -rc;
if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
new_sk->sk_prot->unhash(new_sk);
if (new_clcsock)
sock_release(new_clcsock);
new_sk->sk_state = SMC_CLOSED;
smc_sock_set_flag(new_sk, SOCK_DEAD);
sock_put(new_sk); /* final */
*new_smc = NULL;
goto out;
}
/* new clcsock has inherited the smc listen-specific sk_data_ready
* function; switch it back to the original sk_data_ready function
*/
new_clcsock->sk->sk_data_ready = lsmc->clcsk_data_ready;
/* if new clcsock has also inherited the fallback-specific callback
* functions, switch them back to the original ones.
*/
if (lsmc->use_fallback) {
if (lsmc->clcsk_state_change)
new_clcsock->sk->sk_state_change = lsmc->clcsk_state_change;
if (lsmc->clcsk_write_space)
new_clcsock->sk->sk_write_space = lsmc->clcsk_write_space;
if (lsmc->clcsk_error_report)
new_clcsock->sk->sk_error_report = lsmc->clcsk_error_report;
}
(*new_smc)->clcsock = new_clcsock;
out:
return rc;
}
/* add a just created sock to the accept queue of the listen sock as
* candidate for a following socket accept call from user space
*/
static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
{
struct smc_sock *par = smc_sk(parent);
sock_hold(sk); /* sock_put in smc_accept_unlink () */
spin_lock(&par->accept_q_lock);
list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
spin_unlock(&par->accept_q_lock);
sk_acceptq_added(parent);
}
/* remove a socket from the accept queue of its parental listening socket */
static void smc_accept_unlink(struct sock *sk)
{
struct smc_sock *par = smc_sk(sk)->listen_smc;
spin_lock(&par->accept_q_lock);
list_del_init(&smc_sk(sk)->accept_q);
spin_unlock(&par->accept_q_lock);
sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
sock_put(sk); /* sock_hold in smc_accept_enqueue */
}
/* remove a sock from the accept queue to bind it to a new socket created
* for a socket accept call from user space
*/
struct sock *smc_accept_dequeue(struct sock *parent,
struct socket *new_sock)
{
struct smc_sock *isk, *n;
struct sock *new_sk;
list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
new_sk = (struct sock *)isk;
smc_accept_unlink(new_sk);
if (new_sk->sk_state == SMC_CLOSED) {
new_sk->sk_prot->unhash(new_sk);
if (isk->clcsock) {
sock_release(isk->clcsock);
isk->clcsock = NULL;
}
sock_put(new_sk); /* final */
continue;
}
if (new_sock) {
sock_graft(new_sk, new_sock);
new_sock->state = SS_CONNECTED;
if (isk->use_fallback) {
smc_sk(new_sk)->clcsock->file = new_sock->file;
isk->clcsock->file->private_data = isk->clcsock;
}
}
return new_sk;
}
return NULL;
}
/* clean up for a created but never accepted sock */
void smc_close_non_accepted(struct sock *sk)
{
struct smc_sock *smc = smc_sk(sk);
sock_hold(sk); /* sock_put below */
lock_sock(sk);
if (!sk->sk_lingertime)
/* wait for peer closing */
WRITE_ONCE(sk->sk_lingertime, SMC_MAX_STREAM_WAIT_TIMEOUT);
__smc_release(smc);
release_sock(sk);
sock_put(sk); /* sock_hold above */
sock_put(sk); /* final sock_put */
}
static int smcr_serv_conf_first_link(struct smc_sock *smc)
{
struct smc_link *link = smc->conn.lnk;
struct smc_llc_qentry *qentry;
int rc;
net/smc: Allow virtually contiguous sndbufs or RMBs for SMC-R On long-running enterprise production servers, high-order contiguous memory pages are usually very rare and in most cases we can only get fragmented pages. When replacing TCP with SMC-R in such production scenarios, attempting to allocate high-order physically contiguous sndbufs and RMBs may result in frequent memory compaction, which will cause unexpected hung issue and further stability risks. So this patch is aimed to allow SMC-R link group to use virtually contiguous sndbufs and RMBs to avoid potential issues mentioned above. Whether to use physically or virtually contiguous buffers can be set by sysctl smcr_buf_type. Note that using virtually contiguous buffers will bring an acceptable performance regression, which can be mainly divided into two parts: 1) regression in data path, which is brought by additional address translation of sndbuf by RNIC in Tx. But in general, translating address through MTT is fast. Taking 256KB sndbuf and RMB as an example, the comparisons in qperf latency and bandwidth test with physically and virtually contiguous buffers are as follows: - client: smc_run taskset -c <cpu> qperf <server> -oo msg_size:1:64K:*2\ -t 5 -vu tcp_{bw|lat} - server: smc_run taskset -c <cpu> qperf [latency] msgsize tcp smcr smcr-use-virt-buf 1 11.17 us 7.56 us 7.51 us (-0.67%) 2 10.65 us 7.74 us 7.56 us (-2.31%) 4 11.11 us 7.52 us 7.59 us ( 0.84%) 8 10.83 us 7.55 us 7.51 us (-0.48%) 16 11.21 us 7.46 us 7.51 us ( 0.71%) 32 10.65 us 7.53 us 7.58 us ( 0.61%) 64 10.95 us 7.74 us 7.80 us ( 0.76%) 128 11.14 us 7.83 us 7.87 us ( 0.47%) 256 10.97 us 7.94 us 7.92 us (-0.28%) 512 11.23 us 7.94 us 8.20 us ( 3.25%) 1024 11.60 us 8.12 us 8.20 us ( 0.96%) 2048 14.04 us 8.30 us 8.51 us ( 2.49%) 4096 16.88 us 9.13 us 9.07 us (-0.64%) 8192 22.50 us 10.56 us 11.22 us ( 6.26%) 16384 28.99 us 12.88 us 13.83 us ( 7.37%) 32768 40.13 us 16.76 us 16.95 us ( 1.16%) 65536 68.70 us 24.68 us 24.85 us ( 0.68%) [bandwidth] msgsize tcp smcr smcr-use-virt-buf 1 1.65 MB/s 1.59 MB/s 1.53 MB/s (-3.88%) 2 3.32 MB/s 3.17 MB/s 3.08 MB/s (-2.67%) 4 6.66 MB/s 6.33 MB/s 6.09 MB/s (-3.85%) 8 13.67 MB/s 13.45 MB/s 11.97 MB/s (-10.99%) 16 25.36 MB/s 27.15 MB/s 24.16 MB/s (-11.01%) 32 48.22 MB/s 54.24 MB/s 49.41 MB/s (-8.89%) 64 106.79 MB/s 107.32 MB/s 99.05 MB/s (-7.71%) 128 210.21 MB/s 202.46 MB/s 201.02 MB/s (-0.71%) 256 400.81 MB/s 416.81 MB/s 393.52 MB/s (-5.59%) 512 746.49 MB/s 834.12 MB/s 809.99 MB/s (-2.89%) 1024 1292.33 MB/s 1641.96 MB/s 1571.82 MB/s (-4.27%) 2048 2007.64 MB/s 2760.44 MB/s 2717.68 MB/s (-1.55%) 4096 2665.17 MB/s 4157.44 MB/s 4070.76 MB/s (-2.09%) 8192 3159.72 MB/s 4361.57 MB/s 4270.65 MB/s (-2.08%) 16384 4186.70 MB/s 4574.13 MB/s 4501.17 MB/s (-1.60%) 32768 4093.21 MB/s 4487.42 MB/s 4322.43 MB/s (-3.68%) 65536 4057.14 MB/s 4735.61 MB/s 4555.17 MB/s (-3.81%) 2) regression in buffer initialization and destruction path, which is brought by additional MR operations of sndbufs. But thanks to link group buffer reuse mechanism, the impact of this kind of regression decreases as times of buffer reuse increases. Taking 256KB sndbuf and RMB as an example, latency of some key SMC-R buffer-related function obtained by bpftrace are as follows: Function Phys-bufs Virt-bufs smcr_new_buf_create() 67154 ns 79164 ns smc_ib_buf_map_sg() 525 ns 928 ns smc_ib_get_memory_region() 162294 ns 161191 ns smc_wr_reg_send() 9957 ns 9635 ns smc_ib_put_memory_region() 203548 ns 198374 ns smc_ib_buf_unmap_sg() 508 ns 1158 ns ------------ Test environment notes: 1. Above tests run on 2 VMs within the same Host. 2. The NIC is ConnectX-4Lx, using SRIOV and passing through 2 VFs to the each VM respectively. 3. VMs' vCPUs are binded to different physical CPUs, and the binded physical CPUs are isolated by `isolcpus=xxx` cmdline. 4. NICs' queue number are set to 1. Signed-off-by: Wen Gu <guwen@linux.alibaba.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-07-14 09:44:04 +00:00
/* reg the sndbuf if it was vzalloced*/
if (smc->conn.sndbuf_desc->is_vm) {
if (smcr_link_reg_buf(link, smc->conn.sndbuf_desc))
return SMC_CLC_DECL_ERR_REGBUF;
}
/* reg the rmb */
if (smcr_link_reg_buf(link, smc->conn.rmb_desc))
return SMC_CLC_DECL_ERR_REGBUF;
/* send CONFIRM LINK request to client over the RoCE fabric */
rc = smc_llc_send_confirm_link(link, SMC_LLC_REQ);
if (rc < 0)
return SMC_CLC_DECL_TIMEOUT_CL;
/* receive CONFIRM LINK response from client over the RoCE fabric */
qentry = smc_llc_wait(link->lgr, link, SMC_LLC_WAIT_TIME,
SMC_LLC_CONFIRM_LINK);
if (!qentry) {
struct smc_clc_msg_decline dclc;
rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
}
smc_llc_save_peer_uid(qentry);
rc = smc_llc_eval_conf_link(qentry, SMC_LLC_RESP);
smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl);
if (rc)
return SMC_CLC_DECL_RMBE_EC;
/* confirm_rkey is implicit on 1st contact */
smc->conn.rmb_desc->is_conf_rkey = true;
smc_llc_link_active(link);
smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE);
if (link->lgr->max_links > 1) {
down_write(&link->lgr->llc_conf_mutex);
/* initial contact - try to establish second link */
smc_llc_srv_add_link(link, NULL);
up_write(&link->lgr->llc_conf_mutex);
}
return 0;
}
/* listen worker: finish */
static void smc_listen_out(struct smc_sock *new_smc)
{
struct smc_sock *lsmc = new_smc->listen_smc;
struct sock *newsmcsk = &new_smc->sk;
net/smc: Limit backlog connections Current implementation does not handling backlog semantics, one potential risk is that server will be flooded by infinite amount connections, even if client was SMC-incapable. This patch works to put a limit on backlog connections, referring to the TCP implementation, we divides SMC connections into two categories: 1. Half SMC connection, which includes all TCP established while SMC not connections. 2. Full SMC connection, which includes all SMC established connections. For half SMC connection, since all half SMC connections starts with TCP established, we can achieve our goal by put a limit before TCP established. Refer to the implementation of TCP, this limits will based on not only the half SMC connections but also the full connections, which is also a constraint on full SMC connections. For full SMC connections, although we know exactly where it starts, it's quite hard to put a limit before it. The easiest way is to block wait before receive SMC confirm CLC message, while it's under protection by smc_server_lgr_pending, a global lock, which leads this limit to the entire host instead of a single listen socket. Another way is to drop the full connections, but considering the cast of SMC connections, we prefer to keep full SMC connections. Even so, the limits of full SMC connections still exists, see commits about half SMC connection below. After this patch, the limits of backend connection shows like: For SMC: 1. Client with SMC-capability can makes 2 * backlog full SMC connections or 1 * backlog half SMC connections and 1 * backlog full SMC connections at most. 2. Client without SMC-capability can only makes 1 * backlog half TCP connections and 1 * backlog full TCP connections. Signed-off-by: D. Wythe <alibuda@linux.alibaba.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-10 09:11:35 +00:00
if (tcp_sk(new_smc->clcsock->sk)->syn_smc)
atomic_dec(&lsmc->queued_smc_hs);
if (lsmc->sk.sk_state == SMC_LISTEN) {
lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
smc_accept_enqueue(&lsmc->sk, newsmcsk);
release_sock(&lsmc->sk);
} else { /* no longer listening */
smc_close_non_accepted(newsmcsk);
}
/* Wake up accept */
lsmc->sk.sk_data_ready(&lsmc->sk);
sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
}
/* listen worker: finish in state connected */
static void smc_listen_out_connected(struct smc_sock *new_smc)
{
struct sock *newsmcsk = &new_smc->sk;
if (newsmcsk->sk_state == SMC_INIT)
newsmcsk->sk_state = SMC_ACTIVE;
smc_listen_out(new_smc);
}
/* listen worker: finish in error state */
static void smc_listen_out_err(struct smc_sock *new_smc)
{
struct sock *newsmcsk = &new_smc->sk;
struct net *net = sock_net(newsmcsk);
this_cpu_inc(net->smc.smc_stats->srv_hshake_err_cnt);
if (newsmcsk->sk_state == SMC_INIT)
sock_put(&new_smc->sk); /* passive closing */
newsmcsk->sk_state = SMC_CLOSED;
smc_listen_out(new_smc);
}
/* listen worker: decline and fall back if possible */
static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
int local_first, u8 version)
{
/* RDMA setup failed, switch back to TCP */
smc_conn_abort(new_smc, local_first);
if (reason_code < 0 ||
smc_switch_to_fallback(new_smc, reason_code)) {
/* error, no fallback possible */
smc_listen_out_err(new_smc);
return;
}
if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) {
if (smc_clc_send_decline(new_smc, reason_code, version) < 0) {
smc_listen_out_err(new_smc);
return;
}
}
smc_listen_out_connected(new_smc);
}
/* listen worker: version checking */
static int smc_listen_v2_check(struct smc_sock *new_smc,
struct smc_clc_msg_proposal *pclc,
struct smc_init_info *ini)
{
struct smc_clc_smcd_v2_extension *pclc_smcd_v2_ext;
struct smc_clc_v2_extension *pclc_v2_ext;
int rc = SMC_CLC_DECL_PEERNOSMC;
ini->smc_type_v1 = pclc->hdr.typev1;
ini->smc_type_v2 = pclc->hdr.typev2;
ini->smcd_version = smcd_indicated(ini->smc_type_v1) ? SMC_V1 : 0;
ini->smcr_version = smcr_indicated(ini->smc_type_v1) ? SMC_V1 : 0;
if (pclc->hdr.version > SMC_V1) {
if (smcd_indicated(ini->smc_type_v2))
ini->smcd_version |= SMC_V2;
if (smcr_indicated(ini->smc_type_v2))
ini->smcr_version |= SMC_V2;
}
if (!(ini->smcd_version & SMC_V2) && !(ini->smcr_version & SMC_V2)) {
rc = SMC_CLC_DECL_PEERNOSMC;
goto out;
}
pclc_v2_ext = smc_get_clc_v2_ext(pclc);
if (!pclc_v2_ext) {
ini->smcd_version &= ~SMC_V2;
ini->smcr_version &= ~SMC_V2;
rc = SMC_CLC_DECL_NOV2EXT;
goto out;
}
pclc_smcd_v2_ext = smc_get_clc_smcd_v2_ext(pclc_v2_ext);
if (ini->smcd_version & SMC_V2) {
if (!smc_ism_is_v2_capable()) {
ini->smcd_version &= ~SMC_V2;
rc = SMC_CLC_DECL_NOISM2SUPP;
} else if (!pclc_smcd_v2_ext) {
ini->smcd_version &= ~SMC_V2;
rc = SMC_CLC_DECL_NOV2DEXT;
} else if (!pclc_v2_ext->hdr.eid_cnt &&
!pclc_v2_ext->hdr.flag.seid) {
ini->smcd_version &= ~SMC_V2;
rc = SMC_CLC_DECL_NOUEID;
}
}
if (ini->smcr_version & SMC_V2) {
if (!pclc_v2_ext->hdr.eid_cnt) {
ini->smcr_version &= ~SMC_V2;
rc = SMC_CLC_DECL_NOUEID;
}
}
ini->release_nr = pclc_v2_ext->hdr.flag.release;
if (pclc_v2_ext->hdr.flag.release > SMC_RELEASE)
ini->release_nr = SMC_RELEASE;
out:
if (!ini->smcd_version && !ini->smcr_version)
return rc;
return 0;
}
/* listen worker: check prefixes */
static int smc_listen_prfx_check(struct smc_sock *new_smc,
struct smc_clc_msg_proposal *pclc)
{
struct smc_clc_msg_proposal_prefix *pclc_prfx;
struct socket *newclcsock = new_smc->clcsock;
if (pclc->hdr.typev1 == SMC_TYPE_N)
return 0;
pclc_prfx = smc_clc_proposal_get_prefix(pclc);
if (smc_clc_prfx_match(newclcsock, pclc_prfx))
return SMC_CLC_DECL_DIFFPREFIX;
return 0;
}
/* listen worker: initialize connection and buffers */
static int smc_listen_rdma_init(struct smc_sock *new_smc,
struct smc_init_info *ini)
{
int rc;
/* allocate connection / link group */
rc = smc_conn_create(new_smc, ini);
if (rc)
return rc;
/* create send buffer and rmb */
net/smc: Reset connection when trying to use SMCRv2 fails. We found a crash when using SMCRv2 with 2 Mellanox ConnectX-4. It can be reproduced by: - smc_run nginx - smc_run wrk -t 32 -c 500 -d 30 http://<ip>:<port> BUG: kernel NULL pointer dereference, address: 0000000000000014 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 8000000108713067 P4D 8000000108713067 PUD 151127067 PMD 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 4 PID: 2441 Comm: kworker/4:249 Kdump: loaded Tainted: G W E 6.4.0-rc1+ #42 Workqueue: smc_hs_wq smc_listen_work [smc] RIP: 0010:smc_clc_send_confirm_accept+0x284/0x580 [smc] RSP: 0018:ffffb8294b2d7c78 EFLAGS: 00010a06 RAX: ffff8f1873238880 RBX: ffffb8294b2d7dc8 RCX: 0000000000000000 RDX: 00000000000000b4 RSI: 0000000000000001 RDI: 0000000000b40c00 RBP: ffffb8294b2d7db8 R08: ffff8f1815c5860c R09: 0000000000000000 R10: 0000000000000400 R11: 0000000000000000 R12: ffff8f1846f56180 R13: ffff8f1815c5860c R14: 0000000000000001 R15: 0000000000000001 FS: 0000000000000000(0000) GS:ffff8f1aefd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000014 CR3: 00000001027a0001 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: <TASK> ? mlx5_ib_map_mr_sg+0xa1/0xd0 [mlx5_ib] ? smcr_buf_map_link+0x24b/0x290 [smc] ? __smc_buf_create+0x4ee/0x9b0 [smc] smc_clc_send_accept+0x4c/0xb0 [smc] smc_listen_work+0x346/0x650 [smc] ? __schedule+0x279/0x820 process_one_work+0x1e5/0x3f0 worker_thread+0x4d/0x2f0 ? __pfx_worker_thread+0x10/0x10 kthread+0xe5/0x120 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x2c/0x50 </TASK> During the CLC handshake, server sequentially tries available SMCRv2 and SMCRv1 devices in smc_listen_work(). If an SMCRv2 device is found. SMCv2 based link group and link will be assigned to the connection. Then assumed that some buffer assignment errors happen later in the CLC handshake, such as RMB registration failure, server will give up SMCRv2 and try SMCRv1 device instead. But the resources assigned to the connection won't be reset. When server tries SMCRv1 device, the connection creation process will be executed again. Since conn->lnk has been assigned when trying SMCRv2, it will not be set to the correct SMCRv1 link in smcr_lgr_conn_assign_link(). So in such situation, conn->lgr points to correct SMCRv1 link group but conn->lnk points to the SMCRv2 link mistakenly. Then in smc_clc_send_confirm_accept(), conn->rmb_desc->mr[link->link_idx] will be accessed. Since the link->link_idx is not correct, the related MR may not have been initialized, so crash happens. | Try SMCRv2 device first | |-> conn->lgr: assign existed SMCRv2 link group; | |-> conn->link: assign existed SMCRv2 link (link_idx may be 1 in SMC_LGR_SYMMETRIC); | |-> sndbuf & RMB creation fails, quit; | | Try SMCRv1 device then | |-> conn->lgr: create SMCRv1 link group and assign; | |-> conn->link: keep SMCRv2 link mistakenly; | |-> sndbuf & RMB creation succeed, only RMB->mr[link_idx = 0] | initialized. | | Then smc_clc_send_confirm_accept() accesses | conn->rmb_desc->mr[conn->link->link_idx, which is 1], then crash. v This patch tries to fix this by cleaning conn->lnk before assigning link. In addition, it is better to reset the connection and clean the resources assigned if trying SMCRv2 failed in buffer creation or registration. Fixes: e49300a6bf62 ("net/smc: add listen processing for SMC-Rv2") Link: https://lore.kernel.org/r/20220523055056.2078994-1-liuyacan@corp.netease.com/ Signed-off-by: Wen Gu <guwen@linux.alibaba.com> Reviewed-by: Tony Lu <tonylu@linux.alibaba.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2023-05-18 05:14:55 +00:00
if (smc_buf_create(new_smc, false)) {
smc_conn_abort(new_smc, ini->first_contact_local);
return SMC_CLC_DECL_MEM;
net/smc: Reset connection when trying to use SMCRv2 fails. We found a crash when using SMCRv2 with 2 Mellanox ConnectX-4. It can be reproduced by: - smc_run nginx - smc_run wrk -t 32 -c 500 -d 30 http://<ip>:<port> BUG: kernel NULL pointer dereference, address: 0000000000000014 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 8000000108713067 P4D 8000000108713067 PUD 151127067 PMD 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 4 PID: 2441 Comm: kworker/4:249 Kdump: loaded Tainted: G W E 6.4.0-rc1+ #42 Workqueue: smc_hs_wq smc_listen_work [smc] RIP: 0010:smc_clc_send_confirm_accept+0x284/0x580 [smc] RSP: 0018:ffffb8294b2d7c78 EFLAGS: 00010a06 RAX: ffff8f1873238880 RBX: ffffb8294b2d7dc8 RCX: 0000000000000000 RDX: 00000000000000b4 RSI: 0000000000000001 RDI: 0000000000b40c00 RBP: ffffb8294b2d7db8 R08: ffff8f1815c5860c R09: 0000000000000000 R10: 0000000000000400 R11: 0000000000000000 R12: ffff8f1846f56180 R13: ffff8f1815c5860c R14: 0000000000000001 R15: 0000000000000001 FS: 0000000000000000(0000) GS:ffff8f1aefd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000014 CR3: 00000001027a0001 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: <TASK> ? mlx5_ib_map_mr_sg+0xa1/0xd0 [mlx5_ib] ? smcr_buf_map_link+0x24b/0x290 [smc] ? __smc_buf_create+0x4ee/0x9b0 [smc] smc_clc_send_accept+0x4c/0xb0 [smc] smc_listen_work+0x346/0x650 [smc] ? __schedule+0x279/0x820 process_one_work+0x1e5/0x3f0 worker_thread+0x4d/0x2f0 ? __pfx_worker_thread+0x10/0x10 kthread+0xe5/0x120 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x2c/0x50 </TASK> During the CLC handshake, server sequentially tries available SMCRv2 and SMCRv1 devices in smc_listen_work(). If an SMCRv2 device is found. SMCv2 based link group and link will be assigned to the connection. Then assumed that some buffer assignment errors happen later in the CLC handshake, such as RMB registration failure, server will give up SMCRv2 and try SMCRv1 device instead. But the resources assigned to the connection won't be reset. When server tries SMCRv1 device, the connection creation process will be executed again. Since conn->lnk has been assigned when trying SMCRv2, it will not be set to the correct SMCRv1 link in smcr_lgr_conn_assign_link(). So in such situation, conn->lgr points to correct SMCRv1 link group but conn->lnk points to the SMCRv2 link mistakenly. Then in smc_clc_send_confirm_accept(), conn->rmb_desc->mr[link->link_idx] will be accessed. Since the link->link_idx is not correct, the related MR may not have been initialized, so crash happens. | Try SMCRv2 device first | |-> conn->lgr: assign existed SMCRv2 link group; | |-> conn->link: assign existed SMCRv2 link (link_idx may be 1 in SMC_LGR_SYMMETRIC); | |-> sndbuf & RMB creation fails, quit; | | Try SMCRv1 device then | |-> conn->lgr: create SMCRv1 link group and assign; | |-> conn->link: keep SMCRv2 link mistakenly; | |-> sndbuf & RMB creation succeed, only RMB->mr[link_idx = 0] | initialized. | | Then smc_clc_send_confirm_accept() accesses | conn->rmb_desc->mr[conn->link->link_idx, which is 1], then crash. v This patch tries to fix this by cleaning conn->lnk before assigning link. In addition, it is better to reset the connection and clean the resources assigned if trying SMCRv2 failed in buffer creation or registration. Fixes: e49300a6bf62 ("net/smc: add listen processing for SMC-Rv2") Link: https://lore.kernel.org/r/20220523055056.2078994-1-liuyacan@corp.netease.com/ Signed-off-by: Wen Gu <guwen@linux.alibaba.com> Reviewed-by: Tony Lu <tonylu@linux.alibaba.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2023-05-18 05:14:55 +00:00
}
return 0;
}
/* listen worker: initialize connection and buffers for SMC-D */
static int smc_listen_ism_init(struct smc_sock *new_smc,
struct smc_init_info *ini)
{
int rc;
rc = smc_conn_create(new_smc, ini);
if (rc)
return rc;
/* Create send and receive buffers */
rc = smc_buf_create(new_smc, true);
if (rc) {
smc_conn_abort(new_smc, ini->first_contact_local);
return (rc == -ENOSPC) ? SMC_CLC_DECL_MAX_DMB :
SMC_CLC_DECL_MEM;
}
return 0;
}
static bool smc_is_already_selected(struct smcd_dev *smcd,
struct smc_init_info *ini,
int matches)
{
int i;
for (i = 0; i < matches; i++)
if (smcd == ini->ism_dev[i])
return true;
return false;
}
/* check for ISM devices matching proposed ISM devices */
static void smc_check_ism_v2_match(struct smc_init_info *ini,
u16 proposed_chid,
struct smcd_gid *proposed_gid,
unsigned int *matches)
{
struct smcd_dev *smcd;
list_for_each_entry(smcd, &smcd_dev_list.list, list) {
if (smcd->going_away)
continue;
if (smc_is_already_selected(smcd, ini, *matches))
continue;
if (smc_ism_get_chid(smcd) == proposed_chid &&
!smc_ism_cantalk(proposed_gid, ISM_RESERVED_VLANID, smcd)) {
ini->ism_peer_gid[*matches].gid = proposed_gid->gid;
if (__smc_ism_is_emulated(proposed_chid))
ini->ism_peer_gid[*matches].gid_ext =
proposed_gid->gid_ext;
/* non-Emulated-ISM's peer gid_ext remains 0. */
ini->ism_dev[*matches] = smcd;
(*matches)++;
break;
}
}
}
static void smc_find_ism_store_rc(u32 rc, struct smc_init_info *ini)
{
if (!ini->rc)
ini->rc = rc;
}
static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc,
struct smc_clc_msg_proposal *pclc,
struct smc_init_info *ini)
{
struct smc_clc_smcd_v2_extension *smcd_v2_ext;
struct smc_clc_v2_extension *smc_v2_ext;
struct smc_clc_msg_smcd *pclc_smcd;
unsigned int matches = 0;
struct smcd_gid smcd_gid;
u8 smcd_version;
u8 *eid = NULL;
int i, rc;
u16 chid;
if (!(ini->smcd_version & SMC_V2) || !smcd_indicated(ini->smc_type_v2))
goto not_found;
pclc_smcd = smc_get_clc_msg_smcd(pclc);
smc_v2_ext = smc_get_clc_v2_ext(pclc);
smcd_v2_ext = smc_get_clc_smcd_v2_ext(smc_v2_ext);
mutex_lock(&smcd_dev_list.mutex);
if (pclc_smcd->ism.chid) {
/* check for ISM device matching proposed native ISM device */
smcd_gid.gid = ntohll(pclc_smcd->ism.gid);
smcd_gid.gid_ext = 0;
smc_check_ism_v2_match(ini, ntohs(pclc_smcd->ism.chid),
&smcd_gid, &matches);
}
for (i = 0; i < smc_v2_ext->hdr.ism_gid_cnt; i++) {
/* check for ISM devices matching proposed non-native ISM
* devices
*/
smcd_gid.gid = ntohll(smcd_v2_ext->gidchid[i].gid);
smcd_gid.gid_ext = 0;
chid = ntohs(smcd_v2_ext->gidchid[i].chid);
if (__smc_ism_is_emulated(chid)) {
if ((i + 1) == smc_v2_ext->hdr.ism_gid_cnt ||
chid != ntohs(smcd_v2_ext->gidchid[i + 1].chid))
/* each Emulated-ISM device takes two GID-CHID
* entries and CHID of the second entry repeats
* that of the first entry.
*
* So check if the next GID-CHID entry exists
* and both two entries' CHIDs are the same.
*/
continue;
smcd_gid.gid_ext =
ntohll(smcd_v2_ext->gidchid[++i].gid);
}
smc_check_ism_v2_match(ini, chid, &smcd_gid, &matches);
}
mutex_unlock(&smcd_dev_list.mutex);
if (!ini->ism_dev[0]) {
smc_find_ism_store_rc(SMC_CLC_DECL_NOSMCD2DEV, ini);
goto not_found;
}
smc_ism_get_system_eid(&eid);
if (!smc_clc_match_eid(ini->negotiated_eid, smc_v2_ext,
smcd_v2_ext->system_eid, eid))
goto not_found;
/* separate - outside the smcd_dev_list.lock */
smcd_version = ini->smcd_version;
for (i = 0; i < matches; i++) {
ini->smcd_version = SMC_V2;
ini->is_smcd = true;
ini->ism_selected = i;
rc = smc_listen_ism_init(new_smc, ini);
if (rc) {
smc_find_ism_store_rc(rc, ini);
/* try next active ISM device */
continue;
}
return; /* matching and usable V2 ISM device found */
}
/* no V2 ISM device could be initialized */
ini->smcd_version = smcd_version; /* restore original value */
ini->negotiated_eid[0] = 0;
not_found:
ini->smcd_version &= ~SMC_V2;
ini->ism_dev[0] = NULL;
ini->is_smcd = false;
}
static void smc_find_ism_v1_device_serv(struct smc_sock *new_smc,
struct smc_clc_msg_proposal *pclc,
struct smc_init_info *ini)
{
struct smc_clc_msg_smcd *pclc_smcd = smc_get_clc_msg_smcd(pclc);
int rc = 0;
/* check if ISM V1 is available */
if (!(ini->smcd_version & SMC_V1) || !smcd_indicated(ini->smc_type_v1))
goto not_found;
ini->is_smcd = true; /* prepare ISM check */
ini->ism_peer_gid[0].gid = ntohll(pclc_smcd->ism.gid);
ini->ism_peer_gid[0].gid_ext = 0;
rc = smc_find_ism_device(new_smc, ini);
if (rc)
goto not_found;
ini->ism_selected = 0;
rc = smc_listen_ism_init(new_smc, ini);
if (!rc)
return; /* V1 ISM device found */
not_found:
smc_find_ism_store_rc(rc, ini);
ini->smcd_version &= ~SMC_V1;
ini->ism_dev[0] = NULL;
ini->is_smcd = false;
}
/* listen worker: register buffers */
static int smc_listen_rdma_reg(struct smc_sock *new_smc, bool local_first)
{
struct smc_connection *conn = &new_smc->conn;
if (!local_first) {
net/smc: Allow virtually contiguous sndbufs or RMBs for SMC-R On long-running enterprise production servers, high-order contiguous memory pages are usually very rare and in most cases we can only get fragmented pages. When replacing TCP with SMC-R in such production scenarios, attempting to allocate high-order physically contiguous sndbufs and RMBs may result in frequent memory compaction, which will cause unexpected hung issue and further stability risks. So this patch is aimed to allow SMC-R link group to use virtually contiguous sndbufs and RMBs to avoid potential issues mentioned above. Whether to use physically or virtually contiguous buffers can be set by sysctl smcr_buf_type. Note that using virtually contiguous buffers will bring an acceptable performance regression, which can be mainly divided into two parts: 1) regression in data path, which is brought by additional address translation of sndbuf by RNIC in Tx. But in general, translating address through MTT is fast. Taking 256KB sndbuf and RMB as an example, the comparisons in qperf latency and bandwidth test with physically and virtually contiguous buffers are as follows: - client: smc_run taskset -c <cpu> qperf <server> -oo msg_size:1:64K:*2\ -t 5 -vu tcp_{bw|lat} - server: smc_run taskset -c <cpu> qperf [latency] msgsize tcp smcr smcr-use-virt-buf 1 11.17 us 7.56 us 7.51 us (-0.67%) 2 10.65 us 7.74 us 7.56 us (-2.31%) 4 11.11 us 7.52 us 7.59 us ( 0.84%) 8 10.83 us 7.55 us 7.51 us (-0.48%) 16 11.21 us 7.46 us 7.51 us ( 0.71%) 32 10.65 us 7.53 us 7.58 us ( 0.61%) 64 10.95 us 7.74 us 7.80 us ( 0.76%) 128 11.14 us 7.83 us 7.87 us ( 0.47%) 256 10.97 us 7.94 us 7.92 us (-0.28%) 512 11.23 us 7.94 us 8.20 us ( 3.25%) 1024 11.60 us 8.12 us 8.20 us ( 0.96%) 2048 14.04 us 8.30 us 8.51 us ( 2.49%) 4096 16.88 us 9.13 us 9.07 us (-0.64%) 8192 22.50 us 10.56 us 11.22 us ( 6.26%) 16384 28.99 us 12.88 us 13.83 us ( 7.37%) 32768 40.13 us 16.76 us 16.95 us ( 1.16%) 65536 68.70 us 24.68 us 24.85 us ( 0.68%) [bandwidth] msgsize tcp smcr smcr-use-virt-buf 1 1.65 MB/s 1.59 MB/s 1.53 MB/s (-3.88%) 2 3.32 MB/s 3.17 MB/s 3.08 MB/s (-2.67%) 4 6.66 MB/s 6.33 MB/s 6.09 MB/s (-3.85%) 8 13.67 MB/s 13.45 MB/s 11.97 MB/s (-10.99%) 16 25.36 MB/s 27.15 MB/s 24.16 MB/s (-11.01%) 32 48.22 MB/s 54.24 MB/s 49.41 MB/s (-8.89%) 64 106.79 MB/s 107.32 MB/s 99.05 MB/s (-7.71%) 128 210.21 MB/s 202.46 MB/s 201.02 MB/s (-0.71%) 256 400.81 MB/s 416.81 MB/s 393.52 MB/s (-5.59%) 512 746.49 MB/s 834.12 MB/s 809.99 MB/s (-2.89%) 1024 1292.33 MB/s 1641.96 MB/s 1571.82 MB/s (-4.27%) 2048 2007.64 MB/s 2760.44 MB/s 2717.68 MB/s (-1.55%) 4096 2665.17 MB/s 4157.44 MB/s 4070.76 MB/s (-2.09%) 8192 3159.72 MB/s 4361.57 MB/s 4270.65 MB/s (-2.08%) 16384 4186.70 MB/s 4574.13 MB/s 4501.17 MB/s (-1.60%) 32768 4093.21 MB/s 4487.42 MB/s 4322.43 MB/s (-3.68%) 65536 4057.14 MB/s 4735.61 MB/s 4555.17 MB/s (-3.81%) 2) regression in buffer initialization and destruction path, which is brought by additional MR operations of sndbufs. But thanks to link group buffer reuse mechanism, the impact of this kind of regression decreases as times of buffer reuse increases. Taking 256KB sndbuf and RMB as an example, latency of some key SMC-R buffer-related function obtained by bpftrace are as follows: Function Phys-bufs Virt-bufs smcr_new_buf_create() 67154 ns 79164 ns smc_ib_buf_map_sg() 525 ns 928 ns smc_ib_get_memory_region() 162294 ns 161191 ns smc_wr_reg_send() 9957 ns 9635 ns smc_ib_put_memory_region() 203548 ns 198374 ns smc_ib_buf_unmap_sg() 508 ns 1158 ns ------------ Test environment notes: 1. Above tests run on 2 VMs within the same Host. 2. The NIC is ConnectX-4Lx, using SRIOV and passing through 2 VFs to the each VM respectively. 3. VMs' vCPUs are binded to different physical CPUs, and the binded physical CPUs are isolated by `isolcpus=xxx` cmdline. 4. NICs' queue number are set to 1. Signed-off-by: Wen Gu <guwen@linux.alibaba.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-07-14 09:44:04 +00:00
/* reg sendbufs if they were vzalloced */
if (conn->sndbuf_desc->is_vm) {
if (smcr_lgr_reg_sndbufs(conn->lnk,
conn->sndbuf_desc))
return SMC_CLC_DECL_ERR_REGBUF;
}
if (smcr_lgr_reg_rmbs(conn->lnk, conn->rmb_desc))
net/smc: Allow virtually contiguous sndbufs or RMBs for SMC-R On long-running enterprise production servers, high-order contiguous memory pages are usually very rare and in most cases we can only get fragmented pages. When replacing TCP with SMC-R in such production scenarios, attempting to allocate high-order physically contiguous sndbufs and RMBs may result in frequent memory compaction, which will cause unexpected hung issue and further stability risks. So this patch is aimed to allow SMC-R link group to use virtually contiguous sndbufs and RMBs to avoid potential issues mentioned above. Whether to use physically or virtually contiguous buffers can be set by sysctl smcr_buf_type. Note that using virtually contiguous buffers will bring an acceptable performance regression, which can be mainly divided into two parts: 1) regression in data path, which is brought by additional address translation of sndbuf by RNIC in Tx. But in general, translating address through MTT is fast. Taking 256KB sndbuf and RMB as an example, the comparisons in qperf latency and bandwidth test with physically and virtually contiguous buffers are as follows: - client: smc_run taskset -c <cpu> qperf <server> -oo msg_size:1:64K:*2\ -t 5 -vu tcp_{bw|lat} - server: smc_run taskset -c <cpu> qperf [latency] msgsize tcp smcr smcr-use-virt-buf 1 11.17 us 7.56 us 7.51 us (-0.67%) 2 10.65 us 7.74 us 7.56 us (-2.31%) 4 11.11 us 7.52 us 7.59 us ( 0.84%) 8 10.83 us 7.55 us 7.51 us (-0.48%) 16 11.21 us 7.46 us 7.51 us ( 0.71%) 32 10.65 us 7.53 us 7.58 us ( 0.61%) 64 10.95 us 7.74 us 7.80 us ( 0.76%) 128 11.14 us 7.83 us 7.87 us ( 0.47%) 256 10.97 us 7.94 us 7.92 us (-0.28%) 512 11.23 us 7.94 us 8.20 us ( 3.25%) 1024 11.60 us 8.12 us 8.20 us ( 0.96%) 2048 14.04 us 8.30 us 8.51 us ( 2.49%) 4096 16.88 us 9.13 us 9.07 us (-0.64%) 8192 22.50 us 10.56 us 11.22 us ( 6.26%) 16384 28.99 us 12.88 us 13.83 us ( 7.37%) 32768 40.13 us 16.76 us 16.95 us ( 1.16%) 65536 68.70 us 24.68 us 24.85 us ( 0.68%) [bandwidth] msgsize tcp smcr smcr-use-virt-buf 1 1.65 MB/s 1.59 MB/s 1.53 MB/s (-3.88%) 2 3.32 MB/s 3.17 MB/s 3.08 MB/s (-2.67%) 4 6.66 MB/s 6.33 MB/s 6.09 MB/s (-3.85%) 8 13.67 MB/s 13.45 MB/s 11.97 MB/s (-10.99%) 16 25.36 MB/s 27.15 MB/s 24.16 MB/s (-11.01%) 32 48.22 MB/s 54.24 MB/s 49.41 MB/s (-8.89%) 64 106.79 MB/s 107.32 MB/s 99.05 MB/s (-7.71%) 128 210.21 MB/s 202.46 MB/s 201.02 MB/s (-0.71%) 256 400.81 MB/s 416.81 MB/s 393.52 MB/s (-5.59%) 512 746.49 MB/s 834.12 MB/s 809.99 MB/s (-2.89%) 1024 1292.33 MB/s 1641.96 MB/s 1571.82 MB/s (-4.27%) 2048 2007.64 MB/s 2760.44 MB/s 2717.68 MB/s (-1.55%) 4096 2665.17 MB/s 4157.44 MB/s 4070.76 MB/s (-2.09%) 8192 3159.72 MB/s 4361.57 MB/s 4270.65 MB/s (-2.08%) 16384 4186.70 MB/s 4574.13 MB/s 4501.17 MB/s (-1.60%) 32768 4093.21 MB/s 4487.42 MB/s 4322.43 MB/s (-3.68%) 65536 4057.14 MB/s 4735.61 MB/s 4555.17 MB/s (-3.81%) 2) regression in buffer initialization and destruction path, which is brought by additional MR operations of sndbufs. But thanks to link group buffer reuse mechanism, the impact of this kind of regression decreases as times of buffer reuse increases. Taking 256KB sndbuf and RMB as an example, latency of some key SMC-R buffer-related function obtained by bpftrace are as follows: Function Phys-bufs Virt-bufs smcr_new_buf_create() 67154 ns 79164 ns smc_ib_buf_map_sg() 525 ns 928 ns smc_ib_get_memory_region() 162294 ns 161191 ns smc_wr_reg_send() 9957 ns 9635 ns smc_ib_put_memory_region() 203548 ns 198374 ns smc_ib_buf_unmap_sg() 508 ns 1158 ns ------------ Test environment notes: 1. Above tests run on 2 VMs within the same Host. 2. The NIC is ConnectX-4Lx, using SRIOV and passing through 2 VFs to the each VM respectively. 3. VMs' vCPUs are binded to different physical CPUs, and the binded physical CPUs are isolated by `isolcpus=xxx` cmdline. 4. NICs' queue number are set to 1. Signed-off-by: Wen Gu <guwen@linux.alibaba.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-07-14 09:44:04 +00:00
return SMC_CLC_DECL_ERR_REGBUF;
}
return 0;
}
static void smc_find_rdma_v2_device_serv(struct smc_sock *new_smc,
struct smc_clc_msg_proposal *pclc,
struct smc_init_info *ini)
{
struct smc_clc_v2_extension *smc_v2_ext;
u8 smcr_version;
int rc;
if (!(ini->smcr_version & SMC_V2) || !smcr_indicated(ini->smc_type_v2))
goto not_found;
smc_v2_ext = smc_get_clc_v2_ext(pclc);
if (!smc_clc_match_eid(ini->negotiated_eid, smc_v2_ext, NULL, NULL))
goto not_found;
/* prepare RDMA check */
memcpy(ini->peer_systemid, pclc->lcl.id_for_peer, SMC_SYSTEMID_LEN);
memcpy(ini->peer_gid, smc_v2_ext->roce, SMC_GID_SIZE);
memcpy(ini->peer_mac, pclc->lcl.mac, ETH_ALEN);
ini->check_smcrv2 = true;
ini->smcrv2.clc_sk = new_smc->clcsock->sk;
ini->smcrv2.saddr = new_smc->clcsock->sk->sk_rcv_saddr;
ini->smcrv2.daddr = smc_ib_gid_to_ipv4(smc_v2_ext->roce);
rc = smc_find_rdma_device(new_smc, ini);
if (rc) {
smc_find_ism_store_rc(rc, ini);
goto not_found;
}
if (!ini->smcrv2.uses_gateway)
memcpy(ini->smcrv2.nexthop_mac, pclc->lcl.mac, ETH_ALEN);
smcr_version = ini->smcr_version;
ini->smcr_version = SMC_V2;
rc = smc_listen_rdma_init(new_smc, ini);
net/smc: Reset connection when trying to use SMCRv2 fails. We found a crash when using SMCRv2 with 2 Mellanox ConnectX-4. It can be reproduced by: - smc_run nginx - smc_run wrk -t 32 -c 500 -d 30 http://<ip>:<port> BUG: kernel NULL pointer dereference, address: 0000000000000014 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 8000000108713067 P4D 8000000108713067 PUD 151127067 PMD 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 4 PID: 2441 Comm: kworker/4:249 Kdump: loaded Tainted: G W E 6.4.0-rc1+ #42 Workqueue: smc_hs_wq smc_listen_work [smc] RIP: 0010:smc_clc_send_confirm_accept+0x284/0x580 [smc] RSP: 0018:ffffb8294b2d7c78 EFLAGS: 00010a06 RAX: ffff8f1873238880 RBX: ffffb8294b2d7dc8 RCX: 0000000000000000 RDX: 00000000000000b4 RSI: 0000000000000001 RDI: 0000000000b40c00 RBP: ffffb8294b2d7db8 R08: ffff8f1815c5860c R09: 0000000000000000 R10: 0000000000000400 R11: 0000000000000000 R12: ffff8f1846f56180 R13: ffff8f1815c5860c R14: 0000000000000001 R15: 0000000000000001 FS: 0000000000000000(0000) GS:ffff8f1aefd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000014 CR3: 00000001027a0001 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: <TASK> ? mlx5_ib_map_mr_sg+0xa1/0xd0 [mlx5_ib] ? smcr_buf_map_link+0x24b/0x290 [smc] ? __smc_buf_create+0x4ee/0x9b0 [smc] smc_clc_send_accept+0x4c/0xb0 [smc] smc_listen_work+0x346/0x650 [smc] ? __schedule+0x279/0x820 process_one_work+0x1e5/0x3f0 worker_thread+0x4d/0x2f0 ? __pfx_worker_thread+0x10/0x10 kthread+0xe5/0x120 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x2c/0x50 </TASK> During the CLC handshake, server sequentially tries available SMCRv2 and SMCRv1 devices in smc_listen_work(). If an SMCRv2 device is found. SMCv2 based link group and link will be assigned to the connection. Then assumed that some buffer assignment errors happen later in the CLC handshake, such as RMB registration failure, server will give up SMCRv2 and try SMCRv1 device instead. But the resources assigned to the connection won't be reset. When server tries SMCRv1 device, the connection creation process will be executed again. Since conn->lnk has been assigned when trying SMCRv2, it will not be set to the correct SMCRv1 link in smcr_lgr_conn_assign_link(). So in such situation, conn->lgr points to correct SMCRv1 link group but conn->lnk points to the SMCRv2 link mistakenly. Then in smc_clc_send_confirm_accept(), conn->rmb_desc->mr[link->link_idx] will be accessed. Since the link->link_idx is not correct, the related MR may not have been initialized, so crash happens. | Try SMCRv2 device first | |-> conn->lgr: assign existed SMCRv2 link group; | |-> conn->link: assign existed SMCRv2 link (link_idx may be 1 in SMC_LGR_SYMMETRIC); | |-> sndbuf & RMB creation fails, quit; | | Try SMCRv1 device then | |-> conn->lgr: create SMCRv1 link group and assign; | |-> conn->link: keep SMCRv2 link mistakenly; | |-> sndbuf & RMB creation succeed, only RMB->mr[link_idx = 0] | initialized. | | Then smc_clc_send_confirm_accept() accesses | conn->rmb_desc->mr[conn->link->link_idx, which is 1], then crash. v This patch tries to fix this by cleaning conn->lnk before assigning link. In addition, it is better to reset the connection and clean the resources assigned if trying SMCRv2 failed in buffer creation or registration. Fixes: e49300a6bf62 ("net/smc: add listen processing for SMC-Rv2") Link: https://lore.kernel.org/r/20220523055056.2078994-1-liuyacan@corp.netease.com/ Signed-off-by: Wen Gu <guwen@linux.alibaba.com> Reviewed-by: Tony Lu <tonylu@linux.alibaba.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2023-05-18 05:14:55 +00:00
if (!rc) {
rc = smc_listen_rdma_reg(new_smc, ini->first_contact_local);
net/smc: Reset connection when trying to use SMCRv2 fails. We found a crash when using SMCRv2 with 2 Mellanox ConnectX-4. It can be reproduced by: - smc_run nginx - smc_run wrk -t 32 -c 500 -d 30 http://<ip>:<port> BUG: kernel NULL pointer dereference, address: 0000000000000014 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 8000000108713067 P4D 8000000108713067 PUD 151127067 PMD 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 4 PID: 2441 Comm: kworker/4:249 Kdump: loaded Tainted: G W E 6.4.0-rc1+ #42 Workqueue: smc_hs_wq smc_listen_work [smc] RIP: 0010:smc_clc_send_confirm_accept+0x284/0x580 [smc] RSP: 0018:ffffb8294b2d7c78 EFLAGS: 00010a06 RAX: ffff8f1873238880 RBX: ffffb8294b2d7dc8 RCX: 0000000000000000 RDX: 00000000000000b4 RSI: 0000000000000001 RDI: 0000000000b40c00 RBP: ffffb8294b2d7db8 R08: ffff8f1815c5860c R09: 0000000000000000 R10: 0000000000000400 R11: 0000000000000000 R12: ffff8f1846f56180 R13: ffff8f1815c5860c R14: 0000000000000001 R15: 0000000000000001 FS: 0000000000000000(0000) GS:ffff8f1aefd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000014 CR3: 00000001027a0001 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: <TASK> ? mlx5_ib_map_mr_sg+0xa1/0xd0 [mlx5_ib] ? smcr_buf_map_link+0x24b/0x290 [smc] ? __smc_buf_create+0x4ee/0x9b0 [smc] smc_clc_send_accept+0x4c/0xb0 [smc] smc_listen_work+0x346/0x650 [smc] ? __schedule+0x279/0x820 process_one_work+0x1e5/0x3f0 worker_thread+0x4d/0x2f0 ? __pfx_worker_thread+0x10/0x10 kthread+0xe5/0x120 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x2c/0x50 </TASK> During the CLC handshake, server sequentially tries available SMCRv2 and SMCRv1 devices in smc_listen_work(). If an SMCRv2 device is found. SMCv2 based link group and link will be assigned to the connection. Then assumed that some buffer assignment errors happen later in the CLC handshake, such as RMB registration failure, server will give up SMCRv2 and try SMCRv1 device instead. But the resources assigned to the connection won't be reset. When server tries SMCRv1 device, the connection creation process will be executed again. Since conn->lnk has been assigned when trying SMCRv2, it will not be set to the correct SMCRv1 link in smcr_lgr_conn_assign_link(). So in such situation, conn->lgr points to correct SMCRv1 link group but conn->lnk points to the SMCRv2 link mistakenly. Then in smc_clc_send_confirm_accept(), conn->rmb_desc->mr[link->link_idx] will be accessed. Since the link->link_idx is not correct, the related MR may not have been initialized, so crash happens. | Try SMCRv2 device first | |-> conn->lgr: assign existed SMCRv2 link group; | |-> conn->link: assign existed SMCRv2 link (link_idx may be 1 in SMC_LGR_SYMMETRIC); | |-> sndbuf & RMB creation fails, quit; | | Try SMCRv1 device then | |-> conn->lgr: create SMCRv1 link group and assign; | |-> conn->link: keep SMCRv2 link mistakenly; | |-> sndbuf & RMB creation succeed, only RMB->mr[link_idx = 0] | initialized. | | Then smc_clc_send_confirm_accept() accesses | conn->rmb_desc->mr[conn->link->link_idx, which is 1], then crash. v This patch tries to fix this by cleaning conn->lnk before assigning link. In addition, it is better to reset the connection and clean the resources assigned if trying SMCRv2 failed in buffer creation or registration. Fixes: e49300a6bf62 ("net/smc: add listen processing for SMC-Rv2") Link: https://lore.kernel.org/r/20220523055056.2078994-1-liuyacan@corp.netease.com/ Signed-off-by: Wen Gu <guwen@linux.alibaba.com> Reviewed-by: Tony Lu <tonylu@linux.alibaba.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2023-05-18 05:14:55 +00:00
if (rc)
smc_conn_abort(new_smc, ini->first_contact_local);
}
if (!rc)
return;
ini->smcr_version = smcr_version;
smc_find_ism_store_rc(rc, ini);
not_found:
ini->smcr_version &= ~SMC_V2;
ini->smcrv2.ib_dev_v2 = NULL;
ini->check_smcrv2 = false;
}
static int smc_find_rdma_v1_device_serv(struct smc_sock *new_smc,
struct smc_clc_msg_proposal *pclc,
struct smc_init_info *ini)
{
int rc;
if (!(ini->smcr_version & SMC_V1) || !smcr_indicated(ini->smc_type_v1))
return SMC_CLC_DECL_NOSMCDEV;
/* prepare RDMA check */
memcpy(ini->peer_systemid, pclc->lcl.id_for_peer, SMC_SYSTEMID_LEN);
memcpy(ini->peer_gid, pclc->lcl.gid, SMC_GID_SIZE);
memcpy(ini->peer_mac, pclc->lcl.mac, ETH_ALEN);
rc = smc_find_rdma_device(new_smc, ini);
if (rc) {
/* no RDMA device found */
return SMC_CLC_DECL_NOSMCDEV;
}
rc = smc_listen_rdma_init(new_smc, ini);
if (rc)
return rc;
return smc_listen_rdma_reg(new_smc, ini->first_contact_local);
}
/* determine the local device matching to proposal */
static int smc_listen_find_device(struct smc_sock *new_smc,
struct smc_clc_msg_proposal *pclc,
struct smc_init_info *ini)
{
int prfx_rc;
/* check for ISM device matching V2 proposed device */
smc_find_ism_v2_device_serv(new_smc, pclc, ini);
if (ini->ism_dev[0])
return 0;
/* check for matching IP prefix and subnet length (V1) */
prfx_rc = smc_listen_prfx_check(new_smc, pclc);
if (prfx_rc)
smc_find_ism_store_rc(prfx_rc, ini);
/* get vlan id from IP device */
if (smc_vlan_by_tcpsk(new_smc->clcsock, ini))
return ini->rc ?: SMC_CLC_DECL_GETVLANERR;
/* check for ISM device matching V1 proposed device */
if (!prfx_rc)
smc_find_ism_v1_device_serv(new_smc, pclc, ini);
if (ini->ism_dev[0])
return 0;
if (!smcr_indicated(pclc->hdr.typev1) &&
!smcr_indicated(pclc->hdr.typev2))
/* skip RDMA and decline */
return ini->rc ?: SMC_CLC_DECL_NOSMCDDEV;
/* check if RDMA V2 is available */
smc_find_rdma_v2_device_serv(new_smc, pclc, ini);
if (ini->smcrv2.ib_dev_v2)
return 0;
/* check if RDMA V1 is available */
if (!prfx_rc) {
int rc;
rc = smc_find_rdma_v1_device_serv(new_smc, pclc, ini);
smc_find_ism_store_rc(rc, ini);
return (!rc) ? 0 : ini->rc;
}
return prfx_rc;
}
/* listen worker: finish RDMA setup */
net/smc: remove duplicate mutex_unlock For a failing smc_listen_rdma_finish() smc_listen_decline() is called. If fallback is possible, the new socket is already enqueued to be accepted in smc_listen_decline(). Avoid enqueuing a second time afterwards in this case, otherwise the smc_create_lgr_pending lock is released twice: [ 373.463976] WARNING: bad unlock balance detected! [ 373.463978] 4.18.0-rc7+ #123 Tainted: G O [ 373.463979] ------------------------------------- [ 373.463980] kworker/1:1/30 is trying to release lock (smc_create_lgr_pending) at: [ 373.463990] [<000003ff801205fc>] smc_listen_work+0x22c/0x5d0 [smc] [ 373.463991] but there are no more locks to release! [ 373.463991] other info that might help us debug this: [ 373.463993] 2 locks held by kworker/1:1/30: [ 373.463994] #0: 00000000772cbaed ((wq_completion)"events"){+.+.}, at: process_one_work+0x1ec/0x6b0 [ 373.464000] #1: 000000003ad0894a ((work_completion)(&new_smc->smc_listen_work)){+.+.}, at: process_one_work+0x1ec/0x6b0 [ 373.464003] stack backtrace: [ 373.464005] CPU: 1 PID: 30 Comm: kworker/1:1 Kdump: loaded Tainted: G O 4.18.0-rc7uschi+ #123 [ 373.464007] Hardware name: IBM 2827 H43 738 (LPAR) [ 373.464010] Workqueue: events smc_listen_work [smc] [ 373.464011] Call Trace: [ 373.464015] ([<0000000000114100>] show_stack+0x60/0xd8) [ 373.464019] [<0000000000a8c9bc>] dump_stack+0x9c/0xd8 [ 373.464021] [<00000000001dcaf8>] print_unlock_imbalance_bug+0xf8/0x108 [ 373.464022] [<00000000001e045c>] lock_release+0x114/0x4f8 [ 373.464025] [<0000000000aa87fa>] __mutex_unlock_slowpath+0x4a/0x300 [ 373.464027] [<000003ff801205fc>] smc_listen_work+0x22c/0x5d0 [smc] [ 373.464029] [<0000000000197a68>] process_one_work+0x2a8/0x6b0 [ 373.464030] [<0000000000197ec2>] worker_thread+0x52/0x410 [ 373.464033] [<000000000019fd0e>] kthread+0x15e/0x178 [ 373.464035] [<0000000000aaf58a>] kernel_thread_starter+0x6/0xc [ 373.464052] [<0000000000aaf584>] kernel_thread_starter+0x0/0xc [ 373.464054] INFO: lockdep is turned off. Signed-off-by: Ursula Braun <ubraun@linux.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2018-09-18 13:46:35 +00:00
static int smc_listen_rdma_finish(struct smc_sock *new_smc,
struct smc_clc_msg_accept_confirm *cclc,
bool local_first,
struct smc_init_info *ini)
{
struct smc_link *link = new_smc->conn.lnk;
int reason_code = 0;
if (local_first)
smc_link_save_peer_info(link, cclc, ini);
if (smc_rmb_rtoken_handling(&new_smc->conn, link, cclc))
return SMC_CLC_DECL_ERR_RTOK;
if (local_first) {
if (smc_ib_ready_link(link))
return SMC_CLC_DECL_ERR_RDYLNK;
/* QP confirmation over RoCE fabric */
smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK);
reason_code = smcr_serv_conf_first_link(new_smc);
smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl);
}
net/smc: remove duplicate mutex_unlock For a failing smc_listen_rdma_finish() smc_listen_decline() is called. If fallback is possible, the new socket is already enqueued to be accepted in smc_listen_decline(). Avoid enqueuing a second time afterwards in this case, otherwise the smc_create_lgr_pending lock is released twice: [ 373.463976] WARNING: bad unlock balance detected! [ 373.463978] 4.18.0-rc7+ #123 Tainted: G O [ 373.463979] ------------------------------------- [ 373.463980] kworker/1:1/30 is trying to release lock (smc_create_lgr_pending) at: [ 373.463990] [<000003ff801205fc>] smc_listen_work+0x22c/0x5d0 [smc] [ 373.463991] but there are no more locks to release! [ 373.463991] other info that might help us debug this: [ 373.463993] 2 locks held by kworker/1:1/30: [ 373.463994] #0: 00000000772cbaed ((wq_completion)"events"){+.+.}, at: process_one_work+0x1ec/0x6b0 [ 373.464000] #1: 000000003ad0894a ((work_completion)(&new_smc->smc_listen_work)){+.+.}, at: process_one_work+0x1ec/0x6b0 [ 373.464003] stack backtrace: [ 373.464005] CPU: 1 PID: 30 Comm: kworker/1:1 Kdump: loaded Tainted: G O 4.18.0-rc7uschi+ #123 [ 373.464007] Hardware name: IBM 2827 H43 738 (LPAR) [ 373.464010] Workqueue: events smc_listen_work [smc] [ 373.464011] Call Trace: [ 373.464015] ([<0000000000114100>] show_stack+0x60/0xd8) [ 373.464019] [<0000000000a8c9bc>] dump_stack+0x9c/0xd8 [ 373.464021] [<00000000001dcaf8>] print_unlock_imbalance_bug+0xf8/0x108 [ 373.464022] [<00000000001e045c>] lock_release+0x114/0x4f8 [ 373.464025] [<0000000000aa87fa>] __mutex_unlock_slowpath+0x4a/0x300 [ 373.464027] [<000003ff801205fc>] smc_listen_work+0x22c/0x5d0 [smc] [ 373.464029] [<0000000000197a68>] process_one_work+0x2a8/0x6b0 [ 373.464030] [<0000000000197ec2>] worker_thread+0x52/0x410 [ 373.464033] [<000000000019fd0e>] kthread+0x15e/0x178 [ 373.464035] [<0000000000aaf58a>] kernel_thread_starter+0x6/0xc [ 373.464052] [<0000000000aaf584>] kernel_thread_starter+0x0/0xc [ 373.464054] INFO: lockdep is turned off. Signed-off-by: Ursula Braun <ubraun@linux.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2018-09-18 13:46:35 +00:00
return reason_code;
}
/* setup for connection of server */
static void smc_listen_work(struct work_struct *work)
{
struct smc_sock *new_smc = container_of(work, struct smc_sock,
smc_listen_work);
struct socket *newclcsock = new_smc->clcsock;
struct smc_clc_msg_accept_confirm *cclc;
struct smc_clc_msg_proposal_area *buf;
struct smc_clc_msg_proposal *pclc;
struct smc_init_info *ini = NULL;
u8 proposal_version = SMC_V1;
u8 accept_version;
int rc = 0;
if (new_smc->listen_smc->sk.sk_state != SMC_LISTEN)
return smc_listen_out_err(new_smc);
if (new_smc->use_fallback) {
smc_listen_out_connected(new_smc);
return;
}
/* check if peer is smc capable */
if (!tcp_sk(newclcsock->sk)->syn_smc) {
rc = smc_switch_to_fallback(new_smc, SMC_CLC_DECL_PEERNOSMC);
if (rc)
smc_listen_out_err(new_smc);
else
smc_listen_out_connected(new_smc);
return;
}
/* do inband token exchange -
* wait for and receive SMC Proposal CLC message
*/
buf = kzalloc(sizeof(*buf), GFP_KERNEL);
if (!buf) {
rc = SMC_CLC_DECL_MEM;
goto out_decl;
}
pclc = (struct smc_clc_msg_proposal *)buf;
rc = smc_clc_wait_msg(new_smc, pclc, sizeof(*buf),
SMC_CLC_PROPOSAL, CLC_WAIT_TIME);
if (rc)
goto out_decl;
if (pclc->hdr.version > SMC_V1)
proposal_version = SMC_V2;
/* IPSec connections opt out of SMC optimizations */
if (using_ipsec(new_smc)) {
rc = SMC_CLC_DECL_IPSEC;
goto out_decl;
}
ini = kzalloc(sizeof(*ini), GFP_KERNEL);
if (!ini) {
rc = SMC_CLC_DECL_MEM;
goto out_decl;
}
/* initial version checking */
rc = smc_listen_v2_check(new_smc, pclc, ini);
if (rc)
goto out_decl;
rc = smc_clc_srv_v2x_features_validate(new_smc, pclc, ini);
if (rc)
goto out_decl;
mutex_lock(&smc_server_lgr_pending);
smc_close_init(new_smc);
smc_rx_init(new_smc);
smc_tx_init(new_smc);
/* determine ISM or RoCE device used for connection */
rc = smc_listen_find_device(new_smc, pclc, ini);
if (rc)
goto out_unlock;
/* send SMC Accept CLC message */
accept_version = ini->is_smcd ? ini->smcd_version : ini->smcr_version;
rc = smc_clc_send_accept(new_smc, ini->first_contact_local,
accept_version, ini->negotiated_eid, ini);
if (rc)
goto out_unlock;
/* SMC-D does not need this lock any more */
if (ini->is_smcd)
mutex_unlock(&smc_server_lgr_pending);
/* receive SMC Confirm CLC message */
memset(buf, 0, sizeof(*buf));
cclc = (struct smc_clc_msg_accept_confirm *)buf;
rc = smc_clc_wait_msg(new_smc, cclc, sizeof(*buf),
SMC_CLC_CONFIRM, CLC_WAIT_TIME);
if (rc) {
if (!ini->is_smcd)
goto out_unlock;
goto out_decl;
}
rc = smc_clc_v2x_features_confirm_check(cclc, ini);
if (rc) {
if (!ini->is_smcd)
goto out_unlock;
goto out_decl;
}
/* fce smc release version is needed in smc_listen_rdma_finish,
* so save fce info here.
*/
smc_conn_save_peer_info_fce(new_smc, cclc);
/* finish worker */
if (!ini->is_smcd) {
rc = smc_listen_rdma_finish(new_smc, cclc,
ini->first_contact_local, ini);
if (rc)
goto out_unlock;
mutex_unlock(&smc_server_lgr_pending);
net/smc: remove duplicate mutex_unlock For a failing smc_listen_rdma_finish() smc_listen_decline() is called. If fallback is possible, the new socket is already enqueued to be accepted in smc_listen_decline(). Avoid enqueuing a second time afterwards in this case, otherwise the smc_create_lgr_pending lock is released twice: [ 373.463976] WARNING: bad unlock balance detected! [ 373.463978] 4.18.0-rc7+ #123 Tainted: G O [ 373.463979] ------------------------------------- [ 373.463980] kworker/1:1/30 is trying to release lock (smc_create_lgr_pending) at: [ 373.463990] [<000003ff801205fc>] smc_listen_work+0x22c/0x5d0 [smc] [ 373.463991] but there are no more locks to release! [ 373.463991] other info that might help us debug this: [ 373.463993] 2 locks held by kworker/1:1/30: [ 373.463994] #0: 00000000772cbaed ((wq_completion)"events"){+.+.}, at: process_one_work+0x1ec/0x6b0 [ 373.464000] #1: 000000003ad0894a ((work_completion)(&new_smc->smc_listen_work)){+.+.}, at: process_one_work+0x1ec/0x6b0 [ 373.464003] stack backtrace: [ 373.464005] CPU: 1 PID: 30 Comm: kworker/1:1 Kdump: loaded Tainted: G O 4.18.0-rc7uschi+ #123 [ 373.464007] Hardware name: IBM 2827 H43 738 (LPAR) [ 373.464010] Workqueue: events smc_listen_work [smc] [ 373.464011] Call Trace: [ 373.464015] ([<0000000000114100>] show_stack+0x60/0xd8) [ 373.464019] [<0000000000a8c9bc>] dump_stack+0x9c/0xd8 [ 373.464021] [<00000000001dcaf8>] print_unlock_imbalance_bug+0xf8/0x108 [ 373.464022] [<00000000001e045c>] lock_release+0x114/0x4f8 [ 373.464025] [<0000000000aa87fa>] __mutex_unlock_slowpath+0x4a/0x300 [ 373.464027] [<000003ff801205fc>] smc_listen_work+0x22c/0x5d0 [smc] [ 373.464029] [<0000000000197a68>] process_one_work+0x2a8/0x6b0 [ 373.464030] [<0000000000197ec2>] worker_thread+0x52/0x410 [ 373.464033] [<000000000019fd0e>] kthread+0x15e/0x178 [ 373.464035] [<0000000000aaf58a>] kernel_thread_starter+0x6/0xc [ 373.464052] [<0000000000aaf584>] kernel_thread_starter+0x0/0xc [ 373.464054] INFO: lockdep is turned off. Signed-off-by: Ursula Braun <ubraun@linux.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2018-09-18 13:46:35 +00:00
}
smc_conn_save_peer_info(new_smc, cclc);
if (ini->is_smcd &&
smc_ism_support_dmb_nocopy(new_smc->conn.lgr->smcd)) {
rc = smcd_buf_attach(new_smc);
if (rc)
goto out_decl;
}
smc_listen_out_connected(new_smc);
SMC_STAT_SERV_SUCC_INC(sock_net(newclcsock->sk), ini);
goto out_free;
out_unlock:
mutex_unlock(&smc_server_lgr_pending);
out_decl:
smc_listen_decline(new_smc, rc, ini ? ini->first_contact_local : 0,
proposal_version);
out_free:
kfree(ini);
kfree(buf);
}
static void smc_tcp_listen_work(struct work_struct *work)
{
struct smc_sock *lsmc = container_of(work, struct smc_sock,
tcp_listen_work);
struct sock *lsk = &lsmc->sk;
struct smc_sock *new_smc;
int rc = 0;
lock_sock(lsk);
while (lsk->sk_state == SMC_LISTEN) {
rc = smc_clcsock_accept(lsmc, &new_smc);
if (rc) /* clcsock accept queue empty or error */
goto out;
if (!new_smc)
continue;
net/smc: Limit backlog connections Current implementation does not handling backlog semantics, one potential risk is that server will be flooded by infinite amount connections, even if client was SMC-incapable. This patch works to put a limit on backlog connections, referring to the TCP implementation, we divides SMC connections into two categories: 1. Half SMC connection, which includes all TCP established while SMC not connections. 2. Full SMC connection, which includes all SMC established connections. For half SMC connection, since all half SMC connections starts with TCP established, we can achieve our goal by put a limit before TCP established. Refer to the implementation of TCP, this limits will based on not only the half SMC connections but also the full connections, which is also a constraint on full SMC connections. For full SMC connections, although we know exactly where it starts, it's quite hard to put a limit before it. The easiest way is to block wait before receive SMC confirm CLC message, while it's under protection by smc_server_lgr_pending, a global lock, which leads this limit to the entire host instead of a single listen socket. Another way is to drop the full connections, but considering the cast of SMC connections, we prefer to keep full SMC connections. Even so, the limits of full SMC connections still exists, see commits about half SMC connection below. After this patch, the limits of backend connection shows like: For SMC: 1. Client with SMC-capability can makes 2 * backlog full SMC connections or 1 * backlog half SMC connections and 1 * backlog full SMC connections at most. 2. Client without SMC-capability can only makes 1 * backlog half TCP connections and 1 * backlog full TCP connections. Signed-off-by: D. Wythe <alibuda@linux.alibaba.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-10 09:11:35 +00:00
if (tcp_sk(new_smc->clcsock->sk)->syn_smc)
atomic_inc(&lsmc->queued_smc_hs);
new_smc->listen_smc = lsmc;
new_smc->use_fallback = lsmc->use_fallback;
new_smc->fallback_rsn = lsmc->fallback_rsn;
sock_hold(lsk); /* sock_put in smc_listen_work */
INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
smc_copy_sock_settings_to_smc(new_smc);
sock_hold(&new_smc->sk); /* sock_put in passive closing */
if (!queue_work(smc_hs_wq, &new_smc->smc_listen_work))
sock_put(&new_smc->sk);
}
out:
release_sock(lsk);
sock_put(&lsmc->sk); /* sock_hold in smc_clcsock_data_ready() */
}
static void smc_clcsock_data_ready(struct sock *listen_clcsock)
{
struct smc_sock *lsmc;
read_lock_bh(&listen_clcsock->sk_callback_lock);
lsmc = smc_clcsock_user_data(listen_clcsock);
if (!lsmc)
goto out;
lsmc->clcsk_data_ready(listen_clcsock);
if (lsmc->sk.sk_state == SMC_LISTEN) {
sock_hold(&lsmc->sk); /* sock_put in smc_tcp_listen_work() */
if (!queue_work(smc_tcp_ls_wq, &lsmc->tcp_listen_work))
sock_put(&lsmc->sk);
}
out:
read_unlock_bh(&listen_clcsock->sk_callback_lock);
}
static int smc_listen(struct socket *sock, int backlog)
{
struct sock *sk = sock->sk;
struct smc_sock *smc;
int rc;
smc = smc_sk(sk);
lock_sock(sk);
rc = -EINVAL;
if ((sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) ||
smc->connect_nonblock || sock->state != SS_UNCONNECTED)
goto out;
rc = 0;
if (sk->sk_state == SMC_LISTEN) {
sk->sk_max_ack_backlog = backlog;
goto out;
}
/* some socket options are handled in core, so we could not apply
* them to the clc socket -- copy smc socket options to clc socket
*/
smc_copy_sock_settings_to_clc(smc);
if (!smc->use_fallback)
tcp_sk(smc->clcsock->sk)->syn_smc = 1;
/* save original sk_data_ready function and establish
* smc-specific sk_data_ready function
*/
write_lock_bh(&smc->clcsock->sk->sk_callback_lock);
smc->clcsock->sk->sk_user_data =
(void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY);
smc_clcsock_replace_cb(&smc->clcsock->sk->sk_data_ready,
smc_clcsock_data_ready, &smc->clcsk_data_ready);
write_unlock_bh(&smc->clcsock->sk->sk_callback_lock);
net/smc: Limit backlog connections Current implementation does not handling backlog semantics, one potential risk is that server will be flooded by infinite amount connections, even if client was SMC-incapable. This patch works to put a limit on backlog connections, referring to the TCP implementation, we divides SMC connections into two categories: 1. Half SMC connection, which includes all TCP established while SMC not connections. 2. Full SMC connection, which includes all SMC established connections. For half SMC connection, since all half SMC connections starts with TCP established, we can achieve our goal by put a limit before TCP established. Refer to the implementation of TCP, this limits will based on not only the half SMC connections but also the full connections, which is also a constraint on full SMC connections. For full SMC connections, although we know exactly where it starts, it's quite hard to put a limit before it. The easiest way is to block wait before receive SMC confirm CLC message, while it's under protection by smc_server_lgr_pending, a global lock, which leads this limit to the entire host instead of a single listen socket. Another way is to drop the full connections, but considering the cast of SMC connections, we prefer to keep full SMC connections. Even so, the limits of full SMC connections still exists, see commits about half SMC connection below. After this patch, the limits of backend connection shows like: For SMC: 1. Client with SMC-capability can makes 2 * backlog full SMC connections or 1 * backlog half SMC connections and 1 * backlog full SMC connections at most. 2. Client without SMC-capability can only makes 1 * backlog half TCP connections and 1 * backlog full TCP connections. Signed-off-by: D. Wythe <alibuda@linux.alibaba.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-10 09:11:35 +00:00
/* save original ops */
smc->ori_af_ops = inet_csk(smc->clcsock->sk)->icsk_af_ops;
smc->af_ops = *smc->ori_af_ops;
smc->af_ops.syn_recv_sock = smc_tcp_syn_recv_sock;
inet_csk(smc->clcsock->sk)->icsk_af_ops = &smc->af_ops;
if (smc->limit_smc_hs)
tcp_sk(smc->clcsock->sk)->smc_hs_congested = smc_hs_congested;
2022-02-10 09:11:36 +00:00
rc = kernel_listen(smc->clcsock, backlog);
if (rc) {
write_lock_bh(&smc->clcsock->sk->sk_callback_lock);
smc_clcsock_restore_cb(&smc->clcsock->sk->sk_data_ready,
&smc->clcsk_data_ready);
smc->clcsock->sk->sk_user_data = NULL;
write_unlock_bh(&smc->clcsock->sk->sk_callback_lock);
goto out;
}
sk->sk_max_ack_backlog = backlog;
sk->sk_ack_backlog = 0;
sk->sk_state = SMC_LISTEN;
out:
release_sock(sk);
return rc;
}
static int smc_accept(struct socket *sock, struct socket *new_sock,
net: Work around lockdep limitation in sockets that use sockets Lockdep issues a circular dependency warning when AFS issues an operation through AF_RXRPC from a context in which the VFS/VM holds the mmap_sem. The theory lockdep comes up with is as follows: (1) If the pagefault handler decides it needs to read pages from AFS, it calls AFS with mmap_sem held and AFS begins an AF_RXRPC call, but creating a call requires the socket lock: mmap_sem must be taken before sk_lock-AF_RXRPC (2) afs_open_socket() opens an AF_RXRPC socket and binds it. rxrpc_bind() binds the underlying UDP socket whilst holding its socket lock. inet_bind() takes its own socket lock: sk_lock-AF_RXRPC must be taken before sk_lock-AF_INET (3) Reading from a TCP socket into a userspace buffer might cause a fault and thus cause the kernel to take the mmap_sem, but the TCP socket is locked whilst doing this: sk_lock-AF_INET must be taken before mmap_sem However, lockdep's theory is wrong in this instance because it deals only with lock classes and not individual locks. The AF_INET lock in (2) isn't really equivalent to the AF_INET lock in (3) as the former deals with a socket entirely internal to the kernel that never sees userspace. This is a limitation in the design of lockdep. Fix the general case by: (1) Double up all the locking keys used in sockets so that one set are used if the socket is created by userspace and the other set is used if the socket is created by the kernel. (2) Store the kern parameter passed to sk_alloc() in a variable in the sock struct (sk_kern_sock). This informs sock_lock_init(), sock_init_data() and sk_clone_lock() as to the lock keys to be used. Note that the child created by sk_clone_lock() inherits the parent's kern setting. (3) Add a 'kern' parameter to ->accept() that is analogous to the one passed in to ->create() that distinguishes whether kernel_accept() or sys_accept4() was the caller and can be passed to sk_alloc(). Note that a lot of accept functions merely dequeue an already allocated socket. I haven't touched these as the new socket already exists before we get the parameter. Note also that there are a couple of places where I've made the accepted socket unconditionally kernel-based: irda_accept() rds_rcp_accept_one() tcp_accept_from_sock() because they follow a sock_create_kern() and accept off of that. Whilst creating this, I noticed that lustre and ocfs don't create sockets through sock_create_kern() and thus they aren't marked as for-kernel, though they appear to be internal. I wonder if these should do that so that they use the new set of lock keys. Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-09 08:09:05 +00:00
int flags, bool kern)
{
struct sock *sk = sock->sk, *nsk;
DECLARE_WAITQUEUE(wait, current);
struct smc_sock *lsmc;
long timeo;
int rc = 0;
lsmc = smc_sk(sk);
sock_hold(sk); /* sock_put below */
lock_sock(sk);
if (lsmc->sk.sk_state != SMC_LISTEN) {
rc = -EINVAL;
release_sock(sk);
goto out;
}
/* Wait for an incoming connection */
timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
add_wait_queue_exclusive(sk_sleep(sk), &wait);
while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
set_current_state(TASK_INTERRUPTIBLE);
if (!timeo) {
rc = -EAGAIN;
break;
}
release_sock(sk);
timeo = schedule_timeout(timeo);
/* wakeup by sk_data_ready in smc_listen_work() */
sched_annotate_sleep();
lock_sock(sk);
if (signal_pending(current)) {
rc = sock_intr_errno(timeo);
break;
}
}
set_current_state(TASK_RUNNING);
remove_wait_queue(sk_sleep(sk), &wait);
if (!rc)
rc = sock_error(nsk);
release_sock(sk);
if (rc)
goto out;
if (lsmc->sockopt_defer_accept && !(flags & O_NONBLOCK)) {
/* wait till data arrives on the socket */
timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept *
MSEC_PER_SEC);
if (smc_sk(nsk)->use_fallback) {
struct sock *clcsk = smc_sk(nsk)->clcsock->sk;
lock_sock(clcsk);
if (skb_queue_empty(&clcsk->sk_receive_queue))
sk_wait_data(clcsk, &timeo, NULL);
release_sock(clcsk);
} else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) {
lock_sock(nsk);
smc_rx_wait(smc_sk(nsk), &timeo, smc_rx_data_available);
release_sock(nsk);
}
}
out:
sock_put(sk); /* sock_hold above */
return rc;
}
static int smc_getname(struct socket *sock, struct sockaddr *addr,
net: make getname() functions return length rather than use int* parameter Changes since v1: Added changes in these files: drivers/infiniband/hw/usnic/usnic_transport.c drivers/staging/lustre/lnet/lnet/lib-socket.c drivers/target/iscsi/iscsi_target_login.c drivers/vhost/net.c fs/dlm/lowcomms.c fs/ocfs2/cluster/tcp.c security/tomoyo/network.c Before: All these functions either return a negative error indicator, or store length of sockaddr into "int *socklen" parameter and return zero on success. "int *socklen" parameter is awkward. For example, if caller does not care, it still needs to provide on-stack storage for the value it does not need. None of the many FOO_getname() functions of various protocols ever used old value of *socklen. They always just overwrite it. This change drops this parameter, and makes all these functions, on success, return length of sockaddr. It's always >= 0 and can be differentiated from an error. Tests in callers are changed from "if (err)" to "if (err < 0)", where needed. rpc_sockname() lost "int buflen" parameter, since its only use was to be passed to kernel_getsockname() as &buflen and subsequently not used in any way. Userspace API is not changed. text data bss dec hex filename 30108430 2633624 873672 33615726 200ef6e vmlinux.before.o 30108109 2633612 873672 33615393 200ee21 vmlinux.o Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com> CC: David S. Miller <davem@davemloft.net> CC: linux-kernel@vger.kernel.org CC: netdev@vger.kernel.org CC: linux-bluetooth@vger.kernel.org CC: linux-decnet-user@lists.sourceforge.net CC: linux-wireless@vger.kernel.org CC: linux-rdma@vger.kernel.org CC: linux-sctp@vger.kernel.org CC: linux-nfs@vger.kernel.org CC: linux-x25@vger.kernel.org Signed-off-by: David S. Miller <davem@davemloft.net>
2018-02-12 19:00:20 +00:00
int peer)
{
struct smc_sock *smc;
if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
(sock->sk->sk_state != SMC_APPCLOSEWAIT1))
return -ENOTCONN;
smc = smc_sk(sock->sk);
net: make getname() functions return length rather than use int* parameter Changes since v1: Added changes in these files: drivers/infiniband/hw/usnic/usnic_transport.c drivers/staging/lustre/lnet/lnet/lib-socket.c drivers/target/iscsi/iscsi_target_login.c drivers/vhost/net.c fs/dlm/lowcomms.c fs/ocfs2/cluster/tcp.c security/tomoyo/network.c Before: All these functions either return a negative error indicator, or store length of sockaddr into "int *socklen" parameter and return zero on success. "int *socklen" parameter is awkward. For example, if caller does not care, it still needs to provide on-stack storage for the value it does not need. None of the many FOO_getname() functions of various protocols ever used old value of *socklen. They always just overwrite it. This change drops this parameter, and makes all these functions, on success, return length of sockaddr. It's always >= 0 and can be differentiated from an error. Tests in callers are changed from "if (err)" to "if (err < 0)", where needed. rpc_sockname() lost "int buflen" parameter, since its only use was to be passed to kernel_getsockname() as &buflen and subsequently not used in any way. Userspace API is not changed. text data bss dec hex filename 30108430 2633624 873672 33615726 200ef6e vmlinux.before.o 30108109 2633612 873672 33615393 200ee21 vmlinux.o Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com> CC: David S. Miller <davem@davemloft.net> CC: linux-kernel@vger.kernel.org CC: netdev@vger.kernel.org CC: linux-bluetooth@vger.kernel.org CC: linux-decnet-user@lists.sourceforge.net CC: linux-wireless@vger.kernel.org CC: linux-rdma@vger.kernel.org CC: linux-sctp@vger.kernel.org CC: linux-nfs@vger.kernel.org CC: linux-x25@vger.kernel.org Signed-off-by: David S. Miller <davem@davemloft.net>
2018-02-12 19:00:20 +00:00
return smc->clcsock->ops->getname(smc->clcsock, addr, peer);
}
static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
struct sock *sk = sock->sk;
struct smc_sock *smc;
int rc;
smc = smc_sk(sk);
lock_sock(sk);
/* SMC does not support connect with fastopen */
if (msg->msg_flags & MSG_FASTOPEN) {
/* not connected yet, fallback */
if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) {
rc = smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP);
if (rc)
goto out;
} else {
rc = -EINVAL;
goto out;
}
} else if ((sk->sk_state != SMC_ACTIVE) &&
(sk->sk_state != SMC_APPCLOSEWAIT1) &&
(sk->sk_state != SMC_INIT)) {
rc = -EPIPE;
goto out;
}
if (smc->use_fallback) {
rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
} else {
rc = smc_tx_sendmsg(smc, msg, len);
SMC_STAT_TX_PAYLOAD(smc, len, rc);
}
out:
release_sock(sk);
return rc;
}
static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
int flags)
{
struct sock *sk = sock->sk;
struct smc_sock *smc;
int rc = -ENOTCONN;
smc = smc_sk(sk);
lock_sock(sk);
if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) {
/* socket was connected before, no more data to read */
rc = 0;
goto out;
}
if ((sk->sk_state == SMC_INIT) ||
(sk->sk_state == SMC_LISTEN) ||
(sk->sk_state == SMC_CLOSED))
goto out;
if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
rc = 0;
goto out;
}
if (smc->use_fallback) {
rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
} else {
msg->msg_namelen = 0;
rc = smc_rx_recvmsg(smc, msg, NULL, len, flags);
SMC_STAT_RX_PAYLOAD(smc, rc, rc);
}
out:
release_sock(sk);
return rc;
}
static __poll_t smc_accept_poll(struct sock *parent)
{
struct smc_sock *isk = smc_sk(parent);
__poll_t mask = 0;
spin_lock(&isk->accept_q_lock);
if (!list_empty(&isk->accept_q))
mask = EPOLLIN | EPOLLRDNORM;
spin_unlock(&isk->accept_q_lock);
return mask;
}
static __poll_t smc_poll(struct file *file, struct socket *sock,
poll_table *wait)
{
struct sock *sk = sock->sk;
struct smc_sock *smc;
__poll_t mask = 0;
if (!sk)
return EPOLLNVAL;
smc = smc_sk(sock->sk);
if (smc->use_fallback) {
/* delegate to CLC child sock */
mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
sk->sk_err = smc->clcsock->sk->sk_err;
} else {
if (sk->sk_state != SMC_CLOSED)
sock_poll_wait(file, sock, wait);
if (sk->sk_err)
mask |= EPOLLERR;
if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
(sk->sk_state == SMC_CLOSED))
mask |= EPOLLHUP;
if (sk->sk_state == SMC_LISTEN) {
/* woken up by sk_data_ready in smc_listen_work() */
mask |= smc_accept_poll(sk);
} else if (smc->use_fallback) { /* as result of connect_work()*/
mask |= smc->clcsock->ops->poll(file, smc->clcsock,
wait);
sk->sk_err = smc->clcsock->sk->sk_err;
} else {
if ((sk->sk_state != SMC_INIT &&
atomic_read(&smc->conn.sndbuf_space)) ||
sk->sk_shutdown & SEND_SHUTDOWN) {
mask |= EPOLLOUT | EPOLLWRNORM;
} else {
sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
}
if (atomic_read(&smc->conn.bytes_to_rcv))
mask |= EPOLLIN | EPOLLRDNORM;
if (sk->sk_shutdown & RCV_SHUTDOWN)
mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
if (sk->sk_state == SMC_APPCLOSEWAIT1)
mask |= EPOLLIN;
if (smc->conn.urg_state == SMC_URG_VALID)
mask |= EPOLLPRI;
}
}
return mask;
}
static int smc_shutdown(struct socket *sock, int how)
{
struct sock *sk = sock->sk;
bool do_shutdown = true;
struct smc_sock *smc;
int rc = -EINVAL;
int old_state;
int rc1 = 0;
smc = smc_sk(sk);
if ((how < SHUT_RD) || (how > SHUT_RDWR))
return rc;
lock_sock(sk);
if (sock->state == SS_CONNECTING) {
if (sk->sk_state == SMC_ACTIVE)
sock->state = SS_CONNECTED;
else if (sk->sk_state == SMC_PEERCLOSEWAIT1 ||
sk->sk_state == SMC_PEERCLOSEWAIT2 ||
sk->sk_state == SMC_APPCLOSEWAIT1 ||
sk->sk_state == SMC_APPCLOSEWAIT2 ||
sk->sk_state == SMC_APPFINCLOSEWAIT)
sock->state = SS_DISCONNECTING;
}
rc = -ENOTCONN;
if ((sk->sk_state != SMC_ACTIVE) &&
(sk->sk_state != SMC_PEERCLOSEWAIT1) &&
(sk->sk_state != SMC_PEERCLOSEWAIT2) &&
(sk->sk_state != SMC_APPCLOSEWAIT1) &&
(sk->sk_state != SMC_APPCLOSEWAIT2) &&
(sk->sk_state != SMC_APPFINCLOSEWAIT))
goto out;
if (smc->use_fallback) {
rc = kernel_sock_shutdown(smc->clcsock, how);
sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
if (sk->sk_shutdown == SHUTDOWN_MASK) {
sk->sk_state = SMC_CLOSED;
sk->sk_socket->state = SS_UNCONNECTED;
sock_put(sk);
}
goto out;
}
switch (how) {
case SHUT_RDWR: /* shutdown in both directions */
old_state = sk->sk_state;
rc = smc_close_active(smc);
if (old_state == SMC_ACTIVE &&
sk->sk_state == SMC_PEERCLOSEWAIT1)
do_shutdown = false;
break;
case SHUT_WR:
rc = smc_close_shutdown_write(smc);
break;
case SHUT_RD:
rc = 0;
/* nothing more to do because peer is not involved */
break;
}
if (do_shutdown && smc->clcsock)
rc1 = kernel_sock_shutdown(smc->clcsock, how);
/* map sock_shutdown_cmd constants to sk_shutdown value range */
sk->sk_shutdown |= how + 1;
if (sk->sk_state == SMC_CLOSED)
sock->state = SS_UNCONNECTED;
else
sock->state = SS_DISCONNECTING;
out:
release_sock(sk);
return rc ? rc : rc1;
}
static int __smc_getsockopt(struct socket *sock, int level, int optname,
char __user *optval, int __user *optlen)
{
struct smc_sock *smc;
int val, len;
smc = smc_sk(sock->sk);
if (get_user(len, optlen))
return -EFAULT;
len = min_t(int, len, sizeof(int));
if (len < 0)
return -EINVAL;
switch (optname) {
case SMC_LIMIT_HS:
val = smc->limit_smc_hs;
break;
default:
return -EOPNOTSUPP;
}
if (put_user(len, optlen))
return -EFAULT;
if (copy_to_user(optval, &val, len))
return -EFAULT;
return 0;
}
static int __smc_setsockopt(struct socket *sock, int level, int optname,
sockptr_t optval, unsigned int optlen)
{
struct sock *sk = sock->sk;
struct smc_sock *smc;
int val, rc;
smc = smc_sk(sk);
lock_sock(sk);
switch (optname) {
case SMC_LIMIT_HS:
if (optlen < sizeof(int)) {
rc = -EINVAL;
break;
}
if (copy_from_sockptr(&val, optval, sizeof(int))) {
rc = -EFAULT;
break;
}
smc->limit_smc_hs = !!val;
rc = 0;
break;
default:
rc = -EOPNOTSUPP;
break;
}
release_sock(sk);
return rc;
}
static int smc_setsockopt(struct socket *sock, int level, int optname,
sockptr_t optval, unsigned int optlen)
{
struct sock *sk = sock->sk;
struct smc_sock *smc;
int val, rc;
if (level == SOL_TCP && optname == TCP_ULP)
return -EOPNOTSUPP;
else if (level == SOL_SMC)
return __smc_setsockopt(sock, level, optname, optval, optlen);
smc = smc_sk(sk);
/* generic setsockopts reaching us here always apply to the
* CLC socket
*/
mutex_lock(&smc->clcsock_release_lock);
if (!smc->clcsock) {
mutex_unlock(&smc->clcsock_release_lock);
return -EBADF;
}
if (unlikely(!smc->clcsock->ops->setsockopt))
rc = -EOPNOTSUPP;
else
rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
optval, optlen);
if (smc->clcsock->sk->sk_err) {
sk->sk_err = smc->clcsock->sk->sk_err;
sk_error_report(sk);
}
mutex_unlock(&smc->clcsock_release_lock);
if (optlen < sizeof(int))
return -EINVAL;
if (copy_from_sockptr(&val, optval, sizeof(int)))
return -EFAULT;
lock_sock(sk);
if (rc || smc->use_fallback)
goto out;
switch (optname) {
case TCP_FASTOPEN:
case TCP_FASTOPEN_CONNECT:
case TCP_FASTOPEN_KEY:
case TCP_FASTOPEN_NO_COOKIE:
/* option not supported by SMC */
if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) {
rc = smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP);
} else {
rc = -EINVAL;
}
break;
case TCP_NODELAY:
if (sk->sk_state != SMC_INIT &&
sk->sk_state != SMC_LISTEN &&
sk->sk_state != SMC_CLOSED) {
if (val) {
SMC_STAT_INC(smc, ndly_cnt);
smc_tx_pending(&smc->conn);
cancel_delayed_work(&smc->conn.tx_work);
}
}
break;
case TCP_CORK:
if (sk->sk_state != SMC_INIT &&
sk->sk_state != SMC_LISTEN &&
sk->sk_state != SMC_CLOSED) {
if (!val) {
SMC_STAT_INC(smc, cork_cnt);
smc_tx_pending(&smc->conn);
cancel_delayed_work(&smc->conn.tx_work);
}
}
break;
case TCP_DEFER_ACCEPT:
smc->sockopt_defer_accept = val;
break;
default:
break;
}
out:
release_sock(sk);
return rc;
}
static int smc_getsockopt(struct socket *sock, int level, int optname,
char __user *optval, int __user *optlen)
{
struct smc_sock *smc;
int rc;
if (level == SOL_SMC)
return __smc_getsockopt(sock, level, optname, optval, optlen);
smc = smc_sk(sock->sk);
mutex_lock(&smc->clcsock_release_lock);
if (!smc->clcsock) {
mutex_unlock(&smc->clcsock_release_lock);
return -EBADF;
}
/* socket options apply to the CLC socket */
if (unlikely(!smc->clcsock->ops->getsockopt)) {
mutex_unlock(&smc->clcsock_release_lock);
return -EOPNOTSUPP;
}
rc = smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
optval, optlen);
mutex_unlock(&smc->clcsock_release_lock);
return rc;
}
static int smc_ioctl(struct socket *sock, unsigned int cmd,
unsigned long arg)
{
union smc_host_cursor cons, urg;
struct smc_connection *conn;
struct smc_sock *smc;
int answ;
smc = smc_sk(sock->sk);
conn = &smc->conn;
lock_sock(&smc->sk);
if (smc->use_fallback) {
if (!smc->clcsock) {
release_sock(&smc->sk);
return -EBADF;
}
answ = smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
release_sock(&smc->sk);
return answ;
}
switch (cmd) {
case SIOCINQ: /* same as FIONREAD */
if (smc->sk.sk_state == SMC_LISTEN) {
release_sock(&smc->sk);
return -EINVAL;
}
if (smc->sk.sk_state == SMC_INIT ||
smc->sk.sk_state == SMC_CLOSED)
answ = 0;
else
answ = atomic_read(&smc->conn.bytes_to_rcv);
break;
case SIOCOUTQ:
/* output queue size (not send + not acked) */
if (smc->sk.sk_state == SMC_LISTEN) {
release_sock(&smc->sk);
return -EINVAL;
}
if (smc->sk.sk_state == SMC_INIT ||
smc->sk.sk_state == SMC_CLOSED)
answ = 0;
else
answ = smc->conn.sndbuf_desc->len -
atomic_read(&smc->conn.sndbuf_space);
break;
case SIOCOUTQNSD:
/* output queue size (not send only) */
if (smc->sk.sk_state == SMC_LISTEN) {
release_sock(&smc->sk);
return -EINVAL;
}
if (smc->sk.sk_state == SMC_INIT ||
smc->sk.sk_state == SMC_CLOSED)
answ = 0;
else
answ = smc_tx_prepared_sends(&smc->conn);
break;
case SIOCATMARK:
if (smc->sk.sk_state == SMC_LISTEN) {
release_sock(&smc->sk);
return -EINVAL;
}
if (smc->sk.sk_state == SMC_INIT ||
smc->sk.sk_state == SMC_CLOSED) {
answ = 0;
} else {
smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
smc_curs_copy(&urg, &conn->urg_curs, conn);
answ = smc_curs_diff(conn->rmb_desc->len,
&cons, &urg) == 1;
}
break;
default:
release_sock(&smc->sk);
return -ENOIOCTLCMD;
}
release_sock(&smc->sk);
return put_user(answ, (int __user *)arg);
}
/* Map the affected portions of the rmbe into an spd, note the number of bytes
* to splice in conn->splice_pending, and press 'go'. Delays consumer cursor
* updates till whenever a respective page has been fully processed.
* Note that subsequent recv() calls have to wait till all splice() processing
* completed.
*/
static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len,
unsigned int flags)
{
struct sock *sk = sock->sk;
struct smc_sock *smc;
int rc = -ENOTCONN;
smc = smc_sk(sk);
lock_sock(sk);
if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) {
/* socket was connected before, no more data to read */
rc = 0;
goto out;
}
if (sk->sk_state == SMC_INIT ||
sk->sk_state == SMC_LISTEN ||
sk->sk_state == SMC_CLOSED)
goto out;
if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
rc = 0;
goto out;
}
if (smc->use_fallback) {
rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
pipe, len, flags);
} else {
if (*ppos) {
rc = -ESPIPE;
goto out;
}
if (flags & SPLICE_F_NONBLOCK)
flags = MSG_DONTWAIT;
else
flags = 0;
SMC_STAT_INC(smc, splice_cnt);
rc = smc_rx_recvmsg(smc, NULL, pipe, len, flags);
}
out:
release_sock(sk);
return rc;
}
/* must look like tcp */
static const struct proto_ops smc_sock_ops = {
.family = PF_SMC,
.owner = THIS_MODULE,
.release = smc_release,
.bind = smc_bind,
.connect = smc_connect,
.socketpair = sock_no_socketpair,
.accept = smc_accept,
.getname = smc_getname,
.poll = smc_poll,
.ioctl = smc_ioctl,
.listen = smc_listen,
.shutdown = smc_shutdown,
.setsockopt = smc_setsockopt,
.getsockopt = smc_getsockopt,
.sendmsg = smc_sendmsg,
.recvmsg = smc_recvmsg,
.mmap = sock_no_mmap,
.splice_read = smc_splice_read,
};
static int __smc_create(struct net *net, struct socket *sock, int protocol,
int kern, struct socket *clcsock)
{
int family = (protocol == SMCPROTO_SMC6) ? PF_INET6 : PF_INET;
struct smc_sock *smc;
struct sock *sk;
int rc;
rc = -ESOCKTNOSUPPORT;
if (sock->type != SOCK_STREAM)
goto out;
rc = -EPROTONOSUPPORT;
if (protocol != SMCPROTO_SMC && protocol != SMCPROTO_SMC6)
goto out;
rc = -ENOBUFS;
sock->ops = &smc_sock_ops;
sock->state = SS_UNCONNECTED;
sk = smc_sock_alloc(net, sock, protocol);
if (!sk)
goto out;
/* create internal TCP socket for CLC handshake and fallback */
smc = smc_sk(sk);
smc->use_fallback = false; /* assume rdma capability first */
smc->fallback_rsn = 0;
/* default behavior from limit_smc_hs in every net namespace */
smc->limit_smc_hs = net->smc.limit_smc_hs;
rc = 0;
if (!clcsock) {
rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP,
&smc->clcsock);
if (rc) {
sk_common_release(sk);
goto out;
}
smc: Fix use-after-free in tcp_write_timer_handler(). With Eric's ref tracker, syzbot finally found a repro for use-after-free in tcp_write_timer_handler() by kernel TCP sockets. [0] If SMC creates a kernel socket in __smc_create(), the kernel socket is supposed to be freed in smc_clcsock_release() by calling sock_release() when we close() the parent SMC socket. However, at the end of smc_clcsock_release(), the kernel socket's sk_state might not be TCP_CLOSE. This means that we have not called inet_csk_destroy_sock() in __tcp_close() and have not stopped the TCP timers. The kernel socket's TCP timers can be fired later, so we need to hold a refcnt for net as we do for MPTCP subflows in mptcp_subflow_create_socket(). [0]: leaked reference. sk_alloc (./include/net/net_namespace.h:335 net/core/sock.c:2108) inet_create (net/ipv4/af_inet.c:319 net/ipv4/af_inet.c:244) __sock_create (net/socket.c:1546) smc_create (net/smc/af_smc.c:3269 net/smc/af_smc.c:3284) __sock_create (net/socket.c:1546) __sys_socket (net/socket.c:1634 net/socket.c:1618 net/socket.c:1661) __x64_sys_socket (net/socket.c:1672) do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120) ================================================================== BUG: KASAN: slab-use-after-free in tcp_write_timer_handler (net/ipv4/tcp_timer.c:378 net/ipv4/tcp_timer.c:624 net/ipv4/tcp_timer.c:594) Read of size 1 at addr ffff888052b65e0d by task syzrepro/18091 CPU: 0 PID: 18091 Comm: syzrepro Tainted: G W 6.3.0-rc4-01174-gb5d54eb5899a #7 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-1.amzn2022.0.1 04/01/2014 Call Trace: <IRQ> dump_stack_lvl (lib/dump_stack.c:107) print_report (mm/kasan/report.c:320 mm/kasan/report.c:430) kasan_report (mm/kasan/report.c:538) tcp_write_timer_handler (net/ipv4/tcp_timer.c:378 net/ipv4/tcp_timer.c:624 net/ipv4/tcp_timer.c:594) tcp_write_timer (./include/linux/spinlock.h:390 net/ipv4/tcp_timer.c:643) call_timer_fn (./arch/x86/include/asm/jump_label.h:27 ./include/linux/jump_label.h:207 ./include/trace/events/timer.h:127 kernel/time/timer.c:1701) __run_timers.part.0 (kernel/time/timer.c:1752 kernel/time/timer.c:2022) run_timer_softirq (kernel/time/timer.c:2037) __do_softirq (./arch/x86/include/asm/jump_label.h:27 ./include/linux/jump_label.h:207 ./include/trace/events/irq.h:142 kernel/softirq.c:572) __irq_exit_rcu (kernel/softirq.c:445 kernel/softirq.c:650) irq_exit_rcu (kernel/softirq.c:664) sysvec_apic_timer_interrupt (arch/x86/kernel/apic/apic.c:1107 (discriminator 14)) </IRQ> Fixes: ac7138746e14 ("smc: establish new socket family") Reported-by: syzbot+7e1e1bdb852961150198@syzkaller.appspotmail.com Link: https://lore.kernel.org/netdev/000000000000a3f51805f8bcc43a@google.com/ Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com> Reviewed-by: Tony Lu <tonylu@linux.alibaba.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2023-04-08 18:49:43 +00:00
/* smc_clcsock_release() does not wait smc->clcsock->sk's
* destruction; its sk_state might not be TCP_CLOSE after
* smc->sk is close()d, and TCP timers can be fired later,
* which need net ref.
*/
sk = smc->clcsock->sk;
__netns_tracker_free(net, &sk->ns_tracker, false);
sk->sk_net_refcnt = 1;
get_net_track(net, &sk->ns_tracker, GFP_KERNEL);
sock_inuse_add(net, 1);
} else {
smc->clcsock = clcsock;
net/smc: fix NULL pointer dereference on sock_create_kern() error path when sock_create_kern(..., a) returns an error, 'a' might not be a valid pointer, so it shouldn't be dereferenced to read a->sk->sk_sndbuf and and a->sk->sk_rcvbuf; not doing that caused the following crash: general protection fault: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 0 PID: 4254 Comm: syzkaller919713 Not tainted 4.16.0-rc1+ #18 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:smc_create+0x14e/0x300 net/smc/af_smc.c:1410 RSP: 0018:ffff8801b06afbc8 EFLAGS: 00010202 RAX: dffffc0000000000 RBX: ffff8801b63457c0 RCX: ffffffff85a3e746 RDX: 0000000000000004 RSI: 00000000ffffffff RDI: 0000000000000020 RBP: ffff8801b06afbf0 R08: 00000000000007c0 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: ffff8801b6345c08 R14: 00000000ffffffe9 R15: ffffffff8695ced0 FS: 0000000001afb880(0000) GS:ffff8801db200000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020000040 CR3: 00000001b0721004 CR4: 00000000001606f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __sock_create+0x4d4/0x850 net/socket.c:1285 sock_create net/socket.c:1325 [inline] SYSC_socketpair net/socket.c:1409 [inline] SyS_socketpair+0x1c0/0x6f0 net/socket.c:1366 do_syscall_64+0x282/0x940 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x26/0x9b RIP: 0033:0x4404b9 RSP: 002b:00007fff44ab6908 EFLAGS: 00000246 ORIG_RAX: 0000000000000035 RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00000000004404b9 RDX: 0000000000000000 RSI: 0000000000000001 RDI: 000000000000002b RBP: 00007fff44ab6910 R08: 0000000000000002 R09: 00007fff44003031 R10: 0000000020000040 R11: 0000000000000246 R12: ffffffffffffffff R13: 0000000000000006 R14: 0000000000000000 R15: 0000000000000000 Code: 48 c1 ea 03 80 3c 02 00 0f 85 b3 01 00 00 4c 8b a3 48 04 00 00 48 b8 00 00 00 00 00 fc ff df 49 8d 7c 24 20 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 82 01 00 00 4d 8b 7c 24 20 48 b8 00 00 00 00 RIP: smc_create+0x14e/0x300 net/smc/af_smc.c:1410 RSP: ffff8801b06afbc8 Fixes: cd6851f30386 smc: remote memory buffers (RMBs) Reported-and-tested-by: syzbot+aa0227369be2dcc26ebe@syzkaller.appspotmail.com Signed-off-by: Davide Caratti <dcaratti@redhat.com> Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2018-02-28 11:44:09 +00:00
}
out:
return rc;
}
static int smc_create(struct net *net, struct socket *sock, int protocol,
int kern)
{
return __smc_create(net, sock, protocol, kern, NULL);
}
static const struct net_proto_family smc_sock_family_ops = {
.family = PF_SMC,
.owner = THIS_MODULE,
.create = smc_create,
};
static int smc_ulp_init(struct sock *sk)
{
struct socket *tcp = sk->sk_socket;
struct net *net = sock_net(sk);
struct socket *smcsock;
int protocol, ret;
/* only TCP can be replaced */
if (tcp->type != SOCK_STREAM || sk->sk_protocol != IPPROTO_TCP ||
(sk->sk_family != AF_INET && sk->sk_family != AF_INET6))
return -ESOCKTNOSUPPORT;
/* don't handle wq now */
if (tcp->state != SS_UNCONNECTED || !tcp->file || tcp->wq.fasync_list)
return -ENOTCONN;
if (sk->sk_family == AF_INET)
protocol = SMCPROTO_SMC;
else
protocol = SMCPROTO_SMC6;
smcsock = sock_alloc();
if (!smcsock)
return -ENFILE;
smcsock->type = SOCK_STREAM;
__module_get(THIS_MODULE); /* tried in __tcp_ulp_find_autoload */
ret = __smc_create(net, smcsock, protocol, 1, tcp);
if (ret) {
sock_release(smcsock); /* module_put() which ops won't be NULL */
return ret;
}
/* replace tcp socket to smc */
smcsock->file = tcp->file;
smcsock->file->private_data = smcsock;
smcsock->file->f_inode = SOCK_INODE(smcsock); /* replace inode when sock_close */
smcsock->file->f_path.dentry->d_inode = SOCK_INODE(smcsock); /* dput() in __fput */
tcp->file = NULL;
return ret;
}
static void smc_ulp_clone(const struct request_sock *req, struct sock *newsk,
const gfp_t priority)
{
struct inet_connection_sock *icsk = inet_csk(newsk);
/* don't inherit ulp ops to child when listen */
icsk->icsk_ulp_ops = NULL;
}
static struct tcp_ulp_ops smc_ulp_ops __read_mostly = {
.name = "smc",
.owner = THIS_MODULE,
.init = smc_ulp_init,
.clone = smc_ulp_clone,
};
unsigned int smc_net_id;
static __net_init int smc_net_init(struct net *net)
{
net/smc: fix compile warning for smc_sysctl kernel test robot reports multiple warning for smc_sysctl: In file included from net/smc/smc_sysctl.c:17: >> net/smc/smc_sysctl.h:23:5: warning: no previous prototype \ for function 'smc_sysctl_init' [-Wmissing-prototypes] int smc_sysctl_init(void) ^ and >> WARNING: modpost: vmlinux.o(.text+0x12ced2d): Section mismatch \ in reference from the function smc_sysctl_exit() to the variable .init.data:smc_sysctl_ops The function smc_sysctl_exit() references the variable __initdata smc_sysctl_ops. This is often because smc_sysctl_exit lacks a __initdata annotation or the annotation of smc_sysctl_ops is wrong. and net/smc/smc_sysctl.c: In function 'smc_sysctl_init_net': net/smc/smc_sysctl.c:47:17: error: 'struct netns_smc' has no member named 'smc_hdr' 47 | net->smc.smc_hdr = register_net_sysctl(net, "net/smc", table); Since we don't need global sysctl initialization. To make things clean and simple, remove the global pernet_operations and smc_sysctl_{init|exit}. Call smc_sysctl_net_{init|exit} directly from smc_net_{init|exit}. Also initialized sysctl_autocorking_size if CONFIG_SYSCTL it not set, this make sure SMC autocorking is enabled by default if CONFIG_SYSCTL is not set. Fixes: 462791bbfa35 ("net/smc: add sysctl interface for SMC") Reported-by: kernel test robot <lkp@intel.com> Signed-off-by: Dust Li <dust.li@linux.alibaba.com> Tested-by: Randy Dunlap <rdunlap@infradead.org> # build-tested Signed-off-by: David S. Miller <davem@davemloft.net>
2022-03-07 01:54:24 +00:00
int rc;
rc = smc_sysctl_net_init(net);
if (rc)
return rc;
return smc_pnet_net_init(net);
}
static void __net_exit smc_net_exit(struct net *net)
{
net/smc: fix compile warning for smc_sysctl kernel test robot reports multiple warning for smc_sysctl: In file included from net/smc/smc_sysctl.c:17: >> net/smc/smc_sysctl.h:23:5: warning: no previous prototype \ for function 'smc_sysctl_init' [-Wmissing-prototypes] int smc_sysctl_init(void) ^ and >> WARNING: modpost: vmlinux.o(.text+0x12ced2d): Section mismatch \ in reference from the function smc_sysctl_exit() to the variable .init.data:smc_sysctl_ops The function smc_sysctl_exit() references the variable __initdata smc_sysctl_ops. This is often because smc_sysctl_exit lacks a __initdata annotation or the annotation of smc_sysctl_ops is wrong. and net/smc/smc_sysctl.c: In function 'smc_sysctl_init_net': net/smc/smc_sysctl.c:47:17: error: 'struct netns_smc' has no member named 'smc_hdr' 47 | net->smc.smc_hdr = register_net_sysctl(net, "net/smc", table); Since we don't need global sysctl initialization. To make things clean and simple, remove the global pernet_operations and smc_sysctl_{init|exit}. Call smc_sysctl_net_{init|exit} directly from smc_net_{init|exit}. Also initialized sysctl_autocorking_size if CONFIG_SYSCTL it not set, this make sure SMC autocorking is enabled by default if CONFIG_SYSCTL is not set. Fixes: 462791bbfa35 ("net/smc: add sysctl interface for SMC") Reported-by: kernel test robot <lkp@intel.com> Signed-off-by: Dust Li <dust.li@linux.alibaba.com> Tested-by: Randy Dunlap <rdunlap@infradead.org> # build-tested Signed-off-by: David S. Miller <davem@davemloft.net>
2022-03-07 01:54:24 +00:00
smc_sysctl_net_exit(net);
smc_pnet_net_exit(net);
}
static __net_init int smc_net_stat_init(struct net *net)
{
return smc_stats_init(net);
}
static void __net_exit smc_net_stat_exit(struct net *net)
{
smc_stats_exit(net);
}
static struct pernet_operations smc_net_ops = {
.init = smc_net_init,
.exit = smc_net_exit,
.id = &smc_net_id,
.size = sizeof(struct smc_net),
};
static struct pernet_operations smc_net_stat_ops = {
.init = smc_net_stat_init,
.exit = smc_net_stat_exit,
};
static int __init smc_init(void)
{
int rc;
rc = register_pernet_subsys(&smc_net_ops);
if (rc)
return rc;
rc = register_pernet_subsys(&smc_net_stat_ops);
if (rc)
goto out_pernet_subsys;
rc = smc_ism_init();
if (rc)
goto out_pernet_subsys_stat;
smc_clc_init();
rc = smc_nl_init();
if (rc)
goto out_ism;
rc = smc_pnet_init();
if (rc)
goto out_nl;
rc = -ENOMEM;
smc_tcp_ls_wq = alloc_workqueue("smc_tcp_ls_wq", 0, 0);
if (!smc_tcp_ls_wq)
goto out_pnet;
smc_hs_wq = alloc_workqueue("smc_hs_wq", 0, 0);
if (!smc_hs_wq)
goto out_alloc_tcp_ls_wq;
smc_close_wq = alloc_workqueue("smc_close_wq", 0, 0);
if (!smc_close_wq)
goto out_alloc_hs_wq;
rc = smc_core_init();
if (rc) {
pr_err("%s: smc_core_init fails with %d\n", __func__, rc);
goto out_alloc_wqs;
}
rc = smc_llc_init();
if (rc) {
pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
goto out_core;
}
rc = smc_cdc_init();
if (rc) {
pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
goto out_core;
}
rc = proto_register(&smc_proto, 1);
if (rc) {
pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc);
goto out_core;
}
rc = proto_register(&smc_proto6, 1);
if (rc) {
pr_err("%s: proto_register(v6) fails with %d\n", __func__, rc);
goto out_proto;
}
rc = sock_register(&smc_sock_family_ops);
if (rc) {
pr_err("%s: sock_register fails with %d\n", __func__, rc);
goto out_proto6;
}
INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);
INIT_HLIST_HEAD(&smc_v6_hashinfo.ht);
rc = smc_ib_register_client();
if (rc) {
pr_err("%s: ib_register fails with %d\n", __func__, rc);
goto out_sock;
}
rc = smc_loopback_init();
if (rc) {
pr_err("%s: smc_loopback_init fails with %d\n", __func__, rc);
goto out_ib;
}
rc = tcp_register_ulp(&smc_ulp_ops);
if (rc) {
pr_err("%s: tcp_ulp_register fails with %d\n", __func__, rc);
goto out_lo;
}
static_branch_enable(&tcp_have_smc);
return 0;
out_lo:
smc_loopback_exit();
out_ib:
smc_ib_unregister_client();
out_sock:
sock_unregister(PF_SMC);
out_proto6:
proto_unregister(&smc_proto6);
out_proto:
proto_unregister(&smc_proto);
out_core:
smc_core_exit();
out_alloc_wqs:
destroy_workqueue(smc_close_wq);
out_alloc_hs_wq:
destroy_workqueue(smc_hs_wq);
out_alloc_tcp_ls_wq:
destroy_workqueue(smc_tcp_ls_wq);
out_pnet:
smc_pnet_exit();
out_nl:
smc_nl_exit();
out_ism:
smc_clc_exit();
smc_ism_exit();
out_pernet_subsys_stat:
unregister_pernet_subsys(&smc_net_stat_ops);
out_pernet_subsys:
unregister_pernet_subsys(&smc_net_ops);
return rc;
}
static void __exit smc_exit(void)
{
static_branch_disable(&tcp_have_smc);
tcp_unregister_ulp(&smc_ulp_ops);
sock_unregister(PF_SMC);
smc_core_exit();
smc_loopback_exit();
smc_ib_unregister_client();
smc_ism_exit();
destroy_workqueue(smc_close_wq);
destroy_workqueue(smc_tcp_ls_wq);
destroy_workqueue(smc_hs_wq);
proto_unregister(&smc_proto6);
proto_unregister(&smc_proto);
smc_pnet_exit();
smc_nl_exit();
smc_clc_exit();
unregister_pernet_subsys(&smc_net_stat_ops);
unregister_pernet_subsys(&smc_net_ops);
rcu_barrier();
}
module_init(smc_init);
module_exit(smc_exit);
MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
MODULE_DESCRIPTION("smc socket address family");
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_SMC);
MODULE_ALIAS_TCP_ULP("smc");
MODULE_ALIAS_GENL_FAMILY(SMC_GENL_FAMILY_NAME);