Merge branch 'fix-ktls-with-sk_skb_verdict'

John Fastabend says:

====================
If a socket is running a BPF_SK_SKB_STREAM_VERDICT program and KTLS is
enabled, the data stream may be broken if both the TLS stream parser and
the BPF stream parser try to handle data. Fix this here by making the
KTLS stream parser run first, to ensure TLS messages are received
correctly, and only then calling the verdict program. This is analogous
to how we handle a similar conflict on the TX side.

Note, this is a fix, but it doesn't make sense to push it this late into
the bpf tree, so it targets bpf-next while keeping the Fixes tags.
====================

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
commit 7b805819c4
Alexei Starovoitov, 2020-06-01 14:16:48 -07:00
6 changed files with 296 additions and 48 deletions
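
In outline, the fixed RX path for a socket with both KTLS and a
BPF_SK_SKB_STREAM_VERDICT program looks like this (a condensed sketch
assembled from the hunks below, not a literal excerpt):

/* TCP data ready
 *   -> sk_psock_strp_data_ready()
 *        tls_sw_has_ctx_rx(sk)? yes: TLS strparser via saved_data_ready()
 *                               no:  BPF strparser via strp_data_ready()
 * tls_sw_recvmsg() decrypts one record; if sk_psock_strp_enabled(psock):
 *   -> sk_psock_tls_strp_read() runs the skb verdict program
 *        __SK_PASS     -> record is copied to userspace as usual
 *        __SK_DROP     -> consume_skb(), advance to the next record
 *        __SK_REDIRECT -> sk_psock_skb_redirect() queues the skb on the
 *                         target psock's ingress_skb list
 */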

include/linux/skmsg.h

@@ -437,4 +437,12 @@ static inline void psock_progs_drop(struct sk_psock_progs *progs)
psock_set_prog(&progs->skb_verdict, NULL);
}
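/* KTLS RX entry point (defined in net/core/skmsg.c below) plus a helper
 * that lets the TLS layer check whether a BPF stream parser is attached
 * to the socket. */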
int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb);
static inline bool sk_psock_strp_enabled(struct sk_psock *psock)
{
if (!psock)
return false;
return psock->parser.enabled;
}
#endif /* _LINUX_SKMSG_H */

include/net/tls.h

@@ -571,6 +571,15 @@ static inline bool tls_sw_has_ctx_tx(const struct sock *sk)
return !!tls_sw_ctx_tx(ctx);
}
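/* True when the socket has a sw KTLS RX context; the psock data_ready
 * hook uses this to decide whether the TLS stream parser owns the
 * receive path. */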
static inline bool tls_sw_has_ctx_rx(const struct sock *sk)
{
struct tls_context *ctx = tls_get_ctx(sk);
if (!ctx)
return false;
return !!tls_sw_ctx_rx(ctx);
}
void tls_sw_write_space(struct sock *sk, struct tls_context *ctx);
void tls_device_write_space(struct sock *sk, struct tls_context *ctx);

net/core/skmsg.c

@@ -7,6 +7,7 @@
#include <net/sock.h>
#include <net/tcp.h>
#include <net/tls.h>
static bool sk_msg_try_coalesce_ok(struct sk_msg *msg, int elem_first_coalesce)
{
@@ -682,13 +683,75 @@ static struct sk_psock *sk_psock_from_strp(struct strparser *strp)
return container_of(parser, struct sk_psock, parser);
}
-static void sk_psock_verdict_apply(struct sk_psock *psock,
-struct sk_buff *skb, int verdict)
static void sk_psock_skb_redirect(struct sk_psock *psock, struct sk_buff *skb)
{
struct sk_psock *psock_other;
struct sock *sk_other;
bool ingress;
sk_other = tcp_skb_bpf_redirect_fetch(skb);
if (unlikely(!sk_other)) {
kfree_skb(skb);
return;
}
psock_other = sk_psock(sk_other);
if (!psock_other || sock_flag(sk_other, SOCK_DEAD) ||
!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) {
kfree_skb(skb);
return;
}
ingress = tcp_skb_bpf_ingress(skb);
if ((!ingress && sock_writeable(sk_other)) ||
(ingress &&
atomic_read(&sk_other->sk_rmem_alloc) <=
sk_other->sk_rcvbuf)) {
if (!ingress)
skb_set_owner_w(skb, sk_other);
skb_queue_tail(&psock_other->ingress_skb, skb);
schedule_work(&psock_other->work);
} else {
kfree_skb(skb);
}
}
static void sk_psock_tls_verdict_apply(struct sk_psock *psock,
struct sk_buff *skb, int verdict)
{
switch (verdict) {
case __SK_REDIRECT:
sk_psock_skb_redirect(psock, skb);
break;
case __SK_PASS:
case __SK_DROP:
default:
break;
}
}
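/* Called from tls_sw_recvmsg() on each decrypted TLS record. __SK_REDIRECT
 * is applied here; __SK_PASS and __SK_DROP are reported back so the TLS
 * layer can deliver or free the skb itself. */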
int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb)
{
struct bpf_prog *prog;
int ret = __SK_PASS;
rcu_read_lock();
prog = READ_ONCE(psock->progs.skb_verdict);
if (likely(prog)) {
tcp_skb_bpf_redirect_clear(skb);
ret = sk_psock_bpf_run(psock, prog, skb);
ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
}
rcu_read_unlock();
sk_psock_tls_verdict_apply(psock, skb, ret);
return ret;
}
EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read);
static void sk_psock_verdict_apply(struct sk_psock *psock,
struct sk_buff *skb, int verdict)
{
struct sock *sk_other;
switch (verdict) {
case __SK_PASS:
sk_other = psock->sk;
@@ -707,25 +770,8 @@ static void sk_psock_verdict_apply(struct sk_psock *psock,
}
goto out_free;
case __SK_REDIRECT:
-sk_other = tcp_skb_bpf_redirect_fetch(skb);
-if (unlikely(!sk_other))
-goto out_free;
-psock_other = sk_psock(sk_other);
-if (!psock_other || sock_flag(sk_other, SOCK_DEAD) ||
-!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED))
-goto out_free;
-ingress = tcp_skb_bpf_ingress(skb);
-if ((!ingress && sock_writeable(sk_other)) ||
-(ingress &&
-atomic_read(&sk_other->sk_rmem_alloc) <=
-sk_other->sk_rcvbuf)) {
-if (!ingress)
-skb_set_owner_w(skb, sk_other);
-skb_queue_tail(&psock_other->ingress_skb, skb);
-schedule_work(&psock_other->work);
-break;
-}
-/* fall-through */
sk_psock_skb_redirect(psock, skb);
break;
case __SK_DROP:
/* fall-through */
default:
@@ -779,9 +825,13 @@ static void sk_psock_strp_data_ready(struct sock *sk)
rcu_read_lock();
psock = sk_psock(sk);
if (likely(psock)) {
-write_lock_bh(&sk->sk_callback_lock);
-strp_data_ready(&psock->parser.strp);
-write_unlock_bh(&sk->sk_callback_lock);
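/* With KTLS active the TLS stream parser must consume the TCP stream
 * first; feeding the BPF strparser directly from here would break TLS
 * record framing. The verdict program runs later, from tls_sw_recvmsg(). */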
if (tls_sw_has_ctx_rx(sk)) {
psock->parser.saved_data_ready(sk);
} else {
write_lock_bh(&sk->sk_callback_lock);
strp_data_ready(&psock->parser.strp);
write_unlock_bh(&sk->sk_callback_lock);
}
}
rcu_read_unlock();
}

net/tls/tls_sw.c

@@ -1742,6 +1742,7 @@ int tls_sw_recvmsg(struct sock *sk,
long timeo;
bool is_kvec = iov_iter_is_kvec(&msg->msg_iter);
bool is_peek = flags & MSG_PEEK;
bool bpf_strp_enabled;
int num_async = 0;
int pending;
@@ -1752,6 +1753,7 @@ int tls_sw_recvmsg(struct sock *sk,
psock = sk_psock_get(sk);
lock_sock(sk);
bpf_strp_enabled = sk_psock_strp_enabled(psock);
/* Process pending decrypted records. It must be non-zero-copy */
err = process_rx_list(ctx, msg, &control, &cmsg, 0, len, false,
@@ -1805,11 +1807,12 @@ int tls_sw_recvmsg(struct sock *sk,
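/* With a BPF stream parser attached, the record must decrypt into a
 * kernel-owned skb (no zero-copy into user pages) and synchronously, so
 * the verdict program can inspect, drop, or redirect it first. */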
if (to_decrypt <= len && !is_kvec && !is_peek &&
ctx->control == TLS_RECORD_TYPE_DATA &&
-prot->version != TLS_1_3_VERSION)
prot->version != TLS_1_3_VERSION &&
!bpf_strp_enabled)
zc = true;
/* Do not use async mode if record is non-data */
-if (ctx->control == TLS_RECORD_TYPE_DATA)
if (ctx->control == TLS_RECORD_TYPE_DATA && !bpf_strp_enabled)
async_capable = ctx->async_capable;
else
async_capable = false;
@@ -1859,6 +1862,19 @@ int tls_sw_recvmsg(struct sock *sk,
goto pick_next_record;
if (!zc) {
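/* Run the verdict program on the decrypted record. On anything but
 * __SK_PASS, mark the record consumed (full_len = 0), release recv_pkt,
 * and unpause the strparser so the next record can be fetched. */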
if (bpf_strp_enabled) {
err = sk_psock_tls_strp_read(psock, skb);
if (err != __SK_PASS) {
rxm->offset = rxm->offset + rxm->full_len;
rxm->full_len = 0;
if (err == __SK_DROP)
consume_skb(skb);
ctx->recv_pkt = NULL;
__strp_unpause(&ctx->strp);
continue;
}
}
if (rxm->full_len > len) {
retain_skb = true;
chunk = len;

tools/testing/selftests/bpf/progs/test_sockmap_kern.h

@@ -79,11 +79,18 @@ struct {
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
-__uint(max_entries, 1);
__uint(max_entries, 2);
__type(key, int);
__type(value, int);
} sock_skb_opts SEC(".maps");
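/* Map the KTLS tests attach their stream parser/verdict programs to;
 * TEST_MAP_TYPE selects sockmap or sockhash depending on the build. */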
struct {
__uint(type, TEST_MAP_TYPE);
__uint(max_entries, 20);
__uint(key_size, sizeof(int));
__uint(value_size, sizeof(int));
} tls_sock_map SEC(".maps");
SEC("sk_skb1")
int bpf_prog1(struct __sk_buff *skb)
{
@@ -118,6 +125,43 @@ int bpf_prog2(struct __sk_buff *skb)
}
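/* Verdict program for the TLS sockmap: overwrites bytes 13..16 of the
 * decrypted record with "PASS" so the receiver can verify it ran, then
 * redirects into tls_sock_map (or drops) based on sock_skb_opts[1]. */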
SEC("sk_skb3")
int bpf_prog3(struct __sk_buff *skb)
{
const int one = 1;
int err, *f, ret = SK_PASS;
void *data_end;
char *c;
err = bpf_skb_pull_data(skb, 19);
if (err)
goto tls_out;
c = (char *)(long)skb->data;
data_end = (void *)(long)skb->data_end;
if (c + 18 < data_end)
memcpy(&c[13], "PASS", 4);
f = bpf_map_lookup_elem(&sock_skb_opts, &one);
if (f && *f) {
__u64 flags = 0;
ret = 0;
flags = *f;
#ifdef SOCKMAP
return bpf_sk_redirect_map(skb, &tls_sock_map, ret, flags);
#else
return bpf_sk_redirect_hash(skb, &tls_sock_map, &ret, flags);
#endif
}
f = bpf_map_lookup_elem(&sock_skb_opts, &one);
if (f && *f)
ret = SK_DROP;
tls_out:
return ret;
}
SEC("sockops")
int bpf_sockmap(struct bpf_sock_ops *skops)
{

tools/testing/selftests/bpf/test_sockmap.c
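
For orientation, a minimal user-space sketch of the attach sequence
run_options() performs against the TLS sockmap below (the function name
and fd parameters here are illustrative; error handling is trimmed):

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <bpf/bpf.h>

/* parser_fd/verdict_fd: loaded BPF_PROG_TYPE_SK_SKB programs;
 * map_fd: fd of the tls_sock_map defined in test_sockmap_kern.h. */
static int attach_tls_sockmap(int parser_fd, int verdict_fd, int map_fd)
{
	int err;

	err = bpf_prog_attach(parser_fd, map_fd, BPF_SK_SKB_STREAM_PARSER, 0);
	if (err) {
		fprintf(stderr, "parser attach: %d (%s)\n", err, strerror(errno));
		return err;
	}
	err = bpf_prog_attach(verdict_fd, map_fd, BPF_SK_SKB_STREAM_VERDICT, 0);
	if (err)
		fprintf(stderr, "verdict attach: %d (%s)\n", err, strerror(errno));
	return err;
}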

@@ -63,8 +63,8 @@ int s1, s2, c1, c2, p1, p2;
int test_cnt;
int passed;
int failed;
-int map_fd[8];
-struct bpf_map *maps[8];
int map_fd[9];
struct bpf_map *maps[9];
int prog_fd[11];
int txmsg_pass;
@@ -79,7 +79,10 @@ int txmsg_end_push;
int txmsg_start_pop;
int txmsg_pop;
int txmsg_ingress;
-int txmsg_skb;
int txmsg_redir_skb;
int txmsg_ktls_skb;
int txmsg_ktls_skb_drop;
int txmsg_ktls_skb_redir;
int ktls;
int peek_flag;
@@ -104,7 +107,7 @@ static const struct option long_options[] = {
{"txmsg_start_pop", required_argument, NULL, 'w'},
{"txmsg_pop", required_argument, NULL, 'x'},
{"txmsg_ingress", no_argument, &txmsg_ingress, 1 },
{"txmsg_skb", no_argument, &txmsg_skb, 1 },
{"txmsg_redir_skb", no_argument, &txmsg_redir_skb, 1 },
{"ktls", no_argument, &ktls, 1 },
{"peek", no_argument, &peek_flag, 1 },
{"whitelist", required_argument, NULL, 'n' },
@@ -169,7 +172,8 @@ static void test_reset(void)
txmsg_start_push = txmsg_end_push = 0;
txmsg_pass = txmsg_drop = txmsg_redir = 0;
txmsg_apply = txmsg_cork = 0;
-txmsg_ingress = txmsg_skb = 0;
txmsg_ingress = txmsg_redir_skb = 0;
txmsg_ktls_skb = txmsg_ktls_skb_drop = txmsg_ktls_skb_redir = 0;
}
static int test_start_subtest(const struct _test *t, struct sockmap_options *o)
@@ -502,14 +506,41 @@ unwind_iov:
static int msg_verify_data(struct msghdr *msg, int size, int chunk_sz)
{
-int i, j, bytes_cnt = 0;
int i, j = 0, bytes_cnt = 0;
unsigned char k = 0;
for (i = 0; i < msg->msg_iovlen; i++) {
unsigned char *d = msg->msg_iov[i].iov_base;
-for (j = 0;
-j < msg->msg_iov[i].iov_len && size; j++) {
/* Special case test for skb ingress + ktls */
if (i == 0 && txmsg_ktls_skb) {
if (msg->msg_iov[i].iov_len < 4)
return -EIO;
if (txmsg_ktls_skb_redir) {
if (memcmp(&d[13], "PASS", 4) != 0) {
fprintf(stderr,
"detected redirect ktls_skb data error with skb ingress update @iov[%i]:%i \"%02x %02x %02x %02x\" != \"PASS\"\n", i, 0, d[13], d[14], d[15], d[16]);
return -EIO;
}
d[13] = 0;
d[14] = 1;
d[15] = 2;
d[16] = 3;
j = 13;
} else if (txmsg_ktls_skb) {
if (memcmp(d, "PASS", 4) != 0) {
fprintf(stderr,
"detected ktls_skb data error with skb ingress update @iov[%i]:%i \"%02x %02x %02x %02x\" != \"PASS\"\n", i, 0, d[0], d[1], d[2], d[3]);
return -EIO;
}
d[0] = 0;
d[1] = 1;
d[2] = 2;
d[3] = 3;
}
}
for (; j < msg->msg_iov[i].iov_len && size; j++) {
if (d[j] != k++) {
fprintf(stderr,
"detected data corruption @iov[%i]:%i %02x != %02x, %02x ?= %02x\n",
@@ -724,7 +755,7 @@ static int sendmsg_test(struct sockmap_options *opt)
rxpid = fork();
if (rxpid == 0) {
iov_buf -= (txmsg_pop - txmsg_start_pop + 1);
-if (opt->drop_expected)
if (opt->drop_expected || txmsg_ktls_skb_drop)
_exit(0);
if (!iov_buf) /* zero bytes sent case */
@@ -911,8 +942,28 @@ static int run_options(struct sockmap_options *options, int cg_fd, int test)
return err;
}
/* Attach programs to TLS sockmap */
if (txmsg_ktls_skb) {
err = bpf_prog_attach(prog_fd[0], map_fd[8],
BPF_SK_SKB_STREAM_PARSER, 0);
if (err) {
fprintf(stderr,
"ERROR: bpf_prog_attach (TLS sockmap %i->%i): %d (%s)\n",
prog_fd[0], map_fd[8], err, strerror(errno));
return err;
}
err = bpf_prog_attach(prog_fd[2], map_fd[8],
BPF_SK_SKB_STREAM_VERDICT, 0);
if (err) {
fprintf(stderr, "ERROR: bpf_prog_attach (TLS sockmap): %d (%s)\n",
err, strerror(errno));
return err;
}
}
/* Attach to cgroups */
-err = bpf_prog_attach(prog_fd[2], cg_fd, BPF_CGROUP_SOCK_OPS, 0);
err = bpf_prog_attach(prog_fd[3], cg_fd, BPF_CGROUP_SOCK_OPS, 0);
if (err) {
fprintf(stderr, "ERROR: bpf_prog_attach (groups): %d (%s)\n",
err, strerror(errno));
@@ -928,15 +979,15 @@ run:
/* Attach txmsg program to sockmap */
if (txmsg_pass)
-tx_prog_fd = prog_fd[3];
-else if (txmsg_redir)
tx_prog_fd = prog_fd[4];
-else if (txmsg_apply)
else if (txmsg_redir)
tx_prog_fd = prog_fd[5];
-else if (txmsg_cork)
else if (txmsg_apply)
tx_prog_fd = prog_fd[6];
-else if (txmsg_drop)
else if (txmsg_cork)
tx_prog_fd = prog_fd[7];
else if (txmsg_drop)
tx_prog_fd = prog_fd[8];
else
tx_prog_fd = 0;
@@ -1108,7 +1159,35 @@ run:
}
}
-if (txmsg_skb) {
if (txmsg_ktls_skb) {
int ingress = BPF_F_INGRESS;
i = 0;
err = bpf_map_update_elem(map_fd[8], &i, &p2, BPF_ANY);
if (err) {
fprintf(stderr,
"ERROR: bpf_map_update_elem (c1 sockmap): %d (%s)\n",
err, strerror(errno));
}
if (txmsg_ktls_skb_redir) {
i = 1;
err = bpf_map_update_elem(map_fd[7],
&i, &ingress, BPF_ANY);
if (err) {
fprintf(stderr,
"ERROR: bpf_map_update_elem (txmsg_ingress): %d (%s)\n",
err, strerror(errno));
}
}
if (txmsg_ktls_skb_drop) {
i = 1;
err = bpf_map_update_elem(map_fd[7], &i, &i, BPF_ANY);
}
}
if (txmsg_redir_skb) {
int skb_fd = (test == SENDMSG || test == SENDPAGE) ?
p2 : p1;
int ingress = BPF_F_INGRESS;
@@ -1123,8 +1202,7 @@ run:
}
i = 3;
-err = bpf_map_update_elem(map_fd[0],
-&i, &skb_fd, BPF_ANY);
err = bpf_map_update_elem(map_fd[0], &i, &skb_fd, BPF_ANY);
if (err) {
fprintf(stderr,
"ERROR: bpf_map_update_elem (c1 sockmap): %d (%s)\n",
@@ -1158,9 +1236,12 @@ run:
fprintf(stderr, "unknown test\n");
out:
/* Detach and zero all the maps */
-bpf_prog_detach2(prog_fd[2], cg_fd, BPF_CGROUP_SOCK_OPS);
bpf_prog_detach2(prog_fd[3], cg_fd, BPF_CGROUP_SOCK_OPS);
bpf_prog_detach2(prog_fd[0], map_fd[0], BPF_SK_SKB_STREAM_PARSER);
bpf_prog_detach2(prog_fd[1], map_fd[0], BPF_SK_SKB_STREAM_VERDICT);
bpf_prog_detach2(prog_fd[0], map_fd[8], BPF_SK_SKB_STREAM_PARSER);
bpf_prog_detach2(prog_fd[2], map_fd[8], BPF_SK_SKB_STREAM_VERDICT);
if (tx_prog_fd >= 0)
bpf_prog_detach2(tx_prog_fd, map_fd[1], BPF_SK_MSG_VERDICT);
@@ -1229,8 +1310,10 @@ static void test_options(char *options)
}
if (txmsg_ingress)
strncat(options, "ingress,", OPTSTRING);
-if (txmsg_skb)
-strncat(options, "skb,", OPTSTRING);
if (txmsg_redir_skb)
strncat(options, "redir_skb,", OPTSTRING);
if (txmsg_ktls_skb)
strncat(options, "ktls_skb,", OPTSTRING);
if (ktls)
strncat(options, "ktls,", OPTSTRING);
if (peek_flag)
@@ -1362,6 +1445,40 @@ static void test_txmsg_ingress_redir(int cgrp, struct sockmap_options *opt)
test_send(opt, cgrp);
}
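/* Exercises the KTLS + sk_skb verdict combination end to end: pass, drop,
 * and redirect verdicts over a KTLS socket, with data verification on. */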
static void test_txmsg_skb(int cgrp, struct sockmap_options *opt)
{
bool data = opt->data_test;
int k = ktls;
opt->data_test = true;
ktls = 1;
txmsg_pass = txmsg_drop = 0;
txmsg_ingress = txmsg_redir = 0;
txmsg_ktls_skb = 1;
txmsg_pass = 1;
/* Data verification is enabled, so make sure the iov layout is
 * what the test receiver expects, e.g. has enough bytes for the
 * test marker to be written.
 */
opt->iov_length = 100;
opt->iov_count = 1;
opt->rate = 1;
test_exec(cgrp, opt);
txmsg_ktls_skb_drop = 1;
test_exec(cgrp, opt);
txmsg_ktls_skb_drop = 0;
txmsg_ktls_skb_redir = 1;
test_exec(cgrp, opt);
opt->data_test = data;
ktls = k;
}
/* Test cork with hung data. This tests poor usage patterns where
* cork can leave data on the ring if user program is buggy and
* doesn't flush them somehow. They do take some time however
@@ -1542,11 +1659,13 @@ char *map_names[] = {
"sock_bytes",
"sock_redir_flags",
"sock_skb_opts",
"tls_sock_map",
};
int prog_attach_type[] = {
BPF_SK_SKB_STREAM_PARSER,
BPF_SK_SKB_STREAM_VERDICT,
BPF_SK_SKB_STREAM_VERDICT,
BPF_CGROUP_SOCK_OPS,
BPF_SK_MSG_VERDICT,
BPF_SK_MSG_VERDICT,
@@ -1558,6 +1677,7 @@ int prog_attach_type[] = {
};
int prog_type[] = {
BPF_PROG_TYPE_SK_SKB,
BPF_PROG_TYPE_SK_SKB,
BPF_PROG_TYPE_SK_SKB,
BPF_PROG_TYPE_SOCK_OPS,
@@ -1620,6 +1740,7 @@ struct _test test[] = {
{"txmsg test redirect", test_txmsg_redir},
{"txmsg test drop", test_txmsg_drop},
{"txmsg test ingress redirect", test_txmsg_ingress_redir},
{"txmsg test skb", test_txmsg_skb},
{"txmsg test apply", test_txmsg_apply},
{"txmsg test cork", test_txmsg_cork},
{"txmsg test hanging corks", test_txmsg_cork_hangs},