From c329ef9684de9517d82af5b4758c9e1b64a8a11a Mon Sep 17 00:00:00 2001
From: Jakub Kicinski
Date: Wed, 27 Nov 2019 12:16:39 -0800
Subject: [PATCH 01/24] net/tls: take into account that bpf_exec_tx_verdict() may free the record

bpf_exec_tx_verdict() may free the record if tls_push_record() fails,
or if the entire record got consumed by BPF. Re-check ctx->open_rec
before touching the data.

Fixes: d3b18ad31f93 ("tls: add bpf support to sk_msg handling")
Signed-off-by: Jakub Kicinski
Reviewed-by: Simon Horman
Acked-by: John Fastabend
Signed-off-by: David S. Miller
---
 net/tls/tls_sw.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index da9f9ce51e7b..70e3c0c1af50 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -984,7 +984,7 @@ alloc_encrypted:
 				num_async++;
 			else if (ret == -ENOMEM)
 				goto wait_for_memory;
-			else if (ret == -ENOSPC)
+			else if (ctx->open_rec && ret == -ENOSPC)
 				goto rollback_iter;
 			else if (ret != -EAGAIN)
 				goto send_end;
@@ -1053,11 +1053,12 @@ wait_for_memory:
 		ret = sk_stream_wait_memory(sk, &timeo);
 		if (ret) {
 trim_sgl:
-			tls_trim_both_msgs(sk, orig_size);
+			if (ctx->open_rec)
+				tls_trim_both_msgs(sk, orig_size);
 			goto send_end;
 		}
 
-		if (msg_en->sg.size < required_size)
+		if (ctx->open_rec && msg_en->sg.size < required_size)
 			goto alloc_encrypted;
 	}
 
@@ -1190,11 +1191,13 @@ wait_for_sndbuf:
 wait_for_memory:
 		ret = sk_stream_wait_memory(sk, &timeo);
 		if (ret) {
-			tls_trim_both_msgs(sk, msg_pl->sg.size);
+			if (ctx->open_rec)
+				tls_trim_both_msgs(sk, msg_pl->sg.size);
 			goto sendpage_end;
 		}
 
-		goto alloc_payload;
+		if (ctx->open_rec)
+			goto alloc_payload;
 	}
 
 	if (num_async) {

From d10523d0b3d78153ee58d19853ced26c9004c8c4 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski
Date: Wed, 27 Nov 2019 12:16:40 -0800
Subject: [PATCH 02/24] net/tls: free the record on encryption error

When tls_do_encryption() fails the SG lists are left with the SG_END
and SG_CHAIN marks in place. One could hope that once encryption fails
we will never see the record again, but that is in fact not true.
Commit d3b18ad31f93 ("tls: add bpf support to sk_msg handling") added
special handling to ENOMEM and ENOSPC errors which mean we may see the
same record re-submitted.

As suggested by John free the record, the BPF code is already doing
just that.

Reported-by: syzbot+df0d4ec12332661dd1f9@syzkaller.appspotmail.com
Fixes: d3b18ad31f93 ("tls: add bpf support to sk_msg handling")
Signed-off-by: Jakub Kicinski
Reviewed-by: Simon Horman
Acked-by: John Fastabend
Signed-off-by: David S. Miller
---
 net/tls/tls_sw.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 70e3c0c1af50..dbba51b69d21 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -771,8 +771,14 @@ static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk,
 
 	policy = !(flags & MSG_SENDPAGE_NOPOLICY);
 	psock = sk_psock_get(sk);
-	if (!psock || !policy)
-		return tls_push_record(sk, flags, record_type);
+	if (!psock || !policy) {
+		err = tls_push_record(sk, flags, record_type);
+		if (err) {
+			*copied -= sk_msg_free(sk, msg);
+			tls_free_open_rec(sk);
+		}
+		return err;
+	}
 more_data:
 	enospc = sk_msg_full(msg);
 	if (psock->eval == __SK_NONE) {

From 031097d9e079e40dce401031d1012e83d80eaf01 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski
Date: Wed, 27 Nov 2019 12:16:41 -0800
Subject: [PATCH 03/24] net: skmsg: fix TLS 1.3 crash with full sk_msg

TLS 1.3 started using the entry at the end of the SG array for
chaining-in the single byte content type entry. This mostly works:

[ E E E E E E . . ]
  ^           ^
  start       end

             E < content type
           /
[ E E E E E E C . ]
  ^           ^
  start       end

(Where E denotes a populated SG entry; C denotes a chaining entry.)

If the array is full, however, the end will point to the start:

[ E E E E E E E E ]
  ^
  start
  end

And we end up overwriting the start:

    E < content type
   /
[ C E E E E E E E ]
  ^
  start
  end

The sg array is supposed to be a circular buffer with start and end
markers pointing anywhere. In case where start > end (i.e. the circular
buffer has "wrapped") there is an extra entry reserved at the end to
chain the two halves together.

[ E E E E E E . . l ]

(Where l is the reserved entry for "looping" back to front.)

As suggested by John, let's reserve another entry for chaining SG
entries after the main circular buffer. Note that this entry has to be
pointed to by the end entry so its position is not fixed.

Examples of full messages:

[ E E E E E E E E . l ]
  ^             ^
  start         end

 <---------------.
[ E E . E E E E E E l ]
      ^ ^
    end start

Now the end will always point to an unused entry, so TLS 1.3 can always
use it.

Fixes: 130b392c6cd6 ("net: tls: Add tls 1.3 support")
Signed-off-by: Jakub Kicinski
Reviewed-by: Simon Horman
Signed-off-by: David S. Miller
---
 include/linux/skmsg.h | 28 ++++++++++++++--------------
 net/core/filter.c     |  8 ++++----
 net/core/skmsg.c      |  2 +-
 net/ipv4/tcp_bpf.c    |  2 +-
 4 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 6cb077b646a5..ef7031f8a304 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -14,6 +14,7 @@
 #include
 
 #define MAX_MSG_FRAGS			MAX_SKB_FRAGS
+#define NR_MSG_FRAG_IDS			(MAX_MSG_FRAGS + 1)
 
 enum __sk_action {
 	__SK_DROP = 0,
@@ -29,13 +30,15 @@ struct sk_msg_sg {
 	u32				size;
 	u32				copybreak;
 	unsigned long			copy;
-	/* The extra element is used for chaining the front and sections when
-	 * the list becomes partitioned (e.g. end < start). The crypto APIs
-	 * require the chaining.
+	/* The extra two elements:
+	 * 1) used for chaining the front and sections when the list becomes
+	 *    partitioned (e.g. end < start). The crypto APIs require the
+	 *    chaining;
+	 * 2) to chain tailer SG entries after the message.
 	 */
-	struct scatterlist		data[MAX_MSG_FRAGS + 1];
+	struct scatterlist		data[MAX_MSG_FRAGS + 2];
 };
-static_assert(BITS_PER_LONG >= MAX_MSG_FRAGS);
+static_assert(BITS_PER_LONG >= NR_MSG_FRAG_IDS);
 
 /* UAPI in filter.c depends on struct sk_msg_sg being first element.
*/ struct sk_msg { @@ -142,13 +145,13 @@ static inline void sk_msg_apply_bytes(struct sk_psock *psock, u32 bytes) static inline u32 sk_msg_iter_dist(u32 start, u32 end) { - return end >= start ? end - start : end + (MAX_MSG_FRAGS - start); + return end >= start ? end - start : end + (NR_MSG_FRAG_IDS - start); } #define sk_msg_iter_var_prev(var) \ do { \ if (var == 0) \ - var = MAX_MSG_FRAGS - 1; \ + var = NR_MSG_FRAG_IDS - 1; \ else \ var--; \ } while (0) @@ -156,7 +159,7 @@ static inline u32 sk_msg_iter_dist(u32 start, u32 end) #define sk_msg_iter_var_next(var) \ do { \ var++; \ - if (var == MAX_MSG_FRAGS) \ + if (var == NR_MSG_FRAG_IDS) \ var = 0; \ } while (0) @@ -173,9 +176,9 @@ static inline void sk_msg_clear_meta(struct sk_msg *msg) static inline void sk_msg_init(struct sk_msg *msg) { - BUILD_BUG_ON(ARRAY_SIZE(msg->sg.data) - 1 != MAX_MSG_FRAGS); + BUILD_BUG_ON(ARRAY_SIZE(msg->sg.data) - 1 != NR_MSG_FRAG_IDS); memset(msg, 0, sizeof(*msg)); - sg_init_marker(msg->sg.data, MAX_MSG_FRAGS); + sg_init_marker(msg->sg.data, NR_MSG_FRAG_IDS); } static inline void sk_msg_xfer(struct sk_msg *dst, struct sk_msg *src, @@ -196,14 +199,11 @@ static inline void sk_msg_xfer_full(struct sk_msg *dst, struct sk_msg *src) static inline bool sk_msg_full(const struct sk_msg *msg) { - return (msg->sg.end == msg->sg.start) && msg->sg.size; + return sk_msg_iter_dist(msg->sg.start, msg->sg.end) == MAX_MSG_FRAGS; } static inline u32 sk_msg_elem_used(const struct sk_msg *msg) { - if (sk_msg_full(msg)) - return MAX_MSG_FRAGS; - return sk_msg_iter_dist(msg->sg.start, msg->sg.end); } diff --git a/net/core/filter.c b/net/core/filter.c index b0ed048585ba..f1e703eed3d2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2299,7 +2299,7 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start, WARN_ON_ONCE(last_sge == first_sge); shift = last_sge > first_sge ? last_sge - first_sge - 1 : - MAX_SKB_FRAGS - first_sge + last_sge - 1; + NR_MSG_FRAG_IDS - first_sge + last_sge - 1; if (!shift) goto out; @@ -2308,8 +2308,8 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start, do { u32 move_from; - if (i + shift >= MAX_MSG_FRAGS) - move_from = i + shift - MAX_MSG_FRAGS; + if (i + shift >= NR_MSG_FRAG_IDS) + move_from = i + shift - NR_MSG_FRAG_IDS; else move_from = i + shift; if (move_from == msg->sg.end) @@ -2323,7 +2323,7 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start, } while (1); msg->sg.end = msg->sg.end - shift > msg->sg.end ? - msg->sg.end - shift + MAX_MSG_FRAGS : + msg->sg.end - shift + NR_MSG_FRAG_IDS : msg->sg.end - shift; out: msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset; diff --git a/net/core/skmsg.c b/net/core/skmsg.c index a469d2124f3f..ded2d5227678 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -421,7 +421,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb) copied = skb->len; msg->sg.start = 0; msg->sg.size = copied; - msg->sg.end = num_sge == MAX_MSG_FRAGS ? 
0 : num_sge; + msg->sg.end = num_sge; msg->skb = skb; sk_psock_queue_msg(psock, msg); diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 8a56e09cfb0e..e38705165ac9 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -301,7 +301,7 @@ EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir); static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg, int *copied, int flags) { - bool cork = false, enospc = msg->sg.start == msg->sg.end; + bool cork = false, enospc = sk_msg_full(msg); struct sock *sk_redir; u32 tosend, delta = 0; int ret; From 65190f77424d7b82c4aad7326c9cce6bd91a2fcc Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 27 Nov 2019 12:16:42 -0800 Subject: [PATCH 04/24] selftests/tls: add a test for fragmented messages Add a sendmsg test with very fragmented messages. This should fill up sk_msg and test the boundary conditions. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- tools/testing/selftests/net/tls.c | 60 +++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index 1c8f194d6556..46abcae47dee 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -268,6 +268,38 @@ TEST_F(tls, sendmsg_single) EXPECT_EQ(memcmp(buf, test_str, send_len), 0); } +#define MAX_FRAGS 64 +#define SEND_LEN 13 +TEST_F(tls, sendmsg_fragmented) +{ + char const *test_str = "test_sendmsg"; + char buf[SEND_LEN * MAX_FRAGS]; + struct iovec vec[MAX_FRAGS]; + struct msghdr msg; + int i, frags; + + for (frags = 1; frags <= MAX_FRAGS; frags++) { + for (i = 0; i < frags; i++) { + vec[i].iov_base = (char *)test_str; + vec[i].iov_len = SEND_LEN; + } + + memset(&msg, 0, sizeof(struct msghdr)); + msg.msg_iov = vec; + msg.msg_iovlen = frags; + + EXPECT_EQ(sendmsg(self->fd, &msg, 0), SEND_LEN * frags); + EXPECT_EQ(recv(self->cfd, buf, SEND_LEN * frags, MSG_WAITALL), + SEND_LEN * frags); + + for (i = 0; i < frags; i++) + EXPECT_EQ(memcmp(buf + SEND_LEN * i, + test_str, SEND_LEN), 0); + } +} +#undef MAX_FRAGS +#undef SEND_LEN + TEST_F(tls, sendmsg_large) { void *mem = malloc(16384); @@ -694,6 +726,34 @@ TEST_F(tls, recv_lowat) EXPECT_EQ(memcmp(send_mem, recv_mem + 10, 5), 0); } +TEST_F(tls, recv_rcvbuf) +{ + char send_mem[4096]; + char recv_mem[4096]; + int rcv_buf = 1024; + + memset(send_mem, 0x1c, sizeof(send_mem)); + + EXPECT_EQ(setsockopt(self->cfd, SOL_SOCKET, SO_RCVBUF, + &rcv_buf, sizeof(rcv_buf)), 0); + + EXPECT_EQ(send(self->fd, send_mem, 512, 0), 512); + memset(recv_mem, 0, sizeof(recv_mem)); + EXPECT_EQ(recv(self->cfd, recv_mem, sizeof(recv_mem), 0), 512); + EXPECT_EQ(memcmp(send_mem, recv_mem, 512), 0); + + if (self->notls) + return; + + EXPECT_EQ(send(self->fd, send_mem, 4096, 0), 4096); + memset(recv_mem, 0, sizeof(recv_mem)); + EXPECT_EQ(recv(self->cfd, recv_mem, sizeof(recv_mem), 0), -1); + EXPECT_EQ(errno, EMSGSIZE); + + EXPECT_EQ(recv(self->cfd, recv_mem, sizeof(recv_mem), 0), -1); + EXPECT_EQ(errno, EMSGSIZE); +} + TEST_F(tls, bidir) { char const *test_str = "test_read"; From 9e5ffed37df68d0ccfb2fdc528609e23a1e70ebe Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 27 Nov 2019 12:16:43 -0800 Subject: [PATCH 05/24] net/tls: remove the dead inplace_crypto code Looks like when BPF support was added by commit d3b18ad31f93 ("tls: add bpf support to sk_msg handling") and commit d829e9c4112b ("tls: convert to generic sk_msg interface") it broke/removed the support for in-place crypto as added by 
commit 4e6d47206c32 ("tls: Add support for inplace records encryption"). The inplace_crypto member of struct tls_rec is dead, inited to zero, and sometimes set to zero again. It used to be set to 1 when record was allocated, but the skmsg code doesn't seem to have been written with the idea of in-place crypto in mind. Since non trivial effort is required to bring the feature back and we don't really have the HW to measure the benefit just remove the left over support for now to avoid confusing readers. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/net/tls.h | 1 - net/tls/tls_sw.c | 6 +----- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/include/net/tls.h b/include/net/tls.h index 6ed91e82edd0..9d32f7ce6b31 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -100,7 +100,6 @@ struct tls_rec { struct list_head list; int tx_ready; int tx_flags; - int inplace_crypto; struct sk_msg msg_plaintext; struct sk_msg msg_encrypted; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index dbba51b69d21..5989dfe5c443 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -710,8 +710,7 @@ static int tls_push_record(struct sock *sk, int flags, } i = msg_pl->sg.start; - sg_chain(rec->sg_aead_in, 2, rec->inplace_crypto ? - &msg_en->sg.data[i] : &msg_pl->sg.data[i]); + sg_chain(rec->sg_aead_in, 2, &msg_pl->sg.data[i]); i = msg_en->sg.end; sk_msg_iter_var_prev(i); @@ -976,8 +975,6 @@ alloc_encrypted: if (ret) goto fallback_to_reg_send; - rec->inplace_crypto = 0; - num_zc++; copied += try_to_copy; @@ -1176,7 +1173,6 @@ alloc_payload: tls_ctx->pending_open_record_frags = true; if (full_record || eor || sk_msg_full(msg_pl)) { - rec->inplace_crypto = 0; ret = bpf_exec_tx_verdict(msg_pl, sk, full_record, record_type, &copied, flags); if (ret) { From c5daa6cccdc2f94aca2c9b3fa5f94e4469997293 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 27 Nov 2019 12:16:44 -0800 Subject: [PATCH 06/24] net/tls: use sg_next() to walk sg entries Partially sent record cleanup path increments an SG entry directly instead of using sg_next(). This should not be a problem today, as encrypted messages should be always allocated as arrays. But given this is a cleanup path it's easy to miss was this ever to change. Use sg_next(), and simplify the code. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. 
Miller --- include/net/tls.h | 2 +- net/tls/tls_main.c | 13 ++----------- net/tls/tls_sw.c | 3 ++- 3 files changed, 5 insertions(+), 13 deletions(-) diff --git a/include/net/tls.h b/include/net/tls.h index 9d32f7ce6b31..df630f5fc723 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -376,7 +376,7 @@ int tls_push_sg(struct sock *sk, struct tls_context *ctx, int flags); int tls_push_partial_record(struct sock *sk, struct tls_context *ctx, int flags); -bool tls_free_partial_record(struct sock *sk, struct tls_context *ctx); +void tls_free_partial_record(struct sock *sk, struct tls_context *ctx); static inline struct tls_msg *tls_msg(struct sk_buff *skb) { diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index bdca31ffe6da..b3da6c5ab999 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -209,24 +209,15 @@ int tls_push_partial_record(struct sock *sk, struct tls_context *ctx, return tls_push_sg(sk, ctx, sg, offset, flags); } -bool tls_free_partial_record(struct sock *sk, struct tls_context *ctx) +void tls_free_partial_record(struct sock *sk, struct tls_context *ctx) { struct scatterlist *sg; - sg = ctx->partially_sent_record; - if (!sg) - return false; - - while (1) { + for (sg = ctx->partially_sent_record; sg; sg = sg_next(sg)) { put_page(sg_page(sg)); sk_mem_uncharge(sk, sg->length); - - if (sg_is_last(sg)) - break; - sg++; } ctx->partially_sent_record = NULL; - return true; } static void tls_write_space(struct sock *sk) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 5989dfe5c443..2b2d0bae14a9 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2089,7 +2089,8 @@ void tls_sw_release_resources_tx(struct sock *sk) /* Free up un-sent records in tx_list. First, free * the partially sent record if any at head of tx_list. */ - if (tls_free_partial_record(sk, tls_ctx)) { + if (tls_ctx->partially_sent_record) { + tls_free_partial_record(sk, tls_ctx); rec = list_first_entry(&ctx->tx_list, struct tls_rec, list); list_del(&rec->list); From 4b67c515036313f3c3ecba3cb2babb9cbddb3f85 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 27 Nov 2019 12:16:45 -0800 Subject: [PATCH 07/24] selftests: bpf: test_sockmap: handle file creation failures gracefully test_sockmap creates a temporary file to use for sendpage. this may fail for various reasons. Handle the error rather than segfault. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. 
Miller --- tools/testing/selftests/bpf/test_sockmap.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index 3845144e2c91..8b838e91cfe5 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -332,6 +332,10 @@ static int msg_loop_sendpage(int fd, int iov_length, int cnt, int i, fp; file = fopen(".sendpage_tst.tmp", "w+"); + if (!file) { + perror("create file for sendpage"); + return 1; + } for (i = 0; i < iov_length * cnt; i++, k++) fwrite(&k, sizeof(char), 1, file); fflush(file); @@ -339,6 +343,11 @@ static int msg_loop_sendpage(int fd, int iov_length, int cnt, fclose(file); fp = open(".sendpage_tst.tmp", O_RDONLY); + if (fp < 0) { + perror("reopen file for sendpage"); + return 1; + } + clock_gettime(CLOCK_MONOTONIC, &s->start); for (i = 0; i < cnt; i++) { int sent = sendfile(fd, fp, NULL, iov_length); From e5dc9dd3258098bf8b5ceb75fc3433b41eff618a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 27 Nov 2019 12:16:46 -0800 Subject: [PATCH 08/24] selftests: bpf: correct perror strings perror(str) is basically equivalent to print("%s: %s\n", str, strerror(errno)). New line or colon at the end of str is a mistake/breaks formatting. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- tools/testing/selftests/bpf/test_sockmap.c | 38 +++++++++++----------- tools/testing/selftests/bpf/xdping.c | 2 +- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index 8b838e91cfe5..4a851513c842 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -240,14 +240,14 @@ static int sockmap_init_sockets(int verbose) addr.sin_port = htons(S1_PORT); err = bind(s1, (struct sockaddr *)&addr, sizeof(addr)); if (err < 0) { - perror("bind s1 failed()\n"); + perror("bind s1 failed()"); return errno; } addr.sin_port = htons(S2_PORT); err = bind(s2, (struct sockaddr *)&addr, sizeof(addr)); if (err < 0) { - perror("bind s2 failed()\n"); + perror("bind s2 failed()"); return errno; } @@ -255,14 +255,14 @@ static int sockmap_init_sockets(int verbose) addr.sin_port = htons(S1_PORT); err = listen(s1, 32); if (err < 0) { - perror("listen s1 failed()\n"); + perror("listen s1 failed()"); return errno; } addr.sin_port = htons(S2_PORT); err = listen(s2, 32); if (err < 0) { - perror("listen s1 failed()\n"); + perror("listen s1 failed()"); return errno; } @@ -270,14 +270,14 @@ static int sockmap_init_sockets(int verbose) addr.sin_port = htons(S1_PORT); err = connect(c1, (struct sockaddr *)&addr, sizeof(addr)); if (err < 0 && errno != EINPROGRESS) { - perror("connect c1 failed()\n"); + perror("connect c1 failed()"); return errno; } addr.sin_port = htons(S2_PORT); err = connect(c2, (struct sockaddr *)&addr, sizeof(addr)); if (err < 0 && errno != EINPROGRESS) { - perror("connect c2 failed()\n"); + perror("connect c2 failed()"); return errno; } else if (err < 0) { err = 0; @@ -286,13 +286,13 @@ static int sockmap_init_sockets(int verbose) /* Accept Connecrtions */ p1 = accept(s1, NULL, NULL); if (p1 < 0) { - perror("accept s1 failed()\n"); + perror("accept s1 failed()"); return errno; } p2 = accept(s2, NULL, NULL); if (p2 < 0) { - perror("accept s1 failed()\n"); + perror("accept s1 failed()"); return errno; } @@ -353,7 +353,7 @@ static int msg_loop_sendpage(int fd, int iov_length, int cnt, int 
sent = sendfile(fd, fp, NULL, iov_length); if (!drop && sent < 0) { - perror("send loop error:"); + perror("send loop error"); close(fp); return sent; } else if (drop && sent >= 0) { @@ -472,7 +472,7 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt, int sent = sendmsg(fd, &msg, flags); if (!drop && sent < 0) { - perror("send loop error:"); + perror("send loop error"); goto out_errno; } else if (drop && sent >= 0) { printf("send loop error expected: %i\n", sent); @@ -508,7 +508,7 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt, total_bytes -= txmsg_pop_total; err = clock_gettime(CLOCK_MONOTONIC, &s->start); if (err < 0) - perror("recv start time: "); + perror("recv start time"); while (s->bytes_recvd < total_bytes) { if (txmsg_cork) { timeout.tv_sec = 0; @@ -552,7 +552,7 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt, if (recv < 0) { if (errno != EWOULDBLOCK) { clock_gettime(CLOCK_MONOTONIC, &s->end); - perror("recv failed()\n"); + perror("recv failed()"); goto out_errno; } } @@ -566,7 +566,7 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt, errno = msg_verify_data(&msg, recv, chunk_sz); if (errno) { - perror("data verify msg failed\n"); + perror("data verify msg failed"); goto out_errno; } if (recvp) { @@ -574,7 +574,7 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt, recvp, chunk_sz); if (errno) { - perror("data verify msg_peek failed\n"); + perror("data verify msg_peek failed"); goto out_errno; } } @@ -663,7 +663,7 @@ static int sendmsg_test(struct sockmap_options *opt) err = 0; exit(err ? 1 : 0); } else if (rxpid == -1) { - perror("msg_loop_rx: "); + perror("msg_loop_rx"); return errno; } @@ -690,7 +690,7 @@ static int sendmsg_test(struct sockmap_options *opt) s.bytes_recvd, recvd_Bps, recvd_Bps/giga); exit(err ? 1 : 0); } else if (txpid == -1) { - perror("msg_loop_tx: "); + perror("msg_loop_tx"); return errno; } @@ -724,7 +724,7 @@ static int forever_ping_pong(int rate, struct sockmap_options *opt) /* Ping/Pong data from client to server */ sc = send(c1, buf, sizeof(buf), 0); if (sc < 0) { - perror("send failed()\n"); + perror("send failed()"); return sc; } @@ -757,7 +757,7 @@ static int forever_ping_pong(int rate, struct sockmap_options *opt) rc = recv(i, buf, sizeof(buf), 0); if (rc < 0) { if (errno != EWOULDBLOCK) { - perror("recv failed()\n"); + perror("recv failed()"); return rc; } } @@ -769,7 +769,7 @@ static int forever_ping_pong(int rate, struct sockmap_options *opt) sc = send(i, buf, rc, 0); if (sc < 0) { - perror("send failed()\n"); + perror("send failed()"); return sc; } } diff --git a/tools/testing/selftests/bpf/xdping.c b/tools/testing/selftests/bpf/xdping.c index d60a343b1371..842d9155d36c 100644 --- a/tools/testing/selftests/bpf/xdping.c +++ b/tools/testing/selftests/bpf/xdping.c @@ -45,7 +45,7 @@ static int get_stats(int fd, __u16 count, __u32 raddr) printf("\nXDP RTT data:\n"); if (bpf_map_lookup_elem(fd, &raddr, &pinginfo)) { - perror("bpf_map_lookup elem: "); + perror("bpf_map_lookup elem"); return 1; } From 61183b056b49e2937ff92a1424291ba36a6f6d05 Mon Sep 17 00:00:00 2001 From: Chuhong Yuan Date: Thu, 28 Nov 2019 10:00:21 +0800 Subject: [PATCH 09/24] net: macb: add missed tasklet_kill This driver forgets to kill tasklet in remove. Add the call to fix it. Fixes: 032dc41ba6e2 ("net: macb: Handle HRESP error") Signed-off-by: Chuhong Yuan Signed-off-by: David S. 
Miller --- drivers/net/ethernet/cadence/macb_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index d5ae2e1e0b0e..9c767ee252ac 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -4422,6 +4422,7 @@ static int macb_remove(struct platform_device *pdev) mdiobus_free(bp->mii_bus); unregister_netdev(dev); + tasklet_kill(&bp->hresp_err_tasklet); pm_runtime_disable(&pdev->dev); pm_runtime_dont_use_autosuspend(&pdev->dev); if (!pm_runtime_suspended(&pdev->dev)) { From 2fe97a578d7bad3116a89dc8a6692a51e6fc1d9c Mon Sep 17 00:00:00 2001 From: Tung Nguyen Date: Thu, 28 Nov 2019 10:10:05 +0700 Subject: [PATCH 10/24] tipc: fix potential memory leak in __tipc_sendmsg() When initiating a connection message to a server side, the connection message is cloned and added to the socket write queue. However, if the cloning is failed, only the socket write queue is purged. It causes memory leak because the original connection message is not freed. This commit fixes it by purging the list of connection message when it cannot be cloned. Fixes: 6787927475e5 ("tipc: buffer overflow handling in listener socket") Reported-by: Hoang Le Signed-off-by: Tung Nguyen Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/socket.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index a1c8d722ca20..7baed2c2c93d 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1447,8 +1447,10 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts); if (unlikely(rc != dlen)) return rc; - if (unlikely(syn && !tipc_msg_skb_clone(&pkts, &sk->sk_write_queue))) + if (unlikely(syn && !tipc_msg_skb_clone(&pkts, &sk->sk_write_queue))) { + __skb_queue_purge(&pkts); return -ENOMEM; + } trace_tipc_sk_sendmsg(sk, skb_peek(&pkts), TIPC_DUMP_SK_SNDQ, " "); rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid); From 91a4a3eb433e4d786420c41f3c08d1d16c605962 Mon Sep 17 00:00:00 2001 From: Tung Nguyen Date: Thu, 28 Nov 2019 10:10:06 +0700 Subject: [PATCH 11/24] tipc: fix wrong socket reference counter after tipc_sk_timeout() returns When tipc_sk_timeout() is executed but user space is grabbing ownership, this function rearms itself and returns. However, the socket reference counter is not reduced. This causes potential unexpected behavior. This commit fixes it by calling sock_put() before tipc_sk_timeout() returns in the above-mentioned case. Fixes: afe8792fec69 ("tipc: refactor function tipc_sk_timeout()") Signed-off-by: Tung Nguyen Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/socket.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 7baed2c2c93d..fb5595081a05 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -2759,6 +2759,7 @@ static void tipc_sk_timeout(struct timer_list *t) if (sock_owned_by_user(sk)) { sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ / 20); bh_unlock_sock(sk); + sock_put(sk); return; } From 12db3c8083fcab4270866a88191933f2d9f24f89 Mon Sep 17 00:00:00 2001 From: Tung Nguyen Date: Thu, 28 Nov 2019 10:10:07 +0700 Subject: [PATCH 12/24] tipc: fix wrong timeout input for tipc_wait_for_cond() In function __tipc_shutdown(), the timeout value passed to tipc_wait_for_cond() is not jiffies. 
This commit fixes it by converting that value from milliseconds to jiffies. Fixes: 365ad353c256 ("tipc: reduce risk of user starvation during link congestion") Signed-off-by: Tung Nguyen Acked-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index fb5595081a05..da5fb84852a6 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -532,7 +532,7 @@ static void __tipc_shutdown(struct socket *sock, int error) struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); struct net *net = sock_net(sk); - long timeout = CONN_TIMEOUT_DEFAULT; + long timeout = msecs_to_jiffies(CONN_TIMEOUT_DEFAULT); u32 dnode = tsk_peer_node(tsk); struct sk_buff *skb; From d34910e1751be79672db9cc61ca8892fbb4763f2 Mon Sep 17 00:00:00 2001 From: Tung Nguyen Date: Thu, 28 Nov 2019 10:10:08 +0700 Subject: [PATCH 13/24] tipc: fix duplicate SYN messages under link congestion Scenario: 1. A client socket initiates a SYN message to a listening socket. 2. The send link is congested, the SYN message is put in the send link and a wakeup message is put in wakeup queue. 3. The congestion situation is abated, the wakeup message is pulled out of the wakeup queue. Function tipc_sk_push_backlog() is called to send out delayed messages by Nagle. However, the client socket is still in CONNECTING state. So, it sends the SYN message in the socket write queue to the listening socket again. 4. The listening socket receives the first SYN message and creates first server socket. The client socket receives ACK- and establishes a connection to the first server socket. The client socket closes its connection with the first server socket. 5. The listening socket receives the second SYN message and creates second server socket. The second server socket sends ACK- to the client socket, but it has been closed. It results in connection reset error when reading from the server socket in user space. Solution: return from function tipc_sk_push_backlog() immediately if there is pending SYN message in the socket write queue. Fixes: c0bceb97db9e ("tipc: add smart nagle feature") Signed-off-by: Tung Nguyen Acked-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/socket.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index da5fb84852a6..41688da233ab 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -540,12 +540,10 @@ static void __tipc_shutdown(struct socket *sock, int error) tipc_wait_for_cond(sock, &timeout, (!tsk->cong_link_cnt && !tsk_conn_cong(tsk))); - /* Push out unsent messages or remove if pending SYN */ - skb = skb_peek(&sk->sk_write_queue); - if (skb && !msg_is_syn(buf_msg(skb))) - tipc_sk_push_backlog(tsk); - else - __skb_queue_purge(&sk->sk_write_queue); + /* Push out delayed messages if in Nagle mode */ + tipc_sk_push_backlog(tsk); + /* Remove pending SYN */ + __skb_queue_purge(&sk->sk_write_queue); /* Reject all unreceived messages, except on an active connection * (which disconnects locally & sends a 'FIN+' to peer). 
@@ -1248,9 +1246,14 @@ static void tipc_sk_push_backlog(struct tipc_sock *tsk) struct sk_buff_head *txq = &tsk->sk.sk_write_queue; struct net *net = sock_net(&tsk->sk); u32 dnode = tsk_peer_node(tsk); + struct sk_buff *skb = skb_peek(txq); int rc; - if (skb_queue_empty(txq) || tsk->cong_link_cnt) + if (!skb || tsk->cong_link_cnt) + return; + + /* Do not send SYN again after congestion */ + if (msg_is_syn(buf_msg(skb))) return; tsk->snt_unacked += tsk->snd_backlog; From 2745aea6750ff0d2c48285d25bdb00e5b636ec8b Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Thu, 28 Nov 2019 15:58:06 -0300 Subject: [PATCH 14/24] selftests: pmtu: use -oneline for ip route list cache Some versions of iproute2 will output more than one line per entry, which will cause the test to fail, like: TEST: ipv6: list and flush cached exceptions [FAIL] can't list cached exceptions That happens, for example, with iproute2 4.15.0. When using the -oneline option, this will work just fine: TEST: ipv6: list and flush cached exceptions [ OK ] This also works just fine with a more recent version of iproute2, like 5.4.0. For some reason, two lines are printed for the IPv4 test no matter what version of iproute2 is used. Use the same -oneline parameter there instead of counting the lines twice. Fixes: b964641e9925 ("selftests: pmtu: Make list_flush_ipv6_exception test more demanding") Signed-off-by: Thadeu Lima de Souza Cascardo Acked-by: Stefano Brivio Signed-off-by: David S. Miller --- tools/testing/selftests/net/pmtu.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh index ab367e75f095..d697815d2785 100755 --- a/tools/testing/selftests/net/pmtu.sh +++ b/tools/testing/selftests/net/pmtu.sh @@ -1249,8 +1249,7 @@ test_list_flush_ipv4_exception() { done run_cmd ${ns_a} ping -q -M want -i 0.1 -c 2 -s 1800 "${dst2}" - # Each exception is printed as two lines - if [ "$(${ns_a} ip route list cache | wc -l)" -ne 202 ]; then + if [ "$(${ns_a} ip -oneline route list cache | wc -l)" -ne 101 ]; then err " can't list cached exceptions" fail=1 fi @@ -1300,7 +1299,7 @@ test_list_flush_ipv6_exception() { run_cmd ${ns_a} ping -q -M want -i 0.1 -w 1 -s 1800 "${dst_prefix1}${i}" done run_cmd ${ns_a} ping -q -M want -i 0.1 -w 1 -s 1800 "${dst2}" - if [ "$(${ns_a} ip -6 route list cache | wc -l)" -ne 101 ]; then + if [ "$(${ns_a} ip -oneline -6 route list cache | wc -l)" -ne 101 ]; then err " can't list cached exceptions" fail=1 fi From 5f9fc3325ef95398c363b9b7813a7e99d4d85d7d Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 27 Nov 2019 21:27:00 +0800 Subject: [PATCH 15/24] net/mlx5e: Fix build error without IPV6 If IPV6 is not set and CONFIG_MLX5_ESWITCH is y, building fails: drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c:322:5: error: redefinition of mlx5e_tc_tun_create_header_ipv6 int mlx5e_tc_tun_create_header_ipv6(struct mlx5e_priv *priv, ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In file included from drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c:7:0: drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.h:67:1: note: previous definition of mlx5e_tc_tun_create_header_ipv6 was here mlx5e_tc_tun_create_header_ipv6(struct mlx5e_priv *priv, ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Use #ifdef to guard this, also move mlx5e_route_lookup_ipv6 to cleanup unused warning. Reported-by: Hulk Robot Fixes: e689e998e102 ("net/mlx5e: TC, Stub out ipv6 tun create header function") Signed-off-by: YueHaibing Acked-by: Saeed Mahameed Signed-off-by: David S. 
Miller --- .../ethernet/mellanox/mlx5/core/en/tc_tun.c | 74 ++++++++++--------- 1 file changed, 38 insertions(+), 36 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c index 784b1e26f414..6ed87534d314 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c @@ -130,42 +130,6 @@ static const char *mlx5e_netdev_kind(struct net_device *dev) return "unknown"; } -static int mlx5e_route_lookup_ipv6(struct mlx5e_priv *priv, - struct net_device *mirred_dev, - struct net_device **out_dev, - struct net_device **route_dev, - struct flowi6 *fl6, - struct neighbour **out_n, - u8 *out_ttl) -{ - struct dst_entry *dst; - struct neighbour *n; - - int ret; - - ret = ipv6_stub->ipv6_dst_lookup(dev_net(mirred_dev), NULL, &dst, - fl6); - if (ret < 0) - return ret; - - if (!(*out_ttl)) - *out_ttl = ip6_dst_hoplimit(dst); - - ret = get_route_and_out_devs(priv, dst->dev, route_dev, out_dev); - if (ret < 0) { - dst_release(dst); - return ret; - } - - n = dst_neigh_lookup(dst, &fl6->daddr); - dst_release(dst); - if (!n) - return -ENOMEM; - - *out_n = n; - return 0; -} - static int mlx5e_gen_ip_tunnel_header(char buf[], __u8 *ip_proto, struct mlx5e_encap_entry *e) { @@ -319,6 +283,43 @@ release_neigh: return err; } +#if IS_ENABLED(CONFIG_INET) && IS_ENABLED(CONFIG_IPV6) +static int mlx5e_route_lookup_ipv6(struct mlx5e_priv *priv, + struct net_device *mirred_dev, + struct net_device **out_dev, + struct net_device **route_dev, + struct flowi6 *fl6, + struct neighbour **out_n, + u8 *out_ttl) +{ + struct dst_entry *dst; + struct neighbour *n; + + int ret; + + ret = ipv6_stub->ipv6_dst_lookup(dev_net(mirred_dev), NULL, &dst, + fl6); + if (ret < 0) + return ret; + + if (!(*out_ttl)) + *out_ttl = ip6_dst_hoplimit(dst); + + ret = get_route_and_out_devs(priv, dst->dev, route_dev, out_dev); + if (ret < 0) { + dst_release(dst); + return ret; + } + + n = dst_neigh_lookup(dst, &fl6->daddr); + dst_release(dst); + if (!n) + return -ENOMEM; + + *out_n = n; + return 0; +} + int mlx5e_tc_tun_create_header_ipv6(struct mlx5e_priv *priv, struct net_device *mirred_dev, struct mlx5e_encap_entry *e) @@ -436,6 +437,7 @@ release_neigh: neigh_release(n); return err; } +#endif bool mlx5e_tc_tun_device_to_offload(struct mlx5e_priv *priv, struct net_device *netdev) From 395eba7d0c122c650fb8b6ae8c6815cbfdc39ca9 Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Fri, 29 Nov 2019 19:58:09 +0200 Subject: [PATCH 16/24] net: ethernet: ti: ale: ensure vlan/mdb deleted when no members The recently updated ALE APIs cpsw_ale_del_mcast() and cpsw_ale_del_vlan_modify() have an issue and will not delete ALE entry even if VLAN/mcast group has no more members. Hence fix it here and delete ALE entry if !port_mask. The issue affected only new cpsw switchdev driver. Fixes: e85c14370783 ("net: ethernet: ti: ale: modify vlan/mdb api for switchdev") Signed-off-by: Grygorii Strashko Acked-by: Ilias Apalodimas Signed-off-by: David S. 
Miller --- drivers/net/ethernet/ti/cpsw_ale.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/ti/cpsw_ale.c b/drivers/net/ethernet/ti/cpsw_ale.c index 929f3d3354e3..ecdbde539eb7 100644 --- a/drivers/net/ethernet/ti/cpsw_ale.c +++ b/drivers/net/ethernet/ti/cpsw_ale.c @@ -384,7 +384,7 @@ int cpsw_ale_del_mcast(struct cpsw_ale *ale, const u8 *addr, int port_mask, int flags, u16 vid) { u32 ale_entry[ALE_ENTRY_WORDS] = {0, 0, 0}; - int mcast_members; + int mcast_members = 0; int idx; idx = cpsw_ale_match_addr(ale, addr, (flags & ALE_VLAN) ? vid : 0); @@ -397,11 +397,13 @@ int cpsw_ale_del_mcast(struct cpsw_ale *ale, const u8 *addr, int port_mask, mcast_members = cpsw_ale_get_port_mask(ale_entry, ale->port_mask_bits); mcast_members &= ~port_mask; + } + + if (mcast_members) cpsw_ale_set_port_mask(ale_entry, mcast_members, ale->port_mask_bits); - } else { + else cpsw_ale_set_entry_type(ale_entry, ALE_TYPE_FREE); - } cpsw_ale_write(ale, idx, ale_entry); return 0; @@ -478,6 +480,10 @@ static void cpsw_ale_del_vlan_modify(struct cpsw_ale *ale, u32 *ale_entry, members = cpsw_ale_get_vlan_member_list(ale_entry, ale->vlan_field_bits); members &= ~port_mask; + if (!members) { + cpsw_ale_set_entry_type(ale_entry, ALE_TYPE_FREE); + return; + } untag = cpsw_ale_get_vlan_untag_force(ale_entry, ale->vlan_field_bits); From 14e54ab9143fa60794d13ea0a66c792a2046a8f3 Mon Sep 17 00:00:00 2001 From: Dust Li Date: Thu, 28 Nov 2019 14:29:09 +0800 Subject: [PATCH 17/24] net: sched: fix `tc -s class show` no bstats on class with nolock subqueues When a classful qdisc's child qdisc has set the flag TCQ_F_CPUSTATS (pfifo_fast for example), the child qdisc's cpu_bstats should be passed to gnet_stats_copy_basic(), but many classful qdisc didn't do that. As a result, `tc -s class show dev DEV` always return 0 for bytes and packets in this case. Pass the child qdisc's cpu_bstats to gnet_stats_copy_basic() to fix this issue. The qstats also has this problem, but it has been fixed in 5dd431b6b9 ("net: sched: introduce and use qstats read...") and bstats still remains buggy. Fixes: 22e0f8b9322c ("net: sched: make bstats per cpu and estimator RCU safe") Signed-off-by: Dust Li Signed-off-by: Tony Lu Acked-by: Cong Wang Signed-off-by: David S. 
Miller --- net/sched/sch_mq.c | 3 ++- net/sched/sch_mqprio.c | 4 ++-- net/sched/sch_multiq.c | 2 +- net/sched/sch_prio.c | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index 0d578333e967..278c0b2dc523 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -245,7 +245,8 @@ static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct netdev_queue *dev_queue = mq_queue_get(sch, cl); sch = dev_queue->qdisc_sleeping; - if (gnet_stats_copy_basic(&sch->running, d, NULL, &sch->bstats) < 0 || + if (gnet_stats_copy_basic(&sch->running, d, sch->cpu_bstats, + &sch->bstats) < 0 || qdisc_qstats_copy(d, sch) < 0) return -1; return 0; diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 46980b8d66c5..0d0113a24962 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -557,8 +557,8 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl); sch = dev_queue->qdisc_sleeping; - if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), - d, NULL, &sch->bstats) < 0 || + if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, + sch->cpu_bstats, &sch->bstats) < 0 || qdisc_qstats_copy(d, sch) < 0) return -1; } diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index b2b7fdb06fc6..1330ad224931 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -339,7 +339,7 @@ static int multiq_dump_class_stats(struct Qdisc *sch, unsigned long cl, cl_q = q->queues[cl - 1]; if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), - d, NULL, &cl_q->bstats) < 0 || + d, cl_q->cpu_bstats, &cl_q->bstats) < 0 || qdisc_qstats_copy(d, cl_q) < 0) return -1; diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 0f8fedb8809a..18b884cfdfe8 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -356,7 +356,7 @@ static int prio_dump_class_stats(struct Qdisc *sch, unsigned long cl, cl_q = q->queues[cl - 1]; if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), - d, NULL, &cl_q->bstats) < 0 || + d, cl_q->cpu_bstats, &cl_q->bstats) < 0 || qdisc_qstats_copy(d, cl_q) < 0) return -1; From 408469d31e9106878d2b3e428dcca9984664dfc9 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 28 Nov 2019 13:38:57 +0100 Subject: [PATCH 18/24] selftests: forwarding: fix race between packet receive and tc check It is possible that tc stats get checked before the packet we check for actually arrived into the interface and accounted for. Fix it by checking for the expected result in a loop until timeout is reached (by default 1 second). Fixes: 07e5c75184a1 ("selftests: forwarding: Introduce tc flower matching tests") Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- .../selftests/net/forwarding/tc_common.sh | 39 +++++++++++++++---- 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/net/forwarding/tc_common.sh b/tools/testing/selftests/net/forwarding/tc_common.sh index d93589bd4d1d..64f652633585 100644 --- a/tools/testing/selftests/net/forwarding/tc_common.sh +++ b/tools/testing/selftests/net/forwarding/tc_common.sh @@ -3,16 +3,42 @@ CHECK_TC="yes" +# Can be overridden by the configuration file. 
See lib.sh +TC_HIT_TIMEOUT=${TC_HIT_TIMEOUT:=1000} # ms + +__tc_check_packets() +{ + local id=$1 + local handle=$2 + local count=$3 + local operator=$4 + + start_time="$(date -u +%s%3N)" + while true + do + cmd_jq "tc -j -s filter show $id" \ + ".[] | select(.options.handle == $handle) | \ + select(.options.actions[0].stats.packets $operator $count)" \ + &> /dev/null + ret=$? + if [[ $ret -eq 0 ]]; then + return $ret + fi + current_time="$(date -u +%s%3N)" + diff=$(expr $current_time - $start_time) + if [ "$diff" -gt "$TC_HIT_TIMEOUT" ]; then + return 1 + fi + done +} + tc_check_packets() { local id=$1 local handle=$2 local count=$3 - cmd_jq "tc -j -s filter show $id" \ - ".[] | select(.options.handle == $handle) | \ - select(.options.actions[0].stats.packets == $count)" \ - &> /dev/null + __tc_check_packets "$id" "$handle" "$count" "==" } tc_check_packets_hitting() @@ -20,8 +46,5 @@ tc_check_packets_hitting() local id=$1 local handle=$2 - cmd_jq "tc -j -s filter show $id" \ - ".[] | select(.options.handle == $handle) | \ - select(.options.actions[0].stats.packets > 0)" \ - &> /dev/null + __tc_check_packets "$id" "$handle" 0 ">" } From 9bd19c63a4bcfa0fd18f3bc0de30675fe2addea6 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 28 Nov 2019 14:17:59 -0800 Subject: [PATCH 19/24] net: emulex: benet: indent a Kconfig depends continuation line Indent a Kconfig continuation line to improve readability. Signed-off-by: Randy Dunlap Cc: Sathya Perla Cc: Ajit Khaparde Cc: Sriharsha Basavapatna Cc: Somnath Kotur Cc: netdev@vger.kernel.org Signed-off-by: David S. Miller --- drivers/net/ethernet/emulex/benet/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/emulex/benet/Kconfig b/drivers/net/ethernet/emulex/benet/Kconfig index 17d300ea9955..f51dca1526c6 100644 --- a/drivers/net/ethernet/emulex/benet/Kconfig +++ b/drivers/net/ethernet/emulex/benet/Kconfig @@ -49,4 +49,4 @@ config BE2NET_SKYHAWK comment "WARNING: be2net is useless without any enabled chip" depends on BE2NET_BE2=n && BE2NET_BE3=n && BE2NET_LANCER=n && \ - BE2NET_SKYHAWK=n && BE2NET + BE2NET_SKYHAWK=n && BE2NET From 14012c9f3bb922b9e0751ba43d15cc580a6049bf Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 1 Dec 2019 10:27:14 +0100 Subject: [PATCH 20/24] r8169: fix jumbo configuration for RTL8168evl Alan reported [0] that network is broken since the referenced commit when using jumbo frames. This commit isn't wrong, it just revealed another issue that has been existing before. According to the vendor driver the RTL8168e-specific jumbo config doesn't apply for RTL8168evl. [0] https://lkml.org/lkml/2019/11/30/119 Fixes: 4ebcb113edcc ("r8169: fix jumbo packet handling on resume from suspend") Reported-by: Alan J. Wylie Tested-by: Alan J. Wylie Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index d47a038cb8d0..78872cef32ad 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -3872,7 +3872,7 @@ static void rtl_hw_jumbo_enable(struct rtl8169_private *tp) case RTL_GIGA_MAC_VER_27 ... RTL_GIGA_MAC_VER_28: r8168dp_hw_jumbo_enable(tp); break; - case RTL_GIGA_MAC_VER_31 ... RTL_GIGA_MAC_VER_34: + case RTL_GIGA_MAC_VER_31 ... 
RTL_GIGA_MAC_VER_33: r8168e_hw_jumbo_enable(tp); break; default: From 398fd408ccfb5e44b1cbe73a209d2281d3efa83c Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 1 Dec 2019 10:39:56 +0100 Subject: [PATCH 21/24] r8169: fix resume on cable plug-in It was reported [0] that network doesn't wake up on cable plug-in with certain chip versions. Reason is that on these chip versions the PHY doesn't detect cable plug-in when being in power-down mode. So prevent the PHY from powering down if WoL is enabled. [0] https://bugzilla.kernel.org/show_bug.cgi?id=202103 Fixes: 95fb8bb3181b ("net: phy: force phy suspend when calling phy_stop") Reported-by: jhdskag3 Tested-by: jhdskag3 Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 78872cef32ad..38d212686123 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -1542,6 +1542,7 @@ static void __rtl8169_set_wol(struct rtl8169_private *tp, u32 wolopts) rtl_lock_config_regs(tp); device_set_wakeup_enable(tp_to_dev(tp), wolopts); + tp->dev->wol_enabled = wolopts ? 1 : 0; } static int rtl8169_set_wol(struct net_device *dev, struct ethtool_wolinfo *wol) From f3284e014850e45f63fbc74c7382453c876fbc35 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 1 Dec 2019 10:51:47 +0100 Subject: [PATCH 22/24] net: phy: realtek: fix using paged operations with RTL8105e / RTL8208 It was reported [0] that since the referenced commit a warning is triggered in phylib that complains about paged operations being used with a PHY driver that doesn't support this. The commit isn't wrong, just for one chip version (RTL8105e) no dedicated PHY driver exists yet. So add the missing PHY driver. [0] https://bugzilla.kernel.org/show_bug.cgi?id=202103 Fixes: 3a129e3f9ac4 ("r8169: switch to phylib functions in more places") Reported-by: jhdskag3 Tested-by: jhdskag3 Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/phy/realtek.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/net/phy/realtek.c b/drivers/net/phy/realtek.c index 677c45985338..476db5345e1a 100644 --- a/drivers/net/phy/realtek.c +++ b/drivers/net/phy/realtek.c @@ -439,6 +439,15 @@ static struct phy_driver realtek_drvs[] = { .resume = genphy_resume, .read_page = rtl821x_read_page, .write_page = rtl821x_write_page, + }, { + PHY_ID_MATCH_MODEL(0x001cc880), + .name = "RTL8208 Fast Ethernet", + .read_mmd = genphy_read_mmd_unsupported, + .write_mmd = genphy_write_mmd_unsupported, + .suspend = genphy_suspend, + .resume = genphy_resume, + .read_page = rtl821x_read_page, + .write_page = rtl821x_write_page, }, { PHY_ID_MATCH_EXACT(0x001cc910), .name = "RTL8211 Gigabit Ethernet", From 8ffeb03fbba3b599690b361467bfd2373e8c450f Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Sun, 1 Dec 2019 18:41:24 +0100 Subject: [PATCH 23/24] openvswitch: drop unneeded BUG_ON() in ovs_flow_cmd_build_info() All the callers of ovs_flow_cmd_build_info() already deal with error return code correctly, so we can handle the error condition in a more gracefull way. Still dump a warning to preserve debuggability. v1 -> v2: - clarify the commit message - clean the skb and report the error (DaveM) Fixes: ccb1352e76cf ("net: Add Open vSwitch kernel components.") Signed-off-by: Paolo Abeni Signed-off-by: David S. 
Miller --- net/openvswitch/datapath.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 293d5289c4a1..8cab3435d8da 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -905,7 +905,10 @@ static struct sk_buff *ovs_flow_cmd_build_info(const struct sw_flow *flow, retval = ovs_flow_cmd_fill_info(flow, dp_ifindex, skb, info->snd_portid, info->snd_seq, 0, cmd, ufid_flags); - BUG_ON(retval < 0); + if (WARN_ON_ONCE(retval < 0)) { + kfree_skb(skb); + skb = ERR_PTR(retval); + } return skb; } From 8a574f86652a4540a2433946ba826ccb87f398cc Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Sun, 1 Dec 2019 18:41:25 +0100 Subject: [PATCH 24/24] openvswitch: remove another BUG_ON() If we can't build the flow del notification, we can simply delete the flow, no need to crash the kernel. Still keep a WARN_ON to preserve debuggability. Note: the BUG_ON() predates the Fixes tag, but this change can be applied only after the mentioned commit. v1 -> v2: - do not leak an skb on error Fixes: aed067783e50 ("openvswitch: Minimize ovs_flow_cmd_del critical section.") Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/openvswitch/datapath.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 8cab3435d8da..1047e8043084 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1372,7 +1372,10 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) OVS_FLOW_CMD_DEL, ufid_flags); rcu_read_unlock(); - BUG_ON(err < 0); + if (WARN_ON_ONCE(err < 0)) { + kfree_skb(reply); + goto out_free; + } ovs_notify(&dp_flow_genl_family, reply, info); } else { @@ -1380,6 +1383,7 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) } } +out_free: ovs_flow_free(flow, true); return 0; unlock: