From c329ef9684de9517d82af5b4758c9e1b64a8a11a Mon Sep 17 00:00:00 2001
From: Jakub Kicinski
Date: Wed, 27 Nov 2019 12:16:39 -0800
Subject: [PATCH 01/24] net/tls: take into account that bpf_exec_tx_verdict() may free the record

bpf_exec_tx_verdict() may free the record if tls_push_record() fails,
or if the entire record got consumed by BPF. Re-check ctx->open_rec
before touching the data.

Fixes: d3b18ad31f93 ("tls: add bpf support to sk_msg handling")
Signed-off-by: Jakub Kicinski
Reviewed-by: Simon Horman
Acked-by: John Fastabend
Signed-off-by: David S. Miller
---
 net/tls/tls_sw.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index da9f9ce51e7b..70e3c0c1af50 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -984,7 +984,7 @@ alloc_encrypted:
 				num_async++;
 			else if (ret == -ENOMEM)
 				goto wait_for_memory;
-			else if (ret == -ENOSPC)
+			else if (ctx->open_rec && ret == -ENOSPC)
 				goto rollback_iter;
 			else if (ret != -EAGAIN)
 				goto send_end;
@@ -1053,11 +1053,12 @@ wait_for_memory:
 		ret = sk_stream_wait_memory(sk, &timeo);
 		if (ret) {
 trim_sgl:
-			tls_trim_both_msgs(sk, orig_size);
+			if (ctx->open_rec)
+				tls_trim_both_msgs(sk, orig_size);
 			goto send_end;
 		}
 
-		if (msg_en->sg.size < required_size)
+		if (ctx->open_rec && msg_en->sg.size < required_size)
 			goto alloc_encrypted;
 	}
 
@@ -1190,11 +1191,13 @@ wait_for_sndbuf:
 wait_for_memory:
 		ret = sk_stream_wait_memory(sk, &timeo);
 		if (ret) {
-			tls_trim_both_msgs(sk, msg_pl->sg.size);
+			if (ctx->open_rec)
+				tls_trim_both_msgs(sk, msg_pl->sg.size);
 			goto sendpage_end;
 		}
 
-		goto alloc_payload;
+		if (ctx->open_rec)
+			goto alloc_payload;
 	}
 
 	if (num_async) {

From d10523d0b3d78153ee58d19853ced26c9004c8c4 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski
Date: Wed, 27 Nov 2019 12:16:40 -0800
Subject: [PATCH 02/24] net/tls: free the record on encryption error

When tls_do_encryption() fails the SG lists are left with the SG_END
and SG_CHAIN marks in place. One could hope that once encryption fails
we will never see the record again, but that is in fact not true.
Commit d3b18ad31f93 ("tls: add bpf support to sk_msg handling") added
special handling to ENOMEM and ENOSPC errors which mean we may see the
same record re-submitted.

As suggested by John free the record, the BPF code is already doing
just that.

Reported-by: syzbot+df0d4ec12332661dd1f9@syzkaller.appspotmail.com
Fixes: d3b18ad31f93 ("tls: add bpf support to sk_msg handling")
Signed-off-by: Jakub Kicinski
Reviewed-by: Simon Horman
Acked-by: John Fastabend
Signed-off-by: David S. Miller
---
 net/tls/tls_sw.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 70e3c0c1af50..dbba51b69d21 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -771,8 +771,14 @@ static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk,
 
 	policy = !(flags & MSG_SENDPAGE_NOPOLICY);
 	psock = sk_psock_get(sk);
-	if (!psock || !policy)
-		return tls_push_record(sk, flags, record_type);
+	if (!psock || !policy) {
+		err = tls_push_record(sk, flags, record_type);
+		if (err) {
+			*copied -= sk_msg_free(sk, msg);
+			tls_free_open_rec(sk);
+		}
+		return err;
+	}
 more_data:
 	enospc = sk_msg_full(msg);
 	if (psock->eval == __SK_NONE) {

From 031097d9e079e40dce401031d1012e83d80eaf01 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski
Date: Wed, 27 Nov 2019 12:16:41 -0800
Subject: [PATCH 03/24] net: skmsg: fix TLS 1.3 crash with full sk_msg

TLS 1.3 started using the entry at the end of the SG array for
chaining-in the single byte content type entry. This mostly works:

[ E E E E E E . . ]
  ^           ^
  start       end

             E < content type
           /
[ E E E E E E C . ]
  ^           ^
  start       end

(Where E denotes a populated SG entry; C denotes a chaining entry.)

If the array is full, however, the end will point to the start:

[ E E E E E E E E ]
  ^
  start
  end

And we end up overwriting the start:

    E < content type
   /
[ C E E E E E E E ]
  ^
  start
  end

The sg array is supposed to be a circular buffer with start and end
markers pointing anywhere. In case where start > end (i.e. the circular
buffer has "wrapped") there is an extra entry reserved at the end to
chain the two halves together.

[ E E E E E E . . l ]

(Where l is the reserved entry for "looping" back to front.)

As suggested by John, let's reserve another entry for chaining SG
entries after the main circular buffer. Note that this entry has to be
pointed to by the end entry so its position is not fixed.

Examples of full messages:

[ E E E E E E E E . l ]
  ^             ^
  start         end

 <---------------.
[ E E . E E E E E E l ]
      ^ ^
    end start

Now the end will always point to an unused entry, so TLS 1.3 can always
use it.

Fixes: 130b392c6cd6 ("net: tls: Add tls 1.3 support")
Signed-off-by: Jakub Kicinski
Reviewed-by: Simon Horman
Signed-off-by: David S. Miller
---
 include/linux/skmsg.h | 28 ++++++++++++++--------------
 net/core/filter.c     |  8 ++++----
 net/core/skmsg.c      |  2 +-
 net/ipv4/tcp_bpf.c    |  2 +-
 4 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 6cb077b646a5..ef7031f8a304 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -14,6 +14,7 @@
 #include
 
 #define MAX_MSG_FRAGS			MAX_SKB_FRAGS
+#define NR_MSG_FRAG_IDS			(MAX_MSG_FRAGS + 1)
 
 enum __sk_action {
 	__SK_DROP = 0,
@@ -29,13 +30,15 @@ struct sk_msg_sg {
 	u32				size;
 	u32				copybreak;
 	unsigned long			copy;
-	/* The extra element is used for chaining the front and sections when
-	 * the list becomes partitioned (e.g. end < start). The crypto APIs
-	 * require the chaining.
+	/* The extra two elements:
+	 * 1) used for chaining the front and sections when the list becomes
+	 *    partitioned (e.g. end < start). The crypto APIs require the
+	 *    chaining;
+	 * 2) to chain tailer SG entries after the message.
 	 */
-	struct scatterlist		data[MAX_MSG_FRAGS + 1];
+	struct scatterlist		data[MAX_MSG_FRAGS + 2];
 };
-static_assert(BITS_PER_LONG >= MAX_MSG_FRAGS);
+static_assert(BITS_PER_LONG >= NR_MSG_FRAG_IDS);
 
 /* UAPI in filter.c depends on struct sk_msg_sg being first element.
*/ struct sk_msg { @@ -142,13 +145,13 @@ static inline void sk_msg_apply_bytes(struct sk_psock *psock, u32 bytes) static inline u32 sk_msg_iter_dist(u32 start, u32 end) { - return end >= start ? end - start : end + (MAX_MSG_FRAGS - start); + return end >= start ? end - start : end + (NR_MSG_FRAG_IDS - start); } #define sk_msg_iter_var_prev(var) \ do { \ if (var == 0) \ - var = MAX_MSG_FRAGS - 1; \ + var = NR_MSG_FRAG_IDS - 1; \ else \ var--; \ } while (0) @@ -156,7 +159,7 @@ static inline u32 sk_msg_iter_dist(u32 start, u32 end) #define sk_msg_iter_var_next(var) \ do { \ var++; \ - if (var == MAX_MSG_FRAGS) \ + if (var == NR_MSG_FRAG_IDS) \ var = 0; \ } while (0) @@ -173,9 +176,9 @@ static inline void sk_msg_clear_meta(struct sk_msg *msg) static inline void sk_msg_init(struct sk_msg *msg) { - BUILD_BUG_ON(ARRAY_SIZE(msg->sg.data) - 1 != MAX_MSG_FRAGS); + BUILD_BUG_ON(ARRAY_SIZE(msg->sg.data) - 1 != NR_MSG_FRAG_IDS); memset(msg, 0, sizeof(*msg)); - sg_init_marker(msg->sg.data, MAX_MSG_FRAGS); + sg_init_marker(msg->sg.data, NR_MSG_FRAG_IDS); } static inline void sk_msg_xfer(struct sk_msg *dst, struct sk_msg *src, @@ -196,14 +199,11 @@ static inline void sk_msg_xfer_full(struct sk_msg *dst, struct sk_msg *src) static inline bool sk_msg_full(const struct sk_msg *msg) { - return (msg->sg.end == msg->sg.start) && msg->sg.size; + return sk_msg_iter_dist(msg->sg.start, msg->sg.end) == MAX_MSG_FRAGS; } static inline u32 sk_msg_elem_used(const struct sk_msg *msg) { - if (sk_msg_full(msg)) - return MAX_MSG_FRAGS; - return sk_msg_iter_dist(msg->sg.start, msg->sg.end); } diff --git a/net/core/filter.c b/net/core/filter.c index b0ed048585ba..f1e703eed3d2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2299,7 +2299,7 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start, WARN_ON_ONCE(last_sge == first_sge); shift = last_sge > first_sge ? last_sge - first_sge - 1 : - MAX_SKB_FRAGS - first_sge + last_sge - 1; + NR_MSG_FRAG_IDS - first_sge + last_sge - 1; if (!shift) goto out; @@ -2308,8 +2308,8 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start, do { u32 move_from; - if (i + shift >= MAX_MSG_FRAGS) - move_from = i + shift - MAX_MSG_FRAGS; + if (i + shift >= NR_MSG_FRAG_IDS) + move_from = i + shift - NR_MSG_FRAG_IDS; else move_from = i + shift; if (move_from == msg->sg.end) @@ -2323,7 +2323,7 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start, } while (1); msg->sg.end = msg->sg.end - shift > msg->sg.end ? - msg->sg.end - shift + MAX_MSG_FRAGS : + msg->sg.end - shift + NR_MSG_FRAG_IDS : msg->sg.end - shift; out: msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset; diff --git a/net/core/skmsg.c b/net/core/skmsg.c index a469d2124f3f..ded2d5227678 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -421,7 +421,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb) copied = skb->len; msg->sg.start = 0; msg->sg.size = copied; - msg->sg.end = num_sge == MAX_MSG_FRAGS ? 
0 : num_sge; + msg->sg.end = num_sge; msg->skb = skb; sk_psock_queue_msg(psock, msg); diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 8a56e09cfb0e..e38705165ac9 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -301,7 +301,7 @@ EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir); static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg, int *copied, int flags) { - bool cork = false, enospc = msg->sg.start == msg->sg.end; + bool cork = false, enospc = sk_msg_full(msg); struct sock *sk_redir; u32 tosend, delta = 0; int ret; From 65190f77424d7b82c4aad7326c9cce6bd91a2fcc Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 27 Nov 2019 12:16:42 -0800 Subject: [PATCH 04/24] selftests/tls: add a test for fragmented messages Add a sendmsg test with very fragmented messages. This should fill up sk_msg and test the boundary conditions. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- tools/testing/selftests/net/tls.c | 60 +++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index 1c8f194d6556..46abcae47dee 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -268,6 +268,38 @@ TEST_F(tls, sendmsg_single) EXPECT_EQ(memcmp(buf, test_str, send_len), 0); } +#define MAX_FRAGS 64 +#define SEND_LEN 13 +TEST_F(tls, sendmsg_fragmented) +{ + char const *test_str = "test_sendmsg"; + char buf[SEND_LEN * MAX_FRAGS]; + struct iovec vec[MAX_FRAGS]; + struct msghdr msg; + int i, frags; + + for (frags = 1; frags <= MAX_FRAGS; frags++) { + for (i = 0; i < frags; i++) { + vec[i].iov_base = (char *)test_str; + vec[i].iov_len = SEND_LEN; + } + + memset(&msg, 0, sizeof(struct msghdr)); + msg.msg_iov = vec; + msg.msg_iovlen = frags; + + EXPECT_EQ(sendmsg(self->fd, &msg, 0), SEND_LEN * frags); + EXPECT_EQ(recv(self->cfd, buf, SEND_LEN * frags, MSG_WAITALL), + SEND_LEN * frags); + + for (i = 0; i < frags; i++) + EXPECT_EQ(memcmp(buf + SEND_LEN * i, + test_str, SEND_LEN), 0); + } +} +#undef MAX_FRAGS +#undef SEND_LEN + TEST_F(tls, sendmsg_large) { void *mem = malloc(16384); @@ -694,6 +726,34 @@ TEST_F(tls, recv_lowat) EXPECT_EQ(memcmp(send_mem, recv_mem + 10, 5), 0); } +TEST_F(tls, recv_rcvbuf) +{ + char send_mem[4096]; + char recv_mem[4096]; + int rcv_buf = 1024; + + memset(send_mem, 0x1c, sizeof(send_mem)); + + EXPECT_EQ(setsockopt(self->cfd, SOL_SOCKET, SO_RCVBUF, + &rcv_buf, sizeof(rcv_buf)), 0); + + EXPECT_EQ(send(self->fd, send_mem, 512, 0), 512); + memset(recv_mem, 0, sizeof(recv_mem)); + EXPECT_EQ(recv(self->cfd, recv_mem, sizeof(recv_mem), 0), 512); + EXPECT_EQ(memcmp(send_mem, recv_mem, 512), 0); + + if (self->notls) + return; + + EXPECT_EQ(send(self->fd, send_mem, 4096, 0), 4096); + memset(recv_mem, 0, sizeof(recv_mem)); + EXPECT_EQ(recv(self->cfd, recv_mem, sizeof(recv_mem), 0), -1); + EXPECT_EQ(errno, EMSGSIZE); + + EXPECT_EQ(recv(self->cfd, recv_mem, sizeof(recv_mem), 0), -1); + EXPECT_EQ(errno, EMSGSIZE); +} + TEST_F(tls, bidir) { char const *test_str = "test_read"; From 9e5ffed37df68d0ccfb2fdc528609e23a1e70ebe Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 27 Nov 2019 12:16:43 -0800 Subject: [PATCH 05/24] net/tls: remove the dead inplace_crypto code Looks like when BPF support was added by commit d3b18ad31f93 ("tls: add bpf support to sk_msg handling") and commit d829e9c4112b ("tls: convert to generic sk_msg interface") it broke/removed the support for in-place crypto as added by 
commit 4e6d47206c32 ("tls: Add support for inplace records encryption"). The inplace_crypto member of struct tls_rec is dead, inited to zero, and sometimes set to zero again. It used to be set to 1 when record was allocated, but the skmsg code doesn't seem to have been written with the idea of in-place crypto in mind. Since non trivial effort is required to bring the feature back and we don't really have the HW to measure the benefit just remove the left over support for now to avoid confusing readers. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/net/tls.h | 1 - net/tls/tls_sw.c | 6 +----- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/include/net/tls.h b/include/net/tls.h index 6ed91e82edd0..9d32f7ce6b31 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -100,7 +100,6 @@ struct tls_rec { struct list_head list; int tx_ready; int tx_flags; - int inplace_crypto; struct sk_msg msg_plaintext; struct sk_msg msg_encrypted; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index dbba51b69d21..5989dfe5c443 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -710,8 +710,7 @@ static int tls_push_record(struct sock *sk, int flags, } i = msg_pl->sg.start; - sg_chain(rec->sg_aead_in, 2, rec->inplace_crypto ? - &msg_en->sg.data[i] : &msg_pl->sg.data[i]); + sg_chain(rec->sg_aead_in, 2, &msg_pl->sg.data[i]); i = msg_en->sg.end; sk_msg_iter_var_prev(i); @@ -976,8 +975,6 @@ alloc_encrypted: if (ret) goto fallback_to_reg_send; - rec->inplace_crypto = 0; - num_zc++; copied += try_to_copy; @@ -1176,7 +1173,6 @@ alloc_payload: tls_ctx->pending_open_record_frags = true; if (full_record || eor || sk_msg_full(msg_pl)) { - rec->inplace_crypto = 0; ret = bpf_exec_tx_verdict(msg_pl, sk, full_record, record_type, &copied, flags); if (ret) { From c5daa6cccdc2f94aca2c9b3fa5f94e4469997293 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 27 Nov 2019 12:16:44 -0800 Subject: [PATCH 06/24] net/tls: use sg_next() to walk sg entries Partially sent record cleanup path increments an SG entry directly instead of using sg_next(). This should not be a problem today, as encrypted messages should be always allocated as arrays. But given this is a cleanup path it's easy to miss was this ever to change. Use sg_next(), and simplify the code. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. 
Miller --- include/net/tls.h | 2 +- net/tls/tls_main.c | 13 ++----------- net/tls/tls_sw.c | 3 ++- 3 files changed, 5 insertions(+), 13 deletions(-) diff --git a/include/net/tls.h b/include/net/tls.h index 9d32f7ce6b31..df630f5fc723 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -376,7 +376,7 @@ int tls_push_sg(struct sock *sk, struct tls_context *ctx, int flags); int tls_push_partial_record(struct sock *sk, struct tls_context *ctx, int flags); -bool tls_free_partial_record(struct sock *sk, struct tls_context *ctx); +void tls_free_partial_record(struct sock *sk, struct tls_context *ctx); static inline struct tls_msg *tls_msg(struct sk_buff *skb) { diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index bdca31ffe6da..b3da6c5ab999 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -209,24 +209,15 @@ int tls_push_partial_record(struct sock *sk, struct tls_context *ctx, return tls_push_sg(sk, ctx, sg, offset, flags); } -bool tls_free_partial_record(struct sock *sk, struct tls_context *ctx) +void tls_free_partial_record(struct sock *sk, struct tls_context *ctx) { struct scatterlist *sg; - sg = ctx->partially_sent_record; - if (!sg) - return false; - - while (1) { + for (sg = ctx->partially_sent_record; sg; sg = sg_next(sg)) { put_page(sg_page(sg)); sk_mem_uncharge(sk, sg->length); - - if (sg_is_last(sg)) - break; - sg++; } ctx->partially_sent_record = NULL; - return true; } static void tls_write_space(struct sock *sk) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 5989dfe5c443..2b2d0bae14a9 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2089,7 +2089,8 @@ void tls_sw_release_resources_tx(struct sock *sk) /* Free up un-sent records in tx_list. First, free * the partially sent record if any at head of tx_list. */ - if (tls_free_partial_record(sk, tls_ctx)) { + if (tls_ctx->partially_sent_record) { + tls_free_partial_record(sk, tls_ctx); rec = list_first_entry(&ctx->tx_list, struct tls_rec, list); list_del(&rec->list); From 4b67c515036313f3c3ecba3cb2babb9cbddb3f85 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 27 Nov 2019 12:16:45 -0800 Subject: [PATCH 07/24] selftests: bpf: test_sockmap: handle file creation failures gracefully test_sockmap creates a temporary file to use for sendpage. this may fail for various reasons. Handle the error rather than segfault. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. 
Miller --- tools/testing/selftests/bpf/test_sockmap.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index 3845144e2c91..8b838e91cfe5 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -332,6 +332,10 @@ static int msg_loop_sendpage(int fd, int iov_length, int cnt, int i, fp; file = fopen(".sendpage_tst.tmp", "w+"); + if (!file) { + perror("create file for sendpage"); + return 1; + } for (i = 0; i < iov_length * cnt; i++, k++) fwrite(&k, sizeof(char), 1, file); fflush(file); @@ -339,6 +343,11 @@ static int msg_loop_sendpage(int fd, int iov_length, int cnt, fclose(file); fp = open(".sendpage_tst.tmp", O_RDONLY); + if (fp < 0) { + perror("reopen file for sendpage"); + return 1; + } + clock_gettime(CLOCK_MONOTONIC, &s->start); for (i = 0; i < cnt; i++) { int sent = sendfile(fd, fp, NULL, iov_length); From e5dc9dd3258098bf8b5ceb75fc3433b41eff618a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 27 Nov 2019 12:16:46 -0800 Subject: [PATCH 08/24] selftests: bpf: correct perror strings perror(str) is basically equivalent to print("%s: %s\n", str, strerror(errno)). New line or colon at the end of str is a mistake/breaks formatting. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- tools/testing/selftests/bpf/test_sockmap.c | 38 +++++++++++----------- tools/testing/selftests/bpf/xdping.c | 2 +- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index 8b838e91cfe5..4a851513c842 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -240,14 +240,14 @@ static int sockmap_init_sockets(int verbose) addr.sin_port = htons(S1_PORT); err = bind(s1, (struct sockaddr *)&addr, sizeof(addr)); if (err < 0) { - perror("bind s1 failed()\n"); + perror("bind s1 failed()"); return errno; } addr.sin_port = htons(S2_PORT); err = bind(s2, (struct sockaddr *)&addr, sizeof(addr)); if (err < 0) { - perror("bind s2 failed()\n"); + perror("bind s2 failed()"); return errno; } @@ -255,14 +255,14 @@ static int sockmap_init_sockets(int verbose) addr.sin_port = htons(S1_PORT); err = listen(s1, 32); if (err < 0) { - perror("listen s1 failed()\n"); + perror("listen s1 failed()"); return errno; } addr.sin_port = htons(S2_PORT); err = listen(s2, 32); if (err < 0) { - perror("listen s1 failed()\n"); + perror("listen s1 failed()"); return errno; } @@ -270,14 +270,14 @@ static int sockmap_init_sockets(int verbose) addr.sin_port = htons(S1_PORT); err = connect(c1, (struct sockaddr *)&addr, sizeof(addr)); if (err < 0 && errno != EINPROGRESS) { - perror("connect c1 failed()\n"); + perror("connect c1 failed()"); return errno; } addr.sin_port = htons(S2_PORT); err = connect(c2, (struct sockaddr *)&addr, sizeof(addr)); if (err < 0 && errno != EINPROGRESS) { - perror("connect c2 failed()\n"); + perror("connect c2 failed()"); return errno; } else if (err < 0) { err = 0; @@ -286,13 +286,13 @@ static int sockmap_init_sockets(int verbose) /* Accept Connecrtions */ p1 = accept(s1, NULL, NULL); if (p1 < 0) { - perror("accept s1 failed()\n"); + perror("accept s1 failed()"); return errno; } p2 = accept(s2, NULL, NULL); if (p2 < 0) { - perror("accept s1 failed()\n"); + perror("accept s1 failed()"); return errno; } @@ -353,7 +353,7 @@ static int msg_loop_sendpage(int fd, int iov_length, int cnt, int 
sent = sendfile(fd, fp, NULL, iov_length); if (!drop && sent < 0) { - perror("send loop error:"); + perror("send loop error"); close(fp); return sent; } else if (drop && sent >= 0) { @@ -472,7 +472,7 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt, int sent = sendmsg(fd, &msg, flags); if (!drop && sent < 0) { - perror("send loop error:"); + perror("send loop error"); goto out_errno; } else if (drop && sent >= 0) { printf("send loop error expected: %i\n", sent); @@ -508,7 +508,7 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt, total_bytes -= txmsg_pop_total; err = clock_gettime(CLOCK_MONOTONIC, &s->start); if (err < 0) - perror("recv start time: "); + perror("recv start time"); while (s->bytes_recvd < total_bytes) { if (txmsg_cork) { timeout.tv_sec = 0; @@ -552,7 +552,7 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt, if (recv < 0) { if (errno != EWOULDBLOCK) { clock_gettime(CLOCK_MONOTONIC, &s->end); - perror("recv failed()\n"); + perror("recv failed()"); goto out_errno; } } @@ -566,7 +566,7 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt, errno = msg_verify_data(&msg, recv, chunk_sz); if (errno) { - perror("data verify msg failed\n"); + perror("data verify msg failed"); goto out_errno; } if (recvp) { @@ -574,7 +574,7 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt, recvp, chunk_sz); if (errno) { - perror("data verify msg_peek failed\n"); + perror("data verify msg_peek failed"); goto out_errno; } } @@ -663,7 +663,7 @@ static int sendmsg_test(struct sockmap_options *opt) err = 0; exit(err ? 1 : 0); } else if (rxpid == -1) { - perror("msg_loop_rx: "); + perror("msg_loop_rx"); return errno; } @@ -690,7 +690,7 @@ static int sendmsg_test(struct sockmap_options *opt) s.bytes_recvd, recvd_Bps, recvd_Bps/giga); exit(err ? 1 : 0); } else if (txpid == -1) { - perror("msg_loop_tx: "); + perror("msg_loop_tx"); return errno; } @@ -724,7 +724,7 @@ static int forever_ping_pong(int rate, struct sockmap_options *opt) /* Ping/Pong data from client to server */ sc = send(c1, buf, sizeof(buf), 0); if (sc < 0) { - perror("send failed()\n"); + perror("send failed()"); return sc; } @@ -757,7 +757,7 @@ static int forever_ping_pong(int rate, struct sockmap_options *opt) rc = recv(i, buf, sizeof(buf), 0); if (rc < 0) { if (errno != EWOULDBLOCK) { - perror("recv failed()\n"); + perror("recv failed()"); return rc; } } @@ -769,7 +769,7 @@ static int forever_ping_pong(int rate, struct sockmap_options *opt) sc = send(i, buf, rc, 0); if (sc < 0) { - perror("send failed()\n"); + perror("send failed()"); return sc; } } diff --git a/tools/testing/selftests/bpf/xdping.c b/tools/testing/selftests/bpf/xdping.c index d60a343b1371..842d9155d36c 100644 --- a/tools/testing/selftests/bpf/xdping.c +++ b/tools/testing/selftests/bpf/xdping.c @@ -45,7 +45,7 @@ static int get_stats(int fd, __u16 count, __u32 raddr) printf("\nXDP RTT data:\n"); if (bpf_map_lookup_elem(fd, &raddr, &pinginfo)) { - perror("bpf_map_lookup elem: "); + perror("bpf_map_lookup elem"); return 1; } From 61183b056b49e2937ff92a1424291ba36a6f6d05 Mon Sep 17 00:00:00 2001 From: Chuhong Yuan Date: Thu, 28 Nov 2019 10:00:21 +0800 Subject: [PATCH 09/24] net: macb: add missed tasklet_kill This driver forgets to kill tasklet in remove. Add the call to fix it. Fixes: 032dc41ba6e2 ("net: macb: Handle HRESP error") Signed-off-by: Chuhong Yuan Signed-off-by: David S. 
Miller --- drivers/net/ethernet/cadence/macb_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index d5ae2e1e0b0e..9c767ee252ac 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -4422,6 +4422,7 @@ static int macb_remove(struct platform_device *pdev) mdiobus_free(bp->mii_bus); unregister_netdev(dev); + tasklet_kill(&bp->hresp_err_tasklet); pm_runtime_disable(&pdev->dev); pm_runtime_dont_use_autosuspend(&pdev->dev); if (!pm_runtime_suspended(&pdev->dev)) { From 2fe97a578d7bad3116a89dc8a6692a51e6fc1d9c Mon Sep 17 00:00:00 2001 From: Tung Nguyen Date: Thu, 28 Nov 2019 10:10:05 +0700 Subject: [PATCH 10/24] tipc: fix potential memory leak in __tipc_sendmsg() When initiating a connection message to a server side, the connection message is cloned and added to the socket write queue. However, if the cloning is failed, only the socket write queue is purged. It causes memory leak because the original connection message is not freed. This commit fixes it by purging the list of connection message when it cannot be cloned. Fixes: 6787927475e5 ("tipc: buffer overflow handling in listener socket") Reported-by: Hoang Le Signed-off-by: Tung Nguyen Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/socket.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index a1c8d722ca20..7baed2c2c93d 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1447,8 +1447,10 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts); if (unlikely(rc != dlen)) return rc; - if (unlikely(syn && !tipc_msg_skb_clone(&pkts, &sk->sk_write_queue))) + if (unlikely(syn && !tipc_msg_skb_clone(&pkts, &sk->sk_write_queue))) { + __skb_queue_purge(&pkts); return -ENOMEM; + } trace_tipc_sk_sendmsg(sk, skb_peek(&pkts), TIPC_DUMP_SK_SNDQ, " "); rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid); From 91a4a3eb433e4d786420c41f3c08d1d16c605962 Mon Sep 17 00:00:00 2001 From: Tung Nguyen Date: Thu, 28 Nov 2019 10:10:06 +0700 Subject: [PATCH 11/24] tipc: fix wrong socket reference counter after tipc_sk_timeout() returns When tipc_sk_timeout() is executed but user space is grabbing ownership, this function rearms itself and returns. However, the socket reference counter is not reduced. This causes potential unexpected behavior. This commit fixes it by calling sock_put() before tipc_sk_timeout() returns in the above-mentioned case. Fixes: afe8792fec69 ("tipc: refactor function tipc_sk_timeout()") Signed-off-by: Tung Nguyen Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/socket.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 7baed2c2c93d..fb5595081a05 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -2759,6 +2759,7 @@ static void tipc_sk_timeout(struct timer_list *t) if (sock_owned_by_user(sk)) { sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ / 20); bh_unlock_sock(sk); + sock_put(sk); return; } From 12db3c8083fcab4270866a88191933f2d9f24f89 Mon Sep 17 00:00:00 2001 From: Tung Nguyen Date: Thu, 28 Nov 2019 10:10:07 +0700 Subject: [PATCH 12/24] tipc: fix wrong timeout input for tipc_wait_for_cond() In function __tipc_shutdown(), the timeout value passed to tipc_wait_for_cond() is not jiffies. 
This commit fixes it by converting that value from milliseconds to jiffies. Fixes: 365ad353c256 ("tipc: reduce risk of user starvation during link congestion") Signed-off-by: Tung Nguyen Acked-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index fb5595081a05..da5fb84852a6 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -532,7 +532,7 @@ static void __tipc_shutdown(struct socket *sock, int error) struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); struct net *net = sock_net(sk); - long timeout = CONN_TIMEOUT_DEFAULT; + long timeout = msecs_to_jiffies(CONN_TIMEOUT_DEFAULT); u32 dnode = tsk_peer_node(tsk); struct sk_buff *skb; From d34910e1751be79672db9cc61ca8892fbb4763f2 Mon Sep 17 00:00:00 2001 From: Tung Nguyen Date: Thu, 28 Nov 2019 10:10:08 +0700 Subject: [PATCH 13/24] tipc: fix duplicate SYN messages under link congestion Scenario: 1. A client socket initiates a SYN message to a listening socket. 2. The send link is congested, the SYN message is put in the send link and a wakeup message is put in wakeup queue. 3. The congestion situation is abated, the wakeup message is pulled out of the wakeup queue. Function tipc_sk_push_backlog() is called to send out delayed messages by Nagle. However, the client socket is still in CONNECTING state. So, it sends the SYN message in the socket write queue to the listening socket again. 4. The listening socket receives the first SYN message and creates first server socket. The client socket receives ACK- and establishes a connection to the first server socket. The client socket closes its connection with the first server socket. 5. The listening socket receives the second SYN message and creates second server socket. The second server socket sends ACK- to the client socket, but it has been closed. It results in connection reset error when reading from the server socket in user space. Solution: return from function tipc_sk_push_backlog() immediately if there is pending SYN message in the socket write queue. Fixes: c0bceb97db9e ("tipc: add smart nagle feature") Signed-off-by: Tung Nguyen Acked-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/socket.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index da5fb84852a6..41688da233ab 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -540,12 +540,10 @@ static void __tipc_shutdown(struct socket *sock, int error) tipc_wait_for_cond(sock, &timeout, (!tsk->cong_link_cnt && !tsk_conn_cong(tsk))); - /* Push out unsent messages or remove if pending SYN */ - skb = skb_peek(&sk->sk_write_queue); - if (skb && !msg_is_syn(buf_msg(skb))) - tipc_sk_push_backlog(tsk); - else - __skb_queue_purge(&sk->sk_write_queue); + /* Push out delayed messages if in Nagle mode */ + tipc_sk_push_backlog(tsk); + /* Remove pending SYN */ + __skb_queue_purge(&sk->sk_write_queue); /* Reject all unreceived messages, except on an active connection * (which disconnects locally & sends a 'FIN+' to peer). 
@@ -1248,9 +1246,14 @@ static void tipc_sk_push_backlog(struct tipc_sock *tsk) struct sk_buff_head *txq = &tsk->sk.sk_write_queue; struct net *net = sock_net(&tsk->sk); u32 dnode = tsk_peer_node(tsk); + struct sk_buff *skb = skb_peek(txq); int rc; - if (skb_queue_empty(txq) || tsk->cong_link_cnt) + if (!skb || tsk->cong_link_cnt) + return; + + /* Do not send SYN again after congestion */ + if (msg_is_syn(buf_msg(skb))) return; tsk->snt_unacked += tsk->snd_backlog; From 2745aea6750ff0d2c48285d25bdb00e5b636ec8b Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Thu, 28 Nov 2019 15:58:06 -0300 Subject: [PATCH 14/24] selftests: pmtu: use -oneline for ip route list cache Some versions of iproute2 will output more than one line per entry, which will cause the test to fail, like: TEST: ipv6: list and flush cached exceptions [FAIL] can't list cached exceptions That happens, for example, with iproute2 4.15.0. When using the -oneline option, this will work just fine: TEST: ipv6: list and flush cached exceptions [ OK ] This also works just fine with a more recent version of iproute2, like 5.4.0. For some reason, two lines are printed for the IPv4 test no matter what version of iproute2 is used. Use the same -oneline parameter there instead of counting the lines twice. Fixes: b964641e9925 ("selftests: pmtu: Make list_flush_ipv6_exception test more demanding") Signed-off-by: Thadeu Lima de Souza Cascardo Acked-by: Stefano Brivio Signed-off-by: David S. Miller --- tools/testing/selftests/net/pmtu.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh index ab367e75f095..d697815d2785 100755 --- a/tools/testing/selftests/net/pmtu.sh +++ b/tools/testing/selftests/net/pmtu.sh @@ -1249,8 +1249,7 @@ test_list_flush_ipv4_exception() { done run_cmd ${ns_a} ping -q -M want -i 0.1 -c 2 -s 1800 "${dst2}" - # Each exception is printed as two lines - if [ "$(${ns_a} ip route list cache | wc -l)" -ne 202 ]; then + if [ "$(${ns_a} ip -oneline route list cache | wc -l)" -ne 101 ]; then err " can't list cached exceptions" fail=1 fi @@ -1300,7 +1299,7 @@ test_list_flush_ipv6_exception() { run_cmd ${ns_a} ping -q -M want -i 0.1 -w 1 -s 1800 "${dst_prefix1}${i}" done run_cmd ${ns_a} ping -q -M want -i 0.1 -w 1 -s 1800 "${dst2}" - if [ "$(${ns_a} ip -6 route list cache | wc -l)" -ne 101 ]; then + if [ "$(${ns_a} ip -oneline -6 route list cache | wc -l)" -ne 101 ]; then err " can't list cached exceptions" fail=1 fi From 5f9fc3325ef95398c363b9b7813a7e99d4d85d7d Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 27 Nov 2019 21:27:00 +0800 Subject: [PATCH 15/24] net/mlx5e: Fix build error without IPV6 If IPV6 is not set and CONFIG_MLX5_ESWITCH is y, building fails: drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c:322:5: error: redefinition of mlx5e_tc_tun_create_header_ipv6 int mlx5e_tc_tun_create_header_ipv6(struct mlx5e_priv *priv, ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In file included from drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c:7:0: drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.h:67:1: note: previous definition of mlx5e_tc_tun_create_header_ipv6 was here mlx5e_tc_tun_create_header_ipv6(struct mlx5e_priv *priv, ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Use #ifdef to guard this, also move mlx5e_route_lookup_ipv6 to cleanup unused warning. Reported-by: Hulk Robot Fixes: e689e998e102 ("net/mlx5e: TC, Stub out ipv6 tun create header function") Signed-off-by: YueHaibing Acked-by: Saeed Mahameed Signed-off-by: David S. 
Miller --- .../ethernet/mellanox/mlx5/core/en/tc_tun.c | 74 ++++++++++--------- 1 file changed, 38 insertions(+), 36 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c index 784b1e26f414..6ed87534d314 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c @@ -130,42 +130,6 @@ static const char *mlx5e_netdev_kind(struct net_device *dev) return "unknown"; } -static int mlx5e_route_lookup_ipv6(struct mlx5e_priv *priv, - struct net_device *mirred_dev, - struct net_device **out_dev, - struct net_device **route_dev, - struct flowi6 *fl6, - struct neighbour **out_n, - u8 *out_ttl) -{ - struct dst_entry *dst; - struct neighbour *n; - - int ret; - - ret = ipv6_stub->ipv6_dst_lookup(dev_net(mirred_dev), NULL, &dst, - fl6); - if (ret < 0) - return ret; - - if (!(*out_ttl)) - *out_ttl = ip6_dst_hoplimit(dst); - - ret = get_route_and_out_devs(priv, dst->dev, route_dev, out_dev); - if (ret < 0) { - dst_release(dst); - return ret; - } - - n = dst_neigh_lookup(dst, &fl6->daddr); - dst_release(dst); - if (!n) - return -ENOMEM; - - *out_n = n; - return 0; -} - static int mlx5e_gen_ip_tunnel_header(char buf[], __u8 *ip_proto, struct mlx5e_encap_entry *e) { @@ -319,6 +283,43 @@ release_neigh: return err; } +#if IS_ENABLED(CONFIG_INET) && IS_ENABLED(CONFIG_IPV6) +static int mlx5e_route_lookup_ipv6(struct mlx5e_priv *priv, + struct net_device *mirred_dev, + struct net_device **out_dev, + struct net_device **route_dev, + struct flowi6 *fl6, + struct neighbour **out_n, + u8 *out_ttl) +{ + struct dst_entry *dst; + struct neighbour *n; + + int ret; + + ret = ipv6_stub->ipv6_dst_lookup(dev_net(mirred_dev), NULL, &dst, + fl6); + if (ret < 0) + return ret; + + if (!(*out_ttl)) + *out_ttl = ip6_dst_hoplimit(dst); + + ret = get_route_and_out_devs(priv, dst->dev, route_dev, out_dev); + if (ret < 0) { + dst_release(dst); + return ret; + } + + n = dst_neigh_lookup(dst, &fl6->daddr); + dst_release(dst); + if (!n) + return -ENOMEM; + + *out_n = n; + return 0; +} + int mlx5e_tc_tun_create_header_ipv6(struct mlx5e_priv *priv, struct net_device *mirred_dev, struct mlx5e_encap_entry *e) @@ -436,6 +437,7 @@ release_neigh: neigh_release(n); return err; } +#endif bool mlx5e_tc_tun_device_to_offload(struct mlx5e_priv *priv, struct net_device *netdev) From 395eba7d0c122c650fb8b6ae8c6815cbfdc39ca9 Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Fri, 29 Nov 2019 19:58:09 +0200 Subject: [PATCH 16/24] net: ethernet: ti: ale: ensure vlan/mdb deleted when no members The recently updated ALE APIs cpsw_ale_del_mcast() and cpsw_ale_del_vlan_modify() have an issue and will not delete ALE entry even if VLAN/mcast group has no more members. Hence fix it here and delete ALE entry if !port_mask. The issue affected only new cpsw switchdev driver. Fixes: e85c14370783 ("net: ethernet: ti: ale: modify vlan/mdb api for switchdev") Signed-off-by: Grygorii Strashko Acked-by: Ilias Apalodimas Signed-off-by: David S. 
Miller --- drivers/net/ethernet/ti/cpsw_ale.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/ti/cpsw_ale.c b/drivers/net/ethernet/ti/cpsw_ale.c index 929f3d3354e3..ecdbde539eb7 100644 --- a/drivers/net/ethernet/ti/cpsw_ale.c +++ b/drivers/net/ethernet/ti/cpsw_ale.c @@ -384,7 +384,7 @@ int cpsw_ale_del_mcast(struct cpsw_ale *ale, const u8 *addr, int port_mask, int flags, u16 vid) { u32 ale_entry[ALE_ENTRY_WORDS] = {0, 0, 0}; - int mcast_members; + int mcast_members = 0; int idx; idx = cpsw_ale_match_addr(ale, addr, (flags & ALE_VLAN) ? vid : 0); @@ -397,11 +397,13 @@ int cpsw_ale_del_mcast(struct cpsw_ale *ale, const u8 *addr, int port_mask, mcast_members = cpsw_ale_get_port_mask(ale_entry, ale->port_mask_bits); mcast_members &= ~port_mask; + } + + if (mcast_members) cpsw_ale_set_port_mask(ale_entry, mcast_members, ale->port_mask_bits); - } else { + else cpsw_ale_set_entry_type(ale_entry, ALE_TYPE_FREE); - } cpsw_ale_write(ale, idx, ale_entry); return 0; @@ -478,6 +480,10 @@ static void cpsw_ale_del_vlan_modify(struct cpsw_ale *ale, u32 *ale_entry, members = cpsw_ale_get_vlan_member_list(ale_entry, ale->vlan_field_bits); members &= ~port_mask; + if (!members) { + cpsw_ale_set_entry_type(ale_entry, ALE_TYPE_FREE); + return; + } untag = cpsw_ale_get_vlan_untag_force(ale_entry, ale->vlan_field_bits); From 14e54ab9143fa60794d13ea0a66c792a2046a8f3 Mon Sep 17 00:00:00 2001 From: Dust Li Date: Thu, 28 Nov 2019 14:29:09 +0800 Subject: [PATCH 17/24] net: sched: fix `tc -s class show` no bstats on class with nolock subqueues When a classful qdisc's child qdisc has set the flag TCQ_F_CPUSTATS (pfifo_fast for example), the child qdisc's cpu_bstats should be passed to gnet_stats_copy_basic(), but many classful qdisc didn't do that. As a result, `tc -s class show dev DEV` always return 0 for bytes and packets in this case. Pass the child qdisc's cpu_bstats to gnet_stats_copy_basic() to fix this issue. The qstats also has this problem, but it has been fixed in 5dd431b6b9 ("net: sched: introduce and use qstats read...") and bstats still remains buggy. Fixes: 22e0f8b9322c ("net: sched: make bstats per cpu and estimator RCU safe") Signed-off-by: Dust Li Signed-off-by: Tony Lu Acked-by: Cong Wang Signed-off-by: David S. 
Miller --- net/sched/sch_mq.c | 3 ++- net/sched/sch_mqprio.c | 4 ++-- net/sched/sch_multiq.c | 2 +- net/sched/sch_prio.c | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index 0d578333e967..278c0b2dc523 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -245,7 +245,8 @@ static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct netdev_queue *dev_queue = mq_queue_get(sch, cl); sch = dev_queue->qdisc_sleeping; - if (gnet_stats_copy_basic(&sch->running, d, NULL, &sch->bstats) < 0 || + if (gnet_stats_copy_basic(&sch->running, d, sch->cpu_bstats, + &sch->bstats) < 0 || qdisc_qstats_copy(d, sch) < 0) return -1; return 0; diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 46980b8d66c5..0d0113a24962 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -557,8 +557,8 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl); sch = dev_queue->qdisc_sleeping; - if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), - d, NULL, &sch->bstats) < 0 || + if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, + sch->cpu_bstats, &sch->bstats) < 0 || qdisc_qstats_copy(d, sch) < 0) return -1; } diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index b2b7fdb06fc6..1330ad224931 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -339,7 +339,7 @@ static int multiq_dump_class_stats(struct Qdisc *sch, unsigned long cl, cl_q = q->queues[cl - 1]; if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), - d, NULL, &cl_q->bstats) < 0 || + d, cl_q->cpu_bstats, &cl_q->bstats) < 0 || qdisc_qstats_copy(d, cl_q) < 0) return -1; diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 0f8fedb8809a..18b884cfdfe8 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -356,7 +356,7 @@ static int prio_dump_class_stats(struct Qdisc *sch, unsigned long cl, cl_q = q->queues[cl - 1]; if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), - d, NULL, &cl_q->bstats) < 0 || + d, cl_q->cpu_bstats, &cl_q->bstats) < 0 || qdisc_qstats_copy(d, cl_q) < 0) return -1; From 408469d31e9106878d2b3e428dcca9984664dfc9 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 28 Nov 2019 13:38:57 +0100 Subject: [PATCH 18/24] selftests: forwarding: fix race between packet receive and tc check It is possible that tc stats get checked before the packet we check for actually arrived into the interface and accounted for. Fix it by checking for the expected result in a loop until timeout is reached (by default 1 second). Fixes: 07e5c75184a1 ("selftests: forwarding: Introduce tc flower matching tests") Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- .../selftests/net/forwarding/tc_common.sh | 39 +++++++++++++++---- 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/net/forwarding/tc_common.sh b/tools/testing/selftests/net/forwarding/tc_common.sh index d93589bd4d1d..64f652633585 100644 --- a/tools/testing/selftests/net/forwarding/tc_common.sh +++ b/tools/testing/selftests/net/forwarding/tc_common.sh @@ -3,16 +3,42 @@ CHECK_TC="yes" +# Can be overridden by the configuration file. 
See lib.sh +TC_HIT_TIMEOUT=${TC_HIT_TIMEOUT:=1000} # ms + +__tc_check_packets() +{ + local id=$1 + local handle=$2 + local count=$3 + local operator=$4 + + start_time="$(date -u +%s%3N)" + while true + do + cmd_jq "tc -j -s filter show $id" \ + ".[] | select(.options.handle == $handle) | \ + select(.options.actions[0].stats.packets $operator $count)" \ + &> /dev/null + ret=$? + if [[ $ret -eq 0 ]]; then + return $ret + fi + current_time="$(date -u +%s%3N)" + diff=$(expr $current_time - $start_time) + if [ "$diff" -gt "$TC_HIT_TIMEOUT" ]; then + return 1 + fi + done +} + tc_check_packets() { local id=$1 local handle=$2 local count=$3 - cmd_jq "tc -j -s filter show $id" \ - ".[] | select(.options.handle == $handle) | \ - select(.options.actions[0].stats.packets == $count)" \ - &> /dev/null + __tc_check_packets "$id" "$handle" "$count" "==" } tc_check_packets_hitting() @@ -20,8 +46,5 @@ tc_check_packets_hitting() local id=$1 local handle=$2 - cmd_jq "tc -j -s filter show $id" \ - ".[] | select(.options.handle == $handle) | \ - select(.options.actions[0].stats.packets > 0)" \ - &> /dev/null + __tc_check_packets "$id" "$handle" 0 ">" } From 9bd19c63a4bcfa0fd18f3bc0de30675fe2addea6 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 28 Nov 2019 14:17:59 -0800 Subject: [PATCH 19/24] net: emulex: benet: indent a Kconfig depends continuation line Indent a Kconfig continuation line to improve readability. Signed-off-by: Randy Dunlap Cc: Sathya Perla Cc: Ajit Khaparde Cc: Sriharsha Basavapatna Cc: Somnath Kotur Cc: netdev@vger.kernel.org Signed-off-by: David S. Miller --- drivers/net/ethernet/emulex/benet/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/emulex/benet/Kconfig b/drivers/net/ethernet/emulex/benet/Kconfig index 17d300ea9955..f51dca1526c6 100644 --- a/drivers/net/ethernet/emulex/benet/Kconfig +++ b/drivers/net/ethernet/emulex/benet/Kconfig @@ -49,4 +49,4 @@ config BE2NET_SKYHAWK comment "WARNING: be2net is useless without any enabled chip" depends on BE2NET_BE2=n && BE2NET_BE3=n && BE2NET_LANCER=n && \ - BE2NET_SKYHAWK=n && BE2NET + BE2NET_SKYHAWK=n && BE2NET From 14012c9f3bb922b9e0751ba43d15cc580a6049bf Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 1 Dec 2019 10:27:14 +0100 Subject: [PATCH 20/24] r8169: fix jumbo configuration for RTL8168evl Alan reported [0] that network is broken since the referenced commit when using jumbo frames. This commit isn't wrong, it just revealed another issue that has been existing before. According to the vendor driver the RTL8168e-specific jumbo config doesn't apply for RTL8168evl. [0] https://lkml.org/lkml/2019/11/30/119 Fixes: 4ebcb113edcc ("r8169: fix jumbo packet handling on resume from suspend") Reported-by: Alan J. Wylie Tested-by: Alan J. Wylie Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index d47a038cb8d0..78872cef32ad 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -3872,7 +3872,7 @@ static void rtl_hw_jumbo_enable(struct rtl8169_private *tp) case RTL_GIGA_MAC_VER_27 ... RTL_GIGA_MAC_VER_28: r8168dp_hw_jumbo_enable(tp); break; - case RTL_GIGA_MAC_VER_31 ... RTL_GIGA_MAC_VER_34: + case RTL_GIGA_MAC_VER_31 ... 
RTL_GIGA_MAC_VER_33: r8168e_hw_jumbo_enable(tp); break; default: From 398fd408ccfb5e44b1cbe73a209d2281d3efa83c Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 1 Dec 2019 10:39:56 +0100 Subject: [PATCH 21/24] r8169: fix resume on cable plug-in It was reported [0] that network doesn't wake up on cable plug-in with certain chip versions. Reason is that on these chip versions the PHY doesn't detect cable plug-in when being in power-down mode. So prevent the PHY from powering down if WoL is enabled. [0] https://bugzilla.kernel.org/show_bug.cgi?id=202103 Fixes: 95fb8bb3181b ("net: phy: force phy suspend when calling phy_stop") Reported-by: jhdskag3 Tested-by: jhdskag3 Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 78872cef32ad..38d212686123 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -1542,6 +1542,7 @@ static void __rtl8169_set_wol(struct rtl8169_private *tp, u32 wolopts) rtl_lock_config_regs(tp); device_set_wakeup_enable(tp_to_dev(tp), wolopts); + tp->dev->wol_enabled = wolopts ? 1 : 0; } static int rtl8169_set_wol(struct net_device *dev, struct ethtool_wolinfo *wol) From f3284e014850e45f63fbc74c7382453c876fbc35 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 1 Dec 2019 10:51:47 +0100 Subject: [PATCH 22/24] net: phy: realtek: fix using paged operations with RTL8105e / RTL8208 It was reported [0] that since the referenced commit a warning is triggered in phylib that complains about paged operations being used with a PHY driver that doesn't support this. The commit isn't wrong, just for one chip version (RTL8105e) no dedicated PHY driver exists yet. So add the missing PHY driver. [0] https://bugzilla.kernel.org/show_bug.cgi?id=202103 Fixes: 3a129e3f9ac4 ("r8169: switch to phylib functions in more places") Reported-by: jhdskag3 Tested-by: jhdskag3 Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/phy/realtek.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/net/phy/realtek.c b/drivers/net/phy/realtek.c index 677c45985338..476db5345e1a 100644 --- a/drivers/net/phy/realtek.c +++ b/drivers/net/phy/realtek.c @@ -439,6 +439,15 @@ static struct phy_driver realtek_drvs[] = { .resume = genphy_resume, .read_page = rtl821x_read_page, .write_page = rtl821x_write_page, + }, { + PHY_ID_MATCH_MODEL(0x001cc880), + .name = "RTL8208 Fast Ethernet", + .read_mmd = genphy_read_mmd_unsupported, + .write_mmd = genphy_write_mmd_unsupported, + .suspend = genphy_suspend, + .resume = genphy_resume, + .read_page = rtl821x_read_page, + .write_page = rtl821x_write_page, }, { PHY_ID_MATCH_EXACT(0x001cc910), .name = "RTL8211 Gigabit Ethernet", From 8ffeb03fbba3b599690b361467bfd2373e8c450f Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Sun, 1 Dec 2019 18:41:24 +0100 Subject: [PATCH 23/24] openvswitch: drop unneeded BUG_ON() in ovs_flow_cmd_build_info() All the callers of ovs_flow_cmd_build_info() already deal with error return code correctly, so we can handle the error condition in a more gracefull way. Still dump a warning to preserve debuggability. v1 -> v2: - clarify the commit message - clean the skb and report the error (DaveM) Fixes: ccb1352e76cf ("net: Add Open vSwitch kernel components.") Signed-off-by: Paolo Abeni Signed-off-by: David S. 
Miller --- net/openvswitch/datapath.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 293d5289c4a1..8cab3435d8da 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -905,7 +905,10 @@ static struct sk_buff *ovs_flow_cmd_build_info(const struct sw_flow *flow, retval = ovs_flow_cmd_fill_info(flow, dp_ifindex, skb, info->snd_portid, info->snd_seq, 0, cmd, ufid_flags); - BUG_ON(retval < 0); + if (WARN_ON_ONCE(retval < 0)) { + kfree_skb(skb); + skb = ERR_PTR(retval); + } return skb; } From 8a574f86652a4540a2433946ba826ccb87f398cc Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Sun, 1 Dec 2019 18:41:25 +0100 Subject: [PATCH 24/24] openvswitch: remove another BUG_ON() If we can't build the flow del notification, we can simply delete the flow, no need to crash the kernel. Still keep a WARN_ON to preserve debuggability. Note: the BUG_ON() predates the Fixes tag, but this change can be applied only after the mentioned commit. v1 -> v2: - do not leak an skb on error Fixes: aed067783e50 ("openvswitch: Minimize ovs_flow_cmd_del critical section.") Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/openvswitch/datapath.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 8cab3435d8da..1047e8043084 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1372,7 +1372,10 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) OVS_FLOW_CMD_DEL, ufid_flags); rcu_read_unlock(); - BUG_ON(err < 0); + if (WARN_ON_ONCE(err < 0)) { + kfree_skb(reply); + goto out_free; + } ovs_notify(&dp_flow_genl_family, reply, info); } else { @@ -1380,6 +1383,7 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) } } +out_free: ovs_flow_free(flow, true); return 0; unlock: