From cc01572e2fb080e279ca625f239aca61f435ebf3 Mon Sep 17 00:00:00 2001
From: Yossi Kuperman <yossiku@mellanox.com>
Date: Wed, 17 Jan 2018 15:52:41 +0200
Subject: [PATCH 01/19] xfrm: Add SA to hardware at the end of
 xfrm_state_construct()

Current code configures the hardware with a new SA before the state has been
fully initialized. During this time interval, an incoming ESP packet can cause
a crash due to a NULL dereference. More specifically, xfrm_input() considers
the packet as valid, and yet, anti-replay mechanism is not initialized.

Move hardware configuration to the end of xfrm_state_construct(), and mark
the state as valid once the SA is fully initialized.

Fixes: d77e38e612a0 ("xfrm: Add an IPsec hardware offloading API")
Signed-off-by: Aviad Yehezkel <aviadye@mellnaox.com>
Signed-off-by: Aviv Heller <avivh@mellanox.com>
Signed-off-by: Yossi Kuperman <yossiku@mellanox.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_state.c | 10 +++++++---
 net/xfrm/xfrm_user.c  | 18 +++++++++++-------
 2 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 429957412633..2d486492acdb 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -2272,8 +2272,6 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload)
 			goto error;
 	}
 
-	x->km.state = XFRM_STATE_VALID;
-
 error:
 	return err;
 }
@@ -2282,7 +2280,13 @@ EXPORT_SYMBOL(__xfrm_init_state);
 
 int xfrm_init_state(struct xfrm_state *x)
 {
-	return __xfrm_init_state(x, true, false);
+	int err;
+
+	err = __xfrm_init_state(x, true, false);
+	if (!err)
+		x->km.state = XFRM_STATE_VALID;
+
+	return err;
 }
 
 EXPORT_SYMBOL(xfrm_init_state);
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index bdb48e5dba04..7f52b8eb177d 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -598,13 +598,6 @@ static struct xfrm_state *xfrm_state_construct(struct net *net,
 			goto error;
 	}
 
-	if (attrs[XFRMA_OFFLOAD_DEV]) {
-		err = xfrm_dev_state_add(net, x,
-					 nla_data(attrs[XFRMA_OFFLOAD_DEV]));
-		if (err)
-			goto error;
-	}
-
 	if ((err = xfrm_alloc_replay_state_esn(&x->replay_esn, &x->preplay_esn,
 					       attrs[XFRMA_REPLAY_ESN_VAL])))
 		goto error;
@@ -620,6 +613,14 @@ static struct xfrm_state *xfrm_state_construct(struct net *net,
 	/* override default values from above */
 	xfrm_update_ae_params(x, attrs, 0);
 
+	/* configure the hardware if offload is requested */
+	if (attrs[XFRMA_OFFLOAD_DEV]) {
+		err = xfrm_dev_state_add(net, x,
+					 nla_data(attrs[XFRMA_OFFLOAD_DEV]));
+		if (err)
+			goto error;
+	}
+
 	return x;
 
 error:
@@ -662,6 +663,9 @@ static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh,
 		goto out;
 	}
 
+	if (x->km.state == XFRM_STATE_VOID)
+		x->km.state = XFRM_STATE_VALID;
+
 	c.seq = nlh->nlmsg_seq;
 	c.portid = nlh->nlmsg_pid;
 	c.event = nlh->nlmsg_type;

From aa5dd6fa6f5d4bdc82a67e952bba8ad2e98d77e2 Mon Sep 17 00:00:00 2001
From: Aviad Yehezkel <aviadye@mellanox.com>
Date: Thu, 18 Jan 2018 15:41:51 +0200
Subject: [PATCH 02/19] xfrm: fix error flow in case of add state fails

If add state fails in case of device offload, netdev refcount
will be negative since gc task is attempting to dev_free this state.
This is fixed by putting NULL in state dev field.

Signed-off-by: Aviad Yehezkel <aviadye@mellanox.com>
Signed-off-by: Boris Pismeny <borisp@mellanox.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_device.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index 30e5746085b8..ac9477189d1c 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -102,6 +102,7 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
 
 	err = dev->xfrmdev_ops->xdo_dev_state_add(x);
 	if (err) {
+		xso->dev = NULL;
 		dev_put(dev);
 		return err;
 	}

From 5efec5c655dd31944af440ff2b0a93facc4a7762 Mon Sep 17 00:00:00 2001
From: Yossi Kuperman <yossiku@mellanox.com>
Date: Tue, 23 Jan 2018 00:16:21 +0200
Subject: [PATCH 03/19] xfrm: Fix eth_hdr(skb)->h_proto to reflect inner IP
 version

IPSec tunnel mode supports encapsulation of IPv4 over IPv6 and vice-versa.

The outer IP header is stripped and the inner IP inherits the original
Ethernet header. Tcpdump fails to properly decode the inner packet in
case that h_proto is different than the inner IP version.

Fix h_proto to reflect the inner IP version.

Signed-off-by: Yossi Kuperman <yossiku@mellanox.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv4/xfrm4_mode_tunnel.c | 1 +
 net/ipv6/xfrm6_mode_tunnel.c | 1 +
 2 files changed, 2 insertions(+)

diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index e6265e2c274e..20ca486b3cad 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -92,6 +92,7 @@ static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
 
 	skb_reset_network_header(skb);
 	skb_mac_header_rebuild(skb);
+	eth_hdr(skb)->h_proto = skb->protocol;
 
 	err = 0;
 
diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c
index 02556e356f87..dc93002ff9d1 100644
--- a/net/ipv6/xfrm6_mode_tunnel.c
+++ b/net/ipv6/xfrm6_mode_tunnel.c
@@ -92,6 +92,7 @@ static int xfrm6_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
 
 	skb_reset_network_header(skb);
 	skb_mac_header_rebuild(skb);
+	eth_hdr(skb)->h_proto = skb->protocol;
 
 	err = 0;
 

From 545d8ae7affff7fb4f8bfd327c7c7790056535c4 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <garsilva@embeddedor.com>
Date: Mon, 22 Jan 2018 16:34:09 -0600
Subject: [PATCH 04/19] xfrm: fix boolean assignment in xfrm_get_type_offload

Assign true or false to boolean variables instead of an integer value.

This issue was detected with the help of Coccinelle.

Fixes: ffdb5211da1c ("xfrm: Auto-load xfrm offload modules")
Signed-off-by: Gustavo A. R. Silva <garsilva@embeddedor.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_state.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 2d486492acdb..a3785f538018 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -317,7 +317,7 @@ xfrm_get_type_offload(u8 proto, unsigned short family, bool try_load)
 
 	if (!type && try_load) {
 		request_module("xfrm-offload-%d-%d", family, proto);
-		try_load = 0;
+		try_load = false;
 		goto retry;
 	}
 

From 02612bb05e51df8489db5e94d0cf8d1c81f87b0c Mon Sep 17 00:00:00 2001
From: Guillaume Nault <g.nault@alphalink.fr>
Date: Mon, 22 Jan 2018 18:06:37 +0100
Subject: [PATCH 05/19] pppoe: take ->needed_headroom of lower device into
 account on xmit

In pppoe_sendmsg(), reserving dev->hard_header_len bytes of headroom
was probably fine before the introduction of ->needed_headroom in
commit f5184d267c1a ("net: Allow netdevices to specify needed head/tailroom").

But now, virtual devices typically advertise the size of their overhead
in dev->needed_headroom, so we must also take it into account in
skb_reserve().
Allocation size of skb is also updated to take dev->needed_tailroom
into account and replace the arbitrary 32 bytes with the real size of
a PPPoE header.

This issue was discovered by syzbot, who connected a pppoe socket to a
gre device which had dev->header_ops->create == ipgre_header and
dev->hard_header_len == 0. Therefore, PPPoE didn't reserve any
headroom, and dev_hard_header() crashed when ipgre_header() tried to
prepend its header to skb->data.

skbuff: skb_under_panic: text:000000001d390b3a len:31 put:24
head:00000000d8ed776f data:000000008150e823 tail:0x7 end:0xc0 dev:gre0
------------[ cut here ]------------
kernel BUG at net/core/skbuff.c:104!
invalid opcode: 0000 [#1] SMP KASAN
Dumping ftrace buffer:
    (ftrace buffer empty)
Modules linked in:
CPU: 1 PID: 3670 Comm: syzkaller801466 Not tainted
4.15.0-rc7-next-20180115+ #97
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
Google 01/01/2011
RIP: 0010:skb_panic+0x162/0x1f0 net/core/skbuff.c:100
RSP: 0018:ffff8801d9bd7840 EFLAGS: 00010282
RAX: 0000000000000083 RBX: ffff8801d4f083c0 RCX: 0000000000000000
RDX: 0000000000000083 RSI: 1ffff1003b37ae92 RDI: ffffed003b37aefc
RBP: ffff8801d9bd78a8 R08: 1ffff1003b37ae8a R09: 0000000000000000
R10: 0000000000000001 R11: 0000000000000000 R12: ffffffff86200de0
R13: ffffffff84a981ad R14: 0000000000000018 R15: ffff8801d2d34180
FS:  00000000019c4880(0000) GS:ffff8801db300000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00000000208bc000 CR3: 00000001d9111001 CR4: 00000000001606e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
  skb_under_panic net/core/skbuff.c:114 [inline]
  skb_push+0xce/0xf0 net/core/skbuff.c:1714
  ipgre_header+0x6d/0x4e0 net/ipv4/ip_gre.c:879
  dev_hard_header include/linux/netdevice.h:2723 [inline]
  pppoe_sendmsg+0x58e/0x8b0 drivers/net/ppp/pppoe.c:890
  sock_sendmsg_nosec net/socket.c:630 [inline]
  sock_sendmsg+0xca/0x110 net/socket.c:640
  sock_write_iter+0x31a/0x5d0 net/socket.c:909
  call_write_iter include/linux/fs.h:1775 [inline]
  do_iter_readv_writev+0x525/0x7f0 fs/read_write.c:653
  do_iter_write+0x154/0x540 fs/read_write.c:932
  vfs_writev+0x18a/0x340 fs/read_write.c:977
  do_writev+0xfc/0x2a0 fs/read_write.c:1012
  SYSC_writev fs/read_write.c:1085 [inline]
  SyS_writev+0x27/0x30 fs/read_write.c:1082
  entry_SYSCALL_64_fastpath+0x29/0xa0

Admittedly PPPoE shouldn't be allowed to run on non Ethernet-like
interfaces, but reserving space for ->needed_headroom is a more
fundamental issue that needs to be addressed first.

Same problem exists for __pppoe_xmit(), which also needs to take
dev->needed_headroom into account in skb_cow_head().

Fixes: f5184d267c1a ("net: Allow netdevices to specify needed head/tailroom")
Reported-by: syzbot+ed0838d0fa4c4f2b528e20286e6dc63effc7c14d@syzkaller.appspotmail.com
Signed-off-by: Guillaume Nault <g.nault@alphalink.fr>
Reviewed-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ppp/pppoe.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c
index 4e1da1645b15..5aa59f41bf8c 100644
--- a/drivers/net/ppp/pppoe.c
+++ b/drivers/net/ppp/pppoe.c
@@ -842,6 +842,7 @@ static int pppoe_sendmsg(struct socket *sock, struct msghdr *m,
 	struct pppoe_hdr *ph;
 	struct net_device *dev;
 	char *start;
+	int hlen;
 
 	lock_sock(sk);
 	if (sock_flag(sk, SOCK_DEAD) || !(sk->sk_state & PPPOX_CONNECTED)) {
@@ -860,16 +861,16 @@ static int pppoe_sendmsg(struct socket *sock, struct msghdr *m,
 	if (total_len > (dev->mtu + dev->hard_header_len))
 		goto end;
 
-
-	skb = sock_wmalloc(sk, total_len + dev->hard_header_len + 32,
-			   0, GFP_KERNEL);
+	hlen = LL_RESERVED_SPACE(dev);
+	skb = sock_wmalloc(sk, hlen + sizeof(*ph) + total_len +
+			   dev->needed_tailroom, 0, GFP_KERNEL);
 	if (!skb) {
 		error = -ENOMEM;
 		goto end;
 	}
 
 	/* Reserve space for headers. */
-	skb_reserve(skb, dev->hard_header_len);
+	skb_reserve(skb, hlen);
 	skb_reset_network_header(skb);
 
 	skb->dev = dev;
@@ -930,7 +931,7 @@ static int __pppoe_xmit(struct sock *sk, struct sk_buff *skb)
 	/* Copy the data if there is no space for the header or if it's
 	 * read-only.
 	 */
-	if (skb_cow_head(skb, sizeof(*ph) + dev->hard_header_len))
+	if (skb_cow_head(skb, LL_RESERVED_SPACE(dev) + sizeof(*ph)))
 		goto abort;
 
 	__skb_push(skb, sizeof(*ph));

From e9191ffb65d8e159680ce0ad2224e1acbde6985c Mon Sep 17 00:00:00 2001
From: Ben Hutchings <ben.hutchings@codethink.co.uk>
Date: Mon, 22 Jan 2018 20:06:42 +0000
Subject: [PATCH 06/19] ipv6: Fix getsockopt() for sockets with default
 IPV6_AUTOFLOWLABEL

Commit 513674b5a2c9 ("net: reevalulate autoflowlabel setting after
sysctl setting") removed the initialisation of
ipv6_pinfo::autoflowlabel and added a second flag to indicate
whether this field or the net namespace default should be used.

The getsockopt() handling for this case was not updated, so it
currently returns 0 for all sockets for which IPV6_AUTOFLOWLABEL is
not explicitly enabled.  Fix it to return the effective value, whether
that has been set at the socket or net namespace level.

Fixes: 513674b5a2c9 ("net: reevalulate autoflowlabel setting after sysctl ...")
Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ipv6.h       | 1 +
 net/ipv6/ip6_output.c    | 2 +-
 net/ipv6/ipv6_sockglue.c | 2 +-
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index f73797e2fa60..221238254eb7 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -331,6 +331,7 @@ int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq,
 			   int flags);
 int ip6_flowlabel_init(void);
 void ip6_flowlabel_cleanup(void);
+bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np);
 
 static inline void fl6_sock_release(struct ip6_flowlabel *fl)
 {
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 4f7d8de56611..3763dc01e374 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -166,7 +166,7 @@ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
 }
 
-static bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
+bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
 {
 	if (!np->autoflowlabel_set)
 		return ip6_default_np_autolabel(net);
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 2d4680e0376f..e8ffb5b5d84e 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -1336,7 +1336,7 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 		break;
 
 	case IPV6_AUTOFLOWLABEL:
-		val = np->autoflowlabel;
+		val = ip6_autoflowlabel(sock_net(sk), np);
 		break;
 
 	case IPV6_RECVFRAGSIZE:

From 848b159835ddef99cc4193083f7e786c3992f580 Mon Sep 17 00:00:00 2001
From: Neil Horman <nhorman@tuxdriver.com>
Date: Mon, 22 Jan 2018 16:06:37 -0500
Subject: [PATCH 07/19] vmxnet3: repair memory leak

with the introduction of commit
b0eb57cb97e7837ebb746404c2c58c6f536f23fa, it appears that rq->buf_info
is improperly handled.  While it is heap allocated when an rx queue is
setup, and freed when torn down, an old line of code in
vmxnet3_rq_destroy was not properly removed, leading to rq->buf_info[0]
being set to NULL prior to its being freed, causing a memory leak, which
eventually exhausts the system on repeated create/destroy operations
(for example, when  the mtu of a vmxnet3 interface is changed
frequently.

Fix is pretty straight forward, just move the NULL set to after the
free.

Tested by myself with successful results

Applies to net, and should likely be queued for stable, please

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Reported-By: boyang@redhat.com
CC: boyang@redhat.com
CC: Shrikrishna Khare <skhare@vmware.com>
CC: "VMware, Inc." <pv-drivers@vmware.com>
CC: David S. Miller <davem@davemloft.net>
Acked-by: Shrikrishna Khare <skhare@vmware.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vmxnet3/vmxnet3_drv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c
index d1c7029ded7c..cf95290b160c 100644
--- a/drivers/net/vmxnet3/vmxnet3_drv.c
+++ b/drivers/net/vmxnet3/vmxnet3_drv.c
@@ -1616,7 +1616,6 @@ static void vmxnet3_rq_destroy(struct vmxnet3_rx_queue *rq,
 					  rq->rx_ring[i].basePA);
 			rq->rx_ring[i].base = NULL;
 		}
-		rq->buf_info[i] = NULL;
 	}
 
 	if (rq->data_ring.base) {
@@ -1638,6 +1637,7 @@ static void vmxnet3_rq_destroy(struct vmxnet3_rx_queue *rq,
 			(rq->rx_ring[0].size + rq->rx_ring[1].size);
 		dma_free_coherent(&adapter->pdev->dev, sz, rq->buf_info[0],
 				  rq->buf_info_pa);
+		rq->buf_info[0] = rq->buf_info[1] = NULL;
 	}
 }
 

From 1ecdaea02ca6bfacf2ecda500dc1af51e9780c42 Mon Sep 17 00:00:00 2001
From: Yuval Mintz <yuvalm@mellanox.com>
Date: Wed, 24 Jan 2018 10:02:09 +0100
Subject: [PATCH 08/19] mlxsw: spectrum_router: Don't log an error on missing
 neighbor

Driver periodically samples all neighbors configured in device
in order to update the kernel regarding their state. When finding
an entry configured in HW that doesn't show in neigh_lookup()
driver logs an error message.
This introduces a race when removing multiple neighbors -
it's possible that a given entry would still be configured in HW
as its removal is still being processed but is already removed
from the kernel's neighbor tables.

Simply remove the error message and gracefully accept such events.

Fixes: c723c735fa6b ("mlxsw: spectrum_router: Periodically update the kernel's neigh table")
Fixes: 60f040ca11b9 ("mlxsw: spectrum_router: Periodically dump active IPv6 neighbours")
Signed-off-by: Yuval Mintz <yuvalm@mellanox.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index 6c0391c13fe0..7042c855a5d6 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -1942,11 +1942,8 @@ static void mlxsw_sp_router_neigh_ent_ipv4_process(struct mlxsw_sp *mlxsw_sp,
 	dipn = htonl(dip);
 	dev = mlxsw_sp->router->rifs[rif]->dev;
 	n = neigh_lookup(&arp_tbl, &dipn, dev);
-	if (!n) {
-		netdev_err(dev, "Failed to find matching neighbour for IP=%pI4h\n",
-			   &dip);
+	if (!n)
 		return;
-	}
 
 	netdev_dbg(dev, "Updating neighbour with IP=%pI4h\n", &dip);
 	neigh_event_send(n, NULL);
@@ -1973,11 +1970,8 @@ static void mlxsw_sp_router_neigh_ent_ipv6_process(struct mlxsw_sp *mlxsw_sp,
 
 	dev = mlxsw_sp->router->rifs[rif]->dev;
 	n = neigh_lookup(&nd_tbl, &dip, dev);
-	if (!n) {
-		netdev_err(dev, "Failed to find matching neighbour for IP=%pI6c\n",
-			   &dip);
+	if (!n)
 		return;
-	}
 
 	netdev_dbg(dev, "Updating neighbour with IP=%pI6c\n", &dip);
 	neigh_event_send(n, NULL);

From 560a66075d694e6ec24c60967b4d93d97cbb33d1 Mon Sep 17 00:00:00 2001
From: Wolfgang Bumiller <w.bumiller@proxmox.com>
Date: Thu, 18 Jan 2018 11:32:35 +0100
Subject: [PATCH 09/19] net: sched: em_nbyte: don't add the data offset twice

'ptr' is shifted by the offset and then validated,
the memcmp should not add it a second time.

Signed-off-by: Wolfgang Bumiller <w.bumiller@proxmox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/em_nbyte.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sched/em_nbyte.c b/net/sched/em_nbyte.c
index df3110d69585..07c10bac06a0 100644
--- a/net/sched/em_nbyte.c
+++ b/net/sched/em_nbyte.c
@@ -51,7 +51,7 @@ static int em_nbyte_match(struct sk_buff *skb, struct tcf_ematch *em,
 	if (!tcf_valid_offset(skb, ptr, nbyte->hdr.len))
 		return 0;
 
-	return !memcmp(ptr + nbyte->hdr.off, nbyte->pattern, nbyte->hdr.len);
+	return !memcmp(ptr, nbyte->pattern, nbyte->hdr.len);
 }
 
 static struct tcf_ematch_ops em_nbyte_ops = {

From d3303a65a00c94372ddab831570647488e6c06e2 Mon Sep 17 00:00:00 2001
From: Wolfgang Bumiller <w.bumiller@proxmox.com>
Date: Thu, 18 Jan 2018 11:32:36 +0100
Subject: [PATCH 10/19] net: sched: fix TCF_LAYER_LINK case in tcf_get_base_ptr

TCF_LAYER_LINK and TCF_LAYER_NETWORK returned the same pointer as
skb->data points to the network header.
Use skb_mac_header instead.

Signed-off-by: Wolfgang Bumiller <w.bumiller@proxmox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 8e08b6da72f3..753ac9361154 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -522,7 +522,7 @@ static inline unsigned char * tcf_get_base_ptr(struct sk_buff *skb, int layer)
 {
 	switch (layer) {
 		case TCF_LAYER_LINK:
-			return skb->data;
+			return skb_mac_header(skb);
 		case TCF_LAYER_NETWORK:
 			return skb_network_header(skb);
 		case TCF_LAYER_TRANSPORT:

From 581e7226a5d43f629eb6399a121f85f6a15f81be Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@quantonium.net>
Date: Wed, 24 Jan 2018 12:35:40 -0800
Subject: [PATCH 11/19] kcm: Only allow TCP sockets to be attached to a KCM mux

TCP sockets for IPv4 and IPv6 that are not listeners or in closed
stated are allowed to be attached to a KCM mux.

Fixes: ab7ac4eb9832 ("kcm: Kernel Connection Multiplexor module")
Reported-by: syzbot+8865eaff7f9acd593945@syzkaller.appspotmail.com
Signed-off-by: Tom Herbert <tom@quantonium.net>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/kcm/kcmsock.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index d4e98f20fc2a..7632797fb68e 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -1387,8 +1387,13 @@ static int kcm_attach(struct socket *sock, struct socket *csock,
 	if (!csk)
 		return -EINVAL;
 
-	/* We must prevent loops or risk deadlock ! */
-	if (csk->sk_family == PF_KCM)
+	/* Only allow TCP sockets to be attached for now */
+	if ((csk->sk_family != AF_INET && csk->sk_family != AF_INET6) ||
+	    csk->sk_protocol != IPPROTO_TCP)
+		return -EOPNOTSUPP;
+
+	/* Don't allow listeners or closed sockets */
+	if (csk->sk_state == TCP_LISTEN || csk->sk_state == TCP_CLOSE)
 		return -EOPNOTSUPP;
 
 	psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL);

From e5571240236c5652f3e079b1d5866716a7ad819c Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@quantonium.net>
Date: Wed, 24 Jan 2018 12:35:41 -0800
Subject: [PATCH 12/19] kcm: Check if sk_user_data already set in kcm_attach

This is needed to prevent sk_user_data being overwritten.
The check is done under the callback lock. This should prevent
a socket from being attached twice to a KCM mux. It also prevents
a socket from being attached for other use cases of sk_user_data
as long as the other cases set sk_user_data under the lock.
Followup work is needed to unify all the use cases of sk_user_data
to use the same locking.

Reported-by: syzbot+114b15f2be420a8886c3@syzkaller.appspotmail.com
Fixes: ab7ac4eb9832 ("kcm: Kernel Connection Multiplexor module")
Signed-off-by: Tom Herbert <tom@quantonium.net>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/kcm/kcmsock.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index 7632797fb68e..4a8d407f8902 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -1410,9 +1410,18 @@ static int kcm_attach(struct socket *sock, struct socket *csock,
 		return err;
 	}
 
-	sock_hold(csk);
-
 	write_lock_bh(&csk->sk_callback_lock);
+
+	/* Check if sk_user_data is aready by KCM or someone else.
+	 * Must be done under lock to prevent race conditions.
+	 */
+	if (csk->sk_user_data) {
+		write_unlock_bh(&csk->sk_callback_lock);
+		strp_done(&psock->strp);
+		kmem_cache_free(kcm_psockp, psock);
+		return -EALREADY;
+	}
+
 	psock->save_data_ready = csk->sk_data_ready;
 	psock->save_write_space = csk->sk_write_space;
 	psock->save_state_change = csk->sk_state_change;
@@ -1420,8 +1429,11 @@ static int kcm_attach(struct socket *sock, struct socket *csock,
 	csk->sk_data_ready = psock_data_ready;
 	csk->sk_write_space = psock_write_space;
 	csk->sk_state_change = psock_state_change;
+
 	write_unlock_bh(&csk->sk_callback_lock);
 
+	sock_hold(csk);
+
 	/* Finished initialization, now add the psock to the MUX. */
 	spin_lock_bh(&mux->lock);
 	head = &mux->psocks;

From 4de49474b18936d62797d1dd451c6c4db1a7b119 Mon Sep 17 00:00:00 2001
From: Michal Kalderon <Michal.Kalderon@cavium.com>
Date: Tue, 23 Jan 2018 11:33:46 +0200
Subject: [PATCH 13/19] qed: Remove reserveration of dpi for kernel

Double reservation for kernel dedicated dpi was performed.
Once in the core module and once in qedr.
Remove the reservation from core.

Signed-off-by: Michal Kalderon <Michal.Kalderon@cavium.com>
Signed-off-by: Ariel Elior <Ariel.Elior@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_rdma.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_rdma.c b/drivers/net/ethernet/qlogic/qed/qed_rdma.c
index c8c4b3940564..9d6e2d43d4de 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_rdma.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_rdma.c
@@ -615,9 +615,6 @@ static int qed_rdma_reserve_lkey(struct qed_hwfn *p_hwfn)
 {
 	struct qed_rdma_device *dev = p_hwfn->p_rdma_info->dev;
 
-	/* The first DPI is reserved for the Kernel */
-	__set_bit(0, p_hwfn->p_rdma_info->dpi_map.bitmap);
-
 	/* Tid 0 will be used as the key for "reserved MR".
 	 * The driver should allocate memory for it so it can be loaded but no
 	 * ramrod should be passed on it.

From 1fe280a056dff50774bd59c3e61187cf8c0ccf10 Mon Sep 17 00:00:00 2001
From: Michal Kalderon <Michal.Kalderon@cavium.com>
Date: Tue, 23 Jan 2018 11:33:47 +0200
Subject: [PATCH 14/19] qed: Free reserved MR tid

A tid was allocated for reserved MR during initialization but
not freed. This lead to an annoying output message during
rdma unload flow.

Signed-off-by: Michal Kalderon <Michal.Kalderon@cavium.com>
Signed-off-by: Ariel Elior <Ariel.Elior@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_rdma.c | 28 +++++++++++++---------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_rdma.c b/drivers/net/ethernet/qlogic/qed/qed_rdma.c
index 9d6e2d43d4de..b7abb8205d3a 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_rdma.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_rdma.c
@@ -358,10 +358,27 @@ static void qed_rdma_resc_free(struct qed_hwfn *p_hwfn)
 	kfree(p_rdma_info);
 }
 
+static void qed_rdma_free_tid(void *rdma_cxt, u32 itid)
+{
+        struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt;
+
+        DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "itid = %08x\n", itid);
+
+        spin_lock_bh(&p_hwfn->p_rdma_info->lock);
+        qed_bmap_release_id(p_hwfn, &p_hwfn->p_rdma_info->tid_map, itid);
+        spin_unlock_bh(&p_hwfn->p_rdma_info->lock);
+}
+
+static void qed_rdma_free_reserved_lkey(struct qed_hwfn *p_hwfn)
+{
+	qed_rdma_free_tid(p_hwfn, p_hwfn->p_rdma_info->dev->reserved_lkey);
+}
+
 static void qed_rdma_free(struct qed_hwfn *p_hwfn)
 {
 	DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Freeing RDMA\n");
 
+	qed_rdma_free_reserved_lkey(p_hwfn);
 	qed_rdma_resc_free(p_hwfn);
 }
 
@@ -794,17 +811,6 @@ static struct qed_rdma_device *qed_rdma_query_device(void *rdma_cxt)
 	return p_hwfn->p_rdma_info->dev;
 }
 
-static void qed_rdma_free_tid(void *rdma_cxt, u32 itid)
-{
-	struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt;
-
-	DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "itid = %08x\n", itid);
-
-	spin_lock_bh(&p_hwfn->p_rdma_info->lock);
-	qed_bmap_release_id(p_hwfn, &p_hwfn->p_rdma_info->tid_map, itid);
-	spin_unlock_bh(&p_hwfn->p_rdma_info->lock);
-}
-
 static void qed_rdma_cnq_prod_update(void *rdma_cxt, u8 qz_offset, u16 prod)
 {
 	struct qed_hwfn *p_hwfn;

From b7051cb8dadd69f85da5989017af2bb35b418950 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Tue, 23 Jan 2018 00:08:40 -0800
Subject: [PATCH 15/19] i40e: flower: check if TC offload is enabled on a
 netdev

Since TC block changes drivers are required to check if
the TC hw offload flag is set on the interface themselves.

Fixes: 2f4b411a3d67 ("i40e: Enable cloud filters via tc-flower")
Fixes: 44ae12a768b7 ("net: sched: move the can_offload check from binding phase to rule insertion phase")
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Amritha Nambiar <amritha.nambiar@intel.com>
Acked-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 42dcaefc4c19..af792112a2d3 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -7505,6 +7505,8 @@ static int i40e_setup_tc_cls_flower(struct i40e_netdev_priv *np,
 {
 	struct i40e_vsi *vsi = np->vsi;
 
+	if (!tc_can_offload(vsi->netdev))
+		return -EOPNOTSUPP;
 	if (cls_flower->common.chain_index)
 		return -EOPNOTSUPP;
 

From e9cb4239134c860e5f92c75bf5321bd377bb505b Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Tue, 23 Jan 2018 17:27:25 +0800
Subject: [PATCH 16/19] vhost: use mutex_lock_nested() in vhost_dev_lock_vqs()

We used to call mutex_lock() in vhost_dev_lock_vqs() which tries to
hold mutexes of all virtqueues. This may confuse lockdep to report a
possible deadlock because of trying to hold locks belong to same
class. Switch to use mutex_lock_nested() to avoid false positive.

Fixes: 6b1e6cc7855b0 ("vhost: new device IOTLB API")
Reported-by: syzbot+dbb7c1161485e61b0241@syzkaller.appspotmail.com
Signed-off-by: Jason Wang <jasowang@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/vhost/vhost.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 33ac2b186b85..549771a0cd8b 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -904,7 +904,7 @@ static void vhost_dev_lock_vqs(struct vhost_dev *d)
 {
 	int i = 0;
 	for (i = 0; i < d->nvqs; ++i)
-		mutex_lock(&d->vqs[i]->mutex);
+		mutex_lock_nested(&d->vqs[i]->mutex, i);
 }
 
 static void vhost_dev_unlock_vqs(struct vhost_dev *d)

From 6f3180afbb22106d96a1320e175562f36a4d3506 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Tue, 23 Jan 2018 17:27:26 +0800
Subject: [PATCH 17/19] vhost: do not try to access device IOTLB when not
 initialized

The code will try to access dev->iotlb when processing
VHOST_IOTLB_INVALIDATE even if it was not initialized which may lead
to NULL pointer dereference. Fixes this by check dev->iotlb before.

Fixes: 6b1e6cc7855b0 ("vhost: new device IOTLB API")
Signed-off-by: Jason Wang <jasowang@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/vhost/vhost.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 549771a0cd8b..5727b186b3ca 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1015,6 +1015,10 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev,
 		vhost_iotlb_notify_vq(dev, msg);
 		break;
 	case VHOST_IOTLB_INVALIDATE:
+		if (!dev->iotlb) {
+			ret = -EFAULT;
+			break;
+		}
 		vhost_vq_meta_reset(dev);
 		vhost_del_umem_range(dev->iotlb, msg->iova,
 				     msg->iova + msg->size - 1);

From 45d6e545505fd32edb812f085be7de45b6a5c0af Mon Sep 17 00:00:00 2001
From: Ivan Mikhaylov <ivan@de.ibm.com>
Date: Wed, 24 Jan 2018 15:53:24 +0300
Subject: [PATCH 18/19] net/ibm/emac: add 8192 rx/tx fifo size

emac4syn chips has availability to use 8192 rx/tx fifo buffer sizes,
in current state if we set it up in dts 8192 as example, we will get
only 2048 which may impact on network speed.

Signed-off-by: Ivan Mikhaylov <ivan@de.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ibm/emac/core.c | 6 ++++++
 drivers/net/ethernet/ibm/emac/emac.h | 2 ++
 2 files changed, 8 insertions(+)

diff --git a/drivers/net/ethernet/ibm/emac/core.c b/drivers/net/ethernet/ibm/emac/core.c
index 7feff2450ed6..241db3199b88 100644
--- a/drivers/net/ethernet/ibm/emac/core.c
+++ b/drivers/net/ethernet/ibm/emac/core.c
@@ -494,6 +494,9 @@ static u32 __emac_calc_base_mr1(struct emac_instance *dev, int tx_size, int rx_s
 	case 16384:
 		ret |= EMAC_MR1_RFS_16K;
 		break;
+	case 8192:
+		ret |= EMAC4_MR1_RFS_8K;
+		break;
 	case 4096:
 		ret |= EMAC_MR1_RFS_4K;
 		break;
@@ -516,6 +519,9 @@ static u32 __emac4_calc_base_mr1(struct emac_instance *dev, int tx_size, int rx_
 	case 16384:
 		ret |= EMAC4_MR1_TFS_16K;
 		break;
+	case 8192:
+		ret |= EMAC4_MR1_TFS_8K;
+		break;
 	case 4096:
 		ret |= EMAC4_MR1_TFS_4K;
 		break;
diff --git a/drivers/net/ethernet/ibm/emac/emac.h b/drivers/net/ethernet/ibm/emac/emac.h
index 5afcc27ceebb..d0a0e3b3f283 100644
--- a/drivers/net/ethernet/ibm/emac/emac.h
+++ b/drivers/net/ethernet/ibm/emac/emac.h
@@ -151,9 +151,11 @@ struct emac_regs {
 
 #define EMAC4_MR1_RFS_2K		0x00100000
 #define EMAC4_MR1_RFS_4K		0x00180000
+#define EMAC4_MR1_RFS_8K		0x00200000
 #define EMAC4_MR1_RFS_16K		0x00280000
 #define EMAC4_MR1_TFS_2K       		0x00020000
 #define EMAC4_MR1_TFS_4K		0x00030000
+#define EMAC4_MR1_TFS_8K		0x00040000
 #define EMAC4_MR1_TFS_16K		0x00050000
 #define EMAC4_MR1_TR			0x00008000
 #define EMAC4_MR1_MWSW_001		0x00001000

From 624ca9c33c8a853a4a589836e310d776620f4ab9 Mon Sep 17 00:00:00 2001
From: Ivan Mikhaylov <ivan@de.ibm.com>
Date: Wed, 24 Jan 2018 15:53:25 +0300
Subject: [PATCH 19/19] net/ibm/emac: wrong bit is used for STA control
 register write

STA control register has areas of mode and opcodes for opeations. 18 bit is
using for mode selection, where 0 is old MIO/MDIO access method and 1 is
indirect access mode. 19-20 bits are using for setting up read/write
operation(STA opcodes). In current state 'read' is set into old MIO/MDIO mode
with 19 bit and write operation is set into 18 bit which is mode selection,
not a write operation. To correlate write with read we set it into 20 bit.
All those bit operations are MSB 0 based.

Signed-off-by: Ivan Mikhaylov <ivan@de.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ibm/emac/emac.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/ibm/emac/emac.h b/drivers/net/ethernet/ibm/emac/emac.h
index d0a0e3b3f283..c26d2631ca30 100644
--- a/drivers/net/ethernet/ibm/emac/emac.h
+++ b/drivers/net/ethernet/ibm/emac/emac.h
@@ -244,7 +244,7 @@ struct emac_regs {
 #define EMAC_STACR_PHYE			0x00004000
 #define EMAC_STACR_STAC_MASK		0x00003000
 #define EMAC_STACR_STAC_READ		0x00001000
-#define EMAC_STACR_STAC_WRITE		0x00002000
+#define EMAC_STACR_STAC_WRITE		0x00000800
 #define EMAC_STACR_OPBC_MASK		0x00000C00
 #define EMAC_STACR_OPBC_50		0x00000000
 #define EMAC_STACR_OPBC_66		0x00000400