From 0fc4dcc13f090c941abfab453a24945a4005b350 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= <toke@redhat.com>
Date: Tue, 29 Jun 2021 11:39:07 +0200
Subject: [PATCH 001/158] bpf, devmap: Convert remaining READ_ONCE() to
 rcu_dereference_check()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There were a couple of READ_ONCE()-invocations left-over by the devmap
RCU conversion. Convert these to rcu_dereference_check() as well to avoid
complaints from sparse.

Fixes: 782347b6bcad ("xdp: Add proper __rcu annotations to redirect map entries")
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20210629093907.573598-1-toke@redhat.com
---
 kernel/bpf/devmap.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 2546dafd6672..fdc20892837c 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -558,7 +558,8 @@ int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
 
 	if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
 		for (i = 0; i < map->max_entries; i++) {
-			dst = READ_ONCE(dtab->netdev_map[i]);
+			dst = rcu_dereference_check(dtab->netdev_map[i],
+						    rcu_read_lock_bh_held());
 			if (!is_valid_dst(dst, xdp, exclude_ifindex))
 				continue;
 
@@ -654,7 +655,8 @@ int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
 
 	if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
 		for (i = 0; i < map->max_entries; i++) {
-			dst = READ_ONCE(dtab->netdev_map[i]);
+			dst = rcu_dereference_check(dtab->netdev_map[i],
+						    rcu_read_lock_bh_held());
 			if (!dst || dst->dev->ifindex == exclude_ifindex)
 				continue;
 

From 56ea7ed103b46970e171eb1c95916f393d64eeff Mon Sep 17 00:00:00 2001
From: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Date: Thu, 13 May 2021 17:31:03 -0700
Subject: [PATCH 002/158] igc: Fix use-after-free error during reset

Cleans the next descriptor to watch (next_to_watch) when cleaning the
TX ring.

Failure to do so can cause invalid memory accesses. If igc_poll() runs
while the controller is being reset, this can lead to the driver trying
to free an skb that was already freed.

Log message:

 [  101.525242] refcount_t: underflow; use-after-free.
 [  101.525251] WARNING: CPU: 1 PID: 646 at lib/refcount.c:28 refcount_warn_saturate+0xab/0xf0
 [  101.525259] Modules linked in: sch_etf(E) sch_mqprio(E) rfkill(E) intel_rapl_msr(E) intel_rapl_common(E)
 x86_pkg_temp_thermal(E) intel_powerclamp(E) coretemp(E) binfmt_misc(E) kvm_intel(E) kvm(E) irqbypass(E) crc32_pclmul(E)
 ghash_clmulni_intel(E) aesni_intel(E) mei_wdt(E) libaes(E) crypto_simd(E) cryptd(E) glue_helper(E) snd_hda_codec_hdmi(E)
 rapl(E) intel_cstate(E) snd_hda_intel(E) snd_intel_dspcfg(E) sg(E) soundwire_intel(E) intel_uncore(E) at24(E)
 soundwire_generic_allocation(E) iTCO_wdt(E) soundwire_cadence(E) intel_pmc_bxt(E) serio_raw(E) snd_hda_codec(E)
 iTCO_vendor_support(E) watchdog(E) snd_hda_core(E) snd_hwdep(E) snd_soc_core(E) snd_compress(E) snd_pcsp(E)
 soundwire_bus(E) snd_pcm(E) evdev(E) snd_timer(E) mei_me(E) snd(E) soundcore(E) mei(E) configfs(E) ip_tables(E) x_tables(E)
 autofs4(E) ext4(E) crc32c_generic(E) crc16(E) mbcache(E) jbd2(E) sd_mod(E) t10_pi(E) crc_t10dif(E) crct10dif_generic(E)
 i915(E) ahci(E) libahci(E) ehci_pci(E) igb(E) xhci_pci(E) ehci_hcd(E)
 [  101.525303]  drm_kms_helper(E) dca(E) xhci_hcd(E) libata(E) crct10dif_pclmul(E) cec(E) crct10dif_common(E) tsn(E) igc(E)
 e1000e(E) ptp(E) i2c_i801(E) crc32c_intel(E) psmouse(E) i2c_algo_bit(E) i2c_smbus(E) scsi_mod(E) lpc_ich(E) pps_core(E)
 usbcore(E) drm(E) button(E) video(E)
 [  101.525318] CPU: 1 PID: 646 Comm: irq/37-enp7s0-T Tainted: G            E     5.10.30-rt37-tsn1-rt-ipipe #ipipe
 [  101.525320] Hardware name: SIEMENS AG SIMATIC IPC427D/A5E31233588, BIOS V17.02.09 03/31/2017
 [  101.525322] RIP: 0010:refcount_warn_saturate+0xab/0xf0
 [  101.525325] Code: 05 31 48 44 01 01 e8 f0 c6 42 00 0f 0b c3 80 3d 1f 48 44 01 00 75 90 48 c7 c7 78 a8 f3 a6 c6 05 0f 48
 44 01 01 e8 d1 c6 42 00 <0f> 0b c3 80 3d fe 47 44 01 00 0f 85 6d ff ff ff 48 c7 c7 d0 a8 f3
 [  101.525327] RSP: 0018:ffffbdedc0917cb8 EFLAGS: 00010286
 [  101.525329] RAX: 0000000000000000 RBX: ffff98fd6becbf40 RCX: 0000000000000001
 [  101.525330] RDX: 0000000000000001 RSI: ffffffffa6f2700c RDI: 00000000ffffffff
 [  101.525332] RBP: ffff98fd6becc14c R08: ffffffffa7463d00 R09: ffffbdedc0917c50
 [  101.525333] R10: ffffffffa74c3578 R11: 0000000000000034 R12: 00000000ffffff00
 [  101.525335] R13: ffff98fd6b0b1000 R14: 0000000000000039 R15: ffff98fd6be35c40
 [  101.525337] FS:  0000000000000000(0000) GS:ffff98fd6e240000(0000) knlGS:0000000000000000
 [  101.525339] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 [  101.525341] CR2: 00007f34135a3a70 CR3: 0000000150210003 CR4: 00000000001706e0
 [  101.525343] Call Trace:
 [  101.525346]  sock_wfree+0x9c/0xa0
 [  101.525353]  unix_destruct_scm+0x7b/0xa0
 [  101.525358]  skb_release_head_state+0x40/0x90
 [  101.525362]  skb_release_all+0xe/0x30
 [  101.525364]  napi_consume_skb+0x57/0x160
 [  101.525367]  igc_poll+0xb7/0xc80 [igc]
 [  101.525376]  ? sched_clock+0x5/0x10
 [  101.525381]  ? sched_clock_cpu+0xe/0x100
 [  101.525385]  net_rx_action+0x14c/0x410
 [  101.525388]  __do_softirq+0xe9/0x2f4
 [  101.525391]  __local_bh_enable_ip+0xe3/0x110
 [  101.525395]  ? irq_finalize_oneshot.part.47+0xe0/0xe0
 [  101.525398]  irq_forced_thread_fn+0x6a/0x80
 [  101.525401]  irq_thread+0xe8/0x180
 [  101.525403]  ? wake_threads_waitq+0x30/0x30
 [  101.525406]  ? irq_thread_check_affinity+0xd0/0xd0
 [  101.525408]  kthread+0x183/0x1a0
 [  101.525412]  ? kthread_park+0x80/0x80
 [  101.525415]  ret_from_fork+0x22/0x30

Fixes: 13b5b7fd6a4a ("igc: Add support for Tx/Rx rings")
Reported-by: Erez Geva <erez.geva.ext@siemens.com>
Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Tested-by: Dvora Fuxbrumer <dvorax.fuxbrumer@linux.intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/igc/igc_main.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
index 95323095094d..1d00a63eb935 100644
--- a/drivers/net/ethernet/intel/igc/igc_main.c
+++ b/drivers/net/ethernet/intel/igc/igc_main.c
@@ -232,6 +232,8 @@ static void igc_clean_tx_ring(struct igc_ring *tx_ring)
 				igc_unmap_tx_buffer(tx_ring->dev, tx_buffer);
 		}
 
+		tx_buffer->next_to_watch = NULL;
+
 		/* move us one more past the eop_desc for start of next pkt */
 		tx_buffer++;
 		i++;

From 7b292608db23ccbbfbfa50cdb155d01725d7a52e Mon Sep 17 00:00:00 2001
From: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Date: Thu, 13 May 2021 17:31:04 -0700
Subject: [PATCH 003/158] igb: Fix use-after-free error during reset

Cleans the next descriptor to watch (next_to_watch) when cleaning the
TX ring.

Failure to do so can cause invalid memory accesses. If igb_poll() runs
while the controller is being reset, this can lead to the driver trying
to free an skb that was already freed.

(The crash is harder to reproduce with the igb driver, but the same
potential problem exists as the code is identical to igc)

Fixes: 7cc6fd4c60f2 ("igb: Don't bother clearing Tx buffer_info in igb_clean_tx_ring")
Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Reported-by: Erez Geva <erez.geva.ext@siemens.com>
Tested-by: Tony Brelinski <tonyx.brelinski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/igb/igb_main.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 7e6435dc7e80..a61e2e5e95c0 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -4835,6 +4835,8 @@ static void igb_clean_tx_ring(struct igb_ring *tx_ring)
 					       DMA_TO_DEVICE);
 		}
 
+		tx_buffer->next_to_watch = NULL;
+
 		/* move us one more past the eop_desc for start of next pkt */
 		tx_buffer++;
 		i++;

From 05682a0a61b6cbecd97a0f37f743b2cbfd516977 Mon Sep 17 00:00:00 2001
From: Tom Rix <trix@redhat.com>
Date: Fri, 21 May 2021 12:50:19 -0700
Subject: [PATCH 004/158] igc: change default return of igc_read_phy_reg()

Static analysis reports this problem

igc_main.c:4944:20: warning: The left operand of '&'
  is a garbage value
    if (!(phy_data & SR_1000T_REMOTE_RX_STATUS) &&
          ~~~~~~~~ ^

phy_data is set by the call to igc_read_phy_reg() only if
there is a read_reg() op, else it is unset and a 0 is
returned.  Change the return to -EOPNOTSUPP.

Fixes: 208983f099d9 ("igc: Add watchdog")
Signed-off-by: Tom Rix <trix@redhat.com>
Tested-by: Dvora Fuxbrumer <dvorax.fuxbrumer@linux.intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/igc/igc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h
index 9e0bbb2e55e3..5901ed9fb545 100644
--- a/drivers/net/ethernet/intel/igc/igc.h
+++ b/drivers/net/ethernet/intel/igc/igc.h
@@ -578,7 +578,7 @@ static inline s32 igc_read_phy_reg(struct igc_hw *hw, u32 offset, u16 *data)
 	if (hw->phy.ops.read_reg)
 		return hw->phy.ops.read_reg(hw, offset, data);
 
-	return 0;
+	return -EOPNOTSUPP;
 }
 
 void igc_reinit_locked(struct igc_adapter *);

From dd2aefcd5e37989ae5f90afdae44bbbf3a2990da Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Sat, 12 Jun 2021 15:46:09 +0200
Subject: [PATCH 005/158] ixgbe: Fix an error handling path in 'ixgbe_probe()'

If an error occurs after a 'pci_enable_pcie_error_reporting()' call, it
must be undone by a corresponding 'pci_disable_pcie_error_reporting()'
call, as already done in the remove function.

Fixes: 6fabd715e6d8 ("ixgbe: Implement PCIe AER support")
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Tested-by: Tony Brelinski <tonyx.brelinski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index ffff69efd78a..913253f8ecb4 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -11067,6 +11067,7 @@ err_ioremap:
 	disable_dev = !test_and_set_bit(__IXGBE_DISABLED, &adapter->state);
 	free_netdev(netdev);
 err_alloc_etherdev:
+	pci_disable_pcie_error_reporting(pdev);
 	pci_release_mem_regions(pdev);
 err_pci_reg:
 err_dma:

From c6bc9e5ce5d37cb3e6b552f41b92a193db1806ab Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Sat, 12 Jun 2021 22:00:05 +0200
Subject: [PATCH 006/158] igc: Fix an error handling path in 'igc_probe()'

If an error occurs after a 'pci_enable_pcie_error_reporting()' call, it
must be undone by a corresponding 'pci_disable_pcie_error_reporting()'
call, as already done in the remove function.

Fixes: c9a11c23ceb6 ("igc: Add netdev")
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Tested-by: Dvora Fuxbrumer <dvorax.fuxbrumer@linux.intel.com>
Acked-by: Sasha Neftin <sasha.neftin@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/igc/igc_main.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
index 1d00a63eb935..e29aadbc6744 100644
--- a/drivers/net/ethernet/intel/igc/igc_main.c
+++ b/drivers/net/ethernet/intel/igc/igc_main.c
@@ -6056,6 +6056,7 @@ err_sw_init:
 err_ioremap:
 	free_netdev(netdev);
 err_alloc_etherdev:
+	pci_disable_pcie_error_reporting(pdev);
 	pci_release_mem_regions(pdev);
 err_pci_reg:
 err_dma:

From fea03b1cebd653cd095f2e9a58cfe1c85661c363 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Sat, 12 Jun 2021 22:08:33 +0200
Subject: [PATCH 007/158] igb: Fix an error handling path in 'igb_probe()'

If an error occurs after a 'pci_enable_pcie_error_reporting()' call, it
must be undone by a corresponding 'pci_disable_pcie_error_reporting()'
call, as already done in the remove function.

Fixes: 40a914fa72ab ("igb: Add support for pci-e Advanced Error Reporting")
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Tested-by: Tony Brelinski <tonyx.brelinski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/igb/igb_main.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index a61e2e5e95c0..abc239b736fb 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -3615,6 +3615,7 @@ err_sw_init:
 err_ioremap:
 	free_netdev(netdev);
 err_alloc_etherdev:
+	pci_disable_pcie_error_reporting(pdev);
 	pci_release_mem_regions(pdev);
 err_pci_reg:
 err_dma:

From e85e14d68f517ef12a5fb8123fff65526b35b6cd Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Wed, 16 Jun 2021 07:00:36 +0200
Subject: [PATCH 008/158] fm10k: Fix an error handling path in 'fm10k_probe()'

If an error occurs after a 'pci_enable_pcie_error_reporting()' call, it
must be undone by a corresponding 'pci_disable_pcie_error_reporting()'
call, as already done in the remove function.

Fixes: 19ae1b3fb99c ("fm10k: Add support for PCI power management and error handling")
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/fm10k/fm10k_pci.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
index dbcae92bb18d..adfa2768f024 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
@@ -2227,6 +2227,7 @@ err_sw_init:
 err_ioremap:
 	free_netdev(netdev);
 err_alloc_netdev:
+	pci_disable_pcie_error_reporting(pdev);
 	pci_release_mem_regions(pdev);
 err_pci_reg:
 err_dma:

From 4589075608420bc49fcef6e98279324bf2bb91ae Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Wed, 16 Jun 2021 07:05:53 +0200
Subject: [PATCH 009/158] e1000e: Fix an error handling path in 'e1000_probe()'

If an error occurs after a 'pci_enable_pcie_error_reporting()' call, it
must be undone by a corresponding 'pci_disable_pcie_error_reporting()'
call, as already done in the remove function.

Fixes: 111b9dc5c981 ("e1000e: add aer support")
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Acked-by: Sasha Neftin <sasha.neftin@intel.com>
Tested-by: Dvora Fuxbrumer <dvorax.fuxbrumer@linux.intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/e1000e/netdev.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
index d150dade06cf..757a54c39eef 100644
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
@@ -7664,6 +7664,7 @@ err_flashmap:
 err_ioremap:
 	free_netdev(netdev);
 err_alloc_etherdev:
+	pci_disable_pcie_error_reporting(pdev);
 	pci_release_mem_regions(pdev);
 err_pci_reg:
 err_dma:

From af30cbd2f4d6d66a9b6094e0aa32420bc8b20e08 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Wed, 16 Jun 2021 07:53:02 +0200
Subject: [PATCH 010/158] iavf: Fix an error handling path in 'iavf_probe()'

If an error occurs after a 'pci_enable_pcie_error_reporting()' call, it
must be undone by a corresponding 'pci_disable_pcie_error_reporting()'
call, as already done in the remove function.

Fixes: 5eae00c57f5e ("i40evf: main driver core")
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/iavf/iavf_main.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c
index e612c24fa384..44bafedd09f2 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_main.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_main.c
@@ -3798,6 +3798,7 @@ static int iavf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 err_ioremap:
 	free_netdev(netdev);
 err_alloc_etherdev:
+	pci_disable_pcie_error_reporting(pdev);
 	pci_release_regions(pdev);
 err_pci_reg:
 err_dma:

From 6c19d772618fea40d9681f259368f284a330fd90 Mon Sep 17 00:00:00 2001
From: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Date: Thu, 22 Apr 2021 10:19:23 +0000
Subject: [PATCH 011/158] igb: Check if num of q_vectors is smaller than max
 before array access

Ensure that the adapter->q_vector[MAX_Q_VECTORS] array isn't accessed
beyond its size. It was fixed by using a local variable num_q_vectors
as a limit for loop index, and ensure that num_q_vectors is not bigger
than MAX_Q_VECTORS.

Fixes: 047e0030f1e6 ("igb: add new data structure for handling interrupts and NAPI")
Signed-off-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Reviewed-by: Grzegorz Siwik <grzegorz.siwik@intel.com>
Reviewed-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Reviewed-by: Slawomir Laba <slawomirx.laba@intel.com>
Reviewed-by: Sylwester Dziedziuch <sylwesterx.dziedziuch@intel.com>
Reviewed-by: Mateusz Palczewski <mateusz.placzewski@intel.com>
Tested-by: Tony Brelinski <tonyx.brelinski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/igb/igb_main.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index abc239b736fb..9470ba891483 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -931,6 +931,7 @@ static void igb_configure_msix(struct igb_adapter *adapter)
  **/
 static int igb_request_msix(struct igb_adapter *adapter)
 {
+	unsigned int num_q_vectors = adapter->num_q_vectors;
 	struct net_device *netdev = adapter->netdev;
 	int i, err = 0, vector = 0, free_vector = 0;
 
@@ -939,7 +940,13 @@ static int igb_request_msix(struct igb_adapter *adapter)
 	if (err)
 		goto err_out;
 
-	for (i = 0; i < adapter->num_q_vectors; i++) {
+	if (num_q_vectors > MAX_Q_VECTORS) {
+		num_q_vectors = MAX_Q_VECTORS;
+		dev_warn(&adapter->pdev->dev,
+			 "The number of queue vectors (%d) is higher than max allowed (%d)\n",
+			 adapter->num_q_vectors, MAX_Q_VECTORS);
+	}
+	for (i = 0; i < num_q_vectors; i++) {
 		struct igb_q_vector *q_vector = adapter->q_vector[i];
 
 		vector++;

From 382a7c20d9253bcd5715789b8179528d0f3de72c Mon Sep 17 00:00:00 2001
From: Jedrzej Jagielski <jedrzej.jagielski@intel.com>
Date: Fri, 11 Jun 2021 22:42:17 +0000
Subject: [PATCH 012/158] igb: Fix position of assignment to *ring

Assignment to *ring should be done after correctness check of the
argument queue.

Fixes: 91db364236c8 ("igb: Refactor igb_configure_cbs()")
Signed-off-by: Jedrzej Jagielski <jedrzej.jagielski@intel.com>
Acked-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Tested-by: Tony Brelinski <tonyx.brelinski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/igb/igb_main.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 9470ba891483..171a7a629b20 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -1685,14 +1685,15 @@ static bool is_any_txtime_enabled(struct igb_adapter *adapter)
  **/
 static void igb_config_tx_modes(struct igb_adapter *adapter, int queue)
 {
-	struct igb_ring *ring = adapter->tx_ring[queue];
 	struct net_device *netdev = adapter->netdev;
 	struct e1000_hw *hw = &adapter->hw;
+	struct igb_ring *ring;
 	u32 tqavcc, tqavctrl;
 	u16 value;
 
 	WARN_ON(hw->mac.type != e1000_i210);
 	WARN_ON(queue < 0 || queue > 1);
+	ring = adapter->tx_ring[queue];
 
 	/* If any of the Qav features is enabled, configure queues as SR and
 	 * with HIGH PRIO. If none is, then configure them with LOW PRIO and

From a3609ac24c18947737f5bc1746b8735814c521d1 Mon Sep 17 00:00:00 2001
From: Oleksij Rempel <o.rempel@pengutronix.de>
Date: Tue, 29 Jun 2021 06:43:05 +0200
Subject: [PATCH 013/158] net: usb: asix: ax88772: suspend PHY on driver probe

After the probe/bind sequence the PHY is in active state, even if the
interface is stopped. As a result, on some systems like the Samsung
Exynos5250 SoC based Arndale board, the ASIX PHY will be able to
negotiate the link but fail to transmit the data.

To handle it, suspend the PHY on probe.

Fixes: e532a096be0e ("net: usb: asix: ax88772: add phylib support")
Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
Reported-by: Marek Szyprowski <m.szyprowski@samsung.com>
Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/usb/asix_devices.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/usb/asix_devices.c b/drivers/net/usb/asix_devices.c
index aec97b021a73..2c115216420a 100644
--- a/drivers/net/usb/asix_devices.c
+++ b/drivers/net/usb/asix_devices.c
@@ -701,6 +701,7 @@ static int ax88772_init_phy(struct usbnet *dev)
 		return ret;
 	}
 
+	phy_suspend(priv->phydev);
 	priv->phydev->mac_managed_pm = 1;
 
 	phy_attached_info(priv->phydev);

From b18114476a1432ad1db5d5605bc8cd131814d264 Mon Sep 17 00:00:00 2001
From: Baowen Zheng <baowen.zheng@corigine.com>
Date: Tue, 29 Jun 2021 09:22:11 +0200
Subject: [PATCH 014/158] openvswitch: Optimize operation for key comparison

In the current implementation, when comparing two flow keys, we return
the result only after comparing the whole key from start to end.

With this optimization, we return the result at the first non-zero
comparison, which improves the flow table lookup efficiency.

Signed-off-by: Baowen Zheng <baowen.zheng@corigine.com>
Signed-off-by: Louis Peens <louis.peens@corigine.com>
Signed-off-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/openvswitch/flow_table.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index c89c8da99f1a..d4a2db0b2299 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -670,13 +670,13 @@ static bool cmp_key(const struct sw_flow_key *key1,
 {
 	const long *cp1 = (const long *)((const u8 *)key1 + key_start);
 	const long *cp2 = (const long *)((const u8 *)key2 + key_start);
-	long diffs = 0;
 	int i;
 
 	for (i = key_start; i < key_end; i += sizeof(long))
-		diffs |= *cp1++ ^ *cp2++;
+		if (*cp1++ ^ *cp2++)
+			return false;
 
-	return diffs == 0;
+	return true;
 }
 
 static bool flow_cmp_masked_key(const struct sw_flow *flow,

From 996af62167d0e0ec69b938a3561e96f84ffff1aa Mon Sep 17 00:00:00 2001
From: Yang Yingliang <yangyingliang@huawei.com>
Date: Tue, 29 Jun 2021 15:22:37 +0800
Subject: [PATCH 015/158] net/802/mrp: fix memleak in mrp_request_join()

I got kmemleak report when doing fuzz test:

BUG: memory leak
unreferenced object 0xffff88810c239500 (size 64):
comm "syz-executor940", pid 882, jiffies 4294712870 (age 14.631s)
hex dump (first 32 bytes):
01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
00 00 00 00 00 00 00 00 01 00 00 00 01 02 00 04 ................
backtrace:
[<00000000a323afa4>] slab_alloc_node mm/slub.c:2972 [inline]
[<00000000a323afa4>] slab_alloc mm/slub.c:2980 [inline]
[<00000000a323afa4>] __kmalloc+0x167/0x340 mm/slub.c:4130
[<000000005034ca11>] kmalloc include/linux/slab.h:595 [inline]
[<000000005034ca11>] mrp_attr_create net/802/mrp.c:276 [inline]
[<000000005034ca11>] mrp_request_join+0x265/0x550 net/802/mrp.c:530
[<00000000fcfd81f3>] vlan_mvrp_request_join+0x145/0x170 net/8021q/vlan_mvrp.c:40
[<000000009258546e>] vlan_dev_open+0x477/0x890 net/8021q/vlan_dev.c:292
[<0000000059acd82b>] __dev_open+0x281/0x410 net/core/dev.c:1609
[<000000004e6dc695>] __dev_change_flags+0x424/0x560 net/core/dev.c:8767
[<00000000471a09af>] rtnl_configure_link+0xd9/0x210 net/core/rtnetlink.c:3122
[<0000000037a4672b>] __rtnl_newlink+0xe08/0x13e0 net/core/rtnetlink.c:3448
[<000000008d5d0fda>] rtnl_newlink+0x64/0xa0 net/core/rtnetlink.c:3488
[<000000004882fe39>] rtnetlink_rcv_msg+0x369/0xa10 net/core/rtnetlink.c:5552
[<00000000907e6c54>] netlink_rcv_skb+0x134/0x3d0 net/netlink/af_netlink.c:2504
[<00000000e7d7a8c4>] netlink_unicast_kernel net/netlink/af_netlink.c:1314 [inline]
[<00000000e7d7a8c4>] netlink_unicast+0x4a0/0x6a0 net/netlink/af_netlink.c:1340
[<00000000e0645d50>] netlink_sendmsg+0x78e/0xc90 net/netlink/af_netlink.c:1929
[<00000000c24559b7>] sock_sendmsg_nosec net/socket.c:654 [inline]
[<00000000c24559b7>] sock_sendmsg+0x139/0x170 net/socket.c:674
[<00000000fc210bc2>] ____sys_sendmsg+0x658/0x7d0 net/socket.c:2350
[<00000000be4577b5>] ___sys_sendmsg+0xf8/0x170 net/socket.c:2404

When mrp_request_leave() is called after mrp_request_join(), attr->state
is set to MRP_APPLICANT_VO, so mrp_attr_destroy() won't be called in the
last TX event in mrp_uninit_applicant() and the attr of the applicant
will be leaked. To fix this leak, iterate over and free each attr of the
applicant before returning from mrp_uninit_applicant().

Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/802/mrp.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/net/802/mrp.c b/net/802/mrp.c
index bea6e43d45a0..35e04cc5390c 100644
--- a/net/802/mrp.c
+++ b/net/802/mrp.c
@@ -292,6 +292,19 @@ static void mrp_attr_destroy(struct mrp_applicant *app, struct mrp_attr *attr)
 	kfree(attr);
 }
 
+static void mrp_attr_destroy_all(struct mrp_applicant *app)
+{
+	struct rb_node *node, *next;
+	struct mrp_attr *attr;
+
+	for (node = rb_first(&app->mad);
+	     next = node ? rb_next(node) : NULL, node != NULL;
+	     node = next) {
+		attr = rb_entry(node, struct mrp_attr, node);
+		mrp_attr_destroy(app, attr);
+	}
+}
+
 static int mrp_pdu_init(struct mrp_applicant *app)
 {
 	struct sk_buff *skb;
@@ -895,6 +908,7 @@ void mrp_uninit_applicant(struct net_device *dev, struct mrp_application *appl)
 
 	spin_lock_bh(&app->lock);
 	mrp_mad_event(app, MRP_EVENT_TX);
+	mrp_attr_destroy_all(app);
 	mrp_pdu_queue(app);
 	spin_unlock_bh(&app->lock);
 

From a34dcbfa1475f18a8f1b1dc3dedb76d746874e61 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Tue, 29 Jun 2021 11:19:44 +0300
Subject: [PATCH 016/158] sctp: prevent info leak in sctp_make_heartbeat()

The "hbinfo" struct has a 4 byte hole at the end so we have to zero it
out to prevent stack information from being disclosed.

Fixes: fe59379b9ab7 ("sctp: do the basic send and recv for PLPMTUD probe")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/sm_make_chunk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 6c08e5048d38..b8fa8f1a7277 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -1163,7 +1163,7 @@ struct sctp_chunk *sctp_make_heartbeat(const struct sctp_association *asoc,
 				       const struct sctp_transport *transport,
 				       __u32 probe_size)
 {
-	struct sctp_sender_hb_info hbinfo;
+	struct sctp_sender_hb_info hbinfo = {};
 	struct sctp_chunk *retval;
 
 	retval = sctp_make_control(asoc, SCTP_CID_HEARTBEAT, 0,

From 42ca63f980842918560b25f0244307fd83b4777c Mon Sep 17 00:00:00 2001
From: Yang Yingliang <yangyingliang@huawei.com>
Date: Tue, 29 Jun 2021 19:53:28 +0800
Subject: [PATCH 017/158] net/802/garp: fix memleak in garp_request_join()

I got kmemleak report when doing fuzz test:

BUG: memory leak
unreferenced object 0xffff88810c909b80 (size 64):
  comm "syz", pid 957, jiffies 4295220394 (age 399.090s)
  hex dump (first 32 bytes):
    01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
    00 00 00 00 00 00 00 00 08 00 00 00 01 02 00 04  ................
  backtrace:
    [<00000000ca1f2e2e>] garp_request_join+0x285/0x3d0
    [<00000000bf153351>] vlan_gvrp_request_join+0x15b/0x190
    [<0000000024005e72>] vlan_dev_open+0x706/0x980
    [<00000000dc20c4d4>] __dev_open+0x2bb/0x460
    [<0000000066573004>] __dev_change_flags+0x501/0x650
    [<0000000035b42f83>] rtnl_configure_link+0xee/0x280
    [<00000000a5e69de0>] __rtnl_newlink+0xed5/0x1550
    [<00000000a5258f4a>] rtnl_newlink+0x66/0x90
    [<00000000506568ee>] rtnetlink_rcv_msg+0x439/0xbd0
    [<00000000b7eaeae1>] netlink_rcv_skb+0x14d/0x420
    [<00000000c373ce66>] netlink_unicast+0x550/0x750
    [<00000000ec74ce74>] netlink_sendmsg+0x88b/0xda0
    [<00000000381ff246>] sock_sendmsg+0xc9/0x120
    [<000000008f6a2db3>] ____sys_sendmsg+0x6e8/0x820
    [<000000008d9c1735>] ___sys_sendmsg+0x145/0x1c0
    [<00000000aa39dd8b>] __sys_sendmsg+0xfe/0x1d0

When garp_request_leave() is called after garp_request_join(),
attr->state is set to GARP_APPLICANT_VO, so garp_attr_destroy() won't be
called in the last transmit event in garp_uninit_applicant() and the attr
of the applicant will be leaked. To fix this leak, iterate over and free
each attr of the applicant before returning from garp_uninit_applicant().

Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/802/garp.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/net/802/garp.c b/net/802/garp.c
index 400bd857e5f5..f6012f8e59f0 100644
--- a/net/802/garp.c
+++ b/net/802/garp.c
@@ -203,6 +203,19 @@ static void garp_attr_destroy(struct garp_applicant *app, struct garp_attr *attr
 	kfree(attr);
 }
 
+static void garp_attr_destroy_all(struct garp_applicant *app)
+{
+	struct rb_node *node, *next;
+	struct garp_attr *attr;
+
+	for (node = rb_first(&app->gid);
+	     next = node ? rb_next(node) : NULL, node != NULL;
+	     node = next) {
+		attr = rb_entry(node, struct garp_attr, node);
+		garp_attr_destroy(app, attr);
+	}
+}
+
 static int garp_pdu_init(struct garp_applicant *app)
 {
 	struct sk_buff *skb;
@@ -609,6 +622,7 @@ void garp_uninit_applicant(struct net_device *dev, struct garp_application *appl
 
 	spin_lock_bh(&app->lock);
 	garp_gid_event(app, GARP_EVENT_TRANSMIT_PDU);
+	garp_attr_destroy_all(app);
 	garp_pdu_queue(app);
 	spin_unlock_bh(&app->lock);
 

From 0dbffbb5335a1e3aa6855e4ee317e25e669dd302 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 29 Jun 2021 07:12:45 -0700
Subject: [PATCH 018/158] net: annotate data race around sk_ll_usec

sk_ll_usec is read locklessly from sk_can_busy_loop()
while another thread can change its value in sock_setsockopt()

This is correct but needs annotations.

BUG: KCSAN: data-race in __skb_try_recv_datagram / sock_setsockopt

write to 0xffff88814eb5f904 of 4 bytes by task 14011 on cpu 0:
 sock_setsockopt+0x1287/0x2090 net/core/sock.c:1175
 __sys_setsockopt+0x14f/0x200 net/socket.c:2100
 __do_sys_setsockopt net/socket.c:2115 [inline]
 __se_sys_setsockopt net/socket.c:2112 [inline]
 __x64_sys_setsockopt+0x62/0x70 net/socket.c:2112
 do_syscall_64+0x4a/0x90 arch/x86/entry/common.c:47
 entry_SYSCALL_64_after_hwframe+0x44/0xae

read to 0xffff88814eb5f904 of 4 bytes by task 14001 on cpu 1:
 sk_can_busy_loop include/net/busy_poll.h:41 [inline]
 __skb_try_recv_datagram+0x14f/0x320 net/core/datagram.c:273
 unix_dgram_recvmsg+0x14c/0x870 net/unix/af_unix.c:2101
 unix_seqpacket_recvmsg+0x5a/0x70 net/unix/af_unix.c:2067
 ____sys_recvmsg+0x15d/0x310 include/linux/uio.h:244
 ___sys_recvmsg net/socket.c:2598 [inline]
 do_recvmmsg+0x35c/0x9f0 net/socket.c:2692
 __sys_recvmmsg net/socket.c:2771 [inline]
 __do_sys_recvmmsg net/socket.c:2794 [inline]
 __se_sys_recvmmsg net/socket.c:2787 [inline]
 __x64_sys_recvmmsg+0xcf/0x150 net/socket.c:2787
 do_syscall_64+0x4a/0x90 arch/x86/entry/common.c:47
 entry_SYSCALL_64_after_hwframe+0x44/0xae

value changed: 0x00000000 -> 0x00000101

Reported by Kernel Concurrency Sanitizer on:
CPU: 1 PID: 14001 Comm: syz-executor.3 Not tainted 5.13.0-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/busy_poll.h | 2 +-
 net/core/sock.c         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
index 73af4a64a599..40296ed976a9 100644
--- a/include/net/busy_poll.h
+++ b/include/net/busy_poll.h
@@ -38,7 +38,7 @@ static inline bool net_busy_loop_on(void)
 
 static inline bool sk_can_busy_loop(const struct sock *sk)
 {
-	return sk->sk_ll_usec && !signal_pending(current);
+	return READ_ONCE(sk->sk_ll_usec) && !signal_pending(current);
 }
 
 bool sk_busy_loop_end(void *p, unsigned long start_time);
diff --git a/net/core/sock.c b/net/core/sock.c
index ba1c0f75cd45..dd9599656c40 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1201,7 +1201,7 @@ set_sndbuf:
 			if (val < 0)
 				ret = -EINVAL;
 			else
-				sk->sk_ll_usec = val;
+				WRITE_ONCE(sk->sk_ll_usec, val);
 		}
 		break;
 	case SO_PREFER_BUSY_POLL:

From 873a1e3d207ae587a7a1cc1d84545146b449ea5d Mon Sep 17 00:00:00 2001
From: Harman Kalra <hkalra@marvell.com>
Date: Tue, 29 Jun 2021 22:30:04 +0530
Subject: [PATCH 019/158] octeontx2-af: cn10k: Setting up lmtst map table

Introduce a new mailbox to support updating lmt entries
and a common lmt base address scheme, i.e. multiple pcifuncs
can share an lmt region to reduce L1 cache pressure for the
application. The parameters passed to the mailbox include the
primary pcifunc value whose lmt region will be shared by the
secondary pcifuncs. Here the secondary pcifunc is the one
calling the mailbox.
For example:
By default each pcifunc has its own LMT base address:
        PCIFUNC1    LMT_BASE_ADDR A
        PCIFUNC2    LMT_BASE_ADDR B
        PCIFUNC3    LMT_BASE_ADDR C
        PCIFUNC4    LMT_BASE_ADDR D
The application will choose PCIFUNC1 as the base/primary pcifunc,
and as and when other pcifuncs (secondary pcifuncs) get
probed, this mailbox will be called and the LMTST table will
be updated as:
        PCIFUNC1    LMT_BASE_ADDR A
        PCIFUNC2    LMT_BASE_ADDR A
        PCIFUNC3    LMT_BASE_ADDR A
        PCIFUNC4    LMT_BASE_ADDR A

On FLR, the lmtst map table is reset to the default lmt
base addresses for all secondary pcifuncs.

Signed-off-by: Harman Kalra <hkalra@marvell.com>
Signed-off-by: Geetha sowjanya <gakula@marvell.com>
Signed-off-by: Sunil Goutham <sgoutham@marvell.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/marvell/octeontx2/af/mbox.h  |   7 +
 .../net/ethernet/marvell/octeontx2/af/rvu.c   |   1 +
 .../net/ethernet/marvell/octeontx2/af/rvu.h   |   4 +
 .../ethernet/marvell/octeontx2/af/rvu_cn10k.c | 140 ++++++++++++++++++
 .../ethernet/marvell/octeontx2/af/rvu_reg.h   |   5 +
 .../marvell/octeontx2/af/rvu_struct.h         |   3 +-
 6 files changed, 159 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h
index 770d86262838..638db868125a 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h
@@ -134,6 +134,8 @@ M(MSIX_OFFSET,		0x005, msix_offset, msg_req, msix_offset_rsp)	\
 M(VF_FLR,		0x006, vf_flr, msg_req, msg_rsp)		\
 M(PTP_OP,		0x007, ptp_op, ptp_req, ptp_rsp)		\
 M(GET_HW_CAP,		0x008, get_hw_cap, msg_req, get_hw_cap_rsp)	\
+M(LMTST_TBL_SETUP,	0x00a, lmtst_tbl_setup, lmtst_tbl_setup_req,    \
+				msg_rsp)				\
 M(SET_VF_PERM,		0x00b, set_vf_perm, set_vf_perm, msg_rsp)	\
 /* CGX mbox IDs (range 0x200 - 0x3FF) */				\
 M(CGX_START_RXTX,	0x200, cgx_start_rxtx, msg_req, msg_rsp)	\
@@ -1278,6 +1280,11 @@ struct set_vf_perm  {
 	u64	flags;
 };
 
+struct lmtst_tbl_setup_req {
+	struct mbox_msghdr hdr;
+	u16 base_pcifunc;
+};
+
 /* CPT mailbox error codes
  * Range 901 - 1000.
  */
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c
index 0b092949d7ac..10cddf1ac7b9 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c
@@ -2333,6 +2333,7 @@ static void __rvu_flr_handler(struct rvu *rvu, u16 pcifunc)
 	rvu_blklf_teardown(rvu, pcifunc, BLKADDR_SSOW);
 	rvu_blklf_teardown(rvu, pcifunc, BLKADDR_SSO);
 	rvu_blklf_teardown(rvu, pcifunc, BLKADDR_NPA);
+	rvu_reset_lmt_map_tbl(rvu, pcifunc);
 	rvu_detach_rsrcs(rvu, NULL, pcifunc);
 	mutex_unlock(&rvu->flr_lock);
 }
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
index 9e5d9ba6f01e..3c0a7e981f72 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
@@ -243,6 +243,7 @@ struct rvu_pfvf {
 	u8	nix_blkaddr; /* BLKADDR_NIX0/1 assigned to this PF */
 	u8	nix_rx_intf; /* NIX0_RX/NIX1_RX interface to NPC */
 	u8	nix_tx_intf; /* NIX0_TX/NIX1_TX interface to NPC */
+	u64     lmt_base_addr; /* Preseving the pcifunc's lmtst base addr*/
 	unsigned long flags;
 };
 
@@ -754,6 +755,9 @@ int rvu_cpt_lf_teardown(struct rvu *rvu, u16 pcifunc, int lf, int slot);
 int rvu_set_channels_base(struct rvu *rvu);
 void rvu_program_channels(struct rvu *rvu);
 
+/* CN10K RVU - LMT*/
+void rvu_reset_lmt_map_tbl(struct rvu *rvu, u16 pcifunc);
+
 #ifdef CONFIG_DEBUG_FS
 void rvu_dbg_init(struct rvu *rvu);
 void rvu_dbg_exit(struct rvu *rvu);
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cn10k.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cn10k.c
index 7d9e71c6965f..87f56e1f32e3 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cn10k.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cn10k.c
@@ -10,6 +10,146 @@
 #include "cgx.h"
 #include "rvu_reg.h"
 
+/* RVU LMTST */
+#define LMT_TBL_OP_READ		0
+#define LMT_TBL_OP_WRITE	1
+#define LMT_MAP_TABLE_SIZE	(128 * 1024)
+#define LMT_MAPTBL_ENTRY_SIZE	16
+
+/* Function to perform operations (read/write) on lmtst map table */
+static int lmtst_map_table_ops(struct rvu *rvu, u32 index, u64 *val,
+			       int lmt_tbl_op)
+{
+	void __iomem *lmt_map_base;
+	u64 tbl_base;
+
+	tbl_base = rvu_read64(rvu, BLKADDR_APR, APR_AF_LMT_MAP_BASE);
+
+	lmt_map_base = ioremap_wc(tbl_base, LMT_MAP_TABLE_SIZE);
+	if (!lmt_map_base) {
+		dev_err(rvu->dev, "Failed to setup lmt map table mapping!!\n");
+		return -ENOMEM;
+	}
+
+	if (lmt_tbl_op == LMT_TBL_OP_READ) {
+		*val = readq(lmt_map_base + index);
+	} else {
+		writeq((*val), (lmt_map_base + index));
+		/* Flushing the AP interceptor cache to make APR_LMT_MAP_ENTRY_S
+		 * changes effective. Write 1 for flush and read is being used as a
+		 * barrier and sets up a data dependency. Write to 0 after a write
+		 * to 1 to complete the flush.
+		 */
+		rvu_write64(rvu, BLKADDR_APR, APR_AF_LMT_CTL, BIT_ULL(0));
+		rvu_read64(rvu, BLKADDR_APR, APR_AF_LMT_CTL);
+		rvu_write64(rvu, BLKADDR_APR, APR_AF_LMT_CTL, 0x00);
+	}
+
+	iounmap(lmt_map_base);
+	return 0;
+}
+
+static u32 rvu_get_lmtst_tbl_index(struct rvu *rvu, u16 pcifunc)
+{
+	return ((rvu_get_pf(pcifunc) * rvu->hw->total_vfs) +
+		(pcifunc & RVU_PFVF_FUNC_MASK)) * LMT_MAPTBL_ENTRY_SIZE;
+}
+
+int rvu_mbox_handler_lmtst_tbl_setup(struct rvu *rvu,
+				     struct lmtst_tbl_setup_req *req,
+				     struct msg_rsp *rsp)
+{
+	struct rvu_pfvf *pfvf = rvu_get_pfvf(rvu, req->hdr.pcifunc);
+	u32 pri_tbl_idx, sec_tbl_idx;
+	int err = 0;
+	u64 val;
+
+	/* Reconfiguring lmtst map table in lmt region shared mode i.e. make
+	 * multiple PF_FUNCs to share an LMTLINE region, so primary/base
+	 * pcifunc (which is passed as an argument to mailbox) is the one
+	 * whose lmt base address will be shared among other secondary
+	 * pcifunc (will be the one who is calling this mailbox).
+	 */
+	if (req->base_pcifunc) {
+		/* Calculating the LMT table index equivalent to primary
+		 * pcifunc.
+		 */
+		pri_tbl_idx = rvu_get_lmtst_tbl_index(rvu, req->base_pcifunc);
+
+		/* Truncating secondary pcifunc to calculate the LMT table index
+		 * equivalent to secondary pcifunc.
+		 */
+		sec_tbl_idx = rvu_get_lmtst_tbl_index(rvu, req->hdr.pcifunc);
+		/* Read the base lmt addr of the secondary pcifunc */
+		err = lmtst_map_table_ops(rvu, sec_tbl_idx, &val,
+					  LMT_TBL_OP_READ);
+		if (err) {
+			dev_err(rvu->dev,
+				"Failed to read LMT map table: index 0x%x err %d\n",
+				sec_tbl_idx, err);
+			goto error;
+		}
+
+		/* Storing the seondary's lmt base address as this needs to be
+		 * reverted in FLR. Also making sure this default value doesn't
+		 * get overwritten on multiple calls to this mailbox.
+		 */
+		if (!pfvf->lmt_base_addr)
+			pfvf->lmt_base_addr = val;
+
+		/* Read the base lmt addr of the primary pcifunc */
+		err = lmtst_map_table_ops(rvu, pri_tbl_idx, &val,
+					  LMT_TBL_OP_READ);
+		if (err) {
+			dev_err(rvu->dev,
+				"Failed to read LMT map table: index 0x%x err %d\n",
+				pri_tbl_idx, err);
+			goto error;
+		}
+
+		/* Update the base lmt addr of secondary with primary's base
+		 * lmt addr.
+		 */
+		err = lmtst_map_table_ops(rvu, sec_tbl_idx, &val,
+					  LMT_TBL_OP_WRITE);
+		if (err) {
+			dev_err(rvu->dev,
+				"Failed to update LMT map table: index 0x%x err %d\n",
+				sec_tbl_idx, err);
+			goto error;
+		}
+	}
+
+error:
+	return err;
+}
+
+/* Resetting the lmtst map table to original base addresses */
+void rvu_reset_lmt_map_tbl(struct rvu *rvu, u16 pcifunc)
+{
+	struct rvu_pfvf *pfvf = rvu_get_pfvf(rvu, pcifunc);
+	u32 tbl_idx;
+	int err;
+
+	if (is_rvu_otx2(rvu))
+		return;
+
+	if (pfvf->lmt_base_addr) {
+		/* This corresponds to lmt map table index */
+		tbl_idx = rvu_get_lmtst_tbl_index(rvu, pcifunc);
+		/* Reverting back original lmt base addr for respective
+		 * pcifunc.
+		 */
+		err = lmtst_map_table_ops(rvu, tbl_idx, &pfvf->lmt_base_addr,
+					  LMT_TBL_OP_WRITE);
+		if (err)
+			dev_err(rvu->dev,
+				"Failed to update LMT map table: index 0x%x err %d\n",
+				tbl_idx, err);
+		pfvf->lmt_base_addr = 0;
+	}
+}
+
 int rvu_set_channels_base(struct rvu *rvu)
 {
 	struct rvu_hwinfo *hw = rvu->hw;
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_reg.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu_reg.h
index 76837d5e19c6..61bafe956aae 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_reg.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_reg.h
@@ -692,4 +692,9 @@
 #define LBK_LINK_CFG_ID_MASK		GENMASK_ULL(11, 6)
 #define LBK_LINK_CFG_BASE_MASK		GENMASK_ULL(5, 0)
 
+/* APR */
+#define	APR_AF_LMT_CFG			(0x000ull)
+#define	APR_AF_LMT_MAP_BASE		(0x008ull)
+#define	APR_AF_LMT_CTL			(0x010ull)
+
 #endif /* RVU_REG_H */
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_struct.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu_struct.h
index 14aa8e37ea41..5bbe6727d11d 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_struct.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_struct.h
@@ -35,7 +35,8 @@ enum rvu_block_addr_e {
 	BLKADDR_NDC_NPA0	= 0xeULL,
 	BLKADDR_NDC_NIX1_RX	= 0x10ULL,
 	BLKADDR_NDC_NIX1_TX	= 0x11ULL,
-	BLK_COUNT		= 0x12ULL,
+	BLKADDR_APR		= 0x16ULL,
+	BLK_COUNT		= 0x17ULL,
 };
 
 /* RVU Block Type Enumeration */

From 893ae97214c385be02f8ec097298cc48c7f0d905 Mon Sep 17 00:00:00 2001
From: Geetha sowjanya <gakula@marvell.com>
Date: Tue, 29 Jun 2021 22:30:05 +0530
Subject: [PATCH 020/158] octeontx2-af: cn10k: Support configurable LMTST
 regions

This patch extends the lmtst_tbl_setup_req mbox to support run time
LMTST configuration.
RVU PF/VF and DPDK/ODP allocate an LMT region and create a translation
entry for a device via VFIO IOCTLs.
This IOVA is shared with AF through the above mbox. AF then uses the
RVU_SMMU translation widget to get the PA for the IOVA and updates
the LMT table entry for that device.

Signed-off-by: Geetha sowjanya <gakula@marvell.com>
Signed-off-by: Sunil Kovvuri Goutham <sgoutham@marvell.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/marvell/octeontx2/af/mbox.h  |   3 +
 .../ethernet/marvell/octeontx2/af/rvu_cn10k.c | 130 +++++++++++++-----
 .../ethernet/marvell/octeontx2/af/rvu_reg.h   |   5 +
 3 files changed, 103 insertions(+), 35 deletions(-)

diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h
index 638db868125a..9672cbf8a90a 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h
@@ -1283,6 +1283,9 @@ struct set_vf_perm  {
 struct lmtst_tbl_setup_req {
 	struct mbox_msghdr hdr;
 	u16 base_pcifunc;
+	u8  use_local_lmt_region;
+	u64 lmt_iova;
+	u64 rsvd[4];
 };
 
 /* CPT mailbox error codes
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cn10k.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cn10k.c
index 87f56e1f32e3..8d48b64485c6 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cn10k.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cn10k.c
@@ -55,14 +55,101 @@ static u32 rvu_get_lmtst_tbl_index(struct rvu *rvu, u16 pcifunc)
 		(pcifunc & RVU_PFVF_FUNC_MASK)) * LMT_MAPTBL_ENTRY_SIZE;
 }
 
+static int rvu_get_lmtaddr(struct rvu *rvu, u16 pcifunc,
+			   u64 iova, u64 *lmt_addr)
+{
+	u64 pa, val, pf;
+	int err;
+
+	if (!iova) {
+		dev_err(rvu->dev, "%s Requested Null address for transulation\n", __func__);
+		return -EINVAL;
+	}
+
+	rvu_write64(rvu, BLKADDR_RVUM, RVU_AF_SMMU_ADDR_REQ, iova);
+	pf = rvu_get_pf(pcifunc) & 0x1F;
+	val = BIT_ULL(63) | BIT_ULL(14) | BIT_ULL(13) | pf << 8 |
+	      ((pcifunc & RVU_PFVF_FUNC_MASK) & 0xFF);
+	rvu_write64(rvu, BLKADDR_RVUM, RVU_AF_SMMU_TXN_REQ, val);
+
+	err = rvu_poll_reg(rvu, BLKADDR_RVUM, RVU_AF_SMMU_ADDR_RSP_STS, BIT_ULL(0), false);
+	if (err) {
+		dev_err(rvu->dev, "%s LMTLINE iova transulation failed\n", __func__);
+		return err;
+	}
+	val = rvu_read64(rvu, BLKADDR_RVUM, RVU_AF_SMMU_ADDR_RSP_STS);
+	if (val & ~0x1ULL) {
+		dev_err(rvu->dev, "%s LMTLINE iova transulation failed err:%llx\n", __func__, val);
+		return -EIO;
+	}
+	/* PA[51:12] = RVU_AF_SMMU_TLN_FLIT1[60:21]
+	 * PA[11:0] = IOVA[11:0]
+	 */
+	pa = rvu_read64(rvu, BLKADDR_RVUM, RVU_AF_SMMU_TLN_FLIT1) >> 21;
+	pa &= GENMASK_ULL(39, 0);
+	*lmt_addr = (pa << 12) | (iova  & 0xFFF);
+
+	return 0;
+}
+
+static int rvu_update_lmtaddr(struct rvu *rvu, u16 pcifunc, u64 lmt_addr)
+{
+	struct rvu_pfvf *pfvf = rvu_get_pfvf(rvu, pcifunc);
+	u32 tbl_idx;
+	int err = 0;
+	u64 val;
+
+	/* Read the current lmt addr of pcifunc */
+	tbl_idx = rvu_get_lmtst_tbl_index(rvu, pcifunc);
+	err = lmtst_map_table_ops(rvu, tbl_idx, &val, LMT_TBL_OP_READ);
+	if (err) {
+		dev_err(rvu->dev,
+			"Failed to read LMT map table: index 0x%x err %d\n",
+			tbl_idx, err);
+		return err;
+	}
+
+	/* Storing the seondary's lmt base address as this needs to be
+	 * reverted in FLR. Also making sure this default value doesn't
+	 * get overwritten on multiple calls to this mailbox.
+	 */
+	if (!pfvf->lmt_base_addr)
+		pfvf->lmt_base_addr = val;
+
+	/* Update the LMT table with new addr */
+	err = lmtst_map_table_ops(rvu, tbl_idx, &lmt_addr, LMT_TBL_OP_WRITE);
+	if (err) {
+		dev_err(rvu->dev,
+			"Failed to update LMT map table: index 0x%x err %d\n",
+			tbl_idx, err);
+		return err;
+	}
+	return 0;
+}
+
 int rvu_mbox_handler_lmtst_tbl_setup(struct rvu *rvu,
 				     struct lmtst_tbl_setup_req *req,
 				     struct msg_rsp *rsp)
 {
-	struct rvu_pfvf *pfvf = rvu_get_pfvf(rvu, req->hdr.pcifunc);
-	u32 pri_tbl_idx, sec_tbl_idx;
+	u64 lmt_addr, val;
+	u32 pri_tbl_idx;
 	int err = 0;
-	u64 val;
+
+	/* Check if PF_FUNC wants to use it's own local memory as LMTLINE
+	 * region, if so, convert that IOVA to physical address and
+	 * populate LMT table with that address
+	 */
+	if (req->use_local_lmt_region) {
+		err = rvu_get_lmtaddr(rvu, req->hdr.pcifunc,
+				      req->lmt_iova, &lmt_addr);
+		if (err < 0)
+			return err;
+
+		/* Update the lmt addr for this PFFUNC in the LMT table */
+		err = rvu_update_lmtaddr(rvu, req->hdr.pcifunc, lmt_addr);
+		if (err)
+			return err;
+	}
 
 	/* Reconfiguring lmtst map table in lmt region shared mode i.e. make
 	 * multiple PF_FUNCs to share an LMTLINE region, so primary/base
@@ -76,27 +163,6 @@ int rvu_mbox_handler_lmtst_tbl_setup(struct rvu *rvu,
 		 */
 		pri_tbl_idx = rvu_get_lmtst_tbl_index(rvu, req->base_pcifunc);
 
-		/* Truncating secondary pcifunc to calculate the LMT table index
-		 * equivalent to secondary pcifunc.
-		 */
-		sec_tbl_idx = rvu_get_lmtst_tbl_index(rvu, req->hdr.pcifunc);
-		/* Read the base lmt addr of the secondary pcifunc */
-		err = lmtst_map_table_ops(rvu, sec_tbl_idx, &val,
-					  LMT_TBL_OP_READ);
-		if (err) {
-			dev_err(rvu->dev,
-				"Failed to read LMT map table: index 0x%x err %d\n",
-				sec_tbl_idx, err);
-			goto error;
-		}
-
-		/* Storing the seondary's lmt base address as this needs to be
-		 * reverted in FLR. Also making sure this default value doesn't
-		 * get overwritten on multiple calls to this mailbox.
-		 */
-		if (!pfvf->lmt_base_addr)
-			pfvf->lmt_base_addr = val;
-
 		/* Read the base lmt addr of the primary pcifunc */
 		err = lmtst_map_table_ops(rvu, pri_tbl_idx, &val,
 					  LMT_TBL_OP_READ);
@@ -104,24 +170,18 @@ int rvu_mbox_handler_lmtst_tbl_setup(struct rvu *rvu,
 			dev_err(rvu->dev,
 				"Failed to read LMT map table: index 0x%x err %d\n",
 				pri_tbl_idx, err);
-			goto error;
+			return err;
 		}
 
 		/* Update the base lmt addr of secondary with primary's base
 		 * lmt addr.
 		 */
-		err = lmtst_map_table_ops(rvu, sec_tbl_idx, &val,
-					  LMT_TBL_OP_WRITE);
-		if (err) {
-			dev_err(rvu->dev,
-				"Failed to update LMT map table: index 0x%x err %d\n",
-				sec_tbl_idx, err);
-			goto error;
-		}
+		err = rvu_update_lmtaddr(rvu, req->hdr.pcifunc, val);
+		if (err)
+			return err;
 	}
 
-error:
-	return err;
+	return 0;
 }
 
 /* Resetting the lmtst map table to original base addresses */
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_reg.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu_reg.h
index 61bafe956aae..8b01ef6e2c99 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_reg.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_reg.h
@@ -49,6 +49,11 @@
 #define RVU_AF_PFX_VF_BAR4_ADDR             (0x5400 | (a) << 4)
 #define RVU_AF_PFX_VF_BAR4_CFG              (0x5600 | (a) << 4)
 #define RVU_AF_PFX_LMTLINE_ADDR             (0x5800 | (a) << 4)
+#define RVU_AF_SMMU_ADDR_REQ		    (0x6000)
+#define RVU_AF_SMMU_TXN_REQ		    (0x6008)
+#define RVU_AF_SMMU_ADDR_RSP_STS	    (0x6010)
+#define RVU_AF_SMMU_ADDR_TLN		    (0x6018)
+#define RVU_AF_SMMU_TLN_FLIT1		    (0x6030)
 
 /* Admin function's privileged PF/VF registers */
 #define RVU_PRIV_CONST                      (0x8000000)

From 5c0512072f6517326d9fba083c4467f173ddd984 Mon Sep 17 00:00:00 2001
From: Geetha sowjanya <gakula@marvell.com>
Date: Tue, 29 Jun 2021 22:30:06 +0530
Subject: [PATCH 021/158] octeontx2-pf: cn10k: Use runtime allocated LMTLINE
 region

The current driver uses a static LMTST region allocated by firmware.
This memory gets populated as PF/VF BAR2. The RVU PF/VF driver ioremaps
the memory as device memory for NIX/NPA operation. Since the memory
is mapped as device memory, we see performance degradation. To address
this issue, this patch implements runtime memory allocation.
RVU PF/VF allocates memory during device probe and shares the base
address with RVU AF. RVU AF then configures the LMT MAP table
accordingly.

Signed-off-by: Geetha sowjanya <gakula@marvell.com>
Signed-off-by: Sunil Kovvuri Goutham <sgoutham@marvell.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../ethernet/marvell/octeontx2/nic/cn10k.c    | 93 ++++++++-----------
 .../ethernet/marvell/octeontx2/nic/cn10k.h    |  3 +-
 .../marvell/octeontx2/nic/otx2_common.h       |  7 +-
 .../ethernet/marvell/octeontx2/nic/otx2_pf.c  | 17 ++--
 .../marvell/octeontx2/nic/otx2_txrx.h         |  1 +
 .../ethernet/marvell/octeontx2/nic/otx2_vf.c  | 12 +--
 6 files changed, 57 insertions(+), 76 deletions(-)

diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.c b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.c
index 1b08896b46d2..184de9466286 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.c
@@ -22,69 +22,52 @@ static struct dev_hw_ops cn10k_hw_ops = {
 	.refill_pool_ptrs = cn10k_refill_pool_ptrs,
 };
 
-int cn10k_pf_lmtst_init(struct otx2_nic *pf)
+int cn10k_lmtst_init(struct otx2_nic *pfvf)
 {
-	int size, num_lines;
-	u64 base;
 
-	if (!test_bit(CN10K_LMTST, &pf->hw.cap_flag)) {
-		pf->hw_ops = &otx2_hw_ops;
+	struct lmtst_tbl_setup_req *req;
+	int qcount, err;
+
+	if (!test_bit(CN10K_LMTST, &pfvf->hw.cap_flag)) {
+		pfvf->hw_ops = &otx2_hw_ops;
 		return 0;
 	}
 
-	pf->hw_ops = &cn10k_hw_ops;
-	base = pci_resource_start(pf->pdev, PCI_MBOX_BAR_NUM) +
-		       (MBOX_SIZE * (pf->total_vfs + 1));
+	pfvf->hw_ops = &cn10k_hw_ops;
+	qcount = pfvf->hw.max_queues;
+	/* LMTST lines allocation
+	 * qcount = num_online_cpus();
+	 * NPA = TX + RX + XDP.
+	 * NIX = TX * 32 (For Burst SQE flush).
+	 */
+	pfvf->tot_lmt_lines = (qcount * 3) + (qcount * 32);
+	pfvf->npa_lmt_lines = qcount * 3;
+	pfvf->nix_lmt_size =  LMT_BURST_SIZE * LMT_LINE_SIZE;
 
-	size = pci_resource_len(pf->pdev, PCI_MBOX_BAR_NUM) -
-	       (MBOX_SIZE * (pf->total_vfs + 1));
-
-	pf->hw.lmt_base = ioremap(base, size);
-
-	if (!pf->hw.lmt_base) {
-		dev_err(pf->dev, "Unable to map PF LMTST region\n");
+	mutex_lock(&pfvf->mbox.lock);
+	req = otx2_mbox_alloc_msg_lmtst_tbl_setup(&pfvf->mbox);
+	if (!req) {
+		mutex_unlock(&pfvf->mbox.lock);
 		return -ENOMEM;
 	}
 
-	/* FIXME: Get the num of LMTST lines from LMT table */
-	pf->tot_lmt_lines = size / LMT_LINE_SIZE;
-	num_lines = (pf->tot_lmt_lines - NIX_LMTID_BASE) /
-			    pf->hw.tx_queues;
-	/* Number of LMT lines per SQ queues */
-	pf->nix_lmt_lines = num_lines > 32 ? 32 : num_lines;
+	req->use_local_lmt_region = true;
+
+	err = qmem_alloc(pfvf->dev, &pfvf->dync_lmt, pfvf->tot_lmt_lines,
+			 LMT_LINE_SIZE);
+	if (err) {
+		mutex_unlock(&pfvf->mbox.lock);
+		return err;
+	}
+	pfvf->hw.lmt_base = (u64 *)pfvf->dync_lmt->base;
+	req->lmt_iova = (u64)pfvf->dync_lmt->iova;
+
+	err = otx2_sync_mbox_msg(&pfvf->mbox);
+	mutex_unlock(&pfvf->mbox.lock);
 
-	pf->nix_lmt_size = pf->nix_lmt_lines * LMT_LINE_SIZE;
 	return 0;
 }
-
-int cn10k_vf_lmtst_init(struct otx2_nic *vf)
-{
-	int size, num_lines;
-
-	if (!test_bit(CN10K_LMTST, &vf->hw.cap_flag)) {
-		vf->hw_ops = &otx2_hw_ops;
-		return 0;
-	}
-
-	vf->hw_ops = &cn10k_hw_ops;
-	size = pci_resource_len(vf->pdev, PCI_MBOX_BAR_NUM);
-	vf->hw.lmt_base = ioremap_wc(pci_resource_start(vf->pdev,
-							PCI_MBOX_BAR_NUM),
-				     size);
-	if (!vf->hw.lmt_base) {
-		dev_err(vf->dev, "Unable to map VF LMTST region\n");
-		return -ENOMEM;
-	}
-
-	vf->tot_lmt_lines = size / LMT_LINE_SIZE;
-	/* LMTST lines per SQ */
-	num_lines = (vf->tot_lmt_lines - NIX_LMTID_BASE) /
-			    vf->hw.tx_queues;
-	vf->nix_lmt_lines = num_lines > 32 ? 32 : num_lines;
-	vf->nix_lmt_size = vf->nix_lmt_lines * LMT_LINE_SIZE;
-	return 0;
-}
-EXPORT_SYMBOL(cn10k_vf_lmtst_init);
+EXPORT_SYMBOL(cn10k_lmtst_init);
 
 int cn10k_sq_aq_init(void *dev, u16 qidx, u16 sqb_aura)
 {
@@ -93,9 +76,11 @@ int cn10k_sq_aq_init(void *dev, u16 qidx, u16 sqb_aura)
 	struct otx2_snd_queue *sq;
 
 	sq = &pfvf->qset.sq[qidx];
-	sq->lmt_addr = (__force u64 *)((u64)pfvf->hw.nix_lmt_base +
+	sq->lmt_addr = (u64 *)((u64)pfvf->hw.nix_lmt_base +
 			       (qidx * pfvf->nix_lmt_size));
 
+	sq->lmt_id = pfvf->npa_lmt_lines + (qidx * LMT_BURST_SIZE);
+
 	/* Get memory to put this msg */
 	aq = otx2_mbox_alloc_msg_nix_cn10k_aq_enq(&pfvf->mbox);
 	if (!aq)
@@ -158,15 +143,13 @@ void cn10k_refill_pool_ptrs(void *dev, struct otx2_cq_queue *cq)
 
 void cn10k_sqe_flush(void *dev, struct otx2_snd_queue *sq, int size, int qidx)
 {
-	struct otx2_nic *pfvf = dev;
-	int lmt_id = NIX_LMTID_BASE + (qidx * pfvf->nix_lmt_lines);
 	u64 val = 0, tar_addr = 0;
 
 	/* FIXME: val[0:10] LMT_ID.
 	 * [12:15] no of LMTST - 1 in the burst.
 	 * [19:63] data size of each LMTST in the burst except first.
 	 */
-	val = (lmt_id & 0x7FF);
+	val = (sq->lmt_id & 0x7FF);
 	/* Target address for LMTST flush tells HW how many 128bit
 	 * words are present.
 	 * tar_addr[6:4] size of first LMTST - 1 in units of 128b.
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.h b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.h
index 71292a4cf1f3..1a1ae334477d 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.h
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.h
@@ -12,8 +12,7 @@
 void cn10k_refill_pool_ptrs(void *dev, struct otx2_cq_queue *cq);
 void cn10k_sqe_flush(void *dev, struct otx2_snd_queue *sq, int size, int qidx);
 int cn10k_sq_aq_init(void *dev, u16 qidx, u16 sqb_aura);
-int cn10k_pf_lmtst_init(struct otx2_nic *pf);
-int cn10k_vf_lmtst_init(struct otx2_nic *vf);
+int cn10k_lmtst_init(struct otx2_nic *pfvf);
 int cn10k_free_all_ipolicers(struct otx2_nic *pfvf);
 int cn10k_alloc_matchall_ipolicer(struct otx2_nic *pfvf);
 int cn10k_free_matchall_ipolicer(struct otx2_nic *pfvf);
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
index 234b330f3183..20a9c69f020f 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
@@ -218,8 +218,8 @@ struct otx2_hw {
 	unsigned long		cap_flag;
 
 #define LMT_LINE_SIZE		128
-#define NIX_LMTID_BASE		72 /* RX + TX + XDP */
-	void __iomem		*lmt_base;
+#define LMT_BURST_SIZE		32 /* 32 LMTST lines for burst SQE flush */
+	u64			*lmt_base;
 	u64			*npa_lmt_base;
 	u64			*nix_lmt_base;
 };
@@ -363,8 +363,9 @@ struct otx2_nic {
 	/* Block address of NIX either BLKADDR_NIX0 or BLKADDR_NIX1 */
 	int			nix_blkaddr;
 	/* LMTST Lines info */
+	struct qmem		*dync_lmt;
 	u16			tot_lmt_lines;
-	u16			nix_lmt_lines;
+	u16			npa_lmt_lines;
 	u32			nix_lmt_size;
 
 	struct otx2_ptp		*ptp;
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
index 59912f73417b..088c28df849d 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
@@ -1533,10 +1533,10 @@ int otx2_open(struct net_device *netdev)
 
 	if (test_bit(CN10K_LMTST, &pf->hw.cap_flag)) {
 		/* Reserve LMT lines for NPA AURA batch free */
-		pf->hw.npa_lmt_base = (__force u64 *)pf->hw.lmt_base;
+		pf->hw.npa_lmt_base = pf->hw.lmt_base;
 		/* Reserve LMT lines for NIX TX */
-		pf->hw.nix_lmt_base = (__force u64 *)((u64)pf->hw.npa_lmt_base +
-				      (NIX_LMTID_BASE * LMT_LINE_SIZE));
+		pf->hw.nix_lmt_base = (u64 *)((u64)pf->hw.npa_lmt_base +
+				      (pf->npa_lmt_lines * LMT_LINE_SIZE));
 	}
 
 	err = otx2_init_hw_resources(pf);
@@ -2526,7 +2526,7 @@ static int otx2_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (err)
 		goto err_detach_rsrc;
 
-	err = cn10k_pf_lmtst_init(pf);
+	err = cn10k_lmtst_init(pf);
 	if (err)
 		goto err_detach_rsrc;
 
@@ -2630,8 +2630,8 @@ err_del_mcam_entries:
 err_ptp_destroy:
 	otx2_ptp_destroy(pf);
 err_detach_rsrc:
-	if (hw->lmt_base)
-		iounmap(hw->lmt_base);
+	if (test_bit(CN10K_LMTST, &pf->hw.cap_flag))
+		qmem_free(pf->dev, pf->dync_lmt);
 	otx2_detach_resources(&pf->mbox);
 err_disable_mbox_intr:
 	otx2_disable_mbox_intr(pf);
@@ -2772,9 +2772,8 @@ static void otx2_remove(struct pci_dev *pdev)
 	otx2_mcam_flow_del(pf);
 	otx2_shutdown_tc(pf);
 	otx2_detach_resources(&pf->mbox);
-	if (pf->hw.lmt_base)
-		iounmap(pf->hw.lmt_base);
-
+	if (test_bit(CN10K_LMTST, &pf->hw.cap_flag))
+		qmem_free(pf->dev, pf->dync_lmt);
 	otx2_disable_mbox_intr(pf);
 	otx2_pfaf_mbox_destroy(pf);
 	pci_free_irq_vectors(pf->pdev);
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h
index 52486c1f0973..2f144e2cf436 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h
@@ -83,6 +83,7 @@ struct otx2_snd_queue {
 	u16			num_sqbs;
 	u16			sqe_thresh;
 	u8			sqe_per_sqb;
+	u32			lmt_id;
 	u64			 io_addr;
 	u64			*aura_fc_addr;
 	u64			*lmt_addr;
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c
index 13a908f75ba0..a8bee5aefec1 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c
@@ -609,7 +609,7 @@ static int otx2vf_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (err)
 		goto err_detach_rsrc;
 
-	err = cn10k_vf_lmtst_init(vf);
+	err = cn10k_lmtst_init(vf);
 	if (err)
 		goto err_detach_rsrc;
 
@@ -667,8 +667,8 @@ static int otx2vf_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 err_unreg_netdev:
 	unregister_netdev(netdev);
 err_detach_rsrc:
-	if (hw->lmt_base)
-		iounmap(hw->lmt_base);
+	if (test_bit(CN10K_LMTST, &vf->hw.cap_flag))
+		qmem_free(vf->dev, vf->dync_lmt);
 	otx2_detach_resources(&vf->mbox);
 err_disable_mbox_intr:
 	otx2vf_disable_mbox_intr(vf);
@@ -700,10 +700,8 @@ static void otx2vf_remove(struct pci_dev *pdev)
 		destroy_workqueue(vf->otx2_wq);
 	otx2vf_disable_mbox_intr(vf);
 	otx2_detach_resources(&vf->mbox);
-
-	if (vf->hw.lmt_base)
-		iounmap(vf->hw.lmt_base);
-
+	if (test_bit(CN10K_LMTST, &vf->hw.cap_flag))
+		qmem_free(vf->dev, vf->dync_lmt);
 	otx2vf_vfaf_mbox_destroy(vf);
 	pci_free_irq_vectors(vf->pdev);
 	pci_set_drvdata(pdev, NULL);

From b71d09871566a20ae8a1064e50f1e94813b18482 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Tue, 29 Jun 2021 23:32:15 +0300
Subject: [PATCH 022/158] net: dsa: return -EOPNOTSUPP when driver does not
 implement .port_lag_join

The DSA core has a layered structure, and even though we end up
returning 0 (success) to user space when setting a bonding/team upper
that can't be offloaded, some parts of the framework actually need to
know that we couldn't offload that.

For example, if dsa_switch_lag_join returns 0 as it currently does,
dsa_port_lag_join has no way to tell a successful offload from a
software fallback, and it will call dsa_port_bridge_join afterwards.
Then we'll think we're offloading the bridge master of the LAG, when in
fact we're not even offloading the LAG. In turn, this will make us set
skb->offload_fwd_mark = true, which is incorrect and the bridge doesn't
like it.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/switch.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/dsa/switch.c b/net/dsa/switch.c
index af71b8638098..248455145982 100644
--- a/net/dsa/switch.c
+++ b/net/dsa/switch.c
@@ -427,7 +427,7 @@ static int dsa_switch_lag_join(struct dsa_switch *ds,
 						   info->port, info->lag,
 						   info->info);
 
-	return 0;
+	return -EOPNOTSUPP;
 }
 
 static int dsa_switch_lag_leave(struct dsa_switch *ds,
@@ -440,7 +440,7 @@ static int dsa_switch_lag_leave(struct dsa_switch *ds,
 		return ds->ops->crosschip_lag_leave(ds, info->sw_index,
 						    info->port, info->lag);
 
-	return 0;
+	return -EOPNOTSUPP;
 }
 
 static int dsa_switch_mdb_add(struct dsa_switch *ds,

From 5a3c680aa2c12c90c44af383fe6882a39875ab81 Mon Sep 17 00:00:00 2001
From: Doug Berger <opendmb@gmail.com>
Date: Tue, 29 Jun 2021 17:14:19 -0700
Subject: [PATCH 023/158] net: bcmgenet: ensure EXT_ENERGY_DET_MASK is clear

Setting the EXT_ENERGY_DET_MASK bit allows the port energy detection
logic of the internal PHY to prevent the system from sleeping. Some
internal PHYs will report that energy is detected when the network
interface is closed which can prevent the system from going to sleep
if WoL is enabled when the interface is brought down.

Since the driver does not support waking the system on this logic,
this commit clears the bit whenever the internal PHY is powered up
and the other logic for manipulating the bit is removed since it
serves no useful function.

Fixes: 1c1008c793fa ("net: bcmgenet: add main driver file")
Signed-off-by: Doug Berger <opendmb@gmail.com>
Acked-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/genet/bcmgenet.c  | 17 ++---------------
 .../net/ethernet/broadcom/genet/bcmgenet_wol.c  |  6 ------
 2 files changed, 2 insertions(+), 21 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
index 41f7f078cd27..35e9956e930c 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
@@ -1640,7 +1640,8 @@ static void bcmgenet_power_up(struct bcmgenet_priv *priv,
 
 	switch (mode) {
 	case GENET_POWER_PASSIVE:
-		reg &= ~(EXT_PWR_DOWN_DLL | EXT_PWR_DOWN_BIAS);
+		reg &= ~(EXT_PWR_DOWN_DLL | EXT_PWR_DOWN_BIAS |
+			 EXT_ENERGY_DET_MASK);
 		if (GENET_IS_V5(priv)) {
 			reg &= ~(EXT_PWR_DOWN_PHY_EN |
 				 EXT_PWR_DOWN_PHY_RD |
@@ -3292,7 +3293,6 @@ static int bcmgenet_open(struct net_device *dev)
 {
 	struct bcmgenet_priv *priv = netdev_priv(dev);
 	unsigned long dma_ctrl;
-	u32 reg;
 	int ret;
 
 	netif_dbg(priv, ifup, dev, "bcmgenet_open\n");
@@ -3318,12 +3318,6 @@ static int bcmgenet_open(struct net_device *dev)
 
 	bcmgenet_set_hw_addr(priv, dev->dev_addr);
 
-	if (priv->internal_phy) {
-		reg = bcmgenet_ext_readl(priv, EXT_EXT_PWR_MGMT);
-		reg |= EXT_ENERGY_DET_MASK;
-		bcmgenet_ext_writel(priv, reg, EXT_EXT_PWR_MGMT);
-	}
-
 	/* Disable RX/TX DMA and flush TX queues */
 	dma_ctrl = bcmgenet_dma_disable(priv);
 
@@ -4139,7 +4133,6 @@ static int bcmgenet_resume(struct device *d)
 	struct bcmgenet_priv *priv = netdev_priv(dev);
 	struct bcmgenet_rxnfc_rule *rule;
 	unsigned long dma_ctrl;
-	u32 reg;
 	int ret;
 
 	if (!netif_running(dev))
@@ -4176,12 +4169,6 @@ static int bcmgenet_resume(struct device *d)
 		if (rule->state != BCMGENET_RXNFC_STATE_UNUSED)
 			bcmgenet_hfb_create_rxnfc_filter(priv, rule);
 
-	if (priv->internal_phy) {
-		reg = bcmgenet_ext_readl(priv, EXT_EXT_PWR_MGMT);
-		reg |= EXT_ENERGY_DET_MASK;
-		bcmgenet_ext_writel(priv, reg, EXT_EXT_PWR_MGMT);
-	}
-
 	/* Disable RX/TX DMA and flush TX queues */
 	dma_ctrl = bcmgenet_dma_disable(priv);
 
diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c b/drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c
index facde824bcaa..e31a5a397f11 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c
@@ -186,12 +186,6 @@ int bcmgenet_wol_power_down_cfg(struct bcmgenet_priv *priv,
 	reg |= CMD_RX_EN;
 	bcmgenet_umac_writel(priv, reg, UMAC_CMD);
 
-	if (priv->hw_params->flags & GENET_HAS_EXT) {
-		reg = bcmgenet_ext_readl(priv, EXT_EXT_PWR_MGMT);
-		reg &= ~EXT_ENERGY_DET_MASK;
-		bcmgenet_ext_writel(priv, reg, EXT_EXT_PWR_MGMT);
-	}
-
 	reg = UMAC_IRQ_MPD_R;
 	if (hfb_enable)
 		reg |=  UMAC_IRQ_HFB_SM | UMAC_IRQ_HFB_MM;

From 650b2a846dddf33321489597287e6d825ec1d0ba Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Tue, 29 Jun 2021 23:30:36 -0400
Subject: [PATCH 024/158] sctp: check pl.raise_count separately from its
 increment

As Marcelo suggested, this makes the code clearer to read.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/transport.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 5f23804f21c7..397a6244dd97 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -335,10 +335,13 @@ void sctp_transport_pl_recv(struct sctp_transport *t)
 			t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t);
 			sctp_assoc_sync_pmtu(t->asoc);
 		}
-	} else if (t->pl.state == SCTP_PL_COMPLETE && ++t->pl.raise_count == 30) {
-		/* Raise probe_size again after 30 * interval in Search Complete */
-		t->pl.state = SCTP_PL_SEARCH; /* Search Complete -> Search */
-		t->pl.probe_size += SCTP_PL_MIN_STEP;
+	} else if (t->pl.state == SCTP_PL_COMPLETE) {
+		t->pl.raise_count++;
+		if (t->pl.raise_count == 30) {
+			/* Raise probe_size again after 30 * interval in Search Complete */
+			t->pl.state = SCTP_PL_SEARCH; /* Search Complete -> Search */
+			t->pl.probe_size += SCTP_PL_MIN_STEP;
+		}
 	}
 }
 

From 1d11fa231cabeae09a95cb3e4cf1d9dd34e00f08 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Tue, 29 Jun 2021 23:34:08 -0400
Subject: [PATCH 025/158] sctp: move 198 addresses from unusable to private
 scope
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The draft draft-stewart-tsvwg-sctp-ipv4-00, which restricts 198 addresses,
was never published. As private addresses, these should be allowed for
use in SCTP.

As Michael Tuexen suggested, this patch moves 198 addresses from the
unusable scope to the private scope.

Reported-by: Sérgio <surkamp@gmail.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/constants.h | 4 +---
 net/sctp/protocol.c          | 3 ++-
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
index 265fffa33dad..5859e0a16a58 100644
--- a/include/net/sctp/constants.h
+++ b/include/net/sctp/constants.h
@@ -360,8 +360,7 @@ enum {
 #define SCTP_SCOPE_POLICY_MAX	SCTP_SCOPE_POLICY_LINK
 
 /* Based on IPv4 scoping <draft-stewart-tsvwg-sctp-ipv4-00.txt>,
- * SCTP IPv4 unusable addresses: 0.0.0.0/8, 224.0.0.0/4, 198.18.0.0/24,
- * 192.88.99.0/24.
+ * SCTP IPv4 unusable addresses: 0.0.0.0/8, 224.0.0.0/4, 192.88.99.0/24.
  * Also, RFC 8.4, non-unicast addresses are not considered valid SCTP
  * addresses.
  */
@@ -369,7 +368,6 @@ enum {
 	((htonl(INADDR_BROADCAST) == a) ||  \
 	 ipv4_is_multicast(a) ||	    \
 	 ipv4_is_zeronet(a) ||		    \
-	 ipv4_is_test_198(a) ||		    \
 	 ipv4_is_anycast_6to4(a))
 
 /* Flags used for the bind address copy functions.  */
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 3c1fbf38f4f7..ec0f52567c16 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -398,7 +398,8 @@ static enum sctp_scope sctp_v4_scope(union sctp_addr *addr)
 		retval = SCTP_SCOPE_LINK;
 	} else if (ipv4_is_private_10(addr->v4.sin_addr.s_addr) ||
 		   ipv4_is_private_172(addr->v4.sin_addr.s_addr) ||
-		   ipv4_is_private_192(addr->v4.sin_addr.s_addr)) {
+		   ipv4_is_private_192(addr->v4.sin_addr.s_addr) ||
+		   ipv4_is_test_198(addr->v4.sin_addr.s_addr)) {
 		retval = SCTP_SCOPE_PRIVATE;
 	} else {
 		retval = SCTP_SCOPE_GLOBAL;

From 7da467d82d1ed4fb317aff836f99709169e73f10 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Beh=C3=BAn?= <kabel@kernel.org>
Date: Thu, 1 Jul 2021 00:22:26 +0200
Subject: [PATCH 026/158] net: dsa: mv88e6xxx: enable .port_set_policy() on
 Topaz
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit f3a2cd326e44 ("net: dsa: mv88e6xxx: introduce .port_set_policy")
introduced .port_set_policy() method with implementation for several
models, but forgot to add Topaz, which can use the 6352 implementation.

Use the 6352 implementation of .port_set_policy() on Topaz.

Signed-off-by: Marek Behún <kabel@kernel.org>
Fixes: f3a2cd326e44 ("net: dsa: mv88e6xxx: introduce .port_set_policy")
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/mv88e6xxx/chip.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 961fa6b75cad..6bcee3e012d4 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -3583,6 +3583,7 @@ static const struct mv88e6xxx_ops mv88e6141_ops = {
 	.port_set_speed_duplex = mv88e6341_port_set_speed_duplex,
 	.port_max_speed_mode = mv88e6341_port_max_speed_mode,
 	.port_tag_remap = mv88e6095_port_tag_remap,
+	.port_set_policy = mv88e6352_port_set_policy,
 	.port_set_frame_mode = mv88e6351_port_set_frame_mode,
 	.port_set_ucast_flood = mv88e6352_port_set_ucast_flood,
 	.port_set_mcast_flood = mv88e6352_port_set_mcast_flood,
@@ -4383,6 +4384,7 @@ static const struct mv88e6xxx_ops mv88e6341_ops = {
 	.port_set_speed_duplex = mv88e6341_port_set_speed_duplex,
 	.port_max_speed_mode = mv88e6341_port_max_speed_mode,
 	.port_tag_remap = mv88e6095_port_tag_remap,
+	.port_set_policy = mv88e6352_port_set_policy,
 	.port_set_frame_mode = mv88e6351_port_set_frame_mode,
 	.port_set_ucast_flood = mv88e6352_port_set_ucast_flood,
 	.port_set_mcast_flood = mv88e6352_port_set_mcast_flood,

From 11527f3c4725640e6c40a2b7654e303f45e82a6c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Beh=C3=BAn?= <kabel@kernel.org>
Date: Thu, 1 Jul 2021 00:22:27 +0200
Subject: [PATCH 027/158] net: dsa: mv88e6xxx: use correct
 .stats_set_histogram() on Topaz
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit 40cff8fca9e3 ("net: dsa: mv88e6xxx: Fix stats histogram mode")
introduced the wrong .stats_set_histogram() method for the Topaz family.

The Peridot method should be used instead.

Signed-off-by: Marek Behún <kabel@kernel.org>
Fixes: 40cff8fca9e3 ("net: dsa: mv88e6xxx: Fix stats histogram mode")
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/mv88e6xxx/chip.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 6bcee3e012d4..b125d3227dbd 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -3597,7 +3597,7 @@ static const struct mv88e6xxx_ops mv88e6141_ops = {
 	.port_set_cmode = mv88e6341_port_set_cmode,
 	.port_setup_message_port = mv88e6xxx_setup_message_port,
 	.stats_snapshot = mv88e6390_g1_stats_snapshot,
-	.stats_set_histogram = mv88e6095_g1_stats_set_histogram,
+	.stats_set_histogram = mv88e6390_g1_stats_set_histogram,
 	.stats_get_sset_count = mv88e6320_stats_get_sset_count,
 	.stats_get_strings = mv88e6320_stats_get_strings,
 	.stats_get_stats = mv88e6390_stats_get_stats,
@@ -4398,7 +4398,7 @@ static const struct mv88e6xxx_ops mv88e6341_ops = {
 	.port_set_cmode = mv88e6341_port_set_cmode,
 	.port_setup_message_port = mv88e6xxx_setup_message_port,
 	.stats_snapshot = mv88e6390_g1_stats_snapshot,
-	.stats_set_histogram = mv88e6095_g1_stats_set_histogram,
+	.stats_set_histogram = mv88e6390_g1_stats_set_histogram,
 	.stats_get_sset_count = mv88e6320_stats_get_sset_count,
 	.stats_get_strings = mv88e6320_stats_get_strings,
 	.stats_get_stats = mv88e6390_stats_get_stats,

From 3709488790022c85720f991bff50d48ed5a36e6a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Beh=C3=BAn?= <kabel@kernel.org>
Date: Thu, 1 Jul 2021 00:22:28 +0200
Subject: [PATCH 028/158] net: dsa: mv88e6xxx: enable .rmu_disable() on Topaz
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit 9e5baf9b36367 ("net: dsa: mv88e6xxx: add RMU disable op")
introduced .rmu_disable() method with implementation for several models,
but forgot to add Topaz, which can use the Peridot implementation.

Use the Peridot implementation of .rmu_disable() on Topaz.

Signed-off-by: Marek Behún <kabel@kernel.org>
Fixes: 9e5baf9b36367 ("net: dsa: mv88e6xxx: add RMU disable op")
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/mv88e6xxx/chip.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index b125d3227dbd..d4b05c10e5f2 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -3607,6 +3607,7 @@ static const struct mv88e6xxx_ops mv88e6141_ops = {
 	.mgmt_rsvd2cpu =  mv88e6390_g1_mgmt_rsvd2cpu,
 	.pot_clear = mv88e6xxx_g2_pot_clear,
 	.reset = mv88e6352_g1_reset,
+	.rmu_disable = mv88e6390_g1_rmu_disable,
 	.vtu_getnext = mv88e6352_g1_vtu_getnext,
 	.vtu_loadpurge = mv88e6352_g1_vtu_loadpurge,
 	.serdes_power = mv88e6390_serdes_power,
@@ -4408,6 +4409,7 @@ static const struct mv88e6xxx_ops mv88e6341_ops = {
 	.mgmt_rsvd2cpu =  mv88e6390_g1_mgmt_rsvd2cpu,
 	.pot_clear = mv88e6xxx_g2_pot_clear,
 	.reset = mv88e6352_g1_reset,
+	.rmu_disable = mv88e6390_g1_rmu_disable,
 	.vtu_getnext = mv88e6352_g1_vtu_getnext,
 	.vtu_loadpurge = mv88e6352_g1_vtu_loadpurge,
 	.serdes_power = mv88e6390_serdes_power,

From c07fff3492acae41cedbabea395b644dd5872b8c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Beh=C3=BAn?= <kabel@kernel.org>
Date: Thu, 1 Jul 2021 00:22:29 +0200
Subject: [PATCH 029/158] net: dsa: mv88e6xxx: enable devlink ATU hash param
 for Topaz
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit 23e8b470c7788 ("net: dsa: mv88e6xxx: Add devlink param for ATU
hash algorithm.") introduced ATU hash algorithm access via devlink, but
did not enable it for Topaz.

Enable this feature also for Topaz.

Signed-off-by: Marek Behún <kabel@kernel.org>
Fixes: 23e8b470c7788 ("net: dsa: mv88e6xxx: Add devlink param for ATU hash algorithm.")
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/mv88e6xxx/chip.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index d4b05c10e5f2..354ff0b84b7f 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -3608,6 +3608,8 @@ static const struct mv88e6xxx_ops mv88e6141_ops = {
 	.pot_clear = mv88e6xxx_g2_pot_clear,
 	.reset = mv88e6352_g1_reset,
 	.rmu_disable = mv88e6390_g1_rmu_disable,
+	.atu_get_hash = mv88e6165_g1_atu_get_hash,
+	.atu_set_hash = mv88e6165_g1_atu_set_hash,
 	.vtu_getnext = mv88e6352_g1_vtu_getnext,
 	.vtu_loadpurge = mv88e6352_g1_vtu_loadpurge,
 	.serdes_power = mv88e6390_serdes_power,
@@ -4410,6 +4412,8 @@ static const struct mv88e6xxx_ops mv88e6341_ops = {
 	.pot_clear = mv88e6xxx_g2_pot_clear,
 	.reset = mv88e6352_g1_reset,
 	.rmu_disable = mv88e6390_g1_rmu_disable,
+	.atu_get_hash = mv88e6165_g1_atu_get_hash,
+	.atu_set_hash = mv88e6165_g1_atu_set_hash,
 	.vtu_getnext = mv88e6352_g1_vtu_getnext,
 	.vtu_loadpurge = mv88e6352_g1_vtu_loadpurge,
 	.serdes_power = mv88e6390_serdes_power,

From a03b98d68367b18e5db6d6850e2cc18754fba94a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Beh=C3=BAn?= <kabel@kernel.org>
Date: Thu, 1 Jul 2021 00:22:30 +0200
Subject: [PATCH 030/158] net: dsa: mv88e6xxx: enable SerDes RX stats for Topaz
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit 0df952873636a ("mv88e6xxx: Add serdes Rx statistics") added
support for RX statistics on SerDes ports for Peridot.

This same implementation is also valid for Topaz, but was not enabled
at the time.

We need to use the generic .serdes_get_lane() method instead of the
Peridot specific one in the stats methods so that on Topaz the proper
one is used.

Signed-off-by: Marek Behún <kabel@kernel.org>
Fixes: 0df952873636a ("mv88e6xxx: Add serdes Rx statistics")
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/mv88e6xxx/chip.c   | 6 ++++++
 drivers/net/dsa/mv88e6xxx/serdes.c | 6 +++---
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 354ff0b84b7f..1e95a0facbd4 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -3623,6 +3623,9 @@ static const struct mv88e6xxx_ops mv88e6141_ops = {
 	.serdes_irq_enable = mv88e6390_serdes_irq_enable,
 	.serdes_irq_status = mv88e6390_serdes_irq_status,
 	.gpio_ops = &mv88e6352_gpio_ops,
+	.serdes_get_sset_count = mv88e6390_serdes_get_sset_count,
+	.serdes_get_strings = mv88e6390_serdes_get_strings,
+	.serdes_get_stats = mv88e6390_serdes_get_stats,
 	.phylink_validate = mv88e6341_phylink_validate,
 };
 
@@ -4429,6 +4432,9 @@ static const struct mv88e6xxx_ops mv88e6341_ops = {
 	.gpio_ops = &mv88e6352_gpio_ops,
 	.avb_ops = &mv88e6390_avb_ops,
 	.ptp_ops = &mv88e6352_ptp_ops,
+	.serdes_get_sset_count = mv88e6390_serdes_get_sset_count,
+	.serdes_get_strings = mv88e6390_serdes_get_strings,
+	.serdes_get_stats = mv88e6390_serdes_get_stats,
 	.phylink_validate = mv88e6341_phylink_validate,
 };
 
diff --git a/drivers/net/dsa/mv88e6xxx/serdes.c b/drivers/net/dsa/mv88e6xxx/serdes.c
index e4fbef81bc52..b1d46dd8eaab 100644
--- a/drivers/net/dsa/mv88e6xxx/serdes.c
+++ b/drivers/net/dsa/mv88e6xxx/serdes.c
@@ -722,7 +722,7 @@ static struct mv88e6390_serdes_hw_stat mv88e6390_serdes_hw_stats[] = {
 
 int mv88e6390_serdes_get_sset_count(struct mv88e6xxx_chip *chip, int port)
 {
-	if (mv88e6390_serdes_get_lane(chip, port) < 0)
+	if (mv88e6xxx_serdes_get_lane(chip, port) < 0)
 		return 0;
 
 	return ARRAY_SIZE(mv88e6390_serdes_hw_stats);
@@ -734,7 +734,7 @@ int mv88e6390_serdes_get_strings(struct mv88e6xxx_chip *chip,
 	struct mv88e6390_serdes_hw_stat *stat;
 	int i;
 
-	if (mv88e6390_serdes_get_lane(chip, port) < 0)
+	if (mv88e6xxx_serdes_get_lane(chip, port) < 0)
 		return 0;
 
 	for (i = 0; i < ARRAY_SIZE(mv88e6390_serdes_hw_stats); i++) {
@@ -770,7 +770,7 @@ int mv88e6390_serdes_get_stats(struct mv88e6xxx_chip *chip, int port,
 	int lane;
 	int i;
 
-	lane = mv88e6390_serdes_get_lane(chip, port);
+	lane = mv88e6xxx_serdes_get_lane(chip, port);
 	if (lane < 0)
 		return 0;
 

From 953b0dcbe2e3f7bee98cc3bca2ec82c8298e9c16 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Beh=C3=BAn?= <kabel@kernel.org>
Date: Thu, 1 Jul 2021 00:22:31 +0200
Subject: [PATCH 031/158] net: dsa: mv88e6xxx: enable SerDes PCS register dump
 via ethtool -d on Topaz
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit bf3504cea7d7e ("net: dsa: mv88e6xxx: Add 6390 family PCS
registers to ethtool -d") added support for dumping SerDes PCS registers
via ethtool -d for Peridot.

The same implementation is also valid for Topaz, but was not
enabled at the time.

Signed-off-by: Marek Behún <kabel@kernel.org>
Fixes: bf3504cea7d7e ("net: dsa: mv88e6xxx: Add 6390 family PCS registers to ethtool -d")
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/mv88e6xxx/chip.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 1e95a0facbd4..beb41572d04e 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -3626,6 +3626,8 @@ static const struct mv88e6xxx_ops mv88e6141_ops = {
 	.serdes_get_sset_count = mv88e6390_serdes_get_sset_count,
 	.serdes_get_strings = mv88e6390_serdes_get_strings,
 	.serdes_get_stats = mv88e6390_serdes_get_stats,
+	.serdes_get_regs_len = mv88e6390_serdes_get_regs_len,
+	.serdes_get_regs = mv88e6390_serdes_get_regs,
 	.phylink_validate = mv88e6341_phylink_validate,
 };
 
@@ -4435,6 +4437,8 @@ static const struct mv88e6xxx_ops mv88e6341_ops = {
 	.serdes_get_sset_count = mv88e6390_serdes_get_sset_count,
 	.serdes_get_strings = mv88e6390_serdes_get_strings,
 	.serdes_get_stats = mv88e6390_serdes_get_stats,
+	.serdes_get_regs_len = mv88e6390_serdes_get_regs_len,
+	.serdes_get_regs = mv88e6390_serdes_get_regs,
 	.phylink_validate = mv88e6341_phylink_validate,
 };
 

From 6f14078e3ee59ccc5806f7bff0f25f94a6d3ff80 Mon Sep 17 00:00:00 2001
From: Sunil Kumar Kori <skori@marvell.com>
Date: Wed, 30 Jun 2021 15:40:57 +0530
Subject: [PATCH 032/158] octeontx2-af: DMAC filter support in MAC block

The MAC block supports 32 DMAC filters, which are logically
divided among all attached LMACs.

For example, if MAC block0 has one LMAC, the maximum number of supported
filters is 32, whereas if MAC block1 has 4 enabled LMACs, the maximum
number of supported filters is 8 for each LMAC.

This patch adds mbox handlers to add/delete/update a MAC entry
in the DMAC filter table.

Signed-off-by: Sunil Kumar Kori <skori@marvell.com>
Signed-off-by: Hariprasad Kelam <hkelam@marvell.com>
Signed-off-by: Sunil Kovvuri Goutham <sgoutham@marvell.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/marvell/octeontx2/af/cgx.c   | 264 +++++++++++++++++-
 .../net/ethernet/marvell/octeontx2/af/cgx.h   |   7 +
 .../marvell/octeontx2/af/lmac_common.h        |  12 +-
 .../net/ethernet/marvell/octeontx2/af/mbox.h  |  48 +++-
 .../net/ethernet/marvell/octeontx2/af/rvu.h   |   2 +
 .../ethernet/marvell/octeontx2/af/rvu_cgx.c   | 109 ++++++++
 .../ethernet/marvell/octeontx2/af/rvu_nix.c   |   3 +
 7 files changed, 425 insertions(+), 20 deletions(-)

diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c
index fac6474ad694..bc413f96b430 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c
@@ -86,6 +86,22 @@ bool is_lmac_valid(struct cgx *cgx, int lmac_id)
 	return test_bit(lmac_id, &cgx->lmac_bmap);
 }
 
+/* Helper function to get sequential index
+ * given the enabled LMAC of a CGX
+ */
+static int get_sequence_id_of_lmac(struct cgx *cgx, int lmac_id)
+{
+	int tmp, id = 0;
+
+	for_each_set_bit(tmp, &cgx->lmac_bmap, MAX_LMAC_PER_CGX) {
+		if (tmp == lmac_id)
+			break;
+		id++;
+	}
+
+	return id;
+}
+
 struct mac_ops *get_mac_ops(void *cgxd)
 {
 	if (!cgxd)
@@ -211,37 +227,229 @@ static u64 mac2u64 (u8 *mac_addr)
 	return mac;
 }
 
+static void cfg2mac(u64 cfg, u8 *mac_addr)
+{
+	int i, index = 0;
+
+	for (i = ETH_ALEN - 1; i >= 0; i--, index++)
+		mac_addr[i] = (cfg >> (8 * index)) & 0xFF;
+}
+
 int cgx_lmac_addr_set(u8 cgx_id, u8 lmac_id, u8 *mac_addr)
 {
 	struct cgx *cgx_dev = cgx_get_pdata(cgx_id);
+	struct lmac *lmac = lmac_pdata(lmac_id, cgx_dev);
 	struct mac_ops *mac_ops;
+	int index, id;
 	u64 cfg;
 
+	/* access mac_ops to know csr_offset */
 	mac_ops = cgx_dev->mac_ops;
+
 	/* copy 6bytes from macaddr */
 	/* memcpy(&cfg, mac_addr, 6); */
 
 	cfg = mac2u64 (mac_addr);
 
-	cgx_write(cgx_dev, 0, (CGXX_CMRX_RX_DMAC_CAM0 + (lmac_id * 0x8)),
+	id = get_sequence_id_of_lmac(cgx_dev, lmac_id);
+
+	index = id * lmac->mac_to_index_bmap.max;
+
+	cgx_write(cgx_dev, 0, (CGXX_CMRX_RX_DMAC_CAM0 + (index * 0x8)),
 		  cfg | CGX_DMAC_CAM_ADDR_ENABLE | ((u64)lmac_id << 49));
 
 	cfg = cgx_read(cgx_dev, lmac_id, CGXX_CMRX_RX_DMAC_CTL0);
-	cfg |= CGX_DMAC_CTL0_CAM_ENABLE;
+	cfg |= (CGX_DMAC_CTL0_CAM_ENABLE | CGX_DMAC_BCAST_MODE |
+		CGX_DMAC_MCAST_MODE);
 	cgx_write(cgx_dev, lmac_id, CGXX_CMRX_RX_DMAC_CTL0, cfg);
 
 	return 0;
 }
 
+int cgx_lmac_addr_add(u8 cgx_id, u8 lmac_id, u8 *mac_addr)
+{
+	struct cgx *cgx_dev = cgx_get_pdata(cgx_id);
+	struct lmac *lmac = lmac_pdata(lmac_id, cgx_dev);
+	struct mac_ops *mac_ops;
+	int index, idx;
+	u64 cfg = 0;
+	int id;
+
+	if (!lmac)
+		return -ENODEV;
+
+	mac_ops = cgx_dev->mac_ops;
+	/* Get available index where entry is to be installed */
+	idx = rvu_alloc_rsrc(&lmac->mac_to_index_bmap);
+	if (idx < 0)
+		return idx;
+
+	id = get_sequence_id_of_lmac(cgx_dev, lmac_id);
+
+	index = id * lmac->mac_to_index_bmap.max + idx;
+
+	cfg = mac2u64 (mac_addr);
+	cfg |= CGX_DMAC_CAM_ADDR_ENABLE;
+	cfg |= ((u64)lmac_id << 49);
+	cgx_write(cgx_dev, 0, (CGXX_CMRX_RX_DMAC_CAM0 + (index * 0x8)), cfg);
+
+	cfg = cgx_read(cgx_dev, lmac_id, CGXX_CMRX_RX_DMAC_CTL0);
+	cfg |= (CGX_DMAC_BCAST_MODE | CGX_DMAC_CAM_ACCEPT);
+
+	if (is_multicast_ether_addr(mac_addr)) {
+		cfg &= ~GENMASK_ULL(2, 1);
+		cfg |= CGX_DMAC_MCAST_MODE_CAM;
+		lmac->mcast_filters_count++;
+	} else if (!lmac->mcast_filters_count) {
+		cfg |= CGX_DMAC_MCAST_MODE;
+	}
+
+	cgx_write(cgx_dev, lmac_id, CGXX_CMRX_RX_DMAC_CTL0, cfg);
+
+	return idx;
+}
+
+int cgx_lmac_addr_reset(u8 cgx_id, u8 lmac_id)
+{
+	struct cgx *cgx_dev = cgx_get_pdata(cgx_id);
+	struct lmac *lmac = lmac_pdata(lmac_id, cgx_dev);
+	struct mac_ops *mac_ops;
+	u8 index = 0, id;
+	u64 cfg;
+
+	if (!lmac)
+		return -ENODEV;
+
+	mac_ops = cgx_dev->mac_ops;
+	/* Restore index 0 to its default init value as done during
+	 * cgx_lmac_init
+	 */
+	set_bit(0, lmac->mac_to_index_bmap.bmap);
+
+	id = get_sequence_id_of_lmac(cgx_dev, lmac_id);
+
+	index = id * lmac->mac_to_index_bmap.max + index;
+	cgx_write(cgx_dev, 0, (CGXX_CMRX_RX_DMAC_CAM0 + (index * 0x8)), 0);
+
+	/* Reset CGXX_CMRX_RX_DMAC_CTL0 register to default state */
+	cfg = cgx_read(cgx_dev, lmac_id, CGXX_CMRX_RX_DMAC_CTL0);
+	cfg &= ~CGX_DMAC_CAM_ACCEPT;
+	cfg |= (CGX_DMAC_BCAST_MODE | CGX_DMAC_MCAST_MODE);
+	cgx_write(cgx_dev, lmac_id, CGXX_CMRX_RX_DMAC_CTL0, cfg);
+
+	return 0;
+}
+
+/* Allows caller to change macaddress associated with index
+ * in dmac filter table including index 0 reserved for
+ * interface mac address
+ */
+int cgx_lmac_addr_update(u8 cgx_id, u8 lmac_id, u8 *mac_addr, u8 index)
+{
+	struct cgx *cgx_dev = cgx_get_pdata(cgx_id);
+	struct mac_ops *mac_ops;
+	struct lmac *lmac;
+	u64 cfg;
+	int id;
+
+	lmac = lmac_pdata(lmac_id, cgx_dev);
+	if (!lmac)
+		return -ENODEV;
+
+	mac_ops = cgx_dev->mac_ops;
+	/* Validate the index */
+	if (index >= lmac->mac_to_index_bmap.max)
+		return -EINVAL;
+
+	/* ensure index is already set */
+	if (!test_bit(index, lmac->mac_to_index_bmap.bmap))
+		return -EINVAL;
+
+	id = get_sequence_id_of_lmac(cgx_dev, lmac_id);
+
+	index = id * lmac->mac_to_index_bmap.max + index;
+
+	cfg = cgx_read(cgx_dev, 0, (CGXX_CMRX_RX_DMAC_CAM0 + (index * 0x8)));
+	cfg &= ~CGX_RX_DMAC_ADR_MASK;
+	cfg |= mac2u64 (mac_addr);
+
+	cgx_write(cgx_dev, 0, (CGXX_CMRX_RX_DMAC_CAM0 + (index * 0x8)), cfg);
+	return 0;
+}
+
+int cgx_lmac_addr_del(u8 cgx_id, u8 lmac_id, u8 index)
+{
+	struct cgx *cgx_dev = cgx_get_pdata(cgx_id);
+	struct lmac *lmac = lmac_pdata(lmac_id, cgx_dev);
+	struct mac_ops *mac_ops;
+	u8 mac[ETH_ALEN];
+	u64 cfg;
+	int id;
+
+	if (!lmac)
+		return -ENODEV;
+
+	mac_ops = cgx_dev->mac_ops;
+	/* Validate the index */
+	if (index >= lmac->mac_to_index_bmap.max)
+		return -EINVAL;
+
+	/* Skip deletion for reserved index i.e. index 0 */
+	if (index == 0)
+		return 0;
+
+	rvu_free_rsrc(&lmac->mac_to_index_bmap, index);
+
+	id = get_sequence_id_of_lmac(cgx_dev, lmac_id);
+
+	index = id * lmac->mac_to_index_bmap.max + index;
+
+	/* Read MAC address to check whether it is ucast or mcast */
+	cfg = cgx_read(cgx_dev, 0, (CGXX_CMRX_RX_DMAC_CAM0 + (index * 0x8)));
+
+	cfg2mac(cfg, mac);
+	if (is_multicast_ether_addr(mac))
+		lmac->mcast_filters_count--;
+
+	if (!lmac->mcast_filters_count) {
+		cfg = cgx_read(cgx_dev, lmac_id, CGXX_CMRX_RX_DMAC_CTL0);
+		cfg &= ~GENMASK_ULL(2, 1);
+		cfg |= CGX_DMAC_MCAST_MODE;
+		cgx_write(cgx_dev, lmac_id, CGXX_CMRX_RX_DMAC_CTL0, cfg);
+	}
+
+	cgx_write(cgx_dev, 0, (CGXX_CMRX_RX_DMAC_CAM0 + (index * 0x8)), 0);
+
+	return 0;
+}
+
+int cgx_lmac_addr_max_entries_get(u8 cgx_id, u8 lmac_id)
+{
+	struct cgx *cgx_dev = cgx_get_pdata(cgx_id);
+	struct lmac *lmac = lmac_pdata(lmac_id, cgx_dev);
+
+	if (lmac)
+		return lmac->mac_to_index_bmap.max;
+
+	return 0;
+}
+
 u64 cgx_lmac_addr_get(u8 cgx_id, u8 lmac_id)
 {
 	struct cgx *cgx_dev = cgx_get_pdata(cgx_id);
+	struct lmac *lmac = lmac_pdata(lmac_id, cgx_dev);
 	struct mac_ops *mac_ops;
+	int index;
 	u64 cfg;
+	int id;
 
 	mac_ops = cgx_dev->mac_ops;
 
-	cfg = cgx_read(cgx_dev, 0, CGXX_CMRX_RX_DMAC_CAM0 + lmac_id * 0x8);
+	id = get_sequence_id_of_lmac(cgx_dev, lmac_id);
+
+	index = id * lmac->mac_to_index_bmap.max;
+
+	cfg = cgx_read(cgx_dev, 0, CGXX_CMRX_RX_DMAC_CAM0 + index * 0x8);
 	return cfg & CGX_RX_DMAC_ADR_MASK;
 }
 
@@ -297,35 +505,51 @@ int cgx_lmac_internal_loopback(void *cgxd, int lmac_id, bool enable)
 void cgx_lmac_promisc_config(int cgx_id, int lmac_id, bool enable)
 {
 	struct cgx *cgx = cgx_get_pdata(cgx_id);
+	struct lmac *lmac = lmac_pdata(lmac_id, cgx);
+	u16 max_dmac = lmac->mac_to_index_bmap.max;
 	struct mac_ops *mac_ops;
+	int index, i;
 	u64 cfg = 0;
+	int id;
 
 	if (!cgx)
 		return;
 
+	id = get_sequence_id_of_lmac(cgx, lmac_id);
+
 	mac_ops = cgx->mac_ops;
 	if (enable) {
 		/* Enable promiscuous mode on LMAC */
 		cfg = cgx_read(cgx, lmac_id, CGXX_CMRX_RX_DMAC_CTL0);
-		cfg &= ~(CGX_DMAC_CAM_ACCEPT | CGX_DMAC_MCAST_MODE);
-		cfg |= CGX_DMAC_BCAST_MODE;
+		cfg &= ~CGX_DMAC_CAM_ACCEPT;
+		cfg |= (CGX_DMAC_BCAST_MODE | CGX_DMAC_MCAST_MODE);
 		cgx_write(cgx, lmac_id, CGXX_CMRX_RX_DMAC_CTL0, cfg);
 
-		cfg = cgx_read(cgx, 0,
-			       (CGXX_CMRX_RX_DMAC_CAM0 + lmac_id * 0x8));
-		cfg &= ~CGX_DMAC_CAM_ADDR_ENABLE;
-		cgx_write(cgx, 0,
-			  (CGXX_CMRX_RX_DMAC_CAM0 + lmac_id * 0x8), cfg);
+		for (i = 0; i < max_dmac; i++) {
+			index = id * max_dmac + i;
+			cfg = cgx_read(cgx, 0,
+				       (CGXX_CMRX_RX_DMAC_CAM0 + index * 0x8));
+			cfg &= ~CGX_DMAC_CAM_ADDR_ENABLE;
+			cgx_write(cgx, 0,
+				  (CGXX_CMRX_RX_DMAC_CAM0 + index * 0x8), cfg);
+		}
 	} else {
 		/* Disable promiscuous mode */
 		cfg = cgx_read(cgx, lmac_id, CGXX_CMRX_RX_DMAC_CTL0);
 		cfg |= CGX_DMAC_CAM_ACCEPT | CGX_DMAC_MCAST_MODE;
 		cgx_write(cgx, lmac_id, CGXX_CMRX_RX_DMAC_CTL0, cfg);
-		cfg = cgx_read(cgx, 0,
-			       (CGXX_CMRX_RX_DMAC_CAM0 + lmac_id * 0x8));
-		cfg |= CGX_DMAC_CAM_ADDR_ENABLE;
-		cgx_write(cgx, 0,
-			  (CGXX_CMRX_RX_DMAC_CAM0 + lmac_id * 0x8), cfg);
+		for (i = 0; i < max_dmac; i++) {
+			index = id * max_dmac + i;
+			cfg = cgx_read(cgx, 0,
+				       (CGXX_CMRX_RX_DMAC_CAM0 + index * 0x8));
+			if ((cfg & CGX_RX_DMAC_ADR_MASK) != 0) {
+				cfg |= CGX_DMAC_CAM_ADDR_ENABLE;
+				cgx_write(cgx, 0,
+					  (CGXX_CMRX_RX_DMAC_CAM0 +
+					   index * 0x8),
+					  cfg);
+			}
+		}
 	}
 }
 
@@ -1234,6 +1458,15 @@ static int cgx_lmac_init(struct cgx *cgx)
 		}
 
 		lmac->cgx = cgx;
+		lmac->mac_to_index_bmap.max =
+				MAX_DMAC_ENTRIES_PER_CGX / cgx->lmac_count;
+		err = rvu_alloc_bitmap(&lmac->mac_to_index_bmap);
+		if (err)
+			return err;
+
+		/* Reserve first entry for default MAC address */
+		set_bit(0, lmac->mac_to_index_bmap.bmap);
+
 		init_waitqueue_head(&lmac->wq_cmd_cmplt);
 		mutex_init(&lmac->cmd_lock);
 		spin_lock_init(&lmac->event_cb_lock);
@@ -1274,6 +1507,7 @@ static int cgx_lmac_exit(struct cgx *cgx)
 			continue;
 		cgx->mac_ops->mac_pause_frm_config(cgx, lmac->lmac_id, false);
 		cgx_configure_interrupt(cgx, lmac, lmac->lmac_id, true);
+		kfree(lmac->mac_to_index_bmap.bmap);
 		kfree(lmac->name);
 		kfree(lmac);
 	}
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cgx.h b/drivers/net/ethernet/marvell/octeontx2/af/cgx.h
index 12521262164a..0c613f83a41c 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/cgx.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/cgx.h
@@ -23,6 +23,7 @@
 
 #define CGX_ID_MASK			0x7
 #define MAX_LMAC_PER_CGX		4
+#define MAX_DMAC_ENTRIES_PER_CGX	32
 #define CGX_FIFO_LEN			65536 /* 64K for both Rx & Tx */
 #define CGX_OFFSET(x)			((x) * MAX_LMAC_PER_CGX)
 
@@ -46,6 +47,7 @@
 #define CGXX_CMRX_RX_DMAC_CTL0		(0x1F8 + mac_ops->csr_offset)
 #define CGX_DMAC_CTL0_CAM_ENABLE	BIT_ULL(3)
 #define CGX_DMAC_CAM_ACCEPT		BIT_ULL(3)
+#define CGX_DMAC_MCAST_MODE_CAM		BIT_ULL(2)
 #define CGX_DMAC_MCAST_MODE		BIT_ULL(1)
 #define CGX_DMAC_BCAST_MODE		BIT_ULL(0)
 #define CGXX_CMRX_RX_DMAC_CAM0		(0x200 + mac_ops->csr_offset)
@@ -139,7 +141,11 @@ int cgx_get_rx_stats(void *cgxd, int lmac_id, int idx, u64 *rx_stat);
 int cgx_lmac_rx_tx_enable(void *cgxd, int lmac_id, bool enable);
 int cgx_lmac_tx_enable(void *cgxd, int lmac_id, bool enable);
 int cgx_lmac_addr_set(u8 cgx_id, u8 lmac_id, u8 *mac_addr);
+int cgx_lmac_addr_reset(u8 cgx_id, u8 lmac_id);
 u64 cgx_lmac_addr_get(u8 cgx_id, u8 lmac_id);
+int cgx_lmac_addr_add(u8 cgx_id, u8 lmac_id, u8 *mac_addr);
+int cgx_lmac_addr_del(u8 cgx_id, u8 lmac_id, u8 index);
+int cgx_lmac_addr_max_entries_get(u8 cgx_id, u8 lmac_id);
 void cgx_lmac_promisc_config(int cgx_id, int lmac_id, bool enable);
 void cgx_lmac_enadis_rx_pause_fwding(void *cgxd, int lmac_id, bool enable);
 int cgx_lmac_internal_loopback(void *cgxd, int lmac_id, bool enable);
@@ -165,4 +171,5 @@ u8 cgx_get_lmacid(void *cgxd, u8 lmac_index);
 unsigned long cgx_get_lmac_bmap(void *cgxd);
 void cgx_lmac_write(int cgx_id, int lmac_id, u64 offset, u64 val);
 u64 cgx_lmac_read(int cgx_id, int lmac_id, u64 offset);
+int cgx_lmac_addr_update(u8 cgx_id, u8 lmac_id, u8 *mac_addr, u8 index);
 #endif /* CGX_H */
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/lmac_common.h b/drivers/net/ethernet/marvell/octeontx2/af/lmac_common.h
index 45706fd87120..a8b7b1c7a1d5 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/lmac_common.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/lmac_common.h
@@ -10,17 +10,19 @@
 #include "rvu.h"
 #include "cgx.h"
 /**
- * struct lmac
+ * struct lmac - per lmac locks and properties
  * @wq_cmd_cmplt:	waitq to keep the process blocked until cmd completion
  * @cmd_lock:		Lock to serialize the command interface
  * @resp:		command response
  * @link_info:		link related information
+ * @mac_to_index_bmap:	Mac address to CGX table index mapping
  * @event_cb:		callback for linkchange events
  * @event_cb_lock:	lock for serializing callback with unregister
+ * @cgx:		parent cgx port
+ * @mcast_filters_count:  Number of multicast filters installed
+ * @lmac_id:		lmac port id
  * @cmd_pend:		flag set before new command is started
  *			flag cleared after command response is received
- * @cgx:		parent cgx port
- * @lmac_id:		lmac port id
  * @name:		lmac port name
  */
 struct lmac {
@@ -29,12 +31,14 @@ struct lmac {
 	struct mutex cmd_lock;
 	u64 resp;
 	struct cgx_link_user_info link_info;
+	struct rsrc_bmap mac_to_index_bmap;
 	struct cgx_event_cb event_cb;
 	/* lock for serializing callback with unregister */
 	spinlock_t event_cb_lock;
-	bool cmd_pend;
 	struct cgx *cgx;
+	u8 mcast_filters_count;
 	u8 lmac_id;
+	bool cmd_pend;
 	char *name;
 };
 
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h
index 9672cbf8a90a..f5ec39de026a 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h
@@ -165,7 +165,15 @@ M(CGX_SET_LINK_MODE,	0x214, cgx_set_link_mode, cgx_set_link_mode_req,\
 M(CGX_FEATURES_GET,	0x215, cgx_features_get, msg_req,		\
 			       cgx_features_info_msg)			\
 M(RPM_STATS,		0x216, rpm_stats, msg_req, rpm_stats_rsp)	\
- /* NPA mbox IDs (range 0x400 - 0x5FF) */				\
+M(CGX_MAC_ADDR_ADD,	0x217, cgx_mac_addr_add, cgx_mac_addr_add_req,    \
+			       cgx_mac_addr_add_rsp)		\
+M(CGX_MAC_ADDR_DEL,	0x218, cgx_mac_addr_del, cgx_mac_addr_del_req,    \
+			       msg_rsp)		\
+M(CGX_MAC_MAX_ENTRIES_GET, 0x219, cgx_mac_max_entries_get, msg_req,    \
+				  cgx_max_dmac_entries_get_rsp)		\
+M(CGX_MAC_ADDR_RESET,	0x21A, cgx_mac_addr_reset, msg_req, msg_rsp)	\
+M(CGX_MAC_ADDR_UPDATE,	0x21B, cgx_mac_addr_update, cgx_mac_addr_update_req, \
+			       msg_rsp)					\
 /* NPA mbox IDs (range 0x400 - 0x5FF) */				\
 M(NPA_LF_ALLOC,		0x400, npa_lf_alloc,				\
 				npa_lf_alloc_req, npa_lf_alloc_rsp)	\
@@ -403,6 +411,38 @@ struct cgx_mac_addr_set_or_get {
 	u8 mac_addr[ETH_ALEN];
 };
 
+/* Structure for requesting the operation to
+ * add DMAC filter entry into CGX interface
+ */
+struct cgx_mac_addr_add_req {
+	struct mbox_msghdr hdr;
+	u8 mac_addr[ETH_ALEN];
+};
+
+/* Structure for response against the operation to
+ * add DMAC filter entry into CGX interface
+ */
+struct cgx_mac_addr_add_rsp {
+	struct mbox_msghdr hdr;
+	u8 index;
+};
+
+/* Structure for requesting the operation to
+ * delete DMAC filter entry from CGX interface
+ */
+struct cgx_mac_addr_del_req {
+	struct mbox_msghdr hdr;
+	u8 index;
+};
+
+/* Structure for response against the operation to
+ * get maximum supported DMAC filter entries
+ */
+struct cgx_max_dmac_entries_get_rsp {
+	struct mbox_msghdr hdr;
+	u8 max_dmac_filters;
+};
+
 struct cgx_link_user_info {
 	uint64_t link_up:1;
 	uint64_t full_duplex:1;
@@ -501,6 +541,12 @@ struct cgx_set_link_mode_rsp {
 	int status;
 };
 
+struct cgx_mac_addr_update_req {
+	struct mbox_msghdr hdr;
+	u8 mac_addr[ETH_ALEN];
+	u8 index;
+};
+
 #define RVU_LMAC_FEAT_FC		BIT_ULL(0) /* pause frames */
 #define RVU_LMAC_FEAT_PTP		BIT_ULL(1) /* precision time protocol */
 #define RVU_MAC_VERSION			BIT_ULL(2)
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
index 3c0a7e981f72..bc0d24507033 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
@@ -657,6 +657,8 @@ void rvu_cgx_enadis_rx_bp(struct rvu *rvu, int pf, bool enable);
 int rvu_cgx_start_stop_io(struct rvu *rvu, u16 pcifunc, bool start);
 int rvu_cgx_nix_cuml_stats(struct rvu *rvu, void *cgxd, int lmac_id, int index,
 			   int rxtxflag, u64 *stat);
+void rvu_cgx_disable_dmac_entries(struct rvu *rvu, u16 pcifunc);
+
 /* NPA APIs */
 int rvu_npa_init(struct rvu *rvu);
 void rvu_npa_freemem(struct rvu *rvu);
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c
index 6e2bf4fcd29c..9c6f4ba2d726 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c
@@ -454,6 +454,31 @@ int rvu_cgx_config_rxtx(struct rvu *rvu, u16 pcifunc, bool start)
 	return 0;
 }
 
+void rvu_cgx_disable_dmac_entries(struct rvu *rvu, u16 pcifunc)
+{
+	int pf = rvu_get_pf(pcifunc);
+	int i = 0, lmac_count = 0;
+	u8 max_dmac_filters;
+	u8 cgx_id, lmac_id;
+	void *cgx_dev;
+
+	if (!is_cgx_config_permitted(rvu, pcifunc))
+		return;
+
+	rvu_get_cgx_lmac_id(rvu->pf2cgxlmac_map[pf], &cgx_id, &lmac_id);
+	cgx_dev = cgx_get_pdata(cgx_id);
+	lmac_count = cgx_get_lmac_cnt(cgx_dev);
+	max_dmac_filters = MAX_DMAC_ENTRIES_PER_CGX / lmac_count;
+
+	for (i = 0; i < max_dmac_filters; i++)
+		cgx_lmac_addr_del(cgx_id, lmac_id, i);
+
+	/* cgx_lmac_addr_del() does not clear the entry at index 0,
+	 * so that entry has to be reset explicitly.
+	 */
+	cgx_lmac_addr_reset(cgx_id, lmac_id);
+}
+
 int rvu_mbox_handler_cgx_start_rxtx(struct rvu *rvu, struct msg_req *req,
 				    struct msg_rsp *rsp)
 {
@@ -557,6 +582,63 @@ int rvu_mbox_handler_cgx_mac_addr_set(struct rvu *rvu,
 	return 0;
 }
 
+int rvu_mbox_handler_cgx_mac_addr_add(struct rvu *rvu,
+				      struct cgx_mac_addr_add_req *req,
+				      struct cgx_mac_addr_add_rsp *rsp)
+{
+	int pf = rvu_get_pf(req->hdr.pcifunc);
+	u8 cgx_id, lmac_id;
+	int rc = 0;
+
+	if (!is_cgx_config_permitted(rvu, req->hdr.pcifunc))
+		return -EPERM;
+
+	rvu_get_cgx_lmac_id(rvu->pf2cgxlmac_map[pf], &cgx_id, &lmac_id);
+	rc = cgx_lmac_addr_add(cgx_id, lmac_id, req->mac_addr);
+	if (rc >= 0) {
+		rsp->index = rc;
+		return 0;
+	}
+
+	return rc;
+}
+
+int rvu_mbox_handler_cgx_mac_addr_del(struct rvu *rvu,
+				      struct cgx_mac_addr_del_req *req,
+				      struct msg_rsp *rsp)
+{
+	int pf = rvu_get_pf(req->hdr.pcifunc);
+	u8 cgx_id, lmac_id;
+
+	if (!is_cgx_config_permitted(rvu, req->hdr.pcifunc))
+		return -EPERM;
+
+	rvu_get_cgx_lmac_id(rvu->pf2cgxlmac_map[pf], &cgx_id, &lmac_id);
+	return cgx_lmac_addr_del(cgx_id, lmac_id, req->index);
+}
+
+int rvu_mbox_handler_cgx_mac_max_entries_get(struct rvu *rvu,
+					     struct msg_req *req,
+					     struct cgx_max_dmac_entries_get_rsp
+					     *rsp)
+{
+	int pf = rvu_get_pf(req->hdr.pcifunc);
+	u8 cgx_id, lmac_id;
+
+	/* If msg is received from PFs(which are not mapped to CGX LMACs)
+	 * or VF then no entries are allocated for DMAC filters at CGX level.
+	 * So returning zero.
+	 */
+	if (!is_cgx_config_permitted(rvu, req->hdr.pcifunc)) {
+		rsp->max_dmac_filters = 0;
+		return 0;
+	}
+
+	rvu_get_cgx_lmac_id(rvu->pf2cgxlmac_map[pf], &cgx_id, &lmac_id);
+	rsp->max_dmac_filters = cgx_lmac_addr_max_entries_get(cgx_id, lmac_id);
+	return 0;
+}
+
 int rvu_mbox_handler_cgx_mac_addr_get(struct rvu *rvu,
 				      struct cgx_mac_addr_set_or_get *req,
 				      struct cgx_mac_addr_set_or_get *rsp)
@@ -953,3 +1035,30 @@ int rvu_mbox_handler_cgx_set_link_mode(struct rvu *rvu,
 	rsp->status = cgx_set_link_mode(cgxd, req->args, cgx_idx, lmac);
 	return 0;
 }
+
+int rvu_mbox_handler_cgx_mac_addr_reset(struct rvu *rvu, struct msg_req *req,
+					struct msg_rsp *rsp)
+{
+	int pf = rvu_get_pf(req->hdr.pcifunc);
+	u8 cgx_id, lmac_id;
+
+	if (!is_cgx_config_permitted(rvu, req->hdr.pcifunc))
+		return -EPERM;
+
+	rvu_get_cgx_lmac_id(rvu->pf2cgxlmac_map[pf], &cgx_id, &lmac_id);
+	return cgx_lmac_addr_reset(cgx_id, lmac_id);
+}
+
+int rvu_mbox_handler_cgx_mac_addr_update(struct rvu *rvu,
+					 struct cgx_mac_addr_update_req *req,
+					 struct msg_rsp *rsp)
+{
+	int pf = rvu_get_pf(req->hdr.pcifunc);
+	u8 cgx_id, lmac_id;
+
+	if (!is_cgx_config_permitted(rvu, req->hdr.pcifunc))
+		return -EPERM;
+
+	rvu_get_cgx_lmac_id(rvu->pf2cgxlmac_map[pf], &cgx_id, &lmac_id);
+	return cgx_lmac_addr_update(cgx_id, lmac_id, req->mac_addr, req->index);
+}
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
index d6f8210652c5..aeae37704428 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
@@ -346,6 +346,9 @@ static void nix_interface_deinit(struct rvu *rvu, u16 pcifunc, u8 nixlf)
 
 	/* Free and disable any MCAM entries used by this NIX LF */
 	rvu_npc_disable_mcam_entries(rvu, pcifunc, nixlf);
+
+	/* Disable DMAC filters used */
+	rvu_cgx_disable_dmac_entries(rvu, pcifunc);
 }
 
 int rvu_mbox_handler_nix_bp_disable(struct rvu *rvu,

From dbc52debf95f8f902f74309e7ae0de38e1ed4e4c Mon Sep 17 00:00:00 2001
From: Hariprasad Kelam <hkelam@marvell.com>
Date: Wed, 30 Jun 2021 15:40:58 +0530
Subject: [PATCH 033/158] octeontx2-af: Debugfs support for DMAC filters

Add debugfs support to display CGX/RPM DMAC filter
table associated with pf.

cat /sys/kernel/debug/octeontx2/cgx/cgx0/lmac0/mac_filter

PCI dev       RVUPF  BROADCAST  MULTICAST  FILTER-MODE
0002:02:00.0  PF2    ACCEPT     ACCEPT     UNICAST

DMAC-INDEX  ADDRESS
      0     00:0f:b7:06:17:06
      1     1a:1b:1c:1d:1e:01
      2     1a:1b:1c:1d:1e:02

Signed-off-by: Hariprasad Kelam <hkelam@marvell.com>
Signed-off-by: Sunil Kovvuri Goutham <sgoutham@marvell.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/marvell/octeontx2/af/cgx.c   | 28 ++++++
 .../net/ethernet/marvell/octeontx2/af/cgx.h   |  3 +
 .../net/ethernet/marvell/octeontx2/af/rvu.h   |  1 +
 .../ethernet/marvell/octeontx2/af/rvu_cgx.c   |  2 +-
 .../marvell/octeontx2/af/rvu_debugfs.c        | 88 +++++++++++++++++--
 5 files changed, 113 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c
index bc413f96b430..9169849881bf 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c
@@ -266,6 +266,34 @@ int cgx_lmac_addr_set(u8 cgx_id, u8 lmac_id, u8 *mac_addr)
 	return 0;
 }
 
+u64 cgx_read_dmac_ctrl(void *cgxd, int lmac_id)
+{
+	struct mac_ops *mac_ops;
+	struct cgx *cgx;
+
+	if (!cgxd || !is_lmac_valid(cgxd, lmac_id))
+		return 0;
+
+	cgx = cgxd;
+	/* CGXX_CMRX_RX_DMAC_CTL0 expands to use mac_ops->csr_offset */
+	mac_ops = cgx->mac_ops;
+
+	return cgx_read(cgxd, lmac_id, CGXX_CMRX_RX_DMAC_CTL0);
+}
+
+u64 cgx_read_dmac_entry(void *cgxd, int index)
+{
+	struct mac_ops *mac_ops;
+	struct cgx *cgx;
+
+	if (!cgxd)
+		return 0;
+
+	cgx = cgxd;
+	mac_ops = cgx->mac_ops;
+	return cgx_read(cgx, 0, (CGXX_CMRX_RX_DMAC_CAM0 + (index * 8)));
+}
+
 int cgx_lmac_addr_add(u8 cgx_id, u8 lmac_id, u8 *mac_addr)
 {
 	struct cgx *cgx_dev = cgx_get_pdata(cgx_id);
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cgx.h b/drivers/net/ethernet/marvell/octeontx2/af/cgx.h
index 0c613f83a41c..237ba2b56210 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/cgx.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/cgx.h
@@ -52,6 +52,7 @@
 #define CGX_DMAC_BCAST_MODE		BIT_ULL(0)
 #define CGXX_CMRX_RX_DMAC_CAM0		(0x200 + mac_ops->csr_offset)
 #define CGX_DMAC_CAM_ADDR_ENABLE	BIT_ULL(48)
+#define CGX_DMAC_CAM_ENTRY_LMACID	GENMASK_ULL(50, 49)
 #define CGXX_CMRX_RX_DMAC_CAM1		0x400
 #define CGX_RX_DMAC_ADR_MASK		GENMASK_ULL(47, 0)
 #define CGXX_CMRX_TX_STAT0		0x700
@@ -172,4 +173,6 @@ unsigned long cgx_get_lmac_bmap(void *cgxd);
 void cgx_lmac_write(int cgx_id, int lmac_id, u64 offset, u64 val);
 u64 cgx_lmac_read(int cgx_id, int lmac_id, u64 offset);
 int cgx_lmac_addr_update(u8 cgx_id, u8 lmac_id, u8 *mac_addr, u8 index);
+u64 cgx_read_dmac_ctrl(void *cgxd, int lmac_id);
+u64 cgx_read_dmac_entry(void *cgxd, int index);
 #endif /* CGX_H */
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
index bc0d24507033..10e58a5d5861 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
@@ -744,6 +744,7 @@ void npc_read_mcam_entry(struct rvu *rvu, struct npc_mcam *mcam,
 bool is_mac_feature_supported(struct rvu *rvu, int pf, int feature);
 u32  rvu_cgx_get_fifolen(struct rvu *rvu);
 void *rvu_first_cgx_pdata(struct rvu *rvu);
+int cgxlmac_to_pf(struct rvu *rvu, int cgx_id, int lmac_id);
 
 int npc_get_nixlf_mcam_index(struct npc_mcam *mcam, u16 pcifunc, int nixlf,
 			     int type);
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c
index 9c6f4ba2d726..6cc8fbb7190c 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c
@@ -63,7 +63,7 @@ static u16 cgxlmac_to_pfmap(struct rvu *rvu, u8 cgx_id, u8 lmac_id)
 	return rvu->cgxlmac2pf_map[CGX_OFFSET(cgx_id) + lmac_id];
 }
 
-static int cgxlmac_to_pf(struct rvu *rvu, int cgx_id, int lmac_id)
+int cgxlmac_to_pf(struct rvu *rvu, int cgx_id, int lmac_id)
 {
 	unsigned long pfmap;
 
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c
index 3cc3c6fd1d84..370d4ca1e5ed 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c
@@ -1971,10 +1971,9 @@ static int cgx_print_stats(struct seq_file *s, int lmac_id)
 	return err;
 }
 
-static int rvu_dbg_cgx_stat_display(struct seq_file *filp, void *unused)
+static int rvu_dbg_derive_lmacid(struct seq_file *filp, int *lmac_id)
 {
 	struct dentry *current_dir;
-	int err, lmac_id;
 	char *buf;
 
 	current_dir = filp->file->f_path.dentry->d_parent;
@@ -1982,17 +1981,87 @@ static int rvu_dbg_cgx_stat_display(struct seq_file *filp, void *unused)
 	if (!buf)
 		return -EINVAL;
 
-	err = kstrtoint(buf + 1, 10, &lmac_id);
-	if (!err) {
-		err = cgx_print_stats(filp, lmac_id);
-		if (err)
-			return err;
-	}
+	return kstrtoint(buf + 1, 10, lmac_id);
+}
+
+static int rvu_dbg_cgx_stat_display(struct seq_file *filp, void *unused)
+{
+	int lmac_id, err;
+
+	err = rvu_dbg_derive_lmacid(filp, &lmac_id);
+	if (!err)
+		return cgx_print_stats(filp, lmac_id);
+
 	return err;
 }
 
 RVU_DEBUG_SEQ_FOPS(cgx_stat, cgx_stat_display, NULL);
 
+static int cgx_print_dmac_flt(struct seq_file *s, int lmac_id)
+{
+	struct pci_dev *pdev = NULL;
+	void *cgxd = s->private;
+	char *bcast, *mcast;
+	u16 index, domain;
+	u8 dmac[ETH_ALEN];
+	struct rvu *rvu;
+	u64 cfg, mac;
+	int pf;
+
+	rvu = pci_get_drvdata(pci_get_device(PCI_VENDOR_ID_CAVIUM,
+					     PCI_DEVID_OCTEONTX2_RVU_AF, NULL));
+	if (!rvu)
+		return -ENODEV;
+
+	pf = cgxlmac_to_pf(rvu, cgx_get_cgxid(cgxd), lmac_id);
+	domain = 2;
+
+	pdev = pci_get_domain_bus_and_slot(domain, pf + 1, 0);
+	if (!pdev)
+		return 0;
+
+	cfg = cgx_read_dmac_ctrl(cgxd, lmac_id);
+	bcast = cfg & CGX_DMAC_BCAST_MODE ? "ACCEPT" : "REJECT";
+	mcast = cfg & CGX_DMAC_MCAST_MODE ? "ACCEPT" : "REJECT";
+
+	seq_puts(s,
+		 "PCI dev       RVUPF   BROADCAST  MULTICAST  FILTER-MODE\n");
+	seq_printf(s, "%s  PF%d  %9s  %9s",
+		   dev_name(&pdev->dev), pf, bcast, mcast);
+	if (cfg & CGX_DMAC_CAM_ACCEPT)
+		seq_printf(s, "%12s\n\n", "UNICAST");
+	else
+		seq_printf(s, "%16s\n\n", "PROMISCUOUS");
+
+	seq_puts(s, "\nDMAC-INDEX  ADDRESS\n");
+
+	for (index = 0; index < MAX_DMAC_ENTRIES_PER_CGX; index++) {
+		cfg = cgx_read_dmac_entry(cgxd, index);
+		/* Display enabled dmac entries associated with current lmac */
+		if (lmac_id == FIELD_GET(CGX_DMAC_CAM_ENTRY_LMACID, cfg) &&
+		    FIELD_GET(CGX_DMAC_CAM_ADDR_ENABLE, cfg)) {
+			mac = FIELD_GET(CGX_RX_DMAC_ADR_MASK, cfg);
+			u64_to_ether_addr(mac, dmac);
+			seq_printf(s, "%7d     %pM\n", index, dmac);
+		}
+	}
+	pci_dev_put(pdev);	/* ref from pci_get_domain_bus_and_slot() */
+	return 0;
+}
+
+static int rvu_dbg_cgx_dmac_flt_display(struct seq_file *filp, void *unused)
+{
+	int err, lmac_id;
+
+	err = rvu_dbg_derive_lmacid(filp, &lmac_id);
+	if (!err)
+		return cgx_print_dmac_flt(filp, lmac_id);
+
+	return err;
+}
+
+RVU_DEBUG_SEQ_FOPS(cgx_dmac_flt, cgx_dmac_flt_display, NULL);
+
 static void rvu_dbg_cgx_init(struct rvu *rvu)
 {
 	struct mac_ops *mac_ops;
@@ -2029,6 +2098,9 @@ static void rvu_dbg_cgx_init(struct rvu *rvu)
 
 			debugfs_create_file("stats", 0600, rvu->rvu_dbg.lmac,
 					    cgx, &rvu_dbg_cgx_stat_fops);
+			debugfs_create_file("mac_filter", 0600,
+					    rvu->rvu_dbg.lmac, cgx,
+					    &rvu_dbg_cgx_dmac_flt_fops);
 		}
 	}
 }

From 79d2be385e9eabe4403eb85bcc7d3efc6b936a76 Mon Sep 17 00:00:00 2001
From: Hariprasad Kelam <hkelam@marvell.com>
Date: Wed, 30 Jun 2021 15:40:59 +0530
Subject: [PATCH 034/158] octeontx2-pf: offload DMAC filters to CGX/RPM block

DMAC filtering can be achieved by either NPC MCAM rules or
CGX/RPM MAC filters. Currently we are achieving this by NPC
MCAM rules. This patch offloads DMAC filters to CGX/RPM MAC
filters instead of NPC MCAM rules. Offloading DMAC filters to
the CGX/RPM block helps reduce traffic to the NPC block and
saves MCAM rules.

Signed-off-by: Hariprasad Kelam <hkelam@marvell.com>
Signed-off-by: Sunil Kovvuri Goutham <sgoutham@marvell.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../ethernet/marvell/octeontx2/nic/Makefile   |   2 +-
 .../marvell/octeontx2/nic/otx2_common.c       |   3 +
 .../marvell/octeontx2/nic/otx2_common.h       |  11 +
 .../marvell/octeontx2/nic/otx2_dmac_flt.c     | 173 +++++++++++++
 .../marvell/octeontx2/nic/otx2_flows.c        | 229 +++++++++++++++++-
 .../ethernet/marvell/octeontx2/nic/otx2_pf.c  |   9 +
 6 files changed, 417 insertions(+), 10 deletions(-)
 create mode 100644 drivers/net/ethernet/marvell/octeontx2/nic/otx2_dmac_flt.c

diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/Makefile b/drivers/net/ethernet/marvell/octeontx2/nic/Makefile
index 457c94793e63..3254b02205ca 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/Makefile
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/Makefile
@@ -7,7 +7,7 @@ obj-$(CONFIG_OCTEONTX2_PF) += rvu_nicpf.o
 obj-$(CONFIG_OCTEONTX2_VF) += rvu_nicvf.o
 
 rvu_nicpf-y := otx2_pf.o otx2_common.o otx2_txrx.o otx2_ethtool.o \
-		     otx2_ptp.o otx2_flows.o otx2_tc.o cn10k.o
+               otx2_ptp.o otx2_flows.o otx2_tc.o cn10k.o otx2_dmac_flt.o
 rvu_nicvf-y := otx2_vf.o
 
 ccflags-y += -I$(srctree)/drivers/net/ethernet/marvell/octeontx2/af
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
index cf7875d51d87..7cccd802c4ed 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
@@ -210,6 +210,9 @@ int otx2_set_mac_address(struct net_device *netdev, void *p)
 		/* update dmac field in vlan offload rule */
 		if (pfvf->flags & OTX2_FLAG_RX_VLAN_SUPPORT)
 			otx2_install_rxvlan_offload_flow(pfvf);
+		/* update dmac address in ntuple and DMAC filter list */
+		if (pfvf->flags & OTX2_FLAG_DMACFLTR_SUPPORT)
+			otx2_dmacflt_update_pfmac_flow(pfvf);
 	} else {
 		return -EPERM;
 	}
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
index 20a9c69f020f..8fd58cd07f50 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
@@ -288,6 +288,9 @@ struct otx2_flow_config {
 	u16			tc_flower_offset;
 	u16                     ntuple_max_flows;
 	u16			tc_max_flows;
+	u8			dmacflt_max_flows;
+	u8			*bmap_to_dmacindex;
+	unsigned long		dmacflt_bmap;
 	struct list_head	flow_list;
 };
 
@@ -329,6 +332,7 @@ struct otx2_nic {
 #define OTX2_FLAG_TC_FLOWER_SUPPORT		BIT_ULL(11)
 #define OTX2_FLAG_TC_MATCHALL_EGRESS_ENABLED	BIT_ULL(12)
 #define OTX2_FLAG_TC_MATCHALL_INGRESS_ENABLED	BIT_ULL(13)
+#define OTX2_FLAG_DMACFLTR_SUPPORT		BIT_ULL(14)
 	u64			flags;
 
 	struct otx2_qset	qset;
@@ -834,4 +838,11 @@ int otx2_init_tc(struct otx2_nic *nic);
 void otx2_shutdown_tc(struct otx2_nic *nic);
 int otx2_setup_tc(struct net_device *netdev, enum tc_setup_type type,
 		  void *type_data);
+/* CGX/RPM DMAC filters support */
+int otx2_dmacflt_get_max_cnt(struct otx2_nic *pf);
+int otx2_dmacflt_add(struct otx2_nic *pf, const u8 *mac, u8 bit_pos);
+int otx2_dmacflt_remove(struct otx2_nic *pf, const u8 *mac, u8 bit_pos);
+int otx2_dmacflt_update(struct otx2_nic *pf, u8 *mac, u8 bit_pos);
+void otx2_dmacflt_reinstall_flows(struct otx2_nic *pf);
+void otx2_dmacflt_update_pfmac_flow(struct otx2_nic *pfvf);
 #endif /* OTX2_COMMON_H */
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_dmac_flt.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_dmac_flt.c
new file mode 100644
index 000000000000..ffe3e94562d0
--- /dev/null
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_dmac_flt.c
@@ -0,0 +1,173 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Marvell OcteonTx2 RVU Physical Function ethernet driver
+ *
+ * Copyright (C) 2021 Marvell.
+ */
+
+#include "otx2_common.h"
+
+static int otx2_dmacflt_do_add(struct otx2_nic *pf, const u8 *mac,
+			       u8 *dmac_index)
+{
+	struct cgx_mac_addr_add_req *req;
+	struct cgx_mac_addr_add_rsp *rsp;
+	int err;
+
+	mutex_lock(&pf->mbox.lock);
+
+	req = otx2_mbox_alloc_msg_cgx_mac_addr_add(&pf->mbox);
+	if (!req) {
+		mutex_unlock(&pf->mbox.lock);
+		return -ENOMEM;
+	}
+
+	ether_addr_copy(req->mac_addr, mac);
+	err = otx2_sync_mbox_msg(&pf->mbox);
+
+	if (!err) {
+		rsp = (struct cgx_mac_addr_add_rsp *)
+			 otx2_mbox_get_rsp(&pf->mbox.mbox, 0, &req->hdr);
+		*dmac_index = rsp->index;
+	}
+
+	mutex_unlock(&pf->mbox.lock);
+	return err;
+}
+
+static int otx2_dmacflt_add_pfmac(struct otx2_nic *pf)
+{
+	struct cgx_mac_addr_set_or_get *req;
+	int err;
+
+	mutex_lock(&pf->mbox.lock);
+
+	req = otx2_mbox_alloc_msg_cgx_mac_addr_set(&pf->mbox);
+	if (!req) {
+		mutex_unlock(&pf->mbox.lock);
+		return -ENOMEM;
+	}
+
+	ether_addr_copy(req->mac_addr, pf->netdev->dev_addr);
+	err = otx2_sync_mbox_msg(&pf->mbox);
+
+	mutex_unlock(&pf->mbox.lock);
+	return err;
+}
+
+int otx2_dmacflt_add(struct otx2_nic *pf, const u8 *mac, u8 bit_pos)
+{
+	u8 *dmacindex;
+
+	/* Store dmacindex returned by CGX/RPM driver which will
+	 * be used for macaddr update/remove
+	 */
+	dmacindex = &pf->flow_cfg->bmap_to_dmacindex[bit_pos];
+
+	if (ether_addr_equal(mac, pf->netdev->dev_addr))
+		return otx2_dmacflt_add_pfmac(pf);
+	else
+		return otx2_dmacflt_do_add(pf, mac, dmacindex);
+}
+
+static int otx2_dmacflt_do_remove(struct otx2_nic *pfvf, const u8 *mac,
+				  u8 dmac_index)
+{
+	struct cgx_mac_addr_del_req *req;
+	int err;
+
+	mutex_lock(&pfvf->mbox.lock);
+	req = otx2_mbox_alloc_msg_cgx_mac_addr_del(&pfvf->mbox);
+	if (!req) {
+		mutex_unlock(&pfvf->mbox.lock);
+		return -ENOMEM;
+	}
+
+	req->index = dmac_index;
+
+	err = otx2_sync_mbox_msg(&pfvf->mbox);
+	mutex_unlock(&pfvf->mbox.lock);
+
+	return err;
+}
+
+static int otx2_dmacflt_remove_pfmac(struct otx2_nic *pf)
+{
+	struct msg_req *req;
+	int err;
+
+	mutex_lock(&pf->mbox.lock);
+	req = otx2_mbox_alloc_msg_cgx_mac_addr_reset(&pf->mbox);
+	if (!req) {
+		mutex_unlock(&pf->mbox.lock);
+		return -ENOMEM;
+	}
+
+	err = otx2_sync_mbox_msg(&pf->mbox);
+
+	mutex_unlock(&pf->mbox.lock);
+	return err;
+}
+
+int otx2_dmacflt_remove(struct otx2_nic *pf, const u8 *mac,
+			u8 bit_pos)
+{
+	u8 dmacindex = pf->flow_cfg->bmap_to_dmacindex[bit_pos];
+
+	if (ether_addr_equal(mac, pf->netdev->dev_addr))
+		return otx2_dmacflt_remove_pfmac(pf);
+	else
+		return otx2_dmacflt_do_remove(pf, mac, dmacindex);
+}
+
+/* CGX/RPM blocks support a maximum of 32 unicast entries.
+ * In a typical configuration the MAC block is associated
+ * with 4 LMACs, so each LMAC gets 8 DMAC entries.
+ */
+int otx2_dmacflt_get_max_cnt(struct otx2_nic *pf)
+{
+	struct cgx_max_dmac_entries_get_rsp *rsp;
+	struct msg_req *msg;
+	int err;
+
+	mutex_lock(&pf->mbox.lock);
+	msg = otx2_mbox_alloc_msg_cgx_mac_max_entries_get(&pf->mbox);
+
+	if (!msg) {
+		mutex_unlock(&pf->mbox.lock);
+		return -ENOMEM;
+	}
+
+	err = otx2_sync_mbox_msg(&pf->mbox);
+	if (err)
+		goto out;
+
+	rsp = (struct cgx_max_dmac_entries_get_rsp *)
+		     otx2_mbox_get_rsp(&pf->mbox.mbox, 0, &msg->hdr);
+	pf->flow_cfg->dmacflt_max_flows = rsp->max_dmac_filters;
+
+out:
+	mutex_unlock(&pf->mbox.lock);
+	return err;
+}
+
+int otx2_dmacflt_update(struct otx2_nic *pf, u8 *mac, u8 bit_pos)
+{
+	struct cgx_mac_addr_update_req *req;
+	int rc;
+
+	mutex_lock(&pf->mbox.lock);
+
+	req = otx2_mbox_alloc_msg_cgx_mac_addr_update(&pf->mbox);
+	/* On allocation failure drop the lock and bail out before using req */
+	if (!req) {
+		mutex_unlock(&pf->mbox.lock);
+		return -ENOMEM;
+	}
+
+	ether_addr_copy(req->mac_addr, mac);
+	req->index = pf->flow_cfg->bmap_to_dmacindex[bit_pos];
+	rc = otx2_sync_mbox_msg(&pf->mbox);
+
+	mutex_unlock(&pf->mbox.lock);
+	return rc;
+}
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_flows.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_flows.c
index 8c97106bdd1c..4d9de525802d 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_flows.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_flows.c
@@ -18,6 +18,12 @@ struct otx2_flow {
 	bool is_vf;
 	u8 rss_ctx_id;
 	int vf;
+	bool dmac_filter;
+};
+
+enum dmac_req {
+	DMAC_ADDR_UPDATE,
+	DMAC_ADDR_DEL
 };
 
 static void otx2_clear_ntuple_flow_info(struct otx2_nic *pfvf, struct otx2_flow_config *flow_cfg)
@@ -219,6 +225,22 @@ int otx2_mcam_flow_init(struct otx2_nic *pf)
 	if (!pf->mac_table)
 		return -ENOMEM;
 
+	otx2_dmacflt_get_max_cnt(pf);
+
+	/* DMAC filters are not allocated */
+	if (!pf->flow_cfg->dmacflt_max_flows)
+		return 0;
+
+	pf->flow_cfg->bmap_to_dmacindex =
+			devm_kzalloc(pf->dev, sizeof(u8) *
+				     pf->flow_cfg->dmacflt_max_flows,
+				     GFP_KERNEL);
+
+	if (!pf->flow_cfg->bmap_to_dmacindex)
+		return -ENOMEM;
+
+	pf->flags |= OTX2_FLAG_DMACFLTR_SUPPORT;
+
 	return 0;
 }
 
@@ -280,6 +302,12 @@ int otx2_add_macfilter(struct net_device *netdev, const u8 *mac)
 {
 	struct otx2_nic *pf = netdev_priv(netdev);
 
+	if (bitmap_weight(&pf->flow_cfg->dmacflt_bmap,
+			  pf->flow_cfg->dmacflt_max_flows))
+		netdev_warn(netdev,
+			    "Add %pM to CGX/RPM DMAC filters list as well\n",
+			    mac);
+
 	return otx2_do_add_macfilter(pf, mac);
 }
 
@@ -351,12 +379,22 @@ static void otx2_add_flow_to_list(struct otx2_nic *pfvf, struct otx2_flow *flow)
 	list_add(&flow->list, head);
 }
 
+static int otx2_get_maxflows(struct otx2_flow_config *flow_cfg)
+{
+	if (flow_cfg->nr_flows == flow_cfg->ntuple_max_flows ||
+	    bitmap_weight(&flow_cfg->dmacflt_bmap,
+			  flow_cfg->dmacflt_max_flows))
+		return flow_cfg->ntuple_max_flows + flow_cfg->dmacflt_max_flows;
+	else
+		return flow_cfg->ntuple_max_flows;
+}
+
 int otx2_get_flow(struct otx2_nic *pfvf, struct ethtool_rxnfc *nfc,
 		  u32 location)
 {
 	struct otx2_flow *iter;
 
-	if (location >= pfvf->flow_cfg->ntuple_max_flows)
+	if (location >= otx2_get_maxflows(pfvf->flow_cfg))
 		return -EINVAL;
 
 	list_for_each_entry(iter, &pfvf->flow_cfg->flow_list, list) {
@@ -378,7 +416,7 @@ int otx2_get_all_flows(struct otx2_nic *pfvf, struct ethtool_rxnfc *nfc,
 	int idx = 0;
 	int err = 0;
 
-	nfc->data = pfvf->flow_cfg->ntuple_max_flows;
+	nfc->data = otx2_get_maxflows(pfvf->flow_cfg);
 	while ((!err || err == -ENOENT) && idx < rule_cnt) {
 		err = otx2_get_flow(pfvf, nfc, location);
 		if (!err)
@@ -760,6 +798,32 @@ int otx2_prepare_flow_request(struct ethtool_rx_flow_spec *fsp,
 	return 0;
 }
 
+static int otx2_is_flow_rule_dmacfilter(struct otx2_nic *pfvf,
+					struct ethtool_rx_flow_spec *fsp)
+{
+	struct ethhdr *eth_mask = &fsp->m_u.ether_spec;
+	struct ethhdr *eth_hdr = &fsp->h_u.ether_spec;
+	u64 ring_cookie = fsp->ring_cookie;
+	u32 flow_type;
+
+	if (!(pfvf->flags & OTX2_FLAG_DMACFLTR_SUPPORT))
+		return false;
+
+	flow_type = fsp->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT | FLOW_RSS);
+
+	/* CGX/RPM DMAC filtering is configured as an allow list, so
+	 * only rules with an action other than DROP qualify.
+	 */
+	if (flow_type == ETHER_FLOW && ring_cookie != RX_CLS_FLOW_DISC &&
+	    !ethtool_get_flow_spec_ring_vf(ring_cookie)) {
+		if (is_zero_ether_addr(eth_mask->h_dest) &&
+		    is_valid_ether_addr(eth_hdr->h_dest))
+			return true;
+	}
+
+	return false;
+}
+
 static int otx2_add_flow_msg(struct otx2_nic *pfvf, struct otx2_flow *flow)
 {
 	u64 ring_cookie = flow->flow_spec.ring_cookie;
@@ -818,14 +882,46 @@ static int otx2_add_flow_msg(struct otx2_nic *pfvf, struct otx2_flow *flow)
 	return err;
 }
 
+static int otx2_add_flow_with_pfmac(struct otx2_nic *pfvf,
+				    struct otx2_flow *flow)
+{
+	struct otx2_flow *pf_mac;
+	struct ethhdr *eth_hdr;
+
+	pf_mac = kzalloc(sizeof(*pf_mac), GFP_KERNEL);
+	if (!pf_mac)
+		return -ENOMEM;
+
+	pf_mac->entry = 0;
+	pf_mac->dmac_filter = true;
+	pf_mac->location = pfvf->flow_cfg->ntuple_max_flows;
+	memcpy(&pf_mac->flow_spec, &flow->flow_spec,
+	       sizeof(struct ethtool_rx_flow_spec));
+	pf_mac->flow_spec.location = pf_mac->location;
+
+	/* Copy PF mac address */
+	eth_hdr = &pf_mac->flow_spec.h_u.ether_spec;
+	ether_addr_copy(eth_hdr->h_dest, pfvf->netdev->dev_addr);
+
+	/* Install DMAC filter with PF mac address */
+	otx2_dmacflt_add(pfvf, eth_hdr->h_dest, 0);
+
+	otx2_add_flow_to_list(pfvf, pf_mac);
+	pfvf->flow_cfg->nr_flows++;
+	set_bit(0, &pfvf->flow_cfg->dmacflt_bmap);
+
+	return 0;
+}
+
 int otx2_add_flow(struct otx2_nic *pfvf, struct ethtool_rxnfc *nfc)
 {
 	struct otx2_flow_config *flow_cfg = pfvf->flow_cfg;
 	struct ethtool_rx_flow_spec *fsp = &nfc->fs;
 	struct otx2_flow *flow;
+	struct ethhdr *eth_hdr;
 	bool new = false;
+	int err = 0;
 	u32 ring;
-	int err;
 
 	ring = ethtool_get_flow_spec_ring(fsp->ring_cookie);
 	if (!(pfvf->flags & OTX2_FLAG_NTUPLE_SUPPORT))
@@ -834,16 +930,15 @@ int otx2_add_flow(struct otx2_nic *pfvf, struct ethtool_rxnfc *nfc)
 	if (ring >= pfvf->hw.rx_queues && fsp->ring_cookie != RX_CLS_FLOW_DISC)
 		return -EINVAL;
 
-	if (fsp->location >= flow_cfg->ntuple_max_flows)
+	if (fsp->location >= otx2_get_maxflows(flow_cfg))
 		return -EINVAL;
 
 	flow = otx2_find_flow(pfvf, fsp->location);
 	if (!flow) {
-		flow = kzalloc(sizeof(*flow), GFP_ATOMIC);
+		flow = kzalloc(sizeof(*flow), GFP_KERNEL);
 		if (!flow)
 			return -ENOMEM;
 		flow->location = fsp->location;
-		flow->entry = flow_cfg->flow_ent[flow->location];
 		new = true;
 	}
 	/* struct copy */
@@ -852,7 +947,54 @@ int otx2_add_flow(struct otx2_nic *pfvf, struct ethtool_rxnfc *nfc)
 	if (fsp->flow_type & FLOW_RSS)
 		flow->rss_ctx_id = nfc->rss_context;
 
-	err = otx2_add_flow_msg(pfvf, flow);
+	if (otx2_is_flow_rule_dmacfilter(pfvf, &flow->flow_spec)) {
+		eth_hdr = &flow->flow_spec.h_u.ether_spec;
+
+		/* Sync dmac filter table with updated fields */
+		if (flow->dmac_filter)
+			return otx2_dmacflt_update(pfvf, eth_hdr->h_dest,
+						   flow->entry);
+
+		if (bitmap_full(&flow_cfg->dmacflt_bmap,
+				flow_cfg->dmacflt_max_flows)) {
+			netdev_warn(pfvf->netdev,
+				    "Can't insert the rule %d as max allowed dmac filters are %d\n",
+				    flow->location +
+				    flow_cfg->dmacflt_max_flows,
+				    flow_cfg->dmacflt_max_flows);
+			err = -EINVAL;
+			if (new)
+				kfree(flow);
+			return err;
+		}
+
+		/* Install PF mac address to DMAC filter list */
+		if (!test_bit(0, &flow_cfg->dmacflt_bmap))
+			otx2_add_flow_with_pfmac(pfvf, flow);
+
+		flow->dmac_filter = true;
+		flow->entry = find_first_zero_bit(&flow_cfg->dmacflt_bmap,
+						  flow_cfg->dmacflt_max_flows);
+		fsp->location = flow_cfg->ntuple_max_flows + flow->entry;
+		flow->flow_spec.location = fsp->location;
+		flow->location = fsp->location;
+
+		set_bit(flow->entry, &flow_cfg->dmacflt_bmap);
+		otx2_dmacflt_add(pfvf, eth_hdr->h_dest, flow->entry);
+
+	} else {
+		if (flow->location >= pfvf->flow_cfg->ntuple_max_flows) {
+			netdev_warn(pfvf->netdev,
+				    "Can't insert non dmac ntuple rule at %d, allowed range %d-0\n",
+				    flow->location,
+				    flow_cfg->ntuple_max_flows - 1);
+			err = -EINVAL;
+		} else {
+			flow->entry = flow_cfg->flow_ent[flow->location];
+			err = otx2_add_flow_msg(pfvf, flow);
+		}
+	}
+
 	if (err) {
 		if (new)
 			kfree(flow);
@@ -890,20 +1032,70 @@ static int otx2_remove_flow_msg(struct otx2_nic *pfvf, u16 entry, bool all)
 	return err;
 }
 
+static void otx2_update_rem_pfmac(struct otx2_nic *pfvf, int req)
+{
+	struct otx2_flow *iter;
+	struct ethhdr *eth_hdr;
+	bool found = false;
+
+	list_for_each_entry(iter, &pfvf->flow_cfg->flow_list, list) {
+		if (iter->dmac_filter && iter->entry == 0) {
+			eth_hdr = &iter->flow_spec.h_u.ether_spec;
+			if (req == DMAC_ADDR_DEL) {
+				otx2_dmacflt_remove(pfvf, eth_hdr->h_dest,
+						    0);
+				clear_bit(0, &pfvf->flow_cfg->dmacflt_bmap);
+				found = true;
+			} else {
+				ether_addr_copy(eth_hdr->h_dest,
+						pfvf->netdev->dev_addr);
+				otx2_dmacflt_update(pfvf, eth_hdr->h_dest, 0);
+			}
+			break;
+		}
+	}
+
+	if (found) {
+		list_del(&iter->list);
+		kfree(iter);
+		pfvf->flow_cfg->nr_flows--;
+	}
+}
+
 int otx2_remove_flow(struct otx2_nic *pfvf, u32 location)
 {
 	struct otx2_flow_config *flow_cfg = pfvf->flow_cfg;
 	struct otx2_flow *flow;
 	int err;
 
-	if (location >= flow_cfg->ntuple_max_flows)
+	if (location >= otx2_get_maxflows(flow_cfg))
 		return -EINVAL;
 
 	flow = otx2_find_flow(pfvf, location);
 	if (!flow)
 		return -ENOENT;
 
-	err = otx2_remove_flow_msg(pfvf, flow->entry, false);
+	if (flow->dmac_filter) {
+		struct ethhdr *eth_hdr = &flow->flow_spec.h_u.ether_spec;
+
+		/* user not allowed to remove dmac filter with interface mac */
+		if (ether_addr_equal(pfvf->netdev->dev_addr, eth_hdr->h_dest))
+			return -EPERM;
+
+		err = otx2_dmacflt_remove(pfvf, eth_hdr->h_dest,
+					  flow->entry);
+		clear_bit(flow->entry, &flow_cfg->dmacflt_bmap);
+		/* If all dmac filters are removed delete macfilter with
+		 * interface mac address and configure CGX/RPM block in
+		 * promiscuous mode
+		 */
+		if (bitmap_weight(&flow_cfg->dmacflt_bmap,
+				  flow_cfg->dmacflt_max_flows) == 1)
+			otx2_update_rem_pfmac(pfvf, DMAC_ADDR_DEL);
+	} else {
+		err = otx2_remove_flow_msg(pfvf, flow->entry, false);
+	}
+
 	if (err)
 		return err;
 
@@ -1100,3 +1292,22 @@ int otx2_enable_rxvlan(struct otx2_nic *pf, bool enable)
 	mutex_unlock(&pf->mbox.lock);
 	return rsp_hdr->rc;
 }
+
+void otx2_dmacflt_reinstall_flows(struct otx2_nic *pf)
+{
+	struct otx2_flow *iter;
+	struct ethhdr *eth_hdr;
+
+	list_for_each_entry(iter, &pf->flow_cfg->flow_list, list) {
+		if (iter->dmac_filter) {
+			eth_hdr = &iter->flow_spec.h_u.ether_spec;
+			otx2_dmacflt_add(pf, eth_hdr->h_dest,
+					 iter->entry);
+		}
+	}
+}
+
+void otx2_dmacflt_update_pfmac_flow(struct otx2_nic *pfvf)
+{
+	otx2_update_rem_pfmac(pfvf, DMAC_ADDR_UPDATE);
+}
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
index 088c28df849d..f300b807a85b 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
@@ -1110,6 +1110,11 @@ static int otx2_cgx_config_loopback(struct otx2_nic *pf, bool enable)
 	struct msg_req *msg;
 	int err;
 
+	if (enable && bitmap_weight(&pf->flow_cfg->dmacflt_bmap,
+				    pf->flow_cfg->dmacflt_max_flows))
+		netdev_warn(pf->netdev,
+			    "CGX/RPM internal loopback might not work as DMAC filters are active\n");
+
 	mutex_lock(&pf->mbox.lock);
 	if (enable)
 		msg = otx2_mbox_alloc_msg_cgx_intlbk_enable(&pf->mbox);
@@ -1644,6 +1649,10 @@ int otx2_open(struct net_device *netdev)
 	/* Restore pause frame settings */
 	otx2_config_pause_frm(pf);
 
+	/* Install DMAC Filters */
+	if (pf->flags & OTX2_FLAG_DMACFLTR_SUPPORT)
+		otx2_dmacflt_reinstall_flows(pf);
+
 	err = otx2_rxtx_enable(pf, true);
 	if (err)
 		goto err_tx_stop_queues;

From 856a5c97268d474282360c8a3cf4f37f6036dbec Mon Sep 17 00:00:00 2001
From: M Chetan Kumar <m.chetan.kumar@linux.intel.com>
Date: Thu, 1 Jul 2021 20:37:06 +0530
Subject: [PATCH 035/158] net: wwan: iosm: fix uevent reporting

Change the uevent environment variable name to IOSM_EVENT and correct
the reporting format to a key=value pair.

Signed-off-by: M Chetan Kumar <m.chetan.kumar@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/wwan/iosm/iosm_ipc_uevent.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wwan/iosm/iosm_ipc_uevent.c b/drivers/net/wwan/iosm/iosm_ipc_uevent.c
index 2229d752926c..d12188ffed7e 100644
--- a/drivers/net/wwan/iosm/iosm_ipc_uevent.c
+++ b/drivers/net/wwan/iosm/iosm_ipc_uevent.c
@@ -37,7 +37,7 @@ void ipc_uevent_send(struct device *dev, char *uevent)
 
 	/* Store the device and event information */
 	info->dev = dev;
-	snprintf(info->uevent, MAX_UEVENT_LEN, "%s: %s", dev_name(dev), uevent);
+	snprintf(info->uevent, MAX_UEVENT_LEN, "IOSM_EVENT=%s", uevent);
 
 	/* Schedule uevent in process context using work queue */
 	schedule_work(&info->work);

From 3bcfc0a2d3199d0a83d47ed67ad57a1c2f5a16d7 Mon Sep 17 00:00:00 2001
From: M Chetan Kumar <m.chetan.kumar@linux.intel.com>
Date: Thu, 1 Jul 2021 20:37:31 +0530
Subject: [PATCH 036/158] net: wwan: iosm: remove redundant check

Remove the redundant IP session ID check, since the required checks
are already in place in the caller.

Signed-off-by: M Chetan Kumar <m.chetan.kumar@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/wwan/iosm/iosm_ipc_imem_ops.c | 19 +++----------------
 1 file changed, 3 insertions(+), 16 deletions(-)

diff --git a/drivers/net/wwan/iosm/iosm_ipc_imem_ops.c b/drivers/net/wwan/iosm/iosm_ipc_imem_ops.c
index 46f76e8aae92..e4e9461b603e 100644
--- a/drivers/net/wwan/iosm/iosm_ipc_imem_ops.c
+++ b/drivers/net/wwan/iosm/iosm_ipc_imem_ops.c
@@ -24,15 +24,7 @@ int ipc_imem_sys_wwan_open(struct iosm_imem *ipc_imem, int if_id)
 		return -EIO;
 	}
 
-	/* check for the interafce id
-	 * if if_id 1 to 8 then create IP MUX channel sessions.
-	 * To start MUX session from 0 as network interface id would start
-	 * from 1 so map it to if_id = if_id - 1
-	 */
-	if (if_id >= IP_MUX_SESSION_START && if_id <= IP_MUX_SESSION_END)
-		return ipc_mux_open_session(ipc_imem->mux, if_id - 1);
-
-	return -EINVAL;
+	return ipc_mux_open_session(ipc_imem->mux, if_id - 1);
 }
 
 /* Release a net link to CP. */
@@ -83,13 +75,8 @@ int ipc_imem_sys_wwan_transmit(struct iosm_imem *ipc_imem,
 		goto out;
 	}
 
-	if (if_id >= IP_MUX_SESSION_START && if_id <= IP_MUX_SESSION_END)
-		/* Route the UL packet through IP MUX Layer */
-		ret = ipc_mux_ul_trigger_encode(ipc_imem->mux,
-						if_id - 1, skb);
-	else
-		dev_err(ipc_imem->dev,
-			"invalid if_id %d: ", if_id);
+	/* Route the UL packet through IP MUX Layer */
+	ret = ipc_mux_ul_trigger_encode(ipc_imem->mux, if_id - 1, skb);
 out:
 	return ret;
 }

From 5bb4eea0c5f5b9383a543293966bdf20e54988aa Mon Sep 17 00:00:00 2001
From: M Chetan Kumar <m.chetan.kumar@linux.intel.com>
Date: Thu, 1 Jul 2021 20:37:45 +0530
Subject: [PATCH 037/158] net: wwan: iosm: correct link-id handling

Keep the link ID identical to the MBIM session ID.
Ex: link ID 0 should be associated with MBIM session ID 0.

Reported-by: Loic Poulain <loic.poulain@linaro.org>
Signed-off-by: M Chetan Kumar <m.chetan.kumar@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/wwan/iosm/iosm_ipc_imem_ops.c  | 6 +++---
 drivers/net/wwan/iosm/iosm_ipc_imem_ops.h  | 6 +++---
 drivers/net/wwan/iosm/iosm_ipc_mux_codec.c | 2 +-
 drivers/net/wwan/iosm/iosm_ipc_wwan.c      | 4 ++--
 4 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/drivers/net/wwan/iosm/iosm_ipc_imem_ops.c b/drivers/net/wwan/iosm/iosm_ipc_imem_ops.c
index e4e9461b603e..0a472ce77370 100644
--- a/drivers/net/wwan/iosm/iosm_ipc_imem_ops.c
+++ b/drivers/net/wwan/iosm/iosm_ipc_imem_ops.c
@@ -24,7 +24,7 @@ int ipc_imem_sys_wwan_open(struct iosm_imem *ipc_imem, int if_id)
 		return -EIO;
 	}
 
-	return ipc_mux_open_session(ipc_imem->mux, if_id - 1);
+	return ipc_mux_open_session(ipc_imem->mux, if_id);
 }
 
 /* Release a net link to CP. */
@@ -33,7 +33,7 @@ void ipc_imem_sys_wwan_close(struct iosm_imem *ipc_imem, int if_id,
 {
 	if (ipc_imem->mux && if_id >= IP_MUX_SESSION_START &&
 	    if_id <= IP_MUX_SESSION_END)
-		ipc_mux_close_session(ipc_imem->mux, if_id - 1);
+		ipc_mux_close_session(ipc_imem->mux, if_id);
 }
 
 /* Tasklet call to do uplink transfer. */
@@ -76,7 +76,7 @@ int ipc_imem_sys_wwan_transmit(struct iosm_imem *ipc_imem,
 	}
 
 	/* Route the UL packet through IP MUX Layer */
-	ret = ipc_mux_ul_trigger_encode(ipc_imem->mux, if_id - 1, skb);
+	ret = ipc_mux_ul_trigger_encode(ipc_imem->mux, if_id, skb);
 out:
 	return ret;
 }
diff --git a/drivers/net/wwan/iosm/iosm_ipc_imem_ops.h b/drivers/net/wwan/iosm/iosm_ipc_imem_ops.h
index fd356dafbdd6..2007fe23e9a5 100644
--- a/drivers/net/wwan/iosm/iosm_ipc_imem_ops.h
+++ b/drivers/net/wwan/iosm/iosm_ipc_imem_ops.h
@@ -27,11 +27,11 @@
 #define BOOT_CHECK_DEFAULT_TIMEOUT 400
 
 /* IP MUX channel range */
-#define IP_MUX_SESSION_START 1
-#define IP_MUX_SESSION_END 8
+#define IP_MUX_SESSION_START 0
+#define IP_MUX_SESSION_END 7
 
 /* Default IP MUX channel */
-#define IP_MUX_SESSION_DEFAULT	1
+#define IP_MUX_SESSION_DEFAULT	0
 
 /**
  * ipc_imem_sys_port_open - Open a port link to CP.
diff --git a/drivers/net/wwan/iosm/iosm_ipc_mux_codec.c b/drivers/net/wwan/iosm/iosm_ipc_mux_codec.c
index e634ffc6ec08..562de275797a 100644
--- a/drivers/net/wwan/iosm/iosm_ipc_mux_codec.c
+++ b/drivers/net/wwan/iosm/iosm_ipc_mux_codec.c
@@ -288,7 +288,7 @@ static int ipc_mux_net_receive(struct iosm_mux *ipc_mux, int if_id,
 	/* Pass the packet to the netif layer. */
 	dest_skb->priority = service_class;
 
-	return ipc_wwan_receive(wwan, dest_skb, false, if_id + 1);
+	return ipc_wwan_receive(wwan, dest_skb, false, if_id);
 }
 
 /* Decode Flow Credit Table in the block */
diff --git a/drivers/net/wwan/iosm/iosm_ipc_wwan.c b/drivers/net/wwan/iosm/iosm_ipc_wwan.c
index c999c64001f4..84e37c4b0f74 100644
--- a/drivers/net/wwan/iosm/iosm_ipc_wwan.c
+++ b/drivers/net/wwan/iosm/iosm_ipc_wwan.c
@@ -252,8 +252,8 @@ int ipc_wwan_receive(struct iosm_wwan *ipc_wwan, struct sk_buff *skb_arg,
 
 	skb->pkt_type = PACKET_HOST;
 
-	if (if_id < (IP_MUX_SESSION_START - 1) ||
-	    if_id > (IP_MUX_SESSION_END - 1)) {
+	if (if_id < IP_MUX_SESSION_START ||
+	    if_id > IP_MUX_SESSION_END) {
 		ret = -EINVAL;
 		goto free;
 	}

From c302e3a1c86f78421d58ef564ba22519b0b039c0 Mon Sep 17 00:00:00 2001
From: M Chetan Kumar <m.chetan.kumar@linux.intel.com>
Date: Thu, 1 Jul 2021 20:39:17 +0530
Subject: [PATCH 038/158] net: wwan: iosm: fix netdev tx stats

Update tx stats when a packet is successfully consumed and when it is dropped.

Signed-off-by: M Chetan Kumar <m.chetan.kumar@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/wwan/iosm/iosm_ipc_wwan.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wwan/iosm/iosm_ipc_wwan.c b/drivers/net/wwan/iosm/iosm_ipc_wwan.c
index 84e37c4b0f74..e0c19c59c5f6 100644
--- a/drivers/net/wwan/iosm/iosm_ipc_wwan.c
+++ b/drivers/net/wwan/iosm/iosm_ipc_wwan.c
@@ -107,6 +107,7 @@ static int ipc_wwan_link_transmit(struct sk_buff *skb,
 {
 	struct iosm_netdev_priv *priv = wwan_netdev_drvpriv(netdev);
 	struct iosm_wwan *ipc_wwan = priv->ipc_wwan;
+	unsigned int len = skb->len;
 	int if_id = priv->if_id;
 	int ret;
 
@@ -123,6 +124,8 @@ static int ipc_wwan_link_transmit(struct sk_buff *skb,
 
 	/* Return code of zero is success */
 	if (ret == 0) {
+		netdev->stats.tx_packets++;
+		netdev->stats.tx_bytes += len;
 		ret = NETDEV_TX_OK;
 	} else if (ret == -EBUSY) {
 		ret = NETDEV_TX_BUSY;
@@ -140,7 +143,8 @@ exit:
 			ret);
 
 	dev_kfree_skb_any(skb);
-	return ret;
+	netdev->stats.tx_dropped++;
+	return NETDEV_TX_OK;
 }
 
 /* Ops structure for wwan net link */

From d7340f46beae05227f5f4a1c8cb18e81e0c3fe0e Mon Sep 17 00:00:00 2001
From: M Chetan Kumar <m.chetan.kumar@linux.intel.com>
Date: Thu, 1 Jul 2021 20:39:34 +0530
Subject: [PATCH 039/158] net: wwan: iosm: set default mtu

Set netdev default mtu size to 1500.

Signed-off-by: M Chetan Kumar <m.chetan.kumar@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/wwan/iosm/iosm_ipc_wwan.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/wwan/iosm/iosm_ipc_wwan.c b/drivers/net/wwan/iosm/iosm_ipc_wwan.c
index e0c19c59c5f6..b2357ad5d517 100644
--- a/drivers/net/wwan/iosm/iosm_ipc_wwan.c
+++ b/drivers/net/wwan/iosm/iosm_ipc_wwan.c
@@ -162,6 +162,7 @@ static void ipc_wwan_setup(struct net_device *iosm_dev)
 	iosm_dev->priv_flags |= IFF_NO_QUEUE;
 
 	iosm_dev->type = ARPHRD_NONE;
+	iosm_dev->mtu = ETH_DATA_LEN;
 	iosm_dev->min_mtu = ETH_MIN_MTU;
 	iosm_dev->max_mtu = ETH_MAX_MTU;
 

From 5d43f951b1ac797450bb4d230fdc960b739bea04 Mon Sep 17 00:00:00 2001
From: Yangbo Lu <yangbo.lu@nxp.com>
Date: Wed, 30 Jun 2021 16:11:52 +0800
Subject: [PATCH 040/158] ptp: add ptp virtual clock driver framework

This patch adds a ptp virtual clock driver framework
utilizing timecounter/cyclecounter.

The patch just exports two essential APIs for the PTP driver.

- ptp_vclock_register()
- ptp_vclock_unregister()

Signed-off-by: Yangbo Lu <yangbo.lu@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/ptp/Makefile             |   2 +-
 drivers/ptp/ptp_private.h        |  15 ++++
 drivers/ptp/ptp_vclock.c         | 150 +++++++++++++++++++++++++++++++
 include/linux/ptp_clock_kernel.h |   4 +-
 4 files changed, 169 insertions(+), 2 deletions(-)
 create mode 100644 drivers/ptp/ptp_vclock.c

diff --git a/drivers/ptp/Makefile b/drivers/ptp/Makefile
index 8673d1743faa..28a6fe342d3e 100644
--- a/drivers/ptp/Makefile
+++ b/drivers/ptp/Makefile
@@ -3,7 +3,7 @@
 # Makefile for PTP 1588 clock support.
 #
 
-ptp-y					:= ptp_clock.o ptp_chardev.o ptp_sysfs.o
+ptp-y					:= ptp_clock.o ptp_chardev.o ptp_sysfs.o ptp_vclock.o
 ptp_kvm-$(CONFIG_X86)			:= ptp_kvm_x86.o ptp_kvm_common.o
 ptp_kvm-$(CONFIG_HAVE_ARM_SMCCC)	:= ptp_kvm_arm.o ptp_kvm_common.o
 obj-$(CONFIG_PTP_1588_CLOCK)		+= ptp.o
diff --git a/drivers/ptp/ptp_private.h b/drivers/ptp/ptp_private.h
index 6b97155148f1..853b79b6b30e 100644
--- a/drivers/ptp/ptp_private.h
+++ b/drivers/ptp/ptp_private.h
@@ -48,6 +48,19 @@ struct ptp_clock {
 	struct kthread_delayed_work aux_work;
 };
 
+#define info_to_vclock(d) container_of((d), struct ptp_vclock, info)
+#define cc_to_vclock(d) container_of((d), struct ptp_vclock, cc)
+#define dw_to_vclock(d) container_of((d), struct ptp_vclock, refresh_work)
+
+struct ptp_vclock {
+	struct ptp_clock *pclock;
+	struct ptp_clock_info info;
+	struct ptp_clock *clock;
+	struct cyclecounter cc;
+	struct timecounter tc;
+	spinlock_t lock;	/* protects tc/cc */
+};
+
 /*
  * The function queue_cnt() is safe for readers to call without
  * holding q->lock. Readers use this function to verify that the queue
@@ -89,4 +102,6 @@ extern const struct attribute_group *ptp_groups[];
 int ptp_populate_pin_groups(struct ptp_clock *ptp);
 void ptp_cleanup_pin_groups(struct ptp_clock *ptp);
 
+struct ptp_vclock *ptp_vclock_register(struct ptp_clock *pclock);
+void ptp_vclock_unregister(struct ptp_vclock *vclock);
 #endif
diff --git a/drivers/ptp/ptp_vclock.c b/drivers/ptp/ptp_vclock.c
new file mode 100644
index 000000000000..fc9205cc504d
--- /dev/null
+++ b/drivers/ptp/ptp_vclock.c
@@ -0,0 +1,150 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * PTP virtual clock driver
+ *
+ * Copyright 2021 NXP
+ */
+#include <linux/slab.h>
+#include "ptp_private.h"
+
+#define PTP_VCLOCK_CC_SHIFT		31
+#define PTP_VCLOCK_CC_MULT		(1 << PTP_VCLOCK_CC_SHIFT)
+#define PTP_VCLOCK_FADJ_SHIFT		9
+#define PTP_VCLOCK_FADJ_DENOMINATOR	15625ULL
+#define PTP_VCLOCK_REFRESH_INTERVAL	(HZ * 2)
+
+static int ptp_vclock_adjfine(struct ptp_clock_info *ptp, long scaled_ppm)
+{
+	struct ptp_vclock *vclock = info_to_vclock(ptp);
+	unsigned long flags;
+	s64 adj;
+
+	adj = (s64)scaled_ppm << PTP_VCLOCK_FADJ_SHIFT;
+	adj = div_s64(adj, PTP_VCLOCK_FADJ_DENOMINATOR);
+
+	spin_lock_irqsave(&vclock->lock, flags);
+	timecounter_read(&vclock->tc);
+	vclock->cc.mult = PTP_VCLOCK_CC_MULT + adj;
+	spin_unlock_irqrestore(&vclock->lock, flags);
+
+	return 0;
+}
+
+static int ptp_vclock_adjtime(struct ptp_clock_info *ptp, s64 delta)
+{
+	struct ptp_vclock *vclock = info_to_vclock(ptp);
+	unsigned long flags;
+
+	spin_lock_irqsave(&vclock->lock, flags);
+	timecounter_adjtime(&vclock->tc, delta);
+	spin_unlock_irqrestore(&vclock->lock, flags);
+
+	return 0;
+}
+
+static int ptp_vclock_gettime(struct ptp_clock_info *ptp,
+			      struct timespec64 *ts)
+{
+	struct ptp_vclock *vclock = info_to_vclock(ptp);
+	unsigned long flags;
+	u64 ns;
+
+	spin_lock_irqsave(&vclock->lock, flags);
+	ns = timecounter_read(&vclock->tc);
+	spin_unlock_irqrestore(&vclock->lock, flags);
+	*ts = ns_to_timespec64(ns);
+
+	return 0;
+}
+
+static int ptp_vclock_settime(struct ptp_clock_info *ptp,
+			      const struct timespec64 *ts)
+{
+	struct ptp_vclock *vclock = info_to_vclock(ptp);
+	u64 ns = timespec64_to_ns(ts);
+	unsigned long flags;
+
+	spin_lock_irqsave(&vclock->lock, flags);
+	timecounter_init(&vclock->tc, &vclock->cc, ns);
+	spin_unlock_irqrestore(&vclock->lock, flags);
+
+	return 0;
+}
+
+static long ptp_vclock_refresh(struct ptp_clock_info *ptp)
+{
+	struct ptp_vclock *vclock = info_to_vclock(ptp);
+	struct timespec64 ts;
+
+	ptp_vclock_gettime(&vclock->info, &ts);
+
+	return PTP_VCLOCK_REFRESH_INTERVAL;
+}
+
+static const struct ptp_clock_info ptp_vclock_info = {
+	.owner		= THIS_MODULE,
+	.name		= "ptp virtual clock",
+	/* The maximum ppb value that long scaled_ppm can support */
+	.max_adj	= 32767999,
+	.adjfine	= ptp_vclock_adjfine,
+	.adjtime	= ptp_vclock_adjtime,
+	.gettime64	= ptp_vclock_gettime,
+	.settime64	= ptp_vclock_settime,
+	.do_aux_work	= ptp_vclock_refresh,
+};
+
+static u64 ptp_vclock_read(const struct cyclecounter *cc)
+{
+	struct ptp_vclock *vclock = cc_to_vclock(cc);
+	struct ptp_clock *ptp = vclock->pclock;
+	struct timespec64 ts = {};
+
+	if (ptp->info->gettimex64)
+		ptp->info->gettimex64(ptp->info, &ts, NULL);
+	else
+		ptp->info->gettime64(ptp->info, &ts);
+
+	return timespec64_to_ns(&ts);
+}
+
+static const struct cyclecounter ptp_vclock_cc = {
+	.read	= ptp_vclock_read,
+	.mask	= CYCLECOUNTER_MASK(32),
+	.mult	= PTP_VCLOCK_CC_MULT,
+	.shift	= PTP_VCLOCK_CC_SHIFT,
+};
+
+struct ptp_vclock *ptp_vclock_register(struct ptp_clock *pclock)
+{
+	struct ptp_vclock *vclock;
+
+	vclock = kzalloc(sizeof(*vclock), GFP_KERNEL);
+	if (!vclock)
+		return NULL;
+
+	vclock->pclock = pclock;
+	vclock->info = ptp_vclock_info;
+	vclock->cc = ptp_vclock_cc;
+
+	snprintf(vclock->info.name, PTP_CLOCK_NAME_LEN, "ptp%d_virt",
+		 pclock->index);
+
+	spin_lock_init(&vclock->lock);
+
+	vclock->clock = ptp_clock_register(&vclock->info, &pclock->dev);
+	if (IS_ERR_OR_NULL(vclock->clock)) {
+		kfree(vclock);
+		return NULL;
+	}
+
+	timecounter_init(&vclock->tc, &vclock->cc, 0);
+	ptp_schedule_worker(vclock->clock, PTP_VCLOCK_REFRESH_INTERVAL);
+
+	return vclock;
+}
+
+void ptp_vclock_unregister(struct ptp_vclock *vclock)
+{
+	ptp_clock_unregister(vclock->clock);
+	kfree(vclock);
+}
diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h
index aba237c0b3a2..b6fb771ee524 100644
--- a/include/linux/ptp_clock_kernel.h
+++ b/include/linux/ptp_clock_kernel.h
@@ -11,7 +11,9 @@
 #include <linux/device.h>
 #include <linux/pps_kernel.h>
 #include <linux/ptp_clock.h>
+#include <linux/timecounter.h>
 
+#define PTP_CLOCK_NAME_LEN	32
 /**
  * struct ptp_clock_request - request PTP clock event
  *
@@ -134,7 +136,7 @@ struct ptp_system_timestamp {
 
 struct ptp_clock_info {
 	struct module *owner;
-	char name[16];
+	char name[PTP_CLOCK_NAME_LEN];
 	s32 max_adj;
 	int n_alarm;
 	int n_ext_ts;

From 73f37068d540eba5f93ba3a0019bf479d35ebd76 Mon Sep 17 00:00:00 2001
From: Yangbo Lu <yangbo.lu@nxp.com>
Date: Wed, 30 Jun 2021 16:11:53 +0800
Subject: [PATCH 041/158] ptp: support ptp physical/virtual clocks conversion

Support ptp physical/virtual clocks conversion via sysfs.
There will be a new attribute n_vclocks under ptp physical
clock sysfs.

- By default, the value is 0, meaning only the ptp physical clock
  is in use.
- Setting the value creates the corresponding number of ptp
  virtual clocks. The physical clock is then guaranteed to stay
  free running.
- Setting the value back to 0 deletes the virtual clocks and
  switches back to using the physical clock again.

Another new attribute, max_vclocks, controls the maximum number of
ptp vclocks.

Signed-off-by: Yangbo Lu <yangbo.lu@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/ABI/testing/sysfs-ptp |  20 ++++
 drivers/ptp/ptp_clock.c             |  26 ++++++
 drivers/ptp/ptp_private.h           |  21 +++++
 drivers/ptp/ptp_sysfs.c             | 138 ++++++++++++++++++++++++++++
 4 files changed, 205 insertions(+)

diff --git a/Documentation/ABI/testing/sysfs-ptp b/Documentation/ABI/testing/sysfs-ptp
index 2363ad810ddb..d378f57c1b73 100644
--- a/Documentation/ABI/testing/sysfs-ptp
+++ b/Documentation/ABI/testing/sysfs-ptp
@@ -33,6 +33,13 @@ Description:
 		frequency adjustment value (a positive integer) in
 		parts per billion.
 
+What:		/sys/class/ptp/ptpN/max_vclocks
+Date:		May 2021
+Contact:	Yangbo Lu <yangbo.lu@nxp.com>
+Description:
+		This file contains the maximum number of ptp vclocks.
+		Write integer to re-configure it.
+
 What:		/sys/class/ptp/ptpN/n_alarms
 Date:		September 2010
 Contact:	Richard Cochran <richardcochran@gmail.com>
@@ -61,6 +68,19 @@ Description:
 		This file contains the number of programmable pins
 		offered by the PTP hardware clock.
 
+What:		/sys/class/ptp/ptpN/n_vclocks
+Date:		May 2021
+Contact:	Yangbo Lu <yangbo.lu@nxp.com>
+Description:
+		This file contains the number of virtual PTP clocks in
+		use.  By default, the value is 0 meaning that only the
+		physical clock is in use.  Setting the value creates
+		the corresponding number of virtual clocks and causes
+		the physical clock to become free running.  Setting the
+		value back to 0 deletes the virtual clocks and
+		switches the physical clock back to normal, adjustable
+		operation.
+
 What:		/sys/class/ptp/ptpN/pins
 Date:		March 2014
 Contact:	Richard Cochran <richardcochran@gmail.com>
diff --git a/drivers/ptp/ptp_clock.c b/drivers/ptp/ptp_clock.c
index a23a37a4d5dc..7334f478dde7 100644
--- a/drivers/ptp/ptp_clock.c
+++ b/drivers/ptp/ptp_clock.c
@@ -76,6 +76,11 @@ static int ptp_clock_settime(struct posix_clock *pc, const struct timespec64 *tp
 {
 	struct ptp_clock *ptp = container_of(pc, struct ptp_clock, clock);
 
+	if (ptp_vclock_in_use(ptp)) {
+		pr_err("ptp: virtual clock in use\n");
+		return -EBUSY;
+	}
+
 	return  ptp->info->settime64(ptp->info, tp);
 }
 
@@ -97,6 +102,11 @@ static int ptp_clock_adjtime(struct posix_clock *pc, struct __kernel_timex *tx)
 	struct ptp_clock_info *ops;
 	int err = -EOPNOTSUPP;
 
+	if (ptp_vclock_in_use(ptp)) {
+		pr_err("ptp: virtual clock in use\n");
+		return -EBUSY;
+	}
+
 	ops = ptp->info;
 
 	if (tx->modes & ADJ_SETOFFSET) {
@@ -161,6 +171,7 @@ static void ptp_clock_release(struct device *dev)
 	ptp_cleanup_pin_groups(ptp);
 	mutex_destroy(&ptp->tsevq_mux);
 	mutex_destroy(&ptp->pincfg_mux);
+	mutex_destroy(&ptp->n_vclocks_mux);
 	ida_simple_remove(&ptp_clocks_map, ptp->index);
 	kfree(ptp);
 }
@@ -208,6 +219,7 @@ struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info,
 	spin_lock_init(&ptp->tsevq.lock);
 	mutex_init(&ptp->tsevq_mux);
 	mutex_init(&ptp->pincfg_mux);
+	mutex_init(&ptp->n_vclocks_mux);
 	init_waitqueue_head(&ptp->tsev_wq);
 
 	if (ptp->info->do_aux_work) {
@@ -221,6 +233,14 @@ struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info,
 		ptp->pps_source->lookup_cookie = ptp;
 	}
 
+	/* PTP virtual clock is being registered under physical clock */
+	if (parent->class && parent->class->name &&
+	    strcmp(parent->class->name, "ptp") == 0)
+		ptp->is_virtual_clock = true;
+
+	if (!ptp->is_virtual_clock)
+		ptp->max_vclocks = PTP_DEFAULT_MAX_VCLOCKS;
+
 	err = ptp_populate_pin_groups(ptp);
 	if (err)
 		goto no_pin_groups;
@@ -270,6 +290,7 @@ no_pin_groups:
 kworker_err:
 	mutex_destroy(&ptp->tsevq_mux);
 	mutex_destroy(&ptp->pincfg_mux);
+	mutex_destroy(&ptp->n_vclocks_mux);
 	ida_simple_remove(&ptp_clocks_map, index);
 no_slot:
 	kfree(ptp);
@@ -280,6 +301,11 @@ EXPORT_SYMBOL(ptp_clock_register);
 
 int ptp_clock_unregister(struct ptp_clock *ptp)
 {
+	if (ptp_vclock_in_use(ptp)) {
+		pr_err("ptp: virtual clock in use\n");
+		return -EBUSY;
+	}
+
 	ptp->defunct = 1;
 	wake_up_interruptible(&ptp->tsev_wq);
 
diff --git a/drivers/ptp/ptp_private.h b/drivers/ptp/ptp_private.h
index 853b79b6b30e..87cb55953b69 100644
--- a/drivers/ptp/ptp_private.h
+++ b/drivers/ptp/ptp_private.h
@@ -18,6 +18,7 @@
 
 #define PTP_MAX_TIMESTAMPS 128
 #define PTP_BUF_TIMESTAMPS 30
+#define PTP_DEFAULT_MAX_VCLOCKS 20
 
 struct timestamp_event_queue {
 	struct ptp_extts_event buf[PTP_MAX_TIMESTAMPS];
@@ -46,6 +47,10 @@ struct ptp_clock {
 	const struct attribute_group *pin_attr_groups[2];
 	struct kthread_worker *kworker;
 	struct kthread_delayed_work aux_work;
+	unsigned int max_vclocks;
+	unsigned int n_vclocks;
+	struct mutex n_vclocks_mux; /* protect concurrent n_vclocks access */
+	bool is_virtual_clock;
 };
 
 #define info_to_vclock(d) container_of((d), struct ptp_vclock, info)
@@ -74,6 +79,22 @@ static inline int queue_cnt(struct timestamp_event_queue *q)
 	return cnt < 0 ? PTP_MAX_TIMESTAMPS + cnt : cnt;
 }
 
+/* Check if ptp virtual clock is in use */
+static inline bool ptp_vclock_in_use(struct ptp_clock *ptp)
+{
+	bool in_use = false;
+
+	if (mutex_lock_interruptible(&ptp->n_vclocks_mux))
+		return true;
+
+	if (!ptp->is_virtual_clock && ptp->n_vclocks)
+		in_use = true;
+
+	mutex_unlock(&ptp->n_vclocks_mux);
+
+	return in_use;
+}
+
 /*
  * see ptp_chardev.c
  */
diff --git a/drivers/ptp/ptp_sysfs.c b/drivers/ptp/ptp_sysfs.c
index be076a91e20e..0b05041783a5 100644
--- a/drivers/ptp/ptp_sysfs.c
+++ b/drivers/ptp/ptp_sysfs.c
@@ -3,6 +3,7 @@
  * PTP 1588 clock support - sysfs interface.
  *
  * Copyright (C) 2010 OMICRON electronics GmbH
+ * Copyright 2021 NXP
  */
 #include <linux/capability.h>
 #include <linux/slab.h>
@@ -148,6 +149,137 @@ out:
 }
 static DEVICE_ATTR(pps_enable, 0220, NULL, pps_enable_store);
 
+static int unregister_vclock(struct device *dev, void *data)
+{
+	struct ptp_clock *ptp = dev_get_drvdata(dev);
+	struct ptp_clock_info *info = ptp->info;
+	struct ptp_vclock *vclock;
+	u8 *num = data;
+
+	vclock = info_to_vclock(info);
+	dev_info(dev->parent, "delete virtual clock ptp%d\n",
+		 vclock->clock->index);
+
+	ptp_vclock_unregister(vclock);
+	(*num)--;
+
+	/* For break. Not error. */
+	if (*num == 0)
+		return -EINVAL;
+
+	return 0;
+}
+
+static ssize_t n_vclocks_show(struct device *dev,
+			      struct device_attribute *attr, char *page)
+{
+	struct ptp_clock *ptp = dev_get_drvdata(dev);
+	ssize_t size;
+
+	if (mutex_lock_interruptible(&ptp->n_vclocks_mux))
+		return -ERESTARTSYS;
+
+	size = snprintf(page, PAGE_SIZE - 1, "%d\n", ptp->n_vclocks);
+
+	mutex_unlock(&ptp->n_vclocks_mux);
+
+	return size;
+}
+
+static ssize_t n_vclocks_store(struct device *dev,
+			       struct device_attribute *attr,
+			       const char *buf, size_t count)
+{
+	struct ptp_clock *ptp = dev_get_drvdata(dev);
+	struct ptp_vclock *vclock;
+	int err = -EINVAL;
+	u32 num, i;
+
+	if (kstrtou32(buf, 0, &num))
+		return err;
+
+	if (mutex_lock_interruptible(&ptp->n_vclocks_mux))
+		return -ERESTARTSYS;
+
+	if (num > ptp->max_vclocks) {
+		dev_err(dev, "max value is %d\n", ptp->max_vclocks);
+		goto out;
+	}
+
+	/* Need to create more vclocks */
+	if (num > ptp->n_vclocks) {
+		for (i = 0; i < num - ptp->n_vclocks; i++) {
+			vclock = ptp_vclock_register(ptp);
+			if (!vclock)
+				goto out;
+
+			dev_info(dev, "new virtual clock ptp%d\n",
+				 vclock->clock->index);
+		}
+	}
+
+	/* Need to delete vclocks */
+	if (num < ptp->n_vclocks) {
+		i = ptp->n_vclocks - num;
+		device_for_each_child_reverse(dev, &i,
+					      unregister_vclock);
+	}
+
+	if (num == 0)
+		dev_info(dev, "only physical clock in use now\n");
+	else
+		dev_info(dev, "guarantee physical clock free running\n");
+
+	ptp->n_vclocks = num;
+	mutex_unlock(&ptp->n_vclocks_mux);
+
+	return count;
+out:
+	mutex_unlock(&ptp->n_vclocks_mux);
+	return err;
+}
+static DEVICE_ATTR_RW(n_vclocks);
+
+static ssize_t max_vclocks_show(struct device *dev,
+				struct device_attribute *attr, char *page)
+{
+	struct ptp_clock *ptp = dev_get_drvdata(dev);
+	ssize_t size;
+
+	size = snprintf(page, PAGE_SIZE - 1, "%d\n", ptp->max_vclocks);
+
+	return size;
+}
+
+static ssize_t max_vclocks_store(struct device *dev,
+				 struct device_attribute *attr,
+				 const char *buf, size_t count)
+{
+	struct ptp_clock *ptp = dev_get_drvdata(dev);
+	u32 max;
+
+	if (kstrtou32(buf, 0, &max) || max == 0)
+		return -EINVAL;
+
+	if (max == ptp->max_vclocks)
+		return count;
+
+	if (mutex_lock_interruptible(&ptp->n_vclocks_mux))
+		return -ERESTARTSYS;
+
+	if (max < ptp->n_vclocks) {
+		mutex_unlock(&ptp->n_vclocks_mux);
+		return -EINVAL;
+	}
+
+	ptp->max_vclocks = max;
+
+	mutex_unlock(&ptp->n_vclocks_mux);
+
+	return count;
+}
+static DEVICE_ATTR_RW(max_vclocks);
+
 static struct attribute *ptp_attrs[] = {
 	&dev_attr_clock_name.attr,
 
@@ -162,6 +294,8 @@ static struct attribute *ptp_attrs[] = {
 	&dev_attr_fifo.attr,
 	&dev_attr_period.attr,
 	&dev_attr_pps_enable.attr,
+	&dev_attr_n_vclocks.attr,
+	&dev_attr_max_vclocks.attr,
 	NULL
 };
 
@@ -183,6 +317,10 @@ static umode_t ptp_is_attribute_visible(struct kobject *kobj,
 	} else if (attr == &dev_attr_pps_enable.attr) {
 		if (!info->pps)
 			mode = 0;
+	} else if (attr == &dev_attr_n_vclocks.attr ||
+		   attr == &dev_attr_max_vclocks.attr) {
+		if (ptp->is_virtual_clock)
+			mode = 0;
 	}
 
 	return mode;

From 44c494c8e30e35713c7d11ca3c5ab332cbfabacf Mon Sep 17 00:00:00 2001
From: Yangbo Lu <yangbo.lu@nxp.com>
Date: Wed, 30 Jun 2021 16:11:54 +0800
Subject: [PATCH 042/158] ptp: track available ptp vclocks information

Track information about the available ptp vclocks. Record the index
values of available ptp vclocks during registration and unregistration.

This is preparation for supporting ptp vclocks info query
through ethtool.

Signed-off-by: Yangbo Lu <yangbo.lu@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/ptp/ptp_clock.c   | 15 ++++++++++++++-
 drivers/ptp/ptp_private.h |  1 +
 drivers/ptp/ptp_sysfs.c   | 28 +++++++++++++++++++++++++---
 3 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/drivers/ptp/ptp_clock.c b/drivers/ptp/ptp_clock.c
index 7334f478dde7..9205a9362a9d 100644
--- a/drivers/ptp/ptp_clock.c
+++ b/drivers/ptp/ptp_clock.c
@@ -196,6 +196,7 @@ struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info,
 {
 	struct ptp_clock *ptp;
 	int err = 0, index, major = MAJOR(ptp_devt);
+	size_t size;
 
 	if (info->n_alarm > PTP_MAX_ALARMS)
 		return ERR_PTR(-EINVAL);
@@ -238,9 +239,17 @@ struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info,
 	    strcmp(parent->class->name, "ptp") == 0)
 		ptp->is_virtual_clock = true;
 
-	if (!ptp->is_virtual_clock)
+	if (!ptp->is_virtual_clock) {
 		ptp->max_vclocks = PTP_DEFAULT_MAX_VCLOCKS;
 
+		size = sizeof(int) * ptp->max_vclocks;
+		ptp->vclock_index = kzalloc(size, GFP_KERNEL);
+		if (!ptp->vclock_index) {
+			err = -ENOMEM;
+			goto no_mem_for_vclocks;
+		}
+	}
+
 	err = ptp_populate_pin_groups(ptp);
 	if (err)
 		goto no_pin_groups;
@@ -285,6 +294,8 @@ no_clock:
 no_pps:
 	ptp_cleanup_pin_groups(ptp);
 no_pin_groups:
+	kfree(ptp->vclock_index);
+no_mem_for_vclocks:
 	if (ptp->kworker)
 		kthread_destroy_worker(ptp->kworker);
 kworker_err:
@@ -309,6 +320,8 @@ int ptp_clock_unregister(struct ptp_clock *ptp)
 	ptp->defunct = 1;
 	wake_up_interruptible(&ptp->tsev_wq);
 
+	kfree(ptp->vclock_index);
+
 	if (ptp->kworker) {
 		kthread_cancel_delayed_work_sync(&ptp->aux_work);
 		kthread_destroy_worker(ptp->kworker);
diff --git a/drivers/ptp/ptp_private.h b/drivers/ptp/ptp_private.h
index 87cb55953b69..f75fadd9b244 100644
--- a/drivers/ptp/ptp_private.h
+++ b/drivers/ptp/ptp_private.h
@@ -49,6 +49,7 @@ struct ptp_clock {
 	struct kthread_delayed_work aux_work;
 	unsigned int max_vclocks;
 	unsigned int n_vclocks;
+	int *vclock_index;
 	struct mutex n_vclocks_mux; /* protect concurrent n_vclocks access */
 	bool is_virtual_clock;
 };
diff --git a/drivers/ptp/ptp_sysfs.c b/drivers/ptp/ptp_sysfs.c
index 0b05041783a5..6a36590ca77a 100644
--- a/drivers/ptp/ptp_sysfs.c
+++ b/drivers/ptp/ptp_sysfs.c
@@ -213,6 +213,9 @@ static ssize_t n_vclocks_store(struct device *dev,
 			if (!vclock)
 				goto out;
 
+			*(ptp->vclock_index + ptp->n_vclocks + i) =
+				vclock->clock->index;
+
 			dev_info(dev, "new virtual clock ptp%d\n",
 				 vclock->clock->index);
 		}
@@ -223,6 +226,9 @@ static ssize_t n_vclocks_store(struct device *dev,
 		i = ptp->n_vclocks - num;
 		device_for_each_child_reverse(dev, &i,
 					      unregister_vclock);
+
+		for (i = 1; i <= ptp->n_vclocks - num; i++)
+			*(ptp->vclock_index + ptp->n_vclocks - i) = -1;
 	}
 
 	if (num == 0)
@@ -256,6 +262,9 @@ static ssize_t max_vclocks_store(struct device *dev,
 				 const char *buf, size_t count)
 {
 	struct ptp_clock *ptp = dev_get_drvdata(dev);
+	unsigned int *vclock_index;
+	int err = -EINVAL;
+	size_t size;
 	u32 max;
 
 	if (kstrtou32(buf, 0, &max) || max == 0)
@@ -267,16 +276,29 @@ static ssize_t max_vclocks_store(struct device *dev,
 	if (mutex_lock_interruptible(&ptp->n_vclocks_mux))
 		return -ERESTARTSYS;
 
-	if (max < ptp->n_vclocks) {
-		mutex_unlock(&ptp->n_vclocks_mux);
-		return -EINVAL;
+	if (max < ptp->n_vclocks)
+		goto out;
+
+	size = sizeof(int) * max;
+	vclock_index = kzalloc(size, GFP_KERNEL);
+	if (!vclock_index) {
+		err = -ENOMEM;
+		goto out;
 	}
 
+	size = sizeof(int) * ptp->n_vclocks;
+	memcpy(vclock_index, ptp->vclock_index, size);
+
+	kfree(ptp->vclock_index);
+	ptp->vclock_index = vclock_index;
 	ptp->max_vclocks = max;
 
 	mutex_unlock(&ptp->n_vclocks_mux);
 
 	return count;
+out:
+	mutex_unlock(&ptp->n_vclocks_mux);
+	return err;
 }
 static DEVICE_ATTR_RW(max_vclocks);
 

From acb288e8047b7569fbc9af6fa6e9405315345103 Mon Sep 17 00:00:00 2001
From: Yangbo Lu <yangbo.lu@nxp.com>
Date: Wed, 30 Jun 2021 16:11:55 +0800
Subject: [PATCH 043/158] ptp: add kernel API ptp_get_vclocks_index()

Add kernel API ptp_get_vclocks_index() to get the indexes of
all ptp vclocks on a pclock.

This is preparation for supporting ptp vclocks info query
through ethtool.

Signed-off-by: Yangbo Lu <yangbo.lu@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/ptp/ptp_clock.c          |  3 ++-
 drivers/ptp/ptp_private.h        |  2 ++
 drivers/ptp/ptp_vclock.c         | 35 ++++++++++++++++++++++++++++++++
 include/linux/ptp_clock_kernel.h | 14 +++++++++++++
 4 files changed, 53 insertions(+), 1 deletion(-)

diff --git a/drivers/ptp/ptp_clock.c b/drivers/ptp/ptp_clock.c
index 9205a9362a9d..f012fa581cf4 100644
--- a/drivers/ptp/ptp_clock.c
+++ b/drivers/ptp/ptp_clock.c
@@ -24,10 +24,11 @@
 #define PTP_PPS_EVENT PPS_CAPTUREASSERT
 #define PTP_PPS_MODE (PTP_PPS_DEFAULTS | PPS_CANWAIT | PPS_TSFMT_TSPEC)
 
+struct class *ptp_class;
+
 /* private globals */
 
 static dev_t ptp_devt;
-static struct class *ptp_class;
 
 static DEFINE_IDA(ptp_clocks_map);
 
diff --git a/drivers/ptp/ptp_private.h b/drivers/ptp/ptp_private.h
index f75fadd9b244..dba6be477067 100644
--- a/drivers/ptp/ptp_private.h
+++ b/drivers/ptp/ptp_private.h
@@ -96,6 +96,8 @@ static inline bool ptp_vclock_in_use(struct ptp_clock *ptp)
 	return in_use;
 }
 
+extern struct class *ptp_class;
+
 /*
  * see ptp_chardev.c
  */
diff --git a/drivers/ptp/ptp_vclock.c b/drivers/ptp/ptp_vclock.c
index fc9205cc504d..cefab29a0592 100644
--- a/drivers/ptp/ptp_vclock.c
+++ b/drivers/ptp/ptp_vclock.c
@@ -148,3 +148,38 @@ void ptp_vclock_unregister(struct ptp_vclock *vclock)
 	ptp_clock_unregister(vclock->clock);
 	kfree(vclock);
 }
+
+int ptp_get_vclocks_index(int pclock_index, int **vclock_index)
+{
+	char name[PTP_CLOCK_NAME_LEN] = "";
+	struct ptp_clock *ptp;
+	struct device *dev;
+	int num = 0;
+
+	if (pclock_index < 0)
+		return num;
+
+	snprintf(name, PTP_CLOCK_NAME_LEN, "ptp%d", pclock_index);
+	dev = class_find_device_by_name(ptp_class, name);
+	if (!dev)
+		return num;
+
+	ptp = dev_get_drvdata(dev);
+
+	if (mutex_lock_interruptible(&ptp->n_vclocks_mux)) {
+		put_device(dev);
+		return num;
+	}
+
+	*vclock_index = kzalloc(sizeof(int) * ptp->n_vclocks, GFP_KERNEL);
+	if (!(*vclock_index))
+		goto out;
+
+	memcpy(*vclock_index, ptp->vclock_index, sizeof(int) * ptp->n_vclocks);
+	num = ptp->n_vclocks;
+out:
+	mutex_unlock(&ptp->n_vclocks_mux);
+	put_device(dev);
+	return num;
+}
+EXPORT_SYMBOL(ptp_get_vclocks_index);
diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h
index b6fb771ee524..300a984fec87 100644
--- a/include/linux/ptp_clock_kernel.h
+++ b/include/linux/ptp_clock_kernel.h
@@ -306,6 +306,18 @@ int ptp_schedule_worker(struct ptp_clock *ptp, unsigned long delay);
  */
 void ptp_cancel_worker_sync(struct ptp_clock *ptp);
 
+/**
+ * ptp_get_vclocks_index() - get all vclocks index on pclock, and
+ *                           caller is responsible to free memory
+ *                           of vclock_index
+ *
+ * @pclock_index: phc index of ptp pclock.
+ * @vclock_index: pointer to pointer of vclock index.
+ *
+ * return number of vclocks.
+ */
+int ptp_get_vclocks_index(int pclock_index, int **vclock_index);
+
 #else
 static inline struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info,
 						   struct device *parent)
@@ -325,6 +337,8 @@ static inline int ptp_schedule_worker(struct ptp_clock *ptp,
 { return -EOPNOTSUPP; }
 static inline void ptp_cancel_worker_sync(struct ptp_clock *ptp)
 { }
+static inline int ptp_get_vclocks_index(int pclock_index, int **vclock_index)
+{ return 0; }
 
 #endif
 

From c156174a67070042d51d2c866146d3c934d5468c Mon Sep 17 00:00:00 2001
From: Yangbo Lu <yangbo.lu@nxp.com>
Date: Wed, 30 Jun 2021 16:11:56 +0800
Subject: [PATCH 044/158] ethtool: add a new command for getting PHC virtual
 clocks

Add an interface for getting PHC (PTP Hardware Clock)
virtual clocks, which are based on the PHC physical clock
that provides hardware timestamps for network packets.

Signed-off-by: Yangbo Lu <yangbo.lu@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ethtool-netlink.rst | 22 +++++
 include/linux/ethtool.h                      | 10 +++
 include/uapi/linux/ethtool_netlink.h         | 15 ++++
 net/ethtool/Makefile                         |  2 +-
 net/ethtool/common.c                         | 13 +++
 net/ethtool/netlink.c                        | 10 +++
 net/ethtool/netlink.h                        |  2 +
 net/ethtool/phc_vclocks.c                    | 94 ++++++++++++++++++++
 8 files changed, 167 insertions(+), 1 deletion(-)
 create mode 100644 net/ethtool/phc_vclocks.c

diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst
index 6ea91e41593f..c86628e6a235 100644
--- a/Documentation/networking/ethtool-netlink.rst
+++ b/Documentation/networking/ethtool-netlink.rst
@@ -212,6 +212,7 @@ Userspace to kernel:
   ``ETHTOOL_MSG_FEC_SET``               set FEC settings
   ``ETHTOOL_MSG_MODULE_EEPROM_GET``     read SFP module EEPROM
   ``ETHTOOL_MSG_STATS_GET``             get standard statistics
+  ``ETHTOOL_MSG_PHC_VCLOCKS_GET``       get PHC virtual clocks info
   ===================================== ================================
 
 Kernel to userspace:
@@ -250,6 +251,7 @@ Kernel to userspace:
   ``ETHTOOL_MSG_FEC_NTF``                  FEC settings
   ``ETHTOOL_MSG_MODULE_EEPROM_GET_REPLY``  read SFP module EEPROM
   ``ETHTOOL_MSG_STATS_GET_REPLY``          standard statistics
+  ``ETHTOOL_MSG_PHC_VCLOCKS_GET_REPLY``    PHC virtual clocks info
   ======================================== =================================
 
 ``GET`` requests are sent by userspace applications to retrieve device
@@ -1477,6 +1479,25 @@ Low and high bounds are inclusive, for example:
  etherStatsPkts512to1023Octets 512  1023
  ============================= ==== ====
 
+PHC_VCLOCKS_GET
+===============
+
+Query device PHC virtual clocks information.
+
+Request contents:
+
+  ====================================  ======  ==========================
+  ``ETHTOOL_A_PHC_VCLOCKS_HEADER``      nested  request header
+  ====================================  ======  ==========================
+
+Kernel response contents:
+
+  ====================================  ======  ==========================
+  ``ETHTOOL_A_PHC_VCLOCKS_HEADER``      nested  reply header
+  ``ETHTOOL_A_PHC_VCLOCKS_NUM``         u32     PHC virtual clocks number
+  ``ETHTOOL_A_PHC_VCLOCKS_INDEX``       s32     PHC index array
+  ====================================  ======  ==========================
+
 Request translation
 ===================
 
@@ -1575,4 +1596,5 @@ are netlink only.
   n/a                                 ``ETHTOOL_MSG_CABLE_TEST_ACT``
   n/a                                 ``ETHTOOL_MSG_CABLE_TEST_TDR_ACT``
   n/a                                 ``ETHTOOL_MSG_TUNNEL_INFO_GET``
+  n/a                                 ``ETHTOOL_MSG_PHC_VCLOCKS_GET``
   =================================== =====================================
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 29dbb603bc91..232daaec56e4 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -757,6 +757,16 @@ void
 ethtool_params_from_link_mode(struct ethtool_link_ksettings *link_ksettings,
 			      enum ethtool_link_mode_bit_indices link_mode);
 
+/**
+ * ethtool_get_phc_vclocks - Derive phc vclocks information, and caller
+ *                           is responsible to free memory of vclock_index
+ * @dev: pointer to net_device structure
+ * @vclock_index: pointer to pointer of vclock index
+ *
+ * Return number of phc vclocks
+ */
+int ethtool_get_phc_vclocks(struct net_device *dev, int **vclock_index);
+
 /**
  * ethtool_sprintf - Write formatted string to ethtool string data
  * @data: Pointer to start of string to update
diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h
index c7135c9c37a5..b3b93710eff7 100644
--- a/include/uapi/linux/ethtool_netlink.h
+++ b/include/uapi/linux/ethtool_netlink.h
@@ -46,6 +46,7 @@ enum {
 	ETHTOOL_MSG_FEC_SET,
 	ETHTOOL_MSG_MODULE_EEPROM_GET,
 	ETHTOOL_MSG_STATS_GET,
+	ETHTOOL_MSG_PHC_VCLOCKS_GET,
 
 	/* add new constants above here */
 	__ETHTOOL_MSG_USER_CNT,
@@ -88,6 +89,7 @@ enum {
 	ETHTOOL_MSG_FEC_NTF,
 	ETHTOOL_MSG_MODULE_EEPROM_GET_REPLY,
 	ETHTOOL_MSG_STATS_GET_REPLY,
+	ETHTOOL_MSG_PHC_VCLOCKS_GET_REPLY,
 
 	/* add new constants above here */
 	__ETHTOOL_MSG_KERNEL_CNT,
@@ -440,6 +442,19 @@ enum {
 	ETHTOOL_A_TSINFO_MAX = (__ETHTOOL_A_TSINFO_CNT - 1)
 };
 
+/* PHC VCLOCKS */
+
+enum {
+	ETHTOOL_A_PHC_VCLOCKS_UNSPEC,
+	ETHTOOL_A_PHC_VCLOCKS_HEADER,			/* nest - _A_HEADER_* */
+	ETHTOOL_A_PHC_VCLOCKS_NUM,			/* u32 */
+	ETHTOOL_A_PHC_VCLOCKS_INDEX,			/* array, s32 */
+
+	/* add new constants above here */
+	__ETHTOOL_A_PHC_VCLOCKS_CNT,
+	ETHTOOL_A_PHC_VCLOCKS_MAX = (__ETHTOOL_A_PHC_VCLOCKS_CNT - 1)
+};
+
 /* CABLE TEST */
 
 enum {
diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile
index 723c9a8a8cdf..0a19470efbfb 100644
--- a/net/ethtool/Makefile
+++ b/net/ethtool/Makefile
@@ -7,4 +7,4 @@ obj-$(CONFIG_ETHTOOL_NETLINK)	+= ethtool_nl.o
 ethtool_nl-y	:= netlink.o bitset.o strset.o linkinfo.o linkmodes.o \
 		   linkstate.o debug.o wol.o features.o privflags.o rings.o \
 		   channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \
-		   tunnels.o fec.o eeprom.o stats.o
+		   tunnels.o fec.o eeprom.o stats.o phc_vclocks.o
diff --git a/net/ethtool/common.c b/net/ethtool/common.c
index f9dcbad84788..798231b07676 100644
--- a/net/ethtool/common.c
+++ b/net/ethtool/common.c
@@ -4,6 +4,7 @@
 #include <linux/net_tstamp.h>
 #include <linux/phy.h>
 #include <linux/rtnetlink.h>
+#include <linux/ptp_clock_kernel.h>
 
 #include "common.h"
 
@@ -554,6 +555,18 @@ int __ethtool_get_ts_info(struct net_device *dev, struct ethtool_ts_info *info)
 	return 0;
 }
 
+int ethtool_get_phc_vclocks(struct net_device *dev, int **vclock_index)
+{
+	struct ethtool_ts_info info = { };
+	int num = 0;
+
+	if (!__ethtool_get_ts_info(dev, &info))
+		num = ptp_get_vclocks_index(info.phc_index, vclock_index);
+
+	return num;
+}
+EXPORT_SYMBOL(ethtool_get_phc_vclocks);
+
 const struct ethtool_phy_ops *ethtool_phy_ops;
 
 void ethtool_set_ethtool_phy_ops(const struct ethtool_phy_ops *ops)
diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c
index a7346346114f..73e0f5b626bf 100644
--- a/net/ethtool/netlink.c
+++ b/net/ethtool/netlink.c
@@ -248,6 +248,7 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = {
 	[ETHTOOL_MSG_TSINFO_GET]	= &ethnl_tsinfo_request_ops,
 	[ETHTOOL_MSG_MODULE_EEPROM_GET]	= &ethnl_module_eeprom_request_ops,
 	[ETHTOOL_MSG_STATS_GET]		= &ethnl_stats_request_ops,
+	[ETHTOOL_MSG_PHC_VCLOCKS_GET]	= &ethnl_phc_vclocks_request_ops,
 };
 
 static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb)
@@ -958,6 +959,15 @@ static const struct genl_ops ethtool_genl_ops[] = {
 		.policy = ethnl_stats_get_policy,
 		.maxattr = ARRAY_SIZE(ethnl_stats_get_policy) - 1,
 	},
+	{
+		.cmd	= ETHTOOL_MSG_PHC_VCLOCKS_GET,
+		.doit	= ethnl_default_doit,
+		.start	= ethnl_default_start,
+		.dumpit	= ethnl_default_dumpit,
+		.done	= ethnl_default_done,
+		.policy = ethnl_phc_vclocks_get_policy,
+		.maxattr = ARRAY_SIZE(ethnl_phc_vclocks_get_policy) - 1,
+	},
 };
 
 static const struct genl_multicast_group ethtool_nl_mcgrps[] = {
diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h
index 3e25a47fd482..3fc395c86702 100644
--- a/net/ethtool/netlink.h
+++ b/net/ethtool/netlink.h
@@ -347,6 +347,7 @@ extern const struct ethnl_request_ops ethnl_tsinfo_request_ops;
 extern const struct ethnl_request_ops ethnl_fec_request_ops;
 extern const struct ethnl_request_ops ethnl_module_eeprom_request_ops;
 extern const struct ethnl_request_ops ethnl_stats_request_ops;
+extern const struct ethnl_request_ops ethnl_phc_vclocks_request_ops;
 
 extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1];
 extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1];
@@ -382,6 +383,7 @@ extern const struct nla_policy ethnl_fec_get_policy[ETHTOOL_A_FEC_HEADER + 1];
 extern const struct nla_policy ethnl_fec_set_policy[ETHTOOL_A_FEC_AUTO + 1];
 extern const struct nla_policy ethnl_module_eeprom_get_policy[ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS + 1];
 extern const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_GROUPS + 1];
+extern const struct nla_policy ethnl_phc_vclocks_get_policy[ETHTOOL_A_PHC_VCLOCKS_HEADER + 1];
 
 int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info);
 int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info);
diff --git a/net/ethtool/phc_vclocks.c b/net/ethtool/phc_vclocks.c
new file mode 100644
index 000000000000..637b2f5297d5
--- /dev/null
+++ b/net/ethtool/phc_vclocks.c
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2021 NXP
+ */
+#include "netlink.h"
+#include "common.h"
+
+struct phc_vclocks_req_info {
+	struct ethnl_req_info		base;
+};
+
+struct phc_vclocks_reply_data {
+	struct ethnl_reply_data		base;
+	int				num;
+	int				*index;
+};
+
+#define PHC_VCLOCKS_REPDATA(__reply_base) \
+	container_of(__reply_base, struct phc_vclocks_reply_data, base)
+
+const struct nla_policy ethnl_phc_vclocks_get_policy[] = {
+	[ETHTOOL_A_PHC_VCLOCKS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy),
+};
+
+static int phc_vclocks_prepare_data(const struct ethnl_req_info *req_base,
+				    struct ethnl_reply_data *reply_base,
+				    struct genl_info *info)
+{
+	struct phc_vclocks_reply_data *data = PHC_VCLOCKS_REPDATA(reply_base);
+	struct net_device *dev = reply_base->dev;
+	int ret;
+
+	ret = ethnl_ops_begin(dev);
+	if (ret < 0)
+		return ret;
+	data->num = ethtool_get_phc_vclocks(dev, &data->index);
+	ethnl_ops_complete(dev);
+
+	return ret;
+}
+
+static int phc_vclocks_reply_size(const struct ethnl_req_info *req_base,
+				  const struct ethnl_reply_data *reply_base)
+{
+	const struct phc_vclocks_reply_data *data =
+		PHC_VCLOCKS_REPDATA(reply_base);
+	int len = 0;
+
+	if (data->num > 0) {
+		len += nla_total_size(sizeof(u32));
+		len += nla_total_size(sizeof(s32) * data->num);
+	}
+
+	return len;
+}
+
+static int phc_vclocks_fill_reply(struct sk_buff *skb,
+				  const struct ethnl_req_info *req_base,
+				  const struct ethnl_reply_data *reply_base)
+{
+	const struct phc_vclocks_reply_data *data =
+		PHC_VCLOCKS_REPDATA(reply_base);
+
+	if (data->num <= 0)
+		return 0;
+
+	if (nla_put_u32(skb, ETHTOOL_A_PHC_VCLOCKS_NUM, data->num) ||
+	    nla_put(skb, ETHTOOL_A_PHC_VCLOCKS_INDEX,
+		    sizeof(s32) * data->num, data->index))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+static void phc_vclocks_cleanup_data(struct ethnl_reply_data *reply_base)
+{
+	const struct phc_vclocks_reply_data *data =
+		PHC_VCLOCKS_REPDATA(reply_base);
+
+	kfree(data->index);
+}
+
+const struct ethnl_request_ops ethnl_phc_vclocks_request_ops = {
+	.request_cmd		= ETHTOOL_MSG_PHC_VCLOCKS_GET,
+	.reply_cmd		= ETHTOOL_MSG_PHC_VCLOCKS_GET_REPLY,
+	.hdr_attr		= ETHTOOL_A_PHC_VCLOCKS_HEADER,
+	.req_info_size		= sizeof(struct phc_vclocks_req_info),
+	.reply_data_size	= sizeof(struct phc_vclocks_reply_data),
+
+	.prepare_data		= phc_vclocks_prepare_data,
+	.reply_size		= phc_vclocks_reply_size,
+	.fill_reply		= phc_vclocks_fill_reply,
+	.cleanup_data		= phc_vclocks_cleanup_data,
+};

From 895487a3a10fb3a177e20dcde875515d46ccd4df Mon Sep 17 00:00:00 2001
From: Yangbo Lu <yangbo.lu@nxp.com>
Date: Wed, 30 Jun 2021 16:11:57 +0800
Subject: [PATCH 045/158] ptp: add kernel API ptp_convert_timestamp()

Add kernel API ptp_convert_timestamp() to convert raw hardware timestamp
to a specified ptp vclock time.

Signed-off-by: Yangbo Lu <yangbo.lu@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/ptp/ptp_vclock.c         | 34 ++++++++++++++++++++++++++++++++
 include/linux/ptp_clock_kernel.h | 13 ++++++++++++
 2 files changed, 47 insertions(+)

diff --git a/drivers/ptp/ptp_vclock.c b/drivers/ptp/ptp_vclock.c
index cefab29a0592..e0f87c57749a 100644
--- a/drivers/ptp/ptp_vclock.c
+++ b/drivers/ptp/ptp_vclock.c
@@ -183,3 +183,37 @@ out:
 	return num;
 }
 EXPORT_SYMBOL(ptp_get_vclocks_index);
+
+void ptp_convert_timestamp(struct skb_shared_hwtstamps *hwtstamps,
+			   int vclock_index)
+{
+	char name[PTP_CLOCK_NAME_LEN] = "";
+	struct ptp_vclock *vclock;
+	struct ptp_clock *ptp;
+	unsigned long flags;
+	struct device *dev;
+	u64 ns;
+
+	snprintf(name, PTP_CLOCK_NAME_LEN, "ptp%d", vclock_index);
+	dev = class_find_device_by_name(ptp_class, name);
+	if (!dev)
+		return;
+
+	ptp = dev_get_drvdata(dev);
+	if (!ptp->is_virtual_clock) {
+		put_device(dev);
+		return;
+	}
+
+	vclock = info_to_vclock(ptp->info);
+
+	ns = ktime_to_ns(hwtstamps->hwtstamp);
+
+	spin_lock_irqsave(&vclock->lock, flags);
+	ns = timecounter_cyc2time(&vclock->tc, ns);
+	spin_unlock_irqrestore(&vclock->lock, flags);
+
+	put_device(dev);
+	hwtstamps->hwtstamp = ns_to_ktime(ns);
+}
+EXPORT_SYMBOL(ptp_convert_timestamp);
diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h
index 300a984fec87..71fac9237725 100644
--- a/include/linux/ptp_clock_kernel.h
+++ b/include/linux/ptp_clock_kernel.h
@@ -12,6 +12,7 @@
 #include <linux/pps_kernel.h>
 #include <linux/ptp_clock.h>
 #include <linux/timecounter.h>
+#include <linux/skbuff.h>
 
 #define PTP_CLOCK_NAME_LEN	32
 /**
@@ -318,6 +319,15 @@ void ptp_cancel_worker_sync(struct ptp_clock *ptp);
  */
 int ptp_get_vclocks_index(int pclock_index, int **vclock_index);
 
+/**
+ * ptp_convert_timestamp() - convert timestamp to a ptp vclock time
+ *
+ * @hwtstamps:    skb_shared_hwtstamps structure pointer
+ * @vclock_index: phc index of ptp vclock.
+ */
+void ptp_convert_timestamp(struct skb_shared_hwtstamps *hwtstamps,
+			   int vclock_index);
+
 #else
 static inline struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info,
 						   struct device *parent)
@@ -339,6 +349,9 @@ static inline void ptp_cancel_worker_sync(struct ptp_clock *ptp)
 { }
 static inline int ptp_get_vclocks_index(int pclock_index, int **vclock_index)
 { return 0; }
+static inline void ptp_convert_timestamp(struct skb_shared_hwtstamps *hwtstamps,
+					 int vclock_index)
+{ }
 
 #endif
 

From 6c9a0a0f2333b1e3c29fef47a8b12131fce4905b Mon Sep 17 00:00:00 2001
From: Yangbo Lu <yangbo.lu@nxp.com>
Date: Wed, 30 Jun 2021 16:11:58 +0800
Subject: [PATCH 046/158] mptcp: setsockopt: convert to
 mptcp_setsockopt_sol_socket_timestamping()

Split timestamping handling into a new function
mptcp_setsockopt_sol_socket_timestamping().
This is preparation for extending SO_TIMESTAMPING
for PHC binding, since optval will no longer be
integer.

Signed-off-by: Yangbo Lu <yangbo.lu@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/mptcp/sockopt.c | 57 +++++++++++++++++++++++++++++++--------------
 1 file changed, 40 insertions(+), 17 deletions(-)

diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index 092d1f635d27..ea38cbcd2ad4 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -157,19 +157,7 @@ static int mptcp_setsockopt_sol_socket_tstamp(struct mptcp_sock *msk, int optnam
 		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
 		bool slow = lock_sock_fast(ssk);
 
-		switch (optname) {
-		case SO_TIMESTAMP_OLD:
-		case SO_TIMESTAMP_NEW:
-		case SO_TIMESTAMPNS_OLD:
-		case SO_TIMESTAMPNS_NEW:
-			sock_set_timestamp(sk, optname, !!val);
-			break;
-		case SO_TIMESTAMPING_NEW:
-		case SO_TIMESTAMPING_OLD:
-			sock_set_timestamping(sk, optname, val);
-			break;
-		}
-
+		sock_set_timestamp(sk, optname, !!val);
 		unlock_sock_fast(ssk, slow);
 	}
 
@@ -178,7 +166,8 @@ static int mptcp_setsockopt_sol_socket_tstamp(struct mptcp_sock *msk, int optnam
 }
 
 static int mptcp_setsockopt_sol_socket_int(struct mptcp_sock *msk, int optname,
-					   sockptr_t optval, unsigned int optlen)
+					   sockptr_t optval,
+					   unsigned int optlen)
 {
 	int val, ret;
 
@@ -205,14 +194,45 @@ static int mptcp_setsockopt_sol_socket_int(struct mptcp_sock *msk, int optname,
 	case SO_TIMESTAMP_NEW:
 	case SO_TIMESTAMPNS_OLD:
 	case SO_TIMESTAMPNS_NEW:
-	case SO_TIMESTAMPING_OLD:
-	case SO_TIMESTAMPING_NEW:
 		return mptcp_setsockopt_sol_socket_tstamp(msk, optname, val);
 	}
 
 	return -ENOPROTOOPT;
 }
 
+static int mptcp_setsockopt_sol_socket_timestamping(struct mptcp_sock *msk,
+						    int optname,
+						    sockptr_t optval,
+						    unsigned int optlen)
+{
+	struct mptcp_subflow_context *subflow;
+	struct sock *sk = (struct sock *)msk;
+	int val, ret;
+
+	ret = mptcp_get_int_option(msk, optval, optlen, &val);
+	if (ret)
+		return ret;
+
+	ret = sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname,
+			      KERNEL_SOCKPTR(&val), sizeof(val));
+	if (ret)
+		return ret;
+
+	lock_sock(sk);
+
+	mptcp_for_each_subflow(msk, subflow) {
+		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+		bool slow = lock_sock_fast(ssk);
+
+		sock_set_timestamping(sk, optname, val);
+		unlock_sock_fast(ssk, slow);
+	}
+
+	release_sock(sk);
+
+	return 0;
+}
+
 static int mptcp_setsockopt_sol_socket_linger(struct mptcp_sock *msk, sockptr_t optval,
 					      unsigned int optlen)
 {
@@ -299,9 +319,12 @@ static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname,
 	case SO_TIMESTAMP_NEW:
 	case SO_TIMESTAMPNS_OLD:
 	case SO_TIMESTAMPNS_NEW:
+		return mptcp_setsockopt_sol_socket_int(msk, optname, optval,
+						       optlen);
 	case SO_TIMESTAMPING_OLD:
 	case SO_TIMESTAMPING_NEW:
-		return mptcp_setsockopt_sol_socket_int(msk, optname, optval, optlen);
+		return mptcp_setsockopt_sol_socket_timestamping(msk, optname,
+								optval, optlen);
 	case SO_LINGER:
 		return mptcp_setsockopt_sol_socket_linger(msk, optval, optlen);
 	case SO_RCVLOWAT:

From d463126e23f112629edb01594141ca437a92a108 Mon Sep 17 00:00:00 2001
From: Yangbo Lu <yangbo.lu@nxp.com>
Date: Wed, 30 Jun 2021 16:11:59 +0800
Subject: [PATCH 047/158] net: sock: extend SO_TIMESTAMPING for PHC binding

Since PTP virtual clock support is added, there can be
several PTP virtual clocks based on one PTP physical
clock for timestamping.

This patch extends the SO_TIMESTAMPING API to support
PHC (PTP Hardware Clock) binding by adding a new flag,
SOF_TIMESTAMPING_BIND_PHC. When PTP virtual clocks are
in use, user space can bind one of them to the socket for
timestamping. Binding the PTP physical clock is not
supported, and is not needed.

This patch is preparation for timestamp conversion from
raw timestamp to a specific PTP virtual clock time in
core net.

Signed-off-by: Yangbo Lu <yangbo.lu@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h              |  8 +++-
 include/uapi/linux/net_tstamp.h | 17 ++++++++-
 net/core/sock.c                 | 65 +++++++++++++++++++++++++++++++--
 net/ethtool/common.c            |  1 +
 net/mptcp/sockopt.c             | 23 +++++++++---
 5 files changed, 101 insertions(+), 13 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 8bdd80027ffb..f23cb259b0e2 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -316,7 +316,9 @@ struct bpf_local_storage;
   *	@sk_timer: sock cleanup timer
   *	@sk_stamp: time stamp of last packet received
   *	@sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only
-  *	@sk_tsflags: SO_TIMESTAMPING socket options
+  *	@sk_tsflags: SO_TIMESTAMPING flags
+  *	@sk_bind_phc: SO_TIMESTAMPING bind PHC index of PTP virtual clock
+  *	              for timestamping
   *	@sk_tskey: counter to disambiguate concurrent tstamp requests
   *	@sk_zckey: counter to order MSG_ZEROCOPY notifications
   *	@sk_socket: Identd and reporting IO signals
@@ -493,6 +495,7 @@ struct sock {
 	seqlock_t		sk_stamp_seq;
 #endif
 	u16			sk_tsflags;
+	int			sk_bind_phc;
 	u8			sk_shutdown;
 	u32			sk_tskey;
 	atomic_t		sk_zckey;
@@ -2755,7 +2758,8 @@ void sock_def_readable(struct sock *sk);
 
 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk);
 void sock_set_timestamp(struct sock *sk, int optname, bool valbool);
-int sock_set_timestamping(struct sock *sk, int optname, int val);
+int sock_set_timestamping(struct sock *sk, int optname,
+			  struct so_timestamping timestamping);
 
 void sock_enable_timestamps(struct sock *sk);
 void sock_no_linger(struct sock *sk);
diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h
index 7ed0b3d1c00a..fcc61c73a666 100644
--- a/include/uapi/linux/net_tstamp.h
+++ b/include/uapi/linux/net_tstamp.h
@@ -13,7 +13,7 @@
 #include <linux/types.h>
 #include <linux/socket.h>   /* for SO_TIMESTAMPING */
 
-/* SO_TIMESTAMPING gets an integer bit field comprised of these values */
+/* SO_TIMESTAMPING flags */
 enum {
 	SOF_TIMESTAMPING_TX_HARDWARE = (1<<0),
 	SOF_TIMESTAMPING_TX_SOFTWARE = (1<<1),
@@ -30,8 +30,9 @@ enum {
 	SOF_TIMESTAMPING_OPT_STATS = (1<<12),
 	SOF_TIMESTAMPING_OPT_PKTINFO = (1<<13),
 	SOF_TIMESTAMPING_OPT_TX_SWHW = (1<<14),
+	SOF_TIMESTAMPING_BIND_PHC = (1 << 15),
 
-	SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_TX_SWHW,
+	SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_BIND_PHC,
 	SOF_TIMESTAMPING_MASK = (SOF_TIMESTAMPING_LAST - 1) |
 				 SOF_TIMESTAMPING_LAST
 };
@@ -46,6 +47,18 @@ enum {
 					 SOF_TIMESTAMPING_TX_SCHED | \
 					 SOF_TIMESTAMPING_TX_ACK)
 
+/**
+ * struct so_timestamping - SO_TIMESTAMPING parameter
+ *
+ * @flags:	SO_TIMESTAMPING flags
+ * @bind_phc:	Index of PTP virtual clock bound to sock. This is available
+ *		if flag SOF_TIMESTAMPING_BIND_PHC is set.
+ */
+struct so_timestamping {
+	int flags;
+	int bind_phc;
+};
+
 /**
  * struct hwtstamp_config - %SIOCGHWTSTAMP and %SIOCSHWTSTAMP parameter
  *
diff --git a/net/core/sock.c b/net/core/sock.c
index dd9599656c40..cad107112204 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -139,6 +139,8 @@
 #include <net/tcp.h>
 #include <net/busy_poll.h>
 
+#include <linux/ethtool.h>
+
 static DEFINE_MUTEX(proto_list_mutex);
 static LIST_HEAD(proto_list);
 
@@ -810,8 +812,47 @@ void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
 	}
 }
 
-int sock_set_timestamping(struct sock *sk, int optname, int val)
+static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
 {
+	struct net_device *dev = NULL;
+	bool match = false;
+	int *vclock_index;
+	int i, num;
+
+	if (sk->sk_bound_dev_if)
+		dev = dev_get_by_index(sock_net(sk), sk->sk_bound_dev_if);
+
+	if (!dev) {
+		pr_err("%s: sock not bind to device\n", __func__);
+		return -EOPNOTSUPP;
+	}
+	/* Accept phc_index only if it is one of the bound device's vclocks. */
+	num = ethtool_get_phc_vclocks(dev, &vclock_index);
+	dev_put(dev); /* drop the reference taken by dev_get_by_index() */
+	for (i = 0; i < num; i++) {
+		if (*(vclock_index + i) == phc_index) {
+			match = true;
+			break;
+		}
+	}
+
+	if (num > 0)
+		kfree(vclock_index);
+
+	if (!match)
+		return -EINVAL;
+
+	sk->sk_bind_phc = phc_index;
+
+	return 0;
+}
+
+int sock_set_timestamping(struct sock *sk, int optname,
+			  struct so_timestamping timestamping)
+{
+	int val = timestamping.flags;
+	int ret;
+
 	if (val & ~SOF_TIMESTAMPING_MASK)
 		return -EINVAL;
 
@@ -832,6 +873,12 @@ int sock_set_timestamping(struct sock *sk, int optname, int val)
 	    !(val & SOF_TIMESTAMPING_OPT_TSONLY))
 		return -EINVAL;
 
+	if (val & SOF_TIMESTAMPING_BIND_PHC) {
+		ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
+		if (ret)
+			return ret;
+	}
+
 	sk->sk_tsflags = val;
 	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
 
@@ -907,6 +954,7 @@ EXPORT_SYMBOL(sock_set_mark);
 int sock_setsockopt(struct socket *sock, int level, int optname,
 		    sockptr_t optval, unsigned int optlen)
 {
+	struct so_timestamping timestamping;
 	struct sock_txtime sk_txtime;
 	struct sock *sk = sock->sk;
 	int val;
@@ -1073,7 +1121,15 @@ set_sndbuf:
 
 	case SO_TIMESTAMPING_NEW:
 	case SO_TIMESTAMPING_OLD:
-		ret = sock_set_timestamping(sk, optname, val);
+		if (optlen == sizeof(timestamping)) {
+			if (copy_from_sockptr(&timestamping, optval,
+					      sizeof(timestamping)))
+				return -EFAULT;
+		} else {
+			memset(&timestamping, 0, sizeof(timestamping));
+			timestamping.flags = val;
+		}
+		ret = sock_set_timestamping(sk, optname, timestamping);
 		break;
 
 	case SO_RCVLOWAT:
@@ -1348,6 +1404,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 		struct __kernel_old_timeval tm;
 		struct  __kernel_sock_timeval stm;
 		struct sock_txtime txtime;
+		struct so_timestamping timestamping;
 	} v;
 
 	int lv = sizeof(int);
@@ -1451,7 +1508,9 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 		break;
 
 	case SO_TIMESTAMPING_OLD:
-		v.val = sk->sk_tsflags;
+		lv = sizeof(v.timestamping);
+		v.timestamping.flags = sk->sk_tsflags;
+		v.timestamping.bind_phc = sk->sk_bind_phc;
 		break;
 
 	case SO_RCVTIMEO_OLD:
diff --git a/net/ethtool/common.c b/net/ethtool/common.c
index 798231b07676..c63e0739dc6a 100644
--- a/net/ethtool/common.c
+++ b/net/ethtool/common.c
@@ -398,6 +398,7 @@ const char sof_timestamping_names[][ETH_GSTRING_LEN] = {
 	[const_ilog2(SOF_TIMESTAMPING_OPT_STATS)]    = "option-stats",
 	[const_ilog2(SOF_TIMESTAMPING_OPT_PKTINFO)]  = "option-pktinfo",
 	[const_ilog2(SOF_TIMESTAMPING_OPT_TX_SWHW)]  = "option-tx-swhw",
+	[const_ilog2(SOF_TIMESTAMPING_BIND_PHC)]     = "bind-phc",
 };
 static_assert(ARRAY_SIZE(sof_timestamping_names) == __SOF_TIMESTAMPING_CNT);
 
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index ea38cbcd2ad4..8c03afac5ca0 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -207,14 +207,25 @@ static int mptcp_setsockopt_sol_socket_timestamping(struct mptcp_sock *msk,
 {
 	struct mptcp_subflow_context *subflow;
 	struct sock *sk = (struct sock *)msk;
-	int val, ret;
+	struct so_timestamping timestamping;
+	int ret;
 
-	ret = mptcp_get_int_option(msk, optval, optlen, &val);
-	if (ret)
-		return ret;
+	if (optlen == sizeof(timestamping)) {
+		if (copy_from_sockptr(&timestamping, optval,
+				      sizeof(timestamping)))
+			return -EFAULT;
+	} else if (optlen == sizeof(int)) {
+		memset(&timestamping, 0, sizeof(timestamping));
+
+		if (copy_from_sockptr(&timestamping.flags, optval, sizeof(int)))
+			return -EFAULT;
+	} else {
+		return -EINVAL;
+	}
 
 	ret = sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname,
-			      KERNEL_SOCKPTR(&val), sizeof(val));
+			      KERNEL_SOCKPTR(&timestamping),
+			      sizeof(timestamping));
 	if (ret)
 		return ret;
 
@@ -224,7 +235,7 @@ static int mptcp_setsockopt_sol_socket_timestamping(struct mptcp_sock *msk,
 		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
 		bool slow = lock_sock_fast(ssk);
 
-		sock_set_timestamping(sk, optname, val);
+		sock_set_timestamping(sk, optname, timestamping);
 		unlock_sock_fast(ssk, slow);
 	}
 

From d7c08826558811e1bf88de3750a7051f27d0e25c Mon Sep 17 00:00:00 2001
From: Yangbo Lu <yangbo.lu@nxp.com>
Date: Wed, 30 Jun 2021 16:12:00 +0800
Subject: [PATCH 048/158] net: socket: support hardware timestamp conversion to
 PHC bound

This patch is to support hardware timestamp conversion to
PHC bound. This applies to both RX and TX since their skb
handling (for TX, it's skb clone in error queue) all goes
through __sock_recv_timestamp.

Signed-off-by: Yangbo Lu <yangbo.lu@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/socket.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/net/socket.c b/net/socket.c
index bd9233da2497..0b2dad3bdf7f 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -104,6 +104,7 @@
 #include <linux/sockios.h>
 #include <net/busy_poll.h>
 #include <linux/errqueue.h>
+#include <linux/ptp_clock_kernel.h>
 
 #ifdef CONFIG_NET_RX_BUSY_POLL
 unsigned int sysctl_net_busy_read __read_mostly;
@@ -873,12 +874,18 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
 		empty = 0;
 	if (shhwtstamps &&
 	    (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
-	    !skb_is_swtx_tstamp(skb, false_tstamp) &&
-	    ktime_to_timespec64_cond(shhwtstamps->hwtstamp, tss.ts + 2)) {
-		empty = 0;
-		if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_PKTINFO) &&
-		    !skb_is_err_queue(skb))
-			put_ts_pktinfo(msg, skb);
+	    !skb_is_swtx_tstamp(skb, false_tstamp)) {
+		if (sk->sk_tsflags & SOF_TIMESTAMPING_BIND_PHC)
+			ptp_convert_timestamp(shhwtstamps, sk->sk_bind_phc);
+
+		if (ktime_to_timespec64_cond(shhwtstamps->hwtstamp,
+					     tss.ts + 2)) {
+			empty = 0;
+
+			if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_PKTINFO) &&
+			    !skb_is_err_queue(skb))
+				put_ts_pktinfo(msg, skb);
+		}
 	}
 	if (!empty) {
 		if (sock_flag(sk, SOCK_TSTAMP_NEW))

From 2214d7032479e50ff1fa7276e6efe1df08173fbf Mon Sep 17 00:00:00 2001
From: Yangbo Lu <yangbo.lu@nxp.com>
Date: Wed, 30 Jun 2021 16:12:01 +0800
Subject: [PATCH 049/158] selftests/net: timestamping: support binding PHC

Support binding PHC of PTP vclock for timestamping.

Signed-off-by: Yangbo Lu <yangbo.lu@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/timestamping.c | 55 ++++++++++++++--------
 1 file changed, 35 insertions(+), 20 deletions(-)

diff --git a/tools/testing/selftests/net/timestamping.c b/tools/testing/selftests/net/timestamping.c
index 21091be70688..aee631c5284e 100644
--- a/tools/testing/selftests/net/timestamping.c
+++ b/tools/testing/selftests/net/timestamping.c
@@ -47,7 +47,7 @@ static void usage(const char *error)
 {
 	if (error)
 		printf("invalid option: %s\n", error);
-	printf("timestamping interface option*\n\n"
+	printf("timestamping <interface> [bind_phc_index] [option]*\n\n"
 	       "Options:\n"
 	       "  IP_MULTICAST_LOOP - looping outgoing multicasts\n"
 	       "  SO_TIMESTAMP - normal software time stamping, ms resolution\n"
@@ -58,6 +58,7 @@ static void usage(const char *error)
 	       "  SOF_TIMESTAMPING_RX_SOFTWARE - software fallback for incoming packets\n"
 	       "  SOF_TIMESTAMPING_SOFTWARE - request reporting of software time stamps\n"
 	       "  SOF_TIMESTAMPING_RAW_HARDWARE - request reporting of raw HW time stamps\n"
+	       "  SOF_TIMESTAMPING_BIND_PHC - request to bind a PHC of PTP vclock\n"
 	       "  SIOCGSTAMP - check last socket time stamp\n"
 	       "  SIOCGSTAMPNS - more accurate socket time stamp\n"
 	       "  PTPV2 - use PTPv2 messages\n");
@@ -311,7 +312,6 @@ static void recvpacket(int sock, int recvmsg_flags,
 
 int main(int argc, char **argv)
 {
-	int so_timestamping_flags = 0;
 	int so_timestamp = 0;
 	int so_timestampns = 0;
 	int siocgstamp = 0;
@@ -325,6 +325,8 @@ int main(int argc, char **argv)
 	struct ifreq device;
 	struct ifreq hwtstamp;
 	struct hwtstamp_config hwconfig, hwconfig_requested;
+	struct so_timestamping so_timestamping_get = { 0, -1 };
+	struct so_timestamping so_timestamping = { 0, -1 };
 	struct sockaddr_in addr;
 	struct ip_mreq imr;
 	struct in_addr iaddr;
@@ -342,7 +344,12 @@ int main(int argc, char **argv)
 		exit(1);
 	}
 
-	for (i = 2; i < argc; i++) {
+	if (argc >= 3 && sscanf(argv[2], "%d", &so_timestamping.bind_phc) == 1)
+		val = 3;
+	else
+		val = 2;
+
+	for (i = val; i < argc; i++) {
 		if (!strcasecmp(argv[i], "SO_TIMESTAMP"))
 			so_timestamp = 1;
 		else if (!strcasecmp(argv[i], "SO_TIMESTAMPNS"))
@@ -356,17 +363,19 @@ int main(int argc, char **argv)
 		else if (!strcasecmp(argv[i], "PTPV2"))
 			ptpv2 = 1;
 		else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_TX_HARDWARE"))
-			so_timestamping_flags |= SOF_TIMESTAMPING_TX_HARDWARE;
+			so_timestamping.flags |= SOF_TIMESTAMPING_TX_HARDWARE;
 		else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_TX_SOFTWARE"))
-			so_timestamping_flags |= SOF_TIMESTAMPING_TX_SOFTWARE;
+			so_timestamping.flags |= SOF_TIMESTAMPING_TX_SOFTWARE;
 		else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_RX_HARDWARE"))
-			so_timestamping_flags |= SOF_TIMESTAMPING_RX_HARDWARE;
+			so_timestamping.flags |= SOF_TIMESTAMPING_RX_HARDWARE;
 		else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_RX_SOFTWARE"))
-			so_timestamping_flags |= SOF_TIMESTAMPING_RX_SOFTWARE;
+			so_timestamping.flags |= SOF_TIMESTAMPING_RX_SOFTWARE;
 		else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_SOFTWARE"))
-			so_timestamping_flags |= SOF_TIMESTAMPING_SOFTWARE;
+			so_timestamping.flags |= SOF_TIMESTAMPING_SOFTWARE;
 		else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_RAW_HARDWARE"))
-			so_timestamping_flags |= SOF_TIMESTAMPING_RAW_HARDWARE;
+			so_timestamping.flags |= SOF_TIMESTAMPING_RAW_HARDWARE;
+		else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_BIND_PHC"))
+			so_timestamping.flags |= SOF_TIMESTAMPING_BIND_PHC;
 		else
 			usage(argv[i]);
 	}
@@ -385,10 +394,10 @@ int main(int argc, char **argv)
 	hwtstamp.ifr_data = (void *)&hwconfig;
 	memset(&hwconfig, 0, sizeof(hwconfig));
 	hwconfig.tx_type =
-		(so_timestamping_flags & SOF_TIMESTAMPING_TX_HARDWARE) ?
+		(so_timestamping.flags & SOF_TIMESTAMPING_TX_HARDWARE) ?
 		HWTSTAMP_TX_ON : HWTSTAMP_TX_OFF;
 	hwconfig.rx_filter =
-		(so_timestamping_flags & SOF_TIMESTAMPING_RX_HARDWARE) ?
+		(so_timestamping.flags & SOF_TIMESTAMPING_RX_HARDWARE) ?
 		ptpv2 ? HWTSTAMP_FILTER_PTP_V2_L4_SYNC :
 		HWTSTAMP_FILTER_PTP_V1_L4_SYNC : HWTSTAMP_FILTER_NONE;
 	hwconfig_requested = hwconfig;
@@ -413,6 +422,9 @@ int main(int argc, char **argv)
 		 sizeof(struct sockaddr_in)) < 0)
 		bail("bind");
 
+	if (setsockopt(sock, SOL_SOCKET, SO_BINDTODEVICE, interface, if_len))
+		bail("bind device");
+
 	/* set multicast group for outgoing packets */
 	inet_aton("224.0.1.130", &iaddr); /* alternate PTP domain 1 */
 	addr.sin_addr = iaddr;
@@ -444,10 +456,9 @@ int main(int argc, char **argv)
 			   &enabled, sizeof(enabled)) < 0)
 		bail("setsockopt SO_TIMESTAMPNS");
 
-	if (so_timestamping_flags &&
-		setsockopt(sock, SOL_SOCKET, SO_TIMESTAMPING,
-			   &so_timestamping_flags,
-			   sizeof(so_timestamping_flags)) < 0)
+	if (so_timestamping.flags &&
+	    setsockopt(sock, SOL_SOCKET, SO_TIMESTAMPING, &so_timestamping,
+		       sizeof(so_timestamping)) < 0)
 		bail("setsockopt SO_TIMESTAMPING");
 
 	/* request IP_PKTINFO for debugging purposes */
@@ -468,14 +479,18 @@ int main(int argc, char **argv)
 	else
 		printf("SO_TIMESTAMPNS %d\n", val);
 
-	if (getsockopt(sock, SOL_SOCKET, SO_TIMESTAMPING, &val, &len) < 0) {
+	len = sizeof(so_timestamping_get);
+	if (getsockopt(sock, SOL_SOCKET, SO_TIMESTAMPING, &so_timestamping_get,
+		       &len) < 0) {
 		printf("%s: %s\n", "getsockopt SO_TIMESTAMPING",
 		       strerror(errno));
 	} else {
-		printf("SO_TIMESTAMPING %d\n", val);
-		if (val != so_timestamping_flags)
-			printf("   not the expected value %d\n",
-			       so_timestamping_flags);
+		printf("SO_TIMESTAMPING flags %d, bind phc %d\n",
+		       so_timestamping_get.flags, so_timestamping_get.bind_phc);
+		if (so_timestamping_get.flags != so_timestamping.flags ||
+		    so_timestamping_get.bind_phc != so_timestamping.bind_phc)
+			printf("   not expected, flags %d, bind phc %d\n",
+			       so_timestamping.flags, so_timestamping.bind_phc);
 	}
 
 	/* send packets forever every five seconds */

From 5ce15f2783332dce99ef04e2bdc927f0fa84105d Mon Sep 17 00:00:00 2001
From: Yangbo Lu <yangbo.lu@nxp.com>
Date: Wed, 30 Jun 2021 16:12:02 +0800
Subject: [PATCH 050/158] MAINTAINERS: add entry for PTP virtual clock driver

Add entry for PTP virtual clock driver.

Signed-off-by: Yangbo Lu <yangbo.lu@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 88449b7a4c95..ca7287ee775b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14890,6 +14890,13 @@ F:	drivers/net/phy/dp83640*
 F:	drivers/ptp/*
 F:	include/linux/ptp_cl*
 
+PTP VIRTUAL CLOCK SUPPORT
+M:	Yangbo Lu <yangbo.lu@nxp.com>
+L:	netdev@vger.kernel.org
+S:	Maintained
+F:	drivers/ptp/ptp_vclock.c
+F:	net/ethtool/phc_vclocks.c
+
 PTRACE SUPPORT
 M:	Oleg Nesterov <oleg@redhat.com>
 S:	Maintained

From 4f408e1fa6e10b6da72691233369172bac7d9e9b Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@linux.ibm.com>
Date: Wed, 30 Jun 2021 14:36:17 -0400
Subject: [PATCH 051/158] ibmvnic: retry reset if there are no other resets

Normally, if a reset fails due to failover or other communication error
there is another reset (eg: FAILOVER) in the queue and we would process
that reset. But if we are unable to communicate with PHYP or VIOS after
H_FREE_CRQ, there would be no other resets in the queue and the adapter
would be in an undefined state even though it was in the OPEN state
earlier. While starting the reset we set the carrier to off state so
we won't even get the timeout resets.

If the last queued reset fails, retry it as a hard reset (after the
usual 60 second settling time).

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.ibm.com>
Reviewed-by: Dany Madden <drt@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 374a75d4faea..ed77191d19f4 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -2420,9 +2420,10 @@ out:
 
 static void __ibmvnic_reset(struct work_struct *work)
 {
-	struct ibmvnic_rwi *rwi;
 	struct ibmvnic_adapter *adapter;
 	bool saved_state = false;
+	struct ibmvnic_rwi *tmprwi;
+	struct ibmvnic_rwi *rwi;
 	unsigned long flags;
 	u32 reset_state;
 	int rc = 0;
@@ -2489,7 +2490,7 @@ static void __ibmvnic_reset(struct work_struct *work)
 		} else {
 			rc = do_reset(adapter, rwi, reset_state);
 		}
-		kfree(rwi);
+		tmprwi = rwi;
 		adapter->last_reset_time = jiffies;
 
 		if (rc)
@@ -2497,8 +2498,23 @@ static void __ibmvnic_reset(struct work_struct *work)
 
 		rwi = get_next_rwi(adapter);
 
+		/*
+		 * If there is another reset queued, free the previous rwi
+		 * and process the new reset even if previous reset failed
+		 * (the previous reset could have failed because of a fail
+		 * over for instance, so process the fail over).
+		 *
+		 * If there are no resets queued and the previous reset failed,
+		 * the adapter would be in an undefined state. So retry the
+		 * previous reset as a hard reset.
+		 */
+		if (rwi)
+			kfree(tmprwi);
+		else if (rc)
+			rwi = tmprwi;
+
 		if (rwi && (rwi->reset_reason == VNIC_RESET_FAILOVER ||
-			    rwi->reset_reason == VNIC_RESET_MOBILITY))
+			    rwi->reset_reason == VNIC_RESET_MOBILITY || rc))
 			adapter->force_reset_recovery = true;
 	}
 

From 09ef17863f37235fe4e65a7d991e487b9ff6e553 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Wed, 30 Jun 2021 20:18:20 -0400
Subject: [PATCH 052/158] Documentation: add more details in tipc.rst

kernel-doc for TIPC is too simple, we need to add more information for it.

This patch is to extend the abstract, and add the Features and Links items.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Jon Maloy <jmaloy@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/tipc.rst | 121 +++++++++++++++++++++++++++++-
 1 file changed, 118 insertions(+), 3 deletions(-)

diff --git a/Documentation/networking/tipc.rst b/Documentation/networking/tipc.rst
index 76775f24cdc8..ab63d298cca2 100644
--- a/Documentation/networking/tipc.rst
+++ b/Documentation/networking/tipc.rst
@@ -4,10 +4,125 @@
 Linux Kernel TIPC
 =================
 
-TIPC (Transparent Inter Process Communication) is a protocol that is
-specially designed for intra-cluster communication.
+Introduction
+============
 
-For more information about TIPC, see http://tipc.sourceforge.net.
+TIPC (Transparent Inter Process Communication) is a protocol that is specially
+designed for intra-cluster communication. It can be configured to transmit
+messages either on UDP or directly across Ethernet. Message delivery is
+sequence guaranteed, loss free and flow controlled. Latency times are shorter
+than with any other known protocol, while maximal throughput is comparable to
+that of TCP.
+
+TIPC Features
+-------------
+
+- Cluster wide IPC service
+
+  Have you ever wished you had the convenience of Unix Domain Sockets even when
+  transmitting data between cluster nodes? Where you yourself determine the
+  addresses you want to bind to and use? Where you don't have to perform DNS
+  lookups and worry about IP addresses? Where you don't have to start timers
+  to monitor the continuous existence of peer sockets? And yet without the
+  downsides of that socket type, such as the risk of lingering inodes?
+
+  Welcome to the Transparent Inter Process Communication service, TIPC for short,
+  which gives you all of this, and a lot more.
+
+- Service Addressing
+
+  A fundamental concept in TIPC is that of Service Addressing which makes it
+  possible for a programmer to choose his own address, bind it to a server
+  socket and let client programs use only that address for sending messages.
+
+- Service Tracking
+
+  A client wanting to wait for the availability of a server uses the Service
+  Tracking mechanism to subscribe for binding and unbinding/close events for
+  sockets with the associated service address.
+
+  The service tracking mechanism can also be used for Cluster Topology Tracking,
+  i.e., subscribing for availability/non-availability of cluster nodes.
+
+  Likewise, the service tracking mechanism can be used for Cluster Connectivity
+  Tracking, i.e., subscribing for up/down events for individual links between
+  cluster nodes.
+
+- Transmission Modes
+
+  Using a service address, a client can send datagram messages to a server socket.
+
+  Using the same address type, it can establish a connection towards an accepting
+  server socket.
+
+  It can also use a service address to create and join a Communication Group,
+  which is the TIPC manifestation of a brokerless message bus.
+
+  Multicast with very good performance and scalability is available both in
+  datagram mode and in communication group mode.
+
+- Inter Node Links
+
+  Communication between any two nodes in a cluster is maintained by one or two
+  Inter Node Links, which both guarantee data traffic integrity and monitor
+  the peer node's availability.
+
+- Cluster Scalability
+
+  By applying the Overlapping Ring Monitoring algorithm on the inter node links
+  it is possible to scale TIPC clusters up to 1000 nodes with a maintained
+  neighbor failure discovery time of 1-2 seconds. For smaller clusters this
+  time can be made much shorter.
+
+- Neighbor Discovery
+
+  Neighbor Node Discovery in the cluster is done by Ethernet broadcast or UDP
+  multicast, when any of those services are available. If not, configured peer
+  IP addresses can be used.
+
+- Configuration
+
+  When running TIPC in single node mode no configuration whatsoever is needed.
+  When running in cluster mode TIPC must as a minimum be given a node address
+  (before Linux 4.17) and told which interface to attach to. The "tipc"
+  configuration tool makes it possible to add and maintain many more
+  configuration parameters.
+
+- Performance
+
+  TIPC message transfer latency times are better than in any other known protocol.
+  Maximal byte throughput for inter-node connections is still somewhat lower than
+  for TCP, while intra-node and inter-container throughput on the same host is
+  superior.
+
+- Language Support
+
+  The TIPC user API has support for C, Python, Perl, Ruby, D and Go.
+
+More Information
+----------------
+
+- How to set up TIPC:
+
+  http://tipc.io/getting_started.html
+
+- How to program with TIPC:
+
+  http://tipc.io/programming.html
+
+- How to contribute to TIPC:
+
+  http://tipc.io/contacts.html
+
+- More details about TIPC specification:
+
+  http://tipc.io/protocol.html
+
+
+Implementation
+==============
+
+TIPC is implemented as a kernel module in the net/tipc/ directory.
 
 TIPC Base Types
 ---------------

From ca75bcf0a83b6cc7f53a593d98ec7121c4839b43 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 1 Jul 2021 10:15:09 +0200
Subject: [PATCH 053/158] net: remove the caif_hsi driver

The caif_hsi driver relies on a cfhsi_get_ops symbol using symbol_get,
but this symbol is not provided anywhere in the kernel tree.  Remove
this driver given that it is dead code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/caif/Kconfig    |    9 -
 drivers/net/caif/Makefile   |    3 -
 drivers/net/caif/caif_hsi.c | 1454 -----------------------------------
 include/net/caif/caif_hsi.h |  200 -----
 4 files changed, 1666 deletions(-)
 delete mode 100644 drivers/net/caif/caif_hsi.c
 delete mode 100644 include/net/caif/caif_hsi.h

diff --git a/drivers/net/caif/Kconfig b/drivers/net/caif/Kconfig
index a77124bc1f4b..709660cb38f8 100644
--- a/drivers/net/caif/Kconfig
+++ b/drivers/net/caif/Kconfig
@@ -20,15 +20,6 @@ config CAIF_TTY
 	  identified as N_CAIF. When this ldisc is opened from user space
 	  it will redirect the TTY's traffic into the CAIF stack.
 
-config CAIF_HSI
-	tristate "CAIF HSI transport driver"
-	depends on CAIF
-	default n
-	help
-	  The CAIF low level driver for CAIF over HSI.
-	  Be aware that if you enable this then you also need to
-	  enable a low-level HSI driver.
-
 config CAIF_VIRTIO
 	tristate "CAIF virtio transport driver"
 	depends on CAIF && HAS_DMA
diff --git a/drivers/net/caif/Makefile b/drivers/net/caif/Makefile
index b1918c8c126c..97f664f8016c 100644
--- a/drivers/net/caif/Makefile
+++ b/drivers/net/caif/Makefile
@@ -4,8 +4,5 @@ ccflags-$(CONFIG_CAIF_DEBUG) := -DDEBUG
 # Serial interface
 obj-$(CONFIG_CAIF_TTY) += caif_serial.o
 
-# HSI interface
-obj-$(CONFIG_CAIF_HSI) += caif_hsi.o
-
 # Virtio interface
 obj-$(CONFIG_CAIF_VIRTIO) += caif_virtio.o
diff --git a/drivers/net/caif/caif_hsi.c b/drivers/net/caif/caif_hsi.c
deleted file mode 100644
index 3d63b15bbaa1..000000000000
--- a/drivers/net/caif/caif_hsi.c
+++ /dev/null
@@ -1,1454 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) ST-Ericsson AB 2010
- * Author:  Daniel Martensson
- *	    Dmitry.Tarnyagin  / dmitry.tarnyagin@lockless.no
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME fmt
-
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/device.h>
-#include <linux/netdevice.h>
-#include <linux/string.h>
-#include <linux/list.h>
-#include <linux/interrupt.h>
-#include <linux/delay.h>
-#include <linux/sched.h>
-#include <linux/if_arp.h>
-#include <linux/timer.h>
-#include <net/rtnetlink.h>
-#include <linux/pkt_sched.h>
-#include <net/caif/caif_layer.h>
-#include <net/caif/caif_hsi.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Daniel Martensson");
-MODULE_DESCRIPTION("CAIF HSI driver");
-
-/* Returns the number of padding bytes for alignment. */
-#define PAD_POW2(x, pow) ((((x)&((pow)-1)) == 0) ? 0 :\
-				(((pow)-((x)&((pow)-1)))))
-
-static const struct cfhsi_config  hsi_default_config = {
-
-	/* Inactivity timeout on HSI, ms */
-	.inactivity_timeout = HZ,
-
-	/* Aggregation timeout (ms) of zero means no aggregation is done*/
-	.aggregation_timeout = 1,
-
-	/*
-	 * HSI link layer flow-control thresholds.
-	 * Threshold values for the HSI packet queue. Flow-control will be
-	 * asserted when the number of packets exceeds q_high_mark. It will
-	 * not be de-asserted before the number of packets drops below
-	 * q_low_mark.
-	 * Warning: A high threshold value might increase throughput but it
-	 * will at the same time prevent channel prioritization and increase
-	 * the risk of flooding the modem. The high threshold should be above
-	 * the low.
-	 */
-	.q_high_mark = 100,
-	.q_low_mark = 50,
-
-	/*
-	 * HSI padding options.
-	 * Warning: must be a base of 2 (& operation used) and can not be zero !
-	 */
-	.head_align = 4,
-	.tail_align = 4,
-};
-
-#define ON 1
-#define OFF 0
-
-static LIST_HEAD(cfhsi_list);
-
-static void cfhsi_inactivity_tout(struct timer_list *t)
-{
-	struct cfhsi *cfhsi = from_timer(cfhsi, t, inactivity_timer);
-
-	netdev_dbg(cfhsi->ndev, "%s.\n",
-		__func__);
-
-	/* Schedule power down work queue. */
-	if (!test_bit(CFHSI_SHUTDOWN, &cfhsi->bits))
-		queue_work(cfhsi->wq, &cfhsi->wake_down_work);
-}
-
-static void cfhsi_update_aggregation_stats(struct cfhsi *cfhsi,
-					   const struct sk_buff *skb,
-					   int direction)
-{
-	struct caif_payload_info *info;
-	int hpad, tpad, len;
-
-	info = (struct caif_payload_info *)&skb->cb;
-	hpad = 1 + PAD_POW2((info->hdr_len + 1), cfhsi->cfg.head_align);
-	tpad = PAD_POW2((skb->len + hpad), cfhsi->cfg.tail_align);
-	len = skb->len + hpad + tpad;
-
-	if (direction > 0)
-		cfhsi->aggregation_len += len;
-	else if (direction < 0)
-		cfhsi->aggregation_len -= len;
-}
-
-static bool cfhsi_can_send_aggregate(struct cfhsi *cfhsi)
-{
-	int i;
-
-	if (cfhsi->cfg.aggregation_timeout == 0)
-		return true;
-
-	for (i = 0; i < CFHSI_PRIO_BEBK; ++i) {
-		if (cfhsi->qhead[i].qlen)
-			return true;
-	}
-
-	/* TODO: Use aggregation_len instead */
-	if (cfhsi->qhead[CFHSI_PRIO_BEBK].qlen >= CFHSI_MAX_PKTS)
-		return true;
-
-	return false;
-}
-
-static struct sk_buff *cfhsi_dequeue(struct cfhsi *cfhsi)
-{
-	struct sk_buff *skb;
-	int i;
-
-	for (i = 0; i < CFHSI_PRIO_LAST; ++i) {
-		skb = skb_dequeue(&cfhsi->qhead[i]);
-		if (skb)
-			break;
-	}
-
-	return skb;
-}
-
-static int cfhsi_tx_queue_len(struct cfhsi *cfhsi)
-{
-	int i, len = 0;
-	for (i = 0; i < CFHSI_PRIO_LAST; ++i)
-		len += skb_queue_len(&cfhsi->qhead[i]);
-	return len;
-}
-
-static void cfhsi_abort_tx(struct cfhsi *cfhsi)
-{
-	struct sk_buff *skb;
-
-	for (;;) {
-		spin_lock_bh(&cfhsi->lock);
-		skb = cfhsi_dequeue(cfhsi);
-		if (!skb)
-			break;
-
-		cfhsi->ndev->stats.tx_errors++;
-		cfhsi->ndev->stats.tx_dropped++;
-		cfhsi_update_aggregation_stats(cfhsi, skb, -1);
-		spin_unlock_bh(&cfhsi->lock);
-		kfree_skb(skb);
-	}
-	cfhsi->tx_state = CFHSI_TX_STATE_IDLE;
-	if (!test_bit(CFHSI_SHUTDOWN, &cfhsi->bits))
-		mod_timer(&cfhsi->inactivity_timer,
-			jiffies + cfhsi->cfg.inactivity_timeout);
-	spin_unlock_bh(&cfhsi->lock);
-}
-
-static int cfhsi_flush_fifo(struct cfhsi *cfhsi)
-{
-	char buffer[32]; /* Any reasonable value */
-	size_t fifo_occupancy;
-	int ret;
-
-	netdev_dbg(cfhsi->ndev, "%s.\n",
-		__func__);
-
-	do {
-		ret = cfhsi->ops->cfhsi_fifo_occupancy(cfhsi->ops,
-				&fifo_occupancy);
-		if (ret) {
-			netdev_warn(cfhsi->ndev,
-				"%s: can't get FIFO occupancy: %d.\n",
-				__func__, ret);
-			break;
-		} else if (!fifo_occupancy)
-			/* No more data, exitting normally */
-			break;
-
-		fifo_occupancy = min(sizeof(buffer), fifo_occupancy);
-		set_bit(CFHSI_FLUSH_FIFO, &cfhsi->bits);
-		ret = cfhsi->ops->cfhsi_rx(buffer, fifo_occupancy,
-				cfhsi->ops);
-		if (ret) {
-			clear_bit(CFHSI_FLUSH_FIFO, &cfhsi->bits);
-			netdev_warn(cfhsi->ndev,
-				"%s: can't read data: %d.\n",
-				__func__, ret);
-			break;
-		}
-
-		ret = 5 * HZ;
-		ret = wait_event_interruptible_timeout(cfhsi->flush_fifo_wait,
-			 !test_bit(CFHSI_FLUSH_FIFO, &cfhsi->bits), ret);
-
-		if (ret < 0) {
-			netdev_warn(cfhsi->ndev,
-				"%s: can't wait for flush complete: %d.\n",
-				__func__, ret);
-			break;
-		} else if (!ret) {
-			ret = -ETIMEDOUT;
-			netdev_warn(cfhsi->ndev,
-				"%s: timeout waiting for flush complete.\n",
-				__func__);
-			break;
-		}
-	} while (1);
-
-	return ret;
-}
-
-static int cfhsi_tx_frm(struct cfhsi_desc *desc, struct cfhsi *cfhsi)
-{
-	int nfrms = 0;
-	int pld_len = 0;
-	struct sk_buff *skb;
-	u8 *pfrm = desc->emb_frm + CFHSI_MAX_EMB_FRM_SZ;
-
-	skb = cfhsi_dequeue(cfhsi);
-	if (!skb)
-		return 0;
-
-	/* Clear offset. */
-	desc->offset = 0;
-
-	/* Check if we can embed a CAIF frame. */
-	if (skb->len < CFHSI_MAX_EMB_FRM_SZ) {
-		struct caif_payload_info *info;
-		int hpad;
-		int tpad;
-
-		/* Calculate needed head alignment and tail alignment. */
-		info = (struct caif_payload_info *)&skb->cb;
-
-		hpad = 1 + PAD_POW2((info->hdr_len + 1), cfhsi->cfg.head_align);
-		tpad = PAD_POW2((skb->len + hpad), cfhsi->cfg.tail_align);
-
-		/* Check if frame still fits with added alignment. */
-		if ((skb->len + hpad + tpad) <= CFHSI_MAX_EMB_FRM_SZ) {
-			u8 *pemb = desc->emb_frm;
-			desc->offset = CFHSI_DESC_SHORT_SZ;
-			*pemb = (u8)(hpad - 1);
-			pemb += hpad;
-
-			/* Update network statistics. */
-			spin_lock_bh(&cfhsi->lock);
-			cfhsi->ndev->stats.tx_packets++;
-			cfhsi->ndev->stats.tx_bytes += skb->len;
-			cfhsi_update_aggregation_stats(cfhsi, skb, -1);
-			spin_unlock_bh(&cfhsi->lock);
-
-			/* Copy in embedded CAIF frame. */
-			skb_copy_bits(skb, 0, pemb, skb->len);
-
-			/* Consume the SKB */
-			consume_skb(skb);
-			skb = NULL;
-		}
-	}
-
-	/* Create payload CAIF frames. */
-	while (nfrms < CFHSI_MAX_PKTS) {
-		struct caif_payload_info *info;
-		int hpad;
-		int tpad;
-
-		if (!skb)
-			skb = cfhsi_dequeue(cfhsi);
-
-		if (!skb)
-			break;
-
-		/* Calculate needed head alignment and tail alignment. */
-		info = (struct caif_payload_info *)&skb->cb;
-
-		hpad = 1 + PAD_POW2((info->hdr_len + 1), cfhsi->cfg.head_align);
-		tpad = PAD_POW2((skb->len + hpad), cfhsi->cfg.tail_align);
-
-		/* Fill in CAIF frame length in descriptor. */
-		desc->cffrm_len[nfrms] = hpad + skb->len + tpad;
-
-		/* Fill head padding information. */
-		*pfrm = (u8)(hpad - 1);
-		pfrm += hpad;
-
-		/* Update network statistics. */
-		spin_lock_bh(&cfhsi->lock);
-		cfhsi->ndev->stats.tx_packets++;
-		cfhsi->ndev->stats.tx_bytes += skb->len;
-		cfhsi_update_aggregation_stats(cfhsi, skb, -1);
-		spin_unlock_bh(&cfhsi->lock);
-
-		/* Copy in CAIF frame. */
-		skb_copy_bits(skb, 0, pfrm, skb->len);
-
-		/* Update payload length. */
-		pld_len += desc->cffrm_len[nfrms];
-
-		/* Update frame pointer. */
-		pfrm += skb->len + tpad;
-
-		/* Consume the SKB */
-		consume_skb(skb);
-		skb = NULL;
-
-		/* Update number of frames. */
-		nfrms++;
-	}
-
-	/* Unused length fields should be zero-filled (according to SPEC). */
-	while (nfrms < CFHSI_MAX_PKTS) {
-		desc->cffrm_len[nfrms] = 0x0000;
-		nfrms++;
-	}
-
-	/* Check if we can piggy-back another descriptor. */
-	if (cfhsi_can_send_aggregate(cfhsi))
-		desc->header |= CFHSI_PIGGY_DESC;
-	else
-		desc->header &= ~CFHSI_PIGGY_DESC;
-
-	return CFHSI_DESC_SZ + pld_len;
-}
-
-static void cfhsi_start_tx(struct cfhsi *cfhsi)
-{
-	struct cfhsi_desc *desc = (struct cfhsi_desc *)cfhsi->tx_buf;
-	int len, res;
-
-	netdev_dbg(cfhsi->ndev, "%s.\n", __func__);
-
-	if (test_bit(CFHSI_SHUTDOWN, &cfhsi->bits))
-		return;
-
-	do {
-		/* Create HSI frame. */
-		len = cfhsi_tx_frm(desc, cfhsi);
-		if (!len) {
-			spin_lock_bh(&cfhsi->lock);
-			if (unlikely(cfhsi_tx_queue_len(cfhsi))) {
-				spin_unlock_bh(&cfhsi->lock);
-				res = -EAGAIN;
-				continue;
-			}
-			cfhsi->tx_state = CFHSI_TX_STATE_IDLE;
-			/* Start inactivity timer. */
-			mod_timer(&cfhsi->inactivity_timer,
-				jiffies + cfhsi->cfg.inactivity_timeout);
-			spin_unlock_bh(&cfhsi->lock);
-			break;
-		}
-
-		/* Set up new transfer. */
-		res = cfhsi->ops->cfhsi_tx(cfhsi->tx_buf, len, cfhsi->ops);
-		if (WARN_ON(res < 0))
-			netdev_err(cfhsi->ndev, "%s: TX error %d.\n",
-				__func__, res);
-	} while (res < 0);
-}
-
-static void cfhsi_tx_done(struct cfhsi *cfhsi)
-{
-	netdev_dbg(cfhsi->ndev, "%s.\n", __func__);
-
-	if (test_bit(CFHSI_SHUTDOWN, &cfhsi->bits))
-		return;
-
-	/*
-	 * Send flow on if flow off has been previously signalled
-	 * and number of packets is below low water mark.
-	 */
-	spin_lock_bh(&cfhsi->lock);
-	if (cfhsi->flow_off_sent &&
-			cfhsi_tx_queue_len(cfhsi) <= cfhsi->cfg.q_low_mark &&
-			cfhsi->cfdev.flowctrl) {
-
-		cfhsi->flow_off_sent = 0;
-		cfhsi->cfdev.flowctrl(cfhsi->ndev, ON);
-	}
-
-	if (cfhsi_can_send_aggregate(cfhsi)) {
-		spin_unlock_bh(&cfhsi->lock);
-		cfhsi_start_tx(cfhsi);
-	} else {
-		mod_timer(&cfhsi->aggregation_timer,
-			jiffies + cfhsi->cfg.aggregation_timeout);
-		spin_unlock_bh(&cfhsi->lock);
-	}
-
-	return;
-}
-
-static void cfhsi_tx_done_cb(struct cfhsi_cb_ops *cb_ops)
-{
-	struct cfhsi *cfhsi;
-
-	cfhsi = container_of(cb_ops, struct cfhsi, cb_ops);
-	netdev_dbg(cfhsi->ndev, "%s.\n",
-		__func__);
-
-	if (test_bit(CFHSI_SHUTDOWN, &cfhsi->bits))
-		return;
-	cfhsi_tx_done(cfhsi);
-}
-
-static int cfhsi_rx_desc(struct cfhsi_desc *desc, struct cfhsi *cfhsi)
-{
-	int xfer_sz = 0;
-	int nfrms = 0;
-	u16 *plen = NULL;
-	u8 *pfrm = NULL;
-
-	if ((desc->header & ~CFHSI_PIGGY_DESC) ||
-			(desc->offset > CFHSI_MAX_EMB_FRM_SZ)) {
-		netdev_err(cfhsi->ndev, "%s: Invalid descriptor.\n",
-			__func__);
-		return -EPROTO;
-	}
-
-	/* Check for embedded CAIF frame. */
-	if (desc->offset) {
-		struct sk_buff *skb;
-		int len = 0;
-		pfrm = ((u8 *)desc) + desc->offset;
-
-		/* Remove offset padding. */
-		pfrm += *pfrm + 1;
-
-		/* Read length of CAIF frame (little endian). */
-		len = *pfrm;
-		len |= ((*(pfrm+1)) << 8) & 0xFF00;
-		len += 2;	/* Add FCS fields. */
-
-		/* Sanity check length of CAIF frame. */
-		if (unlikely(len > CFHSI_MAX_CAIF_FRAME_SZ)) {
-			netdev_err(cfhsi->ndev, "%s: Invalid length.\n",
-				__func__);
-			return -EPROTO;
-		}
-
-		/* Allocate SKB (OK even in IRQ context). */
-		skb = alloc_skb(len + 1, GFP_ATOMIC);
-		if (!skb) {
-			netdev_err(cfhsi->ndev, "%s: Out of memory !\n",
-				__func__);
-			return -ENOMEM;
-		}
-		caif_assert(skb != NULL);
-
-		skb_put_data(skb, pfrm, len);
-
-		skb->protocol = htons(ETH_P_CAIF);
-		skb_reset_mac_header(skb);
-		skb->dev = cfhsi->ndev;
-
-		netif_rx_any_context(skb);
-
-		/* Update network statistics. */
-		cfhsi->ndev->stats.rx_packets++;
-		cfhsi->ndev->stats.rx_bytes += len;
-	}
-
-	/* Calculate transfer length. */
-	plen = desc->cffrm_len;
-	while (nfrms < CFHSI_MAX_PKTS && *plen) {
-		xfer_sz += *plen;
-		plen++;
-		nfrms++;
-	}
-
-	/* Check for piggy-backed descriptor. */
-	if (desc->header & CFHSI_PIGGY_DESC)
-		xfer_sz += CFHSI_DESC_SZ;
-
-	if ((xfer_sz % 4) || (xfer_sz > (CFHSI_BUF_SZ_RX - CFHSI_DESC_SZ))) {
-		netdev_err(cfhsi->ndev,
-				"%s: Invalid payload len: %d, ignored.\n",
-			__func__, xfer_sz);
-		return -EPROTO;
-	}
-	return xfer_sz;
-}
-
-static int cfhsi_rx_desc_len(struct cfhsi_desc *desc)
-{
-	int xfer_sz = 0;
-	int nfrms = 0;
-	u16 *plen;
-
-	if ((desc->header & ~CFHSI_PIGGY_DESC) ||
-			(desc->offset > CFHSI_MAX_EMB_FRM_SZ)) {
-
-		pr_err("Invalid descriptor. %x %x\n", desc->header,
-				desc->offset);
-		return -EPROTO;
-	}
-
-	/* Calculate transfer length. */
-	plen = desc->cffrm_len;
-	while (nfrms < CFHSI_MAX_PKTS && *plen) {
-		xfer_sz += *plen;
-		plen++;
-		nfrms++;
-	}
-
-	if (xfer_sz % 4) {
-		pr_err("Invalid payload len: %d, ignored.\n", xfer_sz);
-		return -EPROTO;
-	}
-	return xfer_sz;
-}
-
-static int cfhsi_rx_pld(struct cfhsi_desc *desc, struct cfhsi *cfhsi)
-{
-	int rx_sz = 0;
-	int nfrms = 0;
-	u16 *plen = NULL;
-	u8 *pfrm = NULL;
-
-	/* Sanity check header and offset. */
-	if (WARN_ON((desc->header & ~CFHSI_PIGGY_DESC) ||
-			(desc->offset > CFHSI_MAX_EMB_FRM_SZ))) {
-		netdev_err(cfhsi->ndev, "%s: Invalid descriptor.\n",
-			__func__);
-		return -EPROTO;
-	}
-
-	/* Set frame pointer to start of payload. */
-	pfrm = desc->emb_frm + CFHSI_MAX_EMB_FRM_SZ;
-	plen = desc->cffrm_len;
-
-	/* Skip already processed frames. */
-	while (nfrms < cfhsi->rx_state.nfrms) {
-		pfrm += *plen;
-		rx_sz += *plen;
-		plen++;
-		nfrms++;
-	}
-
-	/* Parse payload. */
-	while (nfrms < CFHSI_MAX_PKTS && *plen) {
-		struct sk_buff *skb;
-		u8 *pcffrm = NULL;
-		int len;
-
-		/* CAIF frame starts after head padding. */
-		pcffrm = pfrm + *pfrm + 1;
-
-		/* Read length of CAIF frame (little endian). */
-		len = *pcffrm;
-		len |= ((*(pcffrm + 1)) << 8) & 0xFF00;
-		len += 2;	/* Add FCS fields. */
-
-		/* Sanity check length of CAIF frames. */
-		if (unlikely(len > CFHSI_MAX_CAIF_FRAME_SZ)) {
-			netdev_err(cfhsi->ndev, "%s: Invalid length.\n",
-				__func__);
-			return -EPROTO;
-		}
-
-		/* Allocate SKB (OK even in IRQ context). */
-		skb = alloc_skb(len + 1, GFP_ATOMIC);
-		if (!skb) {
-			netdev_err(cfhsi->ndev, "%s: Out of memory !\n",
-				__func__);
-			cfhsi->rx_state.nfrms = nfrms;
-			return -ENOMEM;
-		}
-		caif_assert(skb != NULL);
-
-		skb_put_data(skb, pcffrm, len);
-
-		skb->protocol = htons(ETH_P_CAIF);
-		skb_reset_mac_header(skb);
-		skb->dev = cfhsi->ndev;
-
-		netif_rx_any_context(skb);
-
-		/* Update network statistics. */
-		cfhsi->ndev->stats.rx_packets++;
-		cfhsi->ndev->stats.rx_bytes += len;
-
-		pfrm += *plen;
-		rx_sz += *plen;
-		plen++;
-		nfrms++;
-	}
-
-	return rx_sz;
-}
-
-static void cfhsi_rx_done(struct cfhsi *cfhsi)
-{
-	int res;
-	int desc_pld_len = 0, rx_len, rx_state;
-	struct cfhsi_desc *desc = NULL;
-	u8 *rx_ptr, *rx_buf;
-	struct cfhsi_desc *piggy_desc = NULL;
-
-	desc = (struct cfhsi_desc *)cfhsi->rx_buf;
-
-	netdev_dbg(cfhsi->ndev, "%s\n", __func__);
-
-	if (test_bit(CFHSI_SHUTDOWN, &cfhsi->bits))
-		return;
-
-	/* Update inactivity timer if pending. */
-	spin_lock_bh(&cfhsi->lock);
-	mod_timer_pending(&cfhsi->inactivity_timer,
-			jiffies + cfhsi->cfg.inactivity_timeout);
-	spin_unlock_bh(&cfhsi->lock);
-
-	if (cfhsi->rx_state.state == CFHSI_RX_STATE_DESC) {
-		desc_pld_len = cfhsi_rx_desc_len(desc);
-
-		if (desc_pld_len < 0)
-			goto out_of_sync;
-
-		rx_buf = cfhsi->rx_buf;
-		rx_len = desc_pld_len;
-		if (desc_pld_len > 0 && (desc->header & CFHSI_PIGGY_DESC))
-			rx_len += CFHSI_DESC_SZ;
-		if (desc_pld_len == 0)
-			rx_buf = cfhsi->rx_flip_buf;
-	} else {
-		rx_buf = cfhsi->rx_flip_buf;
-
-		rx_len = CFHSI_DESC_SZ;
-		if (cfhsi->rx_state.pld_len > 0 &&
-				(desc->header & CFHSI_PIGGY_DESC)) {
-
-			piggy_desc = (struct cfhsi_desc *)
-				(desc->emb_frm + CFHSI_MAX_EMB_FRM_SZ +
-						cfhsi->rx_state.pld_len);
-
-			cfhsi->rx_state.piggy_desc = true;
-
-			/* Extract payload len from piggy-backed descriptor. */
-			desc_pld_len = cfhsi_rx_desc_len(piggy_desc);
-			if (desc_pld_len < 0)
-				goto out_of_sync;
-
-			if (desc_pld_len > 0) {
-				rx_len = desc_pld_len;
-				if (piggy_desc->header & CFHSI_PIGGY_DESC)
-					rx_len += CFHSI_DESC_SZ;
-			}
-
-			/*
-			 * Copy needed information from the piggy-backed
-			 * descriptor to the descriptor in the start.
-			 */
-			memcpy(rx_buf, (u8 *)piggy_desc,
-					CFHSI_DESC_SHORT_SZ);
-		}
-	}
-
-	if (desc_pld_len) {
-		rx_state = CFHSI_RX_STATE_PAYLOAD;
-		rx_ptr = rx_buf + CFHSI_DESC_SZ;
-	} else {
-		rx_state = CFHSI_RX_STATE_DESC;
-		rx_ptr = rx_buf;
-		rx_len = CFHSI_DESC_SZ;
-	}
-
-	/* Initiate next read */
-	if (test_bit(CFHSI_AWAKE, &cfhsi->bits)) {
-		/* Set up new transfer. */
-		netdev_dbg(cfhsi->ndev, "%s: Start RX.\n",
-				__func__);
-
-		res = cfhsi->ops->cfhsi_rx(rx_ptr, rx_len,
-				cfhsi->ops);
-		if (WARN_ON(res < 0)) {
-			netdev_err(cfhsi->ndev, "%s: RX error %d.\n",
-				__func__, res);
-			cfhsi->ndev->stats.rx_errors++;
-			cfhsi->ndev->stats.rx_dropped++;
-		}
-	}
-
-	if (cfhsi->rx_state.state == CFHSI_RX_STATE_DESC) {
-		/* Extract payload from descriptor */
-		if (cfhsi_rx_desc(desc, cfhsi) < 0)
-			goto out_of_sync;
-	} else {
-		/* Extract payload */
-		if (cfhsi_rx_pld(desc, cfhsi) < 0)
-			goto out_of_sync;
-		if (piggy_desc) {
-			/* Extract any payload in piggyback descriptor. */
-			if (cfhsi_rx_desc(piggy_desc, cfhsi) < 0)
-				goto out_of_sync;
-			/* Mark no embedded frame after extracting it */
-			piggy_desc->offset = 0;
-		}
-	}
-
-	/* Update state info */
-	memset(&cfhsi->rx_state, 0, sizeof(cfhsi->rx_state));
-	cfhsi->rx_state.state = rx_state;
-	cfhsi->rx_ptr = rx_ptr;
-	cfhsi->rx_len = rx_len;
-	cfhsi->rx_state.pld_len = desc_pld_len;
-	cfhsi->rx_state.piggy_desc = desc->header & CFHSI_PIGGY_DESC;
-
-	if (rx_buf != cfhsi->rx_buf)
-		swap(cfhsi->rx_buf, cfhsi->rx_flip_buf);
-	return;
-
-out_of_sync:
-	netdev_err(cfhsi->ndev, "%s: Out of sync.\n", __func__);
-	print_hex_dump_bytes("--> ", DUMP_PREFIX_NONE,
-			cfhsi->rx_buf, CFHSI_DESC_SZ);
-	schedule_work(&cfhsi->out_of_sync_work);
-}
-
-static void cfhsi_rx_slowpath(struct timer_list *t)
-{
-	struct cfhsi *cfhsi = from_timer(cfhsi, t, rx_slowpath_timer);
-
-	netdev_dbg(cfhsi->ndev, "%s.\n",
-		__func__);
-
-	cfhsi_rx_done(cfhsi);
-}
-
-static void cfhsi_rx_done_cb(struct cfhsi_cb_ops *cb_ops)
-{
-	struct cfhsi *cfhsi;
-
-	cfhsi = container_of(cb_ops, struct cfhsi, cb_ops);
-	netdev_dbg(cfhsi->ndev, "%s.\n",
-		__func__);
-
-	if (test_bit(CFHSI_SHUTDOWN, &cfhsi->bits))
-		return;
-
-	if (test_and_clear_bit(CFHSI_FLUSH_FIFO, &cfhsi->bits))
-		wake_up_interruptible(&cfhsi->flush_fifo_wait);
-	else
-		cfhsi_rx_done(cfhsi);
-}
-
-static void cfhsi_wake_up(struct work_struct *work)
-{
-	struct cfhsi *cfhsi = NULL;
-	int res;
-	int len;
-	long ret;
-
-	cfhsi = container_of(work, struct cfhsi, wake_up_work);
-
-	if (test_bit(CFHSI_SHUTDOWN, &cfhsi->bits))
-		return;
-
-	if (unlikely(test_bit(CFHSI_AWAKE, &cfhsi->bits))) {
-		/* It happenes when wakeup is requested by
-		 * both ends at the same time. */
-		clear_bit(CFHSI_WAKE_UP, &cfhsi->bits);
-		clear_bit(CFHSI_WAKE_UP_ACK, &cfhsi->bits);
-		return;
-	}
-
-	/* Activate wake line. */
-	cfhsi->ops->cfhsi_wake_up(cfhsi->ops);
-
-	netdev_dbg(cfhsi->ndev, "%s: Start waiting.\n",
-		__func__);
-
-	/* Wait for acknowledge. */
-	ret = CFHSI_WAKE_TOUT;
-	ret = wait_event_interruptible_timeout(cfhsi->wake_up_wait,
-					test_and_clear_bit(CFHSI_WAKE_UP_ACK,
-							&cfhsi->bits), ret);
-	if (unlikely(ret < 0)) {
-		/* Interrupted by signal. */
-		netdev_err(cfhsi->ndev, "%s: Signalled: %ld.\n",
-			__func__, ret);
-
-		clear_bit(CFHSI_WAKE_UP, &cfhsi->bits);
-		cfhsi->ops->cfhsi_wake_down(cfhsi->ops);
-		return;
-	} else if (!ret) {
-		bool ca_wake = false;
-		size_t fifo_occupancy = 0;
-
-		/* Wakeup timeout */
-		netdev_dbg(cfhsi->ndev, "%s: Timeout.\n",
-			__func__);
-
-		/* Check FIFO to check if modem has sent something. */
-		WARN_ON(cfhsi->ops->cfhsi_fifo_occupancy(cfhsi->ops,
-					&fifo_occupancy));
-
-		netdev_dbg(cfhsi->ndev, "%s: Bytes in FIFO: %u.\n",
-				__func__, (unsigned) fifo_occupancy);
-
-		/* Check if we misssed the interrupt. */
-		WARN_ON(cfhsi->ops->cfhsi_get_peer_wake(cfhsi->ops,
-							&ca_wake));
-
-		if (ca_wake) {
-			netdev_err(cfhsi->ndev, "%s: CA Wake missed !.\n",
-				__func__);
-
-			/* Clear the CFHSI_WAKE_UP_ACK bit to prevent race. */
-			clear_bit(CFHSI_WAKE_UP_ACK, &cfhsi->bits);
-
-			/* Continue execution. */
-			goto wake_ack;
-		}
-
-		clear_bit(CFHSI_WAKE_UP, &cfhsi->bits);
-		cfhsi->ops->cfhsi_wake_down(cfhsi->ops);
-		return;
-	}
-wake_ack:
-	netdev_dbg(cfhsi->ndev, "%s: Woken.\n",
-		__func__);
-
-	/* Clear power up bit. */
-	set_bit(CFHSI_AWAKE, &cfhsi->bits);
-	clear_bit(CFHSI_WAKE_UP, &cfhsi->bits);
-
-	/* Resume read operation. */
-	netdev_dbg(cfhsi->ndev, "%s: Start RX.\n", __func__);
-	res = cfhsi->ops->cfhsi_rx(cfhsi->rx_ptr, cfhsi->rx_len, cfhsi->ops);
-
-	if (WARN_ON(res < 0))
-		netdev_err(cfhsi->ndev, "%s: RX err %d.\n", __func__, res);
-
-	/* Clear power up acknowledment. */
-	clear_bit(CFHSI_WAKE_UP_ACK, &cfhsi->bits);
-
-	spin_lock_bh(&cfhsi->lock);
-
-	/* Resume transmit if queues are not empty. */
-	if (!cfhsi_tx_queue_len(cfhsi)) {
-		netdev_dbg(cfhsi->ndev, "%s: Peer wake, start timer.\n",
-			__func__);
-		/* Start inactivity timer. */
-		mod_timer(&cfhsi->inactivity_timer,
-				jiffies + cfhsi->cfg.inactivity_timeout);
-		spin_unlock_bh(&cfhsi->lock);
-		return;
-	}
-
-	netdev_dbg(cfhsi->ndev, "%s: Host wake.\n",
-		__func__);
-
-	spin_unlock_bh(&cfhsi->lock);
-
-	/* Create HSI frame. */
-	len = cfhsi_tx_frm((struct cfhsi_desc *)cfhsi->tx_buf, cfhsi);
-
-	if (likely(len > 0)) {
-		/* Set up new transfer. */
-		res = cfhsi->ops->cfhsi_tx(cfhsi->tx_buf, len, cfhsi->ops);
-		if (WARN_ON(res < 0)) {
-			netdev_err(cfhsi->ndev, "%s: TX error %d.\n",
-				__func__, res);
-			cfhsi_abort_tx(cfhsi);
-		}
-	} else {
-		netdev_err(cfhsi->ndev,
-				"%s: Failed to create HSI frame: %d.\n",
-				__func__, len);
-	}
-}
-
-static void cfhsi_wake_down(struct work_struct *work)
-{
-	long ret;
-	struct cfhsi *cfhsi = NULL;
-	size_t fifo_occupancy = 0;
-	int retry = CFHSI_WAKE_TOUT;
-
-	cfhsi = container_of(work, struct cfhsi, wake_down_work);
-	netdev_dbg(cfhsi->ndev, "%s.\n", __func__);
-
-	if (test_bit(CFHSI_SHUTDOWN, &cfhsi->bits))
-		return;
-
-	/* Deactivate wake line. */
-	cfhsi->ops->cfhsi_wake_down(cfhsi->ops);
-
-	/* Wait for acknowledge. */
-	ret = CFHSI_WAKE_TOUT;
-	ret = wait_event_interruptible_timeout(cfhsi->wake_down_wait,
-					test_and_clear_bit(CFHSI_WAKE_DOWN_ACK,
-							&cfhsi->bits), ret);
-	if (ret < 0) {
-		/* Interrupted by signal. */
-		netdev_err(cfhsi->ndev, "%s: Signalled: %ld.\n",
-			__func__, ret);
-		return;
-	} else if (!ret) {
-		bool ca_wake = true;
-
-		/* Timeout */
-		netdev_err(cfhsi->ndev, "%s: Timeout.\n", __func__);
-
-		/* Check if we misssed the interrupt. */
-		WARN_ON(cfhsi->ops->cfhsi_get_peer_wake(cfhsi->ops,
-							&ca_wake));
-		if (!ca_wake)
-			netdev_err(cfhsi->ndev, "%s: CA Wake missed !.\n",
-				__func__);
-	}
-
-	/* Check FIFO occupancy. */
-	while (retry) {
-		WARN_ON(cfhsi->ops->cfhsi_fifo_occupancy(cfhsi->ops,
-							&fifo_occupancy));
-
-		if (!fifo_occupancy)
-			break;
-
-		set_current_state(TASK_INTERRUPTIBLE);
-		schedule_timeout(1);
-		retry--;
-	}
-
-	if (!retry)
-		netdev_err(cfhsi->ndev, "%s: FIFO Timeout.\n", __func__);
-
-	/* Clear AWAKE condition. */
-	clear_bit(CFHSI_AWAKE, &cfhsi->bits);
-
-	/* Cancel pending RX requests. */
-	cfhsi->ops->cfhsi_rx_cancel(cfhsi->ops);
-}
-
-static void cfhsi_out_of_sync(struct work_struct *work)
-{
-	struct cfhsi *cfhsi = NULL;
-
-	cfhsi = container_of(work, struct cfhsi, out_of_sync_work);
-
-	rtnl_lock();
-	dev_close(cfhsi->ndev);
-	rtnl_unlock();
-}
-
-static void cfhsi_wake_up_cb(struct cfhsi_cb_ops *cb_ops)
-{
-	struct cfhsi *cfhsi = NULL;
-
-	cfhsi = container_of(cb_ops, struct cfhsi, cb_ops);
-	netdev_dbg(cfhsi->ndev, "%s.\n",
-		__func__);
-
-	set_bit(CFHSI_WAKE_UP_ACK, &cfhsi->bits);
-	wake_up_interruptible(&cfhsi->wake_up_wait);
-
-	if (test_bit(CFHSI_SHUTDOWN, &cfhsi->bits))
-		return;
-
-	/* Schedule wake up work queue if the peer initiates. */
-	if (!test_and_set_bit(CFHSI_WAKE_UP, &cfhsi->bits))
-		queue_work(cfhsi->wq, &cfhsi->wake_up_work);
-}
-
-static void cfhsi_wake_down_cb(struct cfhsi_cb_ops *cb_ops)
-{
-	struct cfhsi *cfhsi = NULL;
-
-	cfhsi = container_of(cb_ops, struct cfhsi, cb_ops);
-	netdev_dbg(cfhsi->ndev, "%s.\n",
-		__func__);
-
-	/* Initiating low power is only permitted by the host (us). */
-	set_bit(CFHSI_WAKE_DOWN_ACK, &cfhsi->bits);
-	wake_up_interruptible(&cfhsi->wake_down_wait);
-}
-
-static void cfhsi_aggregation_tout(struct timer_list *t)
-{
-	struct cfhsi *cfhsi = from_timer(cfhsi, t, aggregation_timer);
-
-	netdev_dbg(cfhsi->ndev, "%s.\n",
-		__func__);
-
-	cfhsi_start_tx(cfhsi);
-}
-
-static netdev_tx_t cfhsi_xmit(struct sk_buff *skb, struct net_device *dev)
-{
-	struct cfhsi *cfhsi = NULL;
-	int start_xfer = 0;
-	int timer_active;
-	int prio;
-
-	if (!dev)
-		return -EINVAL;
-
-	cfhsi = netdev_priv(dev);
-
-	switch (skb->priority) {
-	case TC_PRIO_BESTEFFORT:
-	case TC_PRIO_FILLER:
-	case TC_PRIO_BULK:
-		prio = CFHSI_PRIO_BEBK;
-		break;
-	case TC_PRIO_INTERACTIVE_BULK:
-		prio = CFHSI_PRIO_VI;
-		break;
-	case TC_PRIO_INTERACTIVE:
-		prio = CFHSI_PRIO_VO;
-		break;
-	case TC_PRIO_CONTROL:
-	default:
-		prio = CFHSI_PRIO_CTL;
-		break;
-	}
-
-	spin_lock_bh(&cfhsi->lock);
-
-	/* Update aggregation statistics  */
-	cfhsi_update_aggregation_stats(cfhsi, skb, 1);
-
-	/* Queue the SKB */
-	skb_queue_tail(&cfhsi->qhead[prio], skb);
-
-	/* Sanity check; xmit should not be called after unregister_netdev */
-	if (WARN_ON(test_bit(CFHSI_SHUTDOWN, &cfhsi->bits))) {
-		spin_unlock_bh(&cfhsi->lock);
-		cfhsi_abort_tx(cfhsi);
-		return -EINVAL;
-	}
-
-	/* Send flow off if number of packets is above high water mark. */
-	if (!cfhsi->flow_off_sent &&
-		cfhsi_tx_queue_len(cfhsi) > cfhsi->cfg.q_high_mark &&
-		cfhsi->cfdev.flowctrl) {
-		cfhsi->flow_off_sent = 1;
-		cfhsi->cfdev.flowctrl(cfhsi->ndev, OFF);
-	}
-
-	if (cfhsi->tx_state == CFHSI_TX_STATE_IDLE) {
-		cfhsi->tx_state = CFHSI_TX_STATE_XFER;
-		start_xfer = 1;
-	}
-
-	if (!start_xfer) {
-		/* Send aggregate if it is possible */
-		bool aggregate_ready =
-			cfhsi_can_send_aggregate(cfhsi) &&
-			del_timer(&cfhsi->aggregation_timer) > 0;
-		spin_unlock_bh(&cfhsi->lock);
-		if (aggregate_ready)
-			cfhsi_start_tx(cfhsi);
-		return NETDEV_TX_OK;
-	}
-
-	/* Delete inactivity timer if started. */
-	timer_active = del_timer_sync(&cfhsi->inactivity_timer);
-
-	spin_unlock_bh(&cfhsi->lock);
-
-	if (timer_active) {
-		struct cfhsi_desc *desc = (struct cfhsi_desc *)cfhsi->tx_buf;
-		int len;
-		int res;
-
-		/* Create HSI frame. */
-		len = cfhsi_tx_frm(desc, cfhsi);
-		WARN_ON(!len);
-
-		/* Set up new transfer. */
-		res = cfhsi->ops->cfhsi_tx(cfhsi->tx_buf, len, cfhsi->ops);
-		if (WARN_ON(res < 0)) {
-			netdev_err(cfhsi->ndev, "%s: TX error %d.\n",
-				__func__, res);
-			cfhsi_abort_tx(cfhsi);
-		}
-	} else {
-		/* Schedule wake up work queue if the we initiate. */
-		if (!test_and_set_bit(CFHSI_WAKE_UP, &cfhsi->bits))
-			queue_work(cfhsi->wq, &cfhsi->wake_up_work);
-	}
-
-	return NETDEV_TX_OK;
-}
-
-static const struct net_device_ops cfhsi_netdevops;
-
-static void cfhsi_setup(struct net_device *dev)
-{
-	int i;
-	struct cfhsi *cfhsi = netdev_priv(dev);
-	dev->features = 0;
-	dev->type = ARPHRD_CAIF;
-	dev->flags = IFF_POINTOPOINT | IFF_NOARP;
-	dev->mtu = CFHSI_MAX_CAIF_FRAME_SZ;
-	dev->priv_flags |= IFF_NO_QUEUE;
-	dev->needs_free_netdev = true;
-	dev->netdev_ops = &cfhsi_netdevops;
-	for (i = 0; i < CFHSI_PRIO_LAST; ++i)
-		skb_queue_head_init(&cfhsi->qhead[i]);
-	cfhsi->cfdev.link_select = CAIF_LINK_HIGH_BANDW;
-	cfhsi->cfdev.use_frag = false;
-	cfhsi->cfdev.use_stx = false;
-	cfhsi->cfdev.use_fcs = false;
-	cfhsi->ndev = dev;
-	cfhsi->cfg = hsi_default_config;
-}
-
-static int cfhsi_open(struct net_device *ndev)
-{
-	struct cfhsi *cfhsi = netdev_priv(ndev);
-	int res;
-
-	clear_bit(CFHSI_SHUTDOWN, &cfhsi->bits);
-
-	/* Initialize state vaiables. */
-	cfhsi->tx_state = CFHSI_TX_STATE_IDLE;
-	cfhsi->rx_state.state = CFHSI_RX_STATE_DESC;
-
-	/* Set flow info */
-	cfhsi->flow_off_sent = 0;
-
-	/*
-	 * Allocate a TX buffer with the size of a HSI packet descriptors
-	 * and the necessary room for CAIF payload frames.
-	 */
-	cfhsi->tx_buf = kzalloc(CFHSI_BUF_SZ_TX, GFP_KERNEL);
-	if (!cfhsi->tx_buf) {
-		res = -ENODEV;
-		goto err_alloc_tx;
-	}
-
-	/*
-	 * Allocate a RX buffer with the size of two HSI packet descriptors and
-	 * the necessary room for CAIF payload frames.
-	 */
-	cfhsi->rx_buf = kzalloc(CFHSI_BUF_SZ_RX, GFP_KERNEL);
-	if (!cfhsi->rx_buf) {
-		res = -ENODEV;
-		goto err_alloc_rx;
-	}
-
-	cfhsi->rx_flip_buf = kzalloc(CFHSI_BUF_SZ_RX, GFP_KERNEL);
-	if (!cfhsi->rx_flip_buf) {
-		res = -ENODEV;
-		goto err_alloc_rx_flip;
-	}
-
-	/* Initialize aggregation timeout */
-	cfhsi->cfg.aggregation_timeout = hsi_default_config.aggregation_timeout;
-
-	/* Initialize recieve vaiables. */
-	cfhsi->rx_ptr = cfhsi->rx_buf;
-	cfhsi->rx_len = CFHSI_DESC_SZ;
-
-	/* Initialize spin locks. */
-	spin_lock_init(&cfhsi->lock);
-
-	/* Set up the driver. */
-	cfhsi->cb_ops.tx_done_cb = cfhsi_tx_done_cb;
-	cfhsi->cb_ops.rx_done_cb = cfhsi_rx_done_cb;
-	cfhsi->cb_ops.wake_up_cb = cfhsi_wake_up_cb;
-	cfhsi->cb_ops.wake_down_cb = cfhsi_wake_down_cb;
-
-	/* Initialize the work queues. */
-	INIT_WORK(&cfhsi->wake_up_work, cfhsi_wake_up);
-	INIT_WORK(&cfhsi->wake_down_work, cfhsi_wake_down);
-	INIT_WORK(&cfhsi->out_of_sync_work, cfhsi_out_of_sync);
-
-	/* Clear all bit fields. */
-	clear_bit(CFHSI_WAKE_UP_ACK, &cfhsi->bits);
-	clear_bit(CFHSI_WAKE_DOWN_ACK, &cfhsi->bits);
-	clear_bit(CFHSI_WAKE_UP, &cfhsi->bits);
-	clear_bit(CFHSI_AWAKE, &cfhsi->bits);
-
-	/* Create work thread. */
-	cfhsi->wq = alloc_ordered_workqueue(cfhsi->ndev->name, WQ_MEM_RECLAIM);
-	if (!cfhsi->wq) {
-		netdev_err(cfhsi->ndev, "%s: Failed to create work queue.\n",
-			__func__);
-		res = -ENODEV;
-		goto err_create_wq;
-	}
-
-	/* Initialize wait queues. */
-	init_waitqueue_head(&cfhsi->wake_up_wait);
-	init_waitqueue_head(&cfhsi->wake_down_wait);
-	init_waitqueue_head(&cfhsi->flush_fifo_wait);
-
-	/* Setup the inactivity timer. */
-	timer_setup(&cfhsi->inactivity_timer, cfhsi_inactivity_tout, 0);
-	/* Setup the slowpath RX timer. */
-	timer_setup(&cfhsi->rx_slowpath_timer, cfhsi_rx_slowpath, 0);
-	/* Setup the aggregation timer. */
-	timer_setup(&cfhsi->aggregation_timer, cfhsi_aggregation_tout, 0);
-
-	/* Activate HSI interface. */
-	res = cfhsi->ops->cfhsi_up(cfhsi->ops);
-	if (res) {
-		netdev_err(cfhsi->ndev,
-			"%s: can't activate HSI interface: %d.\n",
-			__func__, res);
-		goto err_activate;
-	}
-
-	/* Flush FIFO */
-	res = cfhsi_flush_fifo(cfhsi);
-	if (res) {
-		netdev_err(cfhsi->ndev, "%s: Can't flush FIFO: %d.\n",
-			__func__, res);
-		goto err_net_reg;
-	}
-	return res;
-
- err_net_reg:
-	cfhsi->ops->cfhsi_down(cfhsi->ops);
- err_activate:
-	destroy_workqueue(cfhsi->wq);
- err_create_wq:
-	kfree(cfhsi->rx_flip_buf);
- err_alloc_rx_flip:
-	kfree(cfhsi->rx_buf);
- err_alloc_rx:
-	kfree(cfhsi->tx_buf);
- err_alloc_tx:
-	return res;
-}
-
-static int cfhsi_close(struct net_device *ndev)
-{
-	struct cfhsi *cfhsi = netdev_priv(ndev);
-	u8 *tx_buf, *rx_buf, *flip_buf;
-
-	/* going to shutdown driver */
-	set_bit(CFHSI_SHUTDOWN, &cfhsi->bits);
-
-	/* Delete timers if pending */
-	del_timer_sync(&cfhsi->inactivity_timer);
-	del_timer_sync(&cfhsi->rx_slowpath_timer);
-	del_timer_sync(&cfhsi->aggregation_timer);
-
-	/* Cancel pending RX request (if any) */
-	cfhsi->ops->cfhsi_rx_cancel(cfhsi->ops);
-
-	/* Destroy workqueue */
-	destroy_workqueue(cfhsi->wq);
-
-	/* Store bufferes: will be freed later. */
-	tx_buf = cfhsi->tx_buf;
-	rx_buf = cfhsi->rx_buf;
-	flip_buf = cfhsi->rx_flip_buf;
-	/* Flush transmit queues. */
-	cfhsi_abort_tx(cfhsi);
-
-	/* Deactivate interface */
-	cfhsi->ops->cfhsi_down(cfhsi->ops);
-
-	/* Free buffers. */
-	kfree(tx_buf);
-	kfree(rx_buf);
-	kfree(flip_buf);
-	return 0;
-}
-
-static void cfhsi_uninit(struct net_device *dev)
-{
-	struct cfhsi *cfhsi = netdev_priv(dev);
-	ASSERT_RTNL();
-	symbol_put(cfhsi_get_device);
-	list_del(&cfhsi->list);
-}
-
-static const struct net_device_ops cfhsi_netdevops = {
-	.ndo_uninit = cfhsi_uninit,
-	.ndo_open = cfhsi_open,
-	.ndo_stop = cfhsi_close,
-	.ndo_start_xmit = cfhsi_xmit
-};
-
-static void cfhsi_netlink_parms(struct nlattr *data[], struct cfhsi *cfhsi)
-{
-	int i;
-
-	if (!data) {
-		pr_debug("no params data found\n");
-		return;
-	}
-
-	i = __IFLA_CAIF_HSI_INACTIVITY_TOUT;
-	/*
-	 * Inactivity timeout in millisecs. Lowest possible value is 1,
-	 * and highest possible is NEXT_TIMER_MAX_DELTA.
-	 */
-	if (data[i]) {
-		u32 inactivity_timeout = nla_get_u32(data[i]);
-		/* Pre-calculate inactivity timeout. */
-		cfhsi->cfg.inactivity_timeout =	inactivity_timeout * HZ / 1000;
-		if (cfhsi->cfg.inactivity_timeout == 0)
-			cfhsi->cfg.inactivity_timeout = 1;
-		else if (cfhsi->cfg.inactivity_timeout > NEXT_TIMER_MAX_DELTA)
-			cfhsi->cfg.inactivity_timeout = NEXT_TIMER_MAX_DELTA;
-	}
-
-	i = __IFLA_CAIF_HSI_AGGREGATION_TOUT;
-	if (data[i])
-		cfhsi->cfg.aggregation_timeout = nla_get_u32(data[i]);
-
-	i = __IFLA_CAIF_HSI_HEAD_ALIGN;
-	if (data[i])
-		cfhsi->cfg.head_align = nla_get_u32(data[i]);
-
-	i = __IFLA_CAIF_HSI_TAIL_ALIGN;
-	if (data[i])
-		cfhsi->cfg.tail_align = nla_get_u32(data[i]);
-
-	i = __IFLA_CAIF_HSI_QHIGH_WATERMARK;
-	if (data[i])
-		cfhsi->cfg.q_high_mark = nla_get_u32(data[i]);
-
-	i = __IFLA_CAIF_HSI_QLOW_WATERMARK;
-	if (data[i])
-		cfhsi->cfg.q_low_mark = nla_get_u32(data[i]);
-}
-
-static int caif_hsi_changelink(struct net_device *dev, struct nlattr *tb[],
-			       struct nlattr *data[],
-			       struct netlink_ext_ack *extack)
-{
-	cfhsi_netlink_parms(data, netdev_priv(dev));
-	netdev_state_change(dev);
-	return 0;
-}
-
-static const struct nla_policy caif_hsi_policy[__IFLA_CAIF_HSI_MAX + 1] = {
-	[__IFLA_CAIF_HSI_INACTIVITY_TOUT] = { .type = NLA_U32, .len = 4 },
-	[__IFLA_CAIF_HSI_AGGREGATION_TOUT] = { .type = NLA_U32, .len = 4 },
-	[__IFLA_CAIF_HSI_HEAD_ALIGN] = { .type = NLA_U32, .len = 4 },
-	[__IFLA_CAIF_HSI_TAIL_ALIGN] = { .type = NLA_U32, .len = 4 },
-	[__IFLA_CAIF_HSI_QHIGH_WATERMARK] = { .type = NLA_U32, .len = 4 },
-	[__IFLA_CAIF_HSI_QLOW_WATERMARK] = { .type = NLA_U32, .len = 4 },
-};
-
-static size_t caif_hsi_get_size(const struct net_device *dev)
-{
-	int i;
-	size_t s = 0;
-	for (i = __IFLA_CAIF_HSI_UNSPEC + 1; i < __IFLA_CAIF_HSI_MAX; i++)
-		s += nla_total_size(caif_hsi_policy[i].len);
-	return s;
-}
-
-static int caif_hsi_fill_info(struct sk_buff *skb, const struct net_device *dev)
-{
-	struct cfhsi *cfhsi = netdev_priv(dev);
-
-	if (nla_put_u32(skb, __IFLA_CAIF_HSI_INACTIVITY_TOUT,
-			cfhsi->cfg.inactivity_timeout) ||
-	    nla_put_u32(skb, __IFLA_CAIF_HSI_AGGREGATION_TOUT,
-			cfhsi->cfg.aggregation_timeout) ||
-	    nla_put_u32(skb, __IFLA_CAIF_HSI_HEAD_ALIGN,
-			cfhsi->cfg.head_align) ||
-	    nla_put_u32(skb, __IFLA_CAIF_HSI_TAIL_ALIGN,
-			cfhsi->cfg.tail_align) ||
-	    nla_put_u32(skb, __IFLA_CAIF_HSI_QHIGH_WATERMARK,
-			cfhsi->cfg.q_high_mark) ||
-	    nla_put_u32(skb, __IFLA_CAIF_HSI_QLOW_WATERMARK,
-			cfhsi->cfg.q_low_mark))
-		return -EMSGSIZE;
-
-	return 0;
-}
-
-static int caif_hsi_newlink(struct net *src_net, struct net_device *dev,
-			    struct nlattr *tb[], struct nlattr *data[],
-			    struct netlink_ext_ack *extack)
-{
-	struct cfhsi *cfhsi = NULL;
-	struct cfhsi_ops *(*get_ops)(void);
-
-	ASSERT_RTNL();
-
-	cfhsi = netdev_priv(dev);
-	cfhsi_netlink_parms(data, cfhsi);
-
-	get_ops = symbol_get(cfhsi_get_ops);
-	if (!get_ops) {
-		pr_err("%s: failed to get the cfhsi_ops\n", __func__);
-		return -ENODEV;
-	}
-
-	/* Assign the HSI device. */
-	cfhsi->ops = (*get_ops)();
-	if (!cfhsi->ops) {
-		pr_err("%s: failed to get the cfhsi_ops\n", __func__);
-		goto err;
-	}
-
-	/* Assign the driver to this HSI device. */
-	cfhsi->ops->cb_ops = &cfhsi->cb_ops;
-	if (register_netdevice(dev)) {
-		pr_warn("%s: caif_hsi device registration failed\n", __func__);
-		goto err;
-	}
-	/* Add CAIF HSI device to list. */
-	list_add_tail(&cfhsi->list, &cfhsi_list);
-
-	return 0;
-err:
-	symbol_put(cfhsi_get_ops);
-	return -ENODEV;
-}
-
-static struct rtnl_link_ops caif_hsi_link_ops __read_mostly = {
-	.kind		= "cfhsi",
-	.priv_size	= sizeof(struct cfhsi),
-	.setup		= cfhsi_setup,
-	.maxtype	= __IFLA_CAIF_HSI_MAX,
-	.policy	= caif_hsi_policy,
-	.newlink	= caif_hsi_newlink,
-	.changelink	= caif_hsi_changelink,
-	.get_size	= caif_hsi_get_size,
-	.fill_info	= caif_hsi_fill_info,
-};
-
-static void __exit cfhsi_exit_module(void)
-{
-	struct list_head *list_node;
-	struct list_head *n;
-	struct cfhsi *cfhsi;
-
-	rtnl_link_unregister(&caif_hsi_link_ops);
-
-	rtnl_lock();
-	list_for_each_safe(list_node, n, &cfhsi_list) {
-		cfhsi = list_entry(list_node, struct cfhsi, list);
-		unregister_netdevice(cfhsi->ndev);
-	}
-	rtnl_unlock();
-}
-
-static int __init cfhsi_init_module(void)
-{
-	return rtnl_link_register(&caif_hsi_link_ops);
-}
-
-module_init(cfhsi_init_module);
-module_exit(cfhsi_exit_module);
diff --git a/include/net/caif/caif_hsi.h b/include/net/caif/caif_hsi.h
deleted file mode 100644
index 552cf68d28d2..000000000000
--- a/include/net/caif/caif_hsi.h
+++ /dev/null
@@ -1,200 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) ST-Ericsson AB 2010
- * Author:  Daniel Martensson / daniel.martensson@stericsson.com
- *	    Dmitry.Tarnyagin  / dmitry.tarnyagin@stericsson.com
- */
-
-#ifndef CAIF_HSI_H_
-#define CAIF_HSI_H_
-
-#include <net/caif/caif_layer.h>
-#include <net/caif/caif_device.h>
-#include <linux/atomic.h>
-
-/*
- * Maximum number of CAIF frames that can reside in the same HSI frame.
- */
-#define CFHSI_MAX_PKTS 15
-
-/*
- * Maximum number of bytes used for the frame that can be embedded in the
- * HSI descriptor.
- */
-#define CFHSI_MAX_EMB_FRM_SZ 96
-
-/*
- * Decides if HSI buffers should be prefilled with 0xFF pattern for easier
- * debugging. Both TX and RX buffers will be filled before the transfer.
- */
-#define CFHSI_DBG_PREFILL		0
-
-/* Structure describing a HSI packet descriptor. */
-#pragma pack(1) /* Byte alignment. */
-struct cfhsi_desc {
-	u8 header;
-	u8 offset;
-	u16 cffrm_len[CFHSI_MAX_PKTS];
-	u8 emb_frm[CFHSI_MAX_EMB_FRM_SZ];
-};
-#pragma pack() /* Default alignment. */
-
-/* Size of the complete HSI packet descriptor. */
-#define CFHSI_DESC_SZ (sizeof(struct cfhsi_desc))
-
-/*
- * Size of the complete HSI packet descriptor excluding the optional embedded
- * CAIF frame.
- */
-#define CFHSI_DESC_SHORT_SZ (CFHSI_DESC_SZ - CFHSI_MAX_EMB_FRM_SZ)
-
-/*
- * Maximum bytes transferred in one transfer.
- */
-#define CFHSI_MAX_CAIF_FRAME_SZ 4096
-
-#define CFHSI_MAX_PAYLOAD_SZ (CFHSI_MAX_PKTS * CFHSI_MAX_CAIF_FRAME_SZ)
-
-/* Size of the complete HSI TX buffer. */
-#define CFHSI_BUF_SZ_TX (CFHSI_DESC_SZ + CFHSI_MAX_PAYLOAD_SZ)
-
-/* Size of the complete HSI RX buffer. */
-#define CFHSI_BUF_SZ_RX ((2 * CFHSI_DESC_SZ) + CFHSI_MAX_PAYLOAD_SZ)
-
-/* Bitmasks for the HSI descriptor. */
-#define CFHSI_PIGGY_DESC		(0x01 << 7)
-
-#define CFHSI_TX_STATE_IDLE			0
-#define CFHSI_TX_STATE_XFER			1
-
-#define CFHSI_RX_STATE_DESC			0
-#define CFHSI_RX_STATE_PAYLOAD			1
-
-/* Bitmasks for power management. */
-#define CFHSI_WAKE_UP				0
-#define CFHSI_WAKE_UP_ACK			1
-#define CFHSI_WAKE_DOWN_ACK			2
-#define CFHSI_AWAKE				3
-#define CFHSI_WAKELOCK_HELD			4
-#define CFHSI_SHUTDOWN				5
-#define CFHSI_FLUSH_FIFO			6
-
-#ifndef CFHSI_INACTIVITY_TOUT
-#define CFHSI_INACTIVITY_TOUT			(1 * HZ)
-#endif /* CFHSI_INACTIVITY_TOUT */
-
-#ifndef CFHSI_WAKE_TOUT
-#define CFHSI_WAKE_TOUT			(3 * HZ)
-#endif /* CFHSI_WAKE_TOUT */
-
-#ifndef CFHSI_MAX_RX_RETRIES
-#define CFHSI_MAX_RX_RETRIES		(10 * HZ)
-#endif
-
-/* Structure implemented by the CAIF HSI driver. */
-struct cfhsi_cb_ops {
-	void (*tx_done_cb) (struct cfhsi_cb_ops *drv);
-	void (*rx_done_cb) (struct cfhsi_cb_ops *drv);
-	void (*wake_up_cb) (struct cfhsi_cb_ops *drv);
-	void (*wake_down_cb) (struct cfhsi_cb_ops *drv);
-};
-
-/* Structure implemented by HSI device. */
-struct cfhsi_ops {
-	int (*cfhsi_up) (struct cfhsi_ops *dev);
-	int (*cfhsi_down) (struct cfhsi_ops *dev);
-	int (*cfhsi_tx) (u8 *ptr, int len, struct cfhsi_ops *dev);
-	int (*cfhsi_rx) (u8 *ptr, int len, struct cfhsi_ops *dev);
-	int (*cfhsi_wake_up) (struct cfhsi_ops *dev);
-	int (*cfhsi_wake_down) (struct cfhsi_ops *dev);
-	int (*cfhsi_get_peer_wake) (struct cfhsi_ops *dev, bool *status);
-	int (*cfhsi_fifo_occupancy) (struct cfhsi_ops *dev, size_t *occupancy);
-	int (*cfhsi_rx_cancel)(struct cfhsi_ops *dev);
-	struct cfhsi_cb_ops *cb_ops;
-};
-
-/* Structure holds status of received CAIF frames processing */
-struct cfhsi_rx_state {
-	int state;
-	int nfrms;
-	int pld_len;
-	int retries;
-	bool piggy_desc;
-};
-
-/* Priority mapping */
-enum {
-	CFHSI_PRIO_CTL = 0,
-	CFHSI_PRIO_VI,
-	CFHSI_PRIO_VO,
-	CFHSI_PRIO_BEBK,
-	CFHSI_PRIO_LAST,
-};
-
-struct cfhsi_config {
-	u32 inactivity_timeout;
-	u32 aggregation_timeout;
-	u32 head_align;
-	u32 tail_align;
-	u32 q_high_mark;
-	u32 q_low_mark;
-};
-
-/* Structure implemented by CAIF HSI drivers. */
-struct cfhsi {
-	struct caif_dev_common cfdev;
-	struct net_device *ndev;
-	struct platform_device *pdev;
-	struct sk_buff_head qhead[CFHSI_PRIO_LAST];
-	struct cfhsi_cb_ops cb_ops;
-	struct cfhsi_ops *ops;
-	int tx_state;
-	struct cfhsi_rx_state rx_state;
-	struct cfhsi_config cfg;
-	int rx_len;
-	u8 *rx_ptr;
-	u8 *tx_buf;
-	u8 *rx_buf;
-	u8 *rx_flip_buf;
-	spinlock_t lock;
-	int flow_off_sent;
-	struct list_head list;
-	struct work_struct wake_up_work;
-	struct work_struct wake_down_work;
-	struct work_struct out_of_sync_work;
-	struct workqueue_struct *wq;
-	wait_queue_head_t wake_up_wait;
-	wait_queue_head_t wake_down_wait;
-	wait_queue_head_t flush_fifo_wait;
-	struct timer_list inactivity_timer;
-	struct timer_list rx_slowpath_timer;
-
-	/* TX aggregation */
-	int aggregation_len;
-	struct timer_list aggregation_timer;
-
-	unsigned long bits;
-};
-extern struct platform_driver cfhsi_driver;
-
-/**
- * enum ifla_caif_hsi - CAIF HSI NetlinkRT parameters.
- * @IFLA_CAIF_HSI_INACTIVITY_TOUT: Inactivity timeout before
- *			taking the HSI wakeline down, in milliseconds.
- * When using RT Netlink to create, destroy or configure a CAIF HSI interface,
- * enum ifla_caif_hsi is used to specify the configuration attributes.
- */
-enum ifla_caif_hsi {
-	__IFLA_CAIF_HSI_UNSPEC,
-	__IFLA_CAIF_HSI_INACTIVITY_TOUT,
-	__IFLA_CAIF_HSI_AGGREGATION_TOUT,
-	__IFLA_CAIF_HSI_HEAD_ALIGN,
-	__IFLA_CAIF_HSI_TAIL_ALIGN,
-	__IFLA_CAIF_HSI_QHIGH_WATERMARK,
-	__IFLA_CAIF_HSI_QLOW_WATERMARK,
-	__IFLA_CAIF_HSI_MAX
-};
-
-struct cfhsi_ops *cfhsi_get_ops(void);
-
-#endif		/* CAIF_HSI_H_ */

From 71158bb1f2d2da61385c58fc1114e1a1c19984ba Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Wed, 30 Jun 2021 13:42:13 +0200
Subject: [PATCH 054/158] tcp: consistently disable header prediction for mptcp

The MPTCP receive path is hooked only into the TCP slow-path.
The DSS presence allows plain MPTCP traffic to hit that
consistently.

Since commit e1ff9e82e2ea ("net: mptcp: improve fallback to TCP"),
when an MPTCP socket falls back to TCP, it can hit the TCP receive
fast-path, and delay or stop triggering the event notification.

Address the issue by explicitly disabling header prediction
for MPTCP sockets.

Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/200
Fixes: e1ff9e82e2ea ("net: mptcp: improve fallback to TCP")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index e668f1bf780d..17df9b047ee4 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -686,6 +686,10 @@ static inline u32 __tcp_set_rto(const struct tcp_sock *tp)
 
 static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
 {
+	/* mptcp hooks are only on the slow path */
+	if (sk_is_mptcp((struct sock *)tp))
+		return;
+
 	tp->pred_flags = htonl((tp->tcp_header_len << 26) |
 			       ntohl(TCP_FLAG_ACK) |
 			       snd_wnd);

From 18a419bad63b7f68a1979e28459782518e7b6bbe Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 30 Jun 2021 09:42:44 -0700
Subject: [PATCH 055/158] udp: annotate data races around udp_sk(sk)->gso_size

Accesses to udp_sk(sk)->gso_size are lockless.
Add READ_ONCE()/WRITE_ONCE() around them.

BUG: KCSAN: data-race in udp_lib_setsockopt / udpv6_sendmsg

write to 0xffff88812d78f47c of 2 bytes by task 10849 on cpu 1:
 udp_lib_setsockopt+0x3b3/0x710 net/ipv4/udp.c:2696
 udpv6_setsockopt+0x63/0x90 net/ipv6/udp.c:1630
 sock_common_setsockopt+0x5d/0x70 net/core/sock.c:3265
 __sys_setsockopt+0x18f/0x200 net/socket.c:2104
 __do_sys_setsockopt net/socket.c:2115 [inline]
 __se_sys_setsockopt net/socket.c:2112 [inline]
 __x64_sys_setsockopt+0x62/0x70 net/socket.c:2112
 do_syscall_64+0x4a/0x90 arch/x86/entry/common.c:47
 entry_SYSCALL_64_after_hwframe+0x44/0xae

read to 0xffff88812d78f47c of 2 bytes by task 10852 on cpu 0:
 udpv6_sendmsg+0x161/0x16b0 net/ipv6/udp.c:1299
 inet6_sendmsg+0x5f/0x80 net/ipv6/af_inet6.c:642
 sock_sendmsg_nosec net/socket.c:654 [inline]
 sock_sendmsg net/socket.c:674 [inline]
 ____sys_sendmsg+0x360/0x4d0 net/socket.c:2337
 ___sys_sendmsg net/socket.c:2391 [inline]
 __sys_sendmmsg+0x315/0x4b0 net/socket.c:2477
 __do_sys_sendmmsg net/socket.c:2506 [inline]
 __se_sys_sendmmsg net/socket.c:2503 [inline]
 __x64_sys_sendmmsg+0x53/0x60 net/socket.c:2503
 do_syscall_64+0x4a/0x90 arch/x86/entry/common.c:47
 entry_SYSCALL_64_after_hwframe+0x44/0xae

value changed: 0x0000 -> 0x0005

Reported by Kernel Concurrency Sanitizer on:
CPU: 0 PID: 10852 Comm: syz-executor.0 Not tainted 5.13.0-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011

Fixes: bec1f6f69736 ("udp: generate gso with UDP_SEGMENT")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp.c | 6 +++---
 net/ipv6/udp.c | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 62682807b4b2..62cd4cd52e84 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1102,7 +1102,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	}
 
 	ipcm_init_sk(&ipc, inet);
-	ipc.gso_size = up->gso_size;
+	ipc.gso_size = READ_ONCE(up->gso_size);
 
 	if (msg->msg_controllen) {
 		err = udp_cmsg_send(sk, msg, &ipc.gso_size);
@@ -2695,7 +2695,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
 	case UDP_SEGMENT:
 		if (val < 0 || val > USHRT_MAX)
 			return -EINVAL;
-		up->gso_size = val;
+		WRITE_ONCE(up->gso_size, val);
 		break;
 
 	case UDP_GRO:
@@ -2790,7 +2790,7 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,
 		break;
 
 	case UDP_SEGMENT:
-		val = up->gso_size;
+		val = READ_ONCE(up->gso_size);
 		break;
 
 	case UDP_GRO:
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 368972dbd919..0cc7ba531b34 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1296,7 +1296,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
 
 	ipcm6_init(&ipc6);
-	ipc6.gso_size = up->gso_size;
+	ipc6.gso_size = READ_ONCE(up->gso_size);
 	ipc6.sockc.tsflags = sk->sk_tsflags;
 	ipc6.sockc.mark = sk->sk_mark;
 

From b6c8801038234a6d837dfc683d79676d4022ad79 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Thu, 1 Jul 2021 14:02:20 +0200
Subject: [PATCH 056/158] ARM: dts: qcom-apq8060: Correct Ethernet node name
 and drop bogus irq property

make dtbs_check:

    ethernet-ebi2@2,0: $nodename:0: 'ethernet-ebi2@2,0' does not match '^ethernet(@.*)?$'
    ethernet-ebi2@2,0: 'smsc,irq-active-low' does not match any of the regexes: 'pinctrl-[0-9]+'

There is no "smsc,irq-active-low" property, as active low is the
default.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/arm/boot/dts/qcom-apq8060-dragonboard.dts | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/arch/arm/boot/dts/qcom-apq8060-dragonboard.dts b/arch/arm/boot/dts/qcom-apq8060-dragonboard.dts
index dace8ffeb991..0a4ffd10c484 100644
--- a/arch/arm/boot/dts/qcom-apq8060-dragonboard.dts
+++ b/arch/arm/boot/dts/qcom-apq8060-dragonboard.dts
@@ -581,7 +581,7 @@
 			 * EBI2. This has a 25MHz chrystal next to it, so no
 			 * clocking is needed.
 			 */
-			ethernet-ebi2@2,0 {
+			ethernet@2,0 {
 				compatible = "smsc,lan9221", "smsc,lan9115";
 				reg = <2 0x0 0x100>;
 				/*
@@ -598,8 +598,6 @@
 				phy-mode = "mii";
 				reg-io-width = <2>;
 				smsc,force-external-phy;
-				/* IRQ on edge falling = active low */
-				smsc,irq-active-low;
 				smsc,irq-push-pull;
 
 				/*

From 19373d0233d04ede649eee4269922e20dd133cf2 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Thu, 1 Jul 2021 14:02:21 +0200
Subject: [PATCH 057/158] dt-bindings: net: sms911x: Convert to json-schema

Convert the Smart Mixed-Signal Connectivity (SMSC) LAN911x/912x
Controller Device Tree binding documentation to json-schema.

Document missing properties.
Make "phy-mode" not required, as many DTS files do not have it, and the
Linux driver falls back to PHY_INTERFACE_MODE_NA.
Correct nodename in example.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../devicetree/bindings/net/gpmc-eth.txt      |   2 +-
 .../devicetree/bindings/net/smsc,lan9115.yaml | 110 ++++++++++++++++++
 .../devicetree/bindings/net/smsc911x.txt      |  43 -------
 3 files changed, 111 insertions(+), 44 deletions(-)
 create mode 100644 Documentation/devicetree/bindings/net/smsc,lan9115.yaml
 delete mode 100644 Documentation/devicetree/bindings/net/smsc911x.txt

diff --git a/Documentation/devicetree/bindings/net/gpmc-eth.txt b/Documentation/devicetree/bindings/net/gpmc-eth.txt
index f7da3d73ca1b..32821066a85b 100644
--- a/Documentation/devicetree/bindings/net/gpmc-eth.txt
+++ b/Documentation/devicetree/bindings/net/gpmc-eth.txt
@@ -13,7 +13,7 @@ Documentation/devicetree/bindings/memory-controllers/omap-gpmc.txt
 
 For the properties relevant to the ethernet controller connected to the GPMC
 refer to the binding documentation of the device. For example, the documentation
-for the SMSC 911x is Documentation/devicetree/bindings/net/smsc911x.txt
+for the SMSC 911x is Documentation/devicetree/bindings/net/smsc,lan9115.yaml
 
 Child nodes need to specify the GPMC bus address width using the "bank-width"
 property but is possible that an ethernet controller also has a property to
diff --git a/Documentation/devicetree/bindings/net/smsc,lan9115.yaml b/Documentation/devicetree/bindings/net/smsc,lan9115.yaml
new file mode 100644
index 000000000000..f86667cbcca8
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/smsc,lan9115.yaml
@@ -0,0 +1,110 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/net/smsc,lan9115.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Smart Mixed-Signal Connectivity (SMSC) LAN911x/912x Controller
+
+maintainers:
+  - Shawn Guo <shawnguo@kernel.org>
+
+allOf:
+  - $ref: ethernet-controller.yaml#
+
+properties:
+  compatible:
+    oneOf:
+      - const: smsc,lan9115
+      - items:
+          - enum:
+              - smsc,lan89218
+              - smsc,lan9117
+              - smsc,lan9118
+              - smsc,lan9220
+              - smsc,lan9221
+          - const: smsc,lan9115
+
+  reg:
+    maxItems: 1
+
+  reg-shift: true
+
+  reg-io-width:
+    enum: [ 2, 4 ]
+    default: 2
+
+  interrupts:
+    minItems: 1
+    items:
+      - description:
+          LAN interrupt line
+      - description:
+          Optional PME (power management event) interrupt that is able to wake
+          up the host system with a 50ms pulse on network activity
+
+  clocks:
+    maxItems: 1
+
+  phy-mode: true
+
+  smsc,irq-active-high:
+    type: boolean
+    description: Indicates the IRQ polarity is active-high
+
+  smsc,irq-push-pull:
+    type: boolean
+    description: Indicates the IRQ type is push-pull
+
+  smsc,force-internal-phy:
+    type: boolean
+    description: Forces SMSC LAN controller to use internal PHY
+
+  smsc,force-external-phy:
+    type: boolean
+    description: Forces SMSC LAN controller to use external PHY
+
+  smsc,save-mac-address:
+    type: boolean
+    description:
+      Indicates that MAC address needs to be saved before resetting the
+      controller
+
+  reset-gpios:
+    maxItems: 1
+    description:
+      A GPIO line connected to the RESET (active low) signal of the device.
+      On many systems this is wired high so the device goes out of reset at
+      power-on, but if it is under program control, this optional GPIO can
+      wake up in response to it.
+
+  vdd33a-supply:
+    description: 3.3V analog power supply
+
+  vddvario-supply:
+    description: IO logic power supply
+
+required:
+  - compatible
+  - reg
+  - interrupts
+
+# There are lots of bus-specific properties ("qcom,*", "samsung,*", "fsl,*",
+# "gpmc,*", ...) to be found, that actually depend on the compatible value of
+# the parent node.
+additionalProperties: true
+
+examples:
+  - |
+    #include <dt-bindings/gpio/gpio.h>
+
+    ethernet@f4000000 {
+            compatible = "smsc,lan9220", "smsc,lan9115";
+            reg = <0xf4000000 0x2000000>;
+            phy-mode = "mii";
+            interrupt-parent = <&gpio1>;
+            interrupts = <31>, <32>;
+            reset-gpios = <&gpio1 30 GPIO_ACTIVE_LOW>;
+            reg-io-width = <4>;
+            smsc,irq-push-pull;
+    };
diff --git a/Documentation/devicetree/bindings/net/smsc911x.txt b/Documentation/devicetree/bindings/net/smsc911x.txt
deleted file mode 100644
index acfafc8e143c..000000000000
--- a/Documentation/devicetree/bindings/net/smsc911x.txt
+++ /dev/null
@@ -1,43 +0,0 @@
-* Smart Mixed-Signal Connectivity (SMSC) LAN911x/912x Controller
-
-Required properties:
-- compatible : Should be "smsc,lan<model>", "smsc,lan9115"
-- reg : Address and length of the io space for SMSC LAN
-- interrupts : one or two interrupt specifiers
-  - The first interrupt is the SMSC LAN interrupt line
-  - The second interrupt (if present) is the PME (power
-    management event) interrupt that is able to wake up the host
-     system with a 50ms pulse on network activity
-- phy-mode : See ethernet.txt file in the same directory
-
-Optional properties:
-- reg-shift : Specify the quantity to shift the register offsets by
-- reg-io-width : Specify the size (in bytes) of the IO accesses that
-  should be performed on the device.  Valid value for SMSC LAN is
-  2 or 4.  If it's omitted or invalid, the size would be 2.
-- smsc,irq-active-high : Indicates the IRQ polarity is active-high
-- smsc,irq-push-pull : Indicates the IRQ type is push-pull
-- smsc,force-internal-phy : Forces SMSC LAN controller to use
-  internal PHY
-- smsc,force-external-phy : Forces SMSC LAN controller to use
-  external PHY
-- smsc,save-mac-address : Indicates that mac address needs to be saved
-  before resetting the controller
-- reset-gpios : a GPIO line connected to the RESET (active low) signal
-  of the device. On many systems this is wired high so the device goes
-  out of reset at power-on, but if it is under program control, this
-  optional GPIO can wake up in response to it.
-- vdd33a-supply, vddvario-supply : 3.3V analog and IO logic power supplies
-
-Examples:
-
-lan9220@f4000000 {
-	compatible = "smsc,lan9220", "smsc,lan9115";
-	reg = <0xf4000000 0x2000000>;
-	phy-mode = "mii";
-	interrupt-parent = <&gpio1>;
-	interrupts = <31>, <32>;
-	reset-gpios = <&gpio1 30 GPIO_ACTIVE_LOW>;
-	reg-io-width = <4>;
-	smsc,irq-push-pull;
-};

From 6b28a86d6c0bb02119f386ec2f56efde909e9bcb Mon Sep 17 00:00:00 2001
From: Mohammad Athari Bin Ismail <mohammad.athari.ismail@intel.com>
Date: Wed, 30 Jun 2021 17:59:35 +0800
Subject: [PATCH 058/158] net: stmmac: Terminate FPE workqueue in suspend

Add stmmac_fpe_stop_wq() in stmmac_suspend() to terminate the FPE workqueue
during suspend, so that no FPE workqueue exists while the device is
suspended. Without this fix, an additional FPE workqueue is created on
every suspend->resume cycle.

Fixes: 5a5586112b92 ("net: stmmac: support FPE link partner hand-shaking procedure")
Signed-off-by: Mohammad Athari Bin Ismail <mohammad.athari.ismail@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 8d9d6ecf8c63..7b8404a21544 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -7171,6 +7171,7 @@ int stmmac_suspend(struct device *dev)
 				     priv->plat->rx_queues_to_use, false);
 
 		stmmac_fpe_handshake(priv, false);
+		stmmac_fpe_stop_wq(priv);
 	}
 
 	priv->speed = SPEED_UNKNOWN;

From 2342ae10d1272d411a468a85a67647dd115b344f Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Thu, 1 Jul 2021 22:18:24 +0200
Subject: [PATCH 059/158] gve: Fix an error handling path in 'gve_probe()'

If the 'register_netdev()' call fails, we must release the resources
allocated by the previous 'gve_init_priv()' call, as already done in the
remove function.

Add a new label and the missing 'gve_teardown_priv_resources()' in the
error handling path.

Fixes: 893ce44df565 ("gve: Add basic driver framework for Compute Engine Virtual NIC")
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Reviewed-by: Catherine Sullivan <csully@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/google/gve/gve_main.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c
index 867e87af3432..44262c9f9ec2 100644
--- a/drivers/net/ethernet/google/gve/gve_main.c
+++ b/drivers/net/ethernet/google/gve/gve_main.c
@@ -1565,7 +1565,7 @@ static int gve_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	err = register_netdev(dev);
 	if (err)
-		goto abort_with_wq;
+		goto abort_with_gve_init;
 
 	dev_info(&pdev->dev, "GVE version %s\n", gve_version_str);
 	dev_info(&pdev->dev, "GVE queue format %d\n", (int)priv->queue_format);
@@ -1573,6 +1573,9 @@ static int gve_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	queue_work(priv->gve_wq, &priv->service_task);
 	return 0;
 
+abort_with_gve_init:
+	gve_teardown_priv_resources(priv);
+
 abort_with_wq:
 	destroy_workqueue(priv->gve_wq);
 

From 6dce38b4b7ffb39539b49feca2b3ce34dbaadf02 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Thu, 1 Jul 2021 22:18:37 +0200
Subject: [PATCH 060/158] gve: Propagate error codes to caller

If 'gve_probe()' fails, we should propagate the error code, instead of
hard coding a -ENXIO value.
Make sure that all error handling paths set a correct value for 'err'.

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Reviewed-by: Catherine Sullivan <csully@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/google/gve/gve_main.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c
index 44262c9f9ec2..c03984b26db4 100644
--- a/drivers/net/ethernet/google/gve/gve_main.c
+++ b/drivers/net/ethernet/google/gve/gve_main.c
@@ -1469,7 +1469,7 @@ static int gve_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	err = pci_enable_device(pdev);
 	if (err)
-		return -ENXIO;
+		return err;
 
 	err = pci_request_regions(pdev, "gvnic-cfg");
 	if (err)
@@ -1512,6 +1512,7 @@ static int gve_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	dev = alloc_etherdev_mqs(sizeof(*priv), max_tx_queues, max_rx_queues);
 	if (!dev) {
 		dev_err(&pdev->dev, "could not allocate netdev\n");
+		err = -ENOMEM;
 		goto abort_with_db_bar;
 	}
 	SET_NETDEV_DEV(dev, &pdev->dev);
@@ -1593,7 +1594,7 @@ abort_with_pci_region:
 
 abort_with_enabled:
 	pci_disable_device(pdev);
-	return -ENXIO;
+	return err;
 }
 
 static void gve_remove(struct pci_dev *pdev)

From 5140aaa4604ba96685dc04b4d2dde3384bbaecef Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 1 Jul 2021 17:44:07 +0200
Subject: [PATCH 061/158] s390: iucv: Avoid field over-reading memcpy()

In preparation for FORTIFY_SOURCE performing compile-time and run-time
field bounds checking for memcpy(), memmove(), and memset(), avoid
intentionally reading across neighboring array fields.

Add a wrapping struct to serve as the memcpy() source so the compiler
can perform appropriate bounds checking, avoiding this future warning:

In function '__fortify_memcpy',
    inlined from 'iucv_message_pending' at net/iucv/iucv.c:1663:4:
./include/linux/fortify-string.h:246:4: error: call to '__read_overflow2_field' declared with attribute error: detected read beyond size of field (2nd parameter)

Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/iucv/iucv.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c
index 349c6ac3313f..e6795d5a546a 100644
--- a/net/iucv/iucv.c
+++ b/net/iucv/iucv.c
@@ -1635,14 +1635,16 @@ struct iucv_message_pending {
 	u8  iptype;
 	u32 ipmsgid;
 	u32 iptrgcls;
-	union {
-		u32 iprmmsg1_u32;
-		u8  iprmmsg1[4];
-	} ln1msg1;
-	union {
-		u32 ipbfln1f;
-		u8  iprmmsg2[4];
-	} ln1msg2;
+	struct {
+		union {
+			u32 iprmmsg1_u32;
+			u8  iprmmsg1[4];
+		} ln1msg1;
+		union {
+			u32 ipbfln1f;
+			u8  iprmmsg2[4];
+		} ln1msg2;
+	} rmmsg;
 	u32 res1[3];
 	u32 ipbfln2f;
 	u8  ippollfg;
@@ -1660,10 +1662,10 @@ static void iucv_message_pending(struct iucv_irq_data *data)
 		msg.id = imp->ipmsgid;
 		msg.class = imp->iptrgcls;
 		if (imp->ipflags1 & IUCV_IPRMDATA) {
-			memcpy(msg.rmmsg, imp->ln1msg1.iprmmsg1, 8);
+			memcpy(msg.rmmsg, &imp->rmmsg, 8);
 			msg.length = 8;
 		} else
-			msg.length = imp->ln1msg2.ipbfln1f;
+			msg.length = imp->rmmsg.ln1msg2.ipbfln1f;
 		msg.reply_size = imp->ipbfln2f;
 		path->handler->message_pending(path, &msg);
 	}

From 37d220b58d52d6da88b928e11bffd83f88c6ac79 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 24 Jun 2021 12:36:41 +0200
Subject: [PATCH 062/158] selftest: netfilter: add test case for unreplied tcp
 connections

TCP connections in UNREPLIED state (only SYN seen) can be kept alive
indefinitely, as each SYN re-sets the timeout.

This means that even if a peer has closed its socket the entry
never times out.

This also prevents re-evaluation of configured NAT rules.
Add a test case that sets SYN timeout to 10 seconds, then check
that the nat redirection added later eventually takes effect.

This is based off a repro script from Antonio Ojea.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 tools/testing/selftests/netfilter/Makefile    |   2 +-
 .../netfilter/conntrack_tcp_unreplied.sh      | 167 ++++++++++++++++++
 2 files changed, 168 insertions(+), 1 deletion(-)
 create mode 100755 tools/testing/selftests/netfilter/conntrack_tcp_unreplied.sh

diff --git a/tools/testing/selftests/netfilter/Makefile b/tools/testing/selftests/netfilter/Makefile
index cd6430b39982..8748199ac109 100644
--- a/tools/testing/selftests/netfilter/Makefile
+++ b/tools/testing/selftests/netfilter/Makefile
@@ -5,7 +5,7 @@ TEST_PROGS := nft_trans_stress.sh nft_fib.sh nft_nat.sh bridge_brouter.sh \
 	conntrack_icmp_related.sh nft_flowtable.sh ipvs.sh \
 	nft_concat_range.sh nft_conntrack_helper.sh \
 	nft_queue.sh nft_meta.sh nf_nat_edemux.sh \
-	ipip-conntrack-mtu.sh
+	ipip-conntrack-mtu.sh conntrack_tcp_unreplied.sh
 
 LDLIBS = -lmnl
 TEST_GEN_FILES =  nf-queue
diff --git a/tools/testing/selftests/netfilter/conntrack_tcp_unreplied.sh b/tools/testing/selftests/netfilter/conntrack_tcp_unreplied.sh
new file mode 100755
index 000000000000..e7d7bf13cff5
--- /dev/null
+++ b/tools/testing/selftests/netfilter/conntrack_tcp_unreplied.sh
@@ -0,0 +1,167 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Check that UNREPLIED tcp conntrack will eventually timeout.
+#
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+ret=0
+
+waittime=20
+sfx=$(mktemp -u "XXXXXXXX")
+ns1="ns1-$sfx"
+ns2="ns2-$sfx"
+
+nft --version > /dev/null 2>&1
+if [ $? -ne 0 ];then
+	echo "SKIP: Could not run test without nft tool"
+	exit $ksft_skip
+fi
+
+ip -Version > /dev/null 2>&1
+if [ $? -ne 0 ];then
+	echo "SKIP: Could not run test without ip tool"
+	exit $ksft_skip
+fi
+
+cleanup() {
+	ip netns pids $ns1 | xargs kill 2>/dev/null
+	ip netns pids $ns2 | xargs kill 2>/dev/null
+
+	ip netns del $ns1
+	ip netns del $ns2
+}
+
+ipv4() {
+    echo -n 192.168.$1.2
+}
+
+check_counter()
+{
+	ns=$1
+	name=$2
+	expect=$3
+	local lret=0
+
+	cnt=$(ip netns exec $ns2 nft list counter inet filter "$name" | grep -q "$expect")
+	if [ $? -ne 0 ]; then
+		echo "ERROR: counter $name in $ns2 has unexpected value (expected $expect)" 1>&2
+		ip netns exec $ns2 nft list counter inet filter "$name" 1>&2
+		lret=1
+	fi
+
+	return $lret
+}
+
+# Create test namespaces
+ip netns add $ns1 || exit 1
+
+trap cleanup EXIT
+
+ip netns add $ns2 || exit 1
+
+# Connect the namespace to the host using a veth pair
+ip -net $ns1 link add name veth1 type veth peer name veth2
+ip -net $ns1 link set netns $ns2 dev veth2
+
+ip -net $ns1 link set up dev lo
+ip -net $ns2 link set up dev lo
+ip -net $ns1 link set up dev veth1
+ip -net $ns2 link set up dev veth2
+
+ip -net $ns2 addr add 10.11.11.2/24 dev veth2
+ip -net $ns2 route add default via 10.11.11.1
+
+ip netns exec $ns2 sysctl -q net.ipv4.conf.veth2.forwarding=1
+
+# add a rule inside NS so we enable conntrack
+ip netns exec $ns1 iptables -A INPUT -m state --state established,related -j ACCEPT
+
+ip -net $ns1 addr add 10.11.11.1/24 dev veth1
+ip -net $ns1 route add 10.99.99.99 via 10.11.11.2
+
+# Check connectivity works
+ip netns exec $ns1 ping -q -c 2 10.11.11.2 >/dev/null || exit 1
+
+ip netns exec $ns2 nc -l -p 8080 < /dev/null &
+
+# however, conntrack entries are there
+
+ip netns exec $ns2 nft -f - <<EOF
+table inet filter {
+	counter connreq { }
+	counter redir { }
+	chain input {
+		type filter hook input priority 0; policy accept;
+		ct state new tcp flags syn ip daddr 10.99.99.99 tcp dport 80 counter name "connreq" accept
+		ct state new ct status dnat tcp dport 8080 counter name "redir" accept
+	}
+}
+EOF
+if [ $? -ne 0 ]; then
+	echo "ERROR: Could not load nft rules"
+	exit 1
+fi
+
+ip netns exec $ns2 sysctl -q net.netfilter.nf_conntrack_tcp_timeout_syn_sent=10
+
+echo "INFO: connect $ns1 -> $ns2 to the virtual ip"
+ip netns exec $ns1 bash -c 'while true ; do
+	nc -p 60000 10.99.99.99 80
+	sleep 1
+	done' &
+
+sleep 1
+
+ip netns exec $ns2 nft -f - <<EOF
+table inet nat {
+	chain prerouting {
+		type nat hook prerouting priority 0; policy accept;
+		ip daddr 10.99.99.99 tcp dport 80 redirect to :8080
+	}
+}
+EOF
+if [ $? -ne 0 ]; then
+	echo "ERROR: Could not load nat redirect"
+	exit 1
+fi
+
+count=$(ip netns exec $ns2 conntrack -L -p tcp --dport 80 2>/dev/null | wc -l)
+if [ $count -eq 0 ]; then
+	echo "ERROR: $ns2 did not pick up tcp connection from peer"
+	exit 1
+fi
+
+echo "INFO: NAT redirect added in ns $ns2, waiting for $waittime seconds for nat to take effect"
+for i in $(seq 1 $waittime); do
+	echo -n "."
+
+	sleep 1
+
+	count=$(ip netns exec $ns2 conntrack -L -p tcp --reply-port-src 8080 2>/dev/null | wc -l)
+	if [ $count -gt 0 ]; then
+		echo
+		echo "PASS: redirection took effect after $i seconds"
+		break
+	fi
+
+	m=$((i%20))
+	if [ $m -eq 0 ]; then
+		echo " waited for $i seconds"
+	fi
+done
+
+expect="packets 1 bytes 60"
+check_counter "$ns2" "redir" "$expect"
+if [ $? -ne 0 ]; then
+	ret=1
+fi
+
+if [ $ret -eq 0 ];then
+	echo "PASS: redirection counter has expected values"
+else
+	echo "ERROR: no tcp connection was redirected"
+fi
+
+exit $ret

From e15d4cdf27cb0c1e977270270b2cea12e0955edd Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 24 Jun 2021 12:36:42 +0200
Subject: [PATCH 063/158] netfilter: conntrack: do not renew entry stuck in tcp
 SYN_SENT state

Consider:
  client -----> conntrack ---> Host

client sends a SYN, but $Host is unreachable/silent.
Client eventually gives up and the conntrack entry will time out.

However, if the client is restarted with same addr/port pair, it
may prevent the conntrack entry from timing out.

This is noticeable when the existing conntrack entry has no NAT
transformation or an outdated one and port reuse happens either
on client or due to a NAT middlebox.

This change prevents refresh of the timeout for SYN retransmits,
so entry is going away after nf_conntrack_tcp_timeout_syn_sent
seconds (default: 60).

Entry will be re-created on next connection attempt, but then
nat rules will be evaluated again.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_proto_tcp.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index f7e8baf59b51..eb4de92077a8 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -1134,6 +1134,16 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
 			nf_ct_kill_acct(ct, ctinfo, skb);
 			return NF_ACCEPT;
 		}
+
+		if (index == TCP_SYN_SET && old_state == TCP_CONNTRACK_SYN_SENT) {
+			/* do not renew timeout on SYN retransmit.
+			 *
+			 * Else port reuse by client or NAT middlebox can keep
+			 * entry alive indefinitely (including nat info).
+			 */
+			return NF_ACCEPT;
+		}
+
 		/* ESTABLISHED without SEEN_REPLY, i.e. mid-connection
 		 * pickup with loose=1. Avoid large ESTABLISHED timeout.
 		 */

From 4ca041f919f13783b0b03894783deee00dbca19a Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Thu, 24 Jun 2021 20:57:18 +0100
Subject: [PATCH 064/158] netfilter: nf_tables: Fix dereference of null pointer
 flow

In the case where chain->flags & NFT_CHAIN_HW_OFFLOAD is false then
nft_flow_rule_create is not called and flow is NULL. The subsequent
error handling execution via label err_destroy_flow_rule will lead
to a null pointer dereference on flow when calling nft_flow_rule_destroy.
Since the error path to err_destroy_flow_rule has to cater for null
and non-null flows, only call nft_flow_rule_destroy if flow is non-null
to fix this issue.

Addresses-Coverity: ("Explicit null dereference")
Fixes: 3c5e44622011 ("netfilter: nf_tables: memleak in hw offload abort path")
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 390d4466567f..de182d1f7c4e 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3446,7 +3446,8 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info,
 	return 0;
 
 err_destroy_flow_rule:
-	nft_flow_rule_destroy(flow);
+	if (flow)
+		nft_flow_rule_destroy(flow);
 err_release_rule:
 	nf_tables_rule_release(&ctx, rule);
 err_release_expr:

From a23f89a9990684a0ca0cac4a2857c15d338ebe2d Mon Sep 17 00:00:00 2001
From: Vasily Averin <vvs@virtuozzo.com>
Date: Thu, 1 Jul 2021 08:02:24 +0300
Subject: [PATCH 065/158] netfilter: conntrack: nf_ct_gre_keymap_flush()
 removal

nf_ct_gre_keymap_flush() is useless.
It is called from nf_conntrack_cleanup_net_list() only and tries to remove
nf_ct_gre_keymap entries from pernet gre keymap list. Though:
a) at this point the list should already be empty, all its entries were
deleted during the conntracks cleanup, because
nf_conntrack_cleanup_net_list() executes nf_ct_iterate_cleanup(kill_all)
before nf_conntrack_proto_pernet_fini():
 nf_conntrack_cleanup_net_list
  +- nf_ct_iterate_cleanup
  |   nf_ct_put
  |    nf_conntrack_put
  |     nf_conntrack_destroy
  |      destroy_conntrack
  |       destroy_gre_conntrack
  |        nf_ct_gre_keymap_destroy
  `- nf_conntrack_proto_pernet_fini
      nf_ct_gre_keymap_flush

b) Let's say we find that the keymap list is not empty. This means netns
still has a conntrack associated with gre, in which case we should not free
its memory, because this will lead to a double free and related crashes.
However I doubt it could have gone unnoticed for years, obviously
this does not happen in real life. So I think we can remove
both nf_ct_gre_keymap_flush() and nf_conntrack_proto_pernet_fini().

Signed-off-by: Vasily Averin <vvs@virtuozzo.com>
Acked-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_core.h |  1 -
 net/netfilter/nf_conntrack_core.c         |  1 -
 net/netfilter/nf_conntrack_proto.c        |  7 -------
 net/netfilter/nf_conntrack_proto_gre.c    | 13 -------------
 4 files changed, 22 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h
index 09f2efea0b97..13807ea94cd2 100644
--- a/include/net/netfilter/nf_conntrack_core.h
+++ b/include/net/netfilter/nf_conntrack_core.h
@@ -30,7 +30,6 @@ void nf_conntrack_cleanup_net(struct net *net);
 void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list);
 
 void nf_conntrack_proto_pernet_init(struct net *net);
-void nf_conntrack_proto_pernet_fini(struct net *net);
 
 int nf_conntrack_proto_init(void);
 void nf_conntrack_proto_fini(void);
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 96ba19fc8155..085a11f1eb43 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -2457,7 +2457,6 @@ i_see_dead_people:
 	}
 
 	list_for_each_entry(net, net_exit_list, exit_list) {
-		nf_conntrack_proto_pernet_fini(net);
 		nf_conntrack_ecache_pernet_fini(net);
 		nf_conntrack_expect_pernet_fini(net);
 		free_percpu(net->ct.stat);
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index 55647409a9be..8f7a9837349c 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -697,13 +697,6 @@ void nf_conntrack_proto_pernet_init(struct net *net)
 #endif
 }
 
-void nf_conntrack_proto_pernet_fini(struct net *net)
-{
-#ifdef CONFIG_NF_CT_PROTO_GRE
-	nf_ct_gre_keymap_flush(net);
-#endif
-}
-
 module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
 		  &nf_conntrack_htable_size, 0600);
 
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index db11e403d818..728eeb0aea87 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -55,19 +55,6 @@ static inline struct nf_gre_net *gre_pernet(struct net *net)
 	return &net->ct.nf_ct_proto.gre;
 }
 
-void nf_ct_gre_keymap_flush(struct net *net)
-{
-	struct nf_gre_net *net_gre = gre_pernet(net);
-	struct nf_ct_gre_keymap *km, *tmp;
-
-	spin_lock_bh(&keymap_lock);
-	list_for_each_entry_safe(km, tmp, &net_gre->keymap_list, list) {
-		list_del_rcu(&km->list);
-		kfree_rcu(km, rcu);
-	}
-	spin_unlock_bh(&keymap_lock);
-}
-
 static inline int gre_key_cmpfn(const struct nf_ct_gre_keymap *km,
 				const struct nf_conntrack_tuple *t)
 {

From c23a9fd209bc6f8c1fa6ee303fdf037d784a1627 Mon Sep 17 00:00:00 2001
From: Vasily Averin <vvs@virtuozzo.com>
Date: Thu, 1 Jul 2021 08:02:49 +0300
Subject: [PATCH 066/158] netfilter: ctnetlink: suspicious RCU usage in
 ctnetlink_dump_helpinfo

The two patches listed below removed the ctnetlink_dump_helpinfo call from
under rcu_read_lock. Now its rcu_dereference generates the following warning:
=============================
WARNING: suspicious RCU usage
5.13.0+ #5 Not tainted
-----------------------------
net/netfilter/nf_conntrack_netlink.c:221 suspicious rcu_dereference_check() usage!

other info that might help us debug this:
rcu_scheduler_active = 2, debug_locks = 1
stack backtrace:
CPU: 1 PID: 2251 Comm: conntrack Not tainted 5.13.0+ #5
Call Trace:
 dump_stack+0x7f/0xa1
 ctnetlink_dump_helpinfo+0x134/0x150 [nf_conntrack_netlink]
 ctnetlink_fill_info+0x2c2/0x390 [nf_conntrack_netlink]
 ctnetlink_dump_table+0x13f/0x370 [nf_conntrack_netlink]
 netlink_dump+0x10c/0x370
 __netlink_dump_start+0x1a7/0x260
 ctnetlink_get_conntrack+0x1e5/0x250 [nf_conntrack_netlink]
 nfnetlink_rcv_msg+0x613/0x993 [nfnetlink]
 netlink_rcv_skb+0x50/0x100
 nfnetlink_rcv+0x55/0x120 [nfnetlink]
 netlink_unicast+0x181/0x260
 netlink_sendmsg+0x23f/0x460
 sock_sendmsg+0x5b/0x60
 __sys_sendto+0xf1/0x160
 __x64_sys_sendto+0x24/0x30
 do_syscall_64+0x36/0x70
 entry_SYSCALL_64_after_hwframe+0x44/0xae

Fixes: 49ca022bccc5 ("netfilter: ctnetlink: don't dump ct extensions of unconfirmed conntracks")
Fixes: 0b35f6031a00 ("netfilter: Remove duplicated rcu_read_lock.")
Signed-off-by: Vasily Averin <vvs@virtuozzo.com>
Reviewed-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_netlink.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 4e1a9dba7077..e81af33b233b 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -218,6 +218,7 @@ static int ctnetlink_dump_helpinfo(struct sk_buff *skb,
 	if (!help)
 		return 0;
 
+	rcu_read_lock();
 	helper = rcu_dereference(help->helper);
 	if (!helper)
 		goto out;
@@ -233,9 +234,11 @@ static int ctnetlink_dump_helpinfo(struct sk_buff *skb,
 
 	nla_nest_end(skb, nest_helper);
 out:
+	rcu_read_unlock();
 	return 0;
 
 nla_put_failure:
+	rcu_read_unlock();
 	return -1;
 }
 

From 633fa666401c42f9a106a509b7702c58c84524e2 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 1 Jul 2021 15:16:53 +0200
Subject: [PATCH 067/158] net/sched: sch_taprio: fix typo in comment

I have checked that the IEEE standard 802.1Q-2018 section 8.6.9.4.5
is called AdminGateStates.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_taprio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
index 66fe2b82af9a..07b30d0601d7 100644
--- a/net/sched/sch_taprio.c
+++ b/net/sched/sch_taprio.c
@@ -564,7 +564,7 @@ static struct sk_buff *taprio_dequeue_soft(struct Qdisc *sch)
 	/* if there's no entry, it means that the schedule didn't
 	 * start yet, so force all gates to be open, this is in
 	 * accordance to IEEE 802.1Qbv-2015 Section 8.6.9.4.5
-	 * "AdminGateSates"
+	 * "AdminGateStates"
 	 */
 	gate_mask = entry ? entry->gate_mask : TAPRIO_ALL_GATES_OPEN;
 

From bde3c8ffdd4153a3e9f0b0d51d972b30113b35ac Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Thu, 1 Jul 2021 22:41:19 +0200
Subject: [PATCH 068/158] gve: Simplify code and axe the use of a deprecated
 API

The wrappers in include/linux/pci-dma-compat.h should go away.

Replace 'pci_set_dma_mask/pci_set_consistent_dma_mask' by an equivalent
and less verbose 'dma_set_mask_and_coherent()' call.

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Reviewed-by: Catherine Sullivan <csully@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/google/gve/gve_main.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c
index c03984b26db4..099a2bc5ae67 100644
--- a/drivers/net/ethernet/google/gve/gve_main.c
+++ b/drivers/net/ethernet/google/gve/gve_main.c
@@ -1477,19 +1477,12 @@ static int gve_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	pci_set_master(pdev);
 
-	err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+	err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
 	if (err) {
 		dev_err(&pdev->dev, "Failed to set dma mask: err=%d\n", err);
 		goto abort_with_pci_region;
 	}
 
-	err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
-	if (err) {
-		dev_err(&pdev->dev,
-			"Failed to set consistent dma mask: err=%d\n", err);
-		goto abort_with_pci_region;
-	}
-
 	reg_bar = pci_iomap(pdev, GVE_REGISTER_BAR, 0);
 	if (!reg_bar) {
 		dev_err(&pdev->dev, "Failed to map pci bar!\n");

From 40fc3054b45820c28ea3c65e2c86d041dc244a8a Mon Sep 17 00:00:00 2001
From: Vadim Fedorenko <vfedorenko@novek.ru>
Date: Fri, 2 Jul 2021 02:47:00 +0300
Subject: [PATCH 069/158] net: ipv6: fix return value of ip6_skb_dst_mtu

Commit 628a5c561890 ("[INET]: Add IP(V6)_PMTUDISC_RPOBE") introduced
ip6_skb_dst_mtu with a signed int return type, which is inconsistent
with the values it actually returns. Also, two users of this function
assign its value to an unsigned int variable, and only __xfrm6_output
assigns the result to a signed variable but then uses it as unsigned
in further comparisons and calls. Change this function to return an
unsigned int value.

Fixes: 628a5c561890 ("[INET]: Add IP(V6)_PMTUDISC_RPOBE")
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: Vadim Fedorenko <vfedorenko@novek.ru>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_route.h | 2 +-
 net/ipv6/xfrm6_output.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index f14149df5a65..625a38ccb5d9 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -263,7 +263,7 @@ static inline bool ipv6_anycast_destination(const struct dst_entry *dst,
 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 		 int (*output)(struct net *, struct sock *, struct sk_buff *));
 
-static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
+static inline unsigned int ip6_skb_dst_mtu(struct sk_buff *skb)
 {
 	int mtu;
 
diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c
index 57fa27c1cdf9..d0d280077721 100644
--- a/net/ipv6/xfrm6_output.c
+++ b/net/ipv6/xfrm6_output.c
@@ -49,7 +49,7 @@ static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb_dst(skb);
 	struct xfrm_state *x = dst->xfrm;
-	int mtu;
+	unsigned int mtu;
 	bool toobig;
 
 #ifdef CONFIG_NETFILTER

From 1bfa4d0cb5adf954e0f4870a3ecb7cb19506320c Mon Sep 17 00:00:00 2001
From: Bailey Forrest <bcf@google.com>
Date: Thu, 1 Jul 2021 20:13:36 -0700
Subject: [PATCH 070/158] gve: DQO: Remove incorrect prefetch

The prefetch is incorrectly using the dma address instead of the virtual
address.

It's supposed to be:
prefetch((char *)buf_state->page_info.page_address +
	 buf_state->page_info.page_offset)

However, after correcting this mistake, there is no evidence of
performance improvement.

Fixes: 9b8dd5e5ea48 ("gve: DQO: Add RX path")
Signed-off-by: Bailey Forrest <bcf@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/google/gve/gve_rx_dqo.c | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/drivers/net/ethernet/google/gve/gve_rx_dqo.c b/drivers/net/ethernet/google/gve/gve_rx_dqo.c
index 77bb8227f89b..8500621b2cd4 100644
--- a/drivers/net/ethernet/google/gve/gve_rx_dqo.c
+++ b/drivers/net/ethernet/google/gve/gve_rx_dqo.c
@@ -566,13 +566,6 @@ static int gve_rx_dqo(struct napi_struct *napi, struct gve_rx_ring *rx,
 		return 0;
 	}
 
-	/* Prefetch the payload header. */
-	prefetch((char *)buf_state->addr + buf_state->page_info.page_offset);
-#if L1_CACHE_BYTES < 128
-	prefetch((char *)buf_state->addr + buf_state->page_info.page_offset +
-		 L1_CACHE_BYTES);
-#endif
-
 	if (eop && buf_len <= priv->rx_copybreak) {
 		rx->skb_head = gve_rx_copy(priv->dev, napi,
 					   &buf_state->page_info, buf_len, 0);

From 8955b90c3cdad199137809aac8ccbbb585355913 Mon Sep 17 00:00:00 2001
From: wenxu <wenxu@ucloud.cn>
Date: Fri, 2 Jul 2021 11:34:31 +0800
Subject: [PATCH 071/158] net/sched: act_ct: fix err check for
 nf_conntrack_confirm

The result of the confirm operation should be checked. If it fails,
the packet should be dropped, as is done in ovs and netfilter.

Fixes: b57dc7c13ea9 ("net/sched: Introduce action ct")
Signed-off-by: wenxu <wenxu@ucloud.cn>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_ct.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index a656baa321fe..a62f404d0e16 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -1026,7 +1026,8 @@ do_nat:
 		/* This will take care of sending queued events
 		 * even if the connection is already confirmed.
 		 */
-		nf_conntrack_confirm(skb);
+		if (nf_conntrack_confirm(skb) != NF_ACCEPT)
+			goto drop;
 	}
 
 	if (!skip_add)

From 561022acb1ce62e50f7a8258687a21b84282a4cb Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 2 Jul 2021 13:09:03 -0700
Subject: [PATCH 072/158] tcp: annotate data races around tp->mtu_info

While tp->mtu_info is read while socket is owned, the write
sides happen from err handlers (tcp_v[46]_mtu_reduced)
which only own the socket spinlock.

Fixes: 563d34d05786 ("tcp: dont drop MTU reduction indications")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_ipv4.c | 4 ++--
 net/ipv6/tcp_ipv6.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index e66ad6bfe808..b9dc2d6197be 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -342,7 +342,7 @@ void tcp_v4_mtu_reduced(struct sock *sk)
 
 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
 		return;
-	mtu = tcp_sk(sk)->mtu_info;
+	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
 	dst = inet_csk_update_pmtu(sk, mtu);
 	if (!dst)
 		return;
@@ -546,7 +546,7 @@ int tcp_v4_err(struct sk_buff *skb, u32 info)
 			if (sk->sk_state == TCP_LISTEN)
 				goto out;
 
-			tp->mtu_info = info;
+			WRITE_ONCE(tp->mtu_info, info);
 			if (!sock_owned_by_user(sk)) {
 				tcp_v4_mtu_reduced(sk);
 			} else {
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 578ab6305c3f..593c32fe57ed 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -352,7 +352,7 @@ static void tcp_v6_mtu_reduced(struct sock *sk)
 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
 		return;
 
-	dst = inet6_csk_update_pmtu(sk, tcp_sk(sk)->mtu_info);
+	dst = inet6_csk_update_pmtu(sk, READ_ONCE(tcp_sk(sk)->mtu_info));
 	if (!dst)
 		return;
 
@@ -443,7 +443,7 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		if (!ip6_sk_accept_pmtu(sk))
 			goto out;
 
-		tp->mtu_info = ntohl(info);
+		WRITE_ONCE(tp->mtu_info, ntohl(info));
 		if (!sock_owned_by_user(sk))
 			tcp_v6_mtu_reduced(sk);
 		else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED,

From 81b4a0cc7565b08cadd0d02bae3434f127d1d72a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 2 Jul 2021 07:41:01 -0700
Subject: [PATCH 073/158] sock: fix error in sock_setsockopt()

Some tests are failing, John bisected the issue to a recent commit.

sock_set_timestamp() parameters should be :

1) sk
2) optname
3) valbool

Fixes: 371087aa476a ("sock: expose so_timestamp options for mptcp")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Bisected-by: John Sperbeck <jsperbeck@google.com>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Florian Westphal <fw@strlen.de>
Cc: Mat Martineau <mathew.j.martineau@linux.intel.com>
Reviewed-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/sock.c b/net/core/sock.c
index cad107112204..1c4b0468bc2c 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1116,7 +1116,7 @@ set_sndbuf:
 	case SO_TIMESTAMP_NEW:
 	case SO_TIMESTAMPNS_OLD:
 	case SO_TIMESTAMPNS_NEW:
-		sock_set_timestamp(sk, valbool, optname);
+		sock_set_timestamp(sk, optname, valbool);
 		break;
 
 	case SO_TIMESTAMPING_NEW:

From a019abd8022061b917da767cd1a66ed823724eab Mon Sep 17 00:00:00 2001
From: Wolfgang Bumiller <w.bumiller@proxmox.com>
Date: Fri, 2 Jul 2021 14:07:36 +0200
Subject: [PATCH 074/158] net: bridge: sync fdb to new unicast-filtering ports

Since commit 2796d0c648c9 ("bridge: Automatically manage
port promiscuous mode.")
bridges with `vlan_filtering 1` and only 1 auto-port don't
set IFF_PROMISC for unicast-filtering-capable ports.

Normally on port changes `br_manage_promisc` is called to
update the promisc flags and unicast filters if necessary,
but it cannot distinguish between *new* ports and ones
losing their promisc flag, and new ports end up not
receiving the MAC address list.

Fix this by calling `br_fdb_sync_static` in `br_add_if`
after the port promisc flags are updated and the unicast
filter was supposed to have been filled.

Fixes: 2796d0c648c9 ("bridge: Automatically manage port promiscuous mode.")
Signed-off-by: Wolfgang Bumiller <w.bumiller@proxmox.com>
Acked-by: Nikolay Aleksandrov <nikolay@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_if.c | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index f7d2f472ae24..6e4a32354a13 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -562,7 +562,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev,
 	struct net_bridge_port *p;
 	int err = 0;
 	unsigned br_hr, dev_hr;
-	bool changed_addr;
+	bool changed_addr, fdb_synced = false;
 
 	/* Don't allow bridging non-ethernet like devices. */
 	if ((dev->flags & IFF_LOOPBACK) ||
@@ -652,6 +652,19 @@ int br_add_if(struct net_bridge *br, struct net_device *dev,
 	list_add_rcu(&p->list, &br->port_list);
 
 	nbp_update_port_count(br);
+	if (!br_promisc_port(p) && (p->dev->priv_flags & IFF_UNICAST_FLT)) {
+		/* When updating the port count we also update all ports'
+		 * promiscuous mode.
+		 * A port leaving promiscuous mode normally gets the bridge's
+		 * fdb synced to the unicast filter (if supported), however,
+		 * `br_port_clear_promisc` does not distinguish between
+		 * non-promiscuous ports and *new* ports, so we need to
+		 * sync explicitly here.
+		 */
+		fdb_synced = br_fdb_sync_static(br, p) == 0;
+		if (!fdb_synced)
+			netdev_err(dev, "failed to sync bridge static fdb addresses to this port\n");
+	}
 
 	netdev_update_features(br->dev);
 
@@ -701,6 +714,8 @@ int br_add_if(struct net_bridge *br, struct net_device *dev,
 	return 0;
 
 err7:
+	if (fdb_synced)
+		br_fdb_unsync_static(br, p);
 	list_del_rcu(&p->list);
 	br_fdb_delete_by_port(br, p, 0, 1);
 	nbp_update_port_count(br);

From 77ac5e40c44eb78333fbc38482d61fc2af7dda0a Mon Sep 17 00:00:00 2001
From: Louis Peens <louis.peens@corigine.com>
Date: Fri, 2 Jul 2021 11:21:38 +0200
Subject: [PATCH 075/158] net/sched: act_ct: remove and free nf_table callbacks

When cleaning up the nf_table in tcf_ct_flow_table_cleanup_work
there is no guarantee that the callback list, added to by
nf_flow_table_offload_add_cb, is empty. This means that it is
possible that the flow_block_cb memory allocated will be lost.

Fix this by iterating the list and freeing the flow_block_cb entries
before freeing the nf_table entry (via freeing ct_ft).

Fixes: 978703f42549 ("netfilter: flowtable: Add API for registering to flow table events")
Signed-off-by: Louis Peens <louis.peens@corigine.com>
Signed-off-by: Yinjun Zhang <yinjun.zhang@corigine.com>
Signed-off-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_ct.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index a62f404d0e16..1b4b3514c94f 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -322,11 +322,22 @@ err_alloc:
 
 static void tcf_ct_flow_table_cleanup_work(struct work_struct *work)
 {
+	struct flow_block_cb *block_cb, *tmp_cb;
 	struct tcf_ct_flow_table *ct_ft;
+	struct flow_block *block;
 
 	ct_ft = container_of(to_rcu_work(work), struct tcf_ct_flow_table,
 			     rwork);
 	nf_flow_table_free(&ct_ft->nf_ft);
+
+	/* Remove any remaining callbacks before cleanup */
+	block = &ct_ft->nf_ft.flow_block;
+	down_write(&ct_ft->nf_ft.flow_block_lock);
+	list_for_each_entry_safe(block_cb, tmp_cb, &block->cb_list, list) {
+		list_del(&block_cb->list);
+		flow_block_cb_free(block_cb);
+	}
+	up_write(&ct_ft->nf_ft.flow_block_lock);
 	kfree(ct_ft);
 
 	module_put(THIS_MODULE);

From 7cc93d888df764a13f196e3d4aef38869f7dd217 Mon Sep 17 00:00:00 2001
From: Louis Peens <louis.peens@corigine.com>
Date: Fri, 2 Jul 2021 11:21:39 +0200
Subject: [PATCH 076/158] nfp: flower-ct: remove callback delete deadlock

The current location of the callback delete can lead to a race
condition where deleting the callback requires a write_lock on
the nf_table, but at the same time another thread from netfilter
could have taken a read lock on the table before trying to offload.
Since the driver is taking a rtnl_lock this can lead into a deadlock
situation, where the netfilter offload will wait for the cls_flower
rtnl_lock to be released, but this cannot happen since this is
waiting for the nf_table read_lock to be released before it can
delete the callback.

Solve this by completely removing the nf_flow_table_offload_del_cb
call, as this will now be cleaned up by act_ct itself when cleaning
up the specific nf_table.

Fixes: 62268e78145f ("nfp: flower-ct: add nft callback stubs")
Signed-off-by: Louis Peens <louis.peens@corigine.com>
Signed-off-by: Yinjun Zhang <yinjun.zhang@corigine.com>
Signed-off-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/netronome/nfp/flower/conntrack.c   | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/flower/conntrack.c b/drivers/net/ethernet/netronome/nfp/flower/conntrack.c
index 273d529d43c2..128020b1573e 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/conntrack.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/conntrack.c
@@ -1141,20 +1141,7 @@ int nfp_fl_ct_del_flow(struct nfp_fl_ct_map_entry *ct_map_ent)
 		nfp_fl_ct_clean_flow_entry(ct_entry);
 		kfree(ct_map_ent);
 
-		/* If this is the last pre_ct_rule it means that it is
-		 * very likely that the nft table will be cleaned up next,
-		 * as this happens on the removal of the last act_ct flow.
-		 * However we cannot deregister the callback on the removal
-		 * of the last nft flow as this runs into a deadlock situation.
-		 * So deregister the callback on removal of the last pre_ct flow
-		 * and remove any remaining nft flow entries. We also cannot
-		 * save this state and delete the callback later since the
-		 * nft table would already have been freed at that time.
-		 */
 		if (!zt->pre_ct_count) {
-			nf_flow_table_offload_del_cb(zt->nft,
-						     nfp_fl_ct_handle_nft_flow,
-						     zt);
 			zt->nft = NULL;
 			nfp_fl_ct_clean_nft_entries(zt);
 		}

From b22580233d473dbf7bbfa4f6549c09e2c80e9e64 Mon Sep 17 00:00:00 2001
From: Ronak Doshi <doshir@vmware.com>
Date: Thu, 1 Jul 2021 23:44:27 -0700
Subject: [PATCH 077/158] vmxnet3: fix cksum offload issues for tunnels with
 non-default udp ports

Commit dacce2be3312 ("vmxnet3: add geneve and vxlan tunnel offload
support") added support for encapsulation offload. However, the inner
offload capability is to be restricted to UDP tunnels with default
Vxlan and Geneve ports.

This patch fixes the issue for tunnels with non-default ports using
features check capability and filtering appropriate features for such
tunnels.

Fixes: dacce2be3312 ("vmxnet3: add geneve and vxlan tunnel offload support")
Signed-off-by: Ronak Doshi <doshir@vmware.com>
Acked-by: Guolin Yang <gyang@vmware.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vmxnet3/vmxnet3_ethtool.c | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/drivers/net/vmxnet3/vmxnet3_ethtool.c b/drivers/net/vmxnet3/vmxnet3_ethtool.c
index c0bd9cbc43b1..1b483cf2b1ca 100644
--- a/drivers/net/vmxnet3/vmxnet3_ethtool.c
+++ b/drivers/net/vmxnet3/vmxnet3_ethtool.c
@@ -1,7 +1,7 @@
 /*
  * Linux driver for VMware's vmxnet3 ethernet NIC.
  *
- * Copyright (C) 2008-2020, VMware, Inc. All Rights Reserved.
+ * Copyright (C) 2008-2021, VMware, Inc. All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the
@@ -26,6 +26,10 @@
 
 
 #include "vmxnet3_int.h"
+#include <net/vxlan.h>
+#include <net/geneve.h>
+
+#define VXLAN_UDP_PORT 8472
 
 struct vmxnet3_stat_desc {
 	char desc[ETH_GSTRING_LEN];
@@ -262,6 +266,8 @@ netdev_features_t vmxnet3_features_check(struct sk_buff *skb,
 	if (VMXNET3_VERSION_GE_4(adapter) &&
 	    skb->encapsulation && skb->ip_summed == CHECKSUM_PARTIAL) {
 		u8 l4_proto = 0;
+		u16 port;
+		struct udphdr *udph;
 
 		switch (vlan_get_protocol(skb)) {
 		case htons(ETH_P_IP):
@@ -274,8 +280,20 @@ netdev_features_t vmxnet3_features_check(struct sk_buff *skb,
 			return features & ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
 		}
 
-		if (l4_proto != IPPROTO_UDP)
+		switch (l4_proto) {
+		case IPPROTO_UDP:
+			udph = udp_hdr(skb);
+			port = be16_to_cpu(udph->dest);
+			/* Check if offloaded port is supported */
+			if (port != GENEVE_UDP_PORT &&
+			    port != IANA_VXLAN_UDP_PORT &&
+			    port != VXLAN_UDP_PORT) {
+				return features & ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
+			}
+			break;
+		default:
 			return features & ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
+		}
 	}
 	return features;
 }

From b43c8909be52f2baca8884f967b418a88424494a Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Sat, 3 Jul 2021 00:38:43 +0200
Subject: [PATCH 078/158] udp: properly flush normal packet at GRO time

If a UDP packet enters the GRO engine but is not eligible
for aggregation and is not targeting a UDP tunnel,
udp_gro_receive() will not set the flush bit, and the packet
could be delayed till the next napi flush.

Fix the issue by ensuring non-GROed packets traverse
skb_gro_flush_final().

Reported-and-tested-by: Matthias Treydte <mt@waldheinz.de>
Fixes: 18f25dc39990 ("udp: skip L4 aggregation for UDP tunnel packets")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp_offload.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 54e06b88af69..9dde1e5fb449 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -525,8 +525,10 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
 
 		if ((!sk && (skb->dev->features & NETIF_F_GRO_UDP_FWD)) ||
 		    (sk && udp_sk(sk)->gro_enabled) || NAPI_GRO_CB(skb)->is_flist)
-			pp = call_gro_receive(udp_gro_receive_segment, head, skb);
-		return pp;
+			return call_gro_receive(udp_gro_receive_segment, head, skb);
+
+		/* no GRO, be sure flush the current packet */
+		goto out;
 	}
 
 	if (NAPI_GRO_CB(skb)->encap_mark ||

From 6ff63a150b5556012589ae59efac1b5eeb7d32c3 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Sat, 3 Jul 2021 21:17:27 +0200
Subject: [PATCH 079/158] net: marvell: always set skb_shared_info in
 mvneta_swbm_add_rx_fragment

Always set skb_shared_info data structure in mvneta_swbm_add_rx_fragment
routine even if the fragment contains only the ethernet FCS.

Fixes: 039fbc47f9f1 ("net: mvneta: alloc skb_shared_info on the mvneta_rx_swbm stack")
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/marvell/mvneta.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
index 361bc4fbe20b..76a7777c746d 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -2299,19 +2299,19 @@ mvneta_swbm_add_rx_fragment(struct mvneta_port *pp,
 		skb_frag_off_set(frag, pp->rx_offset_correction);
 		skb_frag_size_set(frag, data_len);
 		__skb_frag_set_page(frag, page);
-
-		/* last fragment */
-		if (len == *size) {
-			struct skb_shared_info *sinfo;
-
-			sinfo = xdp_get_shared_info_from_buff(xdp);
-			sinfo->nr_frags = xdp_sinfo->nr_frags;
-			memcpy(sinfo->frags, xdp_sinfo->frags,
-			       sinfo->nr_frags * sizeof(skb_frag_t));
-		}
 	} else {
 		page_pool_put_full_page(rxq->page_pool, page, true);
 	}
+
+	/* last fragment */
+	if (len == *size) {
+		struct skb_shared_info *sinfo;
+
+		sinfo = xdp_get_shared_info_from_buff(xdp);
+		sinfo->nr_frags = xdp_sinfo->nr_frags;
+		memcpy(sinfo->frags, xdp_sinfo->frags,
+		       sinfo->nr_frags * sizeof(skb_frag_t));
+	}
 	*size -= len;
 }
 

From 55eac20617ca84129273ab248f4d7bfe456967de Mon Sep 17 00:00:00 2001
From: Yangbo Lu <yangbo.lu@nxp.com>
Date: Mon, 5 Jul 2021 16:53:06 +0800
Subject: [PATCH 080/158] ptp: fix NULL pointer dereference in
 ptp_clock_register

Fix NULL pointer dereference in ptp_clock_register. The argument
"parent" of ptp_clock_register may be NULL pointer.

Fixes: 73f37068d540 ("ptp: support ptp physical/virtual clocks conversion")
Reported-by: kernel test robot <oliver.sang@intel.com>
Signed-off-by: Yangbo Lu <yangbo.lu@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/ptp/ptp_clock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/ptp/ptp_clock.c b/drivers/ptp/ptp_clock.c
index f012fa581cf4..ce6d9fc85607 100644
--- a/drivers/ptp/ptp_clock.c
+++ b/drivers/ptp/ptp_clock.c
@@ -236,7 +236,7 @@ struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info,
 	}
 
 	/* PTP virtual clock is being registered under physical clock */
-	if (parent->class && parent->class->name &&
+	if (parent && parent->class && parent->class->name &&
 	    strcmp(parent->class->name, "ptp") == 0)
 		ptp->is_virtual_clock = true;
 

From f6a175cfcc8df578adfdf06b05c82b3b8c8b5cfd Mon Sep 17 00:00:00 2001
From: Yangbo Lu <yangbo.lu@nxp.com>
Date: Mon, 5 Jul 2021 17:46:17 +0800
Subject: [PATCH 081/158] ptp: fix format string mismatch in ptp_sysfs.c

Fix format string mismatch in ptp_sysfs.c. Use %u for unsigned int.

Fixes: 73f37068d540 ("ptp: support ptp physical/virtual clocks conversion")
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Yangbo Lu <yangbo.lu@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/ptp/ptp_sysfs.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/ptp/ptp_sysfs.c b/drivers/ptp/ptp_sysfs.c
index 6a36590ca77a..b3d96b747292 100644
--- a/drivers/ptp/ptp_sysfs.c
+++ b/drivers/ptp/ptp_sysfs.c
@@ -179,7 +179,7 @@ static ssize_t n_vclocks_show(struct device *dev,
 	if (mutex_lock_interruptible(&ptp->n_vclocks_mux))
 		return -ERESTARTSYS;
 
-	size = snprintf(page, PAGE_SIZE - 1, "%d\n", ptp->n_vclocks);
+	size = snprintf(page, PAGE_SIZE - 1, "%u\n", ptp->n_vclocks);
 
 	mutex_unlock(&ptp->n_vclocks_mux);
 
@@ -252,7 +252,7 @@ static ssize_t max_vclocks_show(struct device *dev,
 	struct ptp_clock *ptp = dev_get_drvdata(dev);
 	ssize_t size;
 
-	size = snprintf(page, PAGE_SIZE - 1, "%d\n", ptp->max_vclocks);
+	size = snprintf(page, PAGE_SIZE - 1, "%u\n", ptp->max_vclocks);
 
 	return size;
 }

From 81c52c42afd92b741289208c65e5063b9e23ffb4 Mon Sep 17 00:00:00 2001
From: Xiaoliang Yang <xiaoliang.yang_1@nxp.com>
Date: Mon, 5 Jul 2021 18:26:53 +0800
Subject: [PATCH 082/158] net: stmmac: separate the tas basetime calculation
 function

Separate the TAS basetime calculation function so that it can be
called by other functions.

Signed-off-by: Xiaoliang Yang <xiaoliang.yang_1@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac.h  |  3 ++
 .../net/ethernet/stmicro/stmmac/stmmac_tc.c   | 38 ++++++++++++-------
 2 files changed, 28 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
index e735134e8487..fcdb1d20389b 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
@@ -349,6 +349,9 @@ void stmmac_enable_rx_queue(struct stmmac_priv *priv, u32 queue);
 void stmmac_disable_tx_queue(struct stmmac_priv *priv, u32 queue);
 void stmmac_enable_tx_queue(struct stmmac_priv *priv, u32 queue);
 int stmmac_xsk_wakeup(struct net_device *dev, u32 queue, u32 flags);
+struct timespec64 stmmac_calc_tas_basetime(ktime_t old_base_time,
+					   ktime_t current_time,
+					   u64 cycle_time);
 
 #if IS_ENABLED(CONFIG_STMMAC_SELFTESTS)
 void stmmac_selftest_run(struct net_device *dev,
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c
index 92dab609d4f8..596626c71189 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c
@@ -711,6 +711,29 @@ static int tc_setup_cls(struct stmmac_priv *priv,
 	return ret;
 }
 
+struct timespec64 stmmac_calc_tas_basetime(ktime_t old_base_time,
+					   ktime_t current_time,
+					   u64 cycle_time)
+{
+	struct timespec64 time;
+
+	if (ktime_after(old_base_time, current_time)) {
+		time = ktime_to_timespec64(old_base_time);
+	} else {
+		s64 n;
+		ktime_t base_time;
+
+		n = div64_s64(ktime_sub_ns(current_time, old_base_time),
+			      cycle_time);
+		base_time = ktime_add_ns(old_base_time,
+					 (n + 1) * cycle_time);
+
+		time = ktime_to_timespec64(base_time);
+	}
+
+	return time;
+}
+
 static int tc_setup_taprio(struct stmmac_priv *priv,
 			   struct tc_taprio_qopt_offload *qopt)
 {
@@ -814,19 +837,8 @@ static int tc_setup_taprio(struct stmmac_priv *priv,
 	/* Adjust for real system time */
 	priv->ptp_clock_ops.gettime64(&priv->ptp_clock_ops, &current_time);
 	current_time_ns = timespec64_to_ktime(current_time);
-	if (ktime_after(qopt->base_time, current_time_ns)) {
-		time = ktime_to_timespec64(qopt->base_time);
-	} else {
-		ktime_t base_time;
-		s64 n;
-
-		n = div64_s64(ktime_sub_ns(current_time_ns, qopt->base_time),
-			      qopt->cycle_time);
-		base_time = ktime_add_ns(qopt->base_time,
-					 (n + 1) * qopt->cycle_time);
-
-		time = ktime_to_timespec64(base_time);
-	}
+	time = stmmac_calc_tas_basetime(qopt->base_time, current_time_ns,
+					qopt->cycle_time);
 
 	priv->plat->est->btr[0] = (u32)time.tv_nsec;
 	priv->plat->est->btr[1] = (u32)time.tv_sec;

From b2aae654a4794ef898ad33a179f341eb610f6b85 Mon Sep 17 00:00:00 2001
From: Xiaoliang Yang <xiaoliang.yang_1@nxp.com>
Date: Mon, 5 Jul 2021 18:26:54 +0800
Subject: [PATCH 083/158] net: stmmac: add mutex lock to protect est parameters

Add a mutex lock to protect est structure parameters so that the
EST parameters can be updated by other threads.

Signed-off-by: Xiaoliang Yang <xiaoliang.yang_1@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c | 12 +++++++++++-
 include/linux/stmmac.h                          |  1 +
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c
index 596626c71189..2e3cdf540168 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c
@@ -796,14 +796,18 @@ static int tc_setup_taprio(struct stmmac_priv *priv,
 					 GFP_KERNEL);
 		if (!plat->est)
 			return -ENOMEM;
+
+		mutex_init(&priv->plat->est->lock);
 	} else {
 		memset(plat->est, 0, sizeof(*plat->est));
 	}
 
 	size = qopt->num_entries;
 
+	mutex_lock(&priv->plat->est->lock);
 	priv->plat->est->gcl_size = size;
 	priv->plat->est->enable = qopt->enable;
+	mutex_unlock(&priv->plat->est->lock);
 
 	for (i = 0; i < size; i++) {
 		s64 delta_ns = qopt->entries[i].interval;
@@ -834,6 +838,7 @@ static int tc_setup_taprio(struct stmmac_priv *priv,
 		priv->plat->est->gcl[i] = delta_ns | (gates << wid);
 	}
 
+	mutex_lock(&priv->plat->est->lock);
 	/* Adjust for real system time */
 	priv->ptp_clock_ops.gettime64(&priv->ptp_clock_ops, &current_time);
 	current_time_ns = timespec64_to_ktime(current_time);
@@ -847,8 +852,10 @@ static int tc_setup_taprio(struct stmmac_priv *priv,
 	priv->plat->est->ctr[0] = do_div(ctr, NSEC_PER_SEC);
 	priv->plat->est->ctr[1] = (u32)ctr;
 
-	if (fpe && !priv->dma_cap.fpesel)
+	if (fpe && !priv->dma_cap.fpesel) {
+		mutex_unlock(&priv->plat->est->lock);
 		return -EOPNOTSUPP;
+	}
 
 	/* Actual FPE register configuration will be done after FPE handshake
 	 * is success.
@@ -857,6 +864,7 @@ static int tc_setup_taprio(struct stmmac_priv *priv,
 
 	ret = stmmac_est_configure(priv, priv->ioaddr, priv->plat->est,
 				   priv->plat->clk_ptp_rate);
+	mutex_unlock(&priv->plat->est->lock);
 	if (ret) {
 		netdev_err(priv->dev, "failed to configure EST\n");
 		goto disable;
@@ -872,9 +880,11 @@ static int tc_setup_taprio(struct stmmac_priv *priv,
 	return 0;
 
 disable:
+	mutex_lock(&priv->plat->est->lock);
 	priv->plat->est->enable = false;
 	stmmac_est_configure(priv, priv->ioaddr, priv->plat->est,
 			     priv->plat->clk_ptp_rate);
+	mutex_unlock(&priv->plat->est->lock);
 
 	priv->plat->fpe_cfg->enable = false;
 	stmmac_fpe_configure(priv, priv->ioaddr,
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index d5ae621d66ba..09157b8a5810 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -115,6 +115,7 @@ struct stmmac_axi {
 
 #define EST_GCL		1024
 struct stmmac_est {
+	struct mutex lock;
 	int enable;
 	u32 btr_offset[2];
 	u32 btr[2];

From e9e3720002f61cd637a49ecafae77cac230eefae Mon Sep 17 00:00:00 2001
From: Xiaoliang Yang <xiaoliang.yang_1@nxp.com>
Date: Mon, 5 Jul 2021 18:26:55 +0800
Subject: [PATCH 084/158] net: stmmac: ptp: update tas basetime after ptp
 adjust

After adjusting the PTP time, the Qbv base time may lie in the past
relative to the new current time. The dwmac5 hardware does not allow
the base time to be set to a time in the past. This patch adds
btr_reserve to store the base time obtained from qopt, then recalculates
the base time and reconfigures Qbv after the PTP time is adjusted.

Signed-off-by: Xiaoliang Yang <xiaoliang.yang_1@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/stmicro/stmmac/stmmac_ptp.c  | 41 ++++++++++++++++++-
 .../net/ethernet/stmicro/stmmac/stmmac_tc.c   |  6 ++-
 include/linux/stmmac.h                        |  1 +
 3 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
index 4e86cdf2bc9f..580cc035536b 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
@@ -62,7 +62,8 @@ static int stmmac_adjust_time(struct ptp_clock_info *ptp, s64 delta)
 	u32 sec, nsec;
 	u32 quotient, reminder;
 	int neg_adj = 0;
-	bool xmac;
+	bool xmac, est_rst = false;
+	int ret;
 
 	xmac = priv->plat->has_gmac4 || priv->plat->has_xgmac;
 
@@ -75,10 +76,48 @@ static int stmmac_adjust_time(struct ptp_clock_info *ptp, s64 delta)
 	sec = quotient;
 	nsec = reminder;
 
+	/* If EST is enabled, disabled it before adjust ptp time. */
+	if (priv->plat->est && priv->plat->est->enable) {
+		est_rst = true;
+		mutex_lock(&priv->plat->est->lock);
+		priv->plat->est->enable = false;
+		stmmac_est_configure(priv, priv->ioaddr, priv->plat->est,
+				     priv->plat->clk_ptp_rate);
+		mutex_unlock(&priv->plat->est->lock);
+	}
+
 	spin_lock_irqsave(&priv->ptp_lock, flags);
 	stmmac_adjust_systime(priv, priv->ptpaddr, sec, nsec, neg_adj, xmac);
 	spin_unlock_irqrestore(&priv->ptp_lock, flags);
 
+	/* Caculate new basetime and re-configured EST after PTP time adjust. */
+	if (est_rst) {
+		struct timespec64 current_time, time;
+		ktime_t current_time_ns, basetime;
+		u64 cycle_time;
+
+		mutex_lock(&priv->plat->est->lock);
+		priv->ptp_clock_ops.gettime64(&priv->ptp_clock_ops, &current_time);
+		current_time_ns = timespec64_to_ktime(current_time);
+		time.tv_nsec = priv->plat->est->btr_reserve[0];
+		time.tv_sec = priv->plat->est->btr_reserve[1];
+		basetime = timespec64_to_ktime(time);
+		cycle_time = priv->plat->est->ctr[1] * NSEC_PER_SEC +
+			     priv->plat->est->ctr[0];
+		time = stmmac_calc_tas_basetime(basetime,
+						current_time_ns,
+						cycle_time);
+
+		priv->plat->est->btr[0] = (u32)time.tv_nsec;
+		priv->plat->est->btr[1] = (u32)time.tv_sec;
+		priv->plat->est->enable = true;
+		ret = stmmac_est_configure(priv, priv->ioaddr, priv->plat->est,
+					   priv->plat->clk_ptp_rate);
+		mutex_unlock(&priv->plat->est->lock);
+		if (ret)
+			netdev_err(priv->dev, "failed to configure EST\n");
+	}
+
 	return 0;
 }
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c
index 2e3cdf540168..4f3b6437b114 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c
@@ -739,7 +739,7 @@ static int tc_setup_taprio(struct stmmac_priv *priv,
 {
 	u32 size, wid = priv->dma_cap.estwid, dep = priv->dma_cap.estdep;
 	struct plat_stmmacenet_data *plat = priv->plat;
-	struct timespec64 time, current_time;
+	struct timespec64 time, current_time, qopt_time;
 	ktime_t current_time_ns;
 	bool fpe = false;
 	int i, ret = 0;
@@ -848,6 +848,10 @@ static int tc_setup_taprio(struct stmmac_priv *priv,
 	priv->plat->est->btr[0] = (u32)time.tv_nsec;
 	priv->plat->est->btr[1] = (u32)time.tv_sec;
 
+	qopt_time = ktime_to_timespec64(qopt->base_time);
+	priv->plat->est->btr_reserve[0] = (u32)qopt_time.tv_nsec;
+	priv->plat->est->btr_reserve[1] = (u32)qopt_time.tv_sec;
+
 	ctr = qopt->cycle_time;
 	priv->plat->est->ctr[0] = do_div(ctr, NSEC_PER_SEC);
 	priv->plat->est->ctr[1] = (u32)ctr;
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index 09157b8a5810..a6f03b36fc4f 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -117,6 +117,7 @@ struct stmmac_axi {
 struct stmmac_est {
 	struct mutex lock;
 	int enable;
+	u32 btr_reserve[2];
 	u32 btr_offset[2];
 	u32 btr[2];
 	u32 ctr[2];

From 5a0ae9872d5cb5f27590eed168d4b3b144350ed7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= <toke@redhat.com>
Date: Mon, 5 Jul 2021 12:38:41 +0200
Subject: [PATCH 085/158] bpf, samples: Add -fno-asynchronous-unwind-tables to
 BPF Clang invocation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The samples/bpf Makefile currently compiles BPF files in a way that will
produce an .eh_frame section, which will in turn confuse libbpf and produce
errors when loading BPF programs, like:

  libbpf: elf: skipping unrecognized data section(32) .eh_frame
  libbpf: elf: skipping relo section(33) .rel.eh_frame for section(32) .eh_frame

Fix this by instructing Clang not to produce this section, as it's useless
for BPF anyway.

Suggested-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20210705103841.180260-1-toke@redhat.com
---
 samples/bpf/Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 520434ea966f..036998d11ded 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -331,6 +331,7 @@ $(obj)/%.o: $(src)/%.c
 		-Wno-gnu-variable-sized-type-not-at-end \
 		-Wno-address-of-packed-member -Wno-tautological-compare \
 		-Wno-unknown-warning-option $(CLANG_ARCH_ARGS) \
+		-fno-asynchronous-unwind-tables \
 		-I$(srctree)/samples/bpf/ -include asm_goto_workaround.h \
 		-O2 -emit-llvm -Xclang -disable-llvm-passes -c $< -o - | \
 		$(OPT) -O2 -mtriple=bpf-pc-linux | $(LLVM_DIS) | \

From 2620e92ae6ed83260eb46d214554cd308ee35d92 Mon Sep 17 00:00:00 2001
From: Wang Hai <wanghai38@huawei.com>
Date: Mon, 28 Jun 2021 17:18:15 +0800
Subject: [PATCH 086/158] bpf, samples: Fix xdpsock with '-M' parameter missing
 unload process

Execute the following command and exit, then execute it again, the following
error will be reported:

  $ sudo ./samples/bpf/xdpsock -i ens4f2 -M
  ^C
  $ sudo ./samples/bpf/xdpsock -i ens4f2 -M
  libbpf: elf: skipping unrecognized data section(16) .eh_frame
  libbpf: elf: skipping relo section(17) .rel.eh_frame for section(16) .eh_frame
  libbpf: Kernel error message: XDP program already attached
  ERROR: link set xdp fd failed

Commit c9d27c9e8dc7 ("samples: bpf: Do not unload prog within xdpsock") removed
the unloading prog code because of the presence of bpf_link. This is fine if
XDP_SHARED_UMEM is disabled, but if it is enabled, unloading the prog is still
needed.

Fixes: c9d27c9e8dc7 ("samples: bpf: Do not unload prog within xdpsock")
Signed-off-by: Wang Hai <wanghai38@huawei.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Magnus Karlsson <magnus.karlsson@intel.com>
Cc: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Link: https://lore.kernel.org/bpf/20210628091815.2373487-1-wanghai38@huawei.com
---
 samples/bpf/xdpsock_user.c | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c
index 53e300f860bb..33d0bdebbed8 100644
--- a/samples/bpf/xdpsock_user.c
+++ b/samples/bpf/xdpsock_user.c
@@ -96,6 +96,7 @@ static int opt_xsk_frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
 static int opt_timeout = 1000;
 static bool opt_need_wakeup = true;
 static u32 opt_num_xsks = 1;
+static u32 prog_id;
 static bool opt_busy_poll;
 static bool opt_reduced_cap;
 
@@ -461,6 +462,23 @@ static void *poller(void *arg)
 	return NULL;
 }
 
+static void remove_xdp_program(void)
+{
+	u32 curr_prog_id = 0;
+
+	if (bpf_get_link_xdp_id(opt_ifindex, &curr_prog_id, opt_xdp_flags)) {
+		printf("bpf_get_link_xdp_id failed\n");
+		exit(EXIT_FAILURE);
+	}
+
+	if (prog_id == curr_prog_id)
+		bpf_set_link_xdp_fd(opt_ifindex, -1, opt_xdp_flags);
+	else if (!curr_prog_id)
+		printf("couldn't find a prog id on a given interface\n");
+	else
+		printf("program on interface changed, not removing\n");
+}
+
 static void int_exit(int sig)
 {
 	benchmark_done = true;
@@ -471,6 +489,9 @@ static void __exit_with_error(int error, const char *file, const char *func,
 {
 	fprintf(stderr, "%s:%s:%i: errno: %d/\"%s\"\n", file, func,
 		line, error, strerror(error));
+
+	if (opt_num_xsks > 1)
+		remove_xdp_program();
 	exit(EXIT_FAILURE);
 }
 
@@ -490,6 +511,9 @@ static void xdpsock_cleanup(void)
 		if (write(sock, &cmd, sizeof(int)) < 0)
 			exit_with_error(errno);
 	}
+
+	if (opt_num_xsks > 1)
+		remove_xdp_program();
 }
 
 static void swap_mac_addresses(void *data)
@@ -857,6 +881,10 @@ static struct xsk_socket_info *xsk_configure_socket(struct xsk_umem_info *umem,
 	if (ret)
 		exit_with_error(-ret);
 
+	ret = bpf_get_link_xdp_id(opt_ifindex, &prog_id, opt_xdp_flags);
+	if (ret)
+		exit_with_error(-ret);
+
 	xsk->app_stats.rx_empty_polls = 0;
 	xsk->app_stats.fill_fail_polls = 0;
 	xsk->app_stats.copy_tx_sendtos = 0;

From bc832065b60f973771ff3e657214bb21b559833c Mon Sep 17 00:00:00 2001
From: Gu Shengxian <gushengxian@yulong.com>
Date: Mon, 5 Jul 2021 18:35:43 -0700
Subject: [PATCH 087/158] bpftool: Properly close va_list 'ap' by va_end() on
 error

va_list 'ap' was opened but not closed by va_end() in error case. It should
be closed by va_end() before the return.

Fixes: aa52bcbe0e72 ("tools: bpftool: Fix json dump crash on powerpc")
Signed-off-by: Gu Shengxian <gushengxian@yulong.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Link: https://lore.kernel.org/bpf/20210706013543.671114-1-gushengxian507419@gmail.com
---
 tools/bpf/bpftool/jit_disasm.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tools/bpf/bpftool/jit_disasm.c b/tools/bpf/bpftool/jit_disasm.c
index e7e7eee9f172..24734f2249d6 100644
--- a/tools/bpf/bpftool/jit_disasm.c
+++ b/tools/bpf/bpftool/jit_disasm.c
@@ -43,11 +43,13 @@ static int fprintf_json(void *out, const char *fmt, ...)
 {
 	va_list ap;
 	char *s;
+	int err;
 
 	va_start(ap, fmt);
-	if (vasprintf(&s, fmt, ap) < 0)
-		return -1;
+	err = vasprintf(&s, fmt, ap);
 	va_end(ap);
+	if (err < 0)
+		return -1;
 
 	if (!oper_count) {
 		int i;

From c4edc3ccbc63947e697bd2e30afca8bfaa144998 Mon Sep 17 00:00:00 2001
From: Ali Abdallah <aabdallah@suse.de>
Date: Thu, 20 May 2021 12:53:11 +0200
Subject: [PATCH 088/158] netfilter: conntrack: improve RST handling when tuple
 is re-used

If we receive a SYN packet in original direction on an existing
connection tracking entry, we let this SYN through because conntrack
might be out-of-sync.

Conntrack gets back in sync when server responds with SYN/ACK and state
gets updated accordingly.

However, if server replies with RST, this packet might be marked as
INVALID because td_maxack value reflects the *old* conntrack state
and not the state of the originator of the RST.

Avoid td_maxack-based checks if previous packet was a SYN.

Unfortunately that is not enough: an out of order ACK in original
direction updates last_index, so we still end up marking a valid RST
as INVALID.

Thus disable the sequence check when we are not in established state and
the received RST has a sequence of 0.

Because marking RSTs as invalid usually leads to unwanted timeouts,
also skip RST sequence checks if a conntrack entry is already closing.

Such entries can already be evicted via GC in case the table is full.

Co-developed-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Ali Abdallah <aabdallah@suse.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_proto_tcp.c | 53 +++++++++++++++++---------
 1 file changed, 36 insertions(+), 17 deletions(-)

diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index eb4de92077a8..b8ff67671e93 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -823,6 +823,22 @@ static noinline bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
 	return true;
 }
 
+static bool tcp_can_early_drop(const struct nf_conn *ct)
+{
+	switch (ct->proto.tcp.state) {
+	case TCP_CONNTRACK_FIN_WAIT:
+	case TCP_CONNTRACK_LAST_ACK:
+	case TCP_CONNTRACK_TIME_WAIT:
+	case TCP_CONNTRACK_CLOSE:
+	case TCP_CONNTRACK_CLOSE_WAIT:
+		return true;
+	default:
+		break;
+	}
+
+	return false;
+}
+
 /* Returns verdict for packet, or -1 for invalid. */
 int nf_conntrack_tcp_packet(struct nf_conn *ct,
 			    struct sk_buff *skb,
@@ -1030,9 +1046,28 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
 		if (index != TCP_RST_SET)
 			break;
 
-		if (ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET) {
+		/* If we are closing, tuple might have been re-used already.
+		 * last_index, last_ack, and all other ct fields used for
+		 * sequence/window validation are outdated in that case.
+		 *
+		 * As the conntrack can already be expired by GC under pressure,
+		 * just skip validation checks.
+		 */
+		if (tcp_can_early_drop(ct))
+			goto in_window;
+
+		/* td_maxack might be outdated if we let a SYN through earlier */
+		if ((ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET) &&
+		    ct->proto.tcp.last_index != TCP_SYN_SET) {
 			u32 seq = ntohl(th->seq);
 
+			/* If we are not in established state and SEQ=0 this is most
+			 * likely an answer to a SYN we let go through above (last_index
+			 * can be updated due to out-of-order ACKs).
+			 */
+			if (seq == 0 && !nf_conntrack_tcp_established(ct))
+				break;
+
 			if (before(seq, ct->proto.tcp.seen[!dir].td_maxack)) {
 				/* Invalid RST  */
 				spin_unlock_bh(&ct->lock);
@@ -1165,22 +1200,6 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
 	return NF_ACCEPT;
 }
 
-static bool tcp_can_early_drop(const struct nf_conn *ct)
-{
-	switch (ct->proto.tcp.state) {
-	case TCP_CONNTRACK_FIN_WAIT:
-	case TCP_CONNTRACK_LAST_ACK:
-	case TCP_CONNTRACK_TIME_WAIT:
-	case TCP_CONNTRACK_CLOSE:
-	case TCP_CONNTRACK_CLOSE_WAIT:
-		return true;
-	default:
-		break;
-	}
-
-	return false;
-}
-
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 
 #include <linux/netfilter/nfnetlink.h>

From 1da4cd82dd180224503e745ccf3220e3490d8897 Mon Sep 17 00:00:00 2001
From: Ali Abdallah <ali.abdallah@suse.com>
Date: Thu, 27 May 2021 09:19:06 +0200
Subject: [PATCH 089/158] netfilter: conntrack: add new sysctl to disable RST
 check

This patch adds a new sysctl tcp_ignore_invalid_rst to disable marking
out of window RST segments as INVALID.

Signed-off-by: Ali Abdallah <aabdallah@suse.de>
Acked-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 Documentation/networking/nf_conntrack-sysctl.rst |  6 ++++++
 include/net/netns/conntrack.h                    |  1 +
 net/netfilter/nf_conntrack_proto_tcp.c           |  6 +++++-
 net/netfilter/nf_conntrack_standalone.c          | 10 ++++++++++
 4 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/Documentation/networking/nf_conntrack-sysctl.rst b/Documentation/networking/nf_conntrack-sysctl.rst
index 0467b30e4abe..d31ed6c1cb0d 100644
--- a/Documentation/networking/nf_conntrack-sysctl.rst
+++ b/Documentation/networking/nf_conntrack-sysctl.rst
@@ -110,6 +110,12 @@ nf_conntrack_tcp_be_liberal - BOOLEAN
 	Be conservative in what you do, be liberal in what you accept from others.
 	If it's non-zero, we mark only out of window RST segments as INVALID.
 
+nf_conntrack_tcp_ignore_invalid_rst - BOOLEAN
+	- 0 - disabled (default)
+	- 1 - enabled
+
+	If it's 1, we don't mark out of window RST segments as INVALID.
+
 nf_conntrack_tcp_loose - BOOLEAN
 	- 0 - disabled
 	- not 0 - enabled (default)
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index c3094b83a525..37e5300c7e5a 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -27,6 +27,7 @@ struct nf_tcp_net {
 	u8 tcp_loose;
 	u8 tcp_be_liberal;
 	u8 tcp_max_retrans;
+	u8 tcp_ignore_invalid_rst;
 #if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
 	unsigned int offload_timeout;
 	unsigned int offload_pickup;
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index b8ff67671e93..3259416f2ea4 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -1068,7 +1068,8 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
 			if (seq == 0 && !nf_conntrack_tcp_established(ct))
 				break;
 
-			if (before(seq, ct->proto.tcp.seen[!dir].td_maxack)) {
+			if (before(seq, ct->proto.tcp.seen[!dir].td_maxack) &&
+			    !tn->tcp_ignore_invalid_rst) {
 				/* Invalid RST  */
 				spin_unlock_bh(&ct->lock);
 				nf_ct_l4proto_log_invalid(skb, ct, state, "invalid rst");
@@ -1466,6 +1467,9 @@ void nf_conntrack_tcp_init_net(struct net *net)
 	 */
 	tn->tcp_be_liberal = 0;
 
+	/* If it's non-zero, we turn off RST sequence number check */
+	tn->tcp_ignore_invalid_rst = 0;
+
 	/* Max number of the retransmitted packets without receiving an (acceptable)
 	 * ACK from the destination. If this number is reached, a shorter timer
 	 * will be started.
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index f57a951c9b5e..214d9f9e499b 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -579,6 +579,7 @@ enum nf_ct_sysctl_index {
 #endif
 	NF_SYSCTL_CT_PROTO_TCP_LOOSE,
 	NF_SYSCTL_CT_PROTO_TCP_LIBERAL,
+	NF_SYSCTL_CT_PROTO_TCP_IGNORE_INVALID_RST,
 	NF_SYSCTL_CT_PROTO_TCP_MAX_RETRANS,
 	NF_SYSCTL_CT_PROTO_TIMEOUT_UDP,
 	NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_STREAM,
@@ -798,6 +799,14 @@ static struct ctl_table nf_ct_sysctl_table[] = {
 		.extra1 	= SYSCTL_ZERO,
 		.extra2 	= SYSCTL_ONE,
 	},
+	[NF_SYSCTL_CT_PROTO_TCP_IGNORE_INVALID_RST] = {
+		.procname	= "nf_conntrack_tcp_ignore_invalid_rst",
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
 	[NF_SYSCTL_CT_PROTO_TCP_MAX_RETRANS] = {
 		.procname	= "nf_conntrack_tcp_max_retrans",
 		.maxlen		= sizeof(u8),
@@ -1004,6 +1013,7 @@ static void nf_conntrack_standalone_init_tcp_sysctl(struct net *net,
 	XASSIGN(LOOSE, &tn->tcp_loose);
 	XASSIGN(LIBERAL, &tn->tcp_be_liberal);
 	XASSIGN(MAX_RETRANS, &tn->tcp_max_retrans);
+	XASSIGN(IGNORE_INVALID_RST, &tn->tcp_ignore_invalid_rst);
 #undef XASSIGN
 
 #if IS_ENABLED(CONFIG_NF_FLOW_TABLE)

From cf4466ea47db891be785f867ca7f99e0cd9898c6 Mon Sep 17 00:00:00 2001
From: Manfred Spraul <manfred@colorfullife.com>
Date: Sun, 27 Jun 2021 18:19:18 +0200
Subject: [PATCH 090/158] netfilter: conntrack: Mark access for KCSAN

KCSAN detected a data race with ipc/sem.c that is intentional.

As nf_conntrack_lock() uses the same algorithm: Update
nf_conntrack_core as well:

nf_conntrack_lock() contains
  a1) spin_lock()
  a2) smp_load_acquire(nf_conntrack_locks_all).

a1) actually accesses one lock from an array of locks.

nf_conntrack_locks_all() contains
  b1) nf_conntrack_locks_all=true (normal write)
  b2) spin_lock()
  b3) spin_unlock()

b2 and b3 are done for every lock.

This guarantees that nf_conntrack_locks_all() prevents any
concurrent nf_conntrack_lock() owners:
If a thread is past a1), then b2) will block until that thread releases
the lock.
If the thread is before a1), then b3)+a1) ensure the write b1) is
visible, thus a2) is guaranteed to see the updated value.

But: This is only the latest time when b1) becomes visible.
It may also happen that b1) is visible an undefined amount of time
before the b3). And thus KCSAN will notice a data race.

In addition, the compiler might be too clever.

Solution: Use WRITE_ONCE().

Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_core.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 085a11f1eb43..83c52df85870 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -149,7 +149,15 @@ static void nf_conntrack_all_lock(void)
 
 	spin_lock(&nf_conntrack_locks_all_lock);
 
-	nf_conntrack_locks_all = true;
+	/* For nf_contrack_locks_all, only the latest time when another
+	 * CPU will see an update is controlled, by the "release" of the
+	 * spin_lock below.
+	 * The earliest time is not controlled, an thus KCSAN could detect
+	 * a race when nf_conntract_lock() reads the variable.
+	 * WRITE_ONCE() is used to ensure the compiler will not
+	 * optimize the write.
+	 */
+	WRITE_ONCE(nf_conntrack_locks_all, true);
 
 	for (i = 0; i < CONNTRACK_LOCKS; i++) {
 		spin_lock(&nf_conntrack_locks[i]);

From 6ac4bac4ce48604cf0f4b04d61884552520ca55e Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 5 Jul 2021 17:45:36 +0200
Subject: [PATCH 091/158] netfilter: nft_last: honor NFTA_LAST_SET on
 restoration

NFTA_LAST_SET tells us if this expression has ever seen a packet, do not
ignore this attribute when restoring the ruleset.

Fixes: 836382dc2471 ("netfilter: nf_tables: add last expression")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_last.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/nft_last.c b/net/netfilter/nft_last.c
index 913ac45167f2..bbb352b64c73 100644
--- a/net/netfilter/nft_last.c
+++ b/net/netfilter/nft_last.c
@@ -23,15 +23,21 @@ static int nft_last_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
 {
 	struct nft_last_priv *priv = nft_expr_priv(expr);
 	u64 last_jiffies;
+	u32 last_set = 0;
 	int err;
 
-	if (tb[NFTA_LAST_MSECS]) {
+	if (tb[NFTA_LAST_SET]) {
+		last_set = ntohl(nla_get_be32(tb[NFTA_LAST_SET]));
+		if (last_set == 1)
+			priv->last_set = 1;
+	}
+
+	if (last_set && tb[NFTA_LAST_MSECS]) {
 		err = nf_msecs_to_jiffies64(tb[NFTA_LAST_MSECS], &last_jiffies);
 		if (err < 0)
 			return err;
 
 		priv->last_jiffies = jiffies + (unsigned long)last_jiffies;
-		priv->last_set = 1;
 	}
 
 	return 0;

From d1b5b80da7058883758df2b5b7f506d4d4f9a5fa Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 5 Jul 2021 20:14:21 +0200
Subject: [PATCH 092/158] netfilter: nft_last: incorrect arithmetics when
 restoring last used

Subtract the jiffies that have passed from the current jiffies to fix
last used restoration.

Fixes: 836382dc2471 ("netfilter: nf_tables: add last expression")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_last.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/nft_last.c b/net/netfilter/nft_last.c
index bbb352b64c73..8088b99f2ee3 100644
--- a/net/netfilter/nft_last.c
+++ b/net/netfilter/nft_last.c
@@ -37,7 +37,7 @@ static int nft_last_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
 		if (err < 0)
 			return err;
 
-		priv->last_jiffies = jiffies + (unsigned long)last_jiffies;
+		priv->last_jiffies = jiffies - (unsigned long)last_jiffies;
 	}
 
 	return 0;

From 8550ff8d8c75416e984d9c4b082845e57e560984 Mon Sep 17 00:00:00 2001
From: Paul Blakey <paulb@nvidia.com>
Date: Mon, 5 Jul 2021 13:54:51 +0300
Subject: [PATCH 093/158] skbuff: Release nfct refcount on napi stolen or
 re-used skbs

When multiple SKBs are merged to a new skb under napi GRO,
or SKB is re-used by napi, if nfct was set for them in the
driver, it will not be released while freeing their stolen
head state or on re-use.

Release nfct on napi's stolen or re-used SKBs, and
in gro_list_prepare, check conntrack metadata diff.

Fixes: 5c6b94604744 ("net/mlx5e: CT: Handle misses after executing CT action")
Reviewed-by: Roi Dayan <roid@nvidia.com>
Signed-off-by: Paul Blakey <paulb@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c    | 13 +++++++++++++
 net/core/skbuff.c |  1 +
 2 files changed, 14 insertions(+)

diff --git a/net/core/dev.c b/net/core/dev.c
index c253c2aafe97..177a5aec0b6b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6008,6 +6008,18 @@ static void gro_list_prepare(const struct list_head *head,
 			diffs = memcmp(skb_mac_header(p),
 				       skb_mac_header(skb),
 				       maclen);
+
+		diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb);
+
+		if (!diffs) {
+			struct tc_skb_ext *skb_ext = skb_ext_find(skb, TC_SKB_EXT);
+			struct tc_skb_ext *p_ext = skb_ext_find(p, TC_SKB_EXT);
+
+			diffs |= (!!p_ext) ^ (!!skb_ext);
+			if (!diffs && unlikely(skb_ext))
+				diffs |= p_ext->chain ^ skb_ext->chain;
+		}
+
 		NAPI_GRO_CB(p)->same_flow = !diffs;
 	}
 }
@@ -6270,6 +6282,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
 	skb_shinfo(skb)->gso_type = 0;
 	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
 	skb_ext_reset(skb);
+	nf_reset_ct(skb);
 
 	napi->skb = skb;
 }
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 12aabcda6db2..f63de967ac25 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -943,6 +943,7 @@ void __kfree_skb_defer(struct sk_buff *skb)
 
 void napi_skb_free_stolen_head(struct sk_buff *skb)
 {
+	nf_reset_ct(skb);
 	skb_dst_drop(skb);
 	skb_ext_put(skb);
 	napi_skb_cache_put(skb);

From be5d1b61a2ad28c7e57fe8bfa277373e8ecffcdc Mon Sep 17 00:00:00 2001
From: Nguyen Dinh Phi <phind.uet@gmail.com>
Date: Tue, 6 Jul 2021 07:19:12 +0800
Subject: [PATCH 094/158] tcp: fix tcp_init_transfer() to not reset
 icsk_ca_initialized

This commit fixes a bug (found by syzkaller) that could cause spurious
double-initializations for congestion control modules, which could cause
memory leaks or other problems for congestion control modules (like CDG)
that allocate memory in their init functions.

The buggy scenario constructed by syzkaller was something like:

(1) create a TCP socket
(2) initiate a TFO connect via sendto()
(3) while socket is in TCP_SYN_SENT, call setsockopt(TCP_CONGESTION),
    which calls:
       tcp_set_congestion_control() ->
         tcp_reinit_congestion_control() ->
           tcp_init_congestion_control()
(4) receive ACK, connection is established, call tcp_init_transfer(),
    set icsk_ca_initialized=0 (without first calling cc->release()),
    call tcp_init_congestion_control() again.

Note that in this sequence tcp_init_congestion_control() is called
twice without a cc->release() call in between. Thus, for CC modules
that allocate memory in their init() function, e.g, CDG, a memory leak
may occur. The syzkaller tool managed to find a reproducer that
triggered such a leak in CDG.

The bug was introduced when commit 8919a9b31eb4 ("tcp: Only init
congestion control if not initialized already")
introduced icsk_ca_initialized and set icsk_ca_initialized to 0 in
tcp_init_transfer(), missing the possibility for a sequence like the
one above, where a process could call setsockopt(TCP_CONGESTION) in
state TCP_SYN_SENT (i.e. after the connect() or TFO open sendmsg()),
which would call tcp_init_congestion_control(). It did not intend to
reset any initialization that the user had already explicitly made;
it just missed the possibility of that particular sequence (which
syzkaller managed to find).

Fixes: 8919a9b31eb4 ("tcp: Only init congestion control if not initialized already")
Reported-by: syzbot+f1e24a0594d4e3a895d3@syzkaller.appspotmail.com
Signed-off-by: Nguyen Dinh Phi <phind.uet@gmail.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Tested-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e6ca5a1f3b59..a5a8d0a378b2 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5922,8 +5922,8 @@ void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb)
 		tp->snd_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk));
 	tp->snd_cwnd_stamp = tcp_jiffies32;
 
-	icsk->icsk_ca_initialized = 0;
 	bpf_skops_established(sk, bpf_op, skb);
+	/* Initialize congestion control unless BPF initialized it already: */
 	if (!icsk->icsk_ca_initialized)
 		tcp_init_congestion_control(sk);
 	tcp_init_buffer_space(sk);

From b648eba4c69e5819880b4907e7fcb2bb576069ab Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Mon, 5 Jul 2021 15:38:06 +0000
Subject: [PATCH 095/158] bonding: fix suspicious RCU usage in
 bond_ipsec_add_sa()

bond_ipsec_add_sa() uses rcu_dereference() to dereference
bond->curr_active_slave, but neither it nor its caller holds the RCU
read lock, so a warning occurs. Add rcu_read_lock()/rcu_read_unlock().

Test commands:
    ip link add dummy0 type dummy
    ip link add bond0 type bond
    ip link set dummy0 master bond0
    ip link set dummy0 up
    ip link set bond0 up
    ip x s add proto esp dst 14.1.1.1 src 15.1.1.1 spi 0x07 \
	    mode transport \
	    reqid 0x07 replay-window 32 aead 'rfc4106(gcm(aes))' \
	    0x44434241343332312423222114131211f4f3f2f1 128 sel \
	    src 14.0.0.52/24 dst 14.0.0.70/24 proto tcp offload \
	    dev bond0 dir in

Splat looks like:
=============================
WARNING: suspicious RCU usage
5.13.0-rc3+ #1168 Not tainted
-----------------------------
drivers/net/bonding/bond_main.c:411 suspicious rcu_dereference_check() usage!

other info that might help us debug this:

rcu_scheduler_active = 2, debug_locks = 1
1 lock held by ip/684:
 #0: ffffffff9a2757c0 (&net->xfrm.xfrm_cfg_mutex){+.+.}-{3:3},
at: xfrm_netlink_rcv+0x59/0x80 [xfrm_user]
   55.191733][  T684] stack backtrace:
CPU: 0 PID: 684 Comm: ip Not tainted 5.13.0-rc3+ #1168
Call Trace:
 dump_stack+0xa4/0xe5
 bond_ipsec_add_sa+0x18c/0x1f0 [bonding]
 xfrm_dev_state_add+0x2a9/0x770
 ? memcpy+0x38/0x60
 xfrm_add_sa+0x2278/0x3b10 [xfrm_user]
 ? xfrm_get_policy+0xaa0/0xaa0 [xfrm_user]
 ? register_lock_class+0x1750/0x1750
 xfrm_user_rcv_msg+0x331/0x660 [xfrm_user]
 ? rcu_read_lock_sched_held+0x91/0xc0
 ? xfrm_user_state_lookup.constprop.39+0x320/0x320 [xfrm_user]
 ? find_held_lock+0x3a/0x1c0
 ? mutex_lock_io_nested+0x1210/0x1210
 ? sched_clock_cpu+0x18/0x170
 netlink_rcv_skb+0x121/0x350
 ? xfrm_user_state_lookup.constprop.39+0x320/0x320 [xfrm_user]
 ? netlink_ack+0x9d0/0x9d0
 ? netlink_deliver_tap+0x17c/0xa50
 xfrm_netlink_rcv+0x68/0x80 [xfrm_user]
 netlink_unicast+0x41c/0x610
 ? netlink_attachskb+0x710/0x710
 netlink_sendmsg+0x6b9/0xb70
[ ... ]

Fixes: 18cb261afd7b ("bonding: support hardware encryption offload to slaves")
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 0ff7567bd04f..d4d718e04dcc 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -403,10 +403,12 @@ static int bond_ipsec_add_sa(struct xfrm_state *xs)
 	struct net_device *bond_dev = xs->xso.dev;
 	struct bonding *bond;
 	struct slave *slave;
+	int err;
 
 	if (!bond_dev)
 		return -EINVAL;
 
+	rcu_read_lock();
 	bond = netdev_priv(bond_dev);
 	slave = rcu_dereference(bond->curr_active_slave);
 	xs->xso.real_dev = slave->dev;
@@ -415,10 +417,13 @@ static int bond_ipsec_add_sa(struct xfrm_state *xs)
 	if (!(slave->dev->xfrmdev_ops
 	      && slave->dev->xfrmdev_ops->xdo_dev_state_add)) {
 		slave_warn(bond_dev, slave->dev, "Slave does not support ipsec offload\n");
+		rcu_read_unlock();
 		return -EINVAL;
 	}
 
-	return slave->dev->xfrmdev_ops->xdo_dev_state_add(xs);
+	err = slave->dev->xfrmdev_ops->xdo_dev_state_add(xs);
+	rcu_read_unlock();
+	return err;
 }
 
 /**

From 105cd17a866017b45f3c45901b394c711c97bf40 Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Mon, 5 Jul 2021 15:38:07 +0000
Subject: [PATCH 096/158] bonding: fix null dereference in bond_ipsec_add_sa()

If a bond doesn't have a real device, bond->curr_active_slave is null.
But bond_ipsec_add_sa() dereferences bond->curr_active_slave without
a null check.
So, a null-ptr-deref would occur.

Test commands:
    ip link add bond0 type bond
    ip link set bond0 up
    ip x s add proto esp dst 14.1.1.1 src 15.1.1.1 spi \
0x07 mode transport reqid 0x07 replay-window 32 aead 'rfc4106(gcm(aes))' \
0x44434241343332312423222114131211f4f3f2f1 128 sel src 14.0.0.52/24 \
dst 14.0.0.70/24 proto tcp offload dev bond0 dir in

Splat looks like:
KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007]
CPU: 4 PID: 680 Comm: ip Not tainted 5.13.0-rc3+ #1168
RIP: 0010:bond_ipsec_add_sa+0xc4/0x2e0 [bonding]
Code: 85 21 02 00 00 4d 8b a6 48 0c 00 00 e8 75 58 44 ce 85 c0 0f 85 14
01 00 00 48 b8 00 00 00 00 00 fc ff df 4c 89 e2 48 c1 ea 03 <80> 3c 02
00 0f 85 fc 01 00 00 48 8d bb e0 02 00 00 4d 8b 2c 24 48
RSP: 0018:ffff88810946f508 EFLAGS: 00010246
RAX: dffffc0000000000 RBX: ffff88810b4e8040 RCX: 0000000000000001
RDX: 0000000000000000 RSI: ffffffff8fe34280 RDI: ffff888115abe100
RBP: ffff88810946f528 R08: 0000000000000003 R09: fffffbfff2287e11
R10: 0000000000000001 R11: ffff888115abe0c8 R12: 0000000000000000
R13: ffffffffc0aea9a0 R14: ffff88800d7d2000 R15: ffff88810b4e8330
FS:  00007efc5552e680(0000) GS:ffff888119c00000(0000)
knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 000055c2530dbf40 CR3: 0000000103056004 CR4: 00000000003706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 xfrm_dev_state_add+0x2a9/0x770
 ? memcpy+0x38/0x60
 xfrm_add_sa+0x2278/0x3b10 [xfrm_user]
 ? xfrm_get_policy+0xaa0/0xaa0 [xfrm_user]
 ? register_lock_class+0x1750/0x1750
 xfrm_user_rcv_msg+0x331/0x660 [xfrm_user]
 ? rcu_read_lock_sched_held+0x91/0xc0
 ? xfrm_user_state_lookup.constprop.39+0x320/0x320 [xfrm_user]
 ? find_held_lock+0x3a/0x1c0
 ? mutex_lock_io_nested+0x1210/0x1210
 ? sched_clock_cpu+0x18/0x170
 netlink_rcv_skb+0x121/0x350
 ? xfrm_user_state_lookup.constprop.39+0x320/0x320 [xfrm_user]
 ? netlink_ack+0x9d0/0x9d0
 ? netlink_deliver_tap+0x17c/0xa50
 xfrm_netlink_rcv+0x68/0x80 [xfrm_user]
 netlink_unicast+0x41c/0x610
 ? netlink_attachskb+0x710/0x710
 netlink_sendmsg+0x6b9/0xb70
[ ...]

Fixes: 18cb261afd7b ("bonding: support hardware encryption offload to slaves")
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index d4d718e04dcc..5466b24ceab6 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -411,6 +411,11 @@ static int bond_ipsec_add_sa(struct xfrm_state *xs)
 	rcu_read_lock();
 	bond = netdev_priv(bond_dev);
 	slave = rcu_dereference(bond->curr_active_slave);
+	if (!slave) {
+		rcu_read_unlock();
+		return -ENODEV;
+	}
+
 	xs->xso.real_dev = slave->dev;
 	bond->xs = xs;
 

From 09adf7566d436322ced595b166dea48b06852efe Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Mon, 5 Jul 2021 15:38:08 +0000
Subject: [PATCH 097/158] net: netdevsim: use xso.real_dev instead of xso.dev
 in callback functions of struct xfrmdev_ops

There are two pointers in struct xfrm_state_offload, *dev, *real_dev.
These are used in callback functions of struct xfrmdev_ops.
The *dev points to either the bonding interface or the real interface.
If bonding ipsec offload is used, it points to the bonding interface; if
not, it points to the real interface.
And real_dev always points to the real interface.
So, netdevsim should always use real_dev instead of dev.
Of course, real_dev is never null.

Test commands:
    ip netns add A
    ip netns exec A bash
    modprobe netdevsim
    echo "1 1" > /sys/bus/netdevsim/new_device
    ip link add bond0 type bond mode active-backup
    ip link set eth0 master bond0
    ip link set eth0 up
    ip link set bond0 up
    ip x s add proto esp dst 14.1.1.1 src 15.1.1.1 spi 0x07 mode \
transport reqid 0x07 replay-window 32 aead 'rfc4106(gcm(aes))' \
0x44434241343332312423222114131211f4f3f2f1 128 sel src 14.0.0.52/24 \
dst 14.0.0.70/24 proto tcp offload dev bond0 dir in

Splat looks like:
BUG: spinlock bad magic on CPU#5, kworker/5:1/53
 lock: 0xffff8881068c2cc8, .magic: 11121314, .owner: <none>/-1,
.owner_cpu: -235736076
CPU: 5 PID: 53 Comm: kworker/5:1 Not tainted 5.13.0-rc3+ #1168
Workqueue: events linkwatch_event
Call Trace:
 dump_stack+0xa4/0xe5
 do_raw_spin_lock+0x20b/0x270
 ? rwlock_bug.part.1+0x90/0x90
 _raw_spin_lock_nested+0x5f/0x70
 bond_get_stats+0xe4/0x4c0 [bonding]
 ? rcu_read_lock_sched_held+0xc0/0xc0
 ? bond_neigh_init+0x2c0/0x2c0 [bonding]
 ? dev_get_alias+0xe2/0x190
 ? dev_get_port_parent_id+0x14a/0x360
 ? rtnl_unregister+0x190/0x190
 ? dev_get_phys_port_name+0xa0/0xa0
 ? memset+0x1f/0x40
 ? memcpy+0x38/0x60
 ? rtnl_phys_switch_id_fill+0x91/0x100
 dev_get_stats+0x8c/0x270
 rtnl_fill_stats+0x44/0xbe0
 ? nla_put+0xbe/0x140
 rtnl_fill_ifinfo+0x1054/0x3ad0
[ ... ]

Fixes: 272c2330adc9 ("xfrm: bail early on slave pass over skb")
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/netdevsim/ipsec.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/netdevsim/ipsec.c b/drivers/net/netdevsim/ipsec.c
index 3811f1bde84e..b80ed2ffd45e 100644
--- a/drivers/net/netdevsim/ipsec.c
+++ b/drivers/net/netdevsim/ipsec.c
@@ -85,7 +85,7 @@ static int nsim_ipsec_parse_proto_keys(struct xfrm_state *xs,
 				       u32 *mykey, u32 *mysalt)
 {
 	const char aes_gcm_name[] = "rfc4106(gcm(aes))";
-	struct net_device *dev = xs->xso.dev;
+	struct net_device *dev = xs->xso.real_dev;
 	unsigned char *key_data;
 	char *alg_name = NULL;
 	int key_len;
@@ -134,7 +134,7 @@ static int nsim_ipsec_add_sa(struct xfrm_state *xs)
 	u16 sa_idx;
 	int ret;
 
-	dev = xs->xso.dev;
+	dev = xs->xso.real_dev;
 	ns = netdev_priv(dev);
 	ipsec = &ns->ipsec;
 
@@ -194,7 +194,7 @@ static int nsim_ipsec_add_sa(struct xfrm_state *xs)
 
 static void nsim_ipsec_del_sa(struct xfrm_state *xs)
 {
-	struct netdevsim *ns = netdev_priv(xs->xso.dev);
+	struct netdevsim *ns = netdev_priv(xs->xso.real_dev);
 	struct nsim_ipsec *ipsec = &ns->ipsec;
 	u16 sa_idx;
 
@@ -211,7 +211,7 @@ static void nsim_ipsec_del_sa(struct xfrm_state *xs)
 
 static bool nsim_ipsec_offload_ok(struct sk_buff *skb, struct xfrm_state *xs)
 {
-	struct netdevsim *ns = netdev_priv(xs->xso.dev);
+	struct netdevsim *ns = netdev_priv(xs->xso.real_dev);
 	struct nsim_ipsec *ipsec = &ns->ipsec;
 
 	ipsec->ok++;

From 2de7e4f67599affc97132bd07e30e3bd59d0b777 Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Mon, 5 Jul 2021 15:38:09 +0000
Subject: [PATCH 098/158] ixgbevf: use xso.real_dev instead of xso.dev in
 callback functions of struct xfrmdev_ops

There are two pointers in struct xfrm_state_offload, *dev, *real_dev.
These are used in callback functions of struct xfrmdev_ops.
The *dev points to either the bonding interface or the real interface.
If bonding ipsec offload is used, it points to the bonding interface; if
not, it points to the real interface.
And real_dev always points to the real interface.
So, ixgbevf should always use real_dev instead of dev.
Of course, real_dev is never null.

Test commands:
    ip link add bond0 type bond
    #eth0 is ixgbevf interface
    ip link set eth0 master bond0
    ip link set bond0 up
    ip x s add proto esp dst 14.1.1.1 src 15.1.1.1 spi 0x07 mode \
transport reqid 0x07 replay-window 32 aead 'rfc4106(gcm(aes))' \
0x44434241343332312423222114131211f4f3f2f1 128 sel src 14.0.0.52/24 \
dst 14.0.0.70/24 proto tcp offload dev bond0 dir in

Splat looks like:
KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007]
CPU: 6 PID: 688 Comm: ip Not tainted 5.13.0-rc3+ #1168
RIP: 0010:ixgbevf_ipsec_find_empty_idx+0x28/0x1b0 [ixgbevf]
Code: 00 00 0f 1f 44 00 00 55 53 48 89 fb 48 83 ec 08 40 84 f6 0f 84 9c
00 00 00 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <0f> b6 04 02
84 c0 74 08 3c 01 0f 8e 4c 01 00 00 66 81 3b 00 04 0f
RSP: 0018:ffff8880089af390 EFLAGS: 00010246
RAX: dffffc0000000000 RBX: 0000000000000000 RCX: 0000000000000001
RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000000000000
RBP: ffff8880089af4f8 R08: 0000000000000003 R09: fffffbfff4287e11
R10: 0000000000000001 R11: ffff888005de8908 R12: 0000000000000000
R13: ffff88810936a000 R14: ffff88810936a000 R15: ffff888004d78040
FS:  00007fdf9883a680(0000) GS:ffff88811a400000(0000)
knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 000055bc14adbf40 CR3: 000000000b87c005 CR4: 00000000003706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 ixgbevf_ipsec_add_sa+0x1bf/0x9c0 [ixgbevf]
 ? rcu_read_lock_sched_held+0x91/0xc0
 ? ixgbevf_ipsec_parse_proto_keys.isra.9+0x280/0x280 [ixgbevf]
 ? lock_acquire+0x191/0x720
 ? bond_ipsec_add_sa+0x48/0x350 [bonding]
 ? lockdep_hardirqs_on_prepare+0x3e0/0x3e0
 ? rcu_read_lock_held+0x91/0xa0
 ? rcu_read_lock_sched_held+0xc0/0xc0
 bond_ipsec_add_sa+0x193/0x350 [bonding]
 xfrm_dev_state_add+0x2a9/0x770
 ? memcpy+0x38/0x60
 xfrm_add_sa+0x2278/0x3b10 [xfrm_user]
 ? xfrm_get_policy+0xaa0/0xaa0 [xfrm_user]
 ? register_lock_class+0x1750/0x1750
 xfrm_user_rcv_msg+0x331/0x660 [xfrm_user]
 ? rcu_read_lock_sched_held+0x91/0xc0
 ? xfrm_user_state_lookup.constprop.39+0x320/0x320 [xfrm_user]
 ? find_held_lock+0x3a/0x1c0
 ? mutex_lock_io_nested+0x1210/0x1210
 ? sched_clock_cpu+0x18/0x170
 netlink_rcv_skb+0x121/0x350
[ ... ]

Fixes: 272c2330adc9 ("xfrm: bail early on slave pass over skb")
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/intel/ixgbevf/ipsec.c | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbevf/ipsec.c b/drivers/net/ethernet/intel/ixgbevf/ipsec.c
index caaea2c920a6..e3e4676af9e4 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ipsec.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ipsec.c
@@ -211,7 +211,7 @@ struct xfrm_state *ixgbevf_ipsec_find_rx_state(struct ixgbevf_ipsec *ipsec,
 static int ixgbevf_ipsec_parse_proto_keys(struct xfrm_state *xs,
 					  u32 *mykey, u32 *mysalt)
 {
-	struct net_device *dev = xs->xso.dev;
+	struct net_device *dev = xs->xso.real_dev;
 	unsigned char *key_data;
 	char *alg_name = NULL;
 	int key_len;
@@ -260,12 +260,15 @@ static int ixgbevf_ipsec_parse_proto_keys(struct xfrm_state *xs,
  **/
 static int ixgbevf_ipsec_add_sa(struct xfrm_state *xs)
 {
-	struct net_device *dev = xs->xso.dev;
-	struct ixgbevf_adapter *adapter = netdev_priv(dev);
-	struct ixgbevf_ipsec *ipsec = adapter->ipsec;
+	struct net_device *dev = xs->xso.real_dev;
+	struct ixgbevf_adapter *adapter;
+	struct ixgbevf_ipsec *ipsec;
 	u16 sa_idx;
 	int ret;
 
+	adapter = netdev_priv(dev);
+	ipsec = adapter->ipsec;
+
 	if (xs->id.proto != IPPROTO_ESP && xs->id.proto != IPPROTO_AH) {
 		netdev_err(dev, "Unsupported protocol 0x%04x for IPsec offload\n",
 			   xs->id.proto);
@@ -383,11 +386,14 @@ static int ixgbevf_ipsec_add_sa(struct xfrm_state *xs)
  **/
 static void ixgbevf_ipsec_del_sa(struct xfrm_state *xs)
 {
-	struct net_device *dev = xs->xso.dev;
-	struct ixgbevf_adapter *adapter = netdev_priv(dev);
-	struct ixgbevf_ipsec *ipsec = adapter->ipsec;
+	struct net_device *dev = xs->xso.real_dev;
+	struct ixgbevf_adapter *adapter;
+	struct ixgbevf_ipsec *ipsec;
 	u16 sa_idx;
 
+	adapter = netdev_priv(dev);
+	ipsec = adapter->ipsec;
+
 	if (xs->xso.flags & XFRM_OFFLOAD_INBOUND) {
 		sa_idx = xs->xso.offload_handle - IXGBE_IPSEC_BASE_RX_INDEX;
 

From a22c39b831a081da9b2c488bd970a4412d926f30 Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Mon, 5 Jul 2021 15:38:10 +0000
Subject: [PATCH 099/158] bonding: fix suspicious RCU usage in
 bond_ipsec_del_sa()

To dereference bond->curr_active_slave, it uses rcu_dereference().
But neither it nor its caller holds the RCU read lock, so a warning
occurs.
So add rcu_read_lock().

Test commands:
    ip netns add A
    ip netns exec A bash
    modprobe netdevsim
    echo "1 1" > /sys/bus/netdevsim/new_device
    ip link add bond0 type bond
    ip link set eth0 master bond0
    ip link set eth0 up
    ip link set bond0 up
    ip x s add proto esp dst 14.1.1.1 src 15.1.1.1 spi 0x07 mode \
transport reqid 0x07 replay-window 32 aead 'rfc4106(gcm(aes))' \
0x44434241343332312423222114131211f4f3f2f1 128 sel src 14.0.0.52/24 \
dst 14.0.0.70/24 proto tcp offload dev bond0 dir in
    ip x s f

Splat looks like:
=============================
WARNING: suspicious RCU usage
5.13.0-rc3+ #1168 Not tainted
-----------------------------
drivers/net/bonding/bond_main.c:448 suspicious rcu_dereference_check()
usage!

other info that might help us debug this:

rcu_scheduler_active = 2, debug_locks = 1
2 locks held by ip/705:
 #0: ffff888106701780 (&net->xfrm.xfrm_cfg_mutex){+.+.}-{3:3},
at: xfrm_netlink_rcv+0x59/0x80 [xfrm_user]
 #1: ffff8880075b0098 (&x->lock){+.-.}-{2:2},
at: xfrm_state_delete+0x16/0x30

stack backtrace:
CPU: 6 PID: 705 Comm: ip Not tainted 5.13.0-rc3+ #1168
Call Trace:
 dump_stack+0xa4/0xe5
 bond_ipsec_del_sa+0x16a/0x1c0 [bonding]
 __xfrm_state_delete+0x51f/0x730
 xfrm_state_delete+0x1e/0x30
 xfrm_state_flush+0x22f/0x390
 xfrm_flush_sa+0xd8/0x260 [xfrm_user]
 ? xfrm_flush_policy+0x290/0x290 [xfrm_user]
 xfrm_user_rcv_msg+0x331/0x660 [xfrm_user]
 ? rcu_read_lock_sched_held+0x91/0xc0
 ? xfrm_user_state_lookup.constprop.39+0x320/0x320 [xfrm_user]
 ? find_held_lock+0x3a/0x1c0
 ? mutex_lock_io_nested+0x1210/0x1210
 ? sched_clock_cpu+0x18/0x170
 netlink_rcv_skb+0x121/0x350
[ ... ]

Fixes: 18cb261afd7b ("bonding: support hardware encryption offload to slaves")
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 5466b24ceab6..aa9c469ebbb5 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -444,21 +444,24 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs)
 	if (!bond_dev)
 		return;
 
+	rcu_read_lock();
 	bond = netdev_priv(bond_dev);
 	slave = rcu_dereference(bond->curr_active_slave);
 
 	if (!slave)
-		return;
+		goto out;
 
 	xs->xso.real_dev = slave->dev;
 
 	if (!(slave->dev->xfrmdev_ops
 	      && slave->dev->xfrmdev_ops->xdo_dev_state_delete)) {
 		slave_warn(bond_dev, slave->dev, "%s: no slave xdo_dev_state_delete\n", __func__);
-		return;
+		goto out;
 	}
 
 	slave->dev->xfrmdev_ops->xdo_dev_state_delete(xs);
+out:
+	rcu_read_unlock();
 }
 
 /**

From b121693381b112b78c076dea171ee113e237c0e4 Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Mon, 5 Jul 2021 15:38:11 +0000
Subject: [PATCH 100/158] bonding: disallow setting nested bonding + ipsec
 offload

A bonding interface can be nested and it supports ipsec offload.
So, it allows setting up the nested bonding + ipsec scenario.
But the code does not support this scenario.
So, it should be disallowed.

interface graph:
bond2
   |
bond1
   |
eth0

The nested bonding + ipsec offload may not be a real use case.
So, disallowing this scenario is fine.

Fixes: 18cb261afd7b ("bonding: support hardware encryption offload to slaves")
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index aa9c469ebbb5..f7b89743fab9 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -419,8 +419,9 @@ static int bond_ipsec_add_sa(struct xfrm_state *xs)
 	xs->xso.real_dev = slave->dev;
 	bond->xs = xs;
 
-	if (!(slave->dev->xfrmdev_ops
-	      && slave->dev->xfrmdev_ops->xdo_dev_state_add)) {
+	if (!slave->dev->xfrmdev_ops ||
+	    !slave->dev->xfrmdev_ops->xdo_dev_state_add ||
+	    netif_is_bond_master(slave->dev)) {
 		slave_warn(bond_dev, slave->dev, "Slave does not support ipsec offload\n");
 		rcu_read_unlock();
 		return -EINVAL;
@@ -453,8 +454,9 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs)
 
 	xs->xso.real_dev = slave->dev;
 
-	if (!(slave->dev->xfrmdev_ops
-	      && slave->dev->xfrmdev_ops->xdo_dev_state_delete)) {
+	if (!slave->dev->xfrmdev_ops ||
+	    !slave->dev->xfrmdev_ops->xdo_dev_state_delete ||
+	    netif_is_bond_master(slave->dev)) {
 		slave_warn(bond_dev, slave->dev, "%s: no slave xdo_dev_state_delete\n", __func__);
 		goto out;
 	}
@@ -479,8 +481,9 @@ static bool bond_ipsec_offload_ok(struct sk_buff *skb, struct xfrm_state *xs)
 	if (BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP)
 		return true;
 
-	if (!(slave_dev->xfrmdev_ops
-	      && slave_dev->xfrmdev_ops->xdo_dev_offload_ok)) {
+	if (!slave_dev->xfrmdev_ops ||
+	    !slave_dev->xfrmdev_ops->xdo_dev_offload_ok ||
+	    netif_is_bond_master(slave_dev)) {
 		slave_warn(bond_dev, slave_dev, "%s: no slave xdo_dev_offload_ok\n", __func__);
 		return false;
 	}

From 9a5605505d9c7dbfdb89cc29a8f5fc5cf9fd2334 Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Mon, 5 Jul 2021 15:38:12 +0000
Subject: [PATCH 101/158] bonding: Add struct bond_ipsec to manage SA

bonding has been supporting ipsec offload.
When an SA is added, bonding just passes the SA to its own active real
interface.
But it doesn't manage the SA.
So, when events (add/del real interface, active real interface change,
etc.) occur, bonding can't handle them well because it doesn't manage
the SA.
So some problems (panic, UAF, refcnt leak) occur.

In order to make it stable, it should manage SA.
That's the reason why struct bond_ipsec is added.
When a new SA is added to bonding interface, it is stored in the
bond_ipsec list. And the SA is passed to a current active real interface.
If events occur, it uses bond_ipsec data to handle these events.
bond->ipsec_list is protected by bond->ipsec_lock.

If a current active real interface is changed, the following logic works.
1. delete all SAs from old active real interface
2. Add all SAs to the new active real interface.
3. If a new active real interface doesn't support ipsec offload or SA's
option, it sets real_dev to NULL.

Fixes: 18cb261afd7b ("bonding: support hardware encryption offload to slaves")
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c | 139 +++++++++++++++++++++++++++-----
 include/net/bonding.h           |   9 ++-
 2 files changed, 127 insertions(+), 21 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index f7b89743fab9..165fa55cfb38 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -401,6 +401,7 @@ static int bond_vlan_rx_kill_vid(struct net_device *bond_dev,
 static int bond_ipsec_add_sa(struct xfrm_state *xs)
 {
 	struct net_device *bond_dev = xs->xso.dev;
+	struct bond_ipsec *ipsec;
 	struct bonding *bond;
 	struct slave *slave;
 	int err;
@@ -416,9 +417,6 @@ static int bond_ipsec_add_sa(struct xfrm_state *xs)
 		return -ENODEV;
 	}
 
-	xs->xso.real_dev = slave->dev;
-	bond->xs = xs;
-
 	if (!slave->dev->xfrmdev_ops ||
 	    !slave->dev->xfrmdev_ops->xdo_dev_state_add ||
 	    netif_is_bond_master(slave->dev)) {
@@ -427,11 +425,63 @@ static int bond_ipsec_add_sa(struct xfrm_state *xs)
 		return -EINVAL;
 	}
 
+	ipsec = kmalloc(sizeof(*ipsec), GFP_ATOMIC);
+	if (!ipsec) {
+		rcu_read_unlock();
+		return -ENOMEM;
+	}
+	xs->xso.real_dev = slave->dev;
+
 	err = slave->dev->xfrmdev_ops->xdo_dev_state_add(xs);
+	if (!err) {
+		ipsec->xs = xs;
+		INIT_LIST_HEAD(&ipsec->list);
+		spin_lock_bh(&bond->ipsec_lock);
+		list_add(&ipsec->list, &bond->ipsec_list);
+		spin_unlock_bh(&bond->ipsec_lock);
+	} else {
+		kfree(ipsec);
+	}
 	rcu_read_unlock();
 	return err;
 }
 
+static void bond_ipsec_add_sa_all(struct bonding *bond)
+{
+	struct net_device *bond_dev = bond->dev;
+	struct bond_ipsec *ipsec;
+	struct slave *slave;
+
+	rcu_read_lock();
+	slave = rcu_dereference(bond->curr_active_slave);
+	if (!slave)
+		goto out;
+
+	if (!slave->dev->xfrmdev_ops ||
+	    !slave->dev->xfrmdev_ops->xdo_dev_state_add ||
+	    netif_is_bond_master(slave->dev)) {
+		spin_lock_bh(&bond->ipsec_lock);
+		if (!list_empty(&bond->ipsec_list))
+			slave_warn(bond_dev, slave->dev,
+				   "%s: no slave xdo_dev_state_add\n",
+				   __func__);
+		spin_unlock_bh(&bond->ipsec_lock);
+		goto out;
+	}
+
+	spin_lock_bh(&bond->ipsec_lock);
+	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
+		ipsec->xs->xso.real_dev = slave->dev;
+		if (slave->dev->xfrmdev_ops->xdo_dev_state_add(ipsec->xs)) {
+			slave_warn(bond_dev, slave->dev, "%s: failed to add SA\n", __func__);
+			ipsec->xs->xso.real_dev = NULL;
+		}
+	}
+	spin_unlock_bh(&bond->ipsec_lock);
+out:
+	rcu_read_unlock();
+}
+
 /**
  * bond_ipsec_del_sa - clear out this specific SA
  * @xs: pointer to transformer state struct
@@ -439,6 +489,7 @@ static int bond_ipsec_add_sa(struct xfrm_state *xs)
 static void bond_ipsec_del_sa(struct xfrm_state *xs)
 {
 	struct net_device *bond_dev = xs->xso.dev;
+	struct bond_ipsec *ipsec;
 	struct bonding *bond;
 	struct slave *slave;
 
@@ -452,7 +503,10 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs)
 	if (!slave)
 		goto out;
 
-	xs->xso.real_dev = slave->dev;
+	if (!xs->xso.real_dev)
+		goto out;
+
+	WARN_ON(xs->xso.real_dev != slave->dev);
 
 	if (!slave->dev->xfrmdev_ops ||
 	    !slave->dev->xfrmdev_ops->xdo_dev_state_delete ||
@@ -463,6 +517,48 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs)
 
 	slave->dev->xfrmdev_ops->xdo_dev_state_delete(xs);
 out:
+	spin_lock_bh(&bond->ipsec_lock);
+	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
+		if (ipsec->xs == xs) {
+			list_del(&ipsec->list);
+			kfree(ipsec);
+			break;
+		}
+	}
+	spin_unlock_bh(&bond->ipsec_lock);
+	rcu_read_unlock();
+}
+
+static void bond_ipsec_del_sa_all(struct bonding *bond)
+{
+	struct net_device *bond_dev = bond->dev;
+	struct bond_ipsec *ipsec;
+	struct slave *slave;
+
+	rcu_read_lock();
+	slave = rcu_dereference(bond->curr_active_slave);
+	if (!slave) {
+		rcu_read_unlock();
+		return;
+	}
+
+	spin_lock_bh(&bond->ipsec_lock);
+	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
+		if (!ipsec->xs->xso.real_dev)
+			continue;
+
+		if (!slave->dev->xfrmdev_ops ||
+		    !slave->dev->xfrmdev_ops->xdo_dev_state_delete ||
+		    netif_is_bond_master(slave->dev)) {
+			slave_warn(bond_dev, slave->dev,
+				   "%s: no slave xdo_dev_state_delete\n",
+				   __func__);
+		} else {
+			slave->dev->xfrmdev_ops->xdo_dev_state_delete(ipsec->xs);
+		}
+		ipsec->xs->xso.real_dev = NULL;
+	}
+	spin_unlock_bh(&bond->ipsec_lock);
 	rcu_read_unlock();
 }
 
@@ -474,22 +570,27 @@ out:
 static bool bond_ipsec_offload_ok(struct sk_buff *skb, struct xfrm_state *xs)
 {
 	struct net_device *bond_dev = xs->xso.dev;
-	struct bonding *bond = netdev_priv(bond_dev);
-	struct slave *curr_active = rcu_dereference(bond->curr_active_slave);
-	struct net_device *slave_dev = curr_active->dev;
+	struct net_device *real_dev;
+	struct slave *curr_active;
+	struct bonding *bond;
+
+	bond = netdev_priv(bond_dev);
+	curr_active = rcu_dereference(bond->curr_active_slave);
+	real_dev = curr_active->dev;
 
 	if (BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP)
 		return true;
 
-	if (!slave_dev->xfrmdev_ops ||
-	    !slave_dev->xfrmdev_ops->xdo_dev_offload_ok ||
-	    netif_is_bond_master(slave_dev)) {
-		slave_warn(bond_dev, slave_dev, "%s: no slave xdo_dev_offload_ok\n", __func__);
+	if (!xs->xso.real_dev)
+		return false;
+
+	if (!real_dev->xfrmdev_ops ||
+	    !real_dev->xfrmdev_ops->xdo_dev_offload_ok ||
+	    netif_is_bond_master(real_dev)) {
 		return false;
 	}
 
-	xs->xso.real_dev = slave_dev;
-	return slave_dev->xfrmdev_ops->xdo_dev_offload_ok(skb, xs);
+	return real_dev->xfrmdev_ops->xdo_dev_offload_ok(skb, xs);
 }
 
 static const struct xfrmdev_ops bond_xfrmdev_ops = {
@@ -1006,8 +1107,7 @@ void bond_change_active_slave(struct bonding *bond, struct slave *new_active)
 		return;
 
 #ifdef CONFIG_XFRM_OFFLOAD
-	if (old_active && bond->xs)
-		bond_ipsec_del_sa(bond->xs);
+	bond_ipsec_del_sa_all(bond);
 #endif /* CONFIG_XFRM_OFFLOAD */
 
 	if (new_active) {
@@ -1082,10 +1182,7 @@ void bond_change_active_slave(struct bonding *bond, struct slave *new_active)
 	}
 
 #ifdef CONFIG_XFRM_OFFLOAD
-	if (new_active && bond->xs) {
-		xfrm_dev_state_flush(dev_net(bond->dev), bond->dev, true);
-		bond_ipsec_add_sa(bond->xs);
-	}
+	bond_ipsec_add_sa_all(bond);
 #endif /* CONFIG_XFRM_OFFLOAD */
 
 	/* resend IGMP joins since active slave has changed or
@@ -3343,6 +3440,7 @@ static int bond_master_netdev_event(unsigned long event,
 		return bond_event_changename(event_bond);
 	case NETDEV_UNREGISTER:
 		bond_remove_proc_entry(event_bond);
+		xfrm_dev_state_flush(dev_net(bond_dev), bond_dev, true);
 		break;
 	case NETDEV_REGISTER:
 		bond_create_proc_entry(event_bond);
@@ -4910,7 +5008,8 @@ void bond_setup(struct net_device *bond_dev)
 #ifdef CONFIG_XFRM_OFFLOAD
 	/* set up xfrm device ops (only supported in active-backup right now) */
 	bond_dev->xfrmdev_ops = &bond_xfrmdev_ops;
-	bond->xs = NULL;
+	INIT_LIST_HEAD(&bond->ipsec_list);
+	spin_lock_init(&bond->ipsec_lock);
 #endif /* CONFIG_XFRM_OFFLOAD */
 
 	/* don't acquire bond device's netif_tx_lock when transmitting */
diff --git a/include/net/bonding.h b/include/net/bonding.h
index 15335732e166..625d9c72dee3 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -201,6 +201,11 @@ struct bond_up_slave {
  */
 #define BOND_LINK_NOCHANGE -1
 
+struct bond_ipsec {
+	struct list_head list;
+	struct xfrm_state *xs;
+};
+
 /*
  * Here are the locking policies for the two bonding locks:
  * Get rcu_read_lock when reading or RTNL when writing slave list.
@@ -249,7 +254,9 @@ struct bonding {
 #endif /* CONFIG_DEBUG_FS */
 	struct rtnl_link_stats64 bond_stats;
 #ifdef CONFIG_XFRM_OFFLOAD
-	struct xfrm_state *xs;
+	struct list_head ipsec_list;
+	/* protecting ipsec_list */
+	spinlock_t ipsec_lock;
 #endif /* CONFIG_XFRM_OFFLOAD */
 };
 

From 955b785ec6b3b2f9b91914d6eeac8ee66ee29239 Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Mon, 5 Jul 2021 15:38:13 +0000
Subject: [PATCH 102/158] bonding: fix suspicious RCU usage in
 bond_ipsec_offload_ok()

To dereference bond->curr_active_slave, it uses rcu_dereference().
But neither it nor its caller holds the RCU read lock, so a warning
occurs.
So add rcu_read_lock().

Splat looks like:
WARNING: suspicious RCU usage
5.13.0-rc6+ #1179 Not tainted
drivers/net/bonding/bond_main.c:571 suspicious
rcu_dereference_check() usage!

other info that might help us debug this:

rcu_scheduler_active = 2, debug_locks = 1
1 lock held by ping/974:
 #0: ffff888109e7db70 (sk_lock-AF_INET){+.+.}-{0:0},
at: raw_sendmsg+0x1303/0x2cb0

stack backtrace:
CPU: 2 PID: 974 Comm: ping Not tainted 5.13.0-rc6+ #1179
Call Trace:
 dump_stack+0xa4/0xe5
 bond_ipsec_offload_ok+0x1f4/0x260 [bonding]
 xfrm_output+0x179/0x890
 xfrm4_output+0xfa/0x410
 ? __xfrm4_output+0x4b0/0x4b0
 ? __ip_make_skb+0xecc/0x2030
 ? xfrm4_udp_encap_rcv+0x800/0x800
 ? ip_local_out+0x21/0x3a0
 ip_send_skb+0x37/0xa0
 raw_sendmsg+0x1bfd/0x2cb0

Fixes: 18cb261afd7b ("bonding: support hardware encryption offload to slaves")
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 165fa55cfb38..780f87869e36 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -573,24 +573,34 @@ static bool bond_ipsec_offload_ok(struct sk_buff *skb, struct xfrm_state *xs)
 	struct net_device *real_dev;
 	struct slave *curr_active;
 	struct bonding *bond;
+	int err;
 
 	bond = netdev_priv(bond_dev);
+	rcu_read_lock();
 	curr_active = rcu_dereference(bond->curr_active_slave);
 	real_dev = curr_active->dev;
 
-	if (BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP)
-		return true;
+	if (BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) {
+		err = true;
+		goto out;
+	}
 
-	if (!xs->xso.real_dev)
-		return false;
+	if (!xs->xso.real_dev) {
+		err = false;
+		goto out;
+	}
 
 	if (!real_dev->xfrmdev_ops ||
 	    !real_dev->xfrmdev_ops->xdo_dev_offload_ok ||
 	    netif_is_bond_master(real_dev)) {
-		return false;
+		err = false;
+		goto out;
 	}
 
-	return real_dev->xfrmdev_ops->xdo_dev_offload_ok(skb, xs);
+	err = real_dev->xfrmdev_ops->xdo_dev_offload_ok(skb, xs);
+out:
+	rcu_read_unlock();
+	return err;
 }
 
 static const struct xfrmdev_ops bond_xfrmdev_ops = {

From 168e696a36792a4a3b2525a06249e7472ef90186 Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Mon, 5 Jul 2021 15:38:14 +0000
Subject: [PATCH 103/158] bonding: fix incorrect return value of
 bond_ipsec_offload_ok()

bond_ipsec_offload_ok() is called to check whether the interface supports
ipsec offload or not.
A bonding interface supports ipsec offload only in active-backup mode.
So, if a bond interface is not in active-backup mode, it should return
false but it returns true.

Fixes: a3b658cfb664 ("bonding: allow xfrm offload setup post-module-load")
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 780f87869e36..d22d78303311 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -581,7 +581,7 @@ static bool bond_ipsec_offload_ok(struct sk_buff *skb, struct xfrm_state *xs)
 	real_dev = curr_active->dev;
 
 	if (BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) {
-		err = true;
+		err = false;
 		goto out;
 	}
 

From ad1f37970875eef98eeaf478f55045f388b794a5 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Tue, 6 Jul 2021 12:18:02 +0100
Subject: [PATCH 104/158] octeontx2-pf: Fix assigned error return value that is
 never used

Currently when the call to otx2_mbox_alloc_msg_cgx_mac_addr_update fails
the error return variable rc is assigned -ENOMEM but the function does
not return early. rc is then re-assigned and the error case is not
handled correctly. Fix this by returning -ENOMEM rather than assigning rc.

Addresses-Coverity: ("Unused value")
Fixes: 79d2be385e9e ("octeontx2-pf: offload DMAC filters to CGX/RPM block")
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/marvell/octeontx2/nic/otx2_dmac_flt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_dmac_flt.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_dmac_flt.c
index ffe3e94562d0..383a6b5cb698 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_dmac_flt.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_dmac_flt.c
@@ -161,7 +161,7 @@ int otx2_dmacflt_update(struct otx2_nic *pf, u8 *mac, u8 bit_pos)
 
 	if (!req) {
 		mutex_unlock(&pf->mbox.lock);
-		rc = -ENOMEM;
+		return -ENOMEM;
 	}
 
 	ether_addr_copy(req->mac_addr, mac);

From ccd27f05ae7b8ebc40af5b004e94517a919aa862 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Tue, 6 Jul 2021 11:13:35 +0200
Subject: [PATCH 105/158] ipv6: fix 'disable_policy' for fwd packets

The goal of commit df789fe75206 ("ipv6: Provide ipv6 version of
"disable_policy" sysctl") was to have the disable_policy from ipv4
available on ipv6.
However, it's not exactly the same mechanism. On IPv4, all packets coming
from an interface, which has disable_policy set, bypass the policy check.
For ipv6, this is done only for local packets, i.e. for packets destined to
an address configured on the incoming interface.

Let's align ipv6 with ipv4 so that the 'disable_policy' sysctl has the same
effect for both protocols.

My first approach was to create a new kind of route cache entries, to be
able to set DST_NOPOLICY without modifying routes. This would have added a
lot of code. Because the local delivery path is already handled, I chose
to focus on the forwarding path to minimize code churn.

Fixes: df789fe75206 ("ipv6: Provide ipv6 version of "disable_policy" sysctl")
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_output.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 984050f35c61..d4ee2169afd8 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -479,7 +479,9 @@ int ip6_forward(struct sk_buff *skb)
 	if (skb_warn_if_lro(skb))
 		goto drop;
 
-	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
+	if (!net->ipv6.devconf_all->disable_policy &&
+	    !idev->cnf.disable_policy &&
+	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
 		goto drop;
 	}

From af0efa050caa66e8f304c42c94c76cb6c480cb7e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= <toke@redhat.com>
Date: Tue, 6 Jul 2021 14:23:55 +0200
Subject: [PATCH 106/158] libbpf: Restore errno return for functions that were
 already returning it
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The update to streamline libbpf error reporting intended to change all
functions to return the errno as a negative return value if
LIBBPF_STRICT_DIRECT_ERRS is set. However, if the flag is *not* set, the
return value changes for the two functions that were already returning a
negative errno unconditionally: bpf_link__unpin() and perf_buffer__poll().

This is a user-visible API change that breaks applications; so let's revert
these two functions back to unconditionally returning a negative errno
value.

Fixes: e9fc3ce99b34 ("libbpf: Streamline error reporting for high-level APIs")
Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20210706122355.236082-1-toke@redhat.com
---
 tools/lib/bpf/libbpf.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 1e04ce724240..6f5e2757bb3c 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -10136,7 +10136,7 @@ int bpf_link__unpin(struct bpf_link *link)
 
 	err = unlink(link->pin_path);
 	if (err != 0)
-		return libbpf_err_errno(err);
+		return -errno;
 
 	pr_debug("link fd=%d: unpinned from %s\n", link->fd, link->pin_path);
 	zfree(&link->pin_path);
@@ -11197,7 +11197,7 @@ int perf_buffer__poll(struct perf_buffer *pb, int timeout_ms)
 
 	cnt = epoll_wait(pb->epoll_fd, pb->events, pb->cpu_cnt, timeout_ms);
 	if (cnt < 0)
-		return libbpf_err_errno(cnt);
+		return -errno;
 
 	for (i = 0; i < cnt; i++) {
 		struct perf_cpu_buf *cpu_buf = pb->events[i].data.ptr;

From d322957ebfb9c21c2c72b66680f7c3ccd724e081 Mon Sep 17 00:00:00 2001
From: Duncan Roe <duncan_roe@optusnet.com.au>
Date: Wed, 7 Jul 2021 10:57:51 +1000
Subject: [PATCH 107/158] netfilter: uapi: refer to nfnetlink_conntrack.h, not
 nf_conntrack_netlink.h

nf_conntrack_netlink.h does not exist, refer to nfnetlink_conntrack.h instead.

Signed-off-by: Duncan Roe <duncan_roe@optusnet.com.au>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nfnetlink_log.h   | 2 +-
 include/uapi/linux/netfilter/nfnetlink_queue.h | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/uapi/linux/netfilter/nfnetlink_log.h b/include/uapi/linux/netfilter/nfnetlink_log.h
index 45c8d3b027e0..0af9c113d665 100644
--- a/include/uapi/linux/netfilter/nfnetlink_log.h
+++ b/include/uapi/linux/netfilter/nfnetlink_log.h
@@ -61,7 +61,7 @@ enum nfulnl_attr_type {
 	NFULA_HWTYPE,			/* hardware type */
 	NFULA_HWHEADER,			/* hardware header */
 	NFULA_HWLEN,			/* hardware header length */
-	NFULA_CT,                       /* nf_conntrack_netlink.h */
+	NFULA_CT,                       /* nfnetlink_conntrack.h */
 	NFULA_CT_INFO,                  /* enum ip_conntrack_info */
 	NFULA_VLAN,			/* nested attribute: packet vlan info */
 	NFULA_L2HDR,			/* full L2 header */
diff --git a/include/uapi/linux/netfilter/nfnetlink_queue.h b/include/uapi/linux/netfilter/nfnetlink_queue.h
index bcb2cb5d40b9..aed90c4df0c8 100644
--- a/include/uapi/linux/netfilter/nfnetlink_queue.h
+++ b/include/uapi/linux/netfilter/nfnetlink_queue.h
@@ -51,11 +51,11 @@ enum nfqnl_attr_type {
 	NFQA_IFINDEX_PHYSOUTDEV,	/* __u32 ifindex */
 	NFQA_HWADDR,			/* nfqnl_msg_packet_hw */
 	NFQA_PAYLOAD,			/* opaque data payload */
-	NFQA_CT,			/* nf_conntrack_netlink.h */
+	NFQA_CT,			/* nfnetlink_conntrack.h */
 	NFQA_CT_INFO,			/* enum ip_conntrack_info */
 	NFQA_CAP_LEN,			/* __u32 length of captured packet */
 	NFQA_SKB_INFO,			/* __u32 skb meta information */
-	NFQA_EXP,			/* nf_conntrack_netlink.h */
+	NFQA_EXP,			/* nfnetlink_conntrack.h */
 	NFQA_UID,			/* __u32 sk uid */
 	NFQA_GID,			/* __u32 sk gid */
 	NFQA_SECCTX,			/* security context string */

From 5616e895ecc56db8ba959e53638031a21353e0e2 Mon Sep 17 00:00:00 2001
From: SanjayKumar Jeyakumar <vjsanjay@gmail.com>
Date: Wed, 7 Jul 2021 10:59:14 +0530
Subject: [PATCH 108/158] tools/runqslower: Use __state instead of state

Commit 2f064a59a11f ("sched: Change task_struct::state") renamed task->state
to task->__state in task_struct. Fix runqslower to use the new name of the
field.

Fixes: 2f064a59a11f ("sched: Change task_struct::state")
Signed-off-by: SanjayKumar Jeyakumar <vjsanjay@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20210707052914.21473-1-vjsanjay@gmail.com
---
 tools/bpf/runqslower/runqslower.bpf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/bpf/runqslower/runqslower.bpf.c b/tools/bpf/runqslower/runqslower.bpf.c
index 645530ca7e98..ab9353f2fd46 100644
--- a/tools/bpf/runqslower/runqslower.bpf.c
+++ b/tools/bpf/runqslower/runqslower.bpf.c
@@ -74,7 +74,7 @@ int handle__sched_switch(u64 *ctx)
 	u32 pid;
 
 	/* ivcsw: treat like an enqueue event and store timestamp */
-	if (prev->state == TASK_RUNNING)
+	if (prev->__state == TASK_RUNNING)
 		trace_enqueue(prev);
 
 	pid = next->pid;

From 0d472c69c6a5e22cef9e5809e2f6d0ccd5934f4a Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Wed, 7 Jul 2021 15:50:57 +0800
Subject: [PATCH 109/158] stmmac: dwmac-loongson: Fix unsigned comparison to
 zero

plat->phy_interface is an unsigned integer, so the "< 0" comparison
can never be true and the warning will never be printed.

Fixes: 30bba69d7db4 ("stmmac: pci: Add dwmac support for Loongson")
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c
index e108b0d2bd28..4c9a37dd0d3f 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c
@@ -49,9 +49,9 @@ static int loongson_dwmac_probe(struct pci_dev *pdev, const struct pci_device_id
 {
 	struct plat_stmmacenet_data *plat;
 	struct stmmac_resources res;
-	bool mdio = false;
-	int ret, i;
 	struct device_node *np;
+	int ret, i, phy_mode;
+	bool mdio = false;
 
 	np = dev_of_node(&pdev->dev);
 
@@ -108,10 +108,11 @@ static int loongson_dwmac_probe(struct pci_dev *pdev, const struct pci_device_id
 	if (plat->bus_id < 0)
 		plat->bus_id = pci_dev_id(pdev);
 
-	plat->phy_interface = device_get_phy_mode(&pdev->dev);
-	if (plat->phy_interface < 0)
+	phy_mode = device_get_phy_mode(&pdev->dev);
+	if (phy_mode < 0)
 		dev_err(&pdev->dev, "phy_mode not found\n");
 
+	plat->phy_interface = phy_mode;
 	plat->interface = PHY_INTERFACE_MODE_GMII;
 
 	pci_set_master(pdev);

From eca81f09145d765c21dd8fb1ba5d874ca255c32c Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Wed, 7 Jul 2021 15:53:35 +0800
Subject: [PATCH 110/158] stmmac: platform: Fix signedness bug in
 stmmac_probe_config_dt()

The "plat->phy_interface" variable is an enum and in this context GCC
will treat it as an unsigned int so the error handling is never
triggered.

Fixes: b9f0b2f634c0 ("net: stmmac: platform: fix probe for ACPI devices")
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
index 072eff8079d0..5ca710844cc1 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
@@ -397,6 +397,7 @@ stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac)
 	struct device_node *np = pdev->dev.of_node;
 	struct plat_stmmacenet_data *plat;
 	struct stmmac_dma_cfg *dma_cfg;
+	int phy_mode;
 	void *ret;
 	int rc;
 
@@ -412,10 +413,11 @@ stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac)
 		eth_zero_addr(mac);
 	}
 
-	plat->phy_interface = device_get_phy_mode(&pdev->dev);
-	if (plat->phy_interface < 0)
-		return ERR_PTR(plat->phy_interface);
+	phy_mode = device_get_phy_mode(&pdev->dev);
+	if (phy_mode < 0)
+		return ERR_PTR(phy_mode);
 
+	plat->phy_interface = phy_mode;
 	plat->interface = stmmac_of_get_mac_mode(np);
 	if (plat->interface < 0)
 		plat->interface = plat->phy_interface;

From 24b671aad4eae423e1abf5b7f08d9a5235458b8d Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Wed, 7 Jul 2021 16:15:29 +0800
Subject: [PATCH 111/158] selftests: icmp_redirect: remove from checking for
 IPv6 route get

If the kernel doesn't enable option CONFIG_IPV6_SUBTREES, the RTA_SRC
info will not be exported to userspace in rt6_fill_node(). And ip cmd will
not print "from ::" to the route output. So remove this check.

Fixes: ec8105352869 ("selftests: Add redirect tests")
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/icmp_redirect.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/net/icmp_redirect.sh b/tools/testing/selftests/net/icmp_redirect.sh
index c19ecc6a8614..3a111ac1edc3 100755
--- a/tools/testing/selftests/net/icmp_redirect.sh
+++ b/tools/testing/selftests/net/icmp_redirect.sh
@@ -315,7 +315,7 @@ check_exception()
 
 	if [ "$with_redirect" = "yes" ]; then
 		ip -netns h1 -6 ro get ${H1_VRF_ARG} ${H2_N2_IP6} | \
-		grep -q "${H2_N2_IP6} from :: via ${R2_LLADDR} dev br0.*${mtu}"
+		grep -q "${H2_N2_IP6} .*via ${R2_LLADDR} dev br0.*${mtu}"
 	elif [ -n "${mtu}" ]; then
 		ip -netns h1 -6 ro get ${H1_VRF_ARG} ${H2_N2_IP6} | \
 		grep -q "${mtu}"

From 0e02bf5de46ae30074a2e1a8194a422a84482a1a Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Wed, 7 Jul 2021 16:15:30 +0800
Subject: [PATCH 112/158] selftests: icmp_redirect: IPv6 PMTU info should be
 cleared after redirect

After redirecting, it's already a new path. So the old PMTU info should
be cleared. The IPv6 test "mtu exception plus redirect" should only
have redirect info without the old PMTU.

The IPv4 test can not be changed because of legacy.

Fixes: ec8105352869 ("selftests: Add redirect tests")
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/icmp_redirect.sh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/net/icmp_redirect.sh b/tools/testing/selftests/net/icmp_redirect.sh
index 3a111ac1edc3..ecbf57f264ed 100755
--- a/tools/testing/selftests/net/icmp_redirect.sh
+++ b/tools/testing/selftests/net/icmp_redirect.sh
@@ -313,9 +313,10 @@ check_exception()
 	fi
 	log_test $? 0 "IPv4: ${desc}"
 
-	if [ "$with_redirect" = "yes" ]; then
+	# No PMTU info for test "redirect" and "mtu exception plus redirect"
+	if [ "$with_redirect" = "yes" ] && [ "$desc" != "redirect exception plus mtu" ]; then
 		ip -netns h1 -6 ro get ${H1_VRF_ARG} ${H2_N2_IP6} | \
-		grep -q "${H2_N2_IP6} .*via ${R2_LLADDR} dev br0.*${mtu}"
+		grep -v "mtu" | grep -q "${H2_N2_IP6} .*via ${R2_LLADDR} dev br0"
 	elif [ -n "${mtu}" ]; then
 		ip -netns h1 -6 ro get ${H1_VRF_ARG} ${H2_N2_IP6} | \
 		grep -q "${mtu}"

From 1d719254c139fb62fb8056fb496b6fd007e71550 Mon Sep 17 00:00:00 2001
From: Wei Li <liwei391@huawei.com>
Date: Mon, 28 Jun 2021 11:04:09 +0800
Subject: [PATCH 113/158] tools: bpf: Fix error in 'make -C tools/ bpf_install'

make[2]: *** No rule to make target 'install'.  Stop.
make[1]: *** [Makefile:122: runqslower_install] Error 2
make: *** [Makefile:116: bpf_install] Error 2

There is no rule for target 'install' in tools/bpf/runqslower/Makefile,
and there is no need to install it, so just remove 'runqslower_install'.

Fixes: 9c01546d26d2 ("tools/bpf: Add runqslower tool to tools/bpf")
Signed-off-by: Wei Li <liwei391@huawei.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20210628030409.3459095-1-liwei391@huawei.com
---
 tools/bpf/Makefile | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/tools/bpf/Makefile b/tools/bpf/Makefile
index 39bb322707b4..b11cfc86a3d0 100644
--- a/tools/bpf/Makefile
+++ b/tools/bpf/Makefile
@@ -97,7 +97,7 @@ clean: bpftool_clean runqslower_clean resolve_btfids_clean
 	$(Q)$(RM) -- $(OUTPUT)FEATURE-DUMP.bpf
 	$(Q)$(RM) -r -- $(OUTPUT)feature
 
-install: $(PROGS) bpftool_install runqslower_install
+install: $(PROGS) bpftool_install
 	$(call QUIET_INSTALL, bpf_jit_disasm)
 	$(Q)$(INSTALL) -m 0755 -d $(DESTDIR)$(prefix)/bin
 	$(Q)$(INSTALL) $(OUTPUT)bpf_jit_disasm $(DESTDIR)$(prefix)/bin/bpf_jit_disasm
@@ -118,9 +118,6 @@ bpftool_clean:
 runqslower:
 	$(call descend,runqslower)
 
-runqslower_install:
-	$(call descend,runqslower,install)
-
 runqslower_clean:
 	$(call descend,runqslower,clean)
 
@@ -131,5 +128,5 @@ resolve_btfids_clean:
 	$(call descend,resolve_btfids,clean)
 
 .PHONY: all install clean bpftool bpftool_install bpftool_clean \
-	runqslower runqslower_install runqslower_clean \
+	runqslower runqslower_clean \
 	resolve_btfids resolve_btfids_clean

From 271dbc31843244e5192f0f8a8be0da26995f944a Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Wed, 7 Jul 2021 13:01:00 +0300
Subject: [PATCH 114/158] sock: unlock on error in sock_setsockopt()

If copy_from_sockptr() fails then we need to unlock before returning.

Fixes: d463126e23f1 ("net: sock: extend SO_TIMESTAMPING for PHC binding")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/core/sock.c b/net/core/sock.c
index 1c4b0468bc2c..a3eea6e0b30a 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1123,8 +1123,10 @@ set_sndbuf:
 	case SO_TIMESTAMPING_OLD:
 		if (optlen == sizeof(timestamping)) {
 			if (copy_from_sockptr(&timestamping, optval,
-					      sizeof(timestamping)))
-				return -EFAULT;
+					      sizeof(timestamping))) {
+				ret = -EFAULT;
+				break;
+			}
 		} else {
 			memset(&timestamping, 0, sizeof(timestamping));
 			timestamping.flags = val;

From 92c4bed59bc0ef2a92b99ca1fe1c7107ffa3125c Mon Sep 17 00:00:00 2001
From: "Roy, UjjaL" <royujjal@gmail.com>
Date: Wed, 7 Jul 2021 23:48:33 +0530
Subject: [PATCH 115/158] ipmr: Fix indentation issue

Fixed indentation by removing extra spaces.

Signed-off-by: Roy, UjjaL <royujjal@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipmr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 7b12a40dd465..2dda856ca260 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -2119,7 +2119,7 @@ int ip_mr_input(struct sk_buff *skb)
 				raw_rcv(mroute_sk, skb);
 				return 0;
 			}
-		    }
+		}
 	}
 
 	/* already under rcu_read_lock() */

From 9615fe36b31d926f1c5107013b772dc226a6a7ca Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Wed, 7 Jul 2021 21:10:51 -0700
Subject: [PATCH 116/158] skbuff: Fix build with SKB extensions disabled

We will fail to build with CONFIG_SKB_EXTENSIONS disabled after
8550ff8d8c75 ("skbuff: Release nfct refcount on napi stolen or re-used
skbs") since there is an unconditional use of skb_ext_find() without
an appropriate stub. Simply build the code conditionally and properly
guard against both CONFIG_SKB_EXTENSIONS as well as
CONFIG_NET_TC_SKB_EXT being disabled.

Fixes: 8550ff8d8c75 ("skbuff: Release nfct refcount on napi stolen or re-used skbs")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Roi Dayan <roid@nvidia.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 177a5aec0b6b..03c95a0867bb 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6010,7 +6010,7 @@ static void gro_list_prepare(const struct list_head *head,
 				       maclen);
 
 		diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb);
-
+#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
 		if (!diffs) {
 			struct tc_skb_ext *skb_ext = skb_ext_find(skb, TC_SKB_EXT);
 			struct tc_skb_ext *p_ext = skb_ext_find(p, TC_SKB_EXT);
@@ -6019,6 +6019,7 @@ static void gro_list_prepare(const struct list_head *head,
 			if (!diffs && unlikely(skb_ext))
 				diffs |= p_ext->chain ^ skb_ext->chain;
 		}
+#endif
 
 		NAPI_GRO_CB(p)->same_flow = !diffs;
 	}

From c7bb4b89033b764eb07db4e060548a6311d801ee Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 8 Jul 2021 00:21:09 -0700
Subject: [PATCH 117/158] ipv6: tcp: drop silly ICMPv6 packet too big messages
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

While TCP stack scales reasonably well, there is still one part that
can be used to DDOS it.

IPv6 Packet too big messages have to lookup/insert a new route,
and if abused by attackers, can easily put hosts under high stress,
with many cpus contending on a spinlock while one is stuck in fib6_run_gc()

ip6_protocol_deliver_rcu()
 icmpv6_rcv()
  icmpv6_notify()
   tcp_v6_err()
    tcp_v6_mtu_reduced()
     inet6_csk_update_pmtu()
      ip6_rt_update_pmtu()
       __ip6_rt_update_pmtu()
        ip6_rt_cache_alloc()
         ip6_dst_alloc()
          dst_alloc()
           ip6_dst_gc()
            fib6_run_gc()
             spin_lock_bh() ...

Some of our servers have been hit by malicious ICMPv6 packets
trying to _increase_ the MTU/MSS of TCP flows.

We believe these ICMPv6 packets are a result of a bug in one ISP stack,
since they were blindly sent back for _every_ (small) packet sent to them.

These packets are for one TCP flow:
09:24:36.266491 IP6 Addr1 > Victim ICMP6, packet too big, mtu 1460, length 1240
09:24:36.266509 IP6 Addr1 > Victim ICMP6, packet too big, mtu 1460, length 1240
09:24:36.316688 IP6 Addr1 > Victim ICMP6, packet too big, mtu 1460, length 1240
09:24:36.316704 IP6 Addr1 > Victim ICMP6, packet too big, mtu 1460, length 1240
09:24:36.608151 IP6 Addr1 > Victim ICMP6, packet too big, mtu 1460, length 1240

TCP stack can filter some silly requests :

1) MTU below IPV6_MIN_MTU can be filtered early in tcp_v6_err()
2) tcp_v6_mtu_reduced() can drop requests trying to increase current MSS.

These tests happen before the IPv6 routing stack is entered, thus
removing the potential contention and route exhaustion.

Note that IPv6 stack was performing these checks, but too late
(ie : after the route has been added, and after the potential
garbage collect war)

v2: fix typo caught by Martin, thanks !
v3: exports tcp_mtu_to_mss(), caught by David, thanks !

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Maciej Żenczykowski <maze@google.com>
Cc: Martin KaFai Lau <kafai@fb.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c |  1 +
 net/ipv6/tcp_ipv6.c   | 19 +++++++++++++++++--
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index bde781f46b41..29553fce8502 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1732,6 +1732,7 @@ int tcp_mtu_to_mss(struct sock *sk, int pmtu)
 	return __tcp_mtu_to_mss(sk, pmtu) -
 	       (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr));
 }
+EXPORT_SYMBOL(tcp_mtu_to_mss);
 
 /* Inverse of above */
 int tcp_mss_to_mtu(struct sock *sk, int mss)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 593c32fe57ed..323989927a0a 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -348,11 +348,20 @@ failure:
 static void tcp_v6_mtu_reduced(struct sock *sk)
 {
 	struct dst_entry *dst;
+	u32 mtu;
 
 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
 		return;
 
-	dst = inet6_csk_update_pmtu(sk, READ_ONCE(tcp_sk(sk)->mtu_info));
+	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
+
+	/* Drop requests trying to increase our current mss.
+	 * Check done in __ip6_rt_update_pmtu() is too late.
+	 */
+	if (tcp_mtu_to_mss(sk, mtu) >= tcp_sk(sk)->mss_cache)
+		return;
+
+	dst = inet6_csk_update_pmtu(sk, mtu);
 	if (!dst)
 		return;
 
@@ -433,6 +442,8 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	}
 
 	if (type == ICMPV6_PKT_TOOBIG) {
+		u32 mtu = ntohl(info);
+
 		/* We are not interested in TCP_LISTEN and open_requests
 		 * (SYN-ACKs send out by Linux are always <576bytes so
 		 * they should go through unfragmented).
@@ -443,7 +454,11 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		if (!ip6_sk_accept_pmtu(sk))
 			goto out;
 
-		WRITE_ONCE(tp->mtu_info, ntohl(info));
+		if (mtu < IPV6_MIN_MTU)
+			goto out;
+
+		WRITE_ONCE(tp->mtu_info, mtu);
+
 		if (!sock_owned_by_user(sk))
 			tcp_v6_mtu_reduced(sk);
 		else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED,

From debdd8e31895fdd1e2cfeb7a5aff1c83e49a91ba Mon Sep 17 00:00:00 2001
From: Jonathan Lemon <jonathan.lemon@gmail.com>
Date: Thu, 8 Jul 2021 11:04:08 -0700
Subject: [PATCH 118/158] ptp: Relocate lookup cookie to correct block.

An earlier commit set the pps_lookup cookie, but the line
was somehow added to the wrong code block.  Correct this.

Fixes: 8602e40fc813 ("ptp: Set lookup cookie when creating a PTP PPS source.")
Signed-off-by: Jonathan Lemon <jonathan.lemon@gmail.com>
Signed-off-by: Dario Binacchi <dariobin@libero.it>
Acked-by: Richard Cochran <richardcochran@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/ptp/ptp_clock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/ptp/ptp_clock.c b/drivers/ptp/ptp_clock.c
index ce6d9fc85607..4dfc52e06704 100644
--- a/drivers/ptp/ptp_clock.c
+++ b/drivers/ptp/ptp_clock.c
@@ -232,7 +232,6 @@ struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info,
 			pr_err("failed to create ptp aux_worker %d\n", err);
 			goto kworker_err;
 		}
-		ptp->pps_source->lookup_cookie = ptp;
 	}
 
 	/* PTP virtual clock is being registered under physical clock */
@@ -268,6 +267,7 @@ struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info,
 			pr_err("failed to register pps source\n");
 			goto no_pps;
 		}
+		ptp->pps_source->lookup_cookie = ptp;
 	}
 
 	/* Initialize a new device of our class in our clock structure. */

From b9d233ea21f192702f8bbf3f5f640e2dde308b25 Mon Sep 17 00:00:00 2001
From: Gatis Peisenieks <gatis@mikrotik.com>
Date: Thu, 8 Jul 2021 12:49:04 +0300
Subject: [PATCH 119/158] atl1c: fix Mikrotik 10/25G NIC detection

Since Mikrotik 10/25G NIC MDIO op emulation is not 100% reliable,
on rare occasions it can happen that some physical functions of
the NIC do not get initialized because an early MDIO op timed out.

This changes the atl1c probe on Mikrotik 10/25G NIC not to
depend on MDIO op emulation.

Signed-off-by: Gatis Peisenieks <gatis@mikrotik.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/atheros/atl1c/atl1c_hw.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/net/ethernet/atheros/atl1c/atl1c_hw.c b/drivers/net/ethernet/atheros/atl1c/atl1c_hw.c
index 7dff20350865..f19370c33444 100644
--- a/drivers/net/ethernet/atheros/atl1c/atl1c_hw.c
+++ b/drivers/net/ethernet/atheros/atl1c/atl1c_hw.c
@@ -594,6 +594,11 @@ int atl1c_phy_init(struct atl1c_hw *hw)
 	int ret_val;
 	u16 mii_bmcr_data = BMCR_RESET;
 
+	if (hw->nic_type == athr_mt) {
+		hw->phy_configured = true;
+		return 0;
+	}
+
 	if ((atl1c_read_phy_reg(hw, MII_PHYSID1, &hw->phy_id1) != 0) ||
 		(atl1c_read_phy_reg(hw, MII_PHYSID2, &hw->phy_id2) != 0)) {
 		dev_err(&pdev->dev, "Error get phy ID\n");

From c34269041185dad1bab7a34f42ef9fab967a1684 Mon Sep 17 00:00:00 2001
From: Aaron Ma <aaron.ma@canonical.com>
Date: Thu, 8 Jul 2021 21:17:10 +0800
Subject: [PATCH 120/158] mt76: mt7921: continue to probe driver when fw
 already downloaded

When the system is rebooted without a power cycle, the firmware is
already downloaded, and returning -EIO breaks the driver probe with:
mt7921e: probe of 0000:03:00.0 failed with error -5

Skip firmware download and continue to probe.

Signed-off-by: Aaron Ma <aaron.ma@canonical.com>
Fixes: 1c099ab44727c ("mt76: mt7921: add MCU support")
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/wireless/mediatek/mt76/mt7921/mcu.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7921/mcu.c
index c2c4dc196802..cd690c64f65b 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7921/mcu.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7921/mcu.c
@@ -931,7 +931,7 @@ static int mt7921_load_firmware(struct mt7921_dev *dev)
 	ret = mt76_get_field(dev, MT_CONN_ON_MISC, MT_TOP_MISC2_FW_N9_RDY);
 	if (ret) {
 		dev_dbg(dev->mt76.dev, "Firmware is already download\n");
-		return -EIO;
+		goto fw_loaded;
 	}
 
 	ret = mt7921_load_patch(dev);
@@ -949,6 +949,7 @@ static int mt7921_load_firmware(struct mt7921_dev *dev)
 		return -EIO;
 	}
 
+fw_loaded:
 	mt76_queue_tx_cleanup(dev, dev->mt76.q_mcu[MT_MCUQ_FWDL], false);
 
 #ifdef CONFIG_PM

From 015fe6fd29c4b9ac0f61b8c4455ef88e6018b9cc Mon Sep 17 00:00:00 2001
From: Shahjada Abul Husain <shahjada@chelsio.com>
Date: Thu, 8 Jul 2021 21:51:56 +0530
Subject: [PATCH 121/158] cxgb4: fix IRQ free race during driver unload

IRQs are requested during driver's ndo_open() and then later
freed up in disable_interrupts() during driver unload.
A race exists where driver can set the CXGB4_FULL_INIT_DONE
flag in ndo_open() after the disable_interrupts() in driver
unload path checks it, and hence misses calling free_irq().

Fix by unregistering netdevice first and sync with driver's
ndo_open(). This ensures disable_interrupts() checks the flag
correctly and frees up the IRQs properly.

Fixes: b37987e8db5f ("cxgb4: Disable interrupts and napi before unregistering netdev")
Signed-off-by: Shahjada Abul Husain <shahjada@chelsio.com>
Signed-off-by: Raju Rangoju <rajur@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/chelsio/cxgb4/cxgb4_main.c    | 18 ++++++++++--------
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c |  3 +++
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 9a2b166d651e..dbf9a0e6601d 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -2643,6 +2643,9 @@ static void detach_ulds(struct adapter *adap)
 {
 	unsigned int i;
 
+	if (!is_uld(adap))
+		return;
+
 	mutex_lock(&uld_mutex);
 	list_del(&adap->list_node);
 
@@ -7141,10 +7144,13 @@ static void remove_one(struct pci_dev *pdev)
 		 */
 		destroy_workqueue(adapter->workq);
 
-		if (is_uld(adapter)) {
-			detach_ulds(adapter);
-			t4_uld_clean_up(adapter);
-		}
+		detach_ulds(adapter);
+
+		for_each_port(adapter, i)
+			if (adapter->port[i]->reg_state == NETREG_REGISTERED)
+				unregister_netdev(adapter->port[i]);
+
+		t4_uld_clean_up(adapter);
 
 		adap_free_hma_mem(adapter);
 
@@ -7152,10 +7158,6 @@ static void remove_one(struct pci_dev *pdev)
 
 		cxgb4_free_mps_ref_entries(adapter);
 
-		for_each_port(adapter, i)
-			if (adapter->port[i]->reg_state == NETREG_REGISTERED)
-				unregister_netdev(adapter->port[i]);
-
 		debugfs_remove_recursive(adapter->debugfs_root);
 
 		if (!is_t4(adapter->params.chip))
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c
index 743af9e654aa..17faac715882 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c
@@ -581,6 +581,9 @@ void t4_uld_clean_up(struct adapter *adap)
 {
 	unsigned int i;
 
+	if (!is_uld(adap))
+		return;
+
 	mutex_lock(&uld_mutex);
 	for (i = 0; i < CXGB4_ULD_MAX; i++) {
 		if (!adap->uld[i].handle)

From 96248d6da65744e1baaa29e5c4e5dc233e29838b Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Thu, 8 Jul 2021 10:33:10 -0700
Subject: [PATCH 122/158] net: microchip: sparx5: fix kconfig warning

PHY_SPARX5_SERDES depends on OF so SPARX5_SWITCH should also depend
on OF since 'select' does not follow any dependencies.

WARNING: unmet direct dependencies detected for PHY_SPARX5_SERDES
  Depends on [n]: (ARCH_SPARX5 || COMPILE_TEST [=n]) && OF [=n] && HAS_IOMEM [=y]
  Selected by [y]:
  - SPARX5_SWITCH [=y] && NETDEVICES [=y] && ETHERNET [=y] && NET_VENDOR_MICROCHIP [=y] && NET_SWITCHDEV [=y] && HAS_IOMEM [=y]

Fixes: 3cfa11bac9bb ("net: sparx5: add the basic sparx5 driver")
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Lars Povlsen <lars.povlsen@microchip.com>
Cc: Steen Hegelund <Steen.Hegelund@microchip.com>
Cc: UNGLinuxDriver@microchip.com
Cc: linux-arm-kernel@lists.infradead.org
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: netdev@vger.kernel.org
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/microchip/sparx5/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/microchip/sparx5/Kconfig b/drivers/net/ethernet/microchip/sparx5/Kconfig
index a80419d8d4b5..ac403d43c74c 100644
--- a/drivers/net/ethernet/microchip/sparx5/Kconfig
+++ b/drivers/net/ethernet/microchip/sparx5/Kconfig
@@ -2,6 +2,7 @@ config SPARX5_SWITCH
 	tristate "Sparx5 switch driver"
 	depends on NET_SWITCHDEV
 	depends on HAS_IOMEM
+	depends on OF
 	select PHYLINK
 	select PHY_SPARX5_SERDES
 	select RESET_CONTROLLER

From 27fa107d3b8d13a57cdd7c7a40bd6548d4b9cef8 Mon Sep 17 00:00:00 2001
From: Ivan Mikhaylov <i.mikhaylov@yadro.com>
Date: Thu, 8 Jul 2021 15:27:52 +0300
Subject: [PATCH 123/158] net/ncsi: fix restricted cast warning of sparse

Sparse reports:
net/ncsi/ncsi-rsp.c:406:24: warning: cast to restricted __be32
net/ncsi/ncsi-manage.c:732:33: warning: cast to restricted __be32
net/ncsi/ncsi-manage.c:756:25: warning: cast to restricted __be32
net/ncsi/ncsi-manage.c:779:25: warning: cast to restricted __be32

Signed-off-by: Ivan Mikhaylov <i.mikhaylov@yadro.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ncsi/ncsi-manage.c | 6 +++---
 net/ncsi/ncsi-rsp.c    | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c
index ca04b6df1341..42b54a3da2e6 100644
--- a/net/ncsi/ncsi-manage.c
+++ b/net/ncsi/ncsi-manage.c
@@ -700,7 +700,7 @@ static int ncsi_oem_gma_handler_bcm(struct ncsi_cmd_arg *nca)
 	nca->payload = NCSI_OEM_BCM_CMD_GMA_LEN;
 
 	memset(data, 0, NCSI_OEM_BCM_CMD_GMA_LEN);
-	*(unsigned int *)data = ntohl(NCSI_OEM_MFR_BCM_ID);
+	*(unsigned int *)data = ntohl((__force __be32)NCSI_OEM_MFR_BCM_ID);
 	data[5] = NCSI_OEM_BCM_CMD_GMA;
 
 	nca->data = data;
@@ -724,7 +724,7 @@ static int ncsi_oem_gma_handler_mlx(struct ncsi_cmd_arg *nca)
 	nca->payload = NCSI_OEM_MLX_CMD_GMA_LEN;
 
 	memset(&u, 0, sizeof(u));
-	u.data_u32[0] = ntohl(NCSI_OEM_MFR_MLX_ID);
+	u.data_u32[0] = ntohl((__force __be32)NCSI_OEM_MFR_MLX_ID);
 	u.data_u8[5] = NCSI_OEM_MLX_CMD_GMA;
 	u.data_u8[6] = NCSI_OEM_MLX_CMD_GMA_PARAM;
 
@@ -747,7 +747,7 @@ static int ncsi_oem_smaf_mlx(struct ncsi_cmd_arg *nca)
 	int ret = 0;
 
 	memset(&u, 0, sizeof(u));
-	u.data_u32[0] = ntohl(NCSI_OEM_MFR_MLX_ID);
+	u.data_u32[0] = ntohl((__force __be32)NCSI_OEM_MFR_MLX_ID);
 	u.data_u8[5] = NCSI_OEM_MLX_CMD_SMAF;
 	u.data_u8[6] = NCSI_OEM_MLX_CMD_SMAF_PARAM;
 	memcpy(&u.data_u8[MLX_SMAF_MAC_ADDR_OFFSET],
diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c
index 888ccc2d4e34..04bc50be5c01 100644
--- a/net/ncsi/ncsi-rsp.c
+++ b/net/ncsi/ncsi-rsp.c
@@ -403,7 +403,7 @@ static int ncsi_rsp_handler_ev(struct ncsi_request *nr)
 	/* Update to VLAN mode */
 	cmd = (struct ncsi_cmd_ev_pkt *)skb_network_header(nr->cmd);
 	ncm->enable = 1;
-	ncm->data[0] = ntohl(cmd->mode);
+	ncm->data[0] = ntohl((__force __be32)cmd->mode);
 
 	return 0;
 }

From abd2fddc94a619b96bf41c60429d4c32bd118e17 Mon Sep 17 00:00:00 2001
From: Ivan Mikhaylov <i.mikhaylov@yadro.com>
Date: Thu, 8 Jul 2021 15:27:53 +0300
Subject: [PATCH 124/158] net/ncsi: add NCSI Intel OEM command to keep PHY up

This allows keeping the PHY link up and prevents any channel resets
during the host load.

It is the KEEP_PHY_LINK_UP option (Veto bit) in the i210 datasheet,
which blocks PHY reset and power state changes.

Signed-off-by: Ivan Mikhaylov <i.mikhaylov@yadro.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ncsi/Kconfig       |  6 ++++++
 net/ncsi/internal.h    |  5 +++++
 net/ncsi/ncsi-manage.c | 45 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 56 insertions(+)

diff --git a/net/ncsi/Kconfig b/net/ncsi/Kconfig
index 93309081f5a4..ea1dd32b6b1f 100644
--- a/net/ncsi/Kconfig
+++ b/net/ncsi/Kconfig
@@ -17,3 +17,9 @@ config NCSI_OEM_CMD_GET_MAC
 	help
 	  This allows to get MAC address from NCSI firmware and set them back to
 		controller.
+config NCSI_OEM_CMD_KEEP_PHY
+	bool "Keep PHY Link up"
+	depends on NET_NCSI
+	help
+	  This allows to keep PHY link up and prevents any channel resets during
+	  the host load.
diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h
index cbbb0de4750a..0b6cfd3b31e0 100644
--- a/net/ncsi/internal.h
+++ b/net/ncsi/internal.h
@@ -78,6 +78,9 @@ enum {
 /* OEM Vendor Manufacture ID */
 #define NCSI_OEM_MFR_MLX_ID             0x8119
 #define NCSI_OEM_MFR_BCM_ID             0x113d
+#define NCSI_OEM_MFR_INTEL_ID           0x157
+/* Intel specific OEM command */
+#define NCSI_OEM_INTEL_CMD_KEEP_PHY     0x20   /* CMD ID for Keep PHY up */
 /* Broadcom specific OEM Command */
 #define NCSI_OEM_BCM_CMD_GMA            0x01   /* CMD ID for Get MAC */
 /* Mellanox specific OEM Command */
@@ -86,6 +89,7 @@ enum {
 #define NCSI_OEM_MLX_CMD_SMAF           0x01   /* CMD ID for Set MC Affinity */
 #define NCSI_OEM_MLX_CMD_SMAF_PARAM     0x07   /* Parameter for SMAF         */
 /* OEM Command payload lengths*/
+#define NCSI_OEM_INTEL_CMD_KEEP_PHY_LEN 7
 #define NCSI_OEM_BCM_CMD_GMA_LEN        12
 #define NCSI_OEM_MLX_CMD_GMA_LEN        8
 #define NCSI_OEM_MLX_CMD_SMAF_LEN        60
@@ -271,6 +275,7 @@ enum {
 	ncsi_dev_state_probe_mlx_gma,
 	ncsi_dev_state_probe_mlx_smaf,
 	ncsi_dev_state_probe_cis,
+	ncsi_dev_state_probe_keep_phy,
 	ncsi_dev_state_probe_gvi,
 	ncsi_dev_state_probe_gc,
 	ncsi_dev_state_probe_gls,
diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c
index 42b54a3da2e6..89c7742cd72e 100644
--- a/net/ncsi/ncsi-manage.c
+++ b/net/ncsi/ncsi-manage.c
@@ -689,6 +689,35 @@ static int set_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc,
 	return 0;
 }
 
+#if IS_ENABLED(CONFIG_NCSI_OEM_CMD_KEEP_PHY)
+
+static int ncsi_oem_keep_phy_intel(struct ncsi_cmd_arg *nca)
+{
+	unsigned char data[NCSI_OEM_INTEL_CMD_KEEP_PHY_LEN];
+	int ret = 0;
+
+	nca->payload = NCSI_OEM_INTEL_CMD_KEEP_PHY_LEN;
+
+	memset(data, 0, NCSI_OEM_INTEL_CMD_KEEP_PHY_LEN);
+	*(unsigned int *)data = ntohl((__force __be32)NCSI_OEM_MFR_INTEL_ID);
+
+	data[4] = NCSI_OEM_INTEL_CMD_KEEP_PHY;
+
+	/* PHY Link up attribute */
+	data[6] = 0x1;
+
+	nca->data = data;
+
+	ret = ncsi_xmit_cmd(nca);
+	if (ret)
+		netdev_err(nca->ndp->ndev.dev,
+			   "NCSI: Failed to transmit cmd 0x%x during configure\n",
+			   nca->type);
+	return ret;
+}
+
+#endif
+
 #if IS_ENABLED(CONFIG_NCSI_OEM_CMD_GET_MAC)
 
 /* NCSI OEM Command APIs */
@@ -1392,7 +1421,23 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp)
 		}
 
 		nd->state = ncsi_dev_state_probe_gvi;
+		if (IS_ENABLED(CONFIG_NCSI_OEM_CMD_KEEP_PHY))
+			nd->state = ncsi_dev_state_probe_keep_phy;
 		break;
+#if IS_ENABLED(CONFIG_NCSI_OEM_CMD_KEEP_PHY)
+	case ncsi_dev_state_probe_keep_phy:
+		ndp->pending_req_num = 1;
+
+		nca.type = NCSI_PKT_CMD_OEM;
+		nca.package = ndp->active_package->id;
+		nca.channel = 0;
+		ret = ncsi_oem_keep_phy_intel(&nca);
+		if (ret)
+			goto error;
+
+		nd->state = ncsi_dev_state_probe_gvi;
+		break;
+#endif /* CONFIG_NCSI_OEM_CMD_KEEP_PHY */
 	case ncsi_dev_state_probe_gvi:
 	case ncsi_dev_state_probe_gc:
 	case ncsi_dev_state_probe_gls:

From 163f5de509a8ec193df94a9b9afbeb1a9e3f46a6 Mon Sep 17 00:00:00 2001
From: Ivan Mikhaylov <i.mikhaylov@yadro.com>
Date: Thu, 8 Jul 2021 15:27:54 +0300
Subject: [PATCH 125/158] net/ncsi: add dummy response handler for Intel boards

Add the dummy response handler for Intel boards to prevent incorrect
handling of OEM commands.

Signed-off-by: Ivan Mikhaylov <i.mikhaylov@yadro.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ncsi/ncsi-rsp.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c
index 04bc50be5c01..d48374894817 100644
--- a/net/ncsi/ncsi-rsp.c
+++ b/net/ncsi/ncsi-rsp.c
@@ -699,12 +699,19 @@ static int ncsi_rsp_handler_oem_bcm(struct ncsi_request *nr)
 	return 0;
 }
 
+/* Response handler for Intel card */
+static int ncsi_rsp_handler_oem_intel(struct ncsi_request *nr)
+{
+	return 0;
+}
+
 static struct ncsi_rsp_oem_handler {
 	unsigned int	mfr_id;
 	int		(*handler)(struct ncsi_request *nr);
 } ncsi_rsp_oem_handlers[] = {
 	{ NCSI_OEM_MFR_MLX_ID, ncsi_rsp_handler_oem_mlx },
-	{ NCSI_OEM_MFR_BCM_ID, ncsi_rsp_handler_oem_bcm }
+	{ NCSI_OEM_MFR_BCM_ID, ncsi_rsp_handler_oem_bcm },
+	{ NCSI_OEM_MFR_INTEL_ID, ncsi_rsp_handler_oem_intel }
 };
 
 /* Response handler for OEM command */

From 2b452550a203d88112eaf0ba9fc4b750a000b496 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Thu, 8 Jul 2021 18:55:32 -0700
Subject: [PATCH 126/158] net: bcmgenet: Ensure all TX/RX queues DMAs are
 disabled

Make sure that we disable each of the TX and RX queues in the TDMA and
RDMA control registers. This is a correctness change to be symmetrical
with the code that enables the TX and RX queues.

Tested-by: Maxime Ripard <maxime@cerno.tech>
Fixes: 1c1008c793fa ("net: bcmgenet: add main driver file")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/genet/bcmgenet.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
index 35e9956e930c..db74241935ab 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
@@ -3238,15 +3238,21 @@ static void bcmgenet_get_hw_addr(struct bcmgenet_priv *priv,
 /* Returns a reusable dma control register value */
 static u32 bcmgenet_dma_disable(struct bcmgenet_priv *priv)
 {
+	unsigned int i;
 	u32 reg;
 	u32 dma_ctrl;
 
 	/* disable DMA */
 	dma_ctrl = 1 << (DESC_INDEX + DMA_RING_BUF_EN_SHIFT) | DMA_EN;
+	for (i = 0; i < priv->hw_params->tx_queues; i++)
+		dma_ctrl |= (1 << (i + DMA_RING_BUF_EN_SHIFT));
 	reg = bcmgenet_tdma_readl(priv, DMA_CTRL);
 	reg &= ~dma_ctrl;
 	bcmgenet_tdma_writel(priv, reg, DMA_CTRL);
 
+	dma_ctrl = 1 << (DESC_INDEX + DMA_RING_BUF_EN_SHIFT) | DMA_EN;
+	for (i = 0; i < priv->hw_params->rx_queues; i++)
+		dma_ctrl |= (1 << (i + DMA_RING_BUF_EN_SHIFT));
 	reg = bcmgenet_rdma_readl(priv, DMA_CTRL);
 	reg &= ~dma_ctrl;
 	bcmgenet_rdma_writel(priv, reg, DMA_CTRL);

From f263a81451c12da5a342d90572e317e611846f2c Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Wed, 7 Jul 2021 15:38:47 -0700
Subject: [PATCH 127/158] bpf: Track subprog poke descriptors correctly and fix
 use-after-free

Subprograms are calling map_poke_track(), but on program release there is no
hook to call map_poke_untrack(). However, on program release, the aux memory
(and poke descriptor table) is freed even though we still have a reference to
it in the element list of the map aux data. When we run map_poke_run(), we then
end up accessing free'd memory, triggering KASAN in prog_array_map_poke_run():

  [...]
  [  402.824689] BUG: KASAN: use-after-free in prog_array_map_poke_run+0xc2/0x34e
  [  402.824698] Read of size 4 at addr ffff8881905a7940 by task hubble-fgs/4337
  [  402.824705] CPU: 1 PID: 4337 Comm: hubble-fgs Tainted: G          I       5.12.0+ #399
  [  402.824715] Call Trace:
  [  402.824719]  dump_stack+0x93/0xc2
  [  402.824727]  print_address_description.constprop.0+0x1a/0x140
  [  402.824736]  ? prog_array_map_poke_run+0xc2/0x34e
  [  402.824740]  ? prog_array_map_poke_run+0xc2/0x34e
  [  402.824744]  kasan_report.cold+0x7c/0xd8
  [  402.824752]  ? prog_array_map_poke_run+0xc2/0x34e
  [  402.824757]  prog_array_map_poke_run+0xc2/0x34e
  [  402.824765]  bpf_fd_array_map_update_elem+0x124/0x1a0
  [...]

The elements concerned are walked as follows:

    for (i = 0; i < elem->aux->size_poke_tab; i++) {
           poke = &elem->aux->poke_tab[i];
    [...]

The access to size_poke_tab is a 4 byte read, verified by checking offsets
in the KASAN dump:

  [  402.825004] The buggy address belongs to the object at ffff8881905a7800
                 which belongs to the cache kmalloc-1k of size 1024
  [  402.825008] The buggy address is located 320 bytes inside of
                 1024-byte region [ffff8881905a7800, ffff8881905a7c00)

The pahole output of bpf_prog_aux:

  struct bpf_prog_aux {
    [...]
    /* --- cacheline 5 boundary (320 bytes) --- */
    u32                        size_poke_tab;        /*   320     4 */
    [...]

In general, subprograms do not necessarily manage their own data structures.
For example, BTF func_info and linfo are just pointers to the main program
structure. This allows reference counting and cleanup to be done on the latter
which simplifies their management a bit. The aux->poke_tab struct, however,
did not follow this logic. The initial proposed fix for this use-after-free
bug further embedded poke data tracking into the subprogram with proper
reference counting. However, Daniel and Alexei questioned why we were treating
these objects specially; I agree, it's unnecessary. The fix here removes the
per subprogram poke table allocation and map tracking and instead simply points
the aux->poke_tab pointer at the main program's poke table. This way, map
tracking is simplified to the main program and we do not need to manage them
per subprogram.

This also means, bpf_prog_free_deferred(), which unwinds the program reference
counting and kfrees objects, needs to ensure that we don't try to double free
the poke_tab when free'ing the subprog structures. This is easily solved by
NULL'ing the poke_tab pointer. The second detail is to ensure that per
subprogram JIT logic only does fixups on poke_tab[] entries it owns. To do
this, we add a pointer in the poke structure to point at the subprogram value
so JITs can easily check while walking the poke_tab structure if the current
entry belongs to the current program. The aux pointer is stable and therefore
suitable for such comparison. On the jit_subprogs() error path, we omit
cleaning up the poke->aux field because these are only ever referenced from
the JIT side, but on error we will never make it to the JIT, so it's fine to
leave them dangling. Removing these pointers would complicate the error path
for no reason. However, we do need to untrack all poke descriptors from the
main program as otherwise they could race with the freeing of JIT memory from
the subprograms. Lastly, a748c6975dea3 ("bpf: propagate poke descriptors to
subprograms") had an off-by-one on the subprogram instruction index range
check as it was testing 'insn_idx >= subprog_start && insn_idx <= subprog_end'.
However, subprog_end is the next subprogram's start instruction.

Fixes: a748c6975dea3 ("bpf: propagate poke descriptors to subprograms")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Co-developed-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20210707223848.14580-2-john.fastabend@gmail.com
---
 arch/x86/net/bpf_jit_comp.c |  3 ++
 include/linux/bpf.h         |  1 +
 kernel/bpf/core.c           |  8 ++++-
 kernel/bpf/verifier.c       | 60 +++++++++++++------------------------
 4 files changed, 32 insertions(+), 40 deletions(-)

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index e835164189f1..4b951458c9fc 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -570,6 +570,9 @@ static void bpf_tail_call_direct_fixup(struct bpf_prog *prog)
 
 	for (i = 0; i < prog->aux->size_poke_tab; i++) {
 		poke = &prog->aux->poke_tab[i];
+		if (poke->aux && poke->aux != prog->aux)
+			continue;
+
 		WARN_ON_ONCE(READ_ONCE(poke->tailcall_target_stable));
 
 		if (poke->reason != BPF_POKE_REASON_TAIL_CALL)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f309fc1509f2..e8e2b0393ca9 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -780,6 +780,7 @@ struct bpf_jit_poke_descriptor {
 	void *tailcall_target;
 	void *tailcall_bypass;
 	void *bypass_addr;
+	void *aux;
 	union {
 		struct {
 			struct bpf_map *map;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 034ad93a1ad7..9b1577498373 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2236,8 +2236,14 @@ static void bpf_prog_free_deferred(struct work_struct *work)
 #endif
 	if (aux->dst_trampoline)
 		bpf_trampoline_put(aux->dst_trampoline);
-	for (i = 0; i < aux->func_cnt; i++)
+	for (i = 0; i < aux->func_cnt; i++) {
+		/* We can just unlink the subprog poke descriptor table as
+		 * it was originally linked to the main program and is also
+		 * released along with it.
+		 */
+		aux->func[i]->aux->poke_tab = NULL;
 		bpf_jit_free(aux->func[i]);
+	}
 	if (aux->func_cnt) {
 		kfree(aux->func);
 		bpf_prog_unlock_free(aux->prog);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index be38bb930bf1..42a4063de7cd 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -12121,33 +12121,19 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 			goto out_free;
 		func[i]->is_func = 1;
 		func[i]->aux->func_idx = i;
-		/* the btf and func_info will be freed only at prog->aux */
+		/* Below members will be freed only at prog->aux */
 		func[i]->aux->btf = prog->aux->btf;
 		func[i]->aux->func_info = prog->aux->func_info;
+		func[i]->aux->poke_tab = prog->aux->poke_tab;
+		func[i]->aux->size_poke_tab = prog->aux->size_poke_tab;
 
 		for (j = 0; j < prog->aux->size_poke_tab; j++) {
-			u32 insn_idx = prog->aux->poke_tab[j].insn_idx;
-			int ret;
+			struct bpf_jit_poke_descriptor *poke;
 
-			if (!(insn_idx >= subprog_start &&
-			      insn_idx <= subprog_end))
-				continue;
-
-			ret = bpf_jit_add_poke_descriptor(func[i],
-							  &prog->aux->poke_tab[j]);
-			if (ret < 0) {
-				verbose(env, "adding tail call poke descriptor failed\n");
-				goto out_free;
-			}
-
-			func[i]->insnsi[insn_idx - subprog_start].imm = ret + 1;
-
-			map_ptr = func[i]->aux->poke_tab[ret].tail_call.map;
-			ret = map_ptr->ops->map_poke_track(map_ptr, func[i]->aux);
-			if (ret < 0) {
-				verbose(env, "tracking tail call prog failed\n");
-				goto out_free;
-			}
+			poke = &prog->aux->poke_tab[j];
+			if (poke->insn_idx < subprog_end &&
+			    poke->insn_idx >= subprog_start)
+				poke->aux = func[i]->aux;
 		}
 
 		/* Use bpf_prog_F_tag to indicate functions in stack traces.
@@ -12178,18 +12164,6 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 		cond_resched();
 	}
 
-	/* Untrack main program's aux structs so that during map_poke_run()
-	 * we will not stumble upon the unfilled poke descriptors; each
-	 * of the main program's poke descs got distributed across subprogs
-	 * and got tracked onto map, so we are sure that none of them will
-	 * be missed after the operation below
-	 */
-	for (i = 0; i < prog->aux->size_poke_tab; i++) {
-		map_ptr = prog->aux->poke_tab[i].tail_call.map;
-
-		map_ptr->ops->map_poke_untrack(map_ptr, prog->aux);
-	}
-
 	/* at this point all bpf functions were successfully JITed
 	 * now populate all bpf_calls with correct addresses and
 	 * run last pass of JIT
@@ -12267,14 +12241,22 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 	bpf_prog_jit_attempt_done(prog);
 	return 0;
 out_free:
+	/* We failed JIT'ing, so at this point we need to unregister poke
+	 * descriptors from subprogs, so that kernel is not attempting to
+	 * patch it anymore as we're freeing the subprog JIT memory.
+	 */
+	for (i = 0; i < prog->aux->size_poke_tab; i++) {
+		map_ptr = prog->aux->poke_tab[i].tail_call.map;
+		map_ptr->ops->map_poke_untrack(map_ptr, prog->aux);
+	}
+	/* At this point we're guaranteed that poke descriptors are not
+	 * live anymore. We can just unlink its descriptor table as it's
+	 * released with the main prog.
+	 */
 	for (i = 0; i < env->subprog_cnt; i++) {
 		if (!func[i])
 			continue;
-
-		for (j = 0; j < func[i]->aux->size_poke_tab; j++) {
-			map_ptr = func[i]->aux->poke_tab[j].tail_call.map;
-			map_ptr->ops->map_poke_untrack(map_ptr, func[i]->aux);
-		}
+		func[i]->aux->poke_tab = NULL;
 		bpf_jit_free(func[i]);
 	}
 	kfree(func);

From 1fb5ba29ad0835c5cbfc69a27f9c2733cb65726e Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Wed, 7 Jul 2021 15:38:48 -0700
Subject: [PATCH 128/158] bpf: Selftest to verify mixing bpf2bpf calls and
 tailcalls with insn patch

This adds some extra noise to the tailcall_bpf2bpf4 tests that will cause
the verifier to patch insns. This then moves around subprog start/end insn
index and poke descriptor insn index to ensure that the verifier and JIT
will continue to track these correctly.

If done correctly, the verifier should pass this program the same as
before and the JIT should emit tail call logic.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20210707223848.14580-3-john.fastabend@gmail.com
---
 .../selftests/bpf/prog_tests/tailcalls.c      | 36 +++++++++++++------
 .../selftests/bpf/progs/tailcall_bpf2bpf4.c   | 18 ++++++++++
 2 files changed, 44 insertions(+), 10 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/tailcalls.c b/tools/testing/selftests/bpf/prog_tests/tailcalls.c
index ee27d68d2a1c..b5940e6ca67c 100644
--- a/tools/testing/selftests/bpf/prog_tests/tailcalls.c
+++ b/tools/testing/selftests/bpf/prog_tests/tailcalls.c
@@ -715,6 +715,8 @@ out:
 	bpf_object__close(obj);
 }
 
+#include "tailcall_bpf2bpf4.skel.h"
+
 /* test_tailcall_bpf2bpf_4 checks that tailcall counter is correctly preserved
  * across tailcalls combined with bpf2bpf calls. for making sure that tailcall
  * counter behaves correctly, bpf program will go through following flow:
@@ -727,10 +729,15 @@ out:
  * the loop begins. At the end of the test make sure that the global counter is
  * equal to 31, because tailcall counter includes the first two tailcalls
  * whereas global counter is incremented only on loop presented on flow above.
+ *
+ * The noise parameter is used to insert bpf_map_lookup_elem calls into the
+ * logic to force the verifier to patch instructions. This allows us to ensure
+ * jump logic remains correct with instruction movement.
  */
-static void test_tailcall_bpf2bpf_4(void)
+static void test_tailcall_bpf2bpf_4(bool noise)
 {
-	int err, map_fd, prog_fd, main_fd, data_fd, i, val;
+	int err, map_fd, prog_fd, main_fd, data_fd, i;
+	struct tailcall_bpf2bpf4__bss val;
 	struct bpf_map *prog_array, *data_map;
 	struct bpf_program *prog;
 	struct bpf_object *obj;
@@ -774,11 +781,6 @@ static void test_tailcall_bpf2bpf_4(void)
 			goto out;
 	}
 
-	err = bpf_prog_test_run(main_fd, 1, &pkt_v4, sizeof(pkt_v4), 0,
-				&duration, &retval, NULL);
-	CHECK(err || retval != sizeof(pkt_v4) * 3, "tailcall", "err %d errno %d retval %d\n",
-	      err, errno, retval);
-
 	data_map = bpf_object__find_map_by_name(obj, "tailcall.bss");
 	if (CHECK_FAIL(!data_map || !bpf_map__is_internal(data_map)))
 		return;
@@ -787,10 +789,22 @@ static void test_tailcall_bpf2bpf_4(void)
 	if (CHECK_FAIL(map_fd < 0))
 		return;
 
+	i = 0;
+	val.noise = noise;
+	val.count = 0;
+	err = bpf_map_update_elem(data_fd, &i, &val, BPF_ANY);
+	if (CHECK_FAIL(err))
+		goto out;
+
+	err = bpf_prog_test_run(main_fd, 1, &pkt_v4, sizeof(pkt_v4), 0,
+				&duration, &retval, NULL);
+	CHECK(err || retval != sizeof(pkt_v4) * 3, "tailcall", "err %d errno %d retval %d\n",
+	      err, errno, retval);
+
 	i = 0;
 	err = bpf_map_lookup_elem(data_fd, &i, &val);
-	CHECK(err || val != 31, "tailcall count", "err %d errno %d count %d\n",
-	      err, errno, val);
+	CHECK(err || val.count != 31, "tailcall count", "err %d errno %d count %d\n",
+	      err, errno, val.count);
 
 out:
 	bpf_object__close(obj);
@@ -815,5 +829,7 @@ void test_tailcalls(void)
 	if (test__start_subtest("tailcall_bpf2bpf_3"))
 		test_tailcall_bpf2bpf_3();
 	if (test__start_subtest("tailcall_bpf2bpf_4"))
-		test_tailcall_bpf2bpf_4();
+		test_tailcall_bpf2bpf_4(false);
+	if (test__start_subtest("tailcall_bpf2bpf_5"))
+		test_tailcall_bpf2bpf_4(true);
 }
diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf4.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf4.c
index 77df6d4db895..e89368a50b97 100644
--- a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf4.c
+++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf4.c
@@ -2,6 +2,13 @@
 #include <linux/bpf.h>
 #include <bpf/bpf_helpers.h>
 
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__uint(max_entries, 1);
+	__uint(key_size, sizeof(__u32));
+	__uint(value_size, sizeof(__u32));
+} nop_table SEC(".maps");
+
 struct {
 	__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
 	__uint(max_entries, 3);
@@ -10,10 +17,21 @@ struct {
 } jmp_table SEC(".maps");
 
 int count = 0;
+int noise = 0;
+
+__always_inline int subprog_noise(void)
+{
+	__u32 key = 0;
+
+	bpf_map_lookup_elem(&nop_table, &key);
+	return 0;
+}
 
 __noinline
 int subprog_tail_2(struct __sk_buff *skb)
 {
+	if (noise)
+		subprog_noise();
 	bpf_tail_call_static(skb, &jmp_table, 2);
 	return skb->len * 3;
 }

From c78eaeebe855fd93f2e77142ffd0404a54070d84 Mon Sep 17 00:00:00 2001
From: Pavel Skripkin <paskripkin@gmail.com>
Date: Fri, 9 Jul 2021 17:09:53 +0300
Subject: [PATCH 129/158] net: moxa: fix UAF in moxart_mac_probe

In case of netdev registration failure the code path will
jump to init_fail label:

init_fail:
	netdev_err(ndev, "init failed\n");
	moxart_mac_free_memory(ndev);
irq_map_fail:
	free_netdev(ndev);
	return ret;

So, there is no need to call free_netdev() before jumping
to error handling path, since it can cause UAF or double-free
bug.

Fixes: 6c821bd9edc9 ("net: Add MOXA ART SoCs ethernet driver")
Signed-off-by: Pavel Skripkin <paskripkin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/moxa/moxart_ether.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/moxa/moxart_ether.c b/drivers/net/ethernet/moxa/moxart_ether.c
index 5249b64f4fc5..49def6934cad 100644
--- a/drivers/net/ethernet/moxa/moxart_ether.c
+++ b/drivers/net/ethernet/moxa/moxart_ether.c
@@ -540,10 +540,8 @@ static int moxart_mac_probe(struct platform_device *pdev)
 	SET_NETDEV_DEV(ndev, &pdev->dev);
 
 	ret = register_netdev(ndev);
-	if (ret) {
-		free_netdev(ndev);
+	if (ret)
 		goto init_fail;
-	}
 
 	netdev_dbg(ndev, "%s: IRQ=%d address=%pM\n",
 		   __func__, ndev->irq, ndev->dev_addr);

From ad297cd2db8953e2202970e9504cab247b6c7cb4 Mon Sep 17 00:00:00 2001
From: Pavel Skripkin <paskripkin@gmail.com>
Date: Fri, 9 Jul 2021 17:24:18 +0300
Subject: [PATCH 130/158] net: qcom/emac: fix UAF in emac_remove

adpt is netdev private data and it cannot be
used after free_netdev() call. Using adpt after free_netdev()
can cause a UAF bug. Fix it by moving free_netdev() to the end of the
function.

Fixes: 54e19bc74f33 ("net: qcom/emac: do not use devm on internal phy pdev")
Signed-off-by: Pavel Skripkin <paskripkin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qualcomm/emac/emac.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/qualcomm/emac/emac.c b/drivers/net/ethernet/qualcomm/emac/emac.c
index 8543bf3c3484..ad655f0a4965 100644
--- a/drivers/net/ethernet/qualcomm/emac/emac.c
+++ b/drivers/net/ethernet/qualcomm/emac/emac.c
@@ -735,12 +735,13 @@ static int emac_remove(struct platform_device *pdev)
 
 	put_device(&adpt->phydev->mdio.dev);
 	mdiobus_unregister(adpt->mii_bus);
-	free_netdev(netdev);
 
 	if (adpt->phy.digital)
 		iounmap(adpt->phy.digital);
 	iounmap(adpt->phy.base);
 
+	free_netdev(netdev);
+
 	return 0;
 }
 

From 0336f8ffece62f882ab3012820965a786a983f70 Mon Sep 17 00:00:00 2001
From: Pavel Skripkin <paskripkin@gmail.com>
Date: Fri, 9 Jul 2021 17:58:29 +0300
Subject: [PATCH 131/158] net: ti: fix UAF in tlan_remove_one

priv is netdev private data and it cannot be
used after free_netdev() call. Using priv after free_netdev()
can cause a UAF bug. Fix it by moving free_netdev() to the end of the
function.

Fixes: 1e0a8b13d355 ("tlan: cancel work at remove path")
Signed-off-by: Pavel Skripkin <paskripkin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ti/tlan.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/ti/tlan.c b/drivers/net/ethernet/ti/tlan.c
index 0b2ce4bdc2c3..e0cb713193ea 100644
--- a/drivers/net/ethernet/ti/tlan.c
+++ b/drivers/net/ethernet/ti/tlan.c
@@ -313,9 +313,8 @@ static void tlan_remove_one(struct pci_dev *pdev)
 	pci_release_regions(pdev);
 #endif
 
-	free_netdev(dev);
-
 	cancel_work_sync(&priv->tlan_tqueue);
+	free_netdev(dev);
 }
 
 static void tlan_start(struct net_device *dev)

From 43b90bfad34bcb81b8a5bc7dc650800f4be1787e Mon Sep 17 00:00:00 2001
From: Alexander Ovechkin <ovov@yandex-team.ru>
Date: Fri, 9 Jul 2021 18:28:23 +0300
Subject: [PATCH 132/158] net: send SYNACK packet with accepted fwmark

commit e05a90ec9e16 ("net: reflect mark on tcp syn ack packets")
fixed IPv4 only.

This part is for the IPv6 side.

Fixes: e05a90ec9e16 ("net: reflect mark on tcp syn ack packets")
Signed-off-by: Alexander Ovechkin <ovov@yandex-team.ru>
Acked-by: Dmitry Yakunin <zeil@yandex-team.ru>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/tcp_ipv6.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 323989927a0a..0ce52d46e4f8 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -555,7 +555,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
 		opt = ireq->ipv6_opt;
 		if (!opt)
 			opt = rcu_dereference(np->opt);
-		err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt,
+		err = ip6_xmit(sk, skb, fl6, skb->mark ? : sk->sk_mark, opt,
 			       tclass, sk->sk_priority);
 		rcu_read_unlock();
 		err = net_xmit_eval(err);

From 358ed624207012f03318235017ac6fb41f8af592 Mon Sep 17 00:00:00 2001
From: Talal Ahmad <talalahmad@google.com>
Date: Fri, 9 Jul 2021 11:43:06 -0400
Subject: [PATCH 133/158] tcp: call sk_wmem_schedule before sk_mem_charge in
 zerocopy path

sk_wmem_schedule makes sure that sk_forward_alloc has enough
bytes for charging that is going to be done by sk_mem_charge.

In the transmit zerocopy path, there is sk_mem_charge but there was
no call to sk_wmem_schedule. This change adds that call.

Without this call to sk_wmem_schedule, sk_forward_alloc can go
negative, which is a bug because sk_forward_alloc is per-socket
space that has been forward charged, so it can't be negative.

Fixes: f214f915e7db ("tcp: enable MSG_ZEROCOPY")
Signed-off-by: Talal Ahmad <talalahmad@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Reviewed-by: Wei Wang <weiwan@google.com>
Reviewed-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index d5ab5f243640..8cb44040ec68 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1375,6 +1375,9 @@ new_segment:
 			}
 			pfrag->offset += copy;
 		} else {
+			if (!sk_wmem_schedule(sk, copy))
+				goto wait_for_space;
+
 			err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
 			if (err == -EMSGSIZE || err == -EEXIST) {
 				tcp_mark_push(tp, skb);

From 28b34f01a73435a754956ebae826e728c03ffa38 Mon Sep 17 00:00:00 2001
From: Antoine Tenart <atenart@kernel.org>
Date: Fri, 9 Jul 2021 18:16:09 +0200
Subject: [PATCH 134/158] net: do not reuse skbuff allocated from
 skbuff_fclone_cache in the skb cache

Some socket buffers allocated in the fclone cache (in __alloc_skb) can
end-up in the following path[1]:

napi_skb_finish
  __kfree_skb_defer
    napi_skb_cache_put

The issue is napi_skb_cache_put is not fclone friendly and will put
those skbuff in the skb cache to be reused later, although this cache
only expects skbuff allocated from skbuff_head_cache. When this happens
the skbuff is eventually freed using the wrong origin cache, and we can
see traces similar to:

[ 1223.947534] cache_from_obj: Wrong slab cache. skbuff_head_cache but object is from skbuff_fclone_cache
[ 1223.948895] WARNING: CPU: 3 PID: 0 at mm/slab.h:442 kmem_cache_free+0x251/0x3e0
[ 1223.950211] Modules linked in:
[ 1223.950680] CPU: 3 PID: 0 Comm: swapper/3 Not tainted 5.13.0+ #474
[ 1223.951587] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-3.fc34 04/01/2014
[ 1223.953060] RIP: 0010:kmem_cache_free+0x251/0x3e0

Leading sometimes to other memory related issues.

Fix this by using __kfree_skb for fclone skbuff, similar to what is done
in the other place __kfree_skb_defer is called.

[1] At least in setups using veth pairs and tunnels. Building a kernel
    with KASAN we can for example see packets allocated in
    sk_stream_alloc_skb hit the above path and later the issue arises
    when the skbuff is reused.

Fixes: 9243adfc311a ("skbuff: queue NAPI_MERGED_FREE skbs into NAPI cache instead of freeing")
Cc: Alexander Lobakin <alobakin@pm.me>
Signed-off-by: Antoine Tenart <atenart@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/core/dev.c b/net/core/dev.c
index 03c95a0867bb..64b21f0a2048 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6234,6 +6234,8 @@ static gro_result_t napi_skb_finish(struct napi_struct *napi,
 	case GRO_MERGED_FREE:
 		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
 			napi_skb_free_stolen_head(skb);
+		else if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
+			__kfree_skb(skb);
 		else
 			__kfree_skb_defer(skb);
 		break;

From 9992a078b1771da354ac1f9737e1e639b687caa2 Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Fri, 9 Jul 2021 11:45:02 +0800
Subject: [PATCH 135/158] net: ip_tunnel: fix mtu calculation for ETHER tunnel
 devices

Commit 28e104d00281 ("net: ip_tunnel: fix mtu calculation") removed the
dev->hard_header_len subtraction when calculating the MTU for tunnel
devices, as there is an overhead for devices that have header_ops.

But there are ETHER tunnel devices, like gre_tap or erspan, which don't
have header_ops but set dev->hard_header_len during setup. As a result,
packets larger than (MTU - ETH_HLEN) cannot be transmitted. Fix it by
subtracting the ETHER tunnel devices' dev->hard_header_len in the MTU
calculation.

Fixes: 28e104d00281 ("net: ip_tunnel: fix mtu calculation")
Reported-by: Jianlin Shi <jishi@redhat.com>
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_tunnel.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index f6cc26de5ed3..0dca00745ac3 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -317,7 +317,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev)
 	}
 
 	dev->needed_headroom = t_hlen + hlen;
-	mtu -= t_hlen;
+	mtu -= t_hlen + (dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0);
 
 	if (mtu < IPV4_MIN_MTU)
 		mtu = IPV4_MIN_MTU;
@@ -348,6 +348,9 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net,
 	t_hlen = nt->hlen + sizeof(struct iphdr);
 	dev->min_mtu = ETH_MIN_MTU;
 	dev->max_mtu = IP_MAX_MTU - t_hlen;
+	if (dev->type == ARPHRD_ETHER)
+		dev->max_mtu -= dev->hard_header_len;
+
 	ip_tunnel_add(itn, nt);
 	return nt;
 
@@ -489,11 +492,14 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
 
 	tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
 	pkt_size = skb->len - tunnel_hlen;
+	pkt_size -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;
 
-	if (df)
+	if (df) {
 		mtu = dst_mtu(&rt->dst) - (sizeof(struct iphdr) + tunnel_hlen);
-	else
+		mtu -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;
+	} else {
 		mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
+	}
 
 	if (skb_valid_dst(skb))
 		skb_dst_update_pmtu_no_confirm(skb, mtu);
@@ -972,6 +978,9 @@ int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
 	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
 	int max_mtu = IP_MAX_MTU - t_hlen;
 
+	if (dev->type == ARPHRD_ETHER)
+		max_mtu -= dev->hard_header_len;
+
 	if (new_mtu < ETH_MIN_MTU)
 		return -EINVAL;
 
@@ -1149,6 +1158,9 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
 	if (tb[IFLA_MTU]) {
 		unsigned int max = IP_MAX_MTU - (nt->hlen + sizeof(struct iphdr));
 
+		if (dev->type == ARPHRD_ETHER)
+			max -= dev->hard_header_len;
+
 		mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, max);
 	}
 

From 67a9c94317402b826fc3db32afc8f39336803d97 Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Fri, 9 Jul 2021 17:35:18 +0000
Subject: [PATCH 136/158] net: validate lwtstate->data before returning from
 skb_tunnel_info()

skb_tunnel_info() returns a pointer to lwtstate->data as ip_tunnel_info
type without validation. lwtstate->data can hold various types, such as
mpls_iptunnel_encap, and these are not compatible with ip_tunnel_info.
So skb_tunnel_info() should validate the lwtstate type before returning
that pointer.

Splat looks like:
BUG: KASAN: slab-out-of-bounds in vxlan_get_route+0x418/0x4b0 [vxlan]
Read of size 2 at addr ffff888106ec2698 by task ping/811

CPU: 1 PID: 811 Comm: ping Not tainted 5.13.0+ #1195
Call Trace:
 dump_stack_lvl+0x56/0x7b
 print_address_description.constprop.8.cold.13+0x13/0x2ee
 ? vxlan_get_route+0x418/0x4b0 [vxlan]
 ? vxlan_get_route+0x418/0x4b0 [vxlan]
 kasan_report.cold.14+0x83/0xdf
 ? vxlan_get_route+0x418/0x4b0 [vxlan]
 vxlan_get_route+0x418/0x4b0 [vxlan]
 [ ... ]
 vxlan_xmit_one+0x148b/0x32b0 [vxlan]
 [ ... ]
 vxlan_xmit+0x25c5/0x4780 [vxlan]
 [ ... ]
 dev_hard_start_xmit+0x1ae/0x6e0
 __dev_queue_xmit+0x1f39/0x31a0
 [ ... ]
 neigh_xmit+0x2f9/0x940
 mpls_xmit+0x911/0x1600 [mpls_iptunnel]
 lwtunnel_xmit+0x18f/0x450
 ip_finish_output2+0x867/0x2040
 [ ... ]

Fixes: 61adedf3e3f1 ("route: move lwtunnel state to dst_entry")
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dst_metadata.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h
index 56cb3c38569a..14efa0ded75d 100644
--- a/include/net/dst_metadata.h
+++ b/include/net/dst_metadata.h
@@ -45,7 +45,9 @@ skb_tunnel_info(const struct sk_buff *skb)
 		return &md_dst->u.tun_info;
 
 	dst = skb_dst(skb);
-	if (dst && dst->lwtstate)
+	if (dst && dst->lwtstate &&
+	    (dst->lwtstate->type == LWTUNNEL_ENCAP_IP ||
+	     dst->lwtstate->type == LWTUNNEL_ENCAP_IP6))
 		return lwt_tun_info(dst->lwtstate);
 
 	return NULL;

From 0c71929b5893e410e0efbe1bbeca6f19a5f19956 Mon Sep 17 00:00:00 2001
From: Jianguo Wu <wujianguo@chinatelecom.cn>
Date: Fri, 9 Jul 2021 17:20:46 -0700
Subject: [PATCH 137/158] mptcp: fix warning in __skb_flow_dissect() when do
 syn cookie for subflow join

I ran a stress test with wrk[1] and webfsd[2] with the assistance of
mptcp-tools[3]:

  Server side:
      ./use_mptcp.sh webfsd -4 -R /tmp/ -p 8099
  Client side:
      ./use_mptcp.sh wrk -c 200 -d 30 -t 4 http://192.168.174.129:8099/

and got the following warning message:

[   55.552626] TCP: request_sock_subflow: Possible SYN flooding on port 8099. Sending cookies.  Check SNMP counters.
[   55.553024] ------------[ cut here ]------------
[   55.553027] WARNING: CPU: 0 PID: 10 at net/core/flow_dissector.c:984 __skb_flow_dissect+0x280/0x1650
...
[   55.553117] CPU: 0 PID: 10 Comm: ksoftirqd/0 Not tainted 5.12.0+ #18
[   55.553121] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 02/27/2020
[   55.553124] RIP: 0010:__skb_flow_dissect+0x280/0x1650
...
[   55.553133] RSP: 0018:ffffb79580087770 EFLAGS: 00010246
[   55.553137] RAX: 0000000000000000 RBX: ffffffff8ddb58e0 RCX: ffffb79580087888
[   55.553139] RDX: ffffffff8ddb58e0 RSI: ffff8f7e4652b600 RDI: 0000000000000000
[   55.553141] RBP: ffffb79580087858 R08: 0000000000000000 R09: 0000000000000008
[   55.553143] R10: 000000008c622965 R11: 00000000d3313a5b R12: ffff8f7e4652b600
[   55.553146] R13: ffff8f7e465c9062 R14: 0000000000000000 R15: ffffb79580087888
[   55.553149] FS:  0000000000000000(0000) GS:ffff8f7f75e00000(0000) knlGS:0000000000000000
[   55.553152] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   55.553154] CR2: 00007f73d1d19000 CR3: 0000000135e10004 CR4: 00000000003706f0
[   55.553160] Call Trace:
[   55.553166]  ? __sha256_final+0x67/0xd0
[   55.553173]  ? sha256+0x7e/0xa0
[   55.553177]  __skb_get_hash+0x57/0x210
[   55.553182]  subflow_init_req_cookie_join_save+0xac/0xc0
[   55.553189]  subflow_check_req+0x474/0x550
[   55.553195]  ? ip_route_output_key_hash+0x67/0x90
[   55.553200]  ? xfrm_lookup_route+0x1d/0xa0
[   55.553207]  subflow_v4_route_req+0x8e/0xd0
[   55.553212]  tcp_conn_request+0x31e/0xab0
[   55.553218]  ? selinux_socket_sock_rcv_skb+0x116/0x210
[   55.553224]  ? tcp_rcv_state_process+0x179/0x6d0
[   55.553229]  tcp_rcv_state_process+0x179/0x6d0
[   55.553235]  tcp_v4_do_rcv+0xaf/0x220
[   55.553239]  tcp_v4_rcv+0xce4/0xd80
[   55.553243]  ? ip_route_input_rcu+0x246/0x260
[   55.553248]  ip_protocol_deliver_rcu+0x35/0x1b0
[   55.553253]  ip_local_deliver_finish+0x44/0x50
[   55.553258]  ip_local_deliver+0x6c/0x110
[   55.553262]  ? ip_rcv_finish_core.isra.19+0x5a/0x400
[   55.553267]  ip_rcv+0xd1/0xe0
...

After debugging, I found that in __skb_flow_dissect(), skb->dev and
skb->sk are both NULL, so net is NULL and WARN_ON_ONCE(!net) triggers.
In fact, net is always NULL in this code path, as skb->dev is set to
NULL in tcp_v4_rcv(), and skb->sk is never set.

Code snippet in __skb_flow_dissect() that triggers the warning:
  975         if (skb) {
  976                 if (!net) {
  977                         if (skb->dev)
  978                                 net = dev_net(skb->dev);
  979                         else if (skb->sk)
  980                                 net = sock_net(skb->sk);
  981                 }
  982         }
  983
  984         WARN_ON_ONCE(!net);

So, use a hash derived from seq and the transport header instead.

[1] https://github.com/wg/wrk
[2] https://github.com/ourway/webfsd
[3] https://github.com/pabeni/mptcp-tools

Fixes: 9466a1ccebbe ("mptcp: enable JOIN requests even if cookies are in use")
Suggested-by: Paolo Abeni <pabeni@redhat.com>
Suggested-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Jianguo Wu <wujianguo@chinatelecom.cn>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/mptcp/syncookies.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/net/mptcp/syncookies.c b/net/mptcp/syncookies.c
index abe0fd099746..37127781aee9 100644
--- a/net/mptcp/syncookies.c
+++ b/net/mptcp/syncookies.c
@@ -37,7 +37,21 @@ static spinlock_t join_entry_locks[COOKIE_JOIN_SLOTS] __cacheline_aligned_in_smp
 
 static u32 mptcp_join_entry_hash(struct sk_buff *skb, struct net *net)
 {
-	u32 i = skb_get_hash(skb) ^ net_hash_mix(net);
+	static u32 mptcp_join_hash_secret __read_mostly;
+	struct tcphdr *th = tcp_hdr(skb);
+	u32 seq, i;
+
+	net_get_random_once(&mptcp_join_hash_secret,
+			    sizeof(mptcp_join_hash_secret));
+
+	if (th->syn)
+		seq = TCP_SKB_CB(skb)->seq;
+	else
+		seq = TCP_SKB_CB(skb)->seq - 1;
+
+	i = jhash_3words(seq, net_hash_mix(net),
+			 (__force __u32)th->source << 16 | (__force __u32)th->dest,
+			 mptcp_join_hash_secret);
 
 	return i % ARRAY_SIZE(join_entries);
 }

From 030d37bd1cd2443a1f21db47eb301899bfa45a2a Mon Sep 17 00:00:00 2001
From: Jianguo Wu <wujianguo@chinatelecom.cn>
Date: Fri, 9 Jul 2021 17:20:47 -0700
Subject: [PATCH 138/158] mptcp: remove redundant req destruct in
 subflow_check_req()

In subflow_check_req(), if the subflow sport does not match, the code
puts the msk, destroys the token, and destructs the req, then returns
-EPERM. All of this cleanup is already done by subflow_req_destructor() via:

  tcp_conn_request()
    |--__reqsk_free()
      |--subflow_req_destructor()

So we should remove this redundant code; otherwise
tcp_v4_reqsk_destructor() is called twice, which may double free
inet_rsk(req)->ireq_opt.

Fixes: 5bc56388c74f ("mptcp: add port number check for MP_JOIN")
Signed-off-by: Jianguo Wu <wujianguo@chinatelecom.cn>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/mptcp/subflow.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 66d0b1893d26..b15e2017168d 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -214,11 +214,6 @@ again:
 				 ntohs(inet_sk(sk_listener)->inet_sport),
 				 ntohs(inet_sk((struct sock *)subflow_req->msk)->inet_sport));
 			if (!mptcp_pm_sport_in_anno_list(subflow_req->msk, sk_listener)) {
-				sock_put((struct sock *)subflow_req->msk);
-				mptcp_token_destroy_request(req);
-				tcp_request_sock_ops.destructor(req);
-				subflow_req->msk = NULL;
-				subflow_req->mp_join = 0;
 				SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTSYNRX);
 				return -EPERM;
 			}

From 8547ea5f52dd8ef19b69c25c41b1415481b3503b Mon Sep 17 00:00:00 2001
From: Jianguo Wu <wujianguo@chinatelecom.cn>
Date: Fri, 9 Jul 2021 17:20:48 -0700
Subject: [PATCH 139/158] mptcp: fix syncookie process if mptcp can not_accept
 new subflow

Lots of "TCP: tcp_fin: Impossible, sk->sk_state=7" messages are printed
on the client side when stress testing with wrk and webfsd.

There are at least two cases that may trigger this warning:
1. mptcp is in syncookie mode and the server receives an MP_JOIN SYN
   request. In subflow_check_req(), mptcp_can_accept_new_subflow()
   returns false, so subflow_init_req_cookie_join_save() isn't
   called, i.e. the data present in the MP_JOIN SYN request and the
   random nonce are not stored in the hash table join_entries[],
   but a SYN/ACK is still sent. When the 3rd ACK is received,
   mptcp_token_join_cookie_init_state() returns false and the
   3rd ACK is dropped. Then, if the mptcp connection is closed by
   the client, the client sends a DATA_FIN and an MPTCP FIN. The
   DATA_FIN doesn't carry MP_CAPABLE or MP_JOIN, so
   mptcp_subflow_init_cookie_req() returns 0, the cookie check
   passes, and the MP_JOIN request falls back to normal TCP.
   The server sends a TCP FIN when it closes. On the client side,
   processing that TCP FIN resets the subflow; the code path is:
     tcp_data_queue()->mptcp_incoming_options()
       ->check_fully_established()->mptcp_subflow_reset().
   mptcp_subflow_reset() sets the sock state to TCP_CLOSE,
   so tcp_fin hits TCP_CLOSE and prints the warning.

2. mptcp is in syncookie mode and the server receives the 3rd ACK.
   In mptcp_subflow_init_cookie_req(),
   mptcp_can_accept_new_subflow() returns false and
   subflow_req->mp_join is not set to 1, so
   subflow_syn_recv_sock() does not reset the MP_JOIN subflow
   but falls back to normal TCP, and then the same thing happens
   when the server sends a TCP FIN on close.

For case 1, make subflow_check_req() return -EPERM, so that
tcp_conn_request() drops the MP_JOIN SYN.

For case 2, let subflow_syn_recv_sock() call
mptcp_can_accept_new_subflow(), and do a fatal fallback, sending a
reset.

Fixes: 9466a1ccebbe ("mptcp: enable JOIN requests even if cookies are in use")
Signed-off-by: Jianguo Wu <wujianguo@chinatelecom.cn>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/mptcp/subflow.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index b15e2017168d..966f777d35ce 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -225,6 +225,8 @@ again:
 		if (unlikely(req->syncookie)) {
 			if (mptcp_can_accept_new_subflow(subflow_req->msk))
 				subflow_init_req_cookie_join_save(subflow_req, skb);
+			else
+				return -EPERM;
 		}
 
 		pr_debug("token=%u, remote_nonce=%u msk=%p", subflow_req->token,
@@ -264,9 +266,7 @@ int mptcp_subflow_init_cookie_req(struct request_sock *req,
 		if (!mptcp_token_join_cookie_init_state(subflow_req, skb))
 			return -EINVAL;
 
-		if (mptcp_can_accept_new_subflow(subflow_req->msk))
-			subflow_req->mp_join = 1;
-
+		subflow_req->mp_join = 1;
 		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1;
 	}
 

From 6787b7e350d3552651a3422d3d8980fbc8d65368 Mon Sep 17 00:00:00 2001
From: Jianguo Wu <wujianguo@chinatelecom.cn>
Date: Fri, 9 Jul 2021 17:20:49 -0700
Subject: [PATCH 140/158] mptcp: avoid processing packet if a subflow reset

If check_fully_established() causes a subflow reset, it should not
continue to process the packet in tcp_data_queue().
Add a return value to mptcp_incoming_options(), and return false if a
subflow has been reset, else return true. Then drop the packet in
tcp_data_queue()/tcp_rcv_state_process() if mptcp_incoming_options()
returns false.

Fixes: d582484726c4 ("mptcp: fix fallback for MP_JOIN subflows")
Signed-off-by: Jianguo Wu <wujianguo@chinatelecom.cn>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/mptcp.h  |  5 +++--
 net/ipv4/tcp_input.c | 19 +++++++++++++++----
 net/mptcp/options.c  | 19 +++++++++++++------
 3 files changed, 31 insertions(+), 12 deletions(-)

diff --git a/include/net/mptcp.h b/include/net/mptcp.h
index cb580b06152f..8b5af683a818 100644
--- a/include/net/mptcp.h
+++ b/include/net/mptcp.h
@@ -105,7 +105,7 @@ bool mptcp_synack_options(const struct request_sock *req, unsigned int *size,
 bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
 			       unsigned int *size, unsigned int remaining,
 			       struct mptcp_out_options *opts);
-void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb);
+bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb);
 
 void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
 			 struct mptcp_out_options *opts);
@@ -227,9 +227,10 @@ static inline bool mptcp_established_options(struct sock *sk,
 	return false;
 }
 
-static inline void mptcp_incoming_options(struct sock *sk,
+static inline bool mptcp_incoming_options(struct sock *sk,
 					  struct sk_buff *skb)
 {
+	return true;
 }
 
 static inline void mptcp_skb_ext_move(struct sk_buff *to,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a5a8d0a378b2..149ceb5c94ff 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4247,6 +4247,9 @@ void tcp_reset(struct sock *sk, struct sk_buff *skb)
 {
 	trace_tcp_receive_reset(sk);
 
+	/* mptcp can't tell us to ignore reset pkts,
+	 * so just ignore the return value of mptcp_incoming_options().
+	 */
 	if (sk_is_mptcp(sk))
 		mptcp_incoming_options(sk, skb);
 
@@ -4941,8 +4944,13 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 	bool fragstolen;
 	int eaten;
 
-	if (sk_is_mptcp(sk))
-		mptcp_incoming_options(sk, skb);
+	/* If a subflow has been reset, the packet should not continue
+	 * to be processed, drop the packet.
+	 */
+	if (sk_is_mptcp(sk) && !mptcp_incoming_options(sk, skb)) {
+		__kfree_skb(skb);
+		return;
+	}
 
 	if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
 		__kfree_skb(skb);
@@ -6523,8 +6531,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 	case TCP_CLOSING:
 	case TCP_LAST_ACK:
 		if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
-			if (sk_is_mptcp(sk))
-				mptcp_incoming_options(sk, skb);
+			/* If a subflow has been reset, the packet should not
+			 * continue to be processed, drop the packet.
+			 */
+			if (sk_is_mptcp(sk) && !mptcp_incoming_options(sk, skb))
+				goto discard;
 			break;
 		}
 		fallthrough;
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index b5850afea343..4452455aef7f 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -1035,7 +1035,8 @@ static bool add_addr_hmac_valid(struct mptcp_sock *msk,
 	return hmac == mp_opt->ahmac;
 }
 
-void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
+/* Return false if a subflow has been reset, else return true */
+bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
 {
 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
 	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
@@ -1053,12 +1054,16 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
 			__mptcp_check_push(subflow->conn, sk);
 		__mptcp_data_acked(subflow->conn);
 		mptcp_data_unlock(subflow->conn);
-		return;
+		return true;
 	}
 
 	mptcp_get_options(sk, skb, &mp_opt);
+
+	/* The subflow can be in close state only if check_fully_established()
+	 * just sent a reset. If so, tell the caller to ignore the current packet.
+	 */
 	if (!check_fully_established(msk, sk, subflow, skb, &mp_opt))
-		return;
+		return sk->sk_state != TCP_CLOSE;
 
 	if (mp_opt.fastclose &&
 	    msk->local_key == mp_opt.rcvr_key) {
@@ -1100,7 +1105,7 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
 	}
 
 	if (!mp_opt.dss)
-		return;
+		return true;
 
 	/* we can't wait for recvmsg() to update the ack_seq, otherwise
 	 * monodirectional flows will stuck
@@ -1119,12 +1124,12 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
 		    schedule_work(&msk->work))
 			sock_hold(subflow->conn);
 
-		return;
+		return true;
 	}
 
 	mpext = skb_ext_add(skb, SKB_EXT_MPTCP);
 	if (!mpext)
-		return;
+		return true;
 
 	memset(mpext, 0, sizeof(*mpext));
 
@@ -1153,6 +1158,8 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
 		if (mpext->csum_reqd)
 			mpext->csum = mp_opt.csum;
 	}
+
+	return true;
 }
 
 static void mptcp_set_rwin(const struct tcp_sock *tp)

From a7da441621c7945fbfd43ed239c93b8073cda502 Mon Sep 17 00:00:00 2001
From: Jianguo Wu <wujianguo@chinatelecom.cn>
Date: Fri, 9 Jul 2021 17:20:50 -0700
Subject: [PATCH 141/158] selftests: mptcp: fix case multiple subflows limited
 by server

After patch "mptcp: fix syncookie process if mptcp can not_accept new
subflow", if the subflow is limited, the MP_JOIN SYN is dropped and no
SYN/ACK is sent in reply.

So in the "multiple subflows limited by server" case, the expected
SYN/ACK count should be 1.

Fixes: 00587187ad30 ("selftests: mptcp: add test cases for mptcp join tests with syn cookies")
Reported-by: kernel test robot <oliver.sang@intel.com>
Signed-off-by: Jianguo Wu <wujianguo@chinatelecom.cn>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/mptcp/mptcp_join.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh
index 9a191c1a5de8..f02f4de2f3a0 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
@@ -1409,7 +1409,7 @@ syncookies_tests()
 	ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
 	ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow
 	run_tests $ns1 $ns2 10.0.1.1
-	chk_join_nr "subflows limited by server w cookies" 2 2 1
+	chk_join_nr "subflows limited by server w cookies" 2 1 1
 
 	# test signal address with cookies
 	reset_with_cookies

From ce599c516386f09ca30848a1a4eb93d3fffbe187 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 9 Jul 2021 17:20:51 -0700
Subject: [PATCH 142/158] mptcp: properly account bulk freed memory

After commit 879526030c8b ("mptcp: protect the rx path with
the msk socket spinlock") the rmem currently used by a given
msk is really sk_rmem_alloc - rmem_released.

The safety check in mptcp_data_ready() does not take the above
into account; as a result, legitimate incoming data is kept in the
subflow receive queue for no reason, delaying or blocking
MPTCP-level ack generation.

This change addresses the issue by introducing a new helper to fetch
the rmem memory and using it as needed. Additionally, add a MIB
counter for the exceptional event described above - the peer is
misbehaving.

Finally, introduce the required annotation when rmem_released is
updated.

Fixes: 879526030c8b ("mptcp: protect the rx path with the msk socket spinlock")
Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/211
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/mptcp/mib.c      |  1 +
 net/mptcp/mib.h      |  1 +
 net/mptcp/protocol.c | 12 +++++++-----
 net/mptcp/protocol.h | 10 +++++++++-
 4 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c
index 52ea2517e856..ff2cc0e3273d 100644
--- a/net/mptcp/mib.c
+++ b/net/mptcp/mib.c
@@ -44,6 +44,7 @@ static const struct snmp_mib mptcp_snmp_list[] = {
 	SNMP_MIB_ITEM("RmSubflow", MPTCP_MIB_RMSUBFLOW),
 	SNMP_MIB_ITEM("MPPrioTx", MPTCP_MIB_MPPRIOTX),
 	SNMP_MIB_ITEM("MPPrioRx", MPTCP_MIB_MPPRIORX),
+	SNMP_MIB_ITEM("RcvPruned", MPTCP_MIB_RCVPRUNED),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h
index 193466c9b549..0663cb12b448 100644
--- a/net/mptcp/mib.h
+++ b/net/mptcp/mib.h
@@ -37,6 +37,7 @@ enum linux_mptcp_mib_field {
 	MPTCP_MIB_RMSUBFLOW,		/* Remove a subflow */
 	MPTCP_MIB_MPPRIOTX,		/* Transmit a MP_PRIO */
 	MPTCP_MIB_MPPRIORX,		/* Received a MP_PRIO */
+	MPTCP_MIB_RCVPRUNED,		/* Incoming packet dropped due to memory limit */
 	__MPTCP_MIB_MAX
 };
 
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 7a5afa8c6866..a88924947815 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -474,7 +474,7 @@ static void mptcp_cleanup_rbuf(struct mptcp_sock *msk)
 	bool cleanup, rx_empty;
 
 	cleanup = (space > 0) && (space >= (old_space << 1));
-	rx_empty = !atomic_read(&sk->sk_rmem_alloc);
+	rx_empty = !__mptcp_rmem(sk);
 
 	mptcp_for_each_subflow(msk, subflow) {
 		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
@@ -720,8 +720,10 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk)
 		sk_rbuf = ssk_rbuf;
 
 	/* over limit? can't append more skbs to msk, Also, no need to wake-up*/
-	if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf)
+	if (__mptcp_rmem(sk) > sk_rbuf) {
+		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED);
 		return;
+	}
 
 	/* Wake-up the reader only for in-sequence data */
 	mptcp_data_lock(sk);
@@ -1754,7 +1756,7 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
 		if (!(flags & MSG_PEEK)) {
 			/* we will bulk release the skb memory later */
 			skb->destructor = NULL;
-			msk->rmem_released += skb->truesize;
+			WRITE_ONCE(msk->rmem_released, msk->rmem_released + skb->truesize);
 			__skb_unlink(skb, &msk->receive_queue);
 			__kfree_skb(skb);
 		}
@@ -1873,7 +1875,7 @@ static void __mptcp_update_rmem(struct sock *sk)
 
 	atomic_sub(msk->rmem_released, &sk->sk_rmem_alloc);
 	sk_mem_uncharge(sk, msk->rmem_released);
-	msk->rmem_released = 0;
+	WRITE_ONCE(msk->rmem_released, 0);
 }
 
 static void __mptcp_splice_receive_queue(struct sock *sk)
@@ -2380,7 +2382,7 @@ static int __mptcp_init_sock(struct sock *sk)
 	msk->out_of_order_queue = RB_ROOT;
 	msk->first_pending = NULL;
 	msk->wmem_reserved = 0;
-	msk->rmem_released = 0;
+	WRITE_ONCE(msk->rmem_released, 0);
 	msk->tx_pending_data = 0;
 
 	msk->first = NULL;
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 426ed80fe72f..0f0c026c5f8b 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -296,9 +296,17 @@ static inline struct mptcp_sock *mptcp_sk(const struct sock *sk)
 	return (struct mptcp_sock *)sk;
 }
 
+/* the msk socket don't use the backlog, also account for the bulk
+ * free memory
+ */
+static inline int __mptcp_rmem(const struct sock *sk)
+{
+	return atomic_read(&sk->sk_rmem_alloc) - READ_ONCE(mptcp_sk(sk)->rmem_released);
+}
+
 static inline int __mptcp_space(const struct sock *sk)
 {
-	return tcp_space(sk) + READ_ONCE(mptcp_sk(sk)->rmem_released);
+	return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - __mptcp_rmem(sk));
 }
 
 static inline struct mptcp_data_frag *mptcp_send_head(const struct sock *sk)

From 222722bc6ebfabf5d54467070f05cf9c0a55ea8c Mon Sep 17 00:00:00 2001
From: Yunjian Wang <wangyunjian@huawei.com>
Date: Sat, 10 Jul 2021 11:32:49 +0800
Subject: [PATCH 143/158] virtio_net: check virtqueue_add_sgs() return value

As virtqueue_add_sgs() can fail, we should check the return value.

Addresses-Coverity-ID: 1464439 ("Unchecked return value")
Signed-off-by: Yunjian Wang <wangyunjian@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/virtio_net.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index b0b81458ca94..13952e2dba5e 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1743,6 +1743,7 @@ static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
 {
 	struct scatterlist *sgs[4], hdr, stat;
 	unsigned out_num = 0, tmp;
+	int ret;
 
 	/* Caller should know better */
 	BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ));
@@ -1762,7 +1763,12 @@ static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
 	sgs[out_num] = &stat;
 
 	BUG_ON(out_num + 1 > ARRAY_SIZE(sgs));
-	virtqueue_add_sgs(vi->cvq, sgs, out_num, 1, vi, GFP_ATOMIC);
+	ret = virtqueue_add_sgs(vi->cvq, sgs, out_num, 1, vi, GFP_ATOMIC);
+	if (ret < 0) {
+		dev_warn(&vi->vdev->dev,
+			 "Failed to add sgs for command vq: %d\n.", ret);
+		return false;
+	}
 
 	if (unlikely(!virtqueue_kick(vi->cvq)))
 		return vi->ctrl->status == VIRTIO_NET_OK;

From 84f7e0bb4809f4497124b6b6904c07c8a0c73c58 Mon Sep 17 00:00:00 2001
From: kernel test robot <lkp@intel.com>
Date: Sun, 11 Jul 2021 18:12:56 +0200
Subject: [PATCH 144/158] dsa: fix for_each_child.cocci warnings

For_each_available_child_of_node should have of_node_put() before
return around line 423.

Generated by: scripts/coccinelle/iterators/for_each_child.cocci

CC: Alexander Lobakin <alobakin@pm.me>
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: kernel test robot <lkp@intel.com>
Signed-off-by: Julia Lawall <julia.lawall@inria.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/microchip/ksz_common.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c
index a7e5ac60baef..1542bfb8b5e5 100644
--- a/drivers/net/dsa/microchip/ksz_common.c
+++ b/drivers/net/dsa/microchip/ksz_common.c
@@ -419,8 +419,10 @@ int ksz_switch_register(struct ksz_device *dev,
 				if (of_property_read_u32(port, "reg",
 							 &port_num))
 					continue;
-				if (!(dev->port_mask & BIT(port_num)))
+				if (!(dev->port_mask & BIT(port_num))) {
+					of_node_put(port);
 					return -EINVAL;
+				}
 				of_get_phy_mode(port,
 						&dev->ports[port_num].interface);
 			}

From a5de4be0aaaa66a2fa98e8a33bdbed3bd0682804 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Beh=C3=BAn?= <kabel@kernel.org>
Date: Sun, 11 Jul 2021 18:38:15 +0200
Subject: [PATCH 145/158] net: phy: marvell10g: fix differentiation of 88X3310
 from 88X3340
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It seems that we cannot differentiate 88X3310 from 88X3340 by simply
looking at bit 3 of revision ID. This only works on revisions A0 and A1.
On revision B0, this bit is always 1.

Instead use the 3.d00d register for differentiation, since this register
contains information about number of ports on the device.

Fixes: 9885d016ffa9 ("net: phy: marvell10g: add separate structure for 88X3340")
Signed-off-by: Marek Behún <kabel@kernel.org>
Reported-by: Matteo Croce <mcroce@linux.microsoft.com>
Tested-by: Matteo Croce <mcroce@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/marvell10g.c | 40 +++++++++++++++++++++++++++++++-----
 include/linux/marvell_phy.h  |  6 +-----
 2 files changed, 36 insertions(+), 10 deletions(-)

diff --git a/drivers/net/phy/marvell10g.c b/drivers/net/phy/marvell10g.c
index bbbc6ac8fa82..53a433442803 100644
--- a/drivers/net/phy/marvell10g.c
+++ b/drivers/net/phy/marvell10g.c
@@ -78,6 +78,11 @@ enum {
 	/* Temperature read register (88E2110 only) */
 	MV_PCS_TEMP		= 0x8042,
 
+	/* Number of ports on the device */
+	MV_PCS_PORT_INFO	= 0xd00d,
+	MV_PCS_PORT_INFO_NPORTS_MASK	= 0x0380,
+	MV_PCS_PORT_INFO_NPORTS_SHIFT	= 7,
+
 	/* These registers appear at 0x800X and 0xa00X - the 0xa00X control
 	 * registers appear to set themselves to the 0x800X when AN is
 	 * restarted, but status registers appear readable from either.
@@ -966,6 +971,30 @@ static const struct mv3310_chip mv2111_type = {
 #endif
 };
 
+static int mv3310_get_number_of_ports(struct phy_device *phydev)
+{
+	int ret;
+
+	ret = phy_read_mmd(phydev, MDIO_MMD_PCS, MV_PCS_PORT_INFO);
+	if (ret < 0)
+		return ret;
+
+	ret &= MV_PCS_PORT_INFO_NPORTS_MASK;
+	ret >>= MV_PCS_PORT_INFO_NPORTS_SHIFT;
+
+	return ret + 1;
+}
+
+static int mv3310_match_phy_device(struct phy_device *phydev)
+{
+	return mv3310_get_number_of_ports(phydev) == 1;
+}
+
+static int mv3340_match_phy_device(struct phy_device *phydev)
+{
+	return mv3310_get_number_of_ports(phydev) == 4;
+}
+
 static int mv211x_match_phy_device(struct phy_device *phydev, bool has_5g)
 {
 	int val;
@@ -994,7 +1023,8 @@ static int mv2111_match_phy_device(struct phy_device *phydev)
 static struct phy_driver mv3310_drivers[] = {
 	{
 		.phy_id		= MARVELL_PHY_ID_88X3310,
-		.phy_id_mask	= MARVELL_PHY_ID_88X33X0_MASK,
+		.phy_id_mask	= MARVELL_PHY_ID_MASK,
+		.match_phy_device = mv3310_match_phy_device,
 		.name		= "mv88x3310",
 		.driver_data	= &mv3310_type,
 		.get_features	= mv3310_get_features,
@@ -1011,8 +1041,9 @@ static struct phy_driver mv3310_drivers[] = {
 		.set_loopback	= genphy_c45_loopback,
 	},
 	{
-		.phy_id		= MARVELL_PHY_ID_88X3340,
-		.phy_id_mask	= MARVELL_PHY_ID_88X33X0_MASK,
+		.phy_id		= MARVELL_PHY_ID_88X3310,
+		.phy_id_mask	= MARVELL_PHY_ID_MASK,
+		.match_phy_device = mv3340_match_phy_device,
 		.name		= "mv88x3340",
 		.driver_data	= &mv3340_type,
 		.get_features	= mv3310_get_features,
@@ -1069,8 +1100,7 @@ static struct phy_driver mv3310_drivers[] = {
 module_phy_driver(mv3310_drivers);
 
 static struct mdio_device_id __maybe_unused mv3310_tbl[] = {
-	{ MARVELL_PHY_ID_88X3310, MARVELL_PHY_ID_88X33X0_MASK },
-	{ MARVELL_PHY_ID_88X3340, MARVELL_PHY_ID_88X33X0_MASK },
+	{ MARVELL_PHY_ID_88X3310, MARVELL_PHY_ID_MASK },
 	{ MARVELL_PHY_ID_88E2110, MARVELL_PHY_ID_MASK },
 	{ },
 };
diff --git a/include/linux/marvell_phy.h b/include/linux/marvell_phy.h
index acee44b9db26..0f06c2287b52 100644
--- a/include/linux/marvell_phy.h
+++ b/include/linux/marvell_phy.h
@@ -22,14 +22,10 @@
 #define MARVELL_PHY_ID_88E1545		0x01410ea0
 #define MARVELL_PHY_ID_88E1548P		0x01410ec0
 #define MARVELL_PHY_ID_88E3016		0x01410e60
+#define MARVELL_PHY_ID_88X3310		0x002b09a0
 #define MARVELL_PHY_ID_88E2110		0x002b09b0
 #define MARVELL_PHY_ID_88X2222		0x01410f10
 
-/* PHY IDs and mask for Alaska 10G PHYs */
-#define MARVELL_PHY_ID_88X33X0_MASK	0xfffffff8
-#define MARVELL_PHY_ID_88X3310		0x002b09a0
-#define MARVELL_PHY_ID_88X3340		0x002b09a8
-
 /* Marvel 88E1111 in Finisar SFP module with modified PHY ID */
 #define MARVELL_PHY_ID_88E1111_FINISAR	0x01ff0cc0
 

From 04bef83a3358946bfc98a5ecebd1b0003d83d882 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@nvidia.com>
Date: Sun, 11 Jul 2021 12:56:28 +0300
Subject: [PATCH 146/158] net: bridge: multicast: fix PIM hello router port
 marking race

When a PIM hello packet is received on a bridge port with multicast
snooping enabled, we mark it as a router port automatically, that
includes adding that port to the router port list. The multicast lock
protects that list, but it is not acquired in the PIM message case
leading to a race condition, we need to take it to fix the race.

Cc: stable@vger.kernel.org
Fixes: 91b02d3d133b ("bridge: mcast: add router port on PIM hello message")
Signed-off-by: Nikolay Aleksandrov <nikolay@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_multicast.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 53c3a9d80d9c..3bbbc6d7b7c3 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -3264,7 +3264,9 @@ static void br_multicast_pim(struct net_bridge *br,
 	    pim_hdr_type(pimhdr) != PIM_TYPE_HELLO)
 		return;
 
+	spin_lock(&br->multicast_lock);
 	br_ip4_multicast_mark_router(br, port);
+	spin_unlock(&br->multicast_lock);
 }
 
 static int br_ip4_multicast_mrd_rcv(struct net_bridge *br,

From 000b7287b67555fee39d39fff75229dedde0dcbf Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@nvidia.com>
Date: Sun, 11 Jul 2021 12:56:29 +0300
Subject: [PATCH 147/158] net: bridge: multicast: fix MRD advertisement router
 port marking race

When an MRD advertisement is received on a bridge port with multicast
snooping enabled, we mark it as a router port automatically, that
includes adding that port to the router port list. The multicast lock
protects that list, but it is not acquired in the MRD advertisement case
leading to a race condition, we need to take it to fix the race.

Cc: stable@vger.kernel.org
Cc: linus.luessing@c0d3.blue
Fixes: 4b3087c7e37f ("bridge: Snoop Multicast Router Advertisements")
Signed-off-by: Nikolay Aleksandrov <nikolay@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_multicast.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 3bbbc6d7b7c3..d0434dc8c03b 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -3277,7 +3277,9 @@ static int br_ip4_multicast_mrd_rcv(struct net_bridge *br,
 	    igmp_hdr(skb)->type != IGMP_MRDISC_ADV)
 		return -ENOMSG;
 
+	spin_lock(&br->multicast_lock);
 	br_ip4_multicast_mark_router(br, port);
+	spin_unlock(&br->multicast_lock);
 
 	return 0;
 }
@@ -3345,7 +3347,9 @@ static void br_ip6_multicast_mrd_rcv(struct net_bridge *br,
 	if (icmp6_hdr(skb)->icmp6_type != ICMPV6_MRDISC_ADV)
 		return;
 
+	spin_lock(&br->multicast_lock);
 	br_ip6_multicast_mark_router(br, port);
+	spin_unlock(&br->multicast_lock);
 }
 
 static int br_multicast_ipv6_rcv(struct net_bridge *br,

From a1739c307a072e46473a2ba239eb60e6d711c96c Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sun, 11 Jul 2021 15:31:47 -0700
Subject: [PATCH 148/158] net: hdlc: rename 'mod_init' & 'mod_exit' functions
 to be module-specific

Rename module_init & module_exit functions that are named
"mod_init" and "mod_exit" so that they are unique in both the
System.map file and in initcall_debug output instead of showing
up as almost anonymous "mod_init".

This is helpful for debugging and in determining how long certain
module_init calls take to execute.

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Krzysztof Halasa <khc@pm.waw.pl>
Cc: netdev@vger.kernel.org
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Martin Schiller <ms@dev.tdt.de>
Cc: linux-x25@vger.kernel.org
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/wan/hdlc_cisco.c   | 8 ++++----
 drivers/net/wan/hdlc_fr.c      | 8 ++++----
 drivers/net/wan/hdlc_ppp.c     | 8 ++++----
 drivers/net/wan/hdlc_raw.c     | 8 ++++----
 drivers/net/wan/hdlc_raw_eth.c | 8 ++++----
 drivers/net/wan/hdlc_x25.c     | 8 ++++----
 6 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/drivers/net/wan/hdlc_cisco.c b/drivers/net/wan/hdlc_cisco.c
index 349ca18088e8..c54fdae950fb 100644
--- a/drivers/net/wan/hdlc_cisco.c
+++ b/drivers/net/wan/hdlc_cisco.c
@@ -364,19 +364,19 @@ static int cisco_ioctl(struct net_device *dev, struct ifreq *ifr)
 	return -EINVAL;
 }
 
-static int __init mod_init(void)
+static int __init hdlc_cisco_init(void)
 {
 	register_hdlc_protocol(&proto);
 	return 0;
 }
 
-static void __exit mod_exit(void)
+static void __exit hdlc_cisco_exit(void)
 {
 	unregister_hdlc_protocol(&proto);
 }
 
-module_init(mod_init);
-module_exit(mod_exit);
+module_init(hdlc_cisco_init);
+module_exit(hdlc_cisco_exit);
 
 MODULE_AUTHOR("Krzysztof Halasa <khc@pm.waw.pl>");
 MODULE_DESCRIPTION("Cisco HDLC protocol support for generic HDLC");
diff --git a/drivers/net/wan/hdlc_fr.c b/drivers/net/wan/hdlc_fr.c
index 72250fe0a1df..25e3564ce118 100644
--- a/drivers/net/wan/hdlc_fr.c
+++ b/drivers/net/wan/hdlc_fr.c
@@ -1279,19 +1279,19 @@ static int fr_ioctl(struct net_device *dev, struct ifreq *ifr)
 	return -EINVAL;
 }
 
-static int __init mod_init(void)
+static int __init hdlc_fr_init(void)
 {
 	register_hdlc_protocol(&proto);
 	return 0;
 }
 
-static void __exit mod_exit(void)
+static void __exit hdlc_fr_exit(void)
 {
 	unregister_hdlc_protocol(&proto);
 }
 
-module_init(mod_init);
-module_exit(mod_exit);
+module_init(hdlc_fr_init);
+module_exit(hdlc_fr_exit);
 
 MODULE_AUTHOR("Krzysztof Halasa <khc@pm.waw.pl>");
 MODULE_DESCRIPTION("Frame-Relay protocol support for generic HDLC");
diff --git a/drivers/net/wan/hdlc_ppp.c b/drivers/net/wan/hdlc_ppp.c
index 834be2ae3e9e..b81ecf432a0c 100644
--- a/drivers/net/wan/hdlc_ppp.c
+++ b/drivers/net/wan/hdlc_ppp.c
@@ -705,20 +705,20 @@ static int ppp_ioctl(struct net_device *dev, struct ifreq *ifr)
 	return -EINVAL;
 }
 
-static int __init mod_init(void)
+static int __init hdlc_ppp_init(void)
 {
 	skb_queue_head_init(&tx_queue);
 	register_hdlc_protocol(&proto);
 	return 0;
 }
 
-static void __exit mod_exit(void)
+static void __exit hdlc_ppp_exit(void)
 {
 	unregister_hdlc_protocol(&proto);
 }
 
-module_init(mod_init);
-module_exit(mod_exit);
+module_init(hdlc_ppp_init);
+module_exit(hdlc_ppp_exit);
 
 MODULE_AUTHOR("Krzysztof Halasa <khc@pm.waw.pl>");
 MODULE_DESCRIPTION("PPP protocol support for generic HDLC");
diff --git a/drivers/net/wan/hdlc_raw.c b/drivers/net/wan/hdlc_raw.c
index 388fcc09b4dd..54d28496fefd 100644
--- a/drivers/net/wan/hdlc_raw.c
+++ b/drivers/net/wan/hdlc_raw.c
@@ -90,7 +90,7 @@ static int raw_ioctl(struct net_device *dev, struct ifreq *ifr)
 }
 
 
-static int __init mod_init(void)
+static int __init hdlc_raw_init(void)
 {
 	register_hdlc_protocol(&proto);
 	return 0;
@@ -98,14 +98,14 @@ static int __init mod_init(void)
 
 
 
-static void __exit mod_exit(void)
+static void __exit hdlc_raw_exit(void)
 {
 	unregister_hdlc_protocol(&proto);
 }
 
 
-module_init(mod_init);
-module_exit(mod_exit);
+module_init(hdlc_raw_init);
+module_exit(hdlc_raw_exit);
 
 MODULE_AUTHOR("Krzysztof Halasa <khc@pm.waw.pl>");
 MODULE_DESCRIPTION("Raw HDLC protocol support for generic HDLC");
diff --git a/drivers/net/wan/hdlc_raw_eth.c b/drivers/net/wan/hdlc_raw_eth.c
index c70a518b8b47..927596276a07 100644
--- a/drivers/net/wan/hdlc_raw_eth.c
+++ b/drivers/net/wan/hdlc_raw_eth.c
@@ -110,7 +110,7 @@ static int raw_eth_ioctl(struct net_device *dev, struct ifreq *ifr)
 }
 
 
-static int __init mod_init(void)
+static int __init hdlc_eth_init(void)
 {
 	register_hdlc_protocol(&proto);
 	return 0;
@@ -118,14 +118,14 @@ static int __init mod_init(void)
 
 
 
-static void __exit mod_exit(void)
+static void __exit hdlc_eth_exit(void)
 {
 	unregister_hdlc_protocol(&proto);
 }
 
 
-module_init(mod_init);
-module_exit(mod_exit);
+module_init(hdlc_eth_init);
+module_exit(hdlc_eth_exit);
 
 MODULE_AUTHOR("Krzysztof Halasa <khc@pm.waw.pl>");
 MODULE_DESCRIPTION("Ethernet encapsulation support for generic HDLC");
diff --git a/drivers/net/wan/hdlc_x25.c b/drivers/net/wan/hdlc_x25.c
index d2bf72bf3bd7..9b7ebf8bd85c 100644
--- a/drivers/net/wan/hdlc_x25.c
+++ b/drivers/net/wan/hdlc_x25.c
@@ -365,19 +365,19 @@ static int x25_ioctl(struct net_device *dev, struct ifreq *ifr)
 	return -EINVAL;
 }
 
-static int __init mod_init(void)
+static int __init hdlc_x25_init(void)
 {
 	register_hdlc_protocol(&proto);
 	return 0;
 }
 
-static void __exit mod_exit(void)
+static void __exit hdlc_x25_exit(void)
 {
 	unregister_hdlc_protocol(&proto);
 }
 
-module_init(mod_init);
-module_exit(mod_exit);
+module_init(hdlc_x25_init);
+module_exit(hdlc_x25_exit);
 
 MODULE_AUTHOR("Krzysztof Halasa <khc@pm.waw.pl>");
 MODULE_DESCRIPTION("X.25 protocol support for generic HDLC");

From 5796015fa968a3349027a27dcd04c71d95c53ba5 Mon Sep 17 00:00:00 2001
From: Vasily Averin <vvs@virtuozzo.com>
Date: Mon, 12 Jul 2021 09:45:06 +0300
Subject: [PATCH 149/158] ipv6: allocate enough headroom in
 ip6_finish_output2()

When TEE target mirrors traffic to another interface, sk_buff may
not have enough headroom to be processed correctly.
ip_finish_output2() detects this situation for ipv4 and allocates
a new skb with enough headroom. However, ipv6 lacks this logic in
ip6_finish_output2() and it leads to skb_under_panic:

 skbuff: skb_under_panic: text:ffffffffc0866ad4 len:96 put:24
 head:ffff97be85e31800 data:ffff97be85e317f8 tail:0x58 end:0xc0 dev:gre0
 ------------[ cut here ]------------
 kernel BUG at net/core/skbuff.c:110!
 invalid opcode: 0000 [#1] SMP PTI
 CPU: 2 PID: 393 Comm: kworker/2:2 Tainted: G           OE     5.13.0 #13
 Hardware name: Virtuozzo KVM, BIOS 1.11.0-2.vz7.4 04/01/2014
 Workqueue: ipv6_addrconf addrconf_dad_work
 RIP: 0010:skb_panic+0x48/0x4a
 Call Trace:
  skb_push.cold.111+0x10/0x10
  ipgre_header+0x24/0xf0 [ip_gre]
  neigh_connected_output+0xae/0xf0
  ip6_finish_output2+0x1a8/0x5a0
  ip6_output+0x5c/0x110
  nf_dup_ipv6+0x158/0x1000 [nf_dup_ipv6]
  tee_tg6+0x2e/0x40 [xt_TEE]
  ip6t_do_table+0x294/0x470 [ip6_tables]
  nf_hook_slow+0x44/0xc0
  nf_hook.constprop.34+0x72/0xe0
  ndisc_send_skb+0x20d/0x2e0
  ndisc_send_ns+0xd1/0x210
  addrconf_dad_work+0x3c8/0x540
  process_one_work+0x1d1/0x370
  worker_thread+0x30/0x390
  kthread+0x116/0x130
  ret_from_fork+0x22/0x30

Signed-off-by: Vasily Averin <vvs@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_output.c | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index d4ee2169afd8..01bea76e3891 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -60,10 +60,38 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
 {
 	struct dst_entry *dst = skb_dst(skb);
 	struct net_device *dev = dst->dev;
+	unsigned int hh_len = LL_RESERVED_SPACE(dev);
+	int delta = hh_len - skb_headroom(skb);
 	const struct in6_addr *nexthop;
 	struct neighbour *neigh;
 	int ret;
 
+	/* Be paranoid, rather than too clever. */
+	if (unlikely(delta > 0) && dev->header_ops) {
+		/* pskb_expand_head() might crash, if skb is shared */
+		if (skb_shared(skb)) {
+			struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
+
+			if (likely(nskb)) {
+				if (skb->sk)
+					skb_set_owner_w(nskb, skb->sk);
+				consume_skb(skb);
+			} else {
+				kfree_skb(skb);
+			}
+			skb = nskb;
+		}
+		if (skb &&
+		    pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
+			kfree_skb(skb);
+			skb = NULL;
+		}
+		if (!skb) {
+			IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
+			return -ENOMEM;
+		}
+	}
+
 	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 

From 71ce9d92fc7089f287c3e95a981bdec7545a8588 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Mon, 12 Jul 2021 15:37:50 +0100
Subject: [PATCH 150/158] octeontx2-pf: Fix uninitialized boolean variable pps

In the case where act->id is FLOW_ACTION_POLICE and also
act->police.rate_bytes_ps > 0 or act->police.rate_pkt_ps is not > 0
the boolean variable pps contains an uninitialized value when
function otx2_tc_act_set_police is called. Fix this by initializing
pps to false.

Addresses-Coverity: ("Uninitialized scalar variable")
Fixes: 68fbff68dbea ("octeontx2-pf: Add police action for TC flower")
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c
index 905fc02a7dfe..972b202b9884 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c
@@ -288,7 +288,7 @@ static int otx2_tc_parse_actions(struct otx2_nic *nic,
 	struct otx2_nic *priv;
 	u32 burst, mark = 0;
 	u8 nr_police = 0;
-	bool pps;
+	bool pps = false;
 	u64 rate;
 	int i;
 

From 01757f536ac825e3614d583fee9acb48c64ed084 Mon Sep 17 00:00:00 2001
From: Yajun Deng <yajun.deng@linux.dev>
Date: Tue, 13 Jul 2021 10:48:24 +0800
Subject: [PATCH 151/158] net: Use nlmsg_unicast() instead of netlink_unicast()

nlmsg_unicast() already contains the 'if (err > 0)' check, so use
nlmsg_unicast() instead of netlink_unicast(); this is more concise.

v2: remove the change in netfilter.

Signed-off-by: Yajun Deng <yajun.deng@linux.dev>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_frontend.c  | 2 +-
 net/ipv4/inet_diag.c     | 5 +----
 net/ipv4/raw_diag.c      | 7 ++-----
 net/ipv4/udp_diag.c      | 6 ++----
 net/mptcp/mptcp_diag.c   | 6 ++----
 net/netlink/af_netlink.c | 2 +-
 net/sctp/diag.c          | 6 ++----
 net/unix/diag.c          | 6 ++----
 8 files changed, 13 insertions(+), 27 deletions(-)

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index a933bd6345b1..9fe13e4f5d08 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1376,7 +1376,7 @@ static void nl_fib_input(struct sk_buff *skb)
 	portid = NETLINK_CB(skb).portid;      /* netlink portid */
 	NETLINK_CB(skb).portid = 0;        /* from kernel */
 	NETLINK_CB(skb).dst_group = 0;  /* unicast */
-	netlink_unicast(net->ipv4.fibnl, skb, portid, MSG_DONTWAIT);
+	nlmsg_unicast(net->ipv4.fibnl, skb, portid);
 }
 
 static int __net_init nl_fib_lookup_init(struct net *net)
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index e65f4ef024a4..ef7897226f08 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -580,10 +580,7 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo,
 		nlmsg_free(rep);
 		goto out;
 	}
-	err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid,
-			      MSG_DONTWAIT);
-	if (err > 0)
-		err = 0;
+	err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid);
 
 out:
 	if (sk)
diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c
index 1b5b8af27aaf..ccacbde30a2c 100644
--- a/net/ipv4/raw_diag.c
+++ b/net/ipv4/raw_diag.c
@@ -119,11 +119,8 @@ static int raw_diag_dump_one(struct netlink_callback *cb,
 		return err;
 	}
 
-	err = netlink_unicast(net->diag_nlsk, rep,
-			      NETLINK_CB(in_skb).portid,
-			      MSG_DONTWAIT);
-	if (err > 0)
-		err = 0;
+	err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid);
+
 	return err;
 }
 
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index b2cee9a307d4..1ed8c4d78e5c 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -77,10 +77,8 @@ static int udp_dump_one(struct udp_table *tbl,
 		kfree_skb(rep);
 		goto out;
 	}
-	err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid,
-			      MSG_DONTWAIT);
-	if (err > 0)
-		err = 0;
+	err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid);
+
 out:
 	if (sk)
 		sock_put(sk);
diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c
index 8f88ddeab6a2..f48eb6315bbb 100644
--- a/net/mptcp/mptcp_diag.c
+++ b/net/mptcp/mptcp_diag.c
@@ -57,10 +57,8 @@ static int mptcp_diag_dump_one(struct netlink_callback *cb,
 		kfree_skb(rep);
 		goto out;
 	}
-	err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid,
-			      MSG_DONTWAIT);
-	if (err > 0)
-		err = 0;
+	err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid);
+
 out:
 	sock_put(sk);
 
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index d233ac4a91b6..380f95aacdec 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -2471,7 +2471,7 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err,
 
 	nlmsg_end(skb, rep);
 
-	netlink_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).portid, MSG_DONTWAIT);
+	nlmsg_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).portid);
 }
 EXPORT_SYMBOL(netlink_ack);
 
diff --git a/net/sctp/diag.c b/net/sctp/diag.c
index 493fc01e5d2b..760b367644c1 100644
--- a/net/sctp/diag.c
+++ b/net/sctp/diag.c
@@ -284,10 +284,8 @@ static int sctp_tsp_dump_one(struct sctp_transport *tsp, void *p)
 		goto out;
 	}
 
-	err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid,
-			      MSG_DONTWAIT);
-	if (err > 0)
-		err = 0;
+	err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid);
+
 out:
 	return err;
 }
diff --git a/net/unix/diag.c b/net/unix/diag.c
index 9ff64f9df1f3..7e7d7f45685a 100644
--- a/net/unix/diag.c
+++ b/net/unix/diag.c
@@ -295,10 +295,8 @@ again:
 
 		goto again;
 	}
-	err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid,
-			      MSG_DONTWAIT);
-	if (err > 0)
-		err = 0;
+	err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid);
+
 out:
 	if (sk)
 		sock_put(sk);

From e56c6bbd98dc1cefb6f9c5d795fd29016e4f2fe7 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Tue, 13 Jul 2021 12:33:50 +0300
Subject: [PATCH 152/158] net: ocelot: fix switchdev objects synced for wrong
 netdev with LAG offload

The point with a *dev and a *brport_dev is that when we have a LAG net
device that is a bridge port, *dev is an ocelot net device and
*brport_dev is the bonding/team net device. The ocelot net device
beneath the LAG does not exist from the bridge's perspective, so we need
to sync the switchdev objects belonging to the brport_dev and not to the
dev.

Fixes: e4bd44e89dcf ("net: ocelot: replay switchdev events when joining bridge")
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mscc/ocelot_net.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c
index 3e89e34f86d5..e9d260d84bf3 100644
--- a/drivers/net/ethernet/mscc/ocelot_net.c
+++ b/drivers/net/ethernet/mscc/ocelot_net.c
@@ -1298,6 +1298,7 @@ static int ocelot_netdevice_lag_leave(struct net_device *dev,
 }
 
 static int ocelot_netdevice_changeupper(struct net_device *dev,
+					struct net_device *brport_dev,
 					struct netdev_notifier_changeupper_info *info)
 {
 	struct netlink_ext_ack *extack;
@@ -1307,11 +1308,11 @@ static int ocelot_netdevice_changeupper(struct net_device *dev,
 
 	if (netif_is_bridge_master(info->upper_dev)) {
 		if (info->linking)
-			err = ocelot_netdevice_bridge_join(dev, dev,
+			err = ocelot_netdevice_bridge_join(dev, brport_dev,
 							   info->upper_dev,
 							   extack);
 		else
-			err = ocelot_netdevice_bridge_leave(dev, dev,
+			err = ocelot_netdevice_bridge_leave(dev, brport_dev,
 							    info->upper_dev);
 	}
 	if (netif_is_lag_master(info->upper_dev)) {
@@ -1346,7 +1347,7 @@ ocelot_netdevice_lag_changeupper(struct net_device *dev,
 		if (ocelot_port->bond != dev)
 			return NOTIFY_OK;
 
-		err = ocelot_netdevice_changeupper(lower, info);
+		err = ocelot_netdevice_changeupper(lower, dev, info);
 		if (err)
 			return notifier_from_errno(err);
 	}
@@ -1385,7 +1386,7 @@ static int ocelot_netdevice_event(struct notifier_block *unused,
 		struct netdev_notifier_changeupper_info *info = ptr;
 
 		if (ocelot_netdevice_dev_check(dev))
-			return ocelot_netdevice_changeupper(dev, info);
+			return ocelot_netdevice_changeupper(dev, dev, info);
 
 		if (netif_is_lag_master(dev))
 			return ocelot_netdevice_lag_changeupper(dev, info);

From b0b33b048dcfbd7da82c3cde4fab02751dfab4d6 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Tue, 13 Jul 2021 12:37:19 +0300
Subject: [PATCH 153/158] net: dsa: sja1105: fix address learning getting
 disabled on the CPU port

In May 2019 when commit 640f763f98c2 ("net: dsa: sja1105: Add support
for Spanning Tree Protocol") was introduced, the comment that "STP does
not get called for the CPU port" was true. This changed after commit
0394a63acfe2 ("net: dsa: enable and disable all ports") in August 2019
and went largely unnoticed, because the sja1105_bridge_stp_state_set()
method did nothing different compared to the static setup done by
sja1105_init_mac_settings().

With the ability to turn address learning off introduced by the blamed
commit, there is a new priv->learn_ena port mask in the driver. When
sja1105_bridge_stp_state_set() gets called and we are in
BR_STATE_LEARNING or later, address learning is enabled or not depending
on priv->learn_ena & BIT(port).

So what happens is that priv->learn_ena is not being set from anywhere
for the CPU port, and the static configuration done by
sja1105_init_mac_settings() is being overwritten.

To solve this, acknowledge that the static configuration of STP state is
no longer necessary because the STP state is being set by the DSA core
now, but what is necessary is to set priv->learn_ena for the CPU port.

Fixes: 4d9423549501 ("net: dsa: sja1105: offload bridge port flags to device")
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/sja1105/sja1105_main.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c
index 4f0545605f6b..ced8c9cb29c2 100644
--- a/drivers/net/dsa/sja1105/sja1105_main.c
+++ b/drivers/net/dsa/sja1105/sja1105_main.c
@@ -122,14 +122,12 @@ static int sja1105_init_mac_settings(struct sja1105_private *priv)
 
 	for (i = 0; i < ds->num_ports; i++) {
 		mac[i] = default_mac;
-		if (i == dsa_upstream_port(priv->ds, i)) {
-			/* STP doesn't get called for CPU port, so we need to
-			 * set the I/O parameters statically.
-			 */
-			mac[i].dyn_learn = true;
-			mac[i].ingress = true;
-			mac[i].egress = true;
-		}
+
+		/* Let sja1105_bridge_stp_state_set() keep address learning
+		 * enabled for the CPU port.
+		 */
+		if (dsa_is_cpu_port(ds, i))
+			priv->learn_ena |= BIT(i);
 	}
 
 	return 0;

From deb7178eb940e2c5caca1b1db084a69b2e59b4c9 Mon Sep 17 00:00:00 2001
From: Pavel Skripkin <paskripkin@gmail.com>
Date: Tue, 13 Jul 2021 13:58:53 +0300
Subject: [PATCH 154/158] net: fddi: fix UAF in fza_probe

fp is netdev private data and it cannot be
used after free_netdev() call. Using fp after free_netdev()
can cause UAF bug. Fix it by moving free_netdev() after error message.

Fixes: 61414f5ec983 ("FDDI: defza: Add support for DEC FDDIcontroller 700 TURBOchannel adapter")
Signed-off-by: Pavel Skripkin <paskripkin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/fddi/defza.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/fddi/defza.c b/drivers/net/fddi/defza.c
index 14f07050b6b1..0de2c4552f5e 100644
--- a/drivers/net/fddi/defza.c
+++ b/drivers/net/fddi/defza.c
@@ -1504,9 +1504,8 @@ err_out_resource:
 	release_mem_region(start, len);
 
 err_out_kfree:
-	free_netdev(dev);
-
 	pr_err("%s: initialization failure, aborting!\n", fp->name);
+	free_netdev(dev);
 	return ret;
 }
 

From f28100cb9c9645c07cbd22431278ac9492f6a01c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=8D=C3=B1igo=20Huguet?= <ihuguet@redhat.com>
Date: Tue, 13 Jul 2021 16:21:27 +0200
Subject: [PATCH 155/158] sfc: fix lack of XDP TX queues - error XDP TX failed
 (-22)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes: e26ca4b53582 ("sfc: reduce the number of requested xdp ev queues")

The buggy commit intended to allocate less channels for XDP in order to
be more unlikely to reach the limit of 32 channels of the driver.

The idea was to use each IRQ/event queue for more XDP TX queues than
before, calculating which is the maximum number of TX queues that one
event queue can handle. For example, in EF10 each event queue could
handle up to 8 queues, better than the 4 they were handling before the
change. This way, it would have to allocate half as many channels as
before for XDP TX.

The problem is that the TX queues are also contained inside the channel
structs, and there are only 4 queues per channel. Reducing the number of
channels means also reducing the number of queues, resulting in not
having the desired number of 1 queue per CPU.

This leads to getting errors on XDP_TX and XDP_REDIRECT if they're
executed from a high numbered CPU, because there only exist queues for
the low half of CPUs, actually. If XDP_TX/REDIRECT is executed in a low
numbered CPU, the error doesn't happen. This is the error in the logs
(repeated many times, even rate limited):
sfc 0000:5e:00.0 ens3f0np0: XDP TX failed (-22)

This error happens in function efx_xdp_tx_buffers, where it expects to
have a dedicated XDP TX queue per CPU.

Reverting the change makes it again more likely to reach the limit of 32
channels in machines with many CPUs. If this happens, no XDP_TX/REDIRECT
will be possible at all, and we will see these log error messages:

At interface probe:
sfc 0000:5e:00.0: Insufficient resources for 12 XDP event queues (24 other channels, max 32)

At every subsequent XDP_TX/REDIRECT failure, rate limited:
sfc 0000:5e:00.0 ens3f0np0: XDP TX failed (-22)

However, without reverting the change, it makes the user think that
everything is OK at probe time, but later it fails in an unpredictable
way, depending on the CPU that handles the packet.

It is better to restore the predictable behaviour. If the user sees the
error message at probe time, he/she can try to configure the best way it
fits his/her needs. At least, he/she will have 2 options:
- Accept that XDP_TX/REDIRECT is not available (he/she may not need it)
- Load sfc module with modparam 'rss_cpus' with a lower number, thus
  creating less normal RX queues/channels, letting more free resources
  for XDP, with some performance penalty.

Anyway, keep the calculation of the maximum number of TX queues that can
be handled by a single event queue, and use it only if it's less than the
number of TX queues per channel. This doesn't happen in practice, but
could happen if some constant values are tweaked in the future, such as
EFX_MAX_TXQ_PER_CHANNEL, EFX_MAX_EVQ_SIZE or EFX_MAX_DMAQ_SIZE.

Related mailing list thread:
https://lore.kernel.org/bpf/20201215104327.2be76156@carbon/

Signed-off-by: Íñigo Huguet <ihuguet@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/sfc/efx_channels.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/sfc/efx_channels.c b/drivers/net/ethernet/sfc/efx_channels.c
index a3ca406a3561..5b71f8a03a6d 100644
--- a/drivers/net/ethernet/sfc/efx_channels.c
+++ b/drivers/net/ethernet/sfc/efx_channels.c
@@ -152,6 +152,7 @@ static int efx_allocate_msix_channels(struct efx_nic *efx,
 	 * maximum size.
 	 */
 	tx_per_ev = EFX_MAX_EVQ_SIZE / EFX_TXQ_MAX_ENT(efx);
+	tx_per_ev = min(tx_per_ev, EFX_MAX_TXQ_PER_CHANNEL);
 	n_xdp_tx = num_possible_cpus();
 	n_xdp_ev = DIV_ROUND_UP(n_xdp_tx, tx_per_ev);
 
@@ -181,7 +182,7 @@ static int efx_allocate_msix_channels(struct efx_nic *efx,
 		efx->xdp_tx_queue_count = 0;
 	} else {
 		efx->n_xdp_channels = n_xdp_ev;
-		efx->xdp_tx_per_channel = EFX_MAX_TXQ_PER_CHANNEL;
+		efx->xdp_tx_per_channel = tx_per_ev;
 		efx->xdp_tx_queue_count = n_xdp_tx;
 		n_channels += n_xdp_ev;
 		netif_dbg(efx, drv, efx->net_dev,

From 788bc000d4c2f25232db19ab3a0add0ba4e27671 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=8D=C3=B1igo=20Huguet?= <ihuguet@redhat.com>
Date: Tue, 13 Jul 2021 16:21:28 +0200
Subject: [PATCH 156/158] sfc: ensure correct number of XDP queues
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit 99ba0ea616aa ("sfc: adjust efx->xdp_tx_queue_count with the real
number of initialized queues") intended to fix a problem caused by a
round up when calculating the number of XDP channels and queues.
However, this was not the real problem. The real problem was that the
number of XDP TX queues had been reduced to half in
commit e26ca4b53582 ("sfc: reduce the number of requested xdp ev queues"),
but the variable xdp_tx_queue_count had remained the same.

Once the correct number of XDP TX queues is created again in the
previous patch of this series, this also can be reverted since the error
doesn't actually exist.

Only in the case that there is a bug in the code we can have different
values in xdp_queue_number and efx->xdp_tx_queue_count. Because of this,
and per Edward Cree's suggestion, I add instead a WARN_ON to catch if it
happens again in the future.

Note that the number of allocated queues can be higher than the number
of used ones due to the round up, as explained in the existing comment
in the code. That's why we also have to stop increasing xdp_queue_number
beyond efx->xdp_tx_queue_count.

Signed-off-by: Íñigo Huguet <ihuguet@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/sfc/efx_channels.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/sfc/efx_channels.c b/drivers/net/ethernet/sfc/efx_channels.c
index 5b71f8a03a6d..bb48a139dd15 100644
--- a/drivers/net/ethernet/sfc/efx_channels.c
+++ b/drivers/net/ethernet/sfc/efx_channels.c
@@ -892,18 +892,20 @@ int efx_set_channels(struct efx_nic *efx)
 			if (efx_channel_is_xdp_tx(channel)) {
 				efx_for_each_channel_tx_queue(tx_queue, channel) {
 					tx_queue->queue = next_queue++;
-					netif_dbg(efx, drv, efx->net_dev, "Channel %u TXQ %u is XDP %u, HW %u\n",
-						  channel->channel, tx_queue->label,
-						  xdp_queue_number, tx_queue->queue);
+
 					/* We may have a few left-over XDP TX
 					 * queues owing to xdp_tx_queue_count
 					 * not dividing evenly by EFX_MAX_TXQ_PER_CHANNEL.
 					 * We still allocate and probe those
 					 * TXQs, but never use them.
 					 */
-					if (xdp_queue_number < efx->xdp_tx_queue_count)
+					if (xdp_queue_number < efx->xdp_tx_queue_count) {
+						netif_dbg(efx, drv, efx->net_dev, "Channel %u TXQ %u is XDP %u, HW %u\n",
+							  channel->channel, tx_queue->label,
+							  xdp_queue_number, tx_queue->queue);
 						efx->xdp_tx_queues[xdp_queue_number] = tx_queue;
-					xdp_queue_number++;
+						xdp_queue_number++;
+					}
 				}
 			} else {
 				efx_for_each_channel_tx_queue(tx_queue, channel) {
@@ -915,8 +917,7 @@ int efx_set_channels(struct efx_nic *efx)
 			}
 		}
 	}
-	if (xdp_queue_number)
-		efx->xdp_tx_queue_count = xdp_queue_number;
+	WARN_ON(xdp_queue_number != efx->xdp_tx_queue_count);
 
 	rc = netif_set_real_num_tx_queues(efx->net_dev, efx->n_tx_channels);
 	if (rc)

From d2a16bde77322fca6b6f36ebe19097a1c3c46f74 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=8D=C3=B1igo=20Huguet?= <ihuguet@redhat.com>
Date: Tue, 13 Jul 2021 16:21:29 +0200
Subject: [PATCH 157/158] sfc: add logs explaining XDP_TX/REDIRECT is not
 available
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If it's not possible to allocate enough channels for XDP, XDP_TX and
XDP_REDIRECT don't work. However, the only message shown said that not
enough channels were available, without explaining the consequences.
The user couldn't tell whether XDP could still be used, whether
performance would be reduced, or what else to expect.

Signed-off-by: Íñigo Huguet <ihuguet@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/sfc/efx_channels.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/sfc/efx_channels.c b/drivers/net/ethernet/sfc/efx_channels.c
index bb48a139dd15..e5b0d795c301 100644
--- a/drivers/net/ethernet/sfc/efx_channels.c
+++ b/drivers/net/ethernet/sfc/efx_channels.c
@@ -170,6 +170,8 @@ static int efx_allocate_msix_channels(struct efx_nic *efx,
 		netif_err(efx, drv, efx->net_dev,
 			  "Insufficient resources for %d XDP event queues (%d other channels, max %d)\n",
 			  n_xdp_ev, n_channels, max_channels);
+		netif_err(efx, drv, efx->net_dev,
+			  "XDP_TX and XDP_REDIRECT will not work on this interface");
 		efx->n_xdp_channels = 0;
 		efx->xdp_tx_per_channel = 0;
 		efx->xdp_tx_queue_count = 0;
@@ -177,6 +179,8 @@ static int efx_allocate_msix_channels(struct efx_nic *efx,
 		netif_err(efx, drv, efx->net_dev,
 			  "Insufficient resources for %d XDP TX queues (%d other channels, max VIs %d)\n",
 			  n_xdp_tx, n_channels, efx->max_vis);
+		netif_err(efx, drv, efx->net_dev,
+			  "XDP_TX and XDP_REDIRECT will not work on this interface");
 		efx->n_xdp_channels = 0;
 		efx->xdp_tx_per_channel = 0;
 		efx->xdp_tx_queue_count = 0;

From bcb9928a155444dbd212473e60241ca0a7f641e1 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Tue, 13 Jul 2021 12:40:21 +0300
Subject: [PATCH 158/158] net: dsa: properly check for the bridge_leave methods
 in dsa_switch_bridge_leave()

This was not caught because there is no switch driver which implements
the .port_bridge_join but not .port_bridge_leave method, but it should
nonetheless be fixed, as in certain conditions (driver development) it
might lead to NULL pointer dereference.

Fixes: f66a6a69f97a ("net: dsa: permit cross-chip bridging between all trees in the system")
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/switch.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/dsa/switch.c b/net/dsa/switch.c
index 248455145982..5ece05dfd8f2 100644
--- a/net/dsa/switch.c
+++ b/net/dsa/switch.c
@@ -113,11 +113,11 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds,
 	int err, port;
 
 	if (dst->index == info->tree_index && ds->index == info->sw_index &&
-	    ds->ops->port_bridge_join)
+	    ds->ops->port_bridge_leave)
 		ds->ops->port_bridge_leave(ds, info->port, info->br);
 
 	if ((dst->index != info->tree_index || ds->index != info->sw_index) &&
-	    ds->ops->crosschip_bridge_join)
+	    ds->ops->crosschip_bridge_leave)
 		ds->ops->crosschip_bridge_leave(ds, info->tree_index,
 						info->sw_index, info->port,
 						info->br);