linux-stable/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c

843 lines
22 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/* Copyright (c) 2019 Mellanox Technologies. */
#include <linux/pci.h>
#include <linux/interrupt.h>
#include <linux/notifier.h>
#include <linux/mlx5/driver.h>
#include <linux/mlx5/vport.h>
#include "mlx5_core.h"
#include "mlx5_irq.h"
#include "pci_irq.h"
#include "lib/sf.h"
#include "lib/eq.h"
#ifdef CONFIG_RFS_ACCEL
#include <linux/cpu_rmap.h>
#endif
#define MLX5_SFS_PER_CTRL_IRQ 64
#define MLX5_IRQ_CTRL_SF_MAX 8
/* min num of vectors for SFs to be enabled */
#define MLX5_IRQ_VEC_COMP_BASE_SF 2
#define MLX5_IRQ_VEC_COMP_BASE 1
#define MLX5_EQ_SHARE_IRQ_MAX_COMP (8)
#define MLX5_EQ_SHARE_IRQ_MAX_CTRL (UINT_MAX)
#define MLX5_EQ_SHARE_IRQ_MIN_COMP (1)
#define MLX5_EQ_SHARE_IRQ_MIN_CTRL (4)
struct mlx5_irq {
struct atomic_notifier_head nh;
cpumask_var_t mask;
char name[MLX5_MAX_IRQ_FORMATTED_NAME];
struct mlx5_irq_pool *pool;
int refcount;
struct msi_map map;
u32 pool_index;
};
struct mlx5_irq_table {
struct mlx5_irq_pool *pcif_pool;
struct mlx5_irq_pool *sf_ctrl_pool;
struct mlx5_irq_pool *sf_comp_pool;
};
static int mlx5_core_func_to_vport(const struct mlx5_core_dev *dev,
int func,
bool ec_vf_func)
{
if (!ec_vf_func)
return func;
return mlx5_core_ec_vf_vport_base(dev) + func - 1;
}
/**
* mlx5_get_default_msix_vec_count - Get the default number of MSI-X vectors
* to be ssigned to each VF.
* @dev: PF to work on
* @num_vfs: Number of enabled VFs
*/
int mlx5_get_default_msix_vec_count(struct mlx5_core_dev *dev, int num_vfs)
{
int num_vf_msix, min_msix, max_msix;
num_vf_msix = MLX5_CAP_GEN_MAX(dev, num_total_dynamic_vf_msix);
if (!num_vf_msix)
return 0;
min_msix = MLX5_CAP_GEN(dev, min_dynamic_vf_msix_table_size);
max_msix = MLX5_CAP_GEN(dev, max_dynamic_vf_msix_table_size);
/* Limit maximum number of MSI-X vectors so the default configuration
* has some available in the pool. This will allow the user to increase
* the number of vectors in a VF without having to first size-down other
* VFs.
*/
return max(min(num_vf_msix / num_vfs, max_msix / 2), min_msix);
}
/**
* mlx5_set_msix_vec_count - Set dynamically allocated MSI-X on the VF
* @dev: PF to work on
* @function_id: Internal PCI VF function IDd
* @msix_vec_count: Number of MSI-X vectors to set
*/
int mlx5_set_msix_vec_count(struct mlx5_core_dev *dev, int function_id,
int msix_vec_count)
{
int query_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out);
int set_sz = MLX5_ST_SZ_BYTES(set_hca_cap_in);
void *hca_cap = NULL, *query_cap = NULL, *cap;
int num_vf_msix, min_msix, max_msix;
bool ec_vf_function;
int vport;
int ret;
num_vf_msix = MLX5_CAP_GEN_MAX(dev, num_total_dynamic_vf_msix);
if (!num_vf_msix)
return 0;
if (!MLX5_CAP_GEN(dev, vport_group_manager) || !mlx5_core_is_pf(dev))
return -EOPNOTSUPP;
min_msix = MLX5_CAP_GEN(dev, min_dynamic_vf_msix_table_size);
max_msix = MLX5_CAP_GEN(dev, max_dynamic_vf_msix_table_size);
if (msix_vec_count < min_msix)
return -EINVAL;
if (msix_vec_count > max_msix)
return -EOVERFLOW;
query_cap = kvzalloc(query_sz, GFP_KERNEL);
hca_cap = kvzalloc(set_sz, GFP_KERNEL);
if (!hca_cap || !query_cap) {
ret = -ENOMEM;
goto out;
}
ec_vf_function = mlx5_core_ec_sriov_enabled(dev);
vport = mlx5_core_func_to_vport(dev, function_id, ec_vf_function);
ret = mlx5_vport_get_other_func_general_cap(dev, vport, query_cap);
if (ret)
goto out;
cap = MLX5_ADDR_OF(set_hca_cap_in, hca_cap, capability);
memcpy(cap, MLX5_ADDR_OF(query_hca_cap_out, query_cap, capability),
MLX5_UN_SZ_BYTES(hca_cap_union));
MLX5_SET(cmd_hca_cap, cap, dynamic_msix_table_size, msix_vec_count);
MLX5_SET(set_hca_cap_in, hca_cap, opcode, MLX5_CMD_OP_SET_HCA_CAP);
MLX5_SET(set_hca_cap_in, hca_cap, other_function, 1);
MLX5_SET(set_hca_cap_in, hca_cap, ec_vf_function, ec_vf_function);
MLX5_SET(set_hca_cap_in, hca_cap, function_id, function_id);
MLX5_SET(set_hca_cap_in, hca_cap, op_mod,
MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE << 1);
ret = mlx5_cmd_exec_in(dev, set_hca_cap, hca_cap);
out:
kvfree(hca_cap);
kvfree(query_cap);
return ret;
}
net/mlx5: Free IRQ rmap and notifier on kernel shutdown The kernel IRQ system needs the irq affinity notifier to be clear before attempting to free the irq, see WARN_ON log below. On a normal driver unload we don't have this issue since we do the complete cleanup of the irq resources. To fix this, put the important resources cleanup in a helper function and use it in both normal driver unload and shutdown flows. [ 4497.498434] ------------[ cut here ]------------ [ 4497.498726] WARNING: CPU: 0 PID: 9 at kernel/irq/manage.c:2034 free_irq+0x295/0x340 [ 4497.499193] Modules linked in: [ 4497.499386] CPU: 0 PID: 9 Comm: kworker/0:1 Tainted: G W 6.4.0-rc4+ #10 [ 4497.499876] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-1.fc38 04/01/2014 [ 4497.500518] Workqueue: events do_poweroff [ 4497.500849] RIP: 0010:free_irq+0x295/0x340 [ 4497.501132] Code: 85 c0 0f 84 1d ff ff ff 48 89 ef ff d0 0f 1f 00 e9 10 ff ff ff 0f 0b e9 72 ff ff ff 49 8d 7f 28 ff d0 0f 1f 00 e9 df fd ff ff <0f> 0b 48 c7 80 c0 008 [ 4497.502269] RSP: 0018:ffffc90000053da0 EFLAGS: 00010282 [ 4497.502589] RAX: ffff888100949600 RBX: ffff88810330b948 RCX: 0000000000000000 [ 4497.503035] RDX: ffff888100949600 RSI: ffff888100400490 RDI: 0000000000000023 [ 4497.503472] RBP: ffff88810330c7e0 R08: ffff8881004005d0 R09: ffffffff8273a260 [ 4497.503923] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8881009ae000 [ 4497.504359] R13: ffff8881009ae148 R14: 0000000000000000 R15: ffff888100949600 [ 4497.504804] FS: 0000000000000000(0000) GS:ffff88813bc00000(0000) knlGS:0000000000000000 [ 4497.505302] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 4497.505671] CR2: 00007fce98806298 CR3: 000000000262e005 CR4: 0000000000370ef0 [ 4497.506104] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 4497.506540] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 4497.507002] Call Trace: [ 4497.507158] <TASK> [ 4497.507299] ? free_irq+0x295/0x340 [ 4497.507522] ? __warn+0x7c/0x130 [ 4497.507740] ? free_irq+0x295/0x340 [ 4497.507963] ? report_bug+0x171/0x1a0 [ 4497.508197] ? handle_bug+0x3c/0x70 [ 4497.508417] ? exc_invalid_op+0x17/0x70 [ 4497.508662] ? asm_exc_invalid_op+0x1a/0x20 [ 4497.508926] ? free_irq+0x295/0x340 [ 4497.509146] mlx5_irq_pool_free_irqs+0x48/0x90 [ 4497.509421] mlx5_irq_table_free_irqs+0x38/0x50 [ 4497.509714] mlx5_core_eq_free_irqs+0x27/0x40 [ 4497.509984] shutdown+0x7b/0x100 [ 4497.510184] pci_device_shutdown+0x30/0x60 [ 4497.510440] device_shutdown+0x14d/0x240 [ 4497.510698] kernel_power_off+0x30/0x70 [ 4497.510938] process_one_work+0x1e6/0x3e0 [ 4497.511183] worker_thread+0x49/0x3b0 [ 4497.511407] ? __pfx_worker_thread+0x10/0x10 [ 4497.511679] kthread+0xe0/0x110 [ 4497.511879] ? __pfx_kthread+0x10/0x10 [ 4497.512114] ret_from_fork+0x29/0x50 [ 4497.512342] </TASK> Fixes: 9c2d08010963 ("net/mlx5: Free irqs only on shutdown callback") Signed-off-by: Saeed Mahameed <saeedm@nvidia.com> Reviewed-by: Shay Drory <shayd@nvidia.com>
2023-06-08 19:00:54 +00:00
/* mlx5_system_free_irq - Free an IRQ
* @irq: IRQ to free
*
* Free the IRQ and other resources such as rmap from the system.
* BUT doesn't free or remove reference from mlx5.
* This function is very important for the shutdown flow, where we need to
* cleanup system resoruces but keep mlx5 objects alive,
* see mlx5_irq_table_free_irqs().
*/
static void mlx5_system_free_irq(struct mlx5_irq *irq)
{
struct mlx5_irq_pool *pool = irq->pool;
#ifdef CONFIG_RFS_ACCEL
struct cpu_rmap *rmap;
#endif
/* free_irq requires that affinity_hint and rmap will be cleared before
* calling it. To satisfy this requirement, we call
* irq_cpu_rmap_remove() to remove the notifier
*/
irq_update_affinity_hint(irq->map.virq, NULL);
#ifdef CONFIG_RFS_ACCEL
rmap = mlx5_eq_table_get_rmap(pool->dev);
net/mlx5: Remove rmap also in case dynamic MSIX not supported mlx5 add IRQs to rmap upon MSIX request, and mlx5 remove rmap from MSIX only if msi_map.index is populated. However, msi_map.index is populated only when dynamic MSIX is supported. This results in freeing IRQs without removing them from rmap, which triggers the bellow WARN_ON[1]. rmap is a feature which have no relation to dynamic MSIX. Hence, remove the check of msi_map.index when removing IRQ from rmap. [1] [ 200.307160 ] WARNING: CPU: 20 PID: 1702 at kernel/irq/manage.c:2034 free_irq+0x2ac/0x358 [ 200.316990 ] CPU: 20 PID: 1702 Comm: modprobe Not tainted 6.4.0-rc3_for_upstream_min_debug_2023_05_24_14_02 #1 [ 200.318939 ] Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 [ 200.321659 ] pc : free_irq+0x2ac/0x358 [ 200.322400 ] lr : free_irq+0x20/0x358 [ 200.337865 ] Call trace: [ 200.338360 ] free_irq+0x2ac/0x358 [ 200.339029 ] irq_release+0x58/0xd0 [mlx5_core] [ 200.340093 ] mlx5_irqs_release_vectors+0x80/0xb0 [mlx5_core] [ 200.341344 ] destroy_comp_eqs+0x120/0x170 [mlx5_core] [ 200.342469 ] mlx5_eq_table_destroy+0x1c/0x38 [mlx5_core] [ 200.343645 ] mlx5_unload+0x8c/0xc8 [mlx5_core] [ 200.344652 ] mlx5_uninit_one+0x78/0x118 [mlx5_core] [ 200.345745 ] remove_one+0x80/0x108 [mlx5_core] [ 200.346752 ] pci_device_remove+0x40/0xd8 [ 200.347554 ] device_remove+0x50/0x88 [ 200.348272 ] device_release_driver_internal+0x1c4/0x228 [ 200.349312 ] driver_detach+0x54/0xa0 [ 200.350030 ] bus_remove_driver+0x74/0x100 [ 200.350833 ] driver_unregister+0x34/0x68 [ 200.351619 ] pci_unregister_driver+0x28/0xa0 [ 200.352476 ] mlx5_cleanup+0x14/0x2210 [mlx5_core] [ 200.353536 ] __arm64_sys_delete_module+0x190/0x2e8 [ 200.354495 ] el0_svc_common.constprop.0+0x6c/0x1d0 [ 200.355455 ] do_el0_svc+0x38/0x98 [ 200.356122 ] el0_svc+0x1c/0x80 [ 200.356739 ] el0t_64_sync_handler+0xb4/0x130 [ 200.357604 ] el0t_64_sync+0x174/0x178 [ 200.358345 ] ---[ end trace 0000000000000000 ]--- Fixes: 3354822cde5a ("net/mlx5: Use dynamic msix vectors allocation") Signed-off-by: Shay Drory <shayd@nvidia.com> Reviewed-by: Moshe Shemesh <moshe@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
2023-05-30 08:59:34 +00:00
if (rmap)
irq_cpu_rmap_remove(rmap, irq->map.virq);
#endif
free_irq(irq->map.virq, &irq->nh);
if (irq->map.index && pci_msix_can_alloc_dyn(pool->dev->pdev))
pci_msix_free_irq(pool->dev->pdev, irq->map);
net/mlx5: Free IRQ rmap and notifier on kernel shutdown The kernel IRQ system needs the irq affinity notifier to be clear before attempting to free the irq, see WARN_ON log below. On a normal driver unload we don't have this issue since we do the complete cleanup of the irq resources. To fix this, put the important resources cleanup in a helper function and use it in both normal driver unload and shutdown flows. [ 4497.498434] ------------[ cut here ]------------ [ 4497.498726] WARNING: CPU: 0 PID: 9 at kernel/irq/manage.c:2034 free_irq+0x295/0x340 [ 4497.499193] Modules linked in: [ 4497.499386] CPU: 0 PID: 9 Comm: kworker/0:1 Tainted: G W 6.4.0-rc4+ #10 [ 4497.499876] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-1.fc38 04/01/2014 [ 4497.500518] Workqueue: events do_poweroff [ 4497.500849] RIP: 0010:free_irq+0x295/0x340 [ 4497.501132] Code: 85 c0 0f 84 1d ff ff ff 48 89 ef ff d0 0f 1f 00 e9 10 ff ff ff 0f 0b e9 72 ff ff ff 49 8d 7f 28 ff d0 0f 1f 00 e9 df fd ff ff <0f> 0b 48 c7 80 c0 008 [ 4497.502269] RSP: 0018:ffffc90000053da0 EFLAGS: 00010282 [ 4497.502589] RAX: ffff888100949600 RBX: ffff88810330b948 RCX: 0000000000000000 [ 4497.503035] RDX: ffff888100949600 RSI: ffff888100400490 RDI: 0000000000000023 [ 4497.503472] RBP: ffff88810330c7e0 R08: ffff8881004005d0 R09: ffffffff8273a260 [ 4497.503923] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8881009ae000 [ 4497.504359] R13: ffff8881009ae148 R14: 0000000000000000 R15: ffff888100949600 [ 4497.504804] FS: 0000000000000000(0000) GS:ffff88813bc00000(0000) knlGS:0000000000000000 [ 4497.505302] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 4497.505671] CR2: 00007fce98806298 CR3: 000000000262e005 CR4: 0000000000370ef0 [ 4497.506104] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 4497.506540] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 4497.507002] Call Trace: [ 4497.507158] <TASK> [ 4497.507299] ? free_irq+0x295/0x340 [ 4497.507522] ? __warn+0x7c/0x130 [ 4497.507740] ? free_irq+0x295/0x340 [ 4497.507963] ? report_bug+0x171/0x1a0 [ 4497.508197] ? handle_bug+0x3c/0x70 [ 4497.508417] ? exc_invalid_op+0x17/0x70 [ 4497.508662] ? asm_exc_invalid_op+0x1a/0x20 [ 4497.508926] ? free_irq+0x295/0x340 [ 4497.509146] mlx5_irq_pool_free_irqs+0x48/0x90 [ 4497.509421] mlx5_irq_table_free_irqs+0x38/0x50 [ 4497.509714] mlx5_core_eq_free_irqs+0x27/0x40 [ 4497.509984] shutdown+0x7b/0x100 [ 4497.510184] pci_device_shutdown+0x30/0x60 [ 4497.510440] device_shutdown+0x14d/0x240 [ 4497.510698] kernel_power_off+0x30/0x70 [ 4497.510938] process_one_work+0x1e6/0x3e0 [ 4497.511183] worker_thread+0x49/0x3b0 [ 4497.511407] ? __pfx_worker_thread+0x10/0x10 [ 4497.511679] kthread+0xe0/0x110 [ 4497.511879] ? __pfx_kthread+0x10/0x10 [ 4497.512114] ret_from_fork+0x29/0x50 [ 4497.512342] </TASK> Fixes: 9c2d08010963 ("net/mlx5: Free irqs only on shutdown callback") Signed-off-by: Saeed Mahameed <saeedm@nvidia.com> Reviewed-by: Shay Drory <shayd@nvidia.com>
2023-06-08 19:00:54 +00:00
}
static void irq_release(struct mlx5_irq *irq)
{
struct mlx5_irq_pool *pool = irq->pool;
xa_erase(&pool->irqs, irq->pool_index);
mlx5_system_free_irq(irq);
free_cpumask_var(irq->mask);
kfree(irq);
}
int mlx5_irq_put(struct mlx5_irq *irq)
{
struct mlx5_irq_pool *pool = irq->pool;
int ret = 0;
mutex_lock(&pool->lock);
irq->refcount--;
if (!irq->refcount) {
irq_release(irq);
ret = 1;
}
mutex_unlock(&pool->lock);
return ret;
}
int mlx5_irq_read_locked(struct mlx5_irq *irq)
{
lockdep_assert_held(&irq->pool->lock);
return irq->refcount;
}
int mlx5_irq_get_locked(struct mlx5_irq *irq)
{
lockdep_assert_held(&irq->pool->lock);
if (WARN_ON_ONCE(!irq->refcount))
return 0;
irq->refcount++;
return 1;
}
static int irq_get(struct mlx5_irq *irq)
{
int err;
mutex_lock(&irq->pool->lock);
err = mlx5_irq_get_locked(irq);
mutex_unlock(&irq->pool->lock);
return err;
}
static irqreturn_t irq_int_handler(int irq, void *nh)
{
atomic_notifier_call_chain(nh, 0, NULL);
return IRQ_HANDLED;
}
static void irq_sf_set_name(struct mlx5_irq_pool *pool, char *name, int vecidx)
{
snprintf(name, MLX5_MAX_IRQ_NAME, "%s%d", pool->name, vecidx);
}
static void irq_set_name(struct mlx5_irq_pool *pool, char *name, int vecidx)
{
if (!pool->xa_num_irqs.max) {
/* in case we only have a single irq for the device */
snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_combined%d", vecidx);
return;
}
if (!vecidx) {
snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_async%d", vecidx);
return;
}
vecidx -= MLX5_IRQ_VEC_COMP_BASE;
snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_comp%d", vecidx);
}
struct mlx5_irq *mlx5_irq_alloc(struct mlx5_irq_pool *pool, int i,
struct irq_affinity_desc *af_desc,
struct cpu_rmap **rmap)
{
struct mlx5_core_dev *dev = pool->dev;
char name[MLX5_MAX_IRQ_NAME];
struct mlx5_irq *irq;
int err;
irq = kzalloc(sizeof(*irq), GFP_KERNEL);
if (!irq || !zalloc_cpumask_var(&irq->mask, GFP_KERNEL)) {
kfree(irq);
return ERR_PTR(-ENOMEM);
}
if (!i || !pci_msix_can_alloc_dyn(dev->pdev)) {
/* The vector at index 0 is always statically allocated. If
* dynamic irq is not supported all vectors are statically
* allocated. In both cases just get the irq number and set
* the index.
*/
irq->map.virq = pci_irq_vector(dev->pdev, i);
irq->map.index = i;
} else {
irq->map = pci_msix_alloc_irq_at(dev->pdev, MSI_ANY_INDEX, af_desc);
if (!irq->map.virq) {
err = irq->map.index;
goto err_alloc_irq;
}
}
if (i && rmap && *rmap) {
#ifdef CONFIG_RFS_ACCEL
err = irq_cpu_rmap_add(*rmap, irq->map.virq);
if (err)
goto err_irq_rmap;
#endif
}
if (!mlx5_irq_pool_is_sf_pool(pool))
irq_set_name(pool, name, i);
else
irq_sf_set_name(pool, name, i);
ATOMIC_INIT_NOTIFIER_HEAD(&irq->nh);
snprintf(irq->name, MLX5_MAX_IRQ_FORMATTED_NAME,
MLX5_IRQ_NAME_FORMAT_STR, name, pci_name(dev->pdev));
err = request_irq(irq->map.virq, irq_int_handler, 0, irq->name,
&irq->nh);
if (err) {
mlx5_core_err(dev, "Failed to request irq. err = %d\n", err);
goto err_req_irq;
}
if (af_desc) {
cpumask_copy(irq->mask, &af_desc->mask);
irq_set_affinity_and_hint(irq->map.virq, irq->mask);
}
irq->pool = pool;
irq->refcount = 1;
irq->pool_index = i;
err = xa_err(xa_store(&pool->irqs, irq->pool_index, irq, GFP_KERNEL));
if (err) {
mlx5_core_err(dev, "Failed to alloc xa entry for irq(%u). err = %d\n",
irq->pool_index, err);
goto err_xa;
}
return irq;
err_xa:
if (af_desc)
irq_update_affinity_hint(irq->map.virq, NULL);
free_irq(irq->map.virq, &irq->nh);
err_req_irq:
#ifdef CONFIG_RFS_ACCEL
if (i && rmap && *rmap) {
free_irq_cpu_rmap(*rmap);
*rmap = NULL;
}
err_irq_rmap:
#endif
if (i && pci_msix_can_alloc_dyn(dev->pdev))
pci_msix_free_irq(dev->pdev, irq->map);
err_alloc_irq:
free_cpumask_var(irq->mask);
kfree(irq);
return ERR_PTR(err);
}
int mlx5_irq_attach_nb(struct mlx5_irq *irq, struct notifier_block *nb)
{
int ret;
ret = irq_get(irq);
if (!ret)
/* Something very bad happens here, we are enabling EQ
* on non-existing IRQ.
*/
return -ENOENT;
ret = atomic_notifier_chain_register(&irq->nh, nb);
if (ret)
mlx5_irq_put(irq);
return ret;
}
int mlx5_irq_detach_nb(struct mlx5_irq *irq, struct notifier_block *nb)
{
int err = 0;
err = atomic_notifier_chain_unregister(&irq->nh, nb);
mlx5_irq_put(irq);
return err;
}
struct cpumask *mlx5_irq_get_affinity_mask(struct mlx5_irq *irq)
{
return irq->mask;
}
int mlx5_irq_get_index(struct mlx5_irq *irq)
{
return irq->map.index;
}
/* irq_pool API */
/* requesting an irq from a given pool according to given index */
static struct mlx5_irq *
irq_pool_request_vector(struct mlx5_irq_pool *pool, int vecidx,
struct irq_affinity_desc *af_desc,
struct cpu_rmap **rmap)
{
struct mlx5_irq *irq;
mutex_lock(&pool->lock);
irq = xa_load(&pool->irqs, vecidx);
if (irq) {
mlx5_irq_get_locked(irq);
goto unlock;
}
irq = mlx5_irq_alloc(pool, vecidx, af_desc, rmap);
unlock:
mutex_unlock(&pool->lock);
return irq;
}
static struct mlx5_irq_pool *sf_ctrl_irq_pool_get(struct mlx5_irq_table *irq_table)
{
return irq_table->sf_ctrl_pool;
}
static struct mlx5_irq_pool *sf_irq_pool_get(struct mlx5_irq_table *irq_table)
{
return irq_table->sf_comp_pool;
}
struct mlx5_irq_pool *mlx5_irq_pool_get(struct mlx5_core_dev *dev)
{
struct mlx5_irq_table *irq_table = mlx5_irq_table_get(dev);
struct mlx5_irq_pool *pool = NULL;
if (mlx5_core_is_sf(dev))
pool = sf_irq_pool_get(irq_table);
/* In some configs, there won't be a pool of SFs IRQs. Hence, returning
* the PF IRQs pool in case the SF pool doesn't exist.
*/
return pool ? pool : irq_table->pcif_pool;
}
static struct mlx5_irq_pool *ctrl_irq_pool_get(struct mlx5_core_dev *dev)
{
struct mlx5_irq_table *irq_table = mlx5_irq_table_get(dev);
struct mlx5_irq_pool *pool = NULL;
if (mlx5_core_is_sf(dev))
pool = sf_ctrl_irq_pool_get(irq_table);
/* In some configs, there won't be a pool of SFs IRQs. Hence, returning
* the PF IRQs pool in case the SF pool doesn't exist.
*/
return pool ? pool : irq_table->pcif_pool;
}
static void _mlx5_irq_release(struct mlx5_irq *irq)
{
synchronize_irq(irq->map.virq);
mlx5_irq_put(irq);
}
/**
* mlx5_ctrl_irq_release - release a ctrl IRQ back to the system.
* @ctrl_irq: ctrl IRQ to be released.
*/
void mlx5_ctrl_irq_release(struct mlx5_irq *ctrl_irq)
{
_mlx5_irq_release(ctrl_irq);
}
/**
* mlx5_ctrl_irq_request - request a ctrl IRQ for mlx5 device.
* @dev: mlx5 device that requesting the IRQ.
*
* This function returns a pointer to IRQ, or ERR_PTR in case of error.
*/
struct mlx5_irq *mlx5_ctrl_irq_request(struct mlx5_core_dev *dev)
{
struct mlx5_irq_pool *pool = ctrl_irq_pool_get(dev);
struct irq_affinity_desc af_desc;
struct mlx5_irq *irq;
cpumask_copy(&af_desc.mask, cpu_online_mask);
af_desc.is_managed = false;
if (!mlx5_irq_pool_is_sf_pool(pool)) {
/* In case we are allocating a control IRQ from a pci device's pool.
* This can happen also for a SF if the SFs pool is empty.
*/
if (!pool->xa_num_irqs.max) {
cpumask_clear(&af_desc.mask);
/* In case we only have a single IRQ for PF/VF */
cpumask_set_cpu(cpumask_first(cpu_online_mask), &af_desc.mask);
}
/* Allocate the IRQ in index 0. The vector was already allocated */
irq = irq_pool_request_vector(pool, 0, &af_desc, NULL);
} else {
irq = mlx5_irq_affinity_request(pool, &af_desc);
}
return irq;
}
/**
* mlx5_irq_request - request an IRQ for mlx5 PF/VF device.
* @dev: mlx5 device that requesting the IRQ.
* @vecidx: vector index of the IRQ. This argument is ignore if affinity is
* provided.
* @af_desc: affinity descriptor for this IRQ.
* @rmap: pointer to reverse map pointer for completion interrupts
*
* This function returns a pointer to IRQ, or ERR_PTR in case of error.
*/
struct mlx5_irq *mlx5_irq_request(struct mlx5_core_dev *dev, u16 vecidx,
struct irq_affinity_desc *af_desc,
struct cpu_rmap **rmap)
{
struct mlx5_irq_table *irq_table = mlx5_irq_table_get(dev);
struct mlx5_irq_pool *pool;
struct mlx5_irq *irq;
pool = irq_table->pcif_pool;
irq = irq_pool_request_vector(pool, vecidx, af_desc, rmap);
if (IS_ERR(irq))
return irq;
mlx5_core_dbg(dev, "irq %u mapped to cpu %*pbl, %u EQs on this irq\n",
irq->map.virq, cpumask_pr_args(&af_desc->mask),
irq->refcount / MLX5_EQ_REFS_PER_IRQ);
return irq;
}
/**
* mlx5_msix_alloc - allocate msix interrupt
* @dev: mlx5 device from which to request
* @handler: interrupt handler
* @affdesc: affinity descriptor
* @name: interrupt name
*
* Returns: struct msi_map with result encoded.
* Note: the caller must make sure to release the irq by calling
* mlx5_msix_free() if shutdown was initiated.
*/
struct msi_map mlx5_msix_alloc(struct mlx5_core_dev *dev,
irqreturn_t (*handler)(int, void *),
const struct irq_affinity_desc *affdesc,
const char *name)
{
struct msi_map map;
int err;
if (!dev->pdev) {
map.virq = 0;
map.index = -EINVAL;
return map;
}
map = pci_msix_alloc_irq_at(dev->pdev, MSI_ANY_INDEX, affdesc);
if (!map.virq)
return map;
err = request_irq(map.virq, handler, 0, name, NULL);
if (err) {
mlx5_core_warn(dev, "err %d\n", err);
pci_msix_free_irq(dev->pdev, map);
map.virq = 0;
map.index = -ENOMEM;
}
return map;
}
EXPORT_SYMBOL(mlx5_msix_alloc);
/**
* mlx5_msix_free - free a previously allocated msix interrupt
* @dev: mlx5 device associated with interrupt
* @map: map previously returned by mlx5_msix_alloc()
*/
void mlx5_msix_free(struct mlx5_core_dev *dev, struct msi_map map)
{
free_irq(map.virq, NULL);
pci_msix_free_irq(dev->pdev, map);
}
EXPORT_SYMBOL(mlx5_msix_free);
/**
* mlx5_irq_release_vector - release one IRQ back to the system.
* @irq: the irq to release.
*/
void mlx5_irq_release_vector(struct mlx5_irq *irq)
{
_mlx5_irq_release(irq);
}
/**
* mlx5_irq_request_vector - request one IRQ for mlx5 device.
* @dev: mlx5 device that is requesting the IRQ.
* @cpu: CPU to bind the IRQ to.
* @vecidx: vector index to request an IRQ for.
* @rmap: pointer to reverse map pointer for completion interrupts
*
* Each IRQ is bound to at most 1 CPU.
* This function is requests one IRQ, for the given @vecidx.
*
* This function returns a pointer to the irq on success, or an error pointer
* in case of an error.
*/
struct mlx5_irq *mlx5_irq_request_vector(struct mlx5_core_dev *dev, u16 cpu,
u16 vecidx, struct cpu_rmap **rmap)
{
struct mlx5_irq_table *table = mlx5_irq_table_get(dev);
struct mlx5_irq_pool *pool = table->pcif_pool;
struct irq_affinity_desc af_desc;
int offset = MLX5_IRQ_VEC_COMP_BASE;
if (!pool->xa_num_irqs.max)
offset = 0;
af_desc.is_managed = false;
cpumask_clear(&af_desc.mask);
cpumask_set_cpu(cpu, &af_desc.mask);
return mlx5_irq_request(dev, vecidx + offset, &af_desc, rmap);
}
static struct mlx5_irq_pool *
irq_pool_alloc(struct mlx5_core_dev *dev, int start, int size, char *name,
u32 min_threshold, u32 max_threshold)
{
struct mlx5_irq_pool *pool = kvzalloc(sizeof(*pool), GFP_KERNEL);
if (!pool)
return ERR_PTR(-ENOMEM);
pool->dev = dev;
mutex_init(&pool->lock);
xa_init_flags(&pool->irqs, XA_FLAGS_ALLOC);
pool->xa_num_irqs.min = start;
pool->xa_num_irqs.max = start + size - 1;
if (name)
snprintf(pool->name, MLX5_MAX_IRQ_NAME - MLX5_MAX_IRQ_IDX_CHARS,
"%s", name);
pool->min_threshold = min_threshold * MLX5_EQ_REFS_PER_IRQ;
pool->max_threshold = max_threshold * MLX5_EQ_REFS_PER_IRQ;
mlx5_core_dbg(dev, "pool->name = %s, pool->size = %d, pool->start = %d",
name, size, start);
return pool;
}
static void irq_pool_free(struct mlx5_irq_pool *pool)
{
struct mlx5_irq *irq;
unsigned long index;
/* There are cases in which we are destrying the irq_table before
* freeing all the IRQs, fast teardown for example. Hence, free the irqs
* which might not have been freed.
*/
xa_for_each(&pool->irqs, index, irq)
irq_release(irq);
xa_destroy(&pool->irqs);
mutex_destroy(&pool->lock);
kfree(pool->irqs_per_cpu);
kvfree(pool);
}
static int irq_pools_init(struct mlx5_core_dev *dev, int sf_vec, int pcif_vec)
{
struct mlx5_irq_table *table = dev->priv.irq_table;
int num_sf_ctrl_by_msix;
int num_sf_ctrl_by_sfs;
int num_sf_ctrl;
int err;
/* init pcif_pool */
table->pcif_pool = irq_pool_alloc(dev, 0, pcif_vec, NULL,
MLX5_EQ_SHARE_IRQ_MIN_COMP,
MLX5_EQ_SHARE_IRQ_MAX_COMP);
if (IS_ERR(table->pcif_pool))
return PTR_ERR(table->pcif_pool);
if (!mlx5_sf_max_functions(dev))
return 0;
if (sf_vec < MLX5_IRQ_VEC_COMP_BASE_SF) {
mlx5_core_dbg(dev, "Not enught IRQs for SFs. SF may run at lower performance\n");
return 0;
}
/* init sf_ctrl_pool */
num_sf_ctrl_by_msix = DIV_ROUND_UP(sf_vec, MLX5_COMP_EQS_PER_SF);
num_sf_ctrl_by_sfs = DIV_ROUND_UP(mlx5_sf_max_functions(dev),
MLX5_SFS_PER_CTRL_IRQ);
num_sf_ctrl = min_t(int, num_sf_ctrl_by_msix, num_sf_ctrl_by_sfs);
num_sf_ctrl = min_t(int, MLX5_IRQ_CTRL_SF_MAX, num_sf_ctrl);
table->sf_ctrl_pool = irq_pool_alloc(dev, pcif_vec, num_sf_ctrl,
"mlx5_sf_ctrl",
MLX5_EQ_SHARE_IRQ_MIN_CTRL,
MLX5_EQ_SHARE_IRQ_MAX_CTRL);
if (IS_ERR(table->sf_ctrl_pool)) {
err = PTR_ERR(table->sf_ctrl_pool);
goto err_pf;
}
/* init sf_comp_pool */
table->sf_comp_pool = irq_pool_alloc(dev, pcif_vec + num_sf_ctrl,
sf_vec - num_sf_ctrl, "mlx5_sf_comp",
MLX5_EQ_SHARE_IRQ_MIN_COMP,
MLX5_EQ_SHARE_IRQ_MAX_COMP);
if (IS_ERR(table->sf_comp_pool)) {
err = PTR_ERR(table->sf_comp_pool);
goto err_sf_ctrl;
}
table->sf_comp_pool->irqs_per_cpu = kcalloc(nr_cpu_ids, sizeof(u16), GFP_KERNEL);
if (!table->sf_comp_pool->irqs_per_cpu) {
err = -ENOMEM;
goto err_irqs_per_cpu;
}
return 0;
err_irqs_per_cpu:
irq_pool_free(table->sf_comp_pool);
err_sf_ctrl:
irq_pool_free(table->sf_ctrl_pool);
err_pf:
irq_pool_free(table->pcif_pool);
return err;
}
static void irq_pools_destroy(struct mlx5_irq_table *table)
{
if (table->sf_ctrl_pool) {
irq_pool_free(table->sf_comp_pool);
irq_pool_free(table->sf_ctrl_pool);
}
irq_pool_free(table->pcif_pool);
}
net/mlx5: Free irqs only on shutdown callback Whenever a shutdown is invoked, free irqs only and keep mlx5_irq synthetic wrapper intact in order to avoid use-after-free on system shutdown. for example: ================================================================== BUG: KASAN: use-after-free in _find_first_bit+0x66/0x80 Read of size 8 at addr ffff88823fc0d318 by task kworker/u192:0/13608 CPU: 25 PID: 13608 Comm: kworker/u192:0 Tainted: G B W O 6.1.21-cloudflare-kasan-2023.3.21 #1 Hardware name: GIGABYTE R162-R2-GEN0/MZ12-HD2-CD, BIOS R14 05/03/2021 Workqueue: mlx5e mlx5e_tx_timeout_work [mlx5_core] Call Trace: <TASK> dump_stack_lvl+0x34/0x48 print_report+0x170/0x473 ? _find_first_bit+0x66/0x80 kasan_report+0xad/0x130 ? _find_first_bit+0x66/0x80 _find_first_bit+0x66/0x80 mlx5e_open_channels+0x3c5/0x3a10 [mlx5_core] ? console_unlock+0x2fa/0x430 ? _raw_spin_lock_irqsave+0x8d/0xf0 ? _raw_spin_unlock_irqrestore+0x42/0x80 ? preempt_count_add+0x7d/0x150 ? __wake_up_klogd.part.0+0x7d/0xc0 ? vprintk_emit+0xfe/0x2c0 ? mlx5e_trigger_napi_sched+0x40/0x40 [mlx5_core] ? dev_attr_show.cold+0x35/0x35 ? devlink_health_do_dump.part.0+0x174/0x340 ? devlink_health_report+0x504/0x810 ? mlx5e_reporter_tx_timeout+0x29d/0x3a0 [mlx5_core] ? mlx5e_tx_timeout_work+0x17c/0x230 [mlx5_core] ? process_one_work+0x680/0x1050 mlx5e_safe_switch_params+0x156/0x220 [mlx5_core] ? mlx5e_switch_priv_channels+0x310/0x310 [mlx5_core] ? mlx5_eq_poll_irq_disabled+0xb6/0x100 [mlx5_core] mlx5e_tx_reporter_timeout_recover+0x123/0x240 [mlx5_core] ? __mutex_unlock_slowpath.constprop.0+0x2b0/0x2b0 devlink_health_reporter_recover+0xa6/0x1f0 devlink_health_report+0x2f7/0x810 ? vsnprintf+0x854/0x15e0 mlx5e_reporter_tx_timeout+0x29d/0x3a0 [mlx5_core] ? mlx5e_reporter_tx_err_cqe+0x1a0/0x1a0 [mlx5_core] ? mlx5e_tx_reporter_timeout_dump+0x50/0x50 [mlx5_core] ? mlx5e_tx_reporter_dump_sq+0x260/0x260 [mlx5_core] ? newidle_balance+0x9b7/0xe30 ? psi_group_change+0x6a7/0xb80 ? mutex_lock+0x96/0xf0 ? __mutex_lock_slowpath+0x10/0x10 mlx5e_tx_timeout_work+0x17c/0x230 [mlx5_core] process_one_work+0x680/0x1050 worker_thread+0x5a0/0xeb0 ? process_one_work+0x1050/0x1050 kthread+0x2a2/0x340 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x22/0x30 </TASK> Freed by task 1: kasan_save_stack+0x23/0x50 kasan_set_track+0x21/0x30 kasan_save_free_info+0x2a/0x40 ____kasan_slab_free+0x169/0x1d0 slab_free_freelist_hook+0xd2/0x190 __kmem_cache_free+0x1a1/0x2f0 irq_pool_free+0x138/0x200 [mlx5_core] mlx5_irq_table_destroy+0xf6/0x170 [mlx5_core] mlx5_core_eq_free_irqs+0x74/0xf0 [mlx5_core] shutdown+0x194/0x1aa [mlx5_core] pci_device_shutdown+0x75/0x120 device_shutdown+0x35c/0x620 kernel_restart+0x60/0xa0 __do_sys_reboot+0x1cb/0x2c0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x4b/0xb5 The buggy address belongs to the object at ffff88823fc0d300 which belongs to the cache kmalloc-192 of size 192 The buggy address is located 24 bytes inside of 192-byte region [ffff88823fc0d300, ffff88823fc0d3c0) The buggy address belongs to the physical page: page:0000000010139587 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x23fc0c head:0000000010139587 order:1 compound_mapcount:0 compound_pincount:0 flags: 0x2ffff800010200(slab|head|node=0|zone=2|lastcpupid=0x1ffff) raw: 002ffff800010200 0000000000000000 dead000000000122 ffff88810004ca00 raw: 0000000000000000 0000000000200020 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff88823fc0d200: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88823fc0d280: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc >ffff88823fc0d300: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff88823fc0d380: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc ffff88823fc0d400: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ================================================================== general protection fault, probably for non-canonical address 0xdffffc005c40d7ac: 0000 [#1] PREEMPT SMP KASAN NOPTI KASAN: probably user-memory-access in range [0x00000002e206bd60-0x00000002e206bd67] CPU: 25 PID: 13608 Comm: kworker/u192:0 Tainted: G B W O 6.1.21-cloudflare-kasan-2023.3.21 #1 Hardware name: GIGABYTE R162-R2-GEN0/MZ12-HD2-CD, BIOS R14 05/03/2021 Workqueue: mlx5e mlx5e_tx_timeout_work [mlx5_core] RIP: 0010:__alloc_pages+0x141/0x5c0 Call Trace: <TASK> ? sysvec_apic_timer_interrupt+0xa0/0xc0 ? asm_sysvec_apic_timer_interrupt+0x16/0x20 ? __alloc_pages_slowpath.constprop.0+0x1ec0/0x1ec0 ? _raw_spin_unlock_irqrestore+0x3d/0x80 __kmalloc_large_node+0x80/0x120 ? kvmalloc_node+0x4e/0x170 __kmalloc_node+0xd4/0x150 kvmalloc_node+0x4e/0x170 mlx5e_open_channels+0x631/0x3a10 [mlx5_core] ? console_unlock+0x2fa/0x430 ? _raw_spin_lock_irqsave+0x8d/0xf0 ? _raw_spin_unlock_irqrestore+0x42/0x80 ? preempt_count_add+0x7d/0x150 ? __wake_up_klogd.part.0+0x7d/0xc0 ? vprintk_emit+0xfe/0x2c0 ? mlx5e_trigger_napi_sched+0x40/0x40 [mlx5_core] ? dev_attr_show.cold+0x35/0x35 ? devlink_health_do_dump.part.0+0x174/0x340 ? devlink_health_report+0x504/0x810 ? mlx5e_reporter_tx_timeout+0x29d/0x3a0 [mlx5_core] ? mlx5e_tx_timeout_work+0x17c/0x230 [mlx5_core] ? process_one_work+0x680/0x1050 mlx5e_safe_switch_params+0x156/0x220 [mlx5_core] ? mlx5e_switch_priv_channels+0x310/0x310 [mlx5_core] ? mlx5_eq_poll_irq_disabled+0xb6/0x100 [mlx5_core] mlx5e_tx_reporter_timeout_recover+0x123/0x240 [mlx5_core] ? __mutex_unlock_slowpath.constprop.0+0x2b0/0x2b0 devlink_health_reporter_recover+0xa6/0x1f0 devlink_health_report+0x2f7/0x810 ? vsnprintf+0x854/0x15e0 mlx5e_reporter_tx_timeout+0x29d/0x3a0 [mlx5_core] ? mlx5e_reporter_tx_err_cqe+0x1a0/0x1a0 [mlx5_core] ? mlx5e_tx_reporter_timeout_dump+0x50/0x50 [mlx5_core] ? mlx5e_tx_reporter_dump_sq+0x260/0x260 [mlx5_core] ? newidle_balance+0x9b7/0xe30 ? psi_group_change+0x6a7/0xb80 ? mutex_lock+0x96/0xf0 ? __mutex_lock_slowpath+0x10/0x10 mlx5e_tx_timeout_work+0x17c/0x230 [mlx5_core] process_one_work+0x680/0x1050 worker_thread+0x5a0/0xeb0 ? process_one_work+0x1050/0x1050 kthread+0x2a2/0x340 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x22/0x30 </TASK> ---[ end trace 0000000000000000 ]--- RIP: 0010:__alloc_pages+0x141/0x5c0 Code: e0 39 a3 96 89 e9 b8 22 01 32 01 83 e1 0f 48 89 fa 01 c9 48 c1 ea 03 d3 f8 83 e0 03 89 44 24 6c 48 b8 00 00 00 00 00 fc ff df <80> 3c 02 00 0f 85 fc 03 00 00 89 e8 4a 8b 14 f5 e0 39 a3 96 4c 89 RSP: 0018:ffff888251f0f438 EFLAGS: 00010202 RAX: dffffc0000000000 RBX: 1ffff1104a3e1e8b RCX: 0000000000000000 RDX: 000000005c40d7ac RSI: 0000000000000003 RDI: 00000002e206bd60 RBP: 0000000000052dc0 R08: ffff8882b0044218 R09: ffff8882b0045e8a R10: fffffbfff300fefc R11: ffff888167af4000 R12: 0000000000000003 R13: 0000000000000000 R14: 00000000696c7070 R15: ffff8882373f4380 FS: 0000000000000000(0000) GS:ffff88bf2be80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00005641d031eee8 CR3: 0000002e7ca14000 CR4: 0000000000350ee0 Kernel panic - not syncing: Fatal exception Kernel Offset: 0x11000000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) ---[ end Kernel panic - not syncing: Fatal exception ]---] Reported-by: Frederick Lawler <fred@cloudflare.com> Link: https://lore.kernel.org/netdev/be5b9271-7507-19c5-ded1-fa78f1980e69@cloudflare.com Signed-off-by: Shay Drory <shayd@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
2023-04-13 19:15:31 +00:00
static void mlx5_irq_pool_free_irqs(struct mlx5_irq_pool *pool)
{
struct mlx5_irq *irq;
unsigned long index;
xa_for_each(&pool->irqs, index, irq)
net/mlx5: Free IRQ rmap and notifier on kernel shutdown The kernel IRQ system needs the irq affinity notifier to be clear before attempting to free the irq, see WARN_ON log below. On a normal driver unload we don't have this issue since we do the complete cleanup of the irq resources. To fix this, put the important resources cleanup in a helper function and use it in both normal driver unload and shutdown flows. [ 4497.498434] ------------[ cut here ]------------ [ 4497.498726] WARNING: CPU: 0 PID: 9 at kernel/irq/manage.c:2034 free_irq+0x295/0x340 [ 4497.499193] Modules linked in: [ 4497.499386] CPU: 0 PID: 9 Comm: kworker/0:1 Tainted: G W 6.4.0-rc4+ #10 [ 4497.499876] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-1.fc38 04/01/2014 [ 4497.500518] Workqueue: events do_poweroff [ 4497.500849] RIP: 0010:free_irq+0x295/0x340 [ 4497.501132] Code: 85 c0 0f 84 1d ff ff ff 48 89 ef ff d0 0f 1f 00 e9 10 ff ff ff 0f 0b e9 72 ff ff ff 49 8d 7f 28 ff d0 0f 1f 00 e9 df fd ff ff <0f> 0b 48 c7 80 c0 008 [ 4497.502269] RSP: 0018:ffffc90000053da0 EFLAGS: 00010282 [ 4497.502589] RAX: ffff888100949600 RBX: ffff88810330b948 RCX: 0000000000000000 [ 4497.503035] RDX: ffff888100949600 RSI: ffff888100400490 RDI: 0000000000000023 [ 4497.503472] RBP: ffff88810330c7e0 R08: ffff8881004005d0 R09: ffffffff8273a260 [ 4497.503923] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8881009ae000 [ 4497.504359] R13: ffff8881009ae148 R14: 0000000000000000 R15: ffff888100949600 [ 4497.504804] FS: 0000000000000000(0000) GS:ffff88813bc00000(0000) knlGS:0000000000000000 [ 4497.505302] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 4497.505671] CR2: 00007fce98806298 CR3: 000000000262e005 CR4: 0000000000370ef0 [ 4497.506104] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 4497.506540] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 4497.507002] Call Trace: [ 4497.507158] <TASK> [ 4497.507299] ? free_irq+0x295/0x340 [ 4497.507522] ? __warn+0x7c/0x130 [ 4497.507740] ? free_irq+0x295/0x340 [ 4497.507963] ? report_bug+0x171/0x1a0 [ 4497.508197] ? handle_bug+0x3c/0x70 [ 4497.508417] ? exc_invalid_op+0x17/0x70 [ 4497.508662] ? asm_exc_invalid_op+0x1a/0x20 [ 4497.508926] ? free_irq+0x295/0x340 [ 4497.509146] mlx5_irq_pool_free_irqs+0x48/0x90 [ 4497.509421] mlx5_irq_table_free_irqs+0x38/0x50 [ 4497.509714] mlx5_core_eq_free_irqs+0x27/0x40 [ 4497.509984] shutdown+0x7b/0x100 [ 4497.510184] pci_device_shutdown+0x30/0x60 [ 4497.510440] device_shutdown+0x14d/0x240 [ 4497.510698] kernel_power_off+0x30/0x70 [ 4497.510938] process_one_work+0x1e6/0x3e0 [ 4497.511183] worker_thread+0x49/0x3b0 [ 4497.511407] ? __pfx_worker_thread+0x10/0x10 [ 4497.511679] kthread+0xe0/0x110 [ 4497.511879] ? __pfx_kthread+0x10/0x10 [ 4497.512114] ret_from_fork+0x29/0x50 [ 4497.512342] </TASK> Fixes: 9c2d08010963 ("net/mlx5: Free irqs only on shutdown callback") Signed-off-by: Saeed Mahameed <saeedm@nvidia.com> Reviewed-by: Shay Drory <shayd@nvidia.com>
2023-06-08 19:00:54 +00:00
mlx5_system_free_irq(irq);
net/mlx5: Free irqs only on shutdown callback Whenever a shutdown is invoked, free irqs only and keep mlx5_irq synthetic wrapper intact in order to avoid use-after-free on system shutdown. for example: ================================================================== BUG: KASAN: use-after-free in _find_first_bit+0x66/0x80 Read of size 8 at addr ffff88823fc0d318 by task kworker/u192:0/13608 CPU: 25 PID: 13608 Comm: kworker/u192:0 Tainted: G B W O 6.1.21-cloudflare-kasan-2023.3.21 #1 Hardware name: GIGABYTE R162-R2-GEN0/MZ12-HD2-CD, BIOS R14 05/03/2021 Workqueue: mlx5e mlx5e_tx_timeout_work [mlx5_core] Call Trace: <TASK> dump_stack_lvl+0x34/0x48 print_report+0x170/0x473 ? _find_first_bit+0x66/0x80 kasan_report+0xad/0x130 ? _find_first_bit+0x66/0x80 _find_first_bit+0x66/0x80 mlx5e_open_channels+0x3c5/0x3a10 [mlx5_core] ? console_unlock+0x2fa/0x430 ? _raw_spin_lock_irqsave+0x8d/0xf0 ? _raw_spin_unlock_irqrestore+0x42/0x80 ? preempt_count_add+0x7d/0x150 ? __wake_up_klogd.part.0+0x7d/0xc0 ? vprintk_emit+0xfe/0x2c0 ? mlx5e_trigger_napi_sched+0x40/0x40 [mlx5_core] ? dev_attr_show.cold+0x35/0x35 ? devlink_health_do_dump.part.0+0x174/0x340 ? devlink_health_report+0x504/0x810 ? mlx5e_reporter_tx_timeout+0x29d/0x3a0 [mlx5_core] ? mlx5e_tx_timeout_work+0x17c/0x230 [mlx5_core] ? process_one_work+0x680/0x1050 mlx5e_safe_switch_params+0x156/0x220 [mlx5_core] ? mlx5e_switch_priv_channels+0x310/0x310 [mlx5_core] ? mlx5_eq_poll_irq_disabled+0xb6/0x100 [mlx5_core] mlx5e_tx_reporter_timeout_recover+0x123/0x240 [mlx5_core] ? __mutex_unlock_slowpath.constprop.0+0x2b0/0x2b0 devlink_health_reporter_recover+0xa6/0x1f0 devlink_health_report+0x2f7/0x810 ? vsnprintf+0x854/0x15e0 mlx5e_reporter_tx_timeout+0x29d/0x3a0 [mlx5_core] ? mlx5e_reporter_tx_err_cqe+0x1a0/0x1a0 [mlx5_core] ? mlx5e_tx_reporter_timeout_dump+0x50/0x50 [mlx5_core] ? mlx5e_tx_reporter_dump_sq+0x260/0x260 [mlx5_core] ? newidle_balance+0x9b7/0xe30 ? psi_group_change+0x6a7/0xb80 ? mutex_lock+0x96/0xf0 ? __mutex_lock_slowpath+0x10/0x10 mlx5e_tx_timeout_work+0x17c/0x230 [mlx5_core] process_one_work+0x680/0x1050 worker_thread+0x5a0/0xeb0 ? process_one_work+0x1050/0x1050 kthread+0x2a2/0x340 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x22/0x30 </TASK> Freed by task 1: kasan_save_stack+0x23/0x50 kasan_set_track+0x21/0x30 kasan_save_free_info+0x2a/0x40 ____kasan_slab_free+0x169/0x1d0 slab_free_freelist_hook+0xd2/0x190 __kmem_cache_free+0x1a1/0x2f0 irq_pool_free+0x138/0x200 [mlx5_core] mlx5_irq_table_destroy+0xf6/0x170 [mlx5_core] mlx5_core_eq_free_irqs+0x74/0xf0 [mlx5_core] shutdown+0x194/0x1aa [mlx5_core] pci_device_shutdown+0x75/0x120 device_shutdown+0x35c/0x620 kernel_restart+0x60/0xa0 __do_sys_reboot+0x1cb/0x2c0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x4b/0xb5 The buggy address belongs to the object at ffff88823fc0d300 which belongs to the cache kmalloc-192 of size 192 The buggy address is located 24 bytes inside of 192-byte region [ffff88823fc0d300, ffff88823fc0d3c0) The buggy address belongs to the physical page: page:0000000010139587 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x23fc0c head:0000000010139587 order:1 compound_mapcount:0 compound_pincount:0 flags: 0x2ffff800010200(slab|head|node=0|zone=2|lastcpupid=0x1ffff) raw: 002ffff800010200 0000000000000000 dead000000000122 ffff88810004ca00 raw: 0000000000000000 0000000000200020 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff88823fc0d200: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88823fc0d280: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc >ffff88823fc0d300: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff88823fc0d380: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc ffff88823fc0d400: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ================================================================== general protection fault, probably for non-canonical address 0xdffffc005c40d7ac: 0000 [#1] PREEMPT SMP KASAN NOPTI KASAN: probably user-memory-access in range [0x00000002e206bd60-0x00000002e206bd67] CPU: 25 PID: 13608 Comm: kworker/u192:0 Tainted: G B W O 6.1.21-cloudflare-kasan-2023.3.21 #1 Hardware name: GIGABYTE R162-R2-GEN0/MZ12-HD2-CD, BIOS R14 05/03/2021 Workqueue: mlx5e mlx5e_tx_timeout_work [mlx5_core] RIP: 0010:__alloc_pages+0x141/0x5c0 Call Trace: <TASK> ? sysvec_apic_timer_interrupt+0xa0/0xc0 ? asm_sysvec_apic_timer_interrupt+0x16/0x20 ? __alloc_pages_slowpath.constprop.0+0x1ec0/0x1ec0 ? _raw_spin_unlock_irqrestore+0x3d/0x80 __kmalloc_large_node+0x80/0x120 ? kvmalloc_node+0x4e/0x170 __kmalloc_node+0xd4/0x150 kvmalloc_node+0x4e/0x170 mlx5e_open_channels+0x631/0x3a10 [mlx5_core] ? console_unlock+0x2fa/0x430 ? _raw_spin_lock_irqsave+0x8d/0xf0 ? _raw_spin_unlock_irqrestore+0x42/0x80 ? preempt_count_add+0x7d/0x150 ? __wake_up_klogd.part.0+0x7d/0xc0 ? vprintk_emit+0xfe/0x2c0 ? mlx5e_trigger_napi_sched+0x40/0x40 [mlx5_core] ? dev_attr_show.cold+0x35/0x35 ? devlink_health_do_dump.part.0+0x174/0x340 ? devlink_health_report+0x504/0x810 ? mlx5e_reporter_tx_timeout+0x29d/0x3a0 [mlx5_core] ? mlx5e_tx_timeout_work+0x17c/0x230 [mlx5_core] ? process_one_work+0x680/0x1050 mlx5e_safe_switch_params+0x156/0x220 [mlx5_core] ? mlx5e_switch_priv_channels+0x310/0x310 [mlx5_core] ? mlx5_eq_poll_irq_disabled+0xb6/0x100 [mlx5_core] mlx5e_tx_reporter_timeout_recover+0x123/0x240 [mlx5_core] ? __mutex_unlock_slowpath.constprop.0+0x2b0/0x2b0 devlink_health_reporter_recover+0xa6/0x1f0 devlink_health_report+0x2f7/0x810 ? vsnprintf+0x854/0x15e0 mlx5e_reporter_tx_timeout+0x29d/0x3a0 [mlx5_core] ? mlx5e_reporter_tx_err_cqe+0x1a0/0x1a0 [mlx5_core] ? mlx5e_tx_reporter_timeout_dump+0x50/0x50 [mlx5_core] ? mlx5e_tx_reporter_dump_sq+0x260/0x260 [mlx5_core] ? newidle_balance+0x9b7/0xe30 ? psi_group_change+0x6a7/0xb80 ? mutex_lock+0x96/0xf0 ? __mutex_lock_slowpath+0x10/0x10 mlx5e_tx_timeout_work+0x17c/0x230 [mlx5_core] process_one_work+0x680/0x1050 worker_thread+0x5a0/0xeb0 ? process_one_work+0x1050/0x1050 kthread+0x2a2/0x340 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x22/0x30 </TASK> ---[ end trace 0000000000000000 ]--- RIP: 0010:__alloc_pages+0x141/0x5c0 Code: e0 39 a3 96 89 e9 b8 22 01 32 01 83 e1 0f 48 89 fa 01 c9 48 c1 ea 03 d3 f8 83 e0 03 89 44 24 6c 48 b8 00 00 00 00 00 fc ff df <80> 3c 02 00 0f 85 fc 03 00 00 89 e8 4a 8b 14 f5 e0 39 a3 96 4c 89 RSP: 0018:ffff888251f0f438 EFLAGS: 00010202 RAX: dffffc0000000000 RBX: 1ffff1104a3e1e8b RCX: 0000000000000000 RDX: 000000005c40d7ac RSI: 0000000000000003 RDI: 00000002e206bd60 RBP: 0000000000052dc0 R08: ffff8882b0044218 R09: ffff8882b0045e8a R10: fffffbfff300fefc R11: ffff888167af4000 R12: 0000000000000003 R13: 0000000000000000 R14: 00000000696c7070 R15: ffff8882373f4380 FS: 0000000000000000(0000) GS:ffff88bf2be80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00005641d031eee8 CR3: 0000002e7ca14000 CR4: 0000000000350ee0 Kernel panic - not syncing: Fatal exception Kernel Offset: 0x11000000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) ---[ end Kernel panic - not syncing: Fatal exception ]---] Reported-by: Frederick Lawler <fred@cloudflare.com> Link: https://lore.kernel.org/netdev/be5b9271-7507-19c5-ded1-fa78f1980e69@cloudflare.com Signed-off-by: Shay Drory <shayd@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
2023-04-13 19:15:31 +00:00
}
static void mlx5_irq_pools_free_irqs(struct mlx5_irq_table *table)
{
if (table->sf_ctrl_pool) {
mlx5_irq_pool_free_irqs(table->sf_comp_pool);
mlx5_irq_pool_free_irqs(table->sf_ctrl_pool);
}
mlx5_irq_pool_free_irqs(table->pcif_pool);
}
/* irq_table API */
int mlx5_irq_table_init(struct mlx5_core_dev *dev)
{
struct mlx5_irq_table *irq_table;
if (mlx5_core_is_sf(dev))
return 0;
irq_table = kvzalloc_node(sizeof(*irq_table), GFP_KERNEL,
dev->priv.numa_node);
if (!irq_table)
return -ENOMEM;
dev->priv.irq_table = irq_table;
return 0;
}
void mlx5_irq_table_cleanup(struct mlx5_core_dev *dev)
{
if (mlx5_core_is_sf(dev))
return;
kvfree(dev->priv.irq_table);
}
int mlx5_irq_table_get_num_comp(struct mlx5_irq_table *table)
{
if (!table->pcif_pool->xa_num_irqs.max)
return 1;
return table->pcif_pool->xa_num_irqs.max - table->pcif_pool->xa_num_irqs.min;
}
int mlx5_irq_table_create(struct mlx5_core_dev *dev)
{
int num_eqs = MLX5_CAP_GEN(dev, max_num_eqs) ?
MLX5_CAP_GEN(dev, max_num_eqs) :
1 << MLX5_CAP_GEN(dev, log_max_eq);
int total_vec;
int pcif_vec;
int req_vec;
int err;
int n;
if (mlx5_core_is_sf(dev))
return 0;
pcif_vec = MLX5_CAP_GEN(dev, num_ports) * num_online_cpus() + 1;
pcif_vec = min_t(int, pcif_vec, num_eqs);
total_vec = pcif_vec;
if (mlx5_sf_max_functions(dev))
total_vec += MLX5_IRQ_CTRL_SF_MAX +
MLX5_COMP_EQS_PER_SF * mlx5_sf_max_functions(dev);
total_vec = min_t(int, total_vec, pci_msix_vec_count(dev->pdev));
pcif_vec = min_t(int, pcif_vec, pci_msix_vec_count(dev->pdev));
req_vec = pci_msix_can_alloc_dyn(dev->pdev) ? 1 : total_vec;
n = pci_alloc_irq_vectors(dev->pdev, 1, req_vec, PCI_IRQ_MSIX);
if (n < 0)
return n;
err = irq_pools_init(dev, total_vec - pcif_vec, pcif_vec);
if (err)
pci_free_irq_vectors(dev->pdev);
return err;
}
void mlx5_irq_table_destroy(struct mlx5_core_dev *dev)
{
struct mlx5_irq_table *table = dev->priv.irq_table;
if (mlx5_core_is_sf(dev))
return;
/* There are cases where IRQs still will be in used when we reaching
* to here. Hence, making sure all the irqs are released.
*/
irq_pools_destroy(table);
pci_free_irq_vectors(dev->pdev);
}
net/mlx5: Free irqs only on shutdown callback Whenever a shutdown is invoked, free irqs only and keep mlx5_irq synthetic wrapper intact in order to avoid use-after-free on system shutdown. for example: ================================================================== BUG: KASAN: use-after-free in _find_first_bit+0x66/0x80 Read of size 8 at addr ffff88823fc0d318 by task kworker/u192:0/13608 CPU: 25 PID: 13608 Comm: kworker/u192:0 Tainted: G B W O 6.1.21-cloudflare-kasan-2023.3.21 #1 Hardware name: GIGABYTE R162-R2-GEN0/MZ12-HD2-CD, BIOS R14 05/03/2021 Workqueue: mlx5e mlx5e_tx_timeout_work [mlx5_core] Call Trace: <TASK> dump_stack_lvl+0x34/0x48 print_report+0x170/0x473 ? _find_first_bit+0x66/0x80 kasan_report+0xad/0x130 ? _find_first_bit+0x66/0x80 _find_first_bit+0x66/0x80 mlx5e_open_channels+0x3c5/0x3a10 [mlx5_core] ? console_unlock+0x2fa/0x430 ? _raw_spin_lock_irqsave+0x8d/0xf0 ? _raw_spin_unlock_irqrestore+0x42/0x80 ? preempt_count_add+0x7d/0x150 ? __wake_up_klogd.part.0+0x7d/0xc0 ? vprintk_emit+0xfe/0x2c0 ? mlx5e_trigger_napi_sched+0x40/0x40 [mlx5_core] ? dev_attr_show.cold+0x35/0x35 ? devlink_health_do_dump.part.0+0x174/0x340 ? devlink_health_report+0x504/0x810 ? mlx5e_reporter_tx_timeout+0x29d/0x3a0 [mlx5_core] ? mlx5e_tx_timeout_work+0x17c/0x230 [mlx5_core] ? process_one_work+0x680/0x1050 mlx5e_safe_switch_params+0x156/0x220 [mlx5_core] ? mlx5e_switch_priv_channels+0x310/0x310 [mlx5_core] ? mlx5_eq_poll_irq_disabled+0xb6/0x100 [mlx5_core] mlx5e_tx_reporter_timeout_recover+0x123/0x240 [mlx5_core] ? __mutex_unlock_slowpath.constprop.0+0x2b0/0x2b0 devlink_health_reporter_recover+0xa6/0x1f0 devlink_health_report+0x2f7/0x810 ? vsnprintf+0x854/0x15e0 mlx5e_reporter_tx_timeout+0x29d/0x3a0 [mlx5_core] ? mlx5e_reporter_tx_err_cqe+0x1a0/0x1a0 [mlx5_core] ? mlx5e_tx_reporter_timeout_dump+0x50/0x50 [mlx5_core] ? mlx5e_tx_reporter_dump_sq+0x260/0x260 [mlx5_core] ? newidle_balance+0x9b7/0xe30 ? psi_group_change+0x6a7/0xb80 ? mutex_lock+0x96/0xf0 ? __mutex_lock_slowpath+0x10/0x10 mlx5e_tx_timeout_work+0x17c/0x230 [mlx5_core] process_one_work+0x680/0x1050 worker_thread+0x5a0/0xeb0 ? process_one_work+0x1050/0x1050 kthread+0x2a2/0x340 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x22/0x30 </TASK> Freed by task 1: kasan_save_stack+0x23/0x50 kasan_set_track+0x21/0x30 kasan_save_free_info+0x2a/0x40 ____kasan_slab_free+0x169/0x1d0 slab_free_freelist_hook+0xd2/0x190 __kmem_cache_free+0x1a1/0x2f0 irq_pool_free+0x138/0x200 [mlx5_core] mlx5_irq_table_destroy+0xf6/0x170 [mlx5_core] mlx5_core_eq_free_irqs+0x74/0xf0 [mlx5_core] shutdown+0x194/0x1aa [mlx5_core] pci_device_shutdown+0x75/0x120 device_shutdown+0x35c/0x620 kernel_restart+0x60/0xa0 __do_sys_reboot+0x1cb/0x2c0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x4b/0xb5 The buggy address belongs to the object at ffff88823fc0d300 which belongs to the cache kmalloc-192 of size 192 The buggy address is located 24 bytes inside of 192-byte region [ffff88823fc0d300, ffff88823fc0d3c0) The buggy address belongs to the physical page: page:0000000010139587 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x23fc0c head:0000000010139587 order:1 compound_mapcount:0 compound_pincount:0 flags: 0x2ffff800010200(slab|head|node=0|zone=2|lastcpupid=0x1ffff) raw: 002ffff800010200 0000000000000000 dead000000000122 ffff88810004ca00 raw: 0000000000000000 0000000000200020 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff88823fc0d200: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88823fc0d280: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc >ffff88823fc0d300: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff88823fc0d380: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc ffff88823fc0d400: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ================================================================== general protection fault, probably for non-canonical address 0xdffffc005c40d7ac: 0000 [#1] PREEMPT SMP KASAN NOPTI KASAN: probably user-memory-access in range [0x00000002e206bd60-0x00000002e206bd67] CPU: 25 PID: 13608 Comm: kworker/u192:0 Tainted: G B W O 6.1.21-cloudflare-kasan-2023.3.21 #1 Hardware name: GIGABYTE R162-R2-GEN0/MZ12-HD2-CD, BIOS R14 05/03/2021 Workqueue: mlx5e mlx5e_tx_timeout_work [mlx5_core] RIP: 0010:__alloc_pages+0x141/0x5c0 Call Trace: <TASK> ? sysvec_apic_timer_interrupt+0xa0/0xc0 ? asm_sysvec_apic_timer_interrupt+0x16/0x20 ? __alloc_pages_slowpath.constprop.0+0x1ec0/0x1ec0 ? _raw_spin_unlock_irqrestore+0x3d/0x80 __kmalloc_large_node+0x80/0x120 ? kvmalloc_node+0x4e/0x170 __kmalloc_node+0xd4/0x150 kvmalloc_node+0x4e/0x170 mlx5e_open_channels+0x631/0x3a10 [mlx5_core] ? console_unlock+0x2fa/0x430 ? _raw_spin_lock_irqsave+0x8d/0xf0 ? _raw_spin_unlock_irqrestore+0x42/0x80 ? preempt_count_add+0x7d/0x150 ? __wake_up_klogd.part.0+0x7d/0xc0 ? vprintk_emit+0xfe/0x2c0 ? mlx5e_trigger_napi_sched+0x40/0x40 [mlx5_core] ? dev_attr_show.cold+0x35/0x35 ? devlink_health_do_dump.part.0+0x174/0x340 ? devlink_health_report+0x504/0x810 ? mlx5e_reporter_tx_timeout+0x29d/0x3a0 [mlx5_core] ? mlx5e_tx_timeout_work+0x17c/0x230 [mlx5_core] ? process_one_work+0x680/0x1050 mlx5e_safe_switch_params+0x156/0x220 [mlx5_core] ? mlx5e_switch_priv_channels+0x310/0x310 [mlx5_core] ? mlx5_eq_poll_irq_disabled+0xb6/0x100 [mlx5_core] mlx5e_tx_reporter_timeout_recover+0x123/0x240 [mlx5_core] ? __mutex_unlock_slowpath.constprop.0+0x2b0/0x2b0 devlink_health_reporter_recover+0xa6/0x1f0 devlink_health_report+0x2f7/0x810 ? vsnprintf+0x854/0x15e0 mlx5e_reporter_tx_timeout+0x29d/0x3a0 [mlx5_core] ? mlx5e_reporter_tx_err_cqe+0x1a0/0x1a0 [mlx5_core] ? mlx5e_tx_reporter_timeout_dump+0x50/0x50 [mlx5_core] ? mlx5e_tx_reporter_dump_sq+0x260/0x260 [mlx5_core] ? newidle_balance+0x9b7/0xe30 ? psi_group_change+0x6a7/0xb80 ? mutex_lock+0x96/0xf0 ? __mutex_lock_slowpath+0x10/0x10 mlx5e_tx_timeout_work+0x17c/0x230 [mlx5_core] process_one_work+0x680/0x1050 worker_thread+0x5a0/0xeb0 ? process_one_work+0x1050/0x1050 kthread+0x2a2/0x340 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x22/0x30 </TASK> ---[ end trace 0000000000000000 ]--- RIP: 0010:__alloc_pages+0x141/0x5c0 Code: e0 39 a3 96 89 e9 b8 22 01 32 01 83 e1 0f 48 89 fa 01 c9 48 c1 ea 03 d3 f8 83 e0 03 89 44 24 6c 48 b8 00 00 00 00 00 fc ff df <80> 3c 02 00 0f 85 fc 03 00 00 89 e8 4a 8b 14 f5 e0 39 a3 96 4c 89 RSP: 0018:ffff888251f0f438 EFLAGS: 00010202 RAX: dffffc0000000000 RBX: 1ffff1104a3e1e8b RCX: 0000000000000000 RDX: 000000005c40d7ac RSI: 0000000000000003 RDI: 00000002e206bd60 RBP: 0000000000052dc0 R08: ffff8882b0044218 R09: ffff8882b0045e8a R10: fffffbfff300fefc R11: ffff888167af4000 R12: 0000000000000003 R13: 0000000000000000 R14: 00000000696c7070 R15: ffff8882373f4380 FS: 0000000000000000(0000) GS:ffff88bf2be80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00005641d031eee8 CR3: 0000002e7ca14000 CR4: 0000000000350ee0 Kernel panic - not syncing: Fatal exception Kernel Offset: 0x11000000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) ---[ end Kernel panic - not syncing: Fatal exception ]---] Reported-by: Frederick Lawler <fred@cloudflare.com> Link: https://lore.kernel.org/netdev/be5b9271-7507-19c5-ded1-fa78f1980e69@cloudflare.com Signed-off-by: Shay Drory <shayd@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
2023-04-13 19:15:31 +00:00
void mlx5_irq_table_free_irqs(struct mlx5_core_dev *dev)
{
struct mlx5_irq_table *table = dev->priv.irq_table;
if (mlx5_core_is_sf(dev))
return;
mlx5_irq_pools_free_irqs(table);
pci_free_irq_vectors(dev->pdev);
}
int mlx5_irq_table_get_sfs_vec(struct mlx5_irq_table *table)
{
if (table->sf_comp_pool)
return min_t(int, num_online_cpus(),
table->sf_comp_pool->xa_num_irqs.max -
table->sf_comp_pool->xa_num_irqs.min + 1);
else
return mlx5_irq_table_get_num_comp(table);
}
struct mlx5_irq_table *mlx5_irq_table_get(struct mlx5_core_dev *dev)
{
#ifdef CONFIG_MLX5_SF
if (mlx5_core_is_sf(dev))
return dev->priv.parent_mdev->priv.irq_table;
#endif
return dev->priv.irq_table;
}