linux-stable/include/linux/mlx5/driver.h

1297 lines
34 KiB
C
Raw Normal View History

/*
* Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#ifndef MLX5_DRIVER_H
#define MLX5_DRIVER_H
#include <linux/kernel.h>
#include <linux/completion.h>
#include <linux/pci.h>
#include <linux/irq.h>
#include <linux/spinlock_types.h>
#include <linux/semaphore.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/xarray.h>
#include <linux/workqueue.h>
#include <linux/mempool.h>
#include <linux/interrupt.h>
#include <linux/idr.h>
#include <linux/notifier.h>
#include <linux/refcount.h>
net/mlx5: Register mlx5 devices to auxiliary virtual bus Create auxiliary devices under new virtual bus. This will replace the custom-made mlx5 ->add()/->remove() interfaces and next patches will fill the missing callback and remove the old interface logic. The attachment of auxiliary drivers to the devices is possible in 1-to-1 manner only and it requires us to create device for every protocol, so that device (module) will be able to connect to it. System with 2 IB and 1 RoCE cards: [leonro@vm ~]$ lspci |grep nox 00:09.0 Ethernet controller: Mellanox Technologies MT27800 Family [ConnectX-5] 00:0a.0 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6] 00:0b.0 Ethernet controller: Mellanox Technologies MT2910 Family [ConnectX-7] [leonro@vm ~]$ ls -l /sys/bus/auxiliary/devices/ mlx5_core.eth.2 -> ../../../devices/pci0000:00/0000:00:0b.0/mlx5_core.eth.2 mlx5_core.rdma.0 -> ../../../devices/pci0000:00/0000:00:09.0/mlx5_core.rdma.0 mlx5_core.rdma.1 -> ../../../devices/pci0000:00/0000:00:0a.0/mlx5_core.rdma.1 mlx5_core.rdma.2 -> ../../../devices/pci0000:00/0000:00:0b.0/mlx5_core.rdma.2 mlx5_core.vdpa.1 -> ../../../devices/pci0000:00/0000:00:0a.0/mlx5_core.vdpa.1 mlx5_core.vdpa.2 -> ../../../devices/pci0000:00/0000:00:0b.0/mlx5_core.vdpa.2 [leonro@vm ~]$ rdma dev 0: ibp0s9: node_type ca fw 4.6.9999 node_guid 5254:00c0:fe12:3455 sys_image_guid 5254:00c0:fe12:3455 1: ibp0s10: node_type ca fw 4.6.9999 node_guid 5254:00c0:fe12:3456 sys_image_guid 5254:00c0:fe12:3456 2: rdmap0s11: node_type ca fw 4.6.9999 node_guid 5254:00c0:fe12:3457 sys_image_guid 5254:00c0:fe12:3457 System with RoCE SR-IOV card with 4 VFs: [leonro@vm ~]$ lspci |grep nox 01:00.0 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6] 01:00.1 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6 Virtual Function] 01:00.2 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6 Virtual Function] 01:00.3 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6 Virtual Function] 01:00.4 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6 Virtual Function] [leonro@vm ~]$ ls -l /sys/bus/auxiliary/devices/ mlx5_core.eth.0 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.0/mlx5_core.eth.0 mlx5_core.eth.1 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.1/mlx5_core.eth.1 mlx5_core.eth.2 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.2/mlx5_core.eth.2 mlx5_core.eth.3 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.3/mlx5_core.eth.3 mlx5_core.eth.4 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.4/mlx5_core.eth.4 mlx5_core.rdma.0 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.0/mlx5_core.rdma.0 mlx5_core.rdma.1 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.1/mlx5_core.rdma.1 mlx5_core.rdma.2 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.2/mlx5_core.rdma.2 mlx5_core.rdma.3 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.3/mlx5_core.rdma.3 mlx5_core.rdma.4 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.4/mlx5_core.rdma.4 mlx5_core.vdpa.1 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.1/mlx5_core.vdpa.1 mlx5_core.vdpa.2 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.2/mlx5_core.vdpa.2 mlx5_core.vdpa.3 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.3/mlx5_core.vdpa.3 mlx5_core.vdpa.4 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.4/mlx5_core.vdpa.4 [leonro@vm ~]$ rdma dev 0: rocep1s0f0: node_type ca fw 4.6.9999 node_guid 5254:00c0:fe12:3455 sys_image_guid 5254:00c0:fe12:3455 1: rocep1s0f0v0: node_type ca fw 4.6.9999 node_guid 0000:0000:0000:0000 sys_image_guid 5254:00c0:fe12:3456 2: rocep1s0f0v1: node_type ca fw 4.6.9999 node_guid 0000:0000:0000:0000 sys_image_guid 5254:00c0:fe12:3457 3: rocep1s0f0v2: node_type ca fw 4.6.9999 node_guid 0000:0000:0000:0000 sys_image_guid 5254:00c0:fe12:3458 4: rocep1s0f0v3: node_type ca fw 4.6.9999 node_guid 0000:0000:0000:0000 sys_image_guid 5254:00c0:fe12:3459 Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
2020-10-08 13:06:37 +00:00
#include <linux/auxiliary_bus.h>
#include <linux/mlx5/device.h>
#include <linux/mlx5/doorbell.h>
#include <linux/mlx5/eq.h>
#include <linux/timecounter.h>
#include <linux/ptp_clock_kernel.h>
#include <net/devlink.h>
#define MLX5_ADEV_NAME "mlx5_core"
#define MLX5_IRQ_EQ_CTRL (U8_MAX)
enum {
MLX5_BOARD_ID_LEN = 64,
};
enum {
MLX5_CMD_WQ_MAX_NAME = 32,
};
enum {
CMD_OWNER_SW = 0x0,
CMD_OWNER_HW = 0x1,
CMD_STATUS_SUCCESS = 0,
};
enum mlx5_sqp_t {
MLX5_SQP_SMI = 0,
MLX5_SQP_GSI = 1,
MLX5_SQP_IEEE_1588 = 2,
MLX5_SQP_SNIFFER = 3,
MLX5_SQP_SYNC_UMR = 4,
};
enum {
MLX5_MAX_PORTS = 4,
};
enum {
MLX5_ATOMIC_MODE_OFFSET = 16,
MLX5_ATOMIC_MODE_IB_COMP = 1,
MLX5_ATOMIC_MODE_CX = 2,
MLX5_ATOMIC_MODE_8B = 3,
MLX5_ATOMIC_MODE_16B = 4,
MLX5_ATOMIC_MODE_32B = 5,
MLX5_ATOMIC_MODE_64B = 6,
MLX5_ATOMIC_MODE_128B = 7,
MLX5_ATOMIC_MODE_256B = 8,
};
enum {
MLX5_REG_QPTS = 0x4002,
MLX5_REG_QETCR = 0x4005,
MLX5_REG_QTCT = 0x400a,
MLX5_REG_QPDPM = 0x4013,
MLX5_REG_QCAM = 0x4019,
MLX5_REG_DCBX_PARAM = 0x4020,
MLX5_REG_DCBX_APP = 0x4021,
MLX5_REG_FPGA_CAP = 0x4022,
MLX5_REG_FPGA_CTRL = 0x4023,
MLX5_REG_FPGA_ACCESS_REG = 0x4024,
MLX5_REG_CORE_DUMP = 0x402e,
MLX5_REG_PCAP = 0x5001,
MLX5_REG_PMTU = 0x5003,
MLX5_REG_PTYS = 0x5004,
MLX5_REG_PAOS = 0x5006,
MLX5_REG_PFCC = 0x5007,
MLX5_REG_PPCNT = 0x5008,
MLX5_REG_PPTB = 0x500b,
MLX5_REG_PBMC = 0x500c,
MLX5_REG_PMAOS = 0x5012,
MLX5_REG_PUDE = 0x5009,
MLX5_REG_PMPE = 0x5010,
MLX5_REG_PELC = 0x500e,
MLX5_REG_PVLC = 0x500f,
MLX5_REG_PCMR = 0x5041,
MLX5_REG_PDDR = 0x5031,
MLX5_REG_PMLP = 0x5002,
MLX5_REG_PPLM = 0x5023,
MLX5_REG_PCAM = 0x507f,
MLX5_REG_NODE_DESC = 0x6001,
MLX5_REG_HOST_ENDIANNESS = 0x7004,
MLX5_REG_MCIA = 0x9014,
MLX5_REG_MFRL = 0x9028,
MLX5_REG_MLCR = 0x902b,
MLX5_REG_MRTC = 0x902d,
MLX5_REG_MTRC_CAP = 0x9040,
MLX5_REG_MTRC_CONF = 0x9041,
MLX5_REG_MTRC_STDB = 0x9042,
MLX5_REG_MTRC_CTRL = 0x9043,
MLX5_REG_MPEIN = 0x9050,
MLX5_REG_MPCNT = 0x9051,
MLX5_REG_MTPPS = 0x9053,
MLX5_REG_MTPPSE = 0x9054,
MLX5_REG_MTUTC = 0x9055,
MLX5_REG_MPEGC = 0x9056,
MLX5_REG_MCQS = 0x9060,
MLX5_REG_MCQI = 0x9061,
MLX5_REG_MCC = 0x9062,
MLX5_REG_MCDA = 0x9063,
MLX5_REG_MCAM = 0x907f,
MLX5_REG_MIRC = 0x9162,
MLX5_REG_SBCAM = 0xB01F,
MLX5_REG_RESOURCE_DUMP = 0xC000,
MLX5_REG_DTOR = 0xC00E,
};
enum mlx5_qpts_trust_state {
MLX5_QPTS_TRUST_PCP = 1,
MLX5_QPTS_TRUST_DSCP = 2,
};
enum mlx5_dcbx_oper_mode {
MLX5E_DCBX_PARAM_VER_OPER_HOST = 0x0,
MLX5E_DCBX_PARAM_VER_OPER_AUTO = 0x3,
};
enum {
MLX5_ATOMIC_OPS_CMP_SWAP = 1 << 0,
MLX5_ATOMIC_OPS_FETCH_ADD = 1 << 1,
MLX5_ATOMIC_OPS_EXTENDED_CMP_SWAP = 1 << 2,
MLX5_ATOMIC_OPS_EXTENDED_FETCH_ADD = 1 << 3,
};
enum mlx5_page_fault_resume_flags {
MLX5_PAGE_FAULT_RESUME_REQUESTOR = 1 << 0,
MLX5_PAGE_FAULT_RESUME_WRITE = 1 << 1,
MLX5_PAGE_FAULT_RESUME_RDMA = 1 << 2,
MLX5_PAGE_FAULT_RESUME_ERROR = 1 << 7,
};
enum dbg_rsc_type {
MLX5_DBG_RSC_QP,
MLX5_DBG_RSC_EQ,
MLX5_DBG_RSC_CQ,
};
enum port_state_policy {
MLX5_POLICY_DOWN = 0,
MLX5_POLICY_UP = 1,
MLX5_POLICY_FOLLOW = 2,
MLX5_POLICY_INVALID = 0xffffffff
};
enum mlx5_coredev_type {
MLX5_COREDEV_PF,
MLX5_COREDEV_VF,
MLX5_COREDEV_SF,
};
struct mlx5_field_desc {
int i;
};
struct mlx5_rsc_debug {
struct mlx5_core_dev *dev;
void *object;
enum dbg_rsc_type type;
struct dentry *root;
net/mlx5: Replace zero-length array with flexible-array The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1]. There are some instances of code in which the sizeof operator is being incorrectly/erroneously applied to zero-length arrays and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org> Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
2020-05-07 18:59:35 +00:00
struct mlx5_field_desc fields[];
};
enum mlx5_dev_event {
MLX5_DEV_EVENT_SYS_ERROR = 128, /* 0 - 127 are FW events */
MLX5_DEV_EVENT_PORT_AFFINITY = 129,
};
enum mlx5_port_status {
MLX5_PORT_UP = 1,
MLX5_PORT_DOWN = 2,
};
enum mlx5_cmdif_state {
MLX5_CMDIF_STATE_UNINITIALIZED,
MLX5_CMDIF_STATE_UP,
MLX5_CMDIF_STATE_DOWN,
};
struct mlx5_cmd_first {
__be32 data[4];
};
struct mlx5_cmd_msg {
struct list_head list;
struct cmd_msg_cache *parent;
u32 len;
struct mlx5_cmd_first first;
struct mlx5_cmd_mailbox *next;
};
struct mlx5_cmd_debug {
struct dentry *dbg_root;
void *in_msg;
void *out_msg;
u8 status;
u16 inlen;
u16 outlen;
};
struct cmd_msg_cache {
/* protect block chain allocations
*/
spinlock_t lock;
struct list_head head;
unsigned int max_inbox_size;
unsigned int num_ent;
};
enum {
MLX5_NUM_COMMAND_CACHES = 5,
};
struct mlx5_cmd_stats {
u64 sum;
u64 n;
/* number of times command failed */
u64 failed;
/* number of times command failed on bad status returned by FW */
u64 failed_mbox_status;
/* last command failed returned errno */
u32 last_failed_errno;
/* last bad status returned by FW */
u8 last_failed_mbox_status;
/* last command failed syndrome returned by FW */
u32 last_failed_syndrome;
struct dentry *root;
/* protect command average calculations */
spinlock_t lock;
};
struct mlx5_cmd {
struct mlx5_nb nb;
enum mlx5_cmdif_state state;
void *cmd_alloc_buf;
dma_addr_t alloc_dma;
int alloc_size;
void *cmd_buf;
dma_addr_t dma;
u16 cmdif_rev;
u8 log_sz;
u8 log_stride;
int max_reg_cmds;
int events;
u32 __iomem *vector;
/* protect command queue allocations
*/
spinlock_t alloc_lock;
/* protect token allocations
*/
spinlock_t token_lock;
u8 token;
unsigned long bitmask;
char wq_name[MLX5_CMD_WQ_MAX_NAME];
struct workqueue_struct *wq;
struct semaphore sem;
struct semaphore pages_sem;
int mode;
u16 allowed_opcode;
struct mlx5_cmd_work_ent *ent_arr[MLX5_MAX_COMMANDS];
struct dma_pool *pool;
struct mlx5_cmd_debug dbg;
struct cmd_msg_cache cache[MLX5_NUM_COMMAND_CACHES];
int checksum_disabled;
struct mlx5_cmd_stats *stats;
};
struct mlx5_cmd_mailbox {
void *buf;
dma_addr_t dma;
struct mlx5_cmd_mailbox *next;
};
struct mlx5_buf_list {
void *buf;
dma_addr_t map;
};
struct mlx5_frag_buf {
struct mlx5_buf_list *frags;
int npages;
int size;
u8 page_shift;
};
struct mlx5_frag_buf_ctrl {
struct mlx5_buf_list *frags;
u32 sz_m1;
u16 frag_sz_m1;
u16 strides_offset;
u8 log_sz;
u8 log_stride;
u8 log_frag_strides;
};
struct mlx5_core_psv {
u32 psv_idx;
struct psv_layout {
u32 pd;
u16 syndrome;
u16 reserved;
u16 bg;
u16 app_tag;
u32 ref_tag;
} psv;
};
struct mlx5_core_sig_ctx {
struct mlx5_core_psv psv_memory;
struct mlx5_core_psv psv_wire;
struct ib_sig_err err_item;
bool sig_status_checked;
bool sig_err_exists;
u32 sigerr_count;
};
#define MLX5_24BIT_MASK ((1 << 24) - 1)
enum mlx5_res_type {
MLX5_RES_QP = MLX5_EVENT_QUEUE_TYPE_QP,
MLX5_RES_RQ = MLX5_EVENT_QUEUE_TYPE_RQ,
MLX5_RES_SQ = MLX5_EVENT_QUEUE_TYPE_SQ,
MLX5_RES_SRQ = 3,
MLX5_RES_XSRQ = 4,
MLX5_RES_XRQ = 5,
MLX5_RES_DCT = MLX5_EVENT_QUEUE_TYPE_DCT,
};
struct mlx5_core_rsc_common {
enum mlx5_res_type res;
refcount_t refcount;
struct completion free;
};
struct mlx5_uars_page {
void __iomem *map;
bool wc;
u32 index;
struct list_head list;
unsigned int bfregs;
unsigned long *reg_bitmap; /* for non fast path bf regs */
unsigned long *fp_bitmap;
unsigned int reg_avail;
unsigned int fp_avail;
struct kref ref_count;
struct mlx5_core_dev *mdev;
};
struct mlx5_bfreg_head {
/* protect blue flame registers allocations */
struct mutex lock;
struct list_head list;
};
struct mlx5_bfreg_data {
struct mlx5_bfreg_head reg_head;
struct mlx5_bfreg_head wc_head;
};
struct mlx5_sq_bfreg {
void __iomem *map;
struct mlx5_uars_page *up;
bool wc;
u32 index;
unsigned int offset;
};
struct mlx5_core_health {
struct health_buffer __iomem *health;
__be32 __iomem *health_counter;
struct timer_list timer;
u32 prev;
int miss_counter;
u8 synd;
u32 fatal_error;
u32 crdump_size;
/* wq spinlock to synchronize draining */
spinlock_t wq_lock;
struct workqueue_struct *wq;
unsigned long flags;
struct work_struct fatal_report_work;
struct work_struct report_work;
struct devlink_health_reporter *fw_reporter;
struct devlink_health_reporter *fw_fatal_reporter;
struct delayed_work update_fw_log_ts_work;
};
struct mlx5_qp_table {
struct notifier_block nb;
/* protect radix tree
*/
spinlock_t lock;
struct radix_tree_root tree;
};
enum {
MLX5_PF_NOTIFY_DISABLE_VF,
MLX5_PF_NOTIFY_ENABLE_VF,
};
struct mlx5_vf_context {
int enabled;
u64 port_guid;
u64 node_guid;
IB/mlx5: Return the administrative GUID if exists A user can change the operational GUID (a.k.a affective GUID) through link/infiniband. Therefore it is preferred to return the currently set GUID if it exists instead of the operational. This way the PF can query which VF GUID will be set in the next bind. In order to align with MAC address, zero is returned if administrative GUID is not set. For example, before setting administrative GUID: $ ip link show ib0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 4092 qdisc mq state UP mode DEFAULT group default qlen 256 link/infiniband 00:00:00:08:fe:80:00:00:00:00:00:00:52:54:00:c0:fe:12:34:55 brd 00:ff:ff:ff:ff:12:40:1b:ff:ff:00:00:00:00:00:00:ff:ff:ff:ff vf 0 link/infiniband 00:00:00:08:fe:80:00:00:00:00:00:00:52:54:00:c0:fe:12:34:55 brd 00:ff:ff:ff:ff:12:40:1b:ff:ff:00:00:00:00:00:00:ff:ff:ff:ff, spoof checking off, NODE_GUID 00:00:00:00:00:00:00:00, PORT_GUID 00:00:00:00:00:00:00:00, link-state auto, trust off, query_rss off Then: $ ip link set ib0 vf 0 node_guid 11:00:af:21:cb:05:11:00 $ ip link set ib0 vf 0 port_guid 22:11:af:21:cb:05:11:00 After setting administrative GUID: $ ip link show ib0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 4092 qdisc mq state UP mode DEFAULT group default qlen 256 link/infiniband 00:00:00:08:fe:80:00:00:00:00:00:00:52:54:00:c0:fe:12:34:55 brd 00:ff:ff:ff:ff:12:40:1b:ff:ff:00:00:00:00:00:00:ff:ff:ff:ff vf 0 link/infiniband 00:00:00:08:fe:80:00:00:00:00:00:00:52:54:00:c0:fe:12:34:55 brd 00:ff:ff:ff:ff:12:40:1b:ff:ff:00:00:00:00:00:00:ff:ff:ff:ff, spoof checking off, NODE_GUID 11:00:af:21:cb:05:11:00, PORT_GUID 22:11:af:21:cb:05:11:00, link-state auto, trust off, query_rss off Fixes: 9c0015ef0928 ("IB/mlx5: Implement callbacks for getting VFs GUID attributes") Link: https://lore.kernel.org/r/20200116120048.12744-1-leon@kernel.org Signed-off-by: Danit Goldberg <danitg@mellanox.com> Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2020-01-16 12:00:48 +00:00
/* Valid bits are used to validate administrative guid only.
* Enabled after ndo_set_vf_guid
*/
u8 port_guid_valid:1;
u8 node_guid_valid:1;
enum port_state_policy policy;
struct blocking_notifier_head notifier;
};
struct mlx5_core_sriov {
struct mlx5_vf_context *vfs_ctx;
int num_vfs;
u16 max_vfs;
};
net/mlx5: Add flow counter pool Add a pool of flow counters, based on flow counter bulks, removing the need to allocate a new counter via a costly FW command during the flow creation process. The time it takes to acquire/release a flow counter is cut from ~50 [us] to ~50 [ns]. The pool is part of the mlx5 driver instance, and provides flow counters for aging flows. mlx5_fc_create() was modified to provide counters for aging flows from the pool by default, and mlx5_destroy_fc() was modified to release counters back to the pool for later reuse. If bulk allocation is not supported or fails, and for non-aging flows, the fallback behavior is to allocate and free individual counters. The pool is comprised of three lists of flow counter bulks, one of fully used bulks, one of partially used bulks, and one of unused bulks. Counters are provided from the partially used bulks first, to help limit bulk fragmentation. The pool maintains a threshold, and strives to maintain the amount of available counters below it. The pool is increased in size when a counter acquisition request is made and there are no available counters, and it is decreased in size when the last counter in a bulk is released and there are more available counters than the threshold. All pool size changes are done in the context of the acquiring/releasing process. The value of the threshold is directly correlated to the amount of used counters the pool is providing, while constrained by a hard maximum, and is recalculated every time a bulk is allocated/freed. This ensures that the pool only consumes large amounts of memory for available counters if the pool is being used heavily. When fully populated and at the hard maximum, the buffer of available counters consumes ~40 [MB]. Signed-off-by: Gavi Teitz <gavi@mellanox.com> Reviewed-by: Vlad Buslov <vladbu@mellanox.com> Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
2019-06-27 17:53:03 +00:00
struct mlx5_fc_pool {
struct mlx5_core_dev *dev;
struct mutex pool_lock; /* protects pool lists */
struct list_head fully_used;
struct list_head partially_used;
struct list_head unused;
int available_fcs;
int used_fcs;
int threshold;
};
struct mlx5_fc_stats {
spinlock_t counters_idr_lock; /* protects counters_idr */
struct idr counters_idr;
struct list_head counters;
struct llist_head addlist;
struct llist_head dellist;
struct workqueue_struct *wq;
struct delayed_work work;
unsigned long next_query;
unsigned long sampling_interval; /* jiffies */
u32 *bulk_query_out;
int bulk_query_len;
size_t num_counters;
bool bulk_query_alloc_failed;
unsigned long next_bulk_query_alloc;
net/mlx5: Add flow counter pool Add a pool of flow counters, based on flow counter bulks, removing the need to allocate a new counter via a costly FW command during the flow creation process. The time it takes to acquire/release a flow counter is cut from ~50 [us] to ~50 [ns]. The pool is part of the mlx5 driver instance, and provides flow counters for aging flows. mlx5_fc_create() was modified to provide counters for aging flows from the pool by default, and mlx5_destroy_fc() was modified to release counters back to the pool for later reuse. If bulk allocation is not supported or fails, and for non-aging flows, the fallback behavior is to allocate and free individual counters. The pool is comprised of three lists of flow counter bulks, one of fully used bulks, one of partially used bulks, and one of unused bulks. Counters are provided from the partially used bulks first, to help limit bulk fragmentation. The pool maintains a threshold, and strives to maintain the amount of available counters below it. The pool is increased in size when a counter acquisition request is made and there are no available counters, and it is decreased in size when the last counter in a bulk is released and there are more available counters than the threshold. All pool size changes are done in the context of the acquiring/releasing process. The value of the threshold is directly correlated to the amount of used counters the pool is providing, while constrained by a hard maximum, and is recalculated every time a bulk is allocated/freed. This ensures that the pool only consumes large amounts of memory for available counters if the pool is being used heavily. When fully populated and at the hard maximum, the buffer of available counters consumes ~40 [MB]. Signed-off-by: Gavi Teitz <gavi@mellanox.com> Reviewed-by: Vlad Buslov <vladbu@mellanox.com> Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
2019-06-27 17:53:03 +00:00
struct mlx5_fc_pool fc_pool;
};
struct mlx5_events;
struct mlx5_mpfs;
struct mlx5_eswitch;
struct mlx5_lag;
struct mlx5_devcom;
struct mlx5_fw_reset;
struct mlx5_eq_table;
struct mlx5_irq_table;
struct mlx5_vhca_state_notifier;
net/mlx5: SF, Add auxiliary device support Introduce API to add and delete an auxiliary device for an SF. Each SF has its own dedicated window in the PCI BAR 2. SF device is similar to PCI PF and VF that supports multiple class of devices such as net, rdma and vdpa. SF device will be added or removed in subsequent patch during SF devlink port function state change command. A subfunction device exposes user supplied subfunction number which will be further used by systemd/udev to have deterministic name for its netdevice and rdma device. An mlx5 subfunction auxiliary device example: $ devlink dev eswitch set pci/0000:06:00.0 mode switchdev $ devlink port show pci/0000:06:00.0/65535: type eth netdev ens2f0np0 flavour physical port 0 splittable false $ devlink port add pci/0000:06:00.0 flavour pcisf pfnum 0 sfnum 88 pci/0000:08:00.0/32768: type eth netdev eth6 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached $ devlink port show ens2f0npf0sf88 pci/0000:06:00.0/32768: type eth netdev ens2f0npf0sf88 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:88:88 state inactive opstate detached $ devlink port function set ens2f0npf0sf88 hw_addr 00:00:00:00:88:88 state active On activation, $ ls -l /sys/bus/auxiliary/devices/ mlx5_core.sf.4 -> ../../../devices/pci0000:00/0000:00:03.0/0000:06:00.0/mlx5_core.sf.4 $ cat /sys/bus/auxiliary/devices/mlx5_core.sf.4/sfnum 88 Signed-off-by: Parav Pandit <parav@nvidia.com> Reviewed-by: Vu Pham <vuhuong@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
2020-12-12 06:12:17 +00:00
struct mlx5_sf_dev_table;
net/mlx5: SF, Add port add delete functionality To handle SF port management outside of the eswitch as independent software layer, introduce eswitch notifier APIs so that mlx5 upper layer who wish to support sf port management in switchdev mode can perform its task whenever eswitch mode is set to switchdev or before eswitch is disabled. Initialize sf port table on such eswitch event. Add SF port add and delete functionality in switchdev mode. Destroy all SF ports when eswitch is disabled. Expose SF port add and delete to user via devlink commands. $ devlink dev eswitch set pci/0000:06:00.0 mode switchdev $ devlink port show pci/0000:06:00.0/65535: type eth netdev ens2f0np0 flavour physical port 0 splittable false $ devlink port add pci/0000:06:00.0 flavour pcisf pfnum 0 sfnum 88 pci/0000:06:00.0/32768: type eth netdev eth6 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached $ devlink port show ens2f0npf0sf88 pci/0000:06:00.0/32768: type eth netdev ens2f0npf0sf88 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached or by its unique port index: $ devlink port show pci/0000:06:00.0/32768 pci/0000:06:00.0/32768: type eth netdev ens2f0npf0sf88 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached $ devlink port show ens2f0npf0sf88 -jp { "port": { "pci/0000:06:00.0/32768": { "type": "eth", "netdev": "ens2f0npf0sf88", "flavour": "pcisf", "controller": 0, "pfnum": 0, "sfnum": 88, "external": false, "splittable": false, "function": { "hw_addr": "00:00:00:00:00:00", "state": "inactive", "opstate": "detached" } } } } Signed-off-by: Parav Pandit <parav@nvidia.com> Reviewed-by: Vu Pham <vuhuong@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
2020-12-12 06:12:21 +00:00
struct mlx5_sf_hw_table;
struct mlx5_sf_table;
struct mlx5_rate_limit {
u32 rate;
u32 max_burst_sz;
u16 typical_pkt_sz;
};
struct mlx5_rl_entry {
u8 rl_raw[MLX5_ST_SZ_BYTES(set_pp_rate_limit_context)];
u64 refcount;
u16 index;
u16 uid;
u8 dedicated : 1;
};
struct mlx5_rl_table {
/* protect rate limit table */
struct mutex rl_lock;
u16 max_size;
u32 max_rate;
u32 min_rate;
struct mlx5_rl_entry *rl_entry;
u64 refcount;
};
struct mlx5_core_roce {
struct mlx5_flow_table *ft;
struct mlx5_flow_group *fg;
struct mlx5_flow_handle *allow_rule;
};
net/mlx5: Register mlx5 devices to auxiliary virtual bus Create auxiliary devices under new virtual bus. This will replace the custom-made mlx5 ->add()/->remove() interfaces and next patches will fill the missing callback and remove the old interface logic. The attachment of auxiliary drivers to the devices is possible in 1-to-1 manner only and it requires us to create device for every protocol, so that device (module) will be able to connect to it. System with 2 IB and 1 RoCE cards: [leonro@vm ~]$ lspci |grep nox 00:09.0 Ethernet controller: Mellanox Technologies MT27800 Family [ConnectX-5] 00:0a.0 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6] 00:0b.0 Ethernet controller: Mellanox Technologies MT2910 Family [ConnectX-7] [leonro@vm ~]$ ls -l /sys/bus/auxiliary/devices/ mlx5_core.eth.2 -> ../../../devices/pci0000:00/0000:00:0b.0/mlx5_core.eth.2 mlx5_core.rdma.0 -> ../../../devices/pci0000:00/0000:00:09.0/mlx5_core.rdma.0 mlx5_core.rdma.1 -> ../../../devices/pci0000:00/0000:00:0a.0/mlx5_core.rdma.1 mlx5_core.rdma.2 -> ../../../devices/pci0000:00/0000:00:0b.0/mlx5_core.rdma.2 mlx5_core.vdpa.1 -> ../../../devices/pci0000:00/0000:00:0a.0/mlx5_core.vdpa.1 mlx5_core.vdpa.2 -> ../../../devices/pci0000:00/0000:00:0b.0/mlx5_core.vdpa.2 [leonro@vm ~]$ rdma dev 0: ibp0s9: node_type ca fw 4.6.9999 node_guid 5254:00c0:fe12:3455 sys_image_guid 5254:00c0:fe12:3455 1: ibp0s10: node_type ca fw 4.6.9999 node_guid 5254:00c0:fe12:3456 sys_image_guid 5254:00c0:fe12:3456 2: rdmap0s11: node_type ca fw 4.6.9999 node_guid 5254:00c0:fe12:3457 sys_image_guid 5254:00c0:fe12:3457 System with RoCE SR-IOV card with 4 VFs: [leonro@vm ~]$ lspci |grep nox 01:00.0 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6] 01:00.1 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6 Virtual Function] 01:00.2 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6 Virtual Function] 01:00.3 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6 Virtual Function] 01:00.4 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6 Virtual Function] [leonro@vm ~]$ ls -l /sys/bus/auxiliary/devices/ mlx5_core.eth.0 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.0/mlx5_core.eth.0 mlx5_core.eth.1 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.1/mlx5_core.eth.1 mlx5_core.eth.2 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.2/mlx5_core.eth.2 mlx5_core.eth.3 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.3/mlx5_core.eth.3 mlx5_core.eth.4 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.4/mlx5_core.eth.4 mlx5_core.rdma.0 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.0/mlx5_core.rdma.0 mlx5_core.rdma.1 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.1/mlx5_core.rdma.1 mlx5_core.rdma.2 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.2/mlx5_core.rdma.2 mlx5_core.rdma.3 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.3/mlx5_core.rdma.3 mlx5_core.rdma.4 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.4/mlx5_core.rdma.4 mlx5_core.vdpa.1 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.1/mlx5_core.vdpa.1 mlx5_core.vdpa.2 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.2/mlx5_core.vdpa.2 mlx5_core.vdpa.3 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.3/mlx5_core.vdpa.3 mlx5_core.vdpa.4 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.4/mlx5_core.vdpa.4 [leonro@vm ~]$ rdma dev 0: rocep1s0f0: node_type ca fw 4.6.9999 node_guid 5254:00c0:fe12:3455 sys_image_guid 5254:00c0:fe12:3455 1: rocep1s0f0v0: node_type ca fw 4.6.9999 node_guid 0000:0000:0000:0000 sys_image_guid 5254:00c0:fe12:3456 2: rocep1s0f0v1: node_type ca fw 4.6.9999 node_guid 0000:0000:0000:0000 sys_image_guid 5254:00c0:fe12:3457 3: rocep1s0f0v2: node_type ca fw 4.6.9999 node_guid 0000:0000:0000:0000 sys_image_guid 5254:00c0:fe12:3458 4: rocep1s0f0v3: node_type ca fw 4.6.9999 node_guid 0000:0000:0000:0000 sys_image_guid 5254:00c0:fe12:3459 Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
2020-10-08 13:06:37 +00:00
enum {
MLX5_PRIV_FLAGS_DISABLE_IB_ADEV = 1 << 0,
MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV = 1 << 1,
/* Set during device detach to block any further devices
* creation/deletion on drivers rescan. Unset during device attach.
*/
MLX5_PRIV_FLAGS_DETACH = 1 << 2,
/* Distinguish between mlx5e_probe/remove called by module init/cleanup
* and called by other flows which can already hold devlink lock
*/
MLX5_PRIV_FLAGS_MLX5E_LOCKED_FLOW = 1 << 3,
net/mlx5: Register mlx5 devices to auxiliary virtual bus Create auxiliary devices under new virtual bus. This will replace the custom-made mlx5 ->add()/->remove() interfaces and next patches will fill the missing callback and remove the old interface logic. The attachment of auxiliary drivers to the devices is possible in 1-to-1 manner only and it requires us to create device for every protocol, so that device (module) will be able to connect to it. System with 2 IB and 1 RoCE cards: [leonro@vm ~]$ lspci |grep nox 00:09.0 Ethernet controller: Mellanox Technologies MT27800 Family [ConnectX-5] 00:0a.0 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6] 00:0b.0 Ethernet controller: Mellanox Technologies MT2910 Family [ConnectX-7] [leonro@vm ~]$ ls -l /sys/bus/auxiliary/devices/ mlx5_core.eth.2 -> ../../../devices/pci0000:00/0000:00:0b.0/mlx5_core.eth.2 mlx5_core.rdma.0 -> ../../../devices/pci0000:00/0000:00:09.0/mlx5_core.rdma.0 mlx5_core.rdma.1 -> ../../../devices/pci0000:00/0000:00:0a.0/mlx5_core.rdma.1 mlx5_core.rdma.2 -> ../../../devices/pci0000:00/0000:00:0b.0/mlx5_core.rdma.2 mlx5_core.vdpa.1 -> ../../../devices/pci0000:00/0000:00:0a.0/mlx5_core.vdpa.1 mlx5_core.vdpa.2 -> ../../../devices/pci0000:00/0000:00:0b.0/mlx5_core.vdpa.2 [leonro@vm ~]$ rdma dev 0: ibp0s9: node_type ca fw 4.6.9999 node_guid 5254:00c0:fe12:3455 sys_image_guid 5254:00c0:fe12:3455 1: ibp0s10: node_type ca fw 4.6.9999 node_guid 5254:00c0:fe12:3456 sys_image_guid 5254:00c0:fe12:3456 2: rdmap0s11: node_type ca fw 4.6.9999 node_guid 5254:00c0:fe12:3457 sys_image_guid 5254:00c0:fe12:3457 System with RoCE SR-IOV card with 4 VFs: [leonro@vm ~]$ lspci |grep nox 01:00.0 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6] 01:00.1 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6 Virtual Function] 01:00.2 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6 Virtual Function] 01:00.3 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6 Virtual Function] 01:00.4 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6 Virtual Function] [leonro@vm ~]$ ls -l /sys/bus/auxiliary/devices/ mlx5_core.eth.0 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.0/mlx5_core.eth.0 mlx5_core.eth.1 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.1/mlx5_core.eth.1 mlx5_core.eth.2 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.2/mlx5_core.eth.2 mlx5_core.eth.3 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.3/mlx5_core.eth.3 mlx5_core.eth.4 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.4/mlx5_core.eth.4 mlx5_core.rdma.0 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.0/mlx5_core.rdma.0 mlx5_core.rdma.1 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.1/mlx5_core.rdma.1 mlx5_core.rdma.2 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.2/mlx5_core.rdma.2 mlx5_core.rdma.3 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.3/mlx5_core.rdma.3 mlx5_core.rdma.4 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.4/mlx5_core.rdma.4 mlx5_core.vdpa.1 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.1/mlx5_core.vdpa.1 mlx5_core.vdpa.2 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.2/mlx5_core.vdpa.2 mlx5_core.vdpa.3 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.3/mlx5_core.vdpa.3 mlx5_core.vdpa.4 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.4/mlx5_core.vdpa.4 [leonro@vm ~]$ rdma dev 0: rocep1s0f0: node_type ca fw 4.6.9999 node_guid 5254:00c0:fe12:3455 sys_image_guid 5254:00c0:fe12:3455 1: rocep1s0f0v0: node_type ca fw 4.6.9999 node_guid 0000:0000:0000:0000 sys_image_guid 5254:00c0:fe12:3456 2: rocep1s0f0v1: node_type ca fw 4.6.9999 node_guid 0000:0000:0000:0000 sys_image_guid 5254:00c0:fe12:3457 3: rocep1s0f0v2: node_type ca fw 4.6.9999 node_guid 0000:0000:0000:0000 sys_image_guid 5254:00c0:fe12:3458 4: rocep1s0f0v3: node_type ca fw 4.6.9999 node_guid 0000:0000:0000:0000 sys_image_guid 5254:00c0:fe12:3459 Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
2020-10-08 13:06:37 +00:00
};
struct mlx5_adev {
struct auxiliary_device adev;
struct mlx5_core_dev *mdev;
int idx;
};
struct mlx5_debugfs_entries {
struct dentry *dbg_root;
struct dentry *qp_debugfs;
struct dentry *eq_debugfs;
struct dentry *cq_debugfs;
struct dentry *cmdif_debugfs;
struct dentry *pages_debugfs;
net/mlx5: Lag, add debugfs to query hardware lag state Lag state has become very complicated with many modes, flags, types and port selections methods and future work will add additional features. Add a debugfs to query the current lag state. A new directory named "lag" will be created under the mlx5 debugfs directory. As the driver has debugfs per pci function the location will be: <debugfs>/mlx5/<BDF>/lag For example: /sys/kernel/debug/mlx5/0000:08:00.0/lag The following files are exposed: - state: Returns "active" or "disabled". If "active" it means hardware lag is active. - members: Returns the BDFs of all the members of lag object. - type: Returns the type of the lag currently configured. Valid only if hardware lag is active. * "roce" - Members are bare metal PFs. * "switchdev" - Members are in switchdev mode. * "multipath" - ECMP offloads. - port_sel_mode: Returns the egress port selection method, valid only if hardware lag is active. * "queue_affinity" - Egress port is selected by the QP/SQ affinity. * "hash" - Egress port is selected by hash done on each packet. Controlled by: xmit_hash_policy of the bond device. - flags: Returns flags that are specific per lag @type. Valid only if hardware lag is active. * "shared_fdb" - "on" or "off", if "on" single FDB is used. - mapping: Returns the mapping which is used to select egress port. Valid only if hardware lag is active. If @port_sel_mode is "hash" returns the active egress ports. The hash result will select only active ports. if @port_sel_mode is "queue_affinity" returns the mapping between the configured port affinity of the QP/SQ and actual egress port. For example: * 1:1 - Mapping means if the configured affinity is port 1 traffic will egress via port 1. * 1:2 - Mapping means if the configured affinity is port 1 traffic will egress via port 2. This can happen if port 1 is down or in active/backup mode and port 1 is backup. Signed-off-by: Mark Bloch <mbloch@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
2022-03-15 16:56:50 +00:00
struct dentry *lag_debugfs;
};
struct mlx5_ft_pool;
struct mlx5_priv {
/* IRQ table valid only for real pci devices PF or VF */
struct mlx5_irq_table *irq_table;
struct mlx5_eq_table *eq_table;
/* pages stuff */
struct mlx5_nb pg_nb;
struct workqueue_struct *pg_wq;
struct xarray page_root_xa;
u32 fw_pages;
atomic_t reg_pages;
struct list_head free_list;
u32 vfs_pages;
u32 host_pf_pages;
u32 fw_pages_alloc_failed;
u32 give_pages_dropped;
u32 reclaim_pages_discard;
struct mlx5_core_health health;
struct list_head traps;
struct mlx5_debugfs_entries dbg;
/* start: alloc staff */
/* protect buffer allocation according to numa node */
struct mutex alloc_mutex;
int numa_node;
struct mutex pgdir_mutex;
struct list_head pgdir_list;
/* end: alloc staff */
struct list_head ctx_list;
spinlock_t ctx_lock;
net/mlx5: Register mlx5 devices to auxiliary virtual bus Create auxiliary devices under new virtual bus. This will replace the custom-made mlx5 ->add()/->remove() interfaces and next patches will fill the missing callback and remove the old interface logic. The attachment of auxiliary drivers to the devices is possible in 1-to-1 manner only and it requires us to create device for every protocol, so that device (module) will be able to connect to it. System with 2 IB and 1 RoCE cards: [leonro@vm ~]$ lspci |grep nox 00:09.0 Ethernet controller: Mellanox Technologies MT27800 Family [ConnectX-5] 00:0a.0 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6] 00:0b.0 Ethernet controller: Mellanox Technologies MT2910 Family [ConnectX-7] [leonro@vm ~]$ ls -l /sys/bus/auxiliary/devices/ mlx5_core.eth.2 -> ../../../devices/pci0000:00/0000:00:0b.0/mlx5_core.eth.2 mlx5_core.rdma.0 -> ../../../devices/pci0000:00/0000:00:09.0/mlx5_core.rdma.0 mlx5_core.rdma.1 -> ../../../devices/pci0000:00/0000:00:0a.0/mlx5_core.rdma.1 mlx5_core.rdma.2 -> ../../../devices/pci0000:00/0000:00:0b.0/mlx5_core.rdma.2 mlx5_core.vdpa.1 -> ../../../devices/pci0000:00/0000:00:0a.0/mlx5_core.vdpa.1 mlx5_core.vdpa.2 -> ../../../devices/pci0000:00/0000:00:0b.0/mlx5_core.vdpa.2 [leonro@vm ~]$ rdma dev 0: ibp0s9: node_type ca fw 4.6.9999 node_guid 5254:00c0:fe12:3455 sys_image_guid 5254:00c0:fe12:3455 1: ibp0s10: node_type ca fw 4.6.9999 node_guid 5254:00c0:fe12:3456 sys_image_guid 5254:00c0:fe12:3456 2: rdmap0s11: node_type ca fw 4.6.9999 node_guid 5254:00c0:fe12:3457 sys_image_guid 5254:00c0:fe12:3457 System with RoCE SR-IOV card with 4 VFs: [leonro@vm ~]$ lspci |grep nox 01:00.0 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6] 01:00.1 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6 Virtual Function] 01:00.2 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6 Virtual Function] 01:00.3 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6 Virtual Function] 01:00.4 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6 Virtual Function] [leonro@vm ~]$ ls -l /sys/bus/auxiliary/devices/ mlx5_core.eth.0 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.0/mlx5_core.eth.0 mlx5_core.eth.1 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.1/mlx5_core.eth.1 mlx5_core.eth.2 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.2/mlx5_core.eth.2 mlx5_core.eth.3 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.3/mlx5_core.eth.3 mlx5_core.eth.4 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.4/mlx5_core.eth.4 mlx5_core.rdma.0 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.0/mlx5_core.rdma.0 mlx5_core.rdma.1 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.1/mlx5_core.rdma.1 mlx5_core.rdma.2 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.2/mlx5_core.rdma.2 mlx5_core.rdma.3 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.3/mlx5_core.rdma.3 mlx5_core.rdma.4 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.4/mlx5_core.rdma.4 mlx5_core.vdpa.1 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.1/mlx5_core.vdpa.1 mlx5_core.vdpa.2 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.2/mlx5_core.vdpa.2 mlx5_core.vdpa.3 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.3/mlx5_core.vdpa.3 mlx5_core.vdpa.4 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.4/mlx5_core.vdpa.4 [leonro@vm ~]$ rdma dev 0: rocep1s0f0: node_type ca fw 4.6.9999 node_guid 5254:00c0:fe12:3455 sys_image_guid 5254:00c0:fe12:3455 1: rocep1s0f0v0: node_type ca fw 4.6.9999 node_guid 0000:0000:0000:0000 sys_image_guid 5254:00c0:fe12:3456 2: rocep1s0f0v1: node_type ca fw 4.6.9999 node_guid 0000:0000:0000:0000 sys_image_guid 5254:00c0:fe12:3457 3: rocep1s0f0v2: node_type ca fw 4.6.9999 node_guid 0000:0000:0000:0000 sys_image_guid 5254:00c0:fe12:3458 4: rocep1s0f0v3: node_type ca fw 4.6.9999 node_guid 0000:0000:0000:0000 sys_image_guid 5254:00c0:fe12:3459 Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
2020-10-08 13:06:37 +00:00
struct mlx5_adev **adev;
int adev_idx;
int sw_vhca_id;
struct mlx5_events *events;
struct mlx5_flow_steering *steering;
struct mlx5_mpfs *mpfs;
struct mlx5_eswitch *eswitch;
struct mlx5_core_sriov sriov;
struct mlx5_lag *lag;
net/mlx5: Register mlx5 devices to auxiliary virtual bus Create auxiliary devices under new virtual bus. This will replace the custom-made mlx5 ->add()/->remove() interfaces and next patches will fill the missing callback and remove the old interface logic. The attachment of auxiliary drivers to the devices is possible in 1-to-1 manner only and it requires us to create device for every protocol, so that device (module) will be able to connect to it. System with 2 IB and 1 RoCE cards: [leonro@vm ~]$ lspci |grep nox 00:09.0 Ethernet controller: Mellanox Technologies MT27800 Family [ConnectX-5] 00:0a.0 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6] 00:0b.0 Ethernet controller: Mellanox Technologies MT2910 Family [ConnectX-7] [leonro@vm ~]$ ls -l /sys/bus/auxiliary/devices/ mlx5_core.eth.2 -> ../../../devices/pci0000:00/0000:00:0b.0/mlx5_core.eth.2 mlx5_core.rdma.0 -> ../../../devices/pci0000:00/0000:00:09.0/mlx5_core.rdma.0 mlx5_core.rdma.1 -> ../../../devices/pci0000:00/0000:00:0a.0/mlx5_core.rdma.1 mlx5_core.rdma.2 -> ../../../devices/pci0000:00/0000:00:0b.0/mlx5_core.rdma.2 mlx5_core.vdpa.1 -> ../../../devices/pci0000:00/0000:00:0a.0/mlx5_core.vdpa.1 mlx5_core.vdpa.2 -> ../../../devices/pci0000:00/0000:00:0b.0/mlx5_core.vdpa.2 [leonro@vm ~]$ rdma dev 0: ibp0s9: node_type ca fw 4.6.9999 node_guid 5254:00c0:fe12:3455 sys_image_guid 5254:00c0:fe12:3455 1: ibp0s10: node_type ca fw 4.6.9999 node_guid 5254:00c0:fe12:3456 sys_image_guid 5254:00c0:fe12:3456 2: rdmap0s11: node_type ca fw 4.6.9999 node_guid 5254:00c0:fe12:3457 sys_image_guid 5254:00c0:fe12:3457 System with RoCE SR-IOV card with 4 VFs: [leonro@vm ~]$ lspci |grep nox 01:00.0 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6] 01:00.1 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6 Virtual Function] 01:00.2 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6 Virtual Function] 01:00.3 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6 Virtual Function] 01:00.4 Ethernet controller: Mellanox Technologies MT28908 Family [ConnectX-6 Virtual Function] [leonro@vm ~]$ ls -l /sys/bus/auxiliary/devices/ mlx5_core.eth.0 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.0/mlx5_core.eth.0 mlx5_core.eth.1 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.1/mlx5_core.eth.1 mlx5_core.eth.2 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.2/mlx5_core.eth.2 mlx5_core.eth.3 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.3/mlx5_core.eth.3 mlx5_core.eth.4 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.4/mlx5_core.eth.4 mlx5_core.rdma.0 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.0/mlx5_core.rdma.0 mlx5_core.rdma.1 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.1/mlx5_core.rdma.1 mlx5_core.rdma.2 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.2/mlx5_core.rdma.2 mlx5_core.rdma.3 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.3/mlx5_core.rdma.3 mlx5_core.rdma.4 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.4/mlx5_core.rdma.4 mlx5_core.vdpa.1 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.1/mlx5_core.vdpa.1 mlx5_core.vdpa.2 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.2/mlx5_core.vdpa.2 mlx5_core.vdpa.3 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.3/mlx5_core.vdpa.3 mlx5_core.vdpa.4 -> ../../../devices/pci0000:00/0000:00:09.0/0000:01:00.4/mlx5_core.vdpa.4 [leonro@vm ~]$ rdma dev 0: rocep1s0f0: node_type ca fw 4.6.9999 node_guid 5254:00c0:fe12:3455 sys_image_guid 5254:00c0:fe12:3455 1: rocep1s0f0v0: node_type ca fw 4.6.9999 node_guid 0000:0000:0000:0000 sys_image_guid 5254:00c0:fe12:3456 2: rocep1s0f0v1: node_type ca fw 4.6.9999 node_guid 0000:0000:0000:0000 sys_image_guid 5254:00c0:fe12:3457 3: rocep1s0f0v2: node_type ca fw 4.6.9999 node_guid 0000:0000:0000:0000 sys_image_guid 5254:00c0:fe12:3458 4: rocep1s0f0v3: node_type ca fw 4.6.9999 node_guid 0000:0000:0000:0000 sys_image_guid 5254:00c0:fe12:3459 Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
2020-10-08 13:06:37 +00:00
u32 flags;
struct mlx5_devcom *devcom;
struct mlx5_fw_reset *fw_reset;
struct mlx5_core_roce roce;
struct mlx5_fc_stats fc_stats;
struct mlx5_rl_table rl_table;
struct mlx5_ft_pool *ft_pool;
struct mlx5_bfreg_data bfregs;
struct mlx5_uars_page *uar;
#ifdef CONFIG_MLX5_SF
struct mlx5_vhca_state_notifier *vhca_state_notifier;
net/mlx5: SF, Add auxiliary device support Introduce API to add and delete an auxiliary device for an SF. Each SF has its own dedicated window in the PCI BAR 2. SF device is similar to PCI PF and VF that supports multiple class of devices such as net, rdma and vdpa. SF device will be added or removed in subsequent patch during SF devlink port function state change command. A subfunction device exposes user supplied subfunction number which will be further used by systemd/udev to have deterministic name for its netdevice and rdma device. An mlx5 subfunction auxiliary device example: $ devlink dev eswitch set pci/0000:06:00.0 mode switchdev $ devlink port show pci/0000:06:00.0/65535: type eth netdev ens2f0np0 flavour physical port 0 splittable false $ devlink port add pci/0000:06:00.0 flavour pcisf pfnum 0 sfnum 88 pci/0000:08:00.0/32768: type eth netdev eth6 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached $ devlink port show ens2f0npf0sf88 pci/0000:06:00.0/32768: type eth netdev ens2f0npf0sf88 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:88:88 state inactive opstate detached $ devlink port function set ens2f0npf0sf88 hw_addr 00:00:00:00:88:88 state active On activation, $ ls -l /sys/bus/auxiliary/devices/ mlx5_core.sf.4 -> ../../../devices/pci0000:00/0000:00:03.0/0000:06:00.0/mlx5_core.sf.4 $ cat /sys/bus/auxiliary/devices/mlx5_core.sf.4/sfnum 88 Signed-off-by: Parav Pandit <parav@nvidia.com> Reviewed-by: Vu Pham <vuhuong@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
2020-12-12 06:12:17 +00:00
struct mlx5_sf_dev_table *sf_dev_table;
struct mlx5_core_dev *parent_mdev;
#endif
net/mlx5: SF, Add port add delete functionality To handle SF port management outside of the eswitch as independent software layer, introduce eswitch notifier APIs so that mlx5 upper layer who wish to support sf port management in switchdev mode can perform its task whenever eswitch mode is set to switchdev or before eswitch is disabled. Initialize sf port table on such eswitch event. Add SF port add and delete functionality in switchdev mode. Destroy all SF ports when eswitch is disabled. Expose SF port add and delete to user via devlink commands. $ devlink dev eswitch set pci/0000:06:00.0 mode switchdev $ devlink port show pci/0000:06:00.0/65535: type eth netdev ens2f0np0 flavour physical port 0 splittable false $ devlink port add pci/0000:06:00.0 flavour pcisf pfnum 0 sfnum 88 pci/0000:06:00.0/32768: type eth netdev eth6 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached $ devlink port show ens2f0npf0sf88 pci/0000:06:00.0/32768: type eth netdev ens2f0npf0sf88 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached or by its unique port index: $ devlink port show pci/0000:06:00.0/32768 pci/0000:06:00.0/32768: type eth netdev ens2f0npf0sf88 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached $ devlink port show ens2f0npf0sf88 -jp { "port": { "pci/0000:06:00.0/32768": { "type": "eth", "netdev": "ens2f0npf0sf88", "flavour": "pcisf", "controller": 0, "pfnum": 0, "sfnum": 88, "external": false, "splittable": false, "function": { "hw_addr": "00:00:00:00:00:00", "state": "inactive", "opstate": "detached" } } } } Signed-off-by: Parav Pandit <parav@nvidia.com> Reviewed-by: Vu Pham <vuhuong@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
2020-12-12 06:12:21 +00:00
#ifdef CONFIG_MLX5_SF_MANAGER
struct mlx5_sf_hw_table *sf_hw_table;
struct mlx5_sf_table *sf_table;
#endif
};
enum mlx5_device_state {
MLX5_DEVICE_STATE_UP = 1,
MLX5_DEVICE_STATE_INTERNAL_ERROR,
};
enum mlx5_interface_state {
MLX5_INTERFACE_STATE_UP = BIT(0),
MLX5_BREAK_FW_WAIT = BIT(1),
};
enum mlx5_pci_status {
MLX5_PCI_STATUS_DISABLED,
MLX5_PCI_STATUS_ENABLED,
};
enum mlx5_pagefault_type_flags {
MLX5_PFAULT_REQUESTOR = 1 << 0,
MLX5_PFAULT_WRITE = 1 << 1,
MLX5_PFAULT_RDMA = 1 << 2,
};
struct mlx5_td {
/* protects tirs list changes while tirs refresh */
struct mutex list_lock;
struct list_head tirs_list;
u32 tdn;
};
struct mlx5e_resources {
struct mlx5e_hw_objs {
u32 pdn;
struct mlx5_td td;
u32 mkey;
struct mlx5_sq_bfreg bfreg;
} hw_objs;
struct devlink_port dl_port;
struct net_device *uplink_netdev;
};
enum mlx5_sw_icm_type {
MLX5_SW_ICM_TYPE_STEERING,
MLX5_SW_ICM_TYPE_HEADER_MODIFY,
MLX5_SW_ICM_TYPE_HEADER_MODIFY_PATTERN,
};
#define MLX5_MAX_RESERVED_GIDS 8
struct mlx5_rsvd_gids {
unsigned int start;
unsigned int count;
struct ida ida;
};
#define MAX_PIN_NUM 8
struct mlx5_pps {
u8 pin_caps[MAX_PIN_NUM];
struct work_struct out_work;
u64 start[MAX_PIN_NUM];
u8 enabled;
};
struct mlx5_timer {
struct cyclecounter cycles;
struct timecounter tc;
u32 nominal_c_mult;
unsigned long overflow_period;
struct delayed_work overflow_work;
};
struct mlx5_clock {
struct mlx5_nb pps_nb;
seqlock_t lock;
struct hwtstamp_config hwtstamp_config;
struct ptp_clock *ptp;
struct ptp_clock_info ptp_info;
struct mlx5_pps pps_info;
struct mlx5_timer timer;
};
struct mlx5_dm;
struct mlx5_fw_tracer;
struct mlx5_vxlan;
struct mlx5_geneve;
struct mlx5_hv_vhca;
#define MLX5_LOG_SW_ICM_BLOCK_SIZE(dev) (MLX5_CAP_DEV_MEM(dev, log_sw_icm_alloc_granularity))
#define MLX5_SW_ICM_BLOCK_SIZE(dev) (1 << MLX5_LOG_SW_ICM_BLOCK_SIZE(dev))
enum {
MLX5_PROF_MASK_QP_SIZE = (u64)1 << 0,
MLX5_PROF_MASK_MR_CACHE = (u64)1 << 1,
};
enum {
MKEY_CACHE_LAST_STD_ENTRY = 20,
MLX5_IMR_MTT_CACHE_ENTRY,
MLX5_IMR_KSM_CACHE_ENTRY,
MAX_MKEY_CACHE_ENTRIES
};
struct mlx5_profile {
u64 mask;
u8 log_max_qp;
struct {
int size;
int limit;
} mr_cache[MAX_MKEY_CACHE_ENTRIES];
};
struct mlx5_hca_cap {
u32 cur[MLX5_UN_SZ_DW(hca_cap_union)];
u32 max[MLX5_UN_SZ_DW(hca_cap_union)];
};
struct mlx5_core_dev {
struct device *device;
enum mlx5_coredev_type coredev_type;
struct pci_dev *pdev;
/* sync pci state */
struct mutex pci_status_mutex;
enum mlx5_pci_status pci_status;
u8 rev_id;
char board_id[MLX5_BOARD_ID_LEN];
struct mlx5_cmd cmd;
struct {
struct mlx5_hca_cap *hca[MLX5_CAP_NUM];
u32 pcam[MLX5_ST_SZ_DW(pcam_reg)];
u32 mcam[MLX5_MCAM_REGS_NUM][MLX5_ST_SZ_DW(mcam_reg)];
u32 fpga[MLX5_ST_SZ_DW(fpga_cap)];
u32 qcam[MLX5_ST_SZ_DW(qcam_reg)];
net/mlx5: Introduce Mellanox SmartNIC and modify page management logic Mellanox's SmartNIC combines embedded CPU(e.g, ARM) processing power with advanced network offloads to accelerate a multitude of security, networking and storage applications. With the introduction of the SmartNIC, there is a new PCI function called Embedded CPU Physical Function(ECPF). And it's possible for a PF to get its ICM pages from the ECPF PCI function. Driver shall identify if it is running on such a function by reading a bit in the initialization segment. When firmware asks for pages, it would issue a page request event specifying how many pages it requests and for which function. That driver responds with a manage_pages command providing the requested pages along with an indication for which function it is providing these pages. The encoding before this patch was as follows: function_id == 0: pages are requested for the function receiving the EQE. function_id != 0: pages are requested for VF identified by the function_id value A new one bit field in the EQE identifies that pages are requested for the ECPF. The notion of page_supplier can be introduced here and to support that, manage pages and query pages were modified so firmware can distinguish the following cases: 1. Function provides pages for itself 2. PF provides pages for its VF 3. ECPF provides pages to itself 4. ECPF provides pages for another function This distinction is possible through the introduction of the bit "embedded_cpu_function" in query_pages, manage_pages and page request EQE. Signed-off-by: Bodong Wang <bodong@mellanox.com> Signed-off-by: Eli Cohen <eli@mellanox.com> Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
2019-02-13 06:55:35 +00:00
u8 embedded_cpu;
} caps;
struct mlx5_timeouts *timeouts;
u64 sys_image_guid;
phys_addr_t iseg_base;
struct mlx5_init_seg __iomem *iseg;
phys_addr_t bar_addr;
enum mlx5_device_state state;
/* sync interface state */
struct mutex intf_state_mutex;
net/mlx5: Avoid false positive lockdep warning by adding lock_class_key Add a lock_class_key per mlx5 device to avoid a false positive "possible circular locking dependency" warning by lockdep, on flows which lock more than one mlx5 device, such as adding SF. kernel log: ====================================================== WARNING: possible circular locking dependency detected 5.19.0-rc8+ #2 Not tainted ------------------------------------------------------ kworker/u20:0/8 is trying to acquire lock: ffff88812dfe0d98 (&dev->intf_state_mutex){+.+.}-{3:3}, at: mlx5_init_one+0x2e/0x490 [mlx5_core] but task is already holding lock: ffff888101aa7898 (&(&notifier->n_head)->rwsem){++++}-{3:3}, at: blocking_notifier_call_chain+0x5a/0x130 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&(&notifier->n_head)->rwsem){++++}-{3:3}: down_write+0x90/0x150 blocking_notifier_chain_register+0x53/0xa0 mlx5_sf_table_init+0x369/0x4a0 [mlx5_core] mlx5_init_one+0x261/0x490 [mlx5_core] probe_one+0x430/0x680 [mlx5_core] local_pci_probe+0xd6/0x170 work_for_cpu_fn+0x4e/0xa0 process_one_work+0x7c2/0x1340 worker_thread+0x6f6/0xec0 kthread+0x28f/0x330 ret_from_fork+0x1f/0x30 -> #0 (&dev->intf_state_mutex){+.+.}-{3:3}: __lock_acquire+0x2fc7/0x6720 lock_acquire+0x1c1/0x550 __mutex_lock+0x12c/0x14b0 mlx5_init_one+0x2e/0x490 [mlx5_core] mlx5_sf_dev_probe+0x29c/0x370 [mlx5_core] auxiliary_bus_probe+0x9d/0xe0 really_probe+0x1e0/0xaa0 __driver_probe_device+0x219/0x480 driver_probe_device+0x49/0x130 __device_attach_driver+0x1b8/0x280 bus_for_each_drv+0x123/0x1a0 __device_attach+0x1a3/0x460 bus_probe_device+0x1a2/0x260 device_add+0x9b1/0x1b40 __auxiliary_device_add+0x88/0xc0 mlx5_sf_dev_state_change_handler+0x67e/0x9d0 [mlx5_core] blocking_notifier_call_chain+0xd5/0x130 mlx5_vhca_state_work_handler+0x2b0/0x3f0 [mlx5_core] process_one_work+0x7c2/0x1340 worker_thread+0x59d/0xec0 kthread+0x28f/0x330 ret_from_fork+0x1f/0x30 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&(&notifier->n_head)->rwsem); lock(&dev->intf_state_mutex); lock(&(&notifier->n_head)->rwsem); lock(&dev->intf_state_mutex); *** DEADLOCK *** 4 locks held by kworker/u20:0/8: #0: ffff888150612938 ((wq_completion)mlx5_events){+.+.}-{0:0}, at: process_one_work+0x6e2/0x1340 #1: ffff888100cafdb8 ((work_completion)(&work->work)#3){+.+.}-{0:0}, at: process_one_work+0x70f/0x1340 #2: ffff888101aa7898 (&(&notifier->n_head)->rwsem){++++}-{3:3}, at: blocking_notifier_call_chain+0x5a/0x130 #3: ffff88813682d0e8 (&dev->mutex){....}-{3:3}, at:__device_attach+0x76/0x460 stack backtrace: CPU: 6 PID: 8 Comm: kworker/u20:0 Not tainted 5.19.0-rc8+ Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Workqueue: mlx5_events mlx5_vhca_state_work_handler [mlx5_core] Call Trace: <TASK> dump_stack_lvl+0x57/0x7d check_noncircular+0x278/0x300 ? print_circular_bug+0x460/0x460 ? lock_chain_count+0x20/0x20 ? register_lock_class+0x1880/0x1880 __lock_acquire+0x2fc7/0x6720 ? register_lock_class+0x1880/0x1880 ? register_lock_class+0x1880/0x1880 lock_acquire+0x1c1/0x550 ? mlx5_init_one+0x2e/0x490 [mlx5_core] ? lockdep_hardirqs_on_prepare+0x400/0x400 __mutex_lock+0x12c/0x14b0 ? mlx5_init_one+0x2e/0x490 [mlx5_core] ? mlx5_init_one+0x2e/0x490 [mlx5_core] ? _raw_read_unlock+0x1f/0x30 ? mutex_lock_io_nested+0x1320/0x1320 ? __ioremap_caller.constprop.0+0x306/0x490 ? mlx5_sf_dev_probe+0x269/0x370 [mlx5_core] ? iounmap+0x160/0x160 mlx5_init_one+0x2e/0x490 [mlx5_core] mlx5_sf_dev_probe+0x29c/0x370 [mlx5_core] ? mlx5_sf_dev_remove+0x130/0x130 [mlx5_core] auxiliary_bus_probe+0x9d/0xe0 really_probe+0x1e0/0xaa0 __driver_probe_device+0x219/0x480 ? auxiliary_match_id+0xe9/0x140 driver_probe_device+0x49/0x130 __device_attach_driver+0x1b8/0x280 ? driver_allows_async_probing+0x140/0x140 bus_for_each_drv+0x123/0x1a0 ? bus_for_each_dev+0x1a0/0x1a0 ? lockdep_hardirqs_on_prepare+0x286/0x400 ? trace_hardirqs_on+0x2d/0x100 __device_attach+0x1a3/0x460 ? device_driver_attach+0x1e0/0x1e0 ? kobject_uevent_env+0x22d/0xf10 bus_probe_device+0x1a2/0x260 device_add+0x9b1/0x1b40 ? dev_set_name+0xab/0xe0 ? __fw_devlink_link_to_suppliers+0x260/0x260 ? memset+0x20/0x40 ? lockdep_init_map_type+0x21a/0x7d0 __auxiliary_device_add+0x88/0xc0 ? auxiliary_device_init+0x86/0xa0 mlx5_sf_dev_state_change_handler+0x67e/0x9d0 [mlx5_core] blocking_notifier_call_chain+0xd5/0x130 mlx5_vhca_state_work_handler+0x2b0/0x3f0 [mlx5_core] ? mlx5_vhca_event_arm+0x100/0x100 [mlx5_core] ? lock_downgrade+0x6e0/0x6e0 ? lockdep_hardirqs_on_prepare+0x286/0x400 process_one_work+0x7c2/0x1340 ? lockdep_hardirqs_on_prepare+0x400/0x400 ? pwq_dec_nr_in_flight+0x230/0x230 ? rwlock_bug.part.0+0x90/0x90 worker_thread+0x59d/0xec0 ? process_one_work+0x1340/0x1340 kthread+0x28f/0x330 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x1f/0x30 </TASK> Fixes: 6a3273217469 ("net/mlx5: SF, Port function state change support") Signed-off-by: Moshe Shemesh <moshe@nvidia.com> Reviewed-by: Shay Drory <shayd@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
2022-08-03 07:49:23 +00:00
struct lock_class_key lock_key;
unsigned long intf_state;
struct mlx5_priv priv;
struct mlx5_profile profile;
u32 issi;
struct mlx5e_resources mlx5e_res;
struct mlx5_dm *dm;
struct mlx5_vxlan *vxlan;
struct mlx5_geneve *geneve;
struct {
struct mlx5_rsvd_gids reserved_gids;
u32 roce_en;
} roce;
#ifdef CONFIG_MLX5_FPGA
struct mlx5_fpga_device *fpga;
#endif
struct mlx5_clock clock;
struct mlx5_ib_clock_info *clock_info;
struct mlx5_fw_tracer *tracer;
struct mlx5_rsc_dump *rsc_dump;
u32 vsc_addr;
struct mlx5_hv_vhca *hv_vhca;
};
struct mlx5_db {
__be32 *db;
union {
struct mlx5_db_pgdir *pgdir;
struct mlx5_ib_user_db_page *user_page;
} u;
dma_addr_t dma;
int index;
};
enum {
MLX5_COMP_EQ_SIZE = 1024,
};
enum {
MLX5_PTYS_IB = 1 << 0,
MLX5_PTYS_EN = 1 << 2,
};
typedef void (*mlx5_cmd_cbk_t)(int status, void *context);
enum {
MLX5_CMD_ENT_STATE_PENDING_COMP,
};
struct mlx5_cmd_work_ent {
unsigned long state;
struct mlx5_cmd_msg *in;
struct mlx5_cmd_msg *out;
void *uout;
int uout_size;
mlx5_cmd_cbk_t callback;
struct delayed_work cb_timeout_work;
void *context;
int idx;
struct completion handling;
struct completion done;
struct mlx5_cmd *cmd;
struct work_struct work;
struct mlx5_cmd_layout *lay;
int ret;
int page_queue;
u8 status;
u8 token;
u64 ts1;
u64 ts2;
u16 op;
bool polling;
/* Track the max comp handlers */
refcount_t refcnt;
};
struct mlx5_pas {
u64 pa;
u8 log_sz;
};
enum phy_port_state {
MLX5_AAA_111
};
struct mlx5_hca_vport_context {
u32 field_select;
bool sm_virt_aware;
bool has_smi;
bool has_raw;
enum port_state_policy policy;
enum phy_port_state phys_state;
enum ib_port_state vport_state;
u8 port_physical_state;
u64 sys_image_guid;
u64 port_guid;
u64 node_guid;
u32 cap_mask1;
u32 cap_mask1_perm;
u16 cap_mask2;
u16 cap_mask2_perm;
u16 lid;
u8 init_type_reply; /* bitmask: see ib spec 14.2.5.6 InitTypeReply */
u8 lmc;
u8 subnet_timeout;
u16 sm_lid;
u8 sm_sl;
u16 qkey_violation_counter;
u16 pkey_violation_counter;
bool grh_required;
};
#define STRUCT_FIELD(header, field) \
.struct_offset_bytes = offsetof(struct ib_unpacked_ ## header, field), \
.struct_size_bytes = sizeof((struct ib_unpacked_ ## header *)0)->field
extern struct dentry *mlx5_debugfs_root;
static inline u16 fw_rev_maj(struct mlx5_core_dev *dev)
{
return ioread32be(&dev->iseg->fw_rev) & 0xffff;
}
static inline u16 fw_rev_min(struct mlx5_core_dev *dev)
{
return ioread32be(&dev->iseg->fw_rev) >> 16;
}
static inline u16 fw_rev_sub(struct mlx5_core_dev *dev)
{
return ioread32be(&dev->iseg->cmdif_rev_fw_sub) & 0xffff;
}
static inline u32 mlx5_base_mkey(const u32 key)
{
return key & 0xffffff00u;
}
static inline u32 wq_get_byte_sz(u8 log_sz, u8 log_stride)
{
return ((u32)1 << log_sz) << log_stride;
}
static inline void mlx5_init_fbc_offset(struct mlx5_buf_list *frags,
u8 log_stride, u8 log_sz,
u16 strides_offset,
struct mlx5_frag_buf_ctrl *fbc)
{
fbc->frags = frags;
fbc->log_stride = log_stride;
fbc->log_sz = log_sz;
fbc->sz_m1 = (1 << fbc->log_sz) - 1;
fbc->log_frag_strides = PAGE_SHIFT - fbc->log_stride;
fbc->frag_sz_m1 = (1 << fbc->log_frag_strides) - 1;
fbc->strides_offset = strides_offset;
}
static inline void mlx5_init_fbc(struct mlx5_buf_list *frags,
u8 log_stride, u8 log_sz,
struct mlx5_frag_buf_ctrl *fbc)
{
mlx5_init_fbc_offset(frags, log_stride, log_sz, 0, fbc);
}
static inline void *mlx5_frag_buf_get_wqe(struct mlx5_frag_buf_ctrl *fbc,
u32 ix)
{
unsigned int frag;
ix += fbc->strides_offset;
frag = ix >> fbc->log_frag_strides;
return fbc->frags[frag].buf + ((fbc->frag_sz_m1 & ix) << fbc->log_stride);
}
static inline u32
mlx5_frag_buf_get_idx_last_contig_stride(struct mlx5_frag_buf_ctrl *fbc, u32 ix)
{
u32 last_frag_stride_idx = (ix + fbc->strides_offset) | fbc->frag_sz_m1;
return min_t(u32, last_frag_stride_idx - fbc->strides_offset, fbc->sz_m1);
}
enum {
CMD_ALLOWED_OPCODE_ALL,
};
void mlx5_cmd_use_events(struct mlx5_core_dev *dev);
void mlx5_cmd_use_polling(struct mlx5_core_dev *dev);
void mlx5_cmd_allowed_opcode(struct mlx5_core_dev *dev, u16 opcode);
struct mlx5_async_ctx {
struct mlx5_core_dev *dev;
atomic_t num_inflight;
net/mlx5: Fix possible use-after-free in async command interface [ Upstream commit bacd22df95147ed673bec4692ab2d4d585935241 ] mlx5_cmd_cleanup_async_ctx should return only after all its callback handlers were completed. Before this patch, the below race between mlx5_cmd_cleanup_async_ctx and mlx5_cmd_exec_cb_handler was possible and lead to a use-after-free: 1. mlx5_cmd_cleanup_async_ctx is called while num_inflight is 2 (i.e. elevated by 1, a single inflight callback). 2. mlx5_cmd_cleanup_async_ctx decreases num_inflight to 1. 3. mlx5_cmd_exec_cb_handler is called, decreases num_inflight to 0 and is about to call wake_up(). 4. mlx5_cmd_cleanup_async_ctx calls wait_event, which returns immediately as the condition (num_inflight == 0) holds. 5. mlx5_cmd_cleanup_async_ctx returns. 6. The caller of mlx5_cmd_cleanup_async_ctx frees the mlx5_async_ctx object. 7. mlx5_cmd_exec_cb_handler goes on and calls wake_up() on the freed object. Fix it by syncing using a completion object. Mark it completed when num_inflight reaches 0. Trace: BUG: KASAN: use-after-free in do_raw_spin_lock+0x23d/0x270 Read of size 4 at addr ffff888139cd12f4 by task swapper/5/0 CPU: 5 PID: 0 Comm: swapper/5 Not tainted 6.0.0-rc3_for_upstream_debug_2022_08_30_13_10 #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Call Trace: <IRQ> dump_stack_lvl+0x57/0x7d print_report.cold+0x2d5/0x684 ? do_raw_spin_lock+0x23d/0x270 kasan_report+0xb1/0x1a0 ? do_raw_spin_lock+0x23d/0x270 do_raw_spin_lock+0x23d/0x270 ? rwlock_bug.part.0+0x90/0x90 ? __delete_object+0xb8/0x100 ? lock_downgrade+0x6e0/0x6e0 _raw_spin_lock_irqsave+0x43/0x60 ? __wake_up_common_lock+0xb9/0x140 __wake_up_common_lock+0xb9/0x140 ? __wake_up_common+0x650/0x650 ? destroy_tis_callback+0x53/0x70 [mlx5_core] ? kasan_set_track+0x21/0x30 ? destroy_tis_callback+0x53/0x70 [mlx5_core] ? kfree+0x1ba/0x520 ? do_raw_spin_unlock+0x54/0x220 mlx5_cmd_exec_cb_handler+0x136/0x1a0 [mlx5_core] ? mlx5_cmd_cleanup_async_ctx+0x220/0x220 [mlx5_core] ? mlx5_cmd_cleanup_async_ctx+0x220/0x220 [mlx5_core] mlx5_cmd_comp_handler+0x65a/0x12b0 [mlx5_core] ? dump_command+0xcc0/0xcc0 [mlx5_core] ? lockdep_hardirqs_on_prepare+0x400/0x400 ? cmd_comp_notifier+0x7e/0xb0 [mlx5_core] cmd_comp_notifier+0x7e/0xb0 [mlx5_core] atomic_notifier_call_chain+0xd7/0x1d0 mlx5_eq_async_int+0x3ce/0xa20 [mlx5_core] atomic_notifier_call_chain+0xd7/0x1d0 ? irq_release+0x140/0x140 [mlx5_core] irq_int_handler+0x19/0x30 [mlx5_core] __handle_irq_event_percpu+0x1f2/0x620 handle_irq_event+0xb2/0x1d0 handle_edge_irq+0x21e/0xb00 __common_interrupt+0x79/0x1a0 common_interrupt+0x78/0xa0 </IRQ> <TASK> asm_common_interrupt+0x22/0x40 RIP: 0010:default_idle+0x42/0x60 Code: c1 83 e0 07 48 c1 e9 03 83 c0 03 0f b6 14 11 38 d0 7c 04 84 d2 75 14 8b 05 eb 47 22 02 85 c0 7e 07 0f 00 2d e0 9f 48 00 fb f4 <c3> 48 c7 c7 80 08 7f 85 e8 d1 d3 3e fe eb de 66 66 2e 0f 1f 84 00 RSP: 0018:ffff888100dbfdf0 EFLAGS: 00000242 RAX: 0000000000000001 RBX: ffffffff84ecbd48 RCX: 1ffffffff0afe110 RDX: 0000000000000004 RSI: 0000000000000000 RDI: ffffffff835cc9bc RBP: 0000000000000005 R08: 0000000000000001 R09: ffff88881dec4ac3 R10: ffffed1103bd8958 R11: 0000017d0ca571c9 R12: 0000000000000005 R13: ffffffff84f024e0 R14: 0000000000000000 R15: dffffc0000000000 ? default_idle_call+0xcc/0x450 default_idle_call+0xec/0x450 do_idle+0x394/0x450 ? arch_cpu_idle_exit+0x40/0x40 ? do_idle+0x17/0x450 cpu_startup_entry+0x19/0x20 start_secondary+0x221/0x2b0 ? set_cpu_sibling_map+0x2070/0x2070 secondary_startup_64_no_verify+0xcd/0xdb </TASK> Allocated by task 49502: kasan_save_stack+0x1e/0x40 __kasan_kmalloc+0x81/0xa0 kvmalloc_node+0x48/0xe0 mlx5e_bulk_async_init+0x35/0x110 [mlx5_core] mlx5e_tls_priv_tx_list_cleanup+0x84/0x3e0 [mlx5_core] mlx5e_ktls_cleanup_tx+0x38f/0x760 [mlx5_core] mlx5e_cleanup_nic_tx+0xa7/0x100 [mlx5_core] mlx5e_detach_netdev+0x1ca/0x2b0 [mlx5_core] mlx5e_suspend+0xdb/0x140 [mlx5_core] mlx5e_remove+0x89/0x190 [mlx5_core] auxiliary_bus_remove+0x52/0x70 device_release_driver_internal+0x40f/0x650 driver_detach+0xc1/0x180 bus_remove_driver+0x125/0x2f0 auxiliary_driver_unregister+0x16/0x50 mlx5e_cleanup+0x26/0x30 [mlx5_core] cleanup+0xc/0x4e [mlx5_core] __x64_sys_delete_module+0x2b5/0x450 do_syscall_64+0x3d/0x90 entry_SYSCALL_64_after_hwframe+0x46/0xb0 Freed by task 49502: kasan_save_stack+0x1e/0x40 kasan_set_track+0x21/0x30 kasan_set_free_info+0x20/0x30 ____kasan_slab_free+0x11d/0x1b0 kfree+0x1ba/0x520 mlx5e_tls_priv_tx_list_cleanup+0x2e7/0x3e0 [mlx5_core] mlx5e_ktls_cleanup_tx+0x38f/0x760 [mlx5_core] mlx5e_cleanup_nic_tx+0xa7/0x100 [mlx5_core] mlx5e_detach_netdev+0x1ca/0x2b0 [mlx5_core] mlx5e_suspend+0xdb/0x140 [mlx5_core] mlx5e_remove+0x89/0x190 [mlx5_core] auxiliary_bus_remove+0x52/0x70 device_release_driver_internal+0x40f/0x650 driver_detach+0xc1/0x180 bus_remove_driver+0x125/0x2f0 auxiliary_driver_unregister+0x16/0x50 mlx5e_cleanup+0x26/0x30 [mlx5_core] cleanup+0xc/0x4e [mlx5_core] __x64_sys_delete_module+0x2b5/0x450 do_syscall_64+0x3d/0x90 entry_SYSCALL_64_after_hwframe+0x46/0xb0 Fixes: e355477ed9e4 ("net/mlx5: Make mlx5_cmd_exec_cb() a safe API") Signed-off-by: Tariq Toukan <tariqt@nvidia.com> Reviewed-by: Moshe Shemesh <moshe@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com> Link: https://lore.kernel.org/r/20221026135153.154807-8-saeed@kernel.org Signed-off-by: Jakub Kicinski <kuba@kernel.org> Signed-off-by: Sasha Levin <sashal@kernel.org>
2022-10-26 13:51:45 +00:00
struct completion inflight_done;
};
struct mlx5_async_work;
typedef void (*mlx5_async_cbk_t)(int status, struct mlx5_async_work *context);
struct mlx5_async_work {
struct mlx5_async_ctx *ctx;
mlx5_async_cbk_t user_callback;
u16 opcode; /* cmd opcode */
void *out; /* pointer to the cmd output buffer */
};
void mlx5_cmd_init_async_ctx(struct mlx5_core_dev *dev,
struct mlx5_async_ctx *ctx);
void mlx5_cmd_cleanup_async_ctx(struct mlx5_async_ctx *ctx);
int mlx5_cmd_exec_cb(struct mlx5_async_ctx *ctx, void *in, int in_size,
void *out, int out_size, mlx5_async_cbk_t callback,
struct mlx5_async_work *work);
void mlx5_cmd_out_err(struct mlx5_core_dev *dev, u16 opcode, u16 op_mod, void *out);
int mlx5_cmd_do(struct mlx5_core_dev *dev, void *in, int in_size, void *out, int out_size);
int mlx5_cmd_check(struct mlx5_core_dev *dev, int err, void *in, void *out);
int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out,
int out_size);
#define mlx5_cmd_exec_inout(dev, ifc_cmd, in, out) \
({ \
mlx5_cmd_exec(dev, in, MLX5_ST_SZ_BYTES(ifc_cmd##_in), out, \
MLX5_ST_SZ_BYTES(ifc_cmd##_out)); \
})
#define mlx5_cmd_exec_in(dev, ifc_cmd, in) \
({ \
u32 _out[MLX5_ST_SZ_DW(ifc_cmd##_out)] = {}; \
mlx5_cmd_exec_inout(dev, ifc_cmd, in, _out); \
})
int mlx5_cmd_exec_polling(struct mlx5_core_dev *dev, void *in, int in_size,
void *out, int out_size);
bool mlx5_cmd_is_down(struct mlx5_core_dev *dev);
int mlx5_core_get_caps(struct mlx5_core_dev *dev, enum mlx5_cap_type cap_type);
void mlx5_health_flush(struct mlx5_core_dev *dev);
void mlx5_health_cleanup(struct mlx5_core_dev *dev);
int mlx5_health_init(struct mlx5_core_dev *dev);
void mlx5_start_health_poll(struct mlx5_core_dev *dev);
void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health);
void mlx5_drain_health_wq(struct mlx5_core_dev *dev);
void mlx5_trigger_health_work(struct mlx5_core_dev *dev);
int mlx5_frag_buf_alloc_node(struct mlx5_core_dev *dev, int size,
struct mlx5_frag_buf *buf, int node);
void mlx5_frag_buf_free(struct mlx5_core_dev *dev, struct mlx5_frag_buf *buf);
struct mlx5_cmd_mailbox *mlx5_alloc_cmd_mailbox_chain(struct mlx5_core_dev *dev,
gfp_t flags, int npages);
void mlx5_free_cmd_mailbox_chain(struct mlx5_core_dev *dev,
struct mlx5_cmd_mailbox *head);
int mlx5_core_create_mkey(struct mlx5_core_dev *dev, u32 *mkey, u32 *in,
int inlen);
int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, u32 mkey);
int mlx5_core_query_mkey(struct mlx5_core_dev *dev, u32 mkey, u32 *out,
int outlen);
int mlx5_core_alloc_pd(struct mlx5_core_dev *dev, u32 *pdn);
int mlx5_core_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn);
int mlx5_pagealloc_init(struct mlx5_core_dev *dev);
void mlx5_pagealloc_cleanup(struct mlx5_core_dev *dev);
void mlx5_pagealloc_start(struct mlx5_core_dev *dev);
void mlx5_pagealloc_stop(struct mlx5_core_dev *dev);
void mlx5_pages_debugfs_init(struct mlx5_core_dev *dev);
void mlx5_pages_debugfs_cleanup(struct mlx5_core_dev *dev);
void mlx5_core_req_pages_handler(struct mlx5_core_dev *dev, u16 func_id,
net/mlx5: Introduce Mellanox SmartNIC and modify page management logic Mellanox's SmartNIC combines embedded CPU(e.g, ARM) processing power with advanced network offloads to accelerate a multitude of security, networking and storage applications. With the introduction of the SmartNIC, there is a new PCI function called Embedded CPU Physical Function(ECPF). And it's possible for a PF to get its ICM pages from the ECPF PCI function. Driver shall identify if it is running on such a function by reading a bit in the initialization segment. When firmware asks for pages, it would issue a page request event specifying how many pages it requests and for which function. That driver responds with a manage_pages command providing the requested pages along with an indication for which function it is providing these pages. The encoding before this patch was as follows: function_id == 0: pages are requested for the function receiving the EQE. function_id != 0: pages are requested for VF identified by the function_id value A new one bit field in the EQE identifies that pages are requested for the ECPF. The notion of page_supplier can be introduced here and to support that, manage pages and query pages were modified so firmware can distinguish the following cases: 1. Function provides pages for itself 2. PF provides pages for its VF 3. ECPF provides pages to itself 4. ECPF provides pages for another function This distinction is possible through the introduction of the bit "embedded_cpu_function" in query_pages, manage_pages and page request EQE. Signed-off-by: Bodong Wang <bodong@mellanox.com> Signed-off-by: Eli Cohen <eli@mellanox.com> Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
2019-02-13 06:55:35 +00:00
s32 npages, bool ec_function);
int mlx5_satisfy_startup_pages(struct mlx5_core_dev *dev, int boot);
int mlx5_reclaim_startup_pages(struct mlx5_core_dev *dev);
void mlx5_register_debugfs(void);
void mlx5_unregister_debugfs(void);
void mlx5_fill_page_frag_array_perm(struct mlx5_frag_buf *buf, __be64 *pas, u8 perm);
void mlx5_fill_page_frag_array(struct mlx5_frag_buf *frag_buf, __be64 *pas);
int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn);
int mlx5_core_attach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn);
int mlx5_core_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn);
struct dentry *mlx5_debugfs_get_dev_root(struct mlx5_core_dev *dev);
void mlx5_qp_debugfs_init(struct mlx5_core_dev *dev);
void mlx5_qp_debugfs_cleanup(struct mlx5_core_dev *dev);
int mlx5_access_reg(struct mlx5_core_dev *dev, void *data_in, int size_in,
void *data_out, int size_out, u16 reg_id, int arg,
int write, bool verbose);
int mlx5_core_access_reg(struct mlx5_core_dev *dev, void *data_in,
int size_in, void *data_out, int size_out,
u16 reg_num, int arg, int write);
int mlx5_db_alloc_node(struct mlx5_core_dev *dev, struct mlx5_db *db,
int node);
static inline int mlx5_db_alloc(struct mlx5_core_dev *dev, struct mlx5_db *db)
{
return mlx5_db_alloc_node(dev, db, dev->priv.numa_node);
}
void mlx5_db_free(struct mlx5_core_dev *dev, struct mlx5_db *db);
const char *mlx5_command_str(int command);
void mlx5_cmdif_debugfs_init(struct mlx5_core_dev *dev);
void mlx5_cmdif_debugfs_cleanup(struct mlx5_core_dev *dev);
int mlx5_core_create_psv(struct mlx5_core_dev *dev, u32 pdn,
int npsvs, u32 *sig_index);
int mlx5_core_destroy_psv(struct mlx5_core_dev *dev, int psv_num);
void mlx5_core_put_rsc(struct mlx5_core_rsc_common *common);
int mlx5_query_odp_caps(struct mlx5_core_dev *dev,
struct mlx5_odp_caps *odp_caps);
int mlx5_core_query_ib_ppcnt(struct mlx5_core_dev *dev,
u8 port_num, void *out, size_t sz);
int mlx5_init_rl_table(struct mlx5_core_dev *dev);
void mlx5_cleanup_rl_table(struct mlx5_core_dev *dev);
int mlx5_rl_add_rate(struct mlx5_core_dev *dev, u16 *index,
struct mlx5_rate_limit *rl);
void mlx5_rl_remove_rate(struct mlx5_core_dev *dev, struct mlx5_rate_limit *rl);
bool mlx5_rl_is_in_range(struct mlx5_core_dev *dev, u32 rate);
int mlx5_rl_add_rate_raw(struct mlx5_core_dev *dev, void *rl_in, u16 uid,
bool dedicated_entry, u16 *index);
void mlx5_rl_remove_rate_raw(struct mlx5_core_dev *dev, u16 index);
bool mlx5_rl_are_equal(struct mlx5_rate_limit *rl_0,
struct mlx5_rate_limit *rl_1);
int mlx5_alloc_bfreg(struct mlx5_core_dev *mdev, struct mlx5_sq_bfreg *bfreg,
bool map_wc, bool fast_path);
void mlx5_free_bfreg(struct mlx5_core_dev *mdev, struct mlx5_sq_bfreg *bfreg);
unsigned int mlx5_comp_vectors_count(struct mlx5_core_dev *dev);
struct cpumask *
mlx5_comp_irq_get_affinity_mask(struct mlx5_core_dev *dev, int vector);
unsigned int mlx5_core_reserved_gids_count(struct mlx5_core_dev *dev);
int mlx5_core_roce_gid_set(struct mlx5_core_dev *dev, unsigned int index,
u8 roce_version, u8 roce_l3_type, const u8 *gid,
const u8 *mac, bool vlan, u16 vlan_id, u8 port_num);
static inline u32 mlx5_mkey_to_idx(u32 mkey)
{
return mkey >> 8;
}
static inline u32 mlx5_idx_to_mkey(u32 mkey_idx)
{
return mkey_idx << 8;
}
static inline u8 mlx5_mkey_variant(u32 mkey)
{
return mkey & 0xff;
}
/* Async-atomic event notifier used by mlx5 core to forward FW
* evetns received from event queue to mlx5 consumers.
* Optimise event queue dipatching.
*/
int mlx5_notifier_register(struct mlx5_core_dev *dev, struct notifier_block *nb);
int mlx5_notifier_unregister(struct mlx5_core_dev *dev, struct notifier_block *nb);
/* Async-atomic event notifier used for forwarding
* evetns from the event queue into the to mlx5 events dispatcher,
* eswitch, clock and others.
*/
int mlx5_eq_notifier_register(struct mlx5_core_dev *dev, struct mlx5_nb *nb);
int mlx5_eq_notifier_unregister(struct mlx5_core_dev *dev, struct mlx5_nb *nb);
/* Blocking event notifier used to forward SW events, used for slow path */
int mlx5_blocking_notifier_register(struct mlx5_core_dev *dev, struct notifier_block *nb);
int mlx5_blocking_notifier_unregister(struct mlx5_core_dev *dev, struct notifier_block *nb);
int mlx5_blocking_notifier_call_chain(struct mlx5_core_dev *dev, unsigned int event,
void *data);
int mlx5_core_query_vendor_id(struct mlx5_core_dev *mdev, u32 *vendor_id);
int mlx5_cmd_create_vport_lag(struct mlx5_core_dev *dev);
int mlx5_cmd_destroy_vport_lag(struct mlx5_core_dev *dev);
bool mlx5_lag_is_roce(struct mlx5_core_dev *dev);
bool mlx5_lag_is_sriov(struct mlx5_core_dev *dev);
bool mlx5_lag_is_active(struct mlx5_core_dev *dev);
bool mlx5_lag_is_master(struct mlx5_core_dev *dev);
bool mlx5_lag_is_shared_fdb(struct mlx5_core_dev *dev);
struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev);
u8 mlx5_lag_get_slave_port(struct mlx5_core_dev *dev,
struct net_device *slave);
int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev,
u64 *values,
int num_counters,
size_t *offsets);
struct mlx5_core_dev *mlx5_lag_get_peer_mdev(struct mlx5_core_dev *dev);
u8 mlx5_lag_get_num_ports(struct mlx5_core_dev *dev);
struct mlx5_uars_page *mlx5_get_uars_page(struct mlx5_core_dev *mdev);
void mlx5_put_uars_page(struct mlx5_core_dev *mdev, struct mlx5_uars_page *up);
int mlx5_dm_sw_icm_alloc(struct mlx5_core_dev *dev, enum mlx5_sw_icm_type type,
u64 length, u32 log_alignment, u16 uid,
phys_addr_t *addr, u32 *obj_id);
int mlx5_dm_sw_icm_dealloc(struct mlx5_core_dev *dev, enum mlx5_sw_icm_type type,
u64 length, u16 uid, phys_addr_t addr, u32 obj_id);
struct mlx5_core_dev *mlx5_vf_get_core_dev(struct pci_dev *pdev);
void mlx5_vf_put_core_dev(struct mlx5_core_dev *mdev);
int mlx5_sriov_blocking_notifier_register(struct mlx5_core_dev *mdev,
int vf_id,
struct notifier_block *nb);
void mlx5_sriov_blocking_notifier_unregister(struct mlx5_core_dev *mdev,
int vf_id,
struct notifier_block *nb);
#ifdef CONFIG_MLX5_CORE_IPOIB
struct net_device *mlx5_rdma_netdev_alloc(struct mlx5_core_dev *mdev,
struct ib_device *ibdev,
const char *name,
void (*setup)(struct net_device *));
#endif /* CONFIG_MLX5_CORE_IPOIB */
int mlx5_rdma_rn_get_params(struct mlx5_core_dev *mdev,
struct ib_device *device,
struct rdma_netdev_alloc_params *params);
enum {
MLX5_PCI_DEV_IS_VF = 1 << 0,
};
static inline bool mlx5_core_is_pf(const struct mlx5_core_dev *dev)
{
return dev->coredev_type == MLX5_COREDEV_PF;
}
static inline bool mlx5_core_is_vf(const struct mlx5_core_dev *dev)
{
return dev->coredev_type == MLX5_COREDEV_VF;
}
static inline bool mlx5_core_is_ecpf(const struct mlx5_core_dev *dev)
net/mlx5: Introduce Mellanox SmartNIC and modify page management logic Mellanox's SmartNIC combines embedded CPU(e.g, ARM) processing power with advanced network offloads to accelerate a multitude of security, networking and storage applications. With the introduction of the SmartNIC, there is a new PCI function called Embedded CPU Physical Function(ECPF). And it's possible for a PF to get its ICM pages from the ECPF PCI function. Driver shall identify if it is running on such a function by reading a bit in the initialization segment. When firmware asks for pages, it would issue a page request event specifying how many pages it requests and for which function. That driver responds with a manage_pages command providing the requested pages along with an indication for which function it is providing these pages. The encoding before this patch was as follows: function_id == 0: pages are requested for the function receiving the EQE. function_id != 0: pages are requested for VF identified by the function_id value A new one bit field in the EQE identifies that pages are requested for the ECPF. The notion of page_supplier can be introduced here and to support that, manage pages and query pages were modified so firmware can distinguish the following cases: 1. Function provides pages for itself 2. PF provides pages for its VF 3. ECPF provides pages to itself 4. ECPF provides pages for another function This distinction is possible through the introduction of the bit "embedded_cpu_function" in query_pages, manage_pages and page request EQE. Signed-off-by: Bodong Wang <bodong@mellanox.com> Signed-off-by: Eli Cohen <eli@mellanox.com> Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
2019-02-13 06:55:35 +00:00
{
return dev->caps.embedded_cpu;
}
static inline bool
mlx5_core_is_ecpf_esw_manager(const struct mlx5_core_dev *dev)
{
return dev->caps.embedded_cpu && MLX5_CAP_GEN(dev, eswitch_manager);
}
static inline bool mlx5_ecpf_vport_exists(const struct mlx5_core_dev *dev)
{
return mlx5_core_is_pf(dev) && MLX5_CAP_ESW(dev, ecpf_vport_exists);
}
static inline u16 mlx5_core_max_vfs(const struct mlx5_core_dev *dev)
{
return dev->priv.sriov.max_vfs;
}
static inline int mlx5_get_gid_table_len(u16 param)
{
if (param > 4) {
pr_warn("gid table length is zero\n");
return 0;
}
return 8 * (1 << param);
}
static inline bool mlx5_rl_is_supported(struct mlx5_core_dev *dev)
{
return !!(dev->priv.rl_table.max_size);
}
{net, IB}/mlx5: Manage port association for multiport RoCE When mlx5_ib_add is called determine if the mlx5 core device being added is capable of dual port RoCE operation. If it is, determine whether it is a master device or a slave device using the num_vhca_ports and affiliate_nic_vport_criteria capabilities. If the device is a slave, attempt to find a master device to affiliate it with. Devices that can be affiliated will share a system image guid. If none are found place it on a list of unaffiliated ports. If a master is found bind the port to it by configuring the port affiliation in the NIC vport context. Similarly when mlx5_ib_remove is called determine the port type. If it's a slave port, unaffiliate it from the master device, otherwise just remove it from the unaffiliated port list. The IB device is registered as a multiport device, even if a 2nd port is not available for affiliation. When the 2nd port is affiliated later the GID cache must be refreshed in order to get the default GIDs for the 2nd port in the cache. Export roce_rescan_device to provide a mechanism to refresh the cache after a new port is bound. In a multiport configuration all IB object (QP, MR, PD, etc) related commands should flow through the master mlx5_core_dev, other commands must be sent to the slave port mlx5_core_mdev, an interface is provide to get the correct mdev for non IB object commands. Signed-off-by: Daniel Jurgens <danielj@mellanox.com> Reviewed-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-04 15:25:36 +00:00
static inline int mlx5_core_is_mp_slave(struct mlx5_core_dev *dev)
{
return MLX5_CAP_GEN(dev, affiliate_nic_vport_criteria) &&
MLX5_CAP_GEN(dev, num_vhca_ports) <= 1;
}
static inline int mlx5_core_is_mp_master(struct mlx5_core_dev *dev)
{
return MLX5_CAP_GEN(dev, num_vhca_ports) > 1;
}
static inline int mlx5_core_mp_enabled(struct mlx5_core_dev *dev)
{
return mlx5_core_is_mp_slave(dev) ||
mlx5_core_is_mp_master(dev);
}
static inline int mlx5_core_native_port_num(struct mlx5_core_dev *dev)
{
{net, IB}/mlx5: Manage port association for multiport RoCE When mlx5_ib_add is called determine if the mlx5 core device being added is capable of dual port RoCE operation. If it is, determine whether it is a master device or a slave device using the num_vhca_ports and affiliate_nic_vport_criteria capabilities. If the device is a slave, attempt to find a master device to affiliate it with. Devices that can be affiliated will share a system image guid. If none are found place it on a list of unaffiliated ports. If a master is found bind the port to it by configuring the port affiliation in the NIC vport context. Similarly when mlx5_ib_remove is called determine the port type. If it's a slave port, unaffiliate it from the master device, otherwise just remove it from the unaffiliated port list. The IB device is registered as a multiport device, even if a 2nd port is not available for affiliation. When the 2nd port is affiliated later the GID cache must be refreshed in order to get the default GIDs for the 2nd port in the cache. Export roce_rescan_device to provide a mechanism to refresh the cache after a new port is bound. In a multiport configuration all IB object (QP, MR, PD, etc) related commands should flow through the master mlx5_core_dev, other commands must be sent to the slave port mlx5_core_mdev, an interface is provide to get the correct mdev for non IB object commands. Signed-off-by: Daniel Jurgens <danielj@mellanox.com> Reviewed-by: Parav Pandit <parav@mellanox.com> Signed-off-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-01-04 15:25:36 +00:00
if (!mlx5_core_mp_enabled(dev))
return 1;
return MLX5_CAP_GEN(dev, native_port_num);
}
static inline int mlx5_get_dev_index(struct mlx5_core_dev *dev)
{
int idx = MLX5_CAP_GEN(dev, native_port_num);
if (idx >= 1 && idx <= MLX5_MAX_PORTS)
return idx - 1;
else
return PCI_FUNC(dev->pdev->devfn);
}
enum {
MLX5_TRIGGERED_CMD_COMP = (u64)1 << 32,
};
bool mlx5_is_roce_on(struct mlx5_core_dev *dev);
static inline bool mlx5_get_roce_state(struct mlx5_core_dev *dev)
{
if (MLX5_CAP_GEN(dev, roce_rw_supported))
return MLX5_CAP_GEN(dev, roce);
/* If RoCE cap is read-only in FW, get RoCE state from devlink
* in order to support RoCE enable/disable feature
*/
return mlx5_is_roce_on(dev);
}
#endif /* MLX5_DRIVER_H */