2018-05-17 21:44:15 +00:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
|
|
|
/*
 * This file implements error recovery as a core part of PCIe error
 * reporting. When a PCIe error is delivered, an error message is
 * collected and printed to the console, and then an error recovery
 * procedure is executed following the PCI error recovery rules.
 *
 * Copyright (C) 2006 Intel Corp.
 *	Tom Long Nguyen (tom.l.nguyen@intel.com)
 *	Zhang Yanmin (yanmin.zhang@intel.com)
 */
|
|
|
|
|
|
|
|
#include <linux/pci.h>
|
|
|
|
#include <linux/module.h>
|
|
|
|
#include <linux/pci.h>
|
|
|
|
#include <linux/kernel.h>
|
|
|
|
#include <linux/errno.h>
|
|
|
|
#include <linux/aer.h>
|
|
|
|
#include "portdrv.h"
|
|
|
|
#include "../pci.h"
|
|
|
|
|
|
|
|
struct aer_broadcast_data {
|
|
|
|
enum pci_channel_state state;
|
|
|
|
enum pci_ers_result result;
|
|
|
|
};
|
|
|
|
|
|
|
|
static pci_ers_result_t merge_result(enum pci_ers_result orig,
|
|
|
|
enum pci_ers_result new)
|
|
|
|
{
|
|
|
|
if (new == PCI_ERS_RESULT_NO_AER_DRIVER)
|
|
|
|
return PCI_ERS_RESULT_NO_AER_DRIVER;
|
|
|
|
|
|
|
|
if (new == PCI_ERS_RESULT_NONE)
|
|
|
|
return orig;
|
|
|
|
|
|
|
|
switch (orig) {
|
|
|
|
case PCI_ERS_RESULT_CAN_RECOVER:
|
|
|
|
case PCI_ERS_RESULT_RECOVERED:
|
|
|
|
orig = new;
|
|
|
|
break;
|
|
|
|
case PCI_ERS_RESULT_DISCONNECT:
|
|
|
|
if (new == PCI_ERS_RESULT_NEED_RESET)
|
|
|
|
orig = PCI_ERS_RESULT_NEED_RESET;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
return orig;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int report_error_detected(struct pci_dev *dev, void *data)
|
|
|
|
{
|
|
|
|
pci_ers_result_t vote;
|
|
|
|
const struct pci_error_handlers *err_handler;
|
|
|
|
struct aer_broadcast_data *result_data;
|
|
|
|
|
|
|
|
result_data = (struct aer_broadcast_data *) data;
|
|
|
|
|
|
|
|
device_lock(&dev->dev);
|
|
|
|
dev->error_state = result_data->state;
|
|
|
|
|
|
|
|
if (!dev->driver ||
|
|
|
|
!dev->driver->err_handler ||
|
|
|
|
!dev->driver->err_handler->error_detected) {
|
|
|
|
/*
|
PCI/ERR: Run error recovery callbacks for all affected devices
If an Endpoint reported an error with ERR_FATAL, we previously ran driver
error recovery callbacks only for the Endpoint's driver. But if we reset a
Link to recover from the error, all downstream components are affected,
including the Endpoint, any multi-function peers, and children of those
peers.
Initiate the Link reset from the deepest Downstream Port that is
reliable, and call the error recovery callbacks for all its children.
If a Downstream Port (including a Root Port) reports an error, we assume
the Port itself is reliable and we need to reset its downstream Link. In
all other cases (Switch Upstream Ports, Endpoints, Bridges, etc), we assume
the Link leading to the component needs to be reset, so we initiate the
reset at the parent Downstream Port.
This allows two other clean-ups. First, we currently only use a Link
reset, which can only be initiated using a Downstream Port, so we can
remove checks for Endpoints. Second, the Downstream Port where we initiate
the Link reset is reliable (unlike components downstream from it), so the
special cases for error detect and resume are no longer necessary.
Signed-off-by: Keith Busch <keith.busch@intel.com>
[bhelgaas: changelog]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Sinan Kaya <okaya@kernel.org>
2018-09-20 16:27:13 +00:00
|
|
|
* If any device in the subtree does not have an error_detected
|
|
|
|
* callback, PCI_ERS_RESULT_NO_AER_DRIVER prevents subsequent
|
|
|
|
* error callbacks of "any" device in the subtree, and will
|
|
|
|
* exit in the disconnected error state.
|
2018-05-17 21:44:15 +00:00
|
|
|
*/
|
|
|
|
if (dev->hdr_type != PCI_HEADER_TYPE_BRIDGE)
|
|
|
|
vote = PCI_ERS_RESULT_NO_AER_DRIVER;
|
|
|
|
else
|
|
|
|
vote = PCI_ERS_RESULT_NONE;
|
|
|
|
} else {
|
|
|
|
err_handler = dev->driver->err_handler;
|
|
|
|
vote = err_handler->error_detected(dev, result_data->state);
|
|
|
|
pci_uevent_ers(dev, PCI_ERS_RESULT_NONE);
|
|
|
|
}
|
|
|
|
|
|
|
|
result_data->result = merge_result(result_data->result, vote);
|
|
|
|
device_unlock(&dev->dev);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int report_mmio_enabled(struct pci_dev *dev, void *data)
|
|
|
|
{
|
|
|
|
pci_ers_result_t vote;
|
|
|
|
const struct pci_error_handlers *err_handler;
|
|
|
|
struct aer_broadcast_data *result_data;
|
|
|
|
|
|
|
|
result_data = (struct aer_broadcast_data *) data;
|
|
|
|
|
|
|
|
device_lock(&dev->dev);
|
|
|
|
if (!dev->driver ||
|
|
|
|
!dev->driver->err_handler ||
|
|
|
|
!dev->driver->err_handler->mmio_enabled)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
err_handler = dev->driver->err_handler;
|
|
|
|
vote = err_handler->mmio_enabled(dev);
|
|
|
|
result_data->result = merge_result(result_data->result, vote);
|
|
|
|
out:
|
|
|
|
device_unlock(&dev->dev);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int report_slot_reset(struct pci_dev *dev, void *data)
|
|
|
|
{
|
|
|
|
pci_ers_result_t vote;
|
|
|
|
const struct pci_error_handlers *err_handler;
|
|
|
|
struct aer_broadcast_data *result_data;
|
|
|
|
|
|
|
|
result_data = (struct aer_broadcast_data *) data;
|
|
|
|
|
|
|
|
device_lock(&dev->dev);
|
|
|
|
if (!dev->driver ||
|
|
|
|
!dev->driver->err_handler ||
|
|
|
|
!dev->driver->err_handler->slot_reset)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
err_handler = dev->driver->err_handler;
|
|
|
|
vote = err_handler->slot_reset(dev);
|
|
|
|
result_data->result = merge_result(result_data->result, vote);
|
|
|
|
out:
|
|
|
|
device_unlock(&dev->dev);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int report_resume(struct pci_dev *dev, void *data)
|
|
|
|
{
|
|
|
|
const struct pci_error_handlers *err_handler;
|
|
|
|
|
|
|
|
device_lock(&dev->dev);
|
|
|
|
dev->error_state = pci_channel_io_normal;
|
|
|
|
|
|
|
|
if (!dev->driver ||
|
|
|
|
!dev->driver->err_handler ||
|
|
|
|
!dev->driver->err_handler->resume)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
err_handler = dev->driver->err_handler;
|
|
|
|
err_handler->resume(dev);
|
|
|
|
pci_uevent_ers(dev, PCI_ERS_RESULT_RECOVERED);
|
|
|
|
out:
|
|
|
|
device_unlock(&dev->dev);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* default_reset_link - default reset function
|
|
|
|
* @dev: pointer to pci_dev data structure
|
|
|
|
*
|
|
|
|
* Invoked when performing link reset on a Downstream Port or a
|
|
|
|
* Root Port with no aer driver.
|
|
|
|
*/
|
|
|
|
static pci_ers_result_t default_reset_link(struct pci_dev *dev)
|
|
|
|
{
|
2018-07-19 23:04:09 +00:00
|
|
|
int rc;
|
|
|
|
|
2018-09-20 16:27:11 +00:00
|
|
|
rc = pci_bus_error_reset(dev);
|
2018-05-17 21:44:15 +00:00
|
|
|
pci_printk(KERN_DEBUG, dev, "downstream link has been reset\n");
|
2018-07-19 23:04:09 +00:00
|
|
|
return rc ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
|
2018-05-17 21:44:15 +00:00
|
|
|
}
|
|
|
|
|
2018-05-17 21:44:19 +00:00
|
|
|
/*
 * reset_link - reset the link below a port
 * @dev: port at which the reset is initiated
 * @service: port service used to look up a service driver for @dev
 *
 * Picks the reset method in this order: the reset_link callback of the
 * service driver found by pcie_port_find_service(), then
 * default_reset_link() if @dev has a secondary link. If neither is
 * available, nothing is reset.
 *
 * Return: PCI_ERS_RESULT_RECOVERED when the chosen method reports
 * recovery; PCI_ERS_RESULT_DISCONNECT when no method is available or the
 * chosen method reports anything else.
 */
static pci_ers_result_t reset_link(struct pci_dev *dev, u32 service)
{
	struct pcie_port_service_driver *svc;
	pci_ers_result_t status;

	svc = pcie_port_find_service(dev, service);
	if (svc && svc->reset_link) {
		status = svc->reset_link(dev);
	} else if (dev->has_secondary_link) {
		status = default_reset_link(dev);
	} else {
		pci_printk(KERN_DEBUG, dev, "no link-reset support at upstream device %s\n",
			   pci_name(dev));
		return PCI_ERS_RESULT_DISCONNECT;
	}

	if (status == PCI_ERS_RESULT_RECOVERED)
		return status;

	pci_printk(KERN_DEBUG, dev, "link reset at upstream device %s failed\n",
		   pci_name(dev));
	return PCI_ERS_RESULT_DISCONNECT;
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* broadcast_error_message - handle message broadcast to downstream drivers
|
|
|
|
* @dev: pointer to from where in a hierarchy message is broadcasted down
|
|
|
|
* @state: error state
|
|
|
|
* @error_mesg: message to print
|
|
|
|
* @cb: callback to be broadcasted
|
|
|
|
*
|
|
|
|
* Invoked during error recovery process. Once being invoked, the content
|
|
|
|
* of error severity will be broadcasted to all downstream drivers in a
|
|
|
|
* hierarchy in question.
|
|
|
|
*/
|
|
|
|
static pci_ers_result_t broadcast_error_message(struct pci_dev *dev,
|
|
|
|
enum pci_channel_state state,
|
|
|
|
char *error_mesg,
|
|
|
|
int (*cb)(struct pci_dev *, void *))
|
|
|
|
{
|
|
|
|
struct aer_broadcast_data result_data;
|
|
|
|
|
|
|
|
pci_printk(KERN_DEBUG, dev, "broadcast %s message\n", error_mesg);
|
|
|
|
result_data.state = state;
|
|
|
|
if (cb == report_error_detected)
|
|
|
|
result_data.result = PCI_ERS_RESULT_CAN_RECOVER;
|
|
|
|
else
|
|
|
|
result_data.result = PCI_ERS_RESULT_RECOVERED;
|
|
|
|
|
PCI/ERR: Run error recovery callbacks for all affected devices
If an Endpoint reported an error with ERR_FATAL, we previously ran driver
error recovery callbacks only for the Endpoint's driver. But if we reset a
Link to recover from the error, all downstream components are affected,
including the Endpoint, any multi-function peers, and children of those
peers.
Initiate the Link reset from the deepest Downstream Port that is
reliable, and call the error recovery callbacks for all its children.
If a Downstream Port (including a Root Port) reports an error, we assume
the Port itself is reliable and we need to reset its downstream Link. In
all other cases (Switch Upstream Ports, Endpoints, Bridges, etc), we assume
the Link leading to the component needs to be reset, so we initiate the
reset at the parent Downstream Port.
This allows two other clean-ups. First, we currently only use a Link
reset, which can only be initiated using a Downstream Port, so we can
remove checks for Endpoints. Second, the Downstream Port where we initiate
the Link reset is reliable (unlike components downstream from it), so the
special cases for error detect and resume are no longer necessary.
Signed-off-by: Keith Busch <keith.busch@intel.com>
[bhelgaas: changelog]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Sinan Kaya <okaya@kernel.org>
2018-09-20 16:27:13 +00:00
|
|
|
pci_walk_bus(dev->subordinate, cb, &result_data);
|
2018-05-17 21:44:15 +00:00
|
|
|
return result_data.result;
|
|
|
|
}
|
|
|
|
|
2018-09-20 16:27:12 +00:00
|
|
|
void pcie_do_recovery(struct pci_dev *dev, enum pci_channel_state state,
|
|
|
|
u32 service)
|
2018-05-17 21:44:15 +00:00
|
|
|
{
|
|
|
|
pci_ers_result_t status;
|
|
|
|
|
PCI/ERR: Run error recovery callbacks for all affected devices
If an Endpoint reported an error with ERR_FATAL, we previously ran driver
error recovery callbacks only for the Endpoint's driver. But if we reset a
Link to recover from the error, all downstream components are affected,
including the Endpoint, any multi-function peers, and children of those
peers.
Initiate the Link reset from the deepest Downstream Port that is
reliable, and call the error recovery callbacks for all its children.
If a Downstream Port (including a Root Port) reports an error, we assume
the Port itself is reliable and we need to reset its downstream Link. In
all other cases (Switch Upstream Ports, Endpoints, Bridges, etc), we assume
the Link leading to the component needs to be reset, so we initiate the
reset at the parent Downstream Port.
This allows two other clean-ups. First, we currently only use a Link
reset, which can only be initiated using a Downstream Port, so we can
remove checks for Endpoints. Second, the Downstream Port where we initiate
the Link reset is reliable (unlike components downstream from it), so the
special cases for error detect and resume are no longer necessary.
Signed-off-by: Keith Busch <keith.busch@intel.com>
[bhelgaas: changelog]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Sinan Kaya <okaya@kernel.org>
2018-09-20 16:27:13 +00:00
|
|
|
/*
|
|
|
|
* Error recovery runs on all subordinates of the first downstream port.
|
|
|
|
* If the downstream port detected the error, it is cleared at the end.
|
|
|
|
*/
|
|
|
|
if (!(pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT ||
|
|
|
|
pci_pcie_type(dev) == PCI_EXP_TYPE_DOWNSTREAM))
|
|
|
|
dev = dev->bus->self;
|
|
|
|
|
2018-05-17 21:44:15 +00:00
|
|
|
status = broadcast_error_message(dev,
|
|
|
|
state,
|
|
|
|
"error_detected",
|
|
|
|
report_error_detected);
|
|
|
|
|
2018-09-20 16:27:12 +00:00
|
|
|
if (state == pci_channel_io_frozen &&
|
|
|
|
reset_link(dev, service) != PCI_ERS_RESULT_RECOVERED)
|
|
|
|
goto failed;
|
|
|
|
|
2018-05-17 21:44:15 +00:00
|
|
|
if (status == PCI_ERS_RESULT_CAN_RECOVER)
|
|
|
|
status = broadcast_error_message(dev,
|
|
|
|
state,
|
|
|
|
"mmio_enabled",
|
|
|
|
report_mmio_enabled);
|
|
|
|
|
|
|
|
if (status == PCI_ERS_RESULT_NEED_RESET) {
|
|
|
|
/*
|
|
|
|
* TODO: Should call platform-specific
|
|
|
|
* functions to reset slot before calling
|
|
|
|
* drivers' slot_reset callbacks?
|
|
|
|
*/
|
|
|
|
status = broadcast_error_message(dev,
|
|
|
|
state,
|
|
|
|
"slot_reset",
|
|
|
|
report_slot_reset);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (status != PCI_ERS_RESULT_RECOVERED)
|
|
|
|
goto failed;
|
|
|
|
|
|
|
|
broadcast_error_message(dev,
|
|
|
|
state,
|
|
|
|
"resume",
|
|
|
|
report_resume);
|
|
|
|
|
PCI/ERR: Run error recovery callbacks for all affected devices
If an Endpoint reported an error with ERR_FATAL, we previously ran driver
error recovery callbacks only for the Endpoint's driver. But if we reset a
Link to recover from the error, all downstream components are affected,
including the Endpoint, any multi-function peers, and children of those
peers.
Initiate the Link reset from the deepest Downstream Port that is
reliable, and call the error recovery callbacks for all its children.
If a Downstream Port (including a Root Port) reports an error, we assume
the Port itself is reliable and we need to reset its downstream Link. In
all other cases (Switch Upstream Ports, Endpoints, Bridges, etc), we assume
the Link leading to the component needs to be reset, so we initiate the
reset at the parent Downstream Port.
This allows two other clean-ups. First, we currently only use a Link
reset, which can only be initiated using a Downstream Port, so we can
remove checks for Endpoints. Second, the Downstream Port where we initiate
the Link reset is reliable (unlike components downstream from it), so the
special cases for error detect and resume are no longer necessary.
Signed-off-by: Keith Busch <keith.busch@intel.com>
[bhelgaas: changelog]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Sinan Kaya <okaya@kernel.org>
2018-09-20 16:27:13 +00:00
|
|
|
pci_aer_clear_device_status(dev);
|
|
|
|
pci_cleanup_aer_uncorrect_error_status(dev);
|
2018-05-17 21:44:15 +00:00
|
|
|
pci_info(dev, "AER: Device recovery successful\n");
|
|
|
|
return;
|
|
|
|
|
|
|
|
failed:
|
|
|
|
pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
|
|
|
|
|
|
|
|
/* TODO: Should kernel panic here? */
|
|
|
|
pci_info(dev, "AER: Device recovery failed\n");
|
|
|
|
}
|