From 3f25bb4b7f7718d391321608f947840a79934568 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Wed, 21 Jun 2017 10:35:09 +0300 Subject: [PATCH 01/47] iwlwifi: mvm: fix TCP CSUM offload with WEP and A000 series When we enabled TCP checksum offload, we need to tell the firmware where the IP header starts. If we have an IV, then we need to adapt that value since the IV is placed before the SNAP header. This is true only for cases where the driver adds the IV, not the WEP case in which the IV is added by the firmware itself. On A000 devices series, the IV is always added by the device. Fix this. Fixes: 5e6a98dc4863 ("iwlwifi: mvm: enable TCP/UDP checksum support for 9000 family") Signed-off-by: Emmanuel Grumbach Signed-off-by: Luca Coelho --- drivers/net/wireless/intel/iwlwifi/mvm/tx.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c index 60360ed73f26..e5d3eba2e82a 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c @@ -185,8 +185,14 @@ static u16 iwl_mvm_tx_csum(struct iwl_mvm *mvm, struct sk_buff *skb, else udp_hdr(skb)->check = 0; - /* mac header len should include IV, size is in words */ - if (info->control.hw_key) + /* + * mac header len should include IV, size is in words unless + * the IV is added by the firmware like in WEP. + * In new Tx API, the IV is always added by the firmware. 
+ */ + if (!iwl_mvm_has_new_tx_api(mvm) && info->control.hw_key && + info->control.hw_key->cipher != WLAN_CIPHER_SUITE_WEP40 && + info->control.hw_key->cipher != WLAN_CIPHER_SUITE_WEP104) mh_len += info->control.hw_key->iv_len; mh_len /= 2; offload_assist |= mh_len << TX_CMD_OFFLD_MH_SIZE; From 58877d7428b0747134cb65096d51dfabdc62000d Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Thu, 29 Jun 2017 16:18:21 +0300 Subject: [PATCH 02/47] iwlwifi: add TLV for MLME offload firmware capability The firmware now adds a new DWORD for the MLME offload's capability even on firmware versions that don't support it. Add the TLV bit to avoid getting the print: capa flags index 3 larger than supported by driver. This fixes the bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=196195 Signed-off-by: Emmanuel Grumbach Signed-off-by: Luca Coelho --- drivers/net/wireless/intel/iwlwifi/fw/file.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/wireless/intel/iwlwifi/fw/file.h b/drivers/net/wireless/intel/iwlwifi/fw/file.h index 0fa8c473f1e2..c73a6438ce8f 100644 --- a/drivers/net/wireless/intel/iwlwifi/fw/file.h +++ b/drivers/net/wireless/intel/iwlwifi/fw/file.h @@ -328,6 +328,7 @@ typedef unsigned int __bitwise iwl_ucode_tlv_capa_t; * @IWL_UCODE_TLV_CAPA_TX_POWER_ACK: reduced TX power API has larger * command size (command version 4) that supports toggling ACK TX * power reduction. 
+ * @IWL_UCODE_TLV_CAPA_MLME_OFFLOAD: supports MLME offload * * @NUM_IWL_UCODE_TLV_CAPA: number of bits used */ @@ -373,6 +374,7 @@ enum iwl_ucode_tlv_capa { IWL_UCODE_TLV_CAPA_EXTEND_SHARED_MEM_CFG = (__force iwl_ucode_tlv_capa_t)80, IWL_UCODE_TLV_CAPA_LQM_SUPPORT = (__force iwl_ucode_tlv_capa_t)81, IWL_UCODE_TLV_CAPA_TX_POWER_ACK = (__force iwl_ucode_tlv_capa_t)84, + IWL_UCODE_TLV_CAPA_MLME_OFFLOAD = (__force iwl_ucode_tlv_capa_t)96, NUM_IWL_UCODE_TLV_CAPA #ifdef __CHECKER__ From 92b0f7b26b313b23cc9bef0bd406607f4566c0c0 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Mon, 3 Jul 2017 16:25:33 +0300 Subject: [PATCH 03/47] iwlwifi: split the regulatory rules when the bandwidth flags require it When we create a regulatory domain out of an MCC notification, we need to make sure that all the channels in the rule have the exact same properties. The current code mixes channel 36 and 40 although 36 can be a control channel with HT40+ (36, 40) whereas 40 can't be a control channel with HT40+ since (40, 44) is invalid. Because of that, cfg80211 would allow to connect in 40MHz to APs that are configured to channel 40 HT40+ and that made our firmware assert. Fix this by checking the bandwidth flags before taking the decision if the rule should be split. This fixes https://bugzilla.kernel.org/show_bug.cgi?id=195299 partly. 
Fixes: af45a9003f1f ("iwlwifi: create regdomain from mcc_update_cmd response") Signed-off-by: Emmanuel Grumbach Signed-off-by: Luca Coelho --- .../wireless/intel/iwlwifi/iwl-nvm-parse.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c index 5c08f4d40f6a..3ee6767392b6 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c +++ b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c @@ -785,7 +785,8 @@ iwl_parse_nvm_mcc_info(struct device *dev, const struct iwl_cfg *cfg, int num_of_ch, __le32 *channels, u16 fw_mcc) { int ch_idx; - u16 ch_flags, prev_ch_flags = 0; + u16 ch_flags; + u32 reg_rule_flags, prev_reg_rule_flags = 0; const u8 *nvm_chan = cfg->ext_nvm ? iwl_ext_nvm_channels : iwl_nvm_channels; struct ieee80211_regdomain *regd; @@ -834,8 +835,11 @@ iwl_parse_nvm_mcc_info(struct device *dev, const struct iwl_cfg *cfg, continue; } + reg_rule_flags = iwl_nvm_get_regdom_bw_flags(nvm_chan, ch_idx, + ch_flags, cfg); + /* we can't continue the same rule */ - if (ch_idx == 0 || prev_ch_flags != ch_flags || + if (ch_idx == 0 || prev_reg_rule_flags != reg_rule_flags || center_freq - prev_center_freq > 20) { valid_rules++; new_rule = true; @@ -854,18 +858,17 @@ iwl_parse_nvm_mcc_info(struct device *dev, const struct iwl_cfg *cfg, rule->power_rule.max_eirp = DBM_TO_MBM(IWL_DEFAULT_MAX_TX_POWER); - rule->flags = iwl_nvm_get_regdom_bw_flags(nvm_chan, ch_idx, - ch_flags, cfg); + rule->flags = reg_rule_flags; /* rely on auto-calculation to merge BW of contiguous chans */ rule->flags |= NL80211_RRF_AUTO_BW; rule->freq_range.max_bandwidth_khz = 0; - prev_ch_flags = ch_flags; prev_center_freq = center_freq; + prev_reg_rule_flags = reg_rule_flags; IWL_DEBUG_DEV(dev, IWL_DL_LAR, - "Ch. %d [%sGHz] %s%s%s%s%s%s%s%s%s(0x%02x): Ad-Hoc %ssupported\n", + "Ch. 
%d [%sGHz] %s%s%s%s%s%s%s%s%s(0x%02x) reg_flags 0x%x: %s\n", center_freq, band == NL80211_BAND_5GHZ ? "5.2" : "2.4", CHECK_AND_PRINT_I(VALID), @@ -877,10 +880,10 @@ iwl_parse_nvm_mcc_info(struct device *dev, const struct iwl_cfg *cfg, CHECK_AND_PRINT_I(160MHZ), CHECK_AND_PRINT_I(INDOOR_ONLY), CHECK_AND_PRINT_I(GO_CONCURRENT), - ch_flags, + ch_flags, reg_rule_flags, ((ch_flags & NVM_CHANNEL_ACTIVE) && !(ch_flags & NVM_CHANNEL_RADAR)) - ? "" : "not "); + ? "Ad-Hoc" : ""); } regd->n_reg_rules = valid_rules; From 9465c3f8ba67cff697c0529de5a03cc5e1509d41 Mon Sep 17 00:00:00 2001 From: Gregory Greenman Date: Thu, 6 Jul 2017 05:07:33 +0300 Subject: [PATCH 04/47] iwlwifi: mvm: set A-MPDU bit upon empty BA notification from FW The bit was set only if there was at least one reclaimed frame in an aggregation. It's important to set it also in the case that the whole A-MPDU was lost, otherwise rate scaling statistics will not be updated correctly. Thus, set it always in ba notification handler. This fixes a throughput degradation of about 20% in certain scenarios with multiple streams on 11ac. 
Signed-off-by: Gregory Greenman Signed-off-by: Luca Coelho --- drivers/net/wireless/intel/iwlwifi/mvm/tx.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c index e5d3eba2e82a..5fcc9dd6be56 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c @@ -1821,6 +1821,8 @@ void iwl_mvm_rx_ba_notif(struct iwl_mvm *mvm, struct iwl_rx_cmd_buffer *rxb) struct iwl_mvm_tid_data *tid_data; struct iwl_mvm_sta *mvmsta; + ba_info.flags = IEEE80211_TX_STAT_AMPDU; + if (iwl_mvm_has_new_tx_api(mvm)) { struct iwl_mvm_compressed_ba_notif *ba_res = (void *)pkt->data; From 87f55616f81bf6c82f0e94cf4661151399d2a7b6 Mon Sep 17 00:00:00 2001 From: Gregory Greenman Date: Thu, 6 Jul 2017 05:27:55 +0300 Subject: [PATCH 05/47] iwlwifi: mvm: rs: fix TLC statistics collection Statistics should be collected according to the actual rate a frame/aggregation was transmitted and not according to the initial rate from the last LQ command (these rates are different if the frames were retransmitted at a lower rate from the rate scale table). This is needed to remove throughput degradation. Signed-off-by: Gregory Greenman Signed-off-by: Luca Coelho --- drivers/net/wireless/intel/iwlwifi/mvm/rs.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rs.c b/drivers/net/wireless/intel/iwlwifi/mvm/rs.c index 65beca3a457a..8999a1199d60 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/rs.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/rs.c @@ -1291,7 +1291,7 @@ void iwl_mvm_rs_tx_status(struct iwl_mvm *mvm, struct ieee80211_sta *sta, * first index into rate scale table. 
*/ if (info->flags & IEEE80211_TX_STAT_AMPDU) { - rs_collect_tpc_data(mvm, lq_sta, curr_tbl, lq_rate.index, + rs_collect_tpc_data(mvm, lq_sta, curr_tbl, tx_resp_rate.index, info->status.ampdu_len, info->status.ampdu_ack_len, reduced_txp); @@ -1312,7 +1312,7 @@ void iwl_mvm_rs_tx_status(struct iwl_mvm *mvm, struct ieee80211_sta *sta, if (info->status.ampdu_ack_len == 0) info->status.ampdu_len = 1; - rs_collect_tlc_data(mvm, lq_sta, curr_tbl, lq_rate.index, + rs_collect_tlc_data(mvm, lq_sta, curr_tbl, tx_resp_rate.index, info->status.ampdu_len, info->status.ampdu_ack_len); @@ -1348,11 +1348,11 @@ void iwl_mvm_rs_tx_status(struct iwl_mvm *mvm, struct ieee80211_sta *sta, continue; rs_collect_tpc_data(mvm, lq_sta, tmp_tbl, - lq_rate.index, 1, + tx_resp_rate.index, 1, i < retries ? 0 : legacy_success, reduced_txp); rs_collect_tlc_data(mvm, lq_sta, tmp_tbl, - lq_rate.index, 1, + tx_resp_rate.index, 1, i < retries ? 0 : legacy_success); } From e9fb92e13d5e743a6ff13d3206c0f9e5e8cbc1f4 Mon Sep 17 00:00:00 2001 From: Haim Dreyfuss Date: Sun, 9 Jul 2017 15:51:44 +0300 Subject: [PATCH 06/47] iwlwifi: fix fw_pre_next_step to apply also for C step C step NICs should use the latest FW (currently B step). Correct the condition to make C step NICs advanced its default FW name to the latest one. Also rename _next_ to b_or_c to avoid confusion. 
Fixes: 5da083d1922c ("iwlwifi: add support for 9000 HW B-step NICs") Signed-off-by: Haim Dreyfuss Signed-off-by: Luca Coelho --- drivers/net/wireless/intel/iwlwifi/cfg/9000.c | 14 +++++++------- drivers/net/wireless/intel/iwlwifi/iwl-config.h | 8 ++++---- drivers/net/wireless/intel/iwlwifi/iwl-drv.c | 5 +++-- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/cfg/9000.c b/drivers/net/wireless/intel/iwlwifi/cfg/9000.c index b4ecd1fe1374..97208ce19f92 100644 --- a/drivers/net/wireless/intel/iwlwifi/cfg/9000.c +++ b/drivers/net/wireless/intel/iwlwifi/cfg/9000.c @@ -154,7 +154,7 @@ static const struct iwl_tt_params iwl9000_tt_params = { const struct iwl_cfg iwl9160_2ac_cfg = { .name = "Intel(R) Dual Band Wireless AC 9160", .fw_name_pre = IWL9260A_FW_PRE, - .fw_name_pre_next_step = IWL9260B_FW_PRE, + .fw_name_pre_b_or_c_step = IWL9260B_FW_PRE, IWL_DEVICE_9000, .ht_params = &iwl9000_ht_params, .nvm_ver = IWL9000_NVM_VERSION, @@ -165,7 +165,7 @@ const struct iwl_cfg iwl9160_2ac_cfg = { const struct iwl_cfg iwl9260_2ac_cfg = { .name = "Intel(R) Dual Band Wireless AC 9260", .fw_name_pre = IWL9260A_FW_PRE, - .fw_name_pre_next_step = IWL9260B_FW_PRE, + .fw_name_pre_b_or_c_step = IWL9260B_FW_PRE, IWL_DEVICE_9000, .ht_params = &iwl9000_ht_params, .nvm_ver = IWL9000_NVM_VERSION, @@ -176,7 +176,7 @@ const struct iwl_cfg iwl9260_2ac_cfg = { const struct iwl_cfg iwl9270_2ac_cfg = { .name = "Intel(R) Dual Band Wireless AC 9270", .fw_name_pre = IWL9260A_FW_PRE, - .fw_name_pre_next_step = IWL9260B_FW_PRE, + .fw_name_pre_b_or_c_step = IWL9260B_FW_PRE, IWL_DEVICE_9000, .ht_params = &iwl9000_ht_params, .nvm_ver = IWL9000_NVM_VERSION, @@ -186,8 +186,8 @@ const struct iwl_cfg iwl9270_2ac_cfg = { const struct iwl_cfg iwl9460_2ac_cfg = { .name = "Intel(R) Dual Band Wireless AC 9460", - .fw_name_pre = IWL9000_FW_PRE, - .fw_name_pre_rf_next_step = IWL9000RFB_FW_PRE, + .fw_name_pre = IWL9260A_FW_PRE, + .fw_name_pre_b_or_c_step = IWL9260B_FW_PRE, 
IWL_DEVICE_9000, .ht_params = &iwl9000_ht_params, .nvm_ver = IWL9000_NVM_VERSION, @@ -198,8 +198,8 @@ const struct iwl_cfg iwl9460_2ac_cfg = { const struct iwl_cfg iwl9560_2ac_cfg = { .name = "Intel(R) Dual Band Wireless AC 9560", - .fw_name_pre = IWL9000_FW_PRE, - .fw_name_pre_rf_next_step = IWL9000RFB_FW_PRE, + .fw_name_pre = IWL9260A_FW_PRE, + .fw_name_pre_b_or_c_step = IWL9260B_FW_PRE, IWL_DEVICE_9000, .ht_params = &iwl9000_ht_params, .nvm_ver = IWL9000_NVM_VERSION, diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-config.h b/drivers/net/wireless/intel/iwlwifi/iwl-config.h index c52623cb7c2a..d19c74827fbb 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-config.h +++ b/drivers/net/wireless/intel/iwlwifi/iwl-config.h @@ -276,10 +276,10 @@ struct iwl_pwr_tx_backoff { * @fw_name_pre: Firmware filename prefix. The api version and extension * (.ucode) will be added to filename before loading from disk. The * filename is constructed as fw_name_pre.ucode. - * @fw_name_pre_next_step: same as @fw_name_pre, only for next step + * @fw_name_pre_b_or_c_step: same as @fw_name_pre, only for b or c steps * (if supported) - * @fw_name_pre_rf_next_step: same as @fw_name_pre_next_step, only for rf next - * step. Supported only in integrated solutions. + * @fw_name_pre_rf_next_step: same as @fw_name_pre_b_or_c_step, only for rf + * next step. Supported only in integrated solutions. * @ucode_api_max: Highest version of uCode API supported by driver. * @ucode_api_min: Lowest version of uCode API supported by driver. 
* @max_inst_size: The maximal length of the fw inst section @@ -330,7 +330,7 @@ struct iwl_cfg { /* params specific to an individual device within a device family */ const char *name; const char *fw_name_pre; - const char *fw_name_pre_next_step; + const char *fw_name_pre_b_or_c_step; const char *fw_name_pre_rf_next_step; /* params not likely to change within a device family */ const struct iwl_base_params *base_params; diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-drv.c b/drivers/net/wireless/intel/iwlwifi/iwl-drv.c index 6fdb5921e17f..4e0f86fe0a6f 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-drv.c +++ b/drivers/net/wireless/intel/iwlwifi/iwl-drv.c @@ -216,8 +216,9 @@ static int iwl_request_firmware(struct iwl_drv *drv, bool first) const char *fw_pre_name; if (drv->trans->cfg->device_family == IWL_DEVICE_FAMILY_9000 && - CSR_HW_REV_STEP(drv->trans->hw_rev) == SILICON_B_STEP) - fw_pre_name = cfg->fw_name_pre_next_step; + (CSR_HW_REV_STEP(drv->trans->hw_rev) == SILICON_B_STEP || + CSR_HW_REV_STEP(drv->trans->hw_rev) == SILICON_C_STEP)) + fw_pre_name = cfg->fw_name_pre_b_or_c_step; else if (drv->trans->cfg->integrated && CSR_HW_RFID_STEP(drv->trans->hw_rf_id) == SILICON_B_STEP && cfg->fw_name_pre_rf_next_step) From 8addabf8e6e299f790038fdc92ddceaaf76adab8 Mon Sep 17 00:00:00 2001 From: Naftali Goldstein Date: Thu, 27 Jul 2017 04:53:55 +0300 Subject: [PATCH 07/47] iwlwifi: mvm: set the RTS_MIMO_PROT bit in flag mask when sending sta to fw Set the STA_FLG_RTS_MIMO_PROT bit in station_flags_msk of the add sta command, so that when smps mode changes, the FW will know about it. In particular, in AP mode, clients are added upon receival of an auth request, at which point there's no knowledge of the client's smps mode. When the assoc request arrives, the add_sta command is resent to modify the station parameters. 
At this point the driver knows the smps mode, but since the corresponding bit in the mask is not set, the fw doesn't update this field so there's no rts protection for mimo. Fixes: 5bc5aaad407c ("iwlwifi: mvm: set up initial SMPS/NSS station info") Signed-off-by: Naftali Goldstein Signed-off-by: Luca Coelho --- drivers/net/wireless/intel/iwlwifi/mvm/sta.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c index ab66b4394dfc..dcaef7c043ac 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c @@ -121,7 +121,8 @@ int iwl_mvm_sta_send_to_fw(struct iwl_mvm *mvm, struct ieee80211_sta *sta, .mac_id_n_color = cpu_to_le32(mvm_sta->mac_id_n_color), .add_modify = update ? 1 : 0, .station_flags_msk = cpu_to_le32(STA_FLG_FAT_EN_MSK | - STA_FLG_MIMO_EN_MSK), + STA_FLG_MIMO_EN_MSK | + STA_FLG_RTS_MIMO_PROT), .tid_disable_tx = cpu_to_le16(mvm_sta->tid_disable_agg), }; int ret; From 558f479f687aca6a336e13309424a7c3afd32721 Mon Sep 17 00:00:00 2001 From: Tzipi Peres Date: Sun, 30 Jul 2017 13:29:30 +0300 Subject: [PATCH 08/47] iwlwifi: add the new 9000 series PCI IDs Add two PCI IDs for the 9160 series. Add five PCI IDs for the 9260 series. Add one PCI ID for the 9270 series. Add seven PCI IDs for the 9460 series. Add five PCI IDs for the 9560 series. 
Signed-off-by: Tzipi Peres Signed-off-by: Luca Coelho --- drivers/net/wireless/intel/iwlwifi/pcie/drv.c | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c index f16c1bb9bf94..84f4ba01e14f 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c @@ -510,9 +510,17 @@ static const struct pci_device_id iwl_hw_card_ids[] = { /* 9000 Series */ {IWL_PCI_DEVICE(0x271B, 0x0010, iwl9160_2ac_cfg)}, + {IWL_PCI_DEVICE(0x271B, 0x0014, iwl9160_2ac_cfg)}, + {IWL_PCI_DEVICE(0x271B, 0x0210, iwl9160_2ac_cfg)}, {IWL_PCI_DEVICE(0x2526, 0x0000, iwl9260_2ac_cfg)}, {IWL_PCI_DEVICE(0x2526, 0x0010, iwl9260_2ac_cfg)}, + {IWL_PCI_DEVICE(0x2526, 0x0014, iwl9260_2ac_cfg)}, + {IWL_PCI_DEVICE(0x2526, 0xA014, iwl9260_2ac_cfg)}, + {IWL_PCI_DEVICE(0x2526, 0x4010, iwl9260_2ac_cfg)}, + {IWL_PCI_DEVICE(0x2526, 0x0210, iwl9260_2ac_cfg)}, + {IWL_PCI_DEVICE(0x2526, 0x0214, iwl9260_2ac_cfg)}, {IWL_PCI_DEVICE(0x2526, 0x1410, iwl9270_2ac_cfg)}, + {IWL_PCI_DEVICE(0x2526, 0x1610, iwl9270_2ac_cfg)}, {IWL_PCI_DEVICE(0x9DF0, 0x0A10, iwl9460_2ac_cfg)}, {IWL_PCI_DEVICE(0x9DF0, 0x0010, iwl9460_2ac_cfg)}, {IWL_PCI_DEVICE(0x9DF0, 0x0210, iwl9460_2ac_cfg)}, @@ -527,10 +535,22 @@ static const struct pci_device_id iwl_hw_card_ids[] = { {IWL_PCI_DEVICE(0x9DF0, 0x2A10, iwl9460_2ac_cfg)}, {IWL_PCI_DEVICE(0x30DC, 0x0060, iwl9460_2ac_cfg)}, {IWL_PCI_DEVICE(0x2526, 0x0060, iwl9460_2ac_cfg)}, + {IWL_PCI_DEVICE(0x2526, 0x0260, iwl9460_2ac_cfg)}, + {IWL_PCI_DEVICE(0x2526, 0x0064, iwl9460_2ac_cfg)}, + {IWL_PCI_DEVICE(0x2526, 0x00A4, iwl9460_2ac_cfg)}, + {IWL_PCI_DEVICE(0x2526, 0x40A4, iwl9460_2ac_cfg)}, + {IWL_PCI_DEVICE(0x2526, 0x02A4, iwl9460_2ac_cfg)}, + {IWL_PCI_DEVICE(0x2526, 0x00A0, iwl9460_2ac_cfg)}, + {IWL_PCI_DEVICE(0x2526, 0x02A0, iwl9460_2ac_cfg)}, {IWL_PCI_DEVICE(0x9DF0, 0x0060, iwl9460_2ac_cfg)}, {IWL_PCI_DEVICE(0xA370, 0x0060, iwl9460_2ac_cfg)}, 
{IWL_PCI_DEVICE(0x31DC, 0x0060, iwl9460_2ac_cfg)}, {IWL_PCI_DEVICE(0x2526, 0x0030, iwl9560_2ac_cfg)}, + {IWL_PCI_DEVICE(0x2526, 0x4030, iwl9560_2ac_cfg)}, + {IWL_PCI_DEVICE(0x2526, 0x0230, iwl9560_2ac_cfg)}, + {IWL_PCI_DEVICE(0x2526, 0x0234, iwl9560_2ac_cfg)}, + {IWL_PCI_DEVICE(0x2526, 0x0238, iwl9560_2ac_cfg)}, + {IWL_PCI_DEVICE(0x2526, 0x023C, iwl9560_2ac_cfg)}, {IWL_PCI_DEVICE(0x9DF0, 0x0030, iwl9560_2ac_cfg)}, {IWL_PCI_DEVICE(0xA370, 0x0030, iwl9560_2ac_cfg)}, {IWL_PCI_DEVICE(0x31DC, 0x0030, iwl9560_2ac_cfg)}, From aae9d563230f974f2daa7135f911f021b2bba9e6 Mon Sep 17 00:00:00 2001 From: Christophe Jaillet Date: Fri, 14 Jul 2017 12:06:59 +0200 Subject: [PATCH 09/47] iwlwifi: mvm: Fix a memory leak in an error handling path in 'iwl_mvm_sar_get_wgds_table()' We should free 'wgds.pointer' here as done a few lines above in another error handling path. It was allocated within 'acpi_evaluate_object()'. Fixes: c52030a01ccc ("iwlwifi: mvm: add GEO_TX_POWER_LIMIT cmd for geographic tx power table") Signed-off-by: Christophe JAILLET Signed-off-by: Luca Coelho --- drivers/net/wireless/intel/iwlwifi/mvm/fw.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/fw.c b/drivers/net/wireless/intel/iwlwifi/mvm/fw.c index 79e7a7a285dc..82863e9273eb 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/fw.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/fw.c @@ -1275,8 +1275,10 @@ static int iwl_mvm_sar_get_wgds_table(struct iwl_mvm *mvm) entry = &wifi_pkg->package.elements[idx++]; if ((entry->type != ACPI_TYPE_INTEGER) || - (entry->integer.value > U8_MAX)) - return -EINVAL; + (entry->integer.value > U8_MAX)) { + ret = -EINVAL; + goto out_free; + } mvm->geo_profiles[i].values[j] = entry->integer.value; } From 7e39a00d593133ca8fcd3eef0409685e7c895ee6 Mon Sep 17 00:00:00 2001 From: Avraham Stern Date: Wed, 26 Jul 2017 15:08:45 +0300 Subject: [PATCH 10/47] iwlwifi: mvm: start mac queues when deferred tx frames are purged In AP 
mode, if a station is removed just as it is adding a new stream, the queue in question will remain stopped and no more TX will happen in this queue, leading to connection failures and other problems. This is because under DQA, when tx is deferred because a queue needs to be allocated, the mac queue for that TID is stopped until the new stream is added. If at this point the station that this stream belongs to is removed, all the deferred tx frames are purged, but the mac queue is not restarted. As a result, all following tx on this queue will not be transmitted. Fix this by starting the relevant mac queues when the deferred tx frames are purged. Fixes: 24afba7690e4 ("iwlwifi: mvm: support bss dynamic alloc/dealloc of queues") Signed-off-by: Avraham Stern Signed-off-by: Luca Coelho --- drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c index c7b1e58e3384..ce901be5fba8 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c @@ -2597,8 +2597,18 @@ static void iwl_mvm_purge_deferred_tx_frames(struct iwl_mvm *mvm, spin_lock_bh(&mvm_sta->lock); for (i = 0; i <= IWL_MAX_TID_COUNT; i++) { tid_data = &mvm_sta->tid_data[i]; - while ((skb = __skb_dequeue(&tid_data->deferred_tx_frames))) + + while ((skb = __skb_dequeue(&tid_data->deferred_tx_frames))) { + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + + /* + * The first deferred frame should've stopped the MAC + * queues, so we should never get a second deferred + * frame for the RA/TID. 
+ */ + iwl_mvm_start_mac_queues(mvm, info->hw_queue); ieee80211_free_txskb(mvm->hw, skb); + } } spin_unlock_bh(&mvm_sta->lock); } From a600852a9d00be08c539307a42729fd46b0a654e Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Thu, 27 Jul 2017 15:34:12 +0300 Subject: [PATCH 11/47] iwlwifi: mvm: don't WARN when a legit race happens in A-MPDU When we start an Rx A-MPDU session, we first get the AddBA request, then we send an ADD_STA command to the firmware that will reply with a BAID which is a hardware resource that tracks the BA session. This BAID will appear on each and every frame that we get from the firmware until the A-MPDU session is torn down. In the Rx path, we look at this BAID to manage the reordering buffer. This flow is inherently racy since the hardware will start to put the BAID in the frames it receives even if the firmware hasn't sent the response to the ADD_STA command. This basically means that the driver can get frames with a valid BAID that it doesn't know yet. When that happens, the driver used to WARN. Fix this by simply not WARNing in this case. When the driver learns about the BAID, it will initialise the relevant states and the next frame with a valid BAID will refresh them. 
Fixes: b915c10174fb ("iwlwifi: mvm: add reorder buffer per queue") Signed-off-by: Emmanuel Grumbach Signed-off-by: Luca Coelho --- drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c b/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c index f3e608196369..71c8b800ffa9 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c @@ -636,9 +636,9 @@ static bool iwl_mvm_reorder(struct iwl_mvm *mvm, baid_data = rcu_dereference(mvm->baid_map[baid]); if (!baid_data) { - WARN(!(reorder & IWL_RX_MPDU_REORDER_BA_OLD_SN), - "Received baid %d, but no data exists for this BAID\n", - baid); + IWL_DEBUG_RX(mvm, + "Got valid BAID but no baid allocated, bypass the re-ordering buffer. Baid %d reorder 0x%x\n", + baid, reorder); return false; } @@ -759,7 +759,9 @@ static void iwl_mvm_agg_rx_received(struct iwl_mvm *mvm, data = rcu_dereference(mvm->baid_map[baid]); if (!data) { - WARN_ON(!(reorder_data & IWL_RX_MPDU_REORDER_BA_OLD_SN)); + IWL_DEBUG_RX(mvm, + "Got valid BAID but no baid allocated, bypass the re-ordering buffer. Baid %d reorder 0x%x\n", + baid, reorder_data); goto out; } From 04c2cf34362f133be09878bd752f8b014318b59a Mon Sep 17 00:00:00 2001 From: Naftali Goldstein Date: Tue, 11 Jul 2017 10:07:25 +0300 Subject: [PATCH 12/47] mac80211: add api to start ba session timer expired flow Some drivers handle rx buffer reordering internally (and by extension handle also the rx ba session timer internally), but do not offload the addba/delba negotiation. Add an api for these drivers to properly tear down the ba session, including sending a delba. 
Signed-off-by: Naftali Goldstein Signed-off-by: Luca Coelho --- include/net/mac80211.h | 15 +++++++++++++++ net/mac80211/agg-rx.c | 22 +++++++++++++++++++++- 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/include/net/mac80211.h b/include/net/mac80211.h index b2b5419467cc..f8149ca192b4 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -5499,6 +5499,21 @@ static inline void ieee80211_stop_rx_ba_session_offl(struct ieee80211_vif *vif, ieee80211_manage_rx_ba_offl(vif, addr, tid + IEEE80211_NUM_TIDS); } +/** + * ieee80211_rx_ba_timer_expired - stop a Rx BA session due to timeout + * + * Some device drivers do not offload AddBa/DelBa negotiation, but handle rx + * buffer reording internally, and therefore also handle the session timer. + * + * Trigger the timeout flow, which sends a DelBa. + * + * @vif: &struct ieee80211_vif pointer from the add_interface callback + * @addr: station mac address + * @tid: the rx tid + */ +void ieee80211_rx_ba_timer_expired(struct ieee80211_vif *vif, + const u8 *addr, unsigned int tid); + /* Rate control API */ /** diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 8708cbe8af5b..2b36eff5d97e 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -7,7 +7,7 @@ * Copyright 2006-2007 Jiri Benc * Copyright 2007, Michael Wu * Copyright 2007-2010, Intel Corporation - * Copyright(c) 2015 Intel Deutschland GmbH + * Copyright(c) 2015-2017 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -466,3 +466,23 @@ void ieee80211_manage_rx_ba_offl(struct ieee80211_vif *vif, rcu_read_unlock(); } EXPORT_SYMBOL(ieee80211_manage_rx_ba_offl); + +void ieee80211_rx_ba_timer_expired(struct ieee80211_vif *vif, + const u8 *addr, unsigned int tid) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + struct ieee80211_local *local = sdata->local; + struct sta_info *sta; + + 
rcu_read_lock(); + sta = sta_info_get_bss(sdata, addr); + if (!sta) + goto unlock; + + set_bit(tid, sta->ampdu_mlme.tid_rx_timer_expired); + ieee80211_queue_work(&local->hw, &sta->ampdu_mlme.work); + + unlock: + rcu_read_unlock(); +} +EXPORT_SYMBOL(ieee80211_rx_ba_timer_expired); From 20fc690f38d17b8f961101a477a9aa0841fb6e20 Mon Sep 17 00:00:00 2001 From: Naftali Goldstein Date: Tue, 11 Jul 2017 10:07:32 +0300 Subject: [PATCH 13/47] iwlwifi: mvm: send delba upon rx ba session timeout When an RX block-ack session times out, the firmware, which offloads RX reordering but not the BA session negotiation, stops the session but doesn't send a DELBA. This causes the session to remain active in the remote device, so no more BA sessions will be established, causing a severe throughput degradation due to the lack of aggregation. Use the new ieee80211_rx_ba_timer_expired API when the ba session timer expires, since this will tear down the ba session and also send a delba. The previous API used is intended for drivers that offload the addba/delba negotiation, but not the rx reordering, while our driver does the opposite. This patch depends on "mac80211: add api to start ba session timer expired flow". 
Signed-off-by: Naftali Goldstein Signed-off-by: Luca Coelho --- drivers/net/wireless/intel/iwlwifi/mvm/sta.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c index dcaef7c043ac..027ee5e72172 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c @@ -291,8 +291,8 @@ static void iwl_mvm_rx_agg_session_expired(unsigned long data) goto unlock; mvm_sta = iwl_mvm_sta_from_mac80211(sta); - ieee80211_stop_rx_ba_session_offl(mvm_sta->vif, - sta->addr, ba_data->tid); + ieee80211_rx_ba_timer_expired(mvm_sta->vif, + sta->addr, ba_data->tid); unlock: rcu_read_unlock(); } From e71cb9e00922902ba0519f37d09145f117dc02b3 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Wed, 9 Aug 2017 16:46:09 -0400 Subject: [PATCH 14/47] net: dsa: ksz: fix skb freeing The DSA layer frees the original skb when an xmit function returns NULL, meaning an error occurred. But if the tagging code copied the original skb, it is responsible of freeing the copy if an error occurs. The ksz tagging code currently has two issues: if skb_put_padto fails, the skb copy is not freed, and the original skb will be freed twice. To fix that, move skb_put_padto inside both branches of the skb_tailroom condition, before freeing the original skb, and free the copy on error. Signed-off-by: Vivien Didelot Reviewed-by: Woojung Huh Signed-off-by: David S. Miller --- net/dsa/tag_ksz.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index fab41de8e983..de66ca8e6201 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -42,6 +42,9 @@ static struct sk_buff *ksz_xmit(struct sk_buff *skb, struct net_device *dev) padlen = (skb->len >= ETH_ZLEN) ? 
0 : ETH_ZLEN - skb->len; if (skb_tailroom(skb) >= padlen + KSZ_INGRESS_TAG_LEN) { + if (skb_put_padto(skb, skb->len + padlen)) + return NULL; + nskb = skb; } else { nskb = alloc_skb(NET_IP_ALIGN + skb->len + @@ -56,13 +59,15 @@ static struct sk_buff *ksz_xmit(struct sk_buff *skb, struct net_device *dev) skb_set_transport_header(nskb, skb_transport_header(skb) - skb->head); skb_copy_and_csum_dev(skb, skb_put(nskb, skb->len)); + + if (skb_put_padto(nskb, nskb->len + padlen)) { + kfree_skb(nskb); + return NULL; + } + kfree_skb(skb); } - /* skb is freed when it fails */ - if (skb_put_padto(nskb, nskb->len + padlen)) - return NULL; - tag = skb_put(nskb, KSZ_INGRESS_TAG_LEN); tag[0] = 0; tag[1] = 1 << p->dp->index; /* destination port */ From ad729bc9acfb7c47112964b4877ef5404578ed13 Mon Sep 17 00:00:00 2001 From: Andreas Born Date: Thu, 10 Aug 2017 06:41:44 +0200 Subject: [PATCH 15/47] bonding: require speed/duplex only for 802.3ad, alb and tlb The patch c4adfc822bf5 ("bonding: make speed, duplex setting consistent with link state") puts the link state to down if bond_update_speed_duplex() cannot retrieve speed and duplex settings. Presumably the patch was written with 802.3ad mode in mind, which relies on link speed/duplex settings. For other modes like active-backup these settings are not required. Thus, only for these other modes, this patch reintroduces support for slaves that do not support reporting speed or duplex such as wireless devices. This fixes the regression reported in bug 196547 (https://bugzilla.kernel.org/show_bug.cgi?id=196547). Fixes: c4adfc822bf5 ("bonding: make speed, duplex setting consistent with link state") Signed-off-by: Andreas Born Acked-by: Mahesh Bandewar Signed-off-by: David S. 
Miller --- drivers/net/bonding/bond_main.c | 6 ++++-- include/net/bonding.h | 5 +++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 9bee6c1c70cc..85bb272d2a34 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1569,7 +1569,8 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev) new_slave->delay = 0; new_slave->link_failure_count = 0; - if (bond_update_speed_duplex(new_slave)) + if (bond_update_speed_duplex(new_slave) && + bond_needs_speed_duplex(bond)) new_slave->link = BOND_LINK_DOWN; new_slave->last_rx = jiffies - @@ -2140,7 +2141,8 @@ static void bond_miimon_commit(struct bonding *bond) continue; case BOND_LINK_UP: - if (bond_update_speed_duplex(slave)) { + if (bond_update_speed_duplex(slave) && + bond_needs_speed_duplex(bond)) { slave->link = BOND_LINK_DOWN; netdev_warn(bond->dev, "failed to get link speed/duplex for %s\n", diff --git a/include/net/bonding.h b/include/net/bonding.h index b00508d22e0a..b2e68657a216 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -277,6 +277,11 @@ static inline bool bond_is_lb(const struct bonding *bond) BOND_MODE(bond) == BOND_MODE_ALB; } +static inline bool bond_needs_speed_duplex(const struct bonding *bond) +{ + return BOND_MODE(bond) == BOND_MODE_8023AD || bond_is_lb(bond); +} + static inline bool bond_is_nondyn_tlb(const struct bonding *bond) { return (BOND_MODE(bond) == BOND_MODE_TLB) && From 8d55373875052b891ae72c9bcaf9c2d7178676c0 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 10 Aug 2017 12:31:40 +0300 Subject: [PATCH 16/47] net/sched/hfsc: allocate tcf block for hfsc root class Without this filters cannot be attached. Signed-off-by: Konstantin Khlebnikov Fixes: 6529eaba33f0 ("net: sched: introduce tcf block infractructure") Acked-by: Cong Wang Signed-off-by: David S. 
Miller --- net/sched/sch_hfsc.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index b52f74610dc7..3ad02bbe6903 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1428,6 +1428,10 @@ hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt) return err; q->eligible = RB_ROOT; + err = tcf_block_get(&q->root.block, &q->root.filter_list); + if (err) + goto err_tcf; + q->root.cl_common.classid = sch->handle; q->root.refcnt = 1; q->root.sched = q; @@ -1447,6 +1451,10 @@ hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt) qdisc_watchdog_init(&q->watchdog, sch); return 0; + +err_tcf: + qdisc_class_hash_destroy(&q->clhash); + return err; } static int From fbca164776e438b639af592c522b8b0506b54dcc Mon Sep 17 00:00:00 2001 From: Romain Perier Date: Thu, 10 Aug 2017 16:56:05 +0200 Subject: [PATCH 17/47] net: stmmac: Use the right logging function in stmmac_mdio_register Currently, the function stmmac_mdio_register() is only used by stmmac_dvr_probe() from stmmac_main.c, in order to register the MDIO bus and probe information about the PHY. As this function is called before calling register_netdev(), all messages logged from stmmac_mdio_register are prefixed by "(unnamed net_device)". The goal of netdev_info or netdev_err is to dump useful information about a net_device; when this data structure is only partially initialized, there is no point in using these functions. This commit fixes the issue by replacing all netdev_*() by the corresponding dev_*() function for logging. The last netdev_info is replaced by phy_attached_info(), as a valid phydev can be used at this point. Signed-off-by: Romain Perier Reviewed-by: Andrew Lunn Signed-off-by: David S. 
Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c index db157a47000c..72ec711fcba2 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c @@ -204,6 +204,7 @@ int stmmac_mdio_register(struct net_device *ndev) struct stmmac_priv *priv = netdev_priv(ndev); struct stmmac_mdio_bus_data *mdio_bus_data = priv->plat->mdio_bus_data; struct device_node *mdio_node = priv->plat->mdio_node; + struct device *dev = ndev->dev.parent; int addr, found; if (!mdio_bus_data) @@ -237,7 +238,7 @@ int stmmac_mdio_register(struct net_device *ndev) else err = mdiobus_register(new_bus); if (err != 0) { - netdev_err(ndev, "Cannot register the MDIO bus\n"); + dev_err(dev, "Cannot register the MDIO bus\n"); goto bus_register_fail; } @@ -285,14 +286,12 @@ int stmmac_mdio_register(struct net_device *ndev) irq_str = irq_num; break; } - netdev_info(ndev, "PHY ID %08x at %d IRQ %s (%s)%s\n", - phydev->phy_id, addr, irq_str, phydev_name(phydev), - act ? " active" : ""); + phy_attached_info(phydev); found = 1; } if (!found && !mdio_node) { - netdev_warn(ndev, "No PHY found\n"); + dev_warn(dev, "No PHY found\n"); mdiobus_unregister(new_bus); mdiobus_free(new_bus); return -ENODEV; From bb3afda4fc4ea690ff92a36eef4c0afe4d19da04 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Fri, 11 Aug 2017 10:18:20 +0200 Subject: [PATCH 18/47] nfp: do not update MTU from BH in flower app The Flower app may receive a request to update the MTU of a representor netdev upon receipt of a control message from the firmware. This requires the RTNL lock which needs to be taken outside of the packet processing path. As handling this correctly seems a little too invasive for a fix, simply skip setting the MTU for now. 
Relevant backtrace: [ 1496.288489] BUG: scheduling while atomic: kworker/0:3/373/0x00000100 [ 1496.294911] dca syscopyarea sysfillrect sysimgblt fb_sys_fops ptp drm mxm_wmi ahci pps_core libahci i2c_algo_bit wmi [last unloaded: nfp] [ 1496.294918] CPU: 0 PID: 373 Comm: kworker/0:3 Tainted: G OE 4.13.0-rc3+ #3 [ 1496.294919] Hardware name: Supermicro X10DRi/X10DRi, BIOS 2.0 12/28/2015 [ 1496.294923] Workqueue: events work_for_cpu_fn [ 1496.294924] Call Trace: [ 1496.294927] [ 1496.294931] dump_stack+0x63/0x82 [ 1496.294935] __schedule_bug+0x54/0x70 [ 1496.294937] __schedule+0x62f/0x890 [ 1496.294941] ? intel_unmap_sg+0x90/0x90 [ 1496.294942] schedule+0x36/0x80 [ 1496.294943] schedule_preempt_disabled+0xe/0x10 [ 1496.294945] __mutex_lock.isra.2+0x445/0x4a0 [ 1496.294947] ? device_is_rmrr_locked+0x12/0x50 [ 1496.294950] ? kfree+0x162/0x170 [ 1496.294952] ? device_is_rmrr_locked+0x12/0x50 [ 1496.294953] ? iommu_should_identity_map+0x50/0xe0 [ 1496.294954] __mutex_lock_slowpath+0x13/0x20 [ 1496.294955] ? iommu_no_mapping+0x48/0xd0 [ 1496.294956] ? __mutex_lock_slowpath+0x13/0x20 [ 1496.294957] mutex_lock+0x2f/0x40 [ 1496.294960] rtnl_lock+0x15/0x20 [ 1496.294979] nfp_flower_cmsg_rx+0xc8/0x150 [nfp] [ 1496.294986] nfp_ctrl_poll+0x286/0x350 [nfp] [ 1496.294989] tasklet_action+0xf6/0x110 [ 1496.294992] __do_softirq+0xed/0x278 [ 1496.294993] irq_exit+0xb6/0xc0 [ 1496.294994] do_IRQ+0x4f/0xd0 [ 1496.294996] common_interrupt+0x89/0x89 Fixes: 948faa46c05b ("nfp: add support for control messages for flower app") Signed-off-by: Simon Horman Reviewed-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- drivers/net/ethernet/netronome/nfp/flower/cmsg.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/flower/cmsg.c b/drivers/net/ethernet/netronome/nfp/flower/cmsg.c index dd7fa9cf225f..b0837b58c3a1 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/cmsg.c +++ b/drivers/net/ethernet/netronome/nfp/flower/cmsg.c @@ -115,14 +115,10 @@ nfp_flower_cmsg_portmod_rx(struct nfp_app *app, struct sk_buff *skb) return; } - if (link) { + if (link) netif_carrier_on(netdev); - rtnl_lock(); - dev_set_mtu(netdev, be16_to_cpu(msg->mtu)); - rtnl_unlock(); - } else { + else netif_carrier_off(netdev); - } rcu_read_unlock(); } From 54a6a043fb8580d5a741774669ef6049f402f228 Mon Sep 17 00:00:00 2001 From: Anton Vasilyev Date: Fri, 11 Aug 2017 15:57:22 +0300 Subject: [PATCH 19/47] mISDN: Fix null pointer dereference at mISDN_FsmNew If mISDN_FsmNew() fails to allocate memory for jumpmatrix then null pointer dereference will occur on any write to jumpmatrix. The patch adds check on successful allocation and corresponding error handling. Found by Linux Driver Verification project (linuxtesting.org). Signed-off-by: Anton Vasilyev Signed-off-by: David S. 
Miller --- drivers/isdn/mISDN/fsm.c | 5 ++++- drivers/isdn/mISDN/fsm.h | 2 +- drivers/isdn/mISDN/layer1.c | 3 +-- drivers/isdn/mISDN/layer2.c | 15 +++++++++++++-- drivers/isdn/mISDN/tei.c | 20 +++++++++++++++++--- 5 files changed, 36 insertions(+), 9 deletions(-) diff --git a/drivers/isdn/mISDN/fsm.c b/drivers/isdn/mISDN/fsm.c index 78fc5d5e9051..92e6570b1143 100644 --- a/drivers/isdn/mISDN/fsm.c +++ b/drivers/isdn/mISDN/fsm.c @@ -26,7 +26,7 @@ #define FSM_TIMER_DEBUG 0 -void +int mISDN_FsmNew(struct Fsm *fsm, struct FsmNode *fnlist, int fncount) { @@ -34,6 +34,8 @@ mISDN_FsmNew(struct Fsm *fsm, fsm->jumpmatrix = kzalloc(sizeof(FSMFNPTR) * fsm->state_count * fsm->event_count, GFP_KERNEL); + if (fsm->jumpmatrix == NULL) + return -ENOMEM; for (i = 0; i < fncount; i++) if ((fnlist[i].state >= fsm->state_count) || @@ -45,6 +47,7 @@ mISDN_FsmNew(struct Fsm *fsm, } else fsm->jumpmatrix[fsm->state_count * fnlist[i].event + fnlist[i].state] = (FSMFNPTR) fnlist[i].routine; + return 0; } EXPORT_SYMBOL(mISDN_FsmNew); diff --git a/drivers/isdn/mISDN/fsm.h b/drivers/isdn/mISDN/fsm.h index 928f5be192c1..e1def8490221 100644 --- a/drivers/isdn/mISDN/fsm.h +++ b/drivers/isdn/mISDN/fsm.h @@ -55,7 +55,7 @@ struct FsmTimer { void *arg; }; -extern void mISDN_FsmNew(struct Fsm *, struct FsmNode *, int); +extern int mISDN_FsmNew(struct Fsm *, struct FsmNode *, int); extern void mISDN_FsmFree(struct Fsm *); extern int mISDN_FsmEvent(struct FsmInst *, int , void *); extern void mISDN_FsmChangeState(struct FsmInst *, int); diff --git a/drivers/isdn/mISDN/layer1.c b/drivers/isdn/mISDN/layer1.c index bebc57b72138..3192b0eb3944 100644 --- a/drivers/isdn/mISDN/layer1.c +++ b/drivers/isdn/mISDN/layer1.c @@ -414,8 +414,7 @@ l1_init(u_int *deb) l1fsm_s.event_count = L1_EVENT_COUNT; l1fsm_s.strEvent = strL1Event; l1fsm_s.strState = strL1SState; - mISDN_FsmNew(&l1fsm_s, L1SFnList, ARRAY_SIZE(L1SFnList)); - return 0; + return mISDN_FsmNew(&l1fsm_s, L1SFnList, ARRAY_SIZE(L1SFnList)); } void diff --git 
a/drivers/isdn/mISDN/layer2.c b/drivers/isdn/mISDN/layer2.c index 7243a6746f8b..9ff0903a0e89 100644 --- a/drivers/isdn/mISDN/layer2.c +++ b/drivers/isdn/mISDN/layer2.c @@ -2247,15 +2247,26 @@ static struct Bprotocol X75SLP = { int Isdnl2_Init(u_int *deb) { + int res; debug = deb; mISDN_register_Bprotocol(&X75SLP); l2fsm.state_count = L2_STATE_COUNT; l2fsm.event_count = L2_EVENT_COUNT; l2fsm.strEvent = strL2Event; l2fsm.strState = strL2State; - mISDN_FsmNew(&l2fsm, L2FnList, ARRAY_SIZE(L2FnList)); - TEIInit(deb); + res = mISDN_FsmNew(&l2fsm, L2FnList, ARRAY_SIZE(L2FnList)); + if (res) + goto error; + res = TEIInit(deb); + if (res) + goto error_fsm; return 0; + +error_fsm: + mISDN_FsmFree(&l2fsm); +error: + mISDN_unregister_Bprotocol(&X75SLP); + return res; } void diff --git a/drivers/isdn/mISDN/tei.c b/drivers/isdn/mISDN/tei.c index 908127efccf8..12d9e5f4beb1 100644 --- a/drivers/isdn/mISDN/tei.c +++ b/drivers/isdn/mISDN/tei.c @@ -1387,23 +1387,37 @@ create_teimanager(struct mISDNdevice *dev) int TEIInit(u_int *deb) { + int res; debug = deb; teifsmu.state_count = TEI_STATE_COUNT; teifsmu.event_count = TEI_EVENT_COUNT; teifsmu.strEvent = strTeiEvent; teifsmu.strState = strTeiState; - mISDN_FsmNew(&teifsmu, TeiFnListUser, ARRAY_SIZE(TeiFnListUser)); + res = mISDN_FsmNew(&teifsmu, TeiFnListUser, ARRAY_SIZE(TeiFnListUser)); + if (res) + goto error; teifsmn.state_count = TEI_STATE_COUNT; teifsmn.event_count = TEI_EVENT_COUNT; teifsmn.strEvent = strTeiEvent; teifsmn.strState = strTeiState; - mISDN_FsmNew(&teifsmn, TeiFnListNet, ARRAY_SIZE(TeiFnListNet)); + res = mISDN_FsmNew(&teifsmn, TeiFnListNet, ARRAY_SIZE(TeiFnListNet)); + if (res) + goto error_smn; deactfsm.state_count = DEACT_STATE_COUNT; deactfsm.event_count = DEACT_EVENT_COUNT; deactfsm.strEvent = strDeactEvent; deactfsm.strState = strDeactState; - mISDN_FsmNew(&deactfsm, DeactFnList, ARRAY_SIZE(DeactFnList)); + res = mISDN_FsmNew(&deactfsm, DeactFnList, ARRAY_SIZE(DeactFnList)); + if (res) + goto error_deact; 
return 0; + +error_deact: + mISDN_FsmFree(&teifsmn); +error_smn: + mISDN_FsmFree(&teifsmu); +error: + return res; } void TEIFree(void) From e4dde4127396f0c8f1c2e11b3ecc5baf4f8628bf Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 11 Aug 2017 18:31:24 +0200 Subject: [PATCH 20/47] net: fix compilation when busy poll is not enabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MIN_NAPI_ID is used in various places outside of CONFIG_NET_RX_BUSY_POLL wrapping, so when it's not set we run into build errors such as: net/core/dev.c: In function 'dev_get_by_napi_id': net/core/dev.c:886:16: error: ‘MIN_NAPI_ID’ undeclared (first use in this function) if (napi_id < MIN_NAPI_ID) ^~~~~~~~~~~ Thus, have MIN_NAPI_ID always defined to fix these errors. Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/net/busy_poll.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index 8ffd434676b7..71c72a939bf8 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -29,18 +29,18 @@ #include #include -#ifdef CONFIG_NET_RX_BUSY_POLL - -struct napi_struct; -extern unsigned int sysctl_net_busy_read __read_mostly; -extern unsigned int sysctl_net_busy_poll __read_mostly; - /* 0 - Reserved to indicate value not set * 1..NR_CPUS - Reserved for sender_cpu * NR_CPUS+1..~0 - Region available for NAPI IDs */ #define MIN_NAPI_ID ((unsigned int)(NR_CPUS + 1)) +#ifdef CONFIG_NET_RX_BUSY_POLL + +struct napi_struct; +extern unsigned int sysctl_net_busy_read __read_mostly; +extern unsigned int sysctl_net_busy_poll __read_mostly; + static inline bool net_busy_loop_on(void) { return sysctl_net_busy_poll; From 2ed46ce45ec02f6b2188419acdf372a144e06fb5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 11 Aug 2017 18:31:25 +0200 Subject: [PATCH 21/47] bpf: fix two missing target_size settings in bpf_convert_ctx_access When CONFIG_NET_SCHED 
or CONFIG_NET_RX_BUSY_POLL is /not/ set and we try a narrow __sk_buff load of tc_index or napi_id, respectively, then verifier rightfully complains that it's misconfigured, because we need to set target_size in each of the two cases. The rewrite for the ctx access is just a dummy op, but needs to pass, so fix this up. Fixes: f96da09473b5 ("bpf: simplify narrower ctx access") Reported-by: Shubham Bansal Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/core/filter.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index f44fc22fd45a..6280a602604c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3505,6 +3505,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, bpf_target_off(struct sk_buff, tc_index, 2, target_size)); #else + *target_size = 2; if (type == BPF_WRITE) *insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg); else @@ -3520,6 +3521,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_JMP_IMM(BPF_JGE, si->dst_reg, MIN_NAPI_ID, 1); *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); #else + *target_size = 4; *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); #endif break; From fd851ba9caa9a63fdbb72a2e6ed5560c0989e999 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 11 Aug 2017 10:48:53 -0700 Subject: [PATCH 22/47] udp: harden copy_linear_skb() syzkaller got crashes with CONFIG_HARDENED_USERCOPY=y configs. Issue here is that recvfrom() can be used with user buffer of Z bytes, and SO_PEEK_OFF of X bytes, from a skb with Y bytes, and following condition : Z < X < Y kernel BUG at mm/usercopy.c:72! 
invalid opcode: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 0 PID: 2917 Comm: syzkaller842281 Not tainted 4.13.0-rc3+ #16 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 task: ffff8801d2fa40c0 task.stack: ffff8801d1fe8000 RIP: 0010:report_usercopy mm/usercopy.c:64 [inline] RIP: 0010:__check_object_size+0x3ad/0x500 mm/usercopy.c:264 RSP: 0018:ffff8801d1fef8a8 EFLAGS: 00010286 RAX: 0000000000000078 RBX: ffffffff847102c0 RCX: 0000000000000000 RDX: 0000000000000078 RSI: 1ffff1003a3fded5 RDI: ffffed003a3fdf09 RBP: ffff8801d1fef998 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: ffff8801d1ea480e R13: fffffffffffffffa R14: ffffffff84710280 R15: dffffc0000000000 FS: 0000000001360880(0000) GS:ffff8801dc000000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000202ecfe4 CR3: 00000001d1ff8000 CR4: 00000000001406f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: check_object_size include/linux/thread_info.h:108 [inline] check_copy_size include/linux/thread_info.h:139 [inline] copy_to_iter include/linux/uio.h:105 [inline] copy_linear_skb include/net/udp.h:371 [inline] udpv6_recvmsg+0x1040/0x1af0 net/ipv6/udp.c:395 inet_recvmsg+0x14c/0x5f0 net/ipv4/af_inet.c:793 sock_recvmsg_nosec net/socket.c:792 [inline] sock_recvmsg+0xc9/0x110 net/socket.c:799 SYSC_recvfrom+0x2d6/0x570 net/socket.c:1788 SyS_recvfrom+0x40/0x50 net/socket.c:1760 entry_SYSCALL_64_fastpath+0x1f/0xbe Fixes: b65ac44674dd ("udp: try to avoid 2 cache miss on dequeue") Signed-off-by: Eric Dumazet Cc: Paolo Abeni Signed-off-by: David S. 
Miller --- include/net/udp.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/net/udp.h b/include/net/udp.h index cc8036987dcb..e9b1d1eacb59 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -368,6 +368,8 @@ static inline int copy_linear_skb(struct sk_buff *skb, int len, int off, { int n, copy = len - off; + if (copy < 0) + return -EINVAL; n = copy_to_iter(skb->data + off, copy, to); if (n == copy) return 0; From 11e9d7829dd08dbafb24517fe922f11c3a8a9dc2 Mon Sep 17 00:00:00 2001 From: Andreas Born Date: Sat, 12 Aug 2017 00:36:55 +0200 Subject: [PATCH 23/47] bonding: ratelimit failed speed/duplex update warning bond_miimon_commit() handles the UP transition for each slave of a bond in the case of MII. It is triggered 10 times per second for the default MII Polling interval of 100ms. For device drivers that do not implement __ethtool_get_link_ksettings() the call to bond_update_speed_duplex() fails persistently while the MII status could remain UP. That is, in this and other cases where the speed/duplex update keeps failing over a longer period of time while the MII state is UP, a warning is printed every MII polling interval. To address these excessive warnings net_ratelimit() should be used. Printing a warning once would not be sufficient since the call to bond_update_speed_duplex() could recover to succeed and fail again later. In that case there would be no new indication what went wrong. Fixes: b5bf0f5b16b9c (bonding: correctly update link status during mii-commit phase) Signed-off-by: Andreas Born Signed-off-by: David S. 
Miller --- drivers/net/bonding/bond_main.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 85bb272d2a34..fc63992ab0e0 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -2144,9 +2144,10 @@ static void bond_miimon_commit(struct bonding *bond) if (bond_update_speed_duplex(slave) && bond_needs_speed_duplex(bond)) { slave->link = BOND_LINK_DOWN; - netdev_warn(bond->dev, - "failed to get link speed/duplex for %s\n", - slave->dev->name); + if (net_ratelimit()) + netdev_warn(bond->dev, + "failed to get link speed/duplex for %s\n", + slave->dev->name); continue; } bond_set_slave_link_state(slave, BOND_LINK_UP, From e9bf53ab1ee34bb05c104bbfd2b77c844773f8e6 Mon Sep 17 00:00:00 2001 From: Arend Van Spriel Date: Fri, 11 Aug 2017 11:07:36 +0100 Subject: [PATCH 24/47] brcmfmac: feature check for multi-scheduled scan fails on bcm4343x devices The firmware feature check introduced for multi-scheduled scan turned out to be failing for bcm4343{0,1,8} devices resulting in a firmware crash. The reason for this crash has not yet been root-caused, so this patch avoids the feature check for those devices as a short-term fix. 
Reported-by: Stefan Wahren Reported-by: Ian Molton Fixes: 9fe929aaace6 ("brcmfmac: add firmware feature detection for gscan feature") Signed-off-by: Arend van Spriel Signed-off-by: Kalle Valo --- drivers/net/wireless/broadcom/brcm80211/brcmfmac/feature.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/feature.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/feature.c index d21258d277ce..f1b60740e020 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/feature.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/feature.c @@ -159,8 +159,10 @@ void brcmf_feat_attach(struct brcmf_pub *drvr) brcmf_feat_firmware_capabilities(ifp); memset(&gscan_cfg, 0, sizeof(gscan_cfg)); - brcmf_feat_iovar_data_set(ifp, BRCMF_FEAT_GSCAN, "pfn_gscan_cfg", - &gscan_cfg, sizeof(gscan_cfg)); + if (drvr->bus_if->chip != BRCM_CC_43430_CHIP_ID) + brcmf_feat_iovar_data_set(ifp, BRCMF_FEAT_GSCAN, + "pfn_gscan_cfg", + &gscan_cfg, sizeof(gscan_cfg)); brcmf_feat_iovar_int_get(ifp, BRCMF_FEAT_PNO, "pfn"); if (drvr->bus_if->wowl_supported) brcmf_feat_iovar_int_get(ifp, BRCMF_FEAT_WOWL, "wowl"); From 2c87d63ac853550e734edfd45e1be5e5aa44fbcc Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 14 Aug 2017 00:52:58 +0200 Subject: [PATCH 25/47] ipv4: route: fix inet_rtm_getroute induced crash "ip route get $daddr iif eth0 from $saddr" causes: BUG: KASAN: use-after-free in ip_route_input_rcu+0x1535/0x1b50 Call Trace: ip_route_input_rcu+0x1535/0x1b50 ip_route_input_noref+0xf9/0x190 tcp_v4_early_demux+0x1a4/0x2b0 ip_rcv+0xbcb/0xc05 __netif_receive_skb+0x9c/0xd0 netif_receive_skb_internal+0x5a8/0x890 Problem is that inet_rtm_getroute calls either ip_route_input_rcu (if an iif was provided) or ip_route_output_key_hash_rcu. But ip_route_input_rcu, unlike ip_route_output_key_hash_rcu, already associates the dst_entry with the skb. This clears the SKB_DST_NOREF bit (i.e. 
skb_dst_drop will release/free the entry while it should not). Thus only set the dst if we called ip_route_output_key_hash_rcu(). I tested this patch by running: while true;do ip r get 10.0.1.2;done > /dev/null & while true;do ip r get 10.0.1.2 iif eth0 from 10.0.1.1;done > /dev/null & ... and saw no crash or memory leak. Cc: Roopa Prabhu Cc: David Ahern Fixes: ba52d61e0ff ("ipv4: route: restore skb_dst_set in inet_rtm_getroute") Signed-off-by: Florian Westphal Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/route.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 0383e66f59bc..7effa62beed3 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2750,12 +2750,13 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, err = 0; if (IS_ERR(rt)) err = PTR_ERR(rt); + else + skb_dst_set(skb, &rt->dst); } if (err) goto errout_free; - skb_dst_set(skb, &rt->dst); if (rtm->rtm_flags & RTM_F_NOTIFY) rt->rt_flags |= RTCF_NOTIFY; From fed5f5718c4989a03b1b4cdc0c7f273c3c74ee9e Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Mon, 14 Aug 2017 17:55:56 +0200 Subject: [PATCH 26/47] tipc: accept PACKET_MULTICAST packets On L2 bearers, the TIPC broadcast function is sending out packets using the corresponding L2 broadcast address. At reception, we filter such packets under the assumption that they will also be delivered as broadcast packets. This assumption doesn't always hold true. Under high load, we have seen that a switch may convert the destination address and deliver the packet as a PACKET_MULTICAST, something leading to inadvertently dropped packets and a stale and reset broadcast link. We fix this by extending the reception filtering to accept packets of type PACKET_MULTICAST. Signed-off-by: Jon Maloy Signed-off-by: David S. 
Miller --- net/tipc/bearer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index d174ee3254ee..767e0537dde5 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -596,7 +596,7 @@ static int tipc_l2_rcv_msg(struct sk_buff *skb, struct net_device *dev, rcu_read_lock(); b = rcu_dereference_rtnl(dev->tipc_ptr); if (likely(b && test_bit(0, &b->up) && - (skb->pkt_type <= PACKET_BROADCAST))) { + (skb->pkt_type <= PACKET_MULTICAST))) { skb->next = NULL; tipc_rcv(dev_net(dev), skb, b); rcu_read_unlock(); From 59a361bc6f6e91d57f25ff0aebb0e646beb3b41d Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Mon, 14 Aug 2017 18:28:49 +0200 Subject: [PATCH 27/47] tipc: avoid inheriting msg_non_seq flag when message is returned In the function msg_reverse(), we reverse the header while trying to reuse the original buffer whenever possible. Those rejected/returned messages are always transmitted as unicast, but the msg_non_seq field is not explicitly set to zero as it should be. We have seen cases where multicast senders set the message type to "NOT dest_droppable", meaning that a multicast message shorter than one MTU will be returned, e.g., during receive buffer overflow, by reusing the original buffer. This has the effect that even the 'msg_non_seq' field is inadvertently inherited by the rejected message, although it is now sent as a unicast message. This again leads the receiving unicast link endpoint to steer the packet toward the broadcast link receive function, where it is dropped. The affected unicast link is thereafter (after 100 failed retransmissions) declared 'stale' and reset. We fix this by unconditionally setting the 'msg_non_seq' flag to zero for all rejected/returned messages. Reported-by: Canh Duc Luu Signed-off-by: Jon Maloy Signed-off-by: David S. 
Miller --- net/tipc/msg.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/tipc/msg.c b/net/tipc/msg.c index ab3087687a32..dcd90e6fa7c3 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -513,6 +513,7 @@ bool tipc_msg_reverse(u32 own_node, struct sk_buff **skb, int err) /* Now reverse the concerned fields */ msg_set_errcode(hdr, err); + msg_set_non_seq(hdr, 0); msg_set_origport(hdr, msg_destport(&ohdr)); msg_set_destport(hdr, msg_origport(&ohdr)); msg_set_destnode(hdr, msg_prevnode(&ohdr)); From a99b646afa8a02571ea298bedca6592d818229cd Mon Sep 17 00:00:00 2001 From: dingtianhong Date: Tue, 15 Aug 2017 11:23:23 +0800 Subject: [PATCH 28/47] PCI: Disable PCIe Relaxed Ordering if unsupported When bit4 is set in the PCIe Device Control register, it indicates whether the device is permitted to use relaxed ordering. On some platforms using relaxed ordering can have performance issues or due to erratum can cause data-corruption. In such cases devices must avoid using relaxed ordering. The patch adds a new flag PCI_DEV_FLAGS_NO_RELAXED_ORDERING to indicate that Relaxed Ordering (RO) attribute should not be used for Transaction Layer Packets (TLP) targeted towards these affected root complexes. This patch checks if there is any node in the hierarchy that indicates that using relaxed ordering is not safe. In such cases the patch turns off the relaxed ordering by clearing the capability for this device. Signed-off-by: Casey Leedom Signed-off-by: Ding Tianhong Acked-by: Ashok Raj Acked-by: Alexander Duyck Acked-by: Casey Leedom Signed-off-by: David S. 
Miller --- drivers/pci/probe.c | 43 +++++++++++++++++++++++++++++++++++++++++++ drivers/pci/quirks.c | 11 +++++++++++ include/linux/pci.h | 3 +++ 3 files changed, 57 insertions(+) diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index c31310db0404..e6a917b4acd3 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1762,6 +1762,48 @@ static void pci_configure_extended_tags(struct pci_dev *dev) PCI_EXP_DEVCTL_EXT_TAG); } +/** + * pcie_relaxed_ordering_enabled - Probe for PCIe relaxed ordering enable + * @dev: PCI device to query + * + * Returns true if the device has enabled relaxed ordering attribute. + */ +bool pcie_relaxed_ordering_enabled(struct pci_dev *dev) +{ + u16 v; + + pcie_capability_read_word(dev, PCI_EXP_DEVCTL, &v); + + return !!(v & PCI_EXP_DEVCTL_RELAX_EN); +} +EXPORT_SYMBOL(pcie_relaxed_ordering_enabled); + +static void pci_configure_relaxed_ordering(struct pci_dev *dev) +{ + struct pci_dev *root; + + /* PCI_EXP_DEVICE_RELAX_EN is RsvdP in VFs */ + if (dev->is_virtfn) + return; + + if (!pcie_relaxed_ordering_enabled(dev)) + return; + + /* + * For now, we only deal with Relaxed Ordering issues with Root + * Ports. Peer-to-Peer DMA is another can of worms. 
+ */ + root = pci_find_pcie_root_port(dev); + if (!root) + return; + + if (root->dev_flags & PCI_DEV_FLAGS_NO_RELAXED_ORDERING) { + pcie_capability_clear_word(dev, PCI_EXP_DEVCTL, + PCI_EXP_DEVCTL_RELAX_EN); + dev_info(&dev->dev, "Disable Relaxed Ordering because the Root Port didn't support it\n"); + } +} + static void pci_configure_device(struct pci_dev *dev) { struct hotplug_params hpp; @@ -1769,6 +1811,7 @@ static void pci_configure_device(struct pci_dev *dev) pci_configure_mps(dev); pci_configure_extended_tags(dev); + pci_configure_relaxed_ordering(dev); memset(&hpp, 0, sizeof(hpp)); ret = pci_get_hp_params(dev, &hpp); diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 6967c6b4cf6b..61b59bfa7bfc 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -4015,6 +4015,17 @@ DECLARE_PCI_FIXUP_CLASS_EARLY(0x1797, 0x6868, PCI_CLASS_NOT_DEFINED, 8, DECLARE_PCI_FIXUP_CLASS_EARLY(0x1797, 0x6869, PCI_CLASS_NOT_DEFINED, 8, quirk_tw686x_class); +/* + * Some devices have problems with Transaction Layer Packets with the Relaxed + * Ordering Attribute set. Such devices should mark themselves and other + * Device Drivers should check before sending TLPs with RO set. + */ +static void quirk_relaxedordering_disable(struct pci_dev *dev) +{ + dev->dev_flags |= PCI_DEV_FLAGS_NO_RELAXED_ORDERING; + dev_info(&dev->dev, "Disable Relaxed Ordering Attributes to avoid PCIe Completion erratum\n"); +} + /* * Per PCIe r3.0, sec 2.2.9, "Completion headers must supply the same * values for the Attribute as were supplied in the header of the diff --git a/include/linux/pci.h b/include/linux/pci.h index 4869e66dd659..29606fb89464 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -188,6 +188,8 @@ enum pci_dev_flags { * the direct_complete optimization. 
*/ PCI_DEV_FLAGS_NEEDS_RESUME = (__force pci_dev_flags_t) (1 << 11), + /* Don't use Relaxed Ordering for TLPs directed at this device */ + PCI_DEV_FLAGS_NO_RELAXED_ORDERING = (__force pci_dev_flags_t) (1 << 12), }; enum pci_irq_reroute_variant { @@ -1125,6 +1127,7 @@ bool pci_check_pme_status(struct pci_dev *dev); void pci_pme_wakeup_bus(struct pci_bus *bus); void pci_d3cold_enable(struct pci_dev *dev); void pci_d3cold_disable(struct pci_dev *dev); +bool pcie_relaxed_ordering_enabled(struct pci_dev *dev); /* PCI Virtual Channel */ int pci_save_vc_state(struct pci_dev *dev); From 87e09cdec4dae08acdb4aa49beb793c19d73e73e Mon Sep 17 00:00:00 2001 From: dingtianhong Date: Tue, 15 Aug 2017 11:23:24 +0800 Subject: [PATCH 29/47] PCI: Disable Relaxed Ordering for some Intel processors Section 3.9.1 of the Intel spec says: 3.9.1 Optimizing PCIe Performance for Accesses Toward Coherent Memory and Toward MMIO Regions (P2P) In order to maximize performance for PCIe devices in the processors listed in Table 3-6 below, the software should determine whether the accesses are toward coherent memory (system memory) or toward MMIO regions (P2P access to other devices). If the access is toward MMIO region, then software can command HW to set the RO bit in the TLP header, as this would allow hardware to achieve maximum throughput for these types of accesses. For accesses toward coherent memory, software can command HW to clear the RO bit in the TLP header (no RO), as this would allow hardware to achieve maximum throughput for these types of accesses. Table 3-6. Intel Processor CPU RP Device IDs for Processors Optimizing PCIe Performance: Intel Xeon processors based on Broadwell microarchitecture - CPU RP Device IDs 6F01H-6F0EH; Intel Xeon processors based on Haswell microarchitecture - CPU RP Device IDs 2F01H-2F0EH. This means some Intel processors have performance issues when using the Relaxed Ordering Attribute, so disable Relaxed Ordering for these root ports. 
Signed-off-by: Casey Leedom Signed-off-by: Ding Tianhong Acked-by: Alexander Duyck Acked-by: Ashok Raj Signed-off-by: David S. Miller --- drivers/pci/quirks.c | 62 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 61b59bfa7bfc..1272f7e65699 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -4026,6 +4026,68 @@ static void quirk_relaxedordering_disable(struct pci_dev *dev) dev_info(&dev->dev, "Disable Relaxed Ordering Attributes to avoid PCIe Completion erratum\n"); } +/* + * Intel Xeon processors based on Broadwell/Haswell microarchitecture Root + * Complex has a Flow Control Credit issue which can cause performance + * problems with Upstream Transaction Layer Packets with Relaxed Ordering set. + */ +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_INTEL, 0x6f01, PCI_CLASS_NOT_DEFINED, 8, + quirk_relaxedordering_disable); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_INTEL, 0x6f02, PCI_CLASS_NOT_DEFINED, 8, + quirk_relaxedordering_disable); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_INTEL, 0x6f03, PCI_CLASS_NOT_DEFINED, 8, + quirk_relaxedordering_disable); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_INTEL, 0x6f04, PCI_CLASS_NOT_DEFINED, 8, + quirk_relaxedordering_disable); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_INTEL, 0x6f05, PCI_CLASS_NOT_DEFINED, 8, + quirk_relaxedordering_disable); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_INTEL, 0x6f06, PCI_CLASS_NOT_DEFINED, 8, + quirk_relaxedordering_disable); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_INTEL, 0x6f07, PCI_CLASS_NOT_DEFINED, 8, + quirk_relaxedordering_disable); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_INTEL, 0x6f08, PCI_CLASS_NOT_DEFINED, 8, + quirk_relaxedordering_disable); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_INTEL, 0x6f09, PCI_CLASS_NOT_DEFINED, 8, + quirk_relaxedordering_disable); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_INTEL, 0x6f0a, PCI_CLASS_NOT_DEFINED, 8, + 
quirk_relaxedordering_disable); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_INTEL, 0x6f0b, PCI_CLASS_NOT_DEFINED, 8, + quirk_relaxedordering_disable); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_INTEL, 0x6f0c, PCI_CLASS_NOT_DEFINED, 8, + quirk_relaxedordering_disable); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_INTEL, 0x6f0d, PCI_CLASS_NOT_DEFINED, 8, + quirk_relaxedordering_disable); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_INTEL, 0x6f0e, PCI_CLASS_NOT_DEFINED, 8, + quirk_relaxedordering_disable); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_INTEL, 0x2f01, PCI_CLASS_NOT_DEFINED, 8, + quirk_relaxedordering_disable); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_INTEL, 0x2f02, PCI_CLASS_NOT_DEFINED, 8, + quirk_relaxedordering_disable); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_INTEL, 0x2f03, PCI_CLASS_NOT_DEFINED, 8, + quirk_relaxedordering_disable); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_INTEL, 0x2f04, PCI_CLASS_NOT_DEFINED, 8, + quirk_relaxedordering_disable); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_INTEL, 0x2f05, PCI_CLASS_NOT_DEFINED, 8, + quirk_relaxedordering_disable); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_INTEL, 0x2f06, PCI_CLASS_NOT_DEFINED, 8, + quirk_relaxedordering_disable); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_INTEL, 0x2f07, PCI_CLASS_NOT_DEFINED, 8, + quirk_relaxedordering_disable); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_INTEL, 0x2f08, PCI_CLASS_NOT_DEFINED, 8, + quirk_relaxedordering_disable); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_INTEL, 0x2f09, PCI_CLASS_NOT_DEFINED, 8, + quirk_relaxedordering_disable); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_INTEL, 0x2f0a, PCI_CLASS_NOT_DEFINED, 8, + quirk_relaxedordering_disable); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_INTEL, 0x2f0b, PCI_CLASS_NOT_DEFINED, 8, + quirk_relaxedordering_disable); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_INTEL, 0x2f0c, PCI_CLASS_NOT_DEFINED, 8, + quirk_relaxedordering_disable); 
+DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_INTEL, 0x2f0d, PCI_CLASS_NOT_DEFINED, 8, + quirk_relaxedordering_disable); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_INTEL, 0x2f0e, PCI_CLASS_NOT_DEFINED, 8, + quirk_relaxedordering_disable); + /* * Per PCIe r3.0, sec 2.2.9, "Completion headers must supply the same * values for the Attribute as were supplied in the header of the From 077fa19c5dfa06a6ae04fb1661680940ff837612 Mon Sep 17 00:00:00 2001 From: dingtianhong Date: Tue, 15 Aug 2017 11:23:25 +0800 Subject: [PATCH 30/47] PCI: Disable Relaxed Ordering Attributes for AMD A1100 Casey reported that the AMD ARM A1100 SoC has a bug in its PCIe Root Port where Upstream Transaction Layer Packets with the Relaxed Ordering Attribute clear are allowed to bypass earlier TLPs with Relaxed Ordering set. This can cause data corruption, so we need to disable the Relaxed Ordering Attribute for Upstream TLPs directed to the Root Port. Reported-and-suggested-by: Casey Leedom Signed-off-by: Casey Leedom Signed-off-by: Ding Tianhong Acked-by: Casey Leedom Signed-off-by: David S. Miller --- drivers/pci/quirks.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 1272f7e65699..140760403f36 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -4088,6 +4088,22 @@ DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_INTEL, 0x2f0d, PCI_CLASS_NOT_DEFINED DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_INTEL, 0x2f0e, PCI_CLASS_NOT_DEFINED, 8, quirk_relaxedordering_disable); +/* + * The AMD ARM A1100 (AKA "SEATTLE") SoC has a bug in its PCIe Root Complex + * where Upstream Transaction Layer Packets with the Relaxed Ordering + * Attribute clear are allowed to bypass earlier TLPs with Relaxed Ordering + * set. This is a violation of the PCIe 3.0 Transaction Ordering Rules + * outlined in Section 2.4.1 (PCI Express(r) Base Specification Revision 3.0 + * November 10, 2010).
As a result, on this platform we can't use Relaxed + * Ordering for Upstream TLPs. + */ +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_AMD, 0x1a00, PCI_CLASS_NOT_DEFINED, 8, + quirk_relaxedordering_disable); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_AMD, 0x1a01, PCI_CLASS_NOT_DEFINED, 8, + quirk_relaxedordering_disable); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_AMD, 0x1a02, PCI_CLASS_NOT_DEFINED, 8, + quirk_relaxedordering_disable); + /* * Per PCIe r3.0, sec 2.2.9, "Completion headers must supply the same * values for the Attribute as were supplied in the header of the From b0ba9d5fded9590cac67a482c5aab8b1bf86ee40 Mon Sep 17 00:00:00 2001 From: Casey Leedom Date: Tue, 15 Aug 2017 11:23:26 +0800 Subject: [PATCH 31/47] net/cxgb4: Use new PCI_DEV_FLAGS_NO_RELAXED_ORDERING flag The cxgb4 Ethernet driver now queries its PCIe configuration space to determine whether it can send TLPs with the Relaxed Ordering Attribute set. Remove enable_pcie_relaxed_ordering() so that the probe routine no longer sets PCIe Capability Device Control[Relaxed Ordering Enable]; this ensures the driver will not send Relaxed Ordering TLPs to a Root Complex that cannot handle them. Signed-off-by: Casey Leedom Signed-off-by: Ding Tianhong Reviewed-by: Casey Leedom Signed-off-by: David S.
Miller --- drivers/net/ethernet/chelsio/cxgb4/cxgb4.h | 1 + .../net/ethernet/chelsio/cxgb4/cxgb4_main.c | 23 ++++++++++++++----- drivers/net/ethernet/chelsio/cxgb4/sge.c | 5 ++-- 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h index ef4be781fd05..09ea62ee96d3 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h @@ -529,6 +529,7 @@ enum { /* adapter flags */ USING_SOFT_PARAMS = (1 << 6), MASTER_PF = (1 << 7), FW_OFLD_CONN = (1 << 9), + ROOT_NO_RELAXED_ORDERING = (1 << 10), }; enum { diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index e403fa18f1b1..33bb8678833a 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -4654,11 +4654,6 @@ static void print_port_info(const struct net_device *dev) dev->name, adap->params.vpd.id, adap->name, buf); } -static void enable_pcie_relaxed_ordering(struct pci_dev *dev) -{ - pcie_capability_set_word(dev, PCI_EXP_DEVCTL, PCI_EXP_DEVCTL_RELAX_EN); -} - /* * Free the following resources: * - memory used for tables @@ -4908,7 +4903,6 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) } pci_enable_pcie_error_reporting(pdev); - enable_pcie_relaxed_ordering(pdev); pci_set_master(pdev); pci_save_state(pdev); @@ -4947,6 +4941,23 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) adapter->msg_enable = DFLT_MSG_ENABLE; memset(adapter->chan_map, 0xff, sizeof(adapter->chan_map)); + /* If possible, we use PCIe Relaxed Ordering Attribute to deliver + * Ingress Packet Data to Free List Buffers in order to allow for + * chipset performance optimizations between the Root Complex and + * Memory Controllers. 
(Messages to the associated Ingress Queue + * notifying new Packet Placement in the Free Lists Buffers will be + * send without the Relaxed Ordering Attribute thus guaranteeing that + * all preceding PCIe Transaction Layer Packets will be processed + * first.) But some Root Complexes have various issues with Upstream + * Transaction Layer Packets with the Relaxed Ordering Attribute set. + * The PCIe devices which under the Root Complexes will be cleared the + * Relaxed Ordering bit in the configuration space, So we check our + * PCIe configuration space to see if it's flagged with advice against + * using Relaxed Ordering. + */ + if (!pcie_relaxed_ordering_enabled(pdev)) + adapter->flags |= ROOT_NO_RELAXED_ORDERING; + spin_lock_init(&adapter->stats_lock); spin_lock_init(&adapter->tid_release_lock); spin_lock_init(&adapter->win0_lock); diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c index ede12209f20b..4ef68f69b58c 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c @@ -2719,6 +2719,7 @@ int t4_sge_alloc_rxq(struct adapter *adap, struct sge_rspq *iq, bool fwevtq, struct fw_iq_cmd c; struct sge *s = &adap->sge; struct port_info *pi = netdev_priv(dev); + int relaxed = !(adap->flags & ROOT_NO_RELAXED_ORDERING); /* Size needs to be multiple of 16, including status entry. 
*/ iq->size = roundup(iq->size, 16); @@ -2772,8 +2773,8 @@ int t4_sge_alloc_rxq(struct adapter *adap, struct sge_rspq *iq, bool fwevtq, flsz = fl->size / 8 + s->stat_len / sizeof(struct tx_desc); c.iqns_to_fl0congen |= htonl(FW_IQ_CMD_FL0PACKEN_F | - FW_IQ_CMD_FL0FETCHRO_F | - FW_IQ_CMD_FL0DATARO_F | + FW_IQ_CMD_FL0FETCHRO_V(relaxed) | + FW_IQ_CMD_FL0DATARO_V(relaxed) | FW_IQ_CMD_FL0PADEN_F); if (cong >= 0) c.iqns_to_fl0congen |= From b629276df7f669b1daaad2131ca418ab55186565 Mon Sep 17 00:00:00 2001 From: Casey Leedom Date: Tue, 15 Aug 2017 11:23:27 +0800 Subject: [PATCH 32/47] net/cxgb4vf: Use new PCI_DEV_FLAGS_NO_RELAXED_ORDERING flag cxgb4vf Ethernet driver now queries PCIe configuration space to determine if it can send TLPs to it with the Relaxed Ordering Attribute set, just like the pf did. Signed-off-by: Casey Leedom Signed-off-by: Ding Tianhong Reviewed-by: Casey Leedom Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4vf/adapter.h | 1 + .../ethernet/chelsio/cxgb4vf/cxgb4vf_main.c | 18 ++++++++++++++++++ drivers/net/ethernet/chelsio/cxgb4vf/sge.c | 3 +++ 3 files changed, 22 insertions(+) diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/adapter.h b/drivers/net/ethernet/chelsio/cxgb4vf/adapter.h index 109bc630408b..08c6ddb84a04 100644 --- a/drivers/net/ethernet/chelsio/cxgb4vf/adapter.h +++ b/drivers/net/ethernet/chelsio/cxgb4vf/adapter.h @@ -408,6 +408,7 @@ enum { /* adapter flags */ USING_MSI = (1UL << 1), USING_MSIX = (1UL << 2), QUEUES_BOUND = (1UL << 3), + ROOT_NO_RELAXED_ORDERING = (1UL << 4), }; /* diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c index ac7a150c54e9..2b85b874fd0d 100644 --- a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c @@ -2888,6 +2888,24 @@ static int cxgb4vf_pci_probe(struct pci_dev *pdev, */ adapter->name = pci_name(pdev); adapter->msg_enable = DFLT_MSG_ENABLE; + + /* If possible, 
we use PCIe Relaxed Ordering Attribute to deliver + * Ingress Packet Data to Free List Buffers in order to allow for + * chipset performance optimizations between the Root Complex and + * Memory Controllers. (Messages to the associated Ingress Queue + * notifying new Packet Placement in the Free Lists Buffers will be + * send without the Relaxed Ordering Attribute thus guaranteeing that + * all preceding PCIe Transaction Layer Packets will be processed + * first.) But some Root Complexes have various issues with Upstream + * Transaction Layer Packets with the Relaxed Ordering Attribute set. + * The PCIe devices which under the Root Complexes will be cleared the + * Relaxed Ordering bit in the configuration space, So we check our + * PCIe configuration space to see if it's flagged with advice against + * using Relaxed Ordering. + */ + if (!pcie_relaxed_ordering_enabled(pdev)) + adapter->flags |= ROOT_NO_RELAXED_ORDERING; + err = adap_init0(adapter); if (err) goto err_unmap_bar; diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/sge.c b/drivers/net/ethernet/chelsio/cxgb4vf/sge.c index e37dde2ba97f..05498e7f2840 100644 --- a/drivers/net/ethernet/chelsio/cxgb4vf/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb4vf/sge.c @@ -2205,6 +2205,7 @@ int t4vf_sge_alloc_rxq(struct adapter *adapter, struct sge_rspq *rspq, struct port_info *pi = netdev_priv(dev); struct fw_iq_cmd cmd, rpl; int ret, iqandst, flsz = 0; + int relaxed = !(adapter->flags & ROOT_NO_RELAXED_ORDERING); /* * If we're using MSI interrupts and we're not initializing the @@ -2300,6 +2301,8 @@ int t4vf_sge_alloc_rxq(struct adapter *adapter, struct sge_rspq *rspq, cpu_to_be32( FW_IQ_CMD_FL0HOSTFCMODE_V(SGE_HOSTFCMODE_NONE) | FW_IQ_CMD_FL0PACKEN_F | + FW_IQ_CMD_FL0FETCHRO_V(relaxed) | + FW_IQ_CMD_FL0DATARO_V(relaxed) | FW_IQ_CMD_FL0PADEN_F); /* In T6, for egress queue type FL there is internal overhead From 539a06baedd06127389b254f6b9f016ca072da13 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 14 Aug 2017 
18:04:24 +0200 Subject: [PATCH 33/47] tcp: ulp: avoid module refcnt leak in tcp_set_ulp __tcp_ulp_find_autoload returns tcp_ulp_ops after taking a reference on the module. Then, if ->init fails, tcp_set_ulp propagates the error but nothing releases that reference. Fixes: 734942cc4ea6 ("tcp: ULP infrastructure") Signed-off-by: Sabrina Dubroca Signed-off-by: David S. Miller --- net/ipv4/tcp_ulp.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c index 2417f55374c5..6bb9e14c710a 100644 --- a/net/ipv4/tcp_ulp.c +++ b/net/ipv4/tcp_ulp.c @@ -122,14 +122,14 @@ int tcp_set_ulp(struct sock *sk, const char *name) ulp_ops = __tcp_ulp_find_autoload(name); if (!ulp_ops) - err = -ENOENT; - else - err = ulp_ops->init(sk); + return -ENOENT; - if (err) - goto out; + err = ulp_ops->init(sk); + if (err) { + module_put(ulp_ops->owner); + return err; + } icsk->icsk_ulp_ops = ulp_ops; - out: - return err; + return 0; } From 36f41f8fc6d8aa9f8c9072d66ff7cf9055f5e69b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 14 Aug 2017 10:16:45 -0700 Subject: [PATCH 34/47] af_key: do not use GFP_KERNEL in atomic contexts pfkey_broadcast() might be called from non process contexts, we can not use GFP_KERNEL in these cases [1]. This patch partially reverts commit ba51b6be38c1 ("net: Fix RCU splat in af_key"), only keeping the GFP_ATOMIC forcing under rcu_read_lock() section. 
[1] : syzkaller reported : in_atomic(): 1, irqs_disabled(): 0, pid: 2932, name: syzkaller183439 3 locks held by syzkaller183439/2932: #0: (&net->xfrm.xfrm_cfg_mutex){+.+.+.}, at: [] pfkey_sendmsg+0x4c8/0x9f0 net/key/af_key.c:3649 #1: (&pfk->dump_lock){+.+.+.}, at: [] pfkey_do_dump+0x76/0x3f0 net/key/af_key.c:293 #2: (&(&net->xfrm.xfrm_policy_lock)->rlock){+...+.}, at: [] spin_lock_bh include/linux/spinlock.h:304 [inline] #2: (&(&net->xfrm.xfrm_policy_lock)->rlock){+...+.}, at: [] xfrm_policy_walk+0x192/0xa30 net/xfrm/xfrm_policy.c:1028 CPU: 0 PID: 2932 Comm: syzkaller183439 Not tainted 4.13.0-rc4+ #24 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:16 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:52 ___might_sleep+0x2b2/0x470 kernel/sched/core.c:5994 __might_sleep+0x95/0x190 kernel/sched/core.c:5947 slab_pre_alloc_hook mm/slab.h:416 [inline] slab_alloc mm/slab.c:3383 [inline] kmem_cache_alloc+0x24b/0x6e0 mm/slab.c:3559 skb_clone+0x1a0/0x400 net/core/skbuff.c:1037 pfkey_broadcast_one+0x4b2/0x6f0 net/key/af_key.c:207 pfkey_broadcast+0x4ba/0x770 net/key/af_key.c:281 dump_sp+0x3d6/0x500 net/key/af_key.c:2685 xfrm_policy_walk+0x2f1/0xa30 net/xfrm/xfrm_policy.c:1042 pfkey_dump_sp+0x42/0x50 net/key/af_key.c:2695 pfkey_do_dump+0xaa/0x3f0 net/key/af_key.c:299 pfkey_spddump+0x1a0/0x210 net/key/af_key.c:2722 pfkey_process+0x606/0x710 net/key/af_key.c:2814 pfkey_sendmsg+0x4d6/0x9f0 net/key/af_key.c:3650 sock_sendmsg_nosec net/socket.c:633 [inline] sock_sendmsg+0xca/0x110 net/socket.c:643 ___sys_sendmsg+0x755/0x890 net/socket.c:2035 __sys_sendmsg+0xe5/0x210 net/socket.c:2069 SYSC_sendmsg net/socket.c:2080 [inline] SyS_sendmsg+0x2d/0x50 net/socket.c:2076 entry_SYSCALL_64_fastpath+0x1f/0xbe RIP: 0033:0x445d79 RSP: 002b:00007f32447c1dc8 EFLAGS: 00000202 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 0000000000445d79 RDX: 0000000000000000 RSI: 000000002023dfc8 RDI: 
0000000000000008 RBP: 0000000000000086 R08: 00007f32447c2700 R09: 00007f32447c2700 R10: 00007f32447c2700 R11: 0000000000000202 R12: 0000000000000000 R13: 00007ffe33edec4f R14: 00007f32447c29c0 R15: 0000000000000000 Fixes: ba51b6be38c1 ("net: Fix RCU splat in af_key") Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Cc: David Ahern Acked-by: David Ahern Signed-off-by: David S. Miller --- net/key/af_key.c | 48 ++++++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/net/key/af_key.c b/net/key/af_key.c index ca9d3ae665e7..98f4d8211b9a 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -228,7 +228,7 @@ static int pfkey_broadcast_one(struct sk_buff *skb, struct sk_buff **skb2, #define BROADCAST_ONE 1 #define BROADCAST_REGISTERED 2 #define BROADCAST_PROMISC_ONLY 4 -static int pfkey_broadcast(struct sk_buff *skb, +static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation, int broadcast_flags, struct sock *one_sk, struct net *net) { @@ -278,7 +278,7 @@ static int pfkey_broadcast(struct sk_buff *skb, rcu_read_unlock(); if (one_sk != NULL) - err = pfkey_broadcast_one(skb, &skb2, GFP_KERNEL, one_sk); + err = pfkey_broadcast_one(skb, &skb2, allocation, one_sk); kfree_skb(skb2); kfree_skb(skb); @@ -311,7 +311,7 @@ static int pfkey_do_dump(struct pfkey_sock *pfk) hdr = (struct sadb_msg *) pfk->dump.skb->data; hdr->sadb_msg_seq = 0; hdr->sadb_msg_errno = rc; - pfkey_broadcast(pfk->dump.skb, BROADCAST_ONE, + pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE, &pfk->sk, sock_net(&pfk->sk)); pfk->dump.skb = NULL; } @@ -355,7 +355,7 @@ static int pfkey_error(const struct sadb_msg *orig, int err, struct sock *sk) hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); - pfkey_broadcast(skb, BROADCAST_ONE, sk, sock_net(sk)); + pfkey_broadcast(skb, GFP_KERNEL, BROADCAST_ONE, sk, sock_net(sk)); return 0; } @@ -1389,7 +1389,7 @@ static int pfkey_getspi(struct sock *sk, struct sk_buff *skb, const 
struct sadb_ xfrm_state_put(x); - pfkey_broadcast(resp_skb, BROADCAST_ONE, sk, net); + pfkey_broadcast(resp_skb, GFP_KERNEL, BROADCAST_ONE, sk, net); return 0; } @@ -1476,7 +1476,7 @@ static int key_notify_sa(struct xfrm_state *x, const struct km_event *c) hdr->sadb_msg_seq = c->seq; hdr->sadb_msg_pid = c->portid; - pfkey_broadcast(skb, BROADCAST_ALL, NULL, xs_net(x)); + pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, xs_net(x)); return 0; } @@ -1589,7 +1589,7 @@ static int pfkey_get(struct sock *sk, struct sk_buff *skb, const struct sadb_msg out_hdr->sadb_msg_reserved = 0; out_hdr->sadb_msg_seq = hdr->sadb_msg_seq; out_hdr->sadb_msg_pid = hdr->sadb_msg_pid; - pfkey_broadcast(out_skb, BROADCAST_ONE, sk, sock_net(sk)); + pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ONE, sk, sock_net(sk)); return 0; } @@ -1694,8 +1694,8 @@ static int pfkey_register(struct sock *sk, struct sk_buff *skb, const struct sad return -ENOBUFS; } - pfkey_broadcast(supp_skb, BROADCAST_REGISTERED, sk, sock_net(sk)); - + pfkey_broadcast(supp_skb, GFP_KERNEL, BROADCAST_REGISTERED, sk, + sock_net(sk)); return 0; } @@ -1712,7 +1712,8 @@ static int unicast_flush_resp(struct sock *sk, const struct sadb_msg *ihdr) hdr->sadb_msg_errno = (uint8_t) 0; hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); - return pfkey_broadcast(skb, BROADCAST_ONE, sk, sock_net(sk)); + return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ONE, sk, + sock_net(sk)); } static int key_notify_sa_flush(const struct km_event *c) @@ -1733,7 +1734,7 @@ static int key_notify_sa_flush(const struct km_event *c) hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); hdr->sadb_msg_reserved = 0; - pfkey_broadcast(skb, BROADCAST_ALL, NULL, c->net); + pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, c->net); return 0; } @@ -1790,7 +1791,7 @@ static int dump_sa(struct xfrm_state *x, int count, void *ptr) out_hdr->sadb_msg_pid = pfk->dump.msg_portid; if (pfk->dump.skb) - pfkey_broadcast(pfk->dump.skb, 
BROADCAST_ONE, + pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE, &pfk->sk, sock_net(&pfk->sk)); pfk->dump.skb = out_skb; @@ -1878,7 +1879,7 @@ static int pfkey_promisc(struct sock *sk, struct sk_buff *skb, const struct sadb new_hdr->sadb_msg_errno = 0; } - pfkey_broadcast(skb, BROADCAST_ALL, NULL, sock_net(sk)); + pfkey_broadcast(skb, GFP_KERNEL, BROADCAST_ALL, NULL, sock_net(sk)); return 0; } @@ -2206,7 +2207,7 @@ static int key_notify_policy(struct xfrm_policy *xp, int dir, const struct km_ev out_hdr->sadb_msg_errno = 0; out_hdr->sadb_msg_seq = c->seq; out_hdr->sadb_msg_pid = c->portid; - pfkey_broadcast(out_skb, BROADCAST_ALL, NULL, xp_net(xp)); + pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, NULL, xp_net(xp)); return 0; } @@ -2426,7 +2427,7 @@ static int key_pol_get_resp(struct sock *sk, struct xfrm_policy *xp, const struc out_hdr->sadb_msg_errno = 0; out_hdr->sadb_msg_seq = hdr->sadb_msg_seq; out_hdr->sadb_msg_pid = hdr->sadb_msg_pid; - pfkey_broadcast(out_skb, BROADCAST_ONE, sk, xp_net(xp)); + pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ONE, sk, xp_net(xp)); err = 0; out: @@ -2682,7 +2683,7 @@ static int dump_sp(struct xfrm_policy *xp, int dir, int count, void *ptr) out_hdr->sadb_msg_pid = pfk->dump.msg_portid; if (pfk->dump.skb) - pfkey_broadcast(pfk->dump.skb, BROADCAST_ONE, + pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE, &pfk->sk, sock_net(&pfk->sk)); pfk->dump.skb = out_skb; @@ -2739,7 +2740,7 @@ static int key_notify_policy_flush(const struct km_event *c) hdr->sadb_msg_satype = SADB_SATYPE_UNSPEC; hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); hdr->sadb_msg_reserved = 0; - pfkey_broadcast(skb_out, BROADCAST_ALL, NULL, c->net); + pfkey_broadcast(skb_out, GFP_ATOMIC, BROADCAST_ALL, NULL, c->net); return 0; } @@ -2803,7 +2804,7 @@ static int pfkey_process(struct sock *sk, struct sk_buff *skb, const struct sadb void *ext_hdrs[SADB_EXT_MAX]; int err; - pfkey_broadcast(skb_clone(skb, GFP_KERNEL), + 
pfkey_broadcast(skb_clone(skb, GFP_KERNEL), GFP_KERNEL, BROADCAST_PROMISC_ONLY, NULL, sock_net(sk)); memset(ext_hdrs, 0, sizeof(ext_hdrs)); @@ -3024,7 +3025,8 @@ static int key_notify_sa_expire(struct xfrm_state *x, const struct km_event *c) out_hdr->sadb_msg_seq = 0; out_hdr->sadb_msg_pid = 0; - pfkey_broadcast(out_skb, BROADCAST_REGISTERED, NULL, xs_net(x)); + pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL, + xs_net(x)); return 0; } @@ -3212,7 +3214,8 @@ static int pfkey_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_ctx->ctx_len); } - return pfkey_broadcast(skb, BROADCAST_REGISTERED, NULL, xs_net(x)); + return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL, + xs_net(x)); } static struct xfrm_policy *pfkey_compile_policy(struct sock *sk, int opt, @@ -3408,7 +3411,8 @@ static int pfkey_send_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, n_port->sadb_x_nat_t_port_port = sport; n_port->sadb_x_nat_t_port_reserved = 0; - return pfkey_broadcast(skb, BROADCAST_REGISTERED, NULL, xs_net(x)); + return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL, + xs_net(x)); } #ifdef CONFIG_NET_KEY_MIGRATE @@ -3599,7 +3603,7 @@ static int pfkey_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, } /* broadcast migrate message to sockets */ - pfkey_broadcast(skb, BROADCAST_ALL, NULL, &init_net); + pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, &init_net); return 0; From e5645f51ba99738b0e5d708edf9c6454f33b9310 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Mon, 14 Aug 2017 10:44:59 -0700 Subject: [PATCH 35/47] ipv6: release rt6->rt6i_idev properly during ifdown When a dst is created by addrconf_dst_alloc() for a host route or an anycast route, dst->dev points to loopback dev while rt6->rt6i_idev points to a real device. When the real device goes down, the current cleanup code only checks for dst->dev and assumes rt6->rt6i_idev->dev is the same. 
This causes the refcount leak on the real device in the above situation. This patch makes sure to always release the refcount taken on rt6->rt6i_idev during dst_dev_put(). Fixes: 587fea741134 ("ipv6: mark DST_NOGC and remove the operation of dst_free()") Reported-by: John Stultz Tested-by: John Stultz Tested-by: Martin KaFai Lau Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Acked-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a640fbcba15d..99d4727f2b18 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -417,14 +417,11 @@ static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, struct net_device *loopback_dev = dev_net(dev)->loopback_dev; - if (dev != loopback_dev) { - if (idev && idev->dev == dev) { - struct inet6_dev *loopback_idev = - in6_dev_get(loopback_dev); - if (loopback_idev) { - rt->rt6i_idev = loopback_idev; - in6_dev_put(idev); - } + if (idev && idev->dev != loopback_dev) { + struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev); + if (loopback_idev) { + rt->rt6i_idev = loopback_idev; + in6_dev_put(idev); } } } From 42b7305905be52e467bbc346b0f2f95ad44eb1a0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 14 Aug 2017 21:31:38 +0200 Subject: [PATCH 36/47] udp: fix linear skb reception with PEEK_OFF copy_linear_skb() is broken; both of its callers actually expect 'len' to be the amount we are trying to copy, not the offset of the end. Fix it keeping the meanings of arguments in sync with what the callers (both of them) expect. Also restore a saner behavior on EFAULT (i.e. preserving the iov_iter position in case of failure): The commit fd851ba9caa9 ("udp: harden copy_linear_skb()") avoids the more destructive effect of the buggy copy_linear_skb(), e.g. 
no more invalid memory access, but said function still behaves incorrectly: when peeking with offset it can fail with EINVAL instead of copying the appropriate amount of memory. Reported-by: Sasha Levin Fixes: b65ac44674dd ("udp: try to avoid 2 cache miss on dequeue") Fixes: fd851ba9caa9 ("udp: harden copy_linear_skb()") Signed-off-by: Al Viro Acked-by: Paolo Abeni Tested-by: Sasha Levin Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/udp.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/include/net/udp.h b/include/net/udp.h index e9b1d1eacb59..586de4b811b5 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -366,14 +366,13 @@ static inline bool udp_skb_is_linear(struct sk_buff *skb) static inline int copy_linear_skb(struct sk_buff *skb, int len, int off, struct iov_iter *to) { - int n, copy = len - off; + int n; - if (copy < 0) - return -EINVAL; - n = copy_to_iter(skb->data + off, copy, to); - if (n == copy) + n = copy_to_iter(skb->data + off, len, to); + if (n == len) return 0; + iov_iter_revert(to, n); return -EFAULT; } From 7749d4ff88d31b0be17c8683143135adaaadc6a7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 14 Aug 2017 14:10:25 -0700 Subject: [PATCH 37/47] dccp: purge write queue in dccp_destroy_sock() syzkaller reported that DCCP could have a non empty write queue at dismantle time. WARNING: CPU: 1 PID: 2953 at net/core/stream.c:199 sk_stream_kill_queues+0x3ce/0x520 net/core/stream.c:199 Kernel panic - not syncing: panic_on_warn set ... 
CPU: 1 PID: 2953 Comm: syz-executor0 Not tainted 4.13.0-rc4+ #2 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:16 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:52 panic+0x1e4/0x417 kernel/panic.c:180 __warn+0x1c4/0x1d9 kernel/panic.c:541 report_bug+0x211/0x2d0 lib/bug.c:183 fixup_bug+0x40/0x90 arch/x86/kernel/traps.c:190 do_trap_no_signal arch/x86/kernel/traps.c:224 [inline] do_trap+0x260/0x390 arch/x86/kernel/traps.c:273 do_error_trap+0x120/0x390 arch/x86/kernel/traps.c:310 do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:323 invalid_op+0x1e/0x30 arch/x86/entry/entry_64.S:846 RIP: 0010:sk_stream_kill_queues+0x3ce/0x520 net/core/stream.c:199 RSP: 0018:ffff8801d182f108 EFLAGS: 00010297 RAX: ffff8801d1144140 RBX: ffff8801d13cb280 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffffffff85137b00 RDI: ffff8801d13cb280 RBP: ffff8801d182f148 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: ffff8801d13cb4d0 R13: ffff8801d13cb3b8 R14: ffff8801d13cb300 R15: ffff8801d13cb3b8 inet_csk_destroy_sock+0x175/0x3f0 net/ipv4/inet_connection_sock.c:835 dccp_close+0x84d/0xc10 net/dccp/proto.c:1067 inet_release+0xed/0x1c0 net/ipv4/af_inet.c:425 sock_release+0x8d/0x1e0 net/socket.c:597 sock_close+0x16/0x20 net/socket.c:1126 __fput+0x327/0x7e0 fs/file_table.c:210 ____fput+0x15/0x20 fs/file_table.c:246 task_work_run+0x18a/0x260 kernel/task_work.c:116 exit_task_work include/linux/task_work.h:21 [inline] do_exit+0xa32/0x1b10 kernel/exit.c:865 do_group_exit+0x149/0x400 kernel/exit.c:969 get_signal+0x7e8/0x17e0 kernel/signal.c:2330 do_signal+0x94/0x1ee0 arch/x86/kernel/signal.c:808 exit_to_usermode_loop+0x21c/0x2d0 arch/x86/entry/common.c:157 prepare_exit_to_usermode arch/x86/entry/common.c:194 [inline] syscall_return_slowpath+0x3a7/0x450 arch/x86/entry/common.c:263 Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Signed-off-by: David S. 
Miller --- net/dccp/proto.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 9fe25bf63296..86bc40ba6ba5 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -201,10 +201,7 @@ void dccp_destroy_sock(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); - /* - * DCCP doesn't use sk_write_queue, just sk_send_head - * for retransmissions - */ + __skb_queue_purge(&sk->sk_write_queue); if (sk->sk_send_head != NULL) { kfree_skb(sk->sk_send_head); sk->sk_send_head = NULL; From d624d276d1ddacbcb12ad96832ce0c7b82cd25db Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 14 Aug 2017 17:44:43 -0700 Subject: [PATCH 38/47] tcp: fix possible deadlock in TCP stack vs BPF filter Filtering the ACK packet was not put at the right place. At this place, we already allocated a child and put it into accept queue. We absolutely need to call tcp_child_process() to release its spinlock, or we will deadlock at accept() or close() time. Found by syzkaller team (Thanks a lot !) Fixes: 8fac365f63c8 ("tcp: Add a tcp_filter hook before handle ack packet") Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Cc: Chenbo Feng Signed-off-by: David S. 
Miller --- net/ipv4/tcp_ipv4.c | 4 ++-- net/ipv6/tcp_ipv6.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a20e7f03d5f7..e9252c7df809 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1722,6 +1722,8 @@ int tcp_v4_rcv(struct sk_buff *skb) */ sock_hold(sk); refcounted = true; + if (tcp_filter(sk, skb)) + goto discard_and_relse; nsk = tcp_check_req(sk, skb, req, false); if (!nsk) { reqsk_put(req); @@ -1729,8 +1731,6 @@ int tcp_v4_rcv(struct sk_buff *skb) } if (nsk == sk) { reqsk_put(req); - } else if (tcp_filter(sk, skb)) { - goto discard_and_relse; } else if (tcp_child_process(sk, nsk, skb)) { tcp_v4_send_reset(nsk, skb); goto discard_and_relse; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 2521690d62d6..206210125fd7 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1456,6 +1456,8 @@ static int tcp_v6_rcv(struct sk_buff *skb) } sock_hold(sk); refcounted = true; + if (tcp_filter(sk, skb)) + goto discard_and_relse; nsk = tcp_check_req(sk, skb, req, false); if (!nsk) { reqsk_put(req); @@ -1464,8 +1466,6 @@ static int tcp_v6_rcv(struct sk_buff *skb) if (nsk == sk) { reqsk_put(req); tcp_v6_restore_cb(skb); - } else if (tcp_filter(sk, skb)) { - goto discard_and_relse; } else if (tcp_child_process(sk, nsk, skb)) { tcp_v6_send_reset(nsk, skb); goto discard_and_relse; From 12d94a804946af291e24b80fc53ec86264765781 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 15 Aug 2017 04:09:51 -0700 Subject: [PATCH 39/47] ipv6: fix NULL dereference in ip6_route_dev_notify() Based on a syzkaller report [1], I found that a per cpu allocation failure in snmp6_alloc_dev() would then lead to NULL dereference in ip6_route_dev_notify(). It seems this is a very old bug, thus no Fixes tag in this submission. 
Let's add in6_dev_put_clear() helper, as we will probably use it elsewhere (once available/present in net-next) [1] kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 1 PID: 17294 Comm: syz-executor6 Not tainted 4.13.0-rc2+ #10 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 task: ffff88019f456680 task.stack: ffff8801c6e58000 RIP: 0010:__read_once_size include/linux/compiler.h:250 [inline] RIP: 0010:atomic_read arch/x86/include/asm/atomic.h:26 [inline] RIP: 0010:refcount_sub_and_test+0x7d/0x1b0 lib/refcount.c:178 RSP: 0018:ffff8801c6e5f1b0 EFLAGS: 00010202 RAX: 0000000000000037 RBX: dffffc0000000000 RCX: ffffc90005d25000 RDX: ffff8801c6e5f218 RSI: ffffffff82342bbf RDI: 0000000000000001 RBP: ffff8801c6e5f240 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 1ffff10038dcbe37 R13: 0000000000000006 R14: 0000000000000001 R15: 00000000000001b8 FS: 00007f21e0429700(0000) GS:ffff8801dc100000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000001ddbc22000 CR3: 00000001d632b000 CR4: 00000000001426e0 DR0: 0000000020000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000600 Call Trace: refcount_dec_and_test+0x1a/0x20 lib/refcount.c:211 in6_dev_put include/net/addrconf.h:335 [inline] ip6_route_dev_notify+0x1c9/0x4a0 net/ipv6/route.c:3732 notifier_call_chain+0x136/0x2c0 kernel/notifier.c:93 __raw_notifier_call_chain kernel/notifier.c:394 [inline] raw_notifier_call_chain+0x2d/0x40 kernel/notifier.c:401 call_netdevice_notifiers_info+0x51/0x90 net/core/dev.c:1678 call_netdevice_notifiers net/core/dev.c:1694 [inline] rollback_registered_many+0x91c/0xe80 net/core/dev.c:7107 rollback_registered+0x1be/0x3c0 net/core/dev.c:7149 
register_netdevice+0xbcd/0xee0 net/core/dev.c:7587 register_netdev+0x1a/0x30 net/core/dev.c:7669 loopback_net_init+0x76/0x160 drivers/net/loopback.c:214 ops_init+0x10a/0x570 net/core/net_namespace.c:118 setup_net+0x313/0x710 net/core/net_namespace.c:294 copy_net_ns+0x27c/0x580 net/core/net_namespace.c:418 create_new_namespaces+0x425/0x880 kernel/nsproxy.c:107 unshare_nsproxy_namespaces+0xae/0x1e0 kernel/nsproxy.c:206 SYSC_unshare kernel/fork.c:2347 [inline] SyS_unshare+0x653/0xfa0 kernel/fork.c:2297 entry_SYSCALL_64_fastpath+0x1f/0xbe RIP: 0033:0x4512c9 RSP: 002b:00007f21e0428c08 EFLAGS: 00000216 ORIG_RAX: 0000000000000110 RAX: ffffffffffffffda RBX: 0000000000718150 RCX: 00000000004512c9 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000062020200 RBP: 0000000000000086 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000216 R12: 00000000004b973d R13: 00000000ffffffff R14: 000000002001d000 R15: 00000000000002dd Code: 50 2b 34 82 c7 00 f1 f1 f1 f1 c7 40 04 04 f2 f2 f2 c7 40 08 f3 f3 f3 f3 e8 a1 43 39 ff 4c 89 f8 48 8b 95 70 ff ff ff 48 c1 e8 03 <0f> b6 0c 18 4c 89 f8 83 e0 07 83 c0 03 38 c8 7c 08 84 c9 0f 85 RIP: __read_once_size include/linux/compiler.h:250 [inline] RSP: ffff8801c6e5f1b0 RIP: atomic_read arch/x86/include/asm/atomic.h:26 [inline] RSP: ffff8801c6e5f1b0 RIP: refcount_sub_and_test+0x7d/0x1b0 lib/refcount.c:178 RSP: ffff8801c6e5f1b0 ---[ end trace e441d046c6410d31 ]--- Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Signed-off-by: David S. 
Miller --- include/net/addrconf.h | 10 ++++++++++ net/ipv6/route.c | 6 +++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 6df79e96a780..f44ff2476758 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -336,6 +336,16 @@ static inline void in6_dev_put(struct inet6_dev *idev) in6_dev_finish_destroy(idev); } +static inline void in6_dev_put_clear(struct inet6_dev **pidev) +{ + struct inet6_dev *idev = *pidev; + + if (idev) { + in6_dev_put(idev); + *pidev = NULL; + } +} + static inline void __in6_dev_put(struct inet6_dev *idev) { refcount_dec(&idev->refcnt); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 99d4727f2b18..94d6a13d47f0 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3721,10 +3721,10 @@ static int ip6_route_dev_notify(struct notifier_block *this, /* NETDEV_UNREGISTER could be fired for multiple times by * netdev_wait_allrefs(). Make sure we only call this once. */ - in6_dev_put(net->ipv6.ip6_null_entry->rt6i_idev); + in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES - in6_dev_put(net->ipv6.ip6_prohibit_entry->rt6i_idev); - in6_dev_put(net->ipv6.ip6_blk_hole_entry->rt6i_idev); + in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev); + in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev); #endif } From b3dc8f772fab5b2d284b780830fd56494491e493 Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Tue, 15 Aug 2017 04:28:54 -0700 Subject: [PATCH 40/47] net: Fix a typo in comment about sock flags. Signed-off-by: Tonghao Zhang Signed-off-by: David S. 
Miller --- include/linux/net.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/net.h b/include/linux/net.h index dda2cc939a53..ebeb48c92005 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -37,7 +37,7 @@ struct net; /* Historically, SOCKWQ_ASYNC_NOSPACE & SOCKWQ_ASYNC_WAITDATA were located * in sock->flags, but moved into sk->sk_wq->flags to be RCU protected. - * Eventually all flags will be in sk->sk_wq_flags. + * Eventually all flags will be in sk->sk_wq->flags. */ #define SOCKWQ_ASYNC_NOSPACE 0 #define SOCKWQ_ASYNC_WAITDATA 1 From 187e5b3ac84d3421d2de3aca949b2791fbcad554 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 15 Aug 2017 05:26:17 -0700 Subject: [PATCH 41/47] ipv4: fix NULL dereference in free_fib_info_rcu() If fi->fib_metrics could not be allocated in fib_create_info() we attempt to dereference a NULL pointer in free_fib_info_rcu() : m = fi->fib_metrics; if (m != &dst_default_metrics && atomic_dec_and_test(&m->refcnt)) kfree(m); Before my recent patch, we used to call kfree(NULL) and nothing wrong happened. Instead of using RCU to defer freeing while we are under memory stress, it seems better to take immediate action. This was reported by syzkaller team. Fixes: 3fb07daff8e9 ("ipv4: add reference counting to metrics") Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Signed-off-by: David S. 
Miller --- net/ipv4/fib_semantics.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index b8d18171cca3..ec3a9ce281a6 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1083,15 +1083,17 @@ struct fib_info *fib_create_info(struct fib_config *cfg, fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL); if (!fi) goto failure; - fib_info_cnt++; if (cfg->fc_mx) { fi->fib_metrics = kzalloc(sizeof(*fi->fib_metrics), GFP_KERNEL); - if (!fi->fib_metrics) - goto failure; + if (unlikely(!fi->fib_metrics)) { + kfree(fi); + return ERR_PTR(err); + } atomic_set(&fi->fib_metrics->refcnt, 1); - } else + } else { fi->fib_metrics = (struct dst_metrics *)&dst_default_metrics; - + } + fib_info_cnt++; fi->fib_net = net; fi->fib_protocol = cfg->fc_protocol; fi->fib_scope = cfg->fc_scope; From 898904226b5a6dee657f23cf51e385f50da22596 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 15 Aug 2017 16:35:21 +0300 Subject: [PATCH 42/47] net_sched: reset pointers to tcf blocks in classful qdiscs' destructors Traffic filters could keep direct pointers to classes in classful qdisc, thus qdisc destruction first removes all filters before freeing classes. Class destruction methods also try to free attached filters, but now this isn't safe because tcf_block_put(), unlike tcf_destroy_chain(), cannot be called a second time. This patch sets class->block to NULL after the first tcf_block_put() and turns the second call into a no-op. Fixes: 6529eaba33f0 ("net: sched: introduce tcf block infractructure") Signed-off-by: Konstantin Khlebnikov Acked-by: Jiri Pirko Signed-off-by: David S.
Miller --- net/sched/sch_atm.c | 4 +++- net/sched/sch_cbq.c | 4 +++- net/sched/sch_hfsc.c | 4 +++- net/sched/sch_htb.c | 4 +++- 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index 572fe2584e48..c403c87aff7a 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -572,8 +572,10 @@ static void atm_tc_destroy(struct Qdisc *sch) struct atm_flow_data *flow, *tmp; pr_debug("atm_tc_destroy(sch %p,[qdisc %p])\n", sch, p); - list_for_each_entry(flow, &p->flows, list) + list_for_each_entry(flow, &p->flows, list) { tcf_block_put(flow->block); + flow->block = NULL; + } list_for_each_entry_safe(flow, tmp, &p->flows, list) { if (flow->ref > 1) diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 481036f6b54e..780db43300b1 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -1431,8 +1431,10 @@ static void cbq_destroy(struct Qdisc *sch) * be bound to classes which have been destroyed already. --TGR '04 */ for (h = 0; h < q->clhash.hashsize; h++) { - hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) + hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) { tcf_block_put(cl->block); + cl->block = NULL; + } } for (h = 0; h < q->clhash.hashsize; h++) { hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h], diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 3ad02bbe6903..fd15200f8627 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1530,8 +1530,10 @@ hfsc_destroy_qdisc(struct Qdisc *sch) unsigned int i; for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode) + hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode) { tcf_block_put(cl->block); + cl->block = NULL; + } } for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 203286ab4427..5d65ec5207e9 100644 --- a/net/sched/sch_htb.c +++ 
b/net/sched/sch_htb.c @@ -1258,8 +1258,10 @@ static void htb_destroy(struct Qdisc *sch) tcf_block_put(q->block); for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) + hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { tcf_block_put(cl->block); + cl->block = NULL; + } } for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], From 325d5dc3f7e7c2840b65e4a2988c082c2c0025c5 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 15 Aug 2017 16:37:04 +0300 Subject: [PATCH 43/47] net_sched/sfq: update hierarchical backlog when drop packet When sfq_enqueue() drops the head packet or a packet from another queue, it has to update the backlog at upper qdiscs too. Fixes: 2ccccf5fb43f ("net_sched: update hierarchical backlog too") Signed-off-by: Konstantin Khlebnikov Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_sfq.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index f80ea2cc5f1f..82469ef9655e 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -437,6 +437,7 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) qdisc_drop(head, sch, to_free); slot_queue_add(slot, skb); + qdisc_tree_reduce_backlog(sch, 0, delta); return NET_XMIT_CN; } @@ -468,8 +469,10 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) /* Return Congestion Notification only if we dropped a packet * from this flow.
*/ - if (qlen != slot->qlen) + if (qlen != slot->qlen) { + qdisc_tree_reduce_backlog(sch, 0, dropped - qdisc_pkt_len(skb)); return NET_XMIT_CN; + } /* As we dropped a packet, better let upper stack know this */ qdisc_tree_reduce_backlog(sch, 1, dropped); From c90e95147c27b1780e76c6e8fea1b5c78d7d387f Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 15 Aug 2017 16:39:05 +0300 Subject: [PATCH 44/47] net_sched: remove warning from qdisc_hash_add It was added in commit e57a784d8cae ("pkt_sched: set root qdisc before change() in attach_default_qdiscs()") to hide duplicates from "tc qdisc show" for inactive devices. After 59cc1f61f ("net: sched: convert qdisc linked list to hashtable") it triggers when a classful qdisc is added to an inactive device, because default qdiscs are added before switching the root qdisc. Anyway, after commit ea3274695353 ("net: sched: avoid duplicates in qdisc dump") duplicates are filtered right in the dumper. Signed-off-by: Konstantin Khlebnikov Signed-off-by: David S. Miller --- net/sched/sch_api.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index bd24a550e0f9..a3fa144b8648 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -286,9 +286,6 @@ static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle) void qdisc_hash_add(struct Qdisc *q, bool invisible) { if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) { - struct Qdisc *root = qdisc_dev(q)->qdisc; - - WARN_ON_ONCE(root == &noop_qdisc); ASSERT_RTNL(); hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle); if (invisible) From 61deee962896f7eb547adc66ef09c8f1e7ddf7d7 Mon Sep 17 00:00:00 2001 From: Bert Kenward Date: Tue, 15 Aug 2017 14:55:32 +0100 Subject: [PATCH 45/47] sfc: don't try and read ef10 data on non-ef10 NIC The MAC stats command takes a port ID, which doesn't exist on pre-ef10 NICs (5000- and 6000- series).
This is extracted from the NIC specific data; we misinterpret this as the ef10 data structure, causing us to read potentially unallocated data. With a KASAN kernel this can cause errors with: BUG: KASAN: slab-out-of-bounds in efx_mcdi_mac_stats Fixes: 0a2ab4d988d7 ("sfc: set the port-id when calling MC_CMD_MAC_STATS") Reported-by: Stefano Brivio Tested-by: Stefano Brivio Signed-off-by: Bert Kenward Signed-off-by: David S. Miller --- drivers/net/ethernet/sfc/mcdi_port.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/sfc/mcdi_port.c b/drivers/net/ethernet/sfc/mcdi_port.c index c905971c5f3a..990a63d7fcb7 100644 --- a/drivers/net/ethernet/sfc/mcdi_port.c +++ b/drivers/net/ethernet/sfc/mcdi_port.c @@ -938,7 +938,6 @@ enum efx_stats_action { static int efx_mcdi_mac_stats(struct efx_nic *efx, enum efx_stats_action action, int clear) { - struct efx_ef10_nic_data *nic_data = efx->nic_data; MCDI_DECLARE_BUF(inbuf, MC_CMD_MAC_STATS_IN_LEN); int rc; int change = action == EFX_STATS_PULL ? 
0 : 1; @@ -960,7 +959,12 @@ static int efx_mcdi_mac_stats(struct efx_nic *efx, MAC_STATS_IN_PERIODIC_NOEVENT, 1, MAC_STATS_IN_PERIOD_MS, period); MCDI_SET_DWORD(inbuf, MAC_STATS_IN_DMA_LEN, dma_len); - MCDI_SET_DWORD(inbuf, MAC_STATS_IN_PORT_ID, nic_data->vport_id); + + if (efx_nic_rev(efx) >= EFX_REV_HUNT_A0) { + struct efx_ef10_nic_data *nic_data = efx->nic_data; + + MCDI_SET_DWORD(inbuf, MAC_STATS_IN_PORT_ID, nic_data->vport_id); + } rc = efx_mcdi_rpc_quiet(efx, MC_CMD_MAC_STATS, inbuf, sizeof(inbuf), NULL, 0, NULL); From 0e405232871d67bf1b238d56b6b3d500e69ebbf3 Mon Sep 17 00:00:00 2001 From: dingtianhong Date: Tue, 15 Aug 2017 23:24:48 +0800 Subject: [PATCH 46/47] PCI: fix oops when try to find Root Port for a PCI device Eric reported an oops when booting the system after applying the commit a99b646afa8a ("PCI: Disable PCIe Relaxed..."): [ 4.241029] BUG: unable to handle kernel NULL pointer dereference at 0000000000000050 [ 4.247001] IP: pci_find_pcie_root_port+0x62/0x80 [ 4.253011] PGD 0 [ 4.253011] P4D 0 [ 4.253011] [ 4.258013] Oops: 0000 [#1] SMP DEBUG_PAGEALLOC [ 4.262015] Modules linked in: [ 4.265005] CPU: 31 PID: 1 Comm: swapper/0 Not tainted 4.13.0-dbx-DEV #316 [ 4.271002] Hardware name: Intel RML,PCH/Iota_QC_19, BIOS 2.40.0 06/22/2016 [ 4.279002] task: ffffa2ee38cfa040 task.stack: ffffa51ec0004000 [ 4.285001] RIP: 0010:pci_find_pcie_root_port+0x62/0x80 [ 4.290012] RSP: 0000:ffffa51ec0007ab8 EFLAGS: 00010246 [ 4.295003] RAX: 0000000000000000 RBX: ffffa2ee36bae000 RCX: 0000000000000006 [ 4.303002] RDX: 000000000000081c RSI: ffffa2ee38cfa8c8 RDI: ffffa2ee36bae000 [ 4.310013] RBP: ffffa51ec0007b58 R08: 0000000000000001 R09: 0000000000000000 [ 4.317001] R10: 0000000000000000 R11: 0000000000000000 R12: ffffa51ec0007ad0 [ 4.324005] R13: ffffa2ee36bae098 R14: 0000000000000002 R15: ffffa2ee37204818 [ 4.331002] FS: 0000000000000000(0000) GS:ffffa2ee3fcc0000(0000) knlGS:0000000000000000 [ 4.339002] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 4.345001] CR2:
0000000000000050 CR3: 000000401000f000 CR4: 00000000001406e0 [ 4.351002] Call Trace: [ 4.354012] ? pci_configure_device+0x19f/0x570 [ 4.359002] ? pci_conf1_read+0xb8/0xf0 [ 4.363002] ? raw_pci_read+0x23/0x40 [ 4.366011] ? pci_read+0x2c/0x30 [ 4.370014] ? pci_read_config_word+0x67/0x70 [ 4.374012] pci_device_add+0x28/0x230 [ 4.378012] ? pci_vpd_f0_read+0x50/0x80 [ 4.382014] pci_scan_single_device+0x96/0xc0 [ 4.386012] pci_scan_slot+0x79/0xf0 [ 4.389001] pci_scan_child_bus+0x31/0x180 [ 4.394014] acpi_pci_root_create+0x1c6/0x240 [ 4.398013] pci_acpi_scan_root+0x15f/0x1b0 [ 4.402012] acpi_pci_root_add+0x2e6/0x400 [ 4.406012] ? acpi_evaluate_integer+0x37/0x60 [ 4.411002] acpi_bus_attach+0xdf/0x200 [ 4.415002] acpi_bus_attach+0x6a/0x200 [ 4.418014] acpi_bus_attach+0x6a/0x200 [ 4.422013] acpi_bus_scan+0x38/0x70 [ 4.426011] acpi_scan_init+0x10c/0x271 [ 4.429001] acpi_init+0x2fa/0x348 [ 4.433004] ? acpi_sleep_proc_init+0x2d/0x2d [ 4.437001] do_one_initcall+0x43/0x169 [ 4.441001] kernel_init_freeable+0x1d0/0x258 [ 4.445003] ? rest_init+0xe0/0xe0 [ 4.449001] kernel_init+0xe/0x150 ====================== cut here ============================= It looks like pci_find_pcie_root_port() was trying to find the Root Port for a PCI device which is the Root Port already; it will return NULL and trigger the problem, so check the highest_pcie_bridge to fix this problem. Fixes: a99b646afa8a ("PCI: Disable PCIe Relaxed Ordering if unsupported") Fixes: c56d4450eb68 ("PCI: Turn off Request Attributes to avoid Chelsio T5 Completion erratum") Reported-by: Eric Dumazet Signed-off-by: Eric Dumazet Signed-off-by: Ding Tianhong Signed-off-by: David S.
Miller --- drivers/pci/pci.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index af0cc3456dc1..587cd7623ed8 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -522,10 +522,11 @@ struct pci_dev *pci_find_pcie_root_port(struct pci_dev *dev) bridge = pci_upstream_bridge(bridge); } - if (pci_pcie_type(highest_pcie_bridge) != PCI_EXP_TYPE_ROOT_PORT) - return NULL; + if (highest_pcie_bridge && + pci_pcie_type(highest_pcie_bridge) == PCI_EXP_TYPE_ROOT_PORT) + return highest_pcie_bridge; - return highest_pcie_bridge; + return NULL; } EXPORT_SYMBOL(pci_find_pcie_root_port); From 88a5c690b66110ad255380d8f629c629cf6ca559 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 16 Aug 2017 01:45:33 +0200 Subject: [PATCH 47/47] bpf: fix bpf_trace_printk on 32 bit archs James reported that on MIPS32 bpf_trace_printk() is currently broken while MIPS64 works fine: bpf_trace_printk() uses conditional operators to attempt to pass different types to __trace_printk() depending on the format operators. This doesn't work as intended on 32-bit architectures where u32 and long are passed differently to u64, since the result of C conditional operators follows the "usual arithmetic conversions" rules, such that the values passed to __trace_printk() will always be u64 [causing issues later in the va_list handling for vscnprintf()]. For example the samples/bpf/tracex5 test printed lines like below on MIPS32, where the fd and buf have come from the u64 fd argument, and the size from the buf argument: [...] 1180.941542: 0x00000001: write(fd=1, buf= (null), size=6258688) Instead of this: [...] 1625.616026: 0x00000001: write(fd=1, buf=009e4000, size=512) One way to get it working is to expand various combinations of argument types into 8 different combinations for 32 bit and 64 bit kernels. Fix tested by James on MIPS32 and MIPS64 as well that it resolves the issue. 
Fixes: 9c959c863f82 ("tracing: Allow BPF programs to call bpf_trace_printk()") Reported-by: James Hogan Tested-by: James Hogan Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/trace/bpf_trace.c | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 37385193a608..dc498b605d5d 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -204,10 +204,36 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, fmt_cnt++; } - return __trace_printk(1/* fake ip will not be printed */, fmt, - mod[0] == 2 ? arg1 : mod[0] == 1 ? (long) arg1 : (u32) arg1, - mod[1] == 2 ? arg2 : mod[1] == 1 ? (long) arg2 : (u32) arg2, - mod[2] == 2 ? arg3 : mod[2] == 1 ? (long) arg3 : (u32) arg3); +/* Horrid workaround for getting va_list handling working with different + * argument type combinations generically for 32 and 64 bit archs. + */ +#define __BPF_TP_EMIT() __BPF_ARG3_TP() +#define __BPF_TP(...) \ + __trace_printk(1 /* Fake ip will not be printed. */, \ + fmt, ##__VA_ARGS__) + +#define __BPF_ARG1_TP(...) \ + ((mod[0] == 2 || (mod[0] == 1 && __BITS_PER_LONG == 64)) \ + ? __BPF_TP(arg1, ##__VA_ARGS__) \ + : ((mod[0] == 1 || (mod[0] == 0 && __BITS_PER_LONG == 32)) \ + ? __BPF_TP((long)arg1, ##__VA_ARGS__) \ + : __BPF_TP((u32)arg1, ##__VA_ARGS__))) + +#define __BPF_ARG2_TP(...) \ + ((mod[1] == 2 || (mod[1] == 1 && __BITS_PER_LONG == 64)) \ + ? __BPF_ARG1_TP(arg2, ##__VA_ARGS__) \ + : ((mod[1] == 1 || (mod[1] == 0 && __BITS_PER_LONG == 32)) \ + ? __BPF_ARG1_TP((long)arg2, ##__VA_ARGS__) \ + : __BPF_ARG1_TP((u32)arg2, ##__VA_ARGS__))) + +#define __BPF_ARG3_TP(...) \ + ((mod[2] == 2 || (mod[2] == 1 && __BITS_PER_LONG == 64)) \ + ? __BPF_ARG2_TP(arg3, ##__VA_ARGS__) \ + : ((mod[2] == 1 || (mod[2] == 0 && __BITS_PER_LONG == 32)) \ + ? 
__BPF_ARG2_TP((long)arg3, ##__VA_ARGS__) \ + : __BPF_ARG2_TP((u32)arg3, ##__VA_ARGS__))) + + return __BPF_TP_EMIT(); } static const struct bpf_func_proto bpf_trace_printk_proto = {