linux-stable/net/ipv6/ndisc.c

2074 lines
53 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Neighbour Discovery for IPv6
* Linux INET6 implementation
*
* Authors:
* Pedro Roque <roque@di.fc.ul.pt>
* Mike Shaver <shaver@ingenia.com>
*/
/*
* Changes:
*
* Alexey I. Froloff : RFC6106 (DNSSL) support
* Pierre Ynard : export userland ND options
* through netlink (RDNSS support)
* Lars Fenneberg : fixed MTU setting on receipt
* of an RA.
* Janos Farkas : kmalloc failure checks
* Alexey Kuznetsov : state machine reworked
* and moved to net/core.
* Pekka Savola : RFC2461 validation
* YOSHIFUJI Hideaki @USAGI : Verify ND options properly
*/
#define pr_fmt(fmt) "ICMPv6: " fmt
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/sched.h>
#include <linux/net.h>
#include <linux/in6.h>
#include <linux/route.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h percpu.h is included by sched.h and module.h and thus ends up being included when building most .c files. percpu.h includes slab.h which in turn includes gfp.h making everything defined by the two files universally available and complicating inclusion dependencies. percpu.h -> slab.h dependency is about to be removed. Prepare for this change by updating users of gfp and slab facilities include those headers directly instead of assuming availability. As this conversion needs to touch large number of source files, the following script is used as the basis of conversion. http://userweb.kernel.org/~tj/misc/slabh-sweep.py The script does the followings. * Scan files for gfp and slab usages and update includes such that only the necessary includes are there. ie. if only gfp is used, gfp.h, if slab is used, slab.h. * When the script inserts a new include, it looks at the include blocks and try to put the new include such that its order conforms to its surrounding. It's put in the include block which contains core kernel includes, in the same order that the rest are ordered - alphabetical, Christmas tree, rev-Xmas-tree or at the end if there doesn't seem to be any matching order. * If the script can't find a place to put a new include (mostly because the file doesn't have fitting include block), it prints out an error message indicating which .h file needs to be added to the file. The conversion was done in the following steps. 1. The initial automatic conversion of all .c files updated slightly over 4000 files, deleting around 700 includes and adding ~480 gfp.h and ~3000 slab.h inclusions. The script emitted errors for ~400 files. 2. Each error was manually checked. Some didn't need the inclusion, some needed manual addition while adding it to implementation .h or embedding .c file was more appropriate for others. This step added inclusions to around 150 files. 3. The script was run again and the output was compared to the edits from #2 to make sure no file was left behind. 4. Several build tests were done and a couple of problems were fixed. e.g. lib/decompress_*.c used malloc/free() wrappers around slab APIs requiring slab.h to be added manually. 5. The script was run on all .h files but without automatically editing them as sprinkling gfp.h and slab.h inclusions around .h files could easily lead to inclusion dependency hell. Most gfp.h inclusion directives were ignored as stuff from gfp.h was usually wildly available and often used in preprocessor macros. Each slab.h inclusion directive was examined and added manually as necessary. 6. percpu.h was updated not to include slab.h. 7. Build test were done on the following configurations and failures were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my distributed build env didn't work with gcov compiles) and a few more options had to be turned off depending on archs to make things build (like ipr on powerpc/64 which failed due to missing writeq). * x86 and x86_64 UP and SMP allmodconfig and a custom test config. * powerpc and powerpc64 SMP allmodconfig * sparc and sparc64 SMP allmodconfig * ia64 SMP allmodconfig * s390 SMP allmodconfig * alpha SMP allmodconfig * um on x86_64 SMP allmodconfig 8. percpu.h modifications were reverted so that it could be applied as a separate patch and serve as bisection point. Given the fact that I had only a couple of failures from tests on step 6, I'm fairly confident about the coverage of this conversion patch. If there is a breakage, it's likely to be something in one of the arch headers which should be easily discoverable easily on most builds of the specific arch. Signed-off-by: Tejun Heo <tj@kernel.org> Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 08:04:11 +00:00
#include <linux/slab.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <linux/if_addr.h>
#include <linux/if_ether.h>
#include <linux/if_arp.h>
#include <linux/ipv6.h>
#include <linux/icmpv6.h>
#include <linux/jhash.h>
#include <net/sock.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/protocol.h>
#include <net/ndisc.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/icmp.h>
#include <net/netlink.h>
#include <linux/rtnetlink.h>
#include <net/flow.h>
#include <net/ip6_checksum.h>
#include <net/inet_common.h>
#include <linux/proc_fs.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>
static u32 ndisc_hash(const void *pkey,
const struct net_device *dev,
__u32 *hash_rnd);
static bool ndisc_key_eq(const struct neighbour *neigh, const void *pkey);
static bool ndisc_allow_add(const struct net_device *dev,
struct netlink_ext_ack *extack);
static int ndisc_constructor(struct neighbour *neigh);
static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb);
static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb);
static int pndisc_constructor(struct pneigh_entry *n);
static void pndisc_destructor(struct pneigh_entry *n);
static void pndisc_redo(struct sk_buff *skb);
static int ndisc_is_multicast(const void *pkey);
static const struct neigh_ops ndisc_generic_ops = {
.family = AF_INET6,
.solicit = ndisc_solicit,
.error_report = ndisc_error_report,
.output = neigh_resolve_output,
.connected_output = neigh_connected_output,
};
static const struct neigh_ops ndisc_hh_ops = {
.family = AF_INET6,
.solicit = ndisc_solicit,
.error_report = ndisc_error_report,
.output = neigh_resolve_output,
.connected_output = neigh_resolve_output,
};
static const struct neigh_ops ndisc_direct_ops = {
.family = AF_INET6,
.output = neigh_direct_output,
.connected_output = neigh_direct_output,
};
struct neigh_table nd_tbl = {
.family = AF_INET6,
.key_len = sizeof(struct in6_addr),
.protocol = cpu_to_be16(ETH_P_IPV6),
.hash = ndisc_hash,
.key_eq = ndisc_key_eq,
.constructor = ndisc_constructor,
.pconstructor = pndisc_constructor,
.pdestructor = pndisc_destructor,
.proxy_redo = pndisc_redo,
.is_multicast = ndisc_is_multicast,
.allow_add = ndisc_allow_add,
.id = "ndisc_cache",
.parms = {
.tbl = &nd_tbl,
.reachable_time = ND_REACHABLE_TIME,
.data = {
[NEIGH_VAR_MCAST_PROBES] = 3,
[NEIGH_VAR_UCAST_PROBES] = 3,
[NEIGH_VAR_RETRANS_TIME] = ND_RETRANS_TIMER,
[NEIGH_VAR_BASE_REACHABLE_TIME] = ND_REACHABLE_TIME,
[NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ,
[NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ,
[NEIGH_VAR_GC_STALETIME] = 60 * HZ,
[NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX,
[NEIGH_VAR_PROXY_QLEN] = 64,
[NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ,
[NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10,
},
},
.gc_interval = 30 * HZ,
.gc_thresh1 = 128,
.gc_thresh2 = 512,
.gc_thresh3 = 1024,
};
EXPORT_SYMBOL_GPL(nd_tbl);
void __ndisc_fill_addr_option(struct sk_buff *skb, int type, const void *data,
int data_len, int pad)
{
int space = __ndisc_opt_addr_space(data_len, pad);
u8 *opt = skb_put(skb, space);
opt[0] = type;
opt[1] = space>>3;
memset(opt + 2, 0, pad);
opt += pad;
space -= pad;
memcpy(opt+2, data, data_len);
data_len += 2;
opt += data_len;
space -= data_len;
if (space > 0)
memset(opt, 0, space);
}
EXPORT_SYMBOL_GPL(__ndisc_fill_addr_option);
static inline void ndisc_fill_addr_option(struct sk_buff *skb, int type,
const void *data, u8 icmp6_type)
{
__ndisc_fill_addr_option(skb, type, data, skb->dev->addr_len,
ndisc_addr_option_pad(skb->dev->type));
ndisc_ops_fill_addr_option(skb->dev, skb, icmp6_type);
}
static inline void ndisc_fill_redirect_addr_option(struct sk_buff *skb,
void *ha,
const u8 *ops_data)
{
ndisc_fill_addr_option(skb, ND_OPT_TARGET_LL_ADDR, ha, NDISC_REDIRECT);
ndisc_ops_fill_redirect_addr_option(skb->dev, skb, ops_data);
}
static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur,
struct nd_opt_hdr *end)
{
int type;
if (!cur || !end || cur >= end)
return NULL;
type = cur->nd_opt_type;
do {
cur = ((void *)cur) + (cur->nd_opt_len << 3);
} while (cur < end && cur->nd_opt_type != type);
return cur <= end && cur->nd_opt_type == type ? cur : NULL;
}
static inline int ndisc_is_useropt(const struct net_device *dev,
struct nd_opt_hdr *opt)
{
return opt->nd_opt_type == ND_OPT_PREFIX_INFO ||
opt->nd_opt_type == ND_OPT_RDNSS ||
opt->nd_opt_type == ND_OPT_DNSSL ||
opt->nd_opt_type == ND_OPT_CAPTIVE_PORTAL ||
opt->nd_opt_type == ND_OPT_PREF64 ||
ndisc_ops_is_useropt(dev, opt->nd_opt_type);
}
static struct nd_opt_hdr *ndisc_next_useropt(const struct net_device *dev,
struct nd_opt_hdr *cur,
struct nd_opt_hdr *end)
{
if (!cur || !end || cur >= end)
return NULL;
do {
cur = ((void *)cur) + (cur->nd_opt_len << 3);
} while (cur < end && !ndisc_is_useropt(dev, cur));
return cur <= end && ndisc_is_useropt(dev, cur) ? cur : NULL;
}
struct ndisc_options *ndisc_parse_options(const struct net_device *dev,
u8 *opt, int opt_len,
struct ndisc_options *ndopts)
{
struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)opt;
if (!nd_opt || opt_len < 0 || !ndopts)
return NULL;
memset(ndopts, 0, sizeof(*ndopts));
while (opt_len) {
int l;
if (opt_len < sizeof(struct nd_opt_hdr))
return NULL;
l = nd_opt->nd_opt_len << 3;
if (opt_len < l || l == 0)
return NULL;
if (ndisc_ops_parse_options(dev, nd_opt, ndopts))
goto next_opt;
switch (nd_opt->nd_opt_type) {
case ND_OPT_SOURCE_LL_ADDR:
case ND_OPT_TARGET_LL_ADDR:
case ND_OPT_MTU:
case ND_OPT_NONCE:
case ND_OPT_REDIRECT_HDR:
if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) {
ND_PRINTK(2, warn,
"%s: duplicated ND6 option found: type=%d\n",
__func__, nd_opt->nd_opt_type);
} else {
ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt;
}
break;
case ND_OPT_PREFIX_INFO:
ndopts->nd_opts_pi_end = nd_opt;
if (!ndopts->nd_opt_array[nd_opt->nd_opt_type])
ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt;
break;
#ifdef CONFIG_IPV6_ROUTE_INFO
case ND_OPT_ROUTE_INFO:
ndopts->nd_opts_ri_end = nd_opt;
if (!ndopts->nd_opts_ri)
ndopts->nd_opts_ri = nd_opt;
break;
#endif
default:
if (ndisc_is_useropt(dev, nd_opt)) {
ndopts->nd_useropts_end = nd_opt;
if (!ndopts->nd_useropts)
ndopts->nd_useropts = nd_opt;
} else {
/*
* Unknown options must be silently ignored,
* to accommodate future extension to the
* protocol.
*/
ND_PRINTK(2, notice,
"%s: ignored unsupported option; type=%d, len=%d\n",
__func__,
nd_opt->nd_opt_type,
nd_opt->nd_opt_len);
}
}
next_opt:
opt_len -= l;
nd_opt = ((void *)nd_opt) + l;
}
return ndopts;
}
int ndisc_mc_map(const struct in6_addr *addr, char *buf, struct net_device *dev, int dir)
{
switch (dev->type) {
case ARPHRD_ETHER:
case ARPHRD_IEEE802: /* Not sure. Check it later. --ANK */
case ARPHRD_FDDI:
ipv6_eth_mc_map(addr, buf);
return 0;
case ARPHRD_ARCNET:
ipv6_arcnet_mc_map(addr, buf);
return 0;
case ARPHRD_INFINIBAND:
ipv6_ib_mc_map(addr, dev->broadcast, buf);
return 0;
case ARPHRD_IPGRE:
return ipv6_ipgre_mc_map(addr, dev->broadcast, buf);
default:
if (dir) {
memcpy(buf, dev->broadcast, dev->addr_len);
return 0;
}
}
return -EINVAL;
}
EXPORT_SYMBOL(ndisc_mc_map);
static u32 ndisc_hash(const void *pkey,
const struct net_device *dev,
__u32 *hash_rnd)
{
return ndisc_hashfn(pkey, dev, hash_rnd);
}
static bool ndisc_key_eq(const struct neighbour *n, const void *pkey)
{
return neigh_key_eq128(n, pkey);
}
static int ndisc_constructor(struct neighbour *neigh)
{
struct in6_addr *addr = (struct in6_addr *)&neigh->primary_key;
struct net_device *dev = neigh->dev;
struct inet6_dev *in6_dev;
struct neigh_parms *parms;
bool is_multicast = ipv6_addr_is_multicast(addr);
in6_dev = in6_dev_get(dev);
if (!in6_dev) {
return -EINVAL;
}
parms = in6_dev->nd_parms;
__neigh_parms_put(neigh->parms);
neigh->parms = neigh_parms_clone(parms);
neigh->type = is_multicast ? RTN_MULTICAST : RTN_UNICAST;
if (!dev->header_ops) {
neigh->nud_state = NUD_NOARP;
neigh->ops = &ndisc_direct_ops;
neigh->output = neigh_direct_output;
} else {
if (is_multicast) {
neigh->nud_state = NUD_NOARP;
ndisc_mc_map(addr, neigh->ha, dev, 1);
} else if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) {
neigh->nud_state = NUD_NOARP;
memcpy(neigh->ha, dev->dev_addr, dev->addr_len);
if (dev->flags&IFF_LOOPBACK)
neigh->type = RTN_LOCAL;
} else if (dev->flags&IFF_POINTOPOINT) {
neigh->nud_state = NUD_NOARP;
memcpy(neigh->ha, dev->broadcast, dev->addr_len);
}
if (dev->header_ops->cache)
neigh->ops = &ndisc_hh_ops;
else
neigh->ops = &ndisc_generic_ops;
if (neigh->nud_state&NUD_VALID)
neigh->output = neigh->ops->connected_output;
else
neigh->output = neigh->ops->output;
}
in6_dev_put(in6_dev);
return 0;
}
static int pndisc_constructor(struct pneigh_entry *n)
{
struct in6_addr *addr = (struct in6_addr *)&n->key;
struct in6_addr maddr;
struct net_device *dev = n->dev;
if (!dev || !__in6_dev_get(dev))
return -EINVAL;
addrconf_addr_solict_mult(addr, &maddr);
ipv6_dev_mc_inc(dev, &maddr);
return 0;
}
static void pndisc_destructor(struct pneigh_entry *n)
{
struct in6_addr *addr = (struct in6_addr *)&n->key;
struct in6_addr maddr;
struct net_device *dev = n->dev;
if (!dev || !__in6_dev_get(dev))
return;
addrconf_addr_solict_mult(addr, &maddr);
ipv6_dev_mc_dec(dev, &maddr);
}
/* called with rtnl held */
static bool ndisc_allow_add(const struct net_device *dev,
struct netlink_ext_ack *extack)
{
struct inet6_dev *idev = __in6_dev_get(dev);
if (!idev || idev->cnf.disable_ipv6) {
NL_SET_ERR_MSG(extack, "IPv6 is disabled on this device");
return false;
}
return true;
}
static struct sk_buff *ndisc_alloc_skb(struct net_device *dev,
int len)
{
int hlen = LL_RESERVED_SPACE(dev);
int tlen = dev->needed_tailroom;
struct sock *sk = dev_net(dev)->ipv6.ndisc_sk;
struct sk_buff *skb;
skb = alloc_skb(hlen + sizeof(struct ipv6hdr) + len + tlen, GFP_ATOMIC);
if (!skb) {
ND_PRINTK(0, err, "ndisc: %s failed to allocate an skb\n",
__func__);
return NULL;
}
skb->protocol = htons(ETH_P_IPV6);
skb->dev = dev;
skb_reserve(skb, hlen + sizeof(struct ipv6hdr));
skb_reset_transport_header(skb);
/* Manually assign socket ownership as we avoid calling
* sock_alloc_send_pskb() to bypass wmem buffer limits
*/
skb_set_owner_w(skb, sk);
return skb;
}
static void ip6_nd_hdr(struct sk_buff *skb,
const struct in6_addr *saddr,
const struct in6_addr *daddr,
int hop_limit, int len)
{
struct ipv6hdr *hdr;
net: ipv6: sysctl to specify IPv6 ND traffic class Add a per-device sysctl to specify the default traffic class to use for kernel originated IPv6 Neighbour Discovery packets. Currently this includes: - Router Solicitation (ICMPv6 type 133) ndisc_send_rs() -> ndisc_send_skb() -> ip6_nd_hdr() - Neighbour Solicitation (ICMPv6 type 135) ndisc_send_ns() -> ndisc_send_skb() -> ip6_nd_hdr() - Neighbour Advertisement (ICMPv6 type 136) ndisc_send_na() -> ndisc_send_skb() -> ip6_nd_hdr() - Redirect (ICMPv6 type 137) ndisc_send_redirect() -> ndisc_send_skb() -> ip6_nd_hdr() and if the kernel ever gets around to generating RA's, it would presumably also include: - Router Advertisement (ICMPv6 type 134) (radvd daemon could pick up on the kernel setting and use it) Interface drivers may examine the Traffic Class value and translate the DiffServ Code Point into a link-layer appropriate traffic prioritization scheme. An example of mapping IETF DSCP values to IEEE 802.11 User Priority values can be found here: https://tools.ietf.org/html/draft-ietf-tsvwg-ieee-802-11 The expected primary use case is to properly prioritize ND over wifi. Testing: jzem22:~# cat /proc/sys/net/ipv6/conf/eth0/ndisc_tclass 0 jzem22:~# echo -1 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass -bash: echo: write error: Invalid argument jzem22:~# echo 256 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass -bash: echo: write error: Invalid argument jzem22:~# echo 0 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass jzem22:~# echo 255 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass jzem22:~# cat /proc/sys/net/ipv6/conf/eth0/ndisc_tclass 255 jzem22:~# echo 34 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass jzem22:~# cat /proc/sys/net/ipv6/conf/eth0/ndisc_tclass 34 jzem22:~# echo $[0xDC] > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass jzem22:~# tcpdump -v -i eth0 icmp6 and src host jzem22.pgc and dst host fe80::1 tcpdump: listening on eth0, link-type EN10MB (Ethernet), capture size 262144 bytes IP6 (class 0xdc, hlim 255, next-header ICMPv6 (58) payload length: 24) jzem22.pgc > fe80::1: [icmp6 sum ok] ICMP6, neighbor advertisement, length 24, tgt is jzem22.pgc, Flags [solicited] (based on original change written by Erik Kline, with minor changes) v2: fix 'suspicious rcu_dereference_check() usage' by explicitly grabbing the rcu_read_lock. Cc: Lorenzo Colitti <lorenzo@google.com> Signed-off-by: Erik Kline <ek@google.com> Signed-off-by: Maciej Żenczykowski <maze@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-11-08 05:52:09 +00:00
struct inet6_dev *idev;
unsigned tclass;
rcu_read_lock();
idev = __in6_dev_get(skb->dev);
tclass = idev ? READ_ONCE(idev->cnf.ndisc_tclass) : 0;
net: ipv6: sysctl to specify IPv6 ND traffic class Add a per-device sysctl to specify the default traffic class to use for kernel originated IPv6 Neighbour Discovery packets. Currently this includes: - Router Solicitation (ICMPv6 type 133) ndisc_send_rs() -> ndisc_send_skb() -> ip6_nd_hdr() - Neighbour Solicitation (ICMPv6 type 135) ndisc_send_ns() -> ndisc_send_skb() -> ip6_nd_hdr() - Neighbour Advertisement (ICMPv6 type 136) ndisc_send_na() -> ndisc_send_skb() -> ip6_nd_hdr() - Redirect (ICMPv6 type 137) ndisc_send_redirect() -> ndisc_send_skb() -> ip6_nd_hdr() and if the kernel ever gets around to generating RA's, it would presumably also include: - Router Advertisement (ICMPv6 type 134) (radvd daemon could pick up on the kernel setting and use it) Interface drivers may examine the Traffic Class value and translate the DiffServ Code Point into a link-layer appropriate traffic prioritization scheme. An example of mapping IETF DSCP values to IEEE 802.11 User Priority values can be found here: https://tools.ietf.org/html/draft-ietf-tsvwg-ieee-802-11 The expected primary use case is to properly prioritize ND over wifi. Testing: jzem22:~# cat /proc/sys/net/ipv6/conf/eth0/ndisc_tclass 0 jzem22:~# echo -1 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass -bash: echo: write error: Invalid argument jzem22:~# echo 256 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass -bash: echo: write error: Invalid argument jzem22:~# echo 0 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass jzem22:~# echo 255 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass jzem22:~# cat /proc/sys/net/ipv6/conf/eth0/ndisc_tclass 255 jzem22:~# echo 34 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass jzem22:~# cat /proc/sys/net/ipv6/conf/eth0/ndisc_tclass 34 jzem22:~# echo $[0xDC] > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass jzem22:~# tcpdump -v -i eth0 icmp6 and src host jzem22.pgc and dst host fe80::1 tcpdump: listening on eth0, link-type EN10MB (Ethernet), capture size 262144 bytes IP6 (class 0xdc, hlim 255, next-header ICMPv6 (58) payload length: 24) jzem22.pgc > fe80::1: [icmp6 sum ok] ICMP6, neighbor advertisement, length 24, tgt is jzem22.pgc, Flags [solicited] (based on original change written by Erik Kline, with minor changes) v2: fix 'suspicious rcu_dereference_check() usage' by explicitly grabbing the rcu_read_lock. Cc: Lorenzo Colitti <lorenzo@google.com> Signed-off-by: Erik Kline <ek@google.com> Signed-off-by: Maciej Żenczykowski <maze@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-11-08 05:52:09 +00:00
rcu_read_unlock();
skb_push(skb, sizeof(*hdr));
skb_reset_network_header(skb);
hdr = ipv6_hdr(skb);
net: ipv6: sysctl to specify IPv6 ND traffic class Add a per-device sysctl to specify the default traffic class to use for kernel originated IPv6 Neighbour Discovery packets. Currently this includes: - Router Solicitation (ICMPv6 type 133) ndisc_send_rs() -> ndisc_send_skb() -> ip6_nd_hdr() - Neighbour Solicitation (ICMPv6 type 135) ndisc_send_ns() -> ndisc_send_skb() -> ip6_nd_hdr() - Neighbour Advertisement (ICMPv6 type 136) ndisc_send_na() -> ndisc_send_skb() -> ip6_nd_hdr() - Redirect (ICMPv6 type 137) ndisc_send_redirect() -> ndisc_send_skb() -> ip6_nd_hdr() and if the kernel ever gets around to generating RA's, it would presumably also include: - Router Advertisement (ICMPv6 type 134) (radvd daemon could pick up on the kernel setting and use it) Interface drivers may examine the Traffic Class value and translate the DiffServ Code Point into a link-layer appropriate traffic prioritization scheme. An example of mapping IETF DSCP values to IEEE 802.11 User Priority values can be found here: https://tools.ietf.org/html/draft-ietf-tsvwg-ieee-802-11 The expected primary use case is to properly prioritize ND over wifi. Testing: jzem22:~# cat /proc/sys/net/ipv6/conf/eth0/ndisc_tclass 0 jzem22:~# echo -1 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass -bash: echo: write error: Invalid argument jzem22:~# echo 256 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass -bash: echo: write error: Invalid argument jzem22:~# echo 0 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass jzem22:~# echo 255 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass jzem22:~# cat /proc/sys/net/ipv6/conf/eth0/ndisc_tclass 255 jzem22:~# echo 34 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass jzem22:~# cat /proc/sys/net/ipv6/conf/eth0/ndisc_tclass 34 jzem22:~# echo $[0xDC] > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass jzem22:~# tcpdump -v -i eth0 icmp6 and src host jzem22.pgc and dst host fe80::1 tcpdump: listening on eth0, link-type EN10MB (Ethernet), capture size 262144 bytes IP6 (class 0xdc, hlim 255, next-header ICMPv6 (58) payload length: 24) jzem22.pgc > fe80::1: [icmp6 sum ok] ICMP6, neighbor advertisement, length 24, tgt is jzem22.pgc, Flags [solicited] (based on original change written by Erik Kline, with minor changes) v2: fix 'suspicious rcu_dereference_check() usage' by explicitly grabbing the rcu_read_lock. Cc: Lorenzo Colitti <lorenzo@google.com> Signed-off-by: Erik Kline <ek@google.com> Signed-off-by: Maciej Żenczykowski <maze@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-11-08 05:52:09 +00:00
ip6_flow_hdr(hdr, tclass, 0);
hdr->payload_len = htons(len);
hdr->nexthdr = IPPROTO_ICMPV6;
hdr->hop_limit = hop_limit;
hdr->saddr = *saddr;
hdr->daddr = *daddr;
}
void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr,
const struct in6_addr *saddr)
{
struct dst_entry *dst = skb_dst(skb);
struct net *net = dev_net(skb->dev);
struct sock *sk = net->ipv6.ndisc_sk;
struct inet6_dev *idev;
int err;
struct icmp6hdr *icmp6h = icmp6_hdr(skb);
u8 type;
type = icmp6h->icmp6_type;
if (!dst) {
struct flowi6 fl6;
int oif = skb->dev->ifindex;
icmpv6_flow_init(sk, &fl6, type, saddr, daddr, oif);
dst = icmp6_dst_alloc(skb->dev, &fl6);
if (IS_ERR(dst)) {
kfree_skb(skb);
return;
}
skb_dst_set(skb, dst);
}
icmp6h->icmp6_cksum = csum_ipv6_magic(saddr, daddr, skb->len,
IPPROTO_ICMPV6,
csum_partial(icmp6h,
skb->len, 0));
ip6_nd_hdr(skb, saddr, daddr, READ_ONCE(inet6_sk(sk)->hop_limit), skb->len);
rcu_read_lock();
idev = __in6_dev_get(dst->dev);
net: fix IPSTATS_MIB_OUTPKGS increment in OutForwDatagrams. Reproduce environment: network with 3 VM linuxs is connected as below: VM1<---->VM2(latest kernel 6.5.0-rc7)<---->VM3 VM1: eth0 ip: 192.168.122.207 MTU 1500 VM2: eth0 ip: 192.168.122.208, eth1 ip: 192.168.123.224 MTU 1500 VM3: eth0 ip: 192.168.123.240 MTU 1500 Reproduce: VM1 send 1400 bytes UDP data to VM3 using tools scapy with flags=0. scapy command: send(IP(dst="192.168.123.240",flags=0)/UDP()/str('0'*1400),count=1, inter=1.000000) Result: Before IP data is sent. ---------------------------------------------------------------------- root@qemux86-64:~# cat /proc/net/snmp Ip: Forwarding DefaultTTL InReceives InHdrErrors InAddrErrors ForwDatagrams InUnknownProtos InDiscards InDelivers OutRequests OutDiscards OutNoRoutes ReasmTimeout ReasmReqds ReasmOKs ReasmFails FragOKs FragFails FragCreates Ip: 1 64 11 0 3 4 0 0 4 7 0 0 0 0 0 0 0 0 0 ...... ---------------------------------------------------------------------- After IP data is sent. ---------------------------------------------------------------------- root@qemux86-64:~# cat /proc/net/snmp Ip: Forwarding DefaultTTL InReceives InHdrErrors InAddrErrors ForwDatagrams InUnknownProtos InDiscards InDelivers OutRequests OutDiscards OutNoRoutes ReasmTimeout ReasmReqds ReasmOKs ReasmFails FragOKs FragFails FragCreates Ip: 1 64 12 0 3 5 0 0 4 8 0 0 0 0 0 0 0 0 0 ...... ---------------------------------------------------------------------- "ForwDatagrams" increase from 4 to 5 and "OutRequests" also increase from 7 to 8. Issue description and patch: IPSTATS_MIB_OUTPKTS("OutRequests") is counted with IPSTATS_MIB_OUTOCTETS ("OutOctets") in ip_finish_output2(). According to RFC 4293, it is "OutOctets" counted with "OutTransmits" but not "OutRequests". "OutRequests" does not include any datagrams counted in "ForwDatagrams". ipSystemStatsOutOctets OBJECT-TYPE DESCRIPTION "The total number of octets in IP datagrams delivered to the lower layers for transmission. Octets from datagrams counted in ipIfStatsOutTransmits MUST be counted here. ipSystemStatsOutRequests OBJECT-TYPE DESCRIPTION "The total number of IP datagrams that local IP user- protocols (including ICMP) supplied to IP in requests for transmission. Note that this counter does not include any datagrams counted in ipSystemStatsOutForwDatagrams. So do patch to define IPSTATS_MIB_OUTPKTS to "OutTransmits" and add IPSTATS_MIB_OUTREQUESTS for "OutRequests". Add IPSTATS_MIB_OUTREQUESTS counter in __ip_local_out() for ipv4 and add IPSTATS_MIB_OUT counter in ip6_finish_output2() for ipv6. Test result with patch: Before IP data is sent. ---------------------------------------------------------------------- root@qemux86-64:~# cat /proc/net/snmp Ip: Forwarding DefaultTTL InReceives InHdrErrors InAddrErrors ForwDatagrams InUnknownProtos InDiscards InDelivers OutRequests OutDiscards OutNoRoutes ReasmTimeout ReasmReqds ReasmOKs ReasmFails FragOKs FragFails FragCreates OutTransmits Ip: 1 64 9 0 5 1 0 0 3 3 0 0 0 0 0 0 0 0 0 4 ...... root@qemux86-64:~# cat /proc/net/netstat ...... IpExt: InNoRoutes InTruncatedPkts InMcastPkts OutMcastPkts InBcastPkts OutBcastPkts InOctets OutOctets InMcastOctets OutMcastOctets InBcastOctets OutBcastOctets InCsumErrors InNoECTPkts InECT1Pkts InECT0Pkts InCEPkts ReasmOverlaps IpExt: 0 0 0 0 0 0 2976 1896 0 0 0 0 0 9 0 0 0 0 ---------------------------------------------------------------------- After IP data is sent. ---------------------------------------------------------------------- root@qemux86-64:~# cat /proc/net/snmp Ip: Forwarding DefaultTTL InReceives InHdrErrors InAddrErrors ForwDatagrams InUnknownProtos InDiscards InDelivers OutRequests OutDiscards OutNoRoutes ReasmTimeout ReasmReqds ReasmOKs ReasmFails FragOKs FragFails FragCreates OutTransmits Ip: 1 64 10 0 5 2 0 0 3 3 0 0 0 0 0 0 0 0 0 5 ...... root@qemux86-64:~# cat /proc/net/netstat ...... IpExt: InNoRoutes InTruncatedPkts InMcastPkts OutMcastPkts InBcastPkts OutBcastPkts InOctets OutOctets InMcastOctets OutMcastOctets InBcastOctets OutBcastOctets InCsumErrors InNoECTPkts InECT1Pkts InECT0Pkts InCEPkts ReasmOverlaps IpExt: 0 0 0 0 0 0 4404 3324 0 0 0 0 0 10 0 0 0 0 ---------------------------------------------------------------------- "ForwDatagrams" increase from 1 to 2 and "OutRequests" is keeping 3. "OutTransmits" increase from 4 to 5 and "OutOctets" increase 1428. Signed-off-by: Heng Guo <heng.guo@windriver.com> Reviewed-by: Kun Song <Kun.Song@windriver.com> Reviewed-by: Filip Pudak <filip.pudak@windriver.com> Reviewed-by: David Ahern <dsahern@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2023-10-19 01:20:53 +00:00
IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);
2015-09-16 01:04:16 +00:00
err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
net, sk, skb, NULL, dst->dev,
dst_output);
if (!err) {
ICMP6MSGOUT_INC_STATS(net, idev, type);
ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
}
rcu_read_unlock();
}
EXPORT_SYMBOL(ndisc_send_skb);
void ndisc_send_na(struct net_device *dev, const struct in6_addr *daddr,
const struct in6_addr *solicited_addr,
bool router, bool solicited, bool override, bool inc_opt)
{
struct sk_buff *skb;
struct in6_addr tmpaddr;
struct inet6_ifaddr *ifp;
const struct in6_addr *src_addr;
struct nd_msg *msg;
int optlen = 0;
/* for anycast or proxy, solicited_addr != src_addr */
ifp = ipv6_get_ifaddr(dev_net(dev), solicited_addr, dev, 1);
if (ifp) {
src_addr = solicited_addr;
if (ifp->flags & IFA_F_OPTIMISTIC)
override = false;
inc_opt |= READ_ONCE(ifp->idev->cnf.force_tllao);
in6_ifa_put(ifp);
} else {
if (ipv6_dev_get_saddr(dev_net(dev), dev, daddr,
inet6_sk(dev_net(dev)->ipv6.ndisc_sk)->srcprefs,
&tmpaddr))
return;
src_addr = &tmpaddr;
}
if (!dev->addr_len)
inc_opt = false;
if (inc_opt)
optlen += ndisc_opt_addr_space(dev,
NDISC_NEIGHBOUR_ADVERTISEMENT);
skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen);
if (!skb)
return;
msg = skb_put(skb, sizeof(*msg));
*msg = (struct nd_msg) {
.icmph = {
.icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT,
.icmp6_router = router,
.icmp6_solicited = solicited,
.icmp6_override = override,
},
.target = *solicited_addr,
};
if (inc_opt)
ndisc_fill_addr_option(skb, ND_OPT_TARGET_LL_ADDR,
dev->dev_addr,
NDISC_NEIGHBOUR_ADVERTISEMENT);
ndisc_send_skb(skb, daddr, src_addr);
}
static void ndisc_send_unsol_na(struct net_device *dev)
{
struct inet6_dev *idev;
struct inet6_ifaddr *ifa;
idev = in6_dev_get(dev);
if (!idev)
return;
read_lock_bh(&idev->lock);
list_for_each_entry(ifa, &idev->addr_list, if_list) {
/* skip tentative addresses until dad completes */
if (ifa->flags & IFA_F_TENTATIVE &&
!(ifa->flags & IFA_F_OPTIMISTIC))
continue;
ndisc_send_na(dev, &in6addr_linklocal_allnodes, &ifa->addr,
/*router=*/ !!idev->cnf.forwarding,
/*solicited=*/ false, /*override=*/ true,
/*inc_opt=*/ true);
}
read_unlock_bh(&idev->lock);
in6_dev_put(idev);
}
struct sk_buff *ndisc_ns_create(struct net_device *dev, const struct in6_addr *solicit,
const struct in6_addr *saddr, u64 nonce)
{
int inc_opt = dev->addr_len;
struct sk_buff *skb;
struct nd_msg *msg;
int optlen = 0;
if (!saddr)
return NULL;
if (ipv6_addr_any(saddr))
inc_opt = false;
if (inc_opt)
optlen += ndisc_opt_addr_space(dev,
NDISC_NEIGHBOUR_SOLICITATION);
if (nonce != 0)
optlen += 8;
skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen);
if (!skb)
return NULL;
msg = skb_put(skb, sizeof(*msg));
*msg = (struct nd_msg) {
.icmph = {
.icmp6_type = NDISC_NEIGHBOUR_SOLICITATION,
},
.target = *solicit,
};
if (inc_opt)
ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR,
dev->dev_addr,
NDISC_NEIGHBOUR_SOLICITATION);
if (nonce != 0) {
u8 *opt = skb_put(skb, 8);
opt[0] = ND_OPT_NONCE;
opt[1] = 8 >> 3;
memcpy(opt + 2, &nonce, 6);
}
return skb;
}
EXPORT_SYMBOL(ndisc_ns_create);
void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
const struct in6_addr *daddr, const struct in6_addr *saddr,
u64 nonce)
{
struct in6_addr addr_buf;
struct sk_buff *skb;
if (!saddr) {
if (ipv6_get_lladdr(dev, &addr_buf,
(IFA_F_TENTATIVE | IFA_F_OPTIMISTIC)))
return;
saddr = &addr_buf;
}
skb = ndisc_ns_create(dev, solicit, saddr, nonce);
if (skb)
ndisc_send_skb(skb, daddr, saddr);
}
void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr,
const struct in6_addr *daddr)
{
struct sk_buff *skb;
struct rs_msg *msg;
int send_sllao = dev->addr_len;
int optlen = 0;
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
/*
* According to section 2.2 of RFC 4429, we must not
* send router solicitations with a sllao from
* optimistic addresses, but we may send the solicitation
* if we don't include the sllao. So here we check
* if our address is optimistic, and if so, we
* suppress the inclusion of the sllao.
*/
if (send_sllao) {
struct inet6_ifaddr *ifp = ipv6_get_ifaddr(dev_net(dev), saddr,
dev, 1);
if (ifp) {
if (ifp->flags & IFA_F_OPTIMISTIC) {
send_sllao = 0;
}
in6_ifa_put(ifp);
} else {
send_sllao = 0;
}
}
#endif
if (send_sllao)
optlen += ndisc_opt_addr_space(dev, NDISC_ROUTER_SOLICITATION);
skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen);
if (!skb)
return;
msg = skb_put(skb, sizeof(*msg));
*msg = (struct rs_msg) {
.icmph = {
.icmp6_type = NDISC_ROUTER_SOLICITATION,
},
};
if (send_sllao)
ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR,
dev->dev_addr,
NDISC_ROUTER_SOLICITATION);
ndisc_send_skb(skb, daddr, saddr);
}
static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb)
{
/*
* "The sender MUST return an ICMP
* destination unreachable"
*/
dst_link_failure(skb);
kfree_skb(skb);
}
/* Called with locked neigh: either read or both */
static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb)
{
struct in6_addr *saddr = NULL;
struct in6_addr mcaddr;
struct net_device *dev = neigh->dev;
struct in6_addr *target = (struct in6_addr *)&neigh->primary_key;
int probes = atomic_read(&neigh->probes);
if (skb && ipv6_chk_addr_and_flags(dev_net(dev), &ipv6_hdr(skb)->saddr,
net/ipv6: Change address check to always take a device argument ipv6_chk_addr_and_flags determines if an address is a local address and optionally if it is an address on a specific device. For example, it is called by ip6_route_info_create to determine if a given gateway address is a local address. The address check currently does not consider L3 domains and as a result does not allow a route to be added in one VRF if the nexthop points to an address in a second VRF. e.g., $ ip route add 2001:db8:1::/64 vrf r2 via 2001:db8:102::23 Error: Invalid gateway address. where 2001:db8:102::23 is an address on an interface in vrf r1. ipv6_chk_addr_and_flags needs to allow callers to always pass in a device with a separate argument to not limit the address to the specific device. The device is used used to determine the L3 domain of interest. To that end add an argument to skip the device check and update callers to always pass a device where possible and use the new argument to mean any address in the domain. Update a handful of users of ipv6_chk_addr with a NULL dev argument. This patch handles the change to these callers without adding the domain check. ip6_validate_gw needs to handle 2 cases - one where the device is given as part of the nexthop spec and the other where the device is resolved. There is at least 1 VRF case where deferring the check to only after the route lookup has resolved the device fails with an unintuitive error "RTNETLINK answers: No route to host" as opposed to the preferred "Error: Gateway can not be a local address." The 'no route to host' error is because of the fallback to a full lookup. The check is done twice to avoid this error. Signed-off-by: David Ahern <dsahern@gmail.com> Reviewed-by: Ido Schimmel <idosch@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2018-03-13 15:29:37 +00:00
dev, false, 1,
IFA_F_TENTATIVE|IFA_F_OPTIMISTIC))
saddr = &ipv6_hdr(skb)->saddr;
probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES);
if (probes < 0) {
if (!(READ_ONCE(neigh->nud_state) & NUD_VALID)) {
ND_PRINTK(1, dbg,
"%s: trying to ucast probe in NUD_INVALID: %pI6\n",
__func__, target);
}
ndisc_send_ns(dev, target, target, saddr, 0);
} else if ((probes -= NEIGH_VAR(neigh->parms, APP_PROBES)) < 0) {
neigh_app_ns(neigh);
} else {
addrconf_addr_solict_mult(target, &mcaddr);
ndisc_send_ns(dev, target, &mcaddr, saddr, 0);
}
}
static int pndisc_is_router(const void *pkey,
struct net_device *dev)
{
struct pneigh_entry *n;
int ret = -1;
read_lock_bh(&nd_tbl.lock);
n = __pneigh_lookup(&nd_tbl, dev_net(dev), pkey, dev);
if (n)
ret = !!(n->flags & NTF_ROUTER);
read_unlock_bh(&nd_tbl.lock);
return ret;
}
void ndisc_update(const struct net_device *dev, struct neighbour *neigh,
const u8 *lladdr, u8 new, u32 flags, u8 icmp6_type,
struct ndisc_options *ndopts)
{
neigh_update(neigh, lladdr, new, flags, 0);
/* report ndisc ops about neighbour update */
ndisc_ops_update(dev, neigh, flags, icmp6_type, ndopts);
}
static enum skb_drop_reason ndisc_recv_ns(struct sk_buff *skb)
{
struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb);
const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr;
const struct in6_addr *daddr = &ipv6_hdr(skb)->daddr;
u8 *lladdr = NULL;
u32 ndoptlen = skb_tail_pointer(skb) - (skb_transport_header(skb) +
offsetof(struct nd_msg, opt));
struct ndisc_options ndopts;
struct net_device *dev = skb->dev;
struct inet6_ifaddr *ifp;
struct inet6_dev *idev = NULL;
struct neighbour *neigh;
int dad = ipv6_addr_any(saddr);
int is_router = -1;
SKB_DR(reason);
u64 nonce = 0;
bool inc;
if (skb->len < sizeof(struct nd_msg))
return SKB_DROP_REASON_PKT_TOO_SMALL;
if (ipv6_addr_is_multicast(&msg->target)) {
ND_PRINTK(2, warn, "NS: multicast target address\n");
return reason;
}
/*
* RFC2461 7.1.1:
* DAD has to be destined for solicited node multicast address.
*/
if (dad && !ipv6_addr_is_solict_mult(daddr)) {
ND_PRINTK(2, warn, "NS: bad DAD packet (wrong destination)\n");
return reason;
}
if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts))
return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS;
if (ndopts.nd_opts_src_lladdr) {
lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, dev);
if (!lladdr) {
ND_PRINTK(2, warn,
"NS: invalid link-layer address length\n");
return reason;
}
/* RFC2461 7.1.1:
* If the IP source address is the unspecified address,
* there MUST NOT be source link-layer address option
* in the message.
*/
if (dad) {
ND_PRINTK(2, warn,
"NS: bad DAD packet (link-layer address option)\n");
return reason;
}
}
if (ndopts.nd_opts_nonce && ndopts.nd_opts_nonce->nd_opt_len == 1)
memcpy(&nonce, (u8 *)(ndopts.nd_opts_nonce + 1), 6);
inc = ipv6_addr_is_multicast(daddr);
ifp = ipv6_get_ifaddr(dev_net(dev), &msg->target, dev, 1);
if (ifp) {
have_ifp:
if (ifp->flags & (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)) {
if (dad) {
if (nonce != 0 && ifp->dad_nonce == nonce) {
u8 *np = (u8 *)&nonce;
/* Matching nonce if looped back */
ND_PRINTK(2, notice,
"%s: IPv6 DAD loopback for address %pI6c nonce %pM ignored\n",
ifp->idev->dev->name,
&ifp->addr, np);
goto out;
}
/*
* We are colliding with another node
* who is doing DAD
* so fail our DAD process
*/
addrconf_dad_failure(skb, ifp);
return reason;
} else {
/*
* This is not a dad solicitation.
* If we are an optimistic node,
* we should respond.
* Otherwise, we should ignore it.
*/
if (!(ifp->flags & IFA_F_OPTIMISTIC))
goto out;
}
}
idev = ifp->idev;
} else {
struct net *net = dev_net(dev);
/* perhaps an address on the master device */
if (netif_is_l3_slave(dev)) {
struct net_device *mdev;
mdev = netdev_master_upper_dev_get_rcu(dev);
if (mdev) {
ifp = ipv6_get_ifaddr(net, &msg->target, mdev, 1);
if (ifp)
goto have_ifp;
}
}
idev = in6_dev_get(dev);
if (!idev) {
/* XXX: count this drop? */
return reason;
}
if (ipv6_chk_acast_addr(net, dev, &msg->target) ||
(READ_ONCE(idev->cnf.forwarding) &&
(READ_ONCE(net->ipv6.devconf_all->proxy_ndp) ||
READ_ONCE(idev->cnf.proxy_ndp)) &&
(is_router = pndisc_is_router(&msg->target, dev)) >= 0)) {
if (!(NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED) &&
skb->pkt_type != PACKET_HOST &&
inc &&
NEIGH_VAR(idev->nd_parms, PROXY_DELAY) != 0) {
/*
* for anycast or proxy,
* sender should delay its response
* by a random time between 0 and
* MAX_ANYCAST_DELAY_TIME seconds.
* (RFC2461) -- yoshfuji
*/
struct sk_buff *n = skb_clone(skb, GFP_ATOMIC);
if (n)
pneigh_enqueue(&nd_tbl, idev->nd_parms, n);
goto out;
}
} else {
SKB_DR_SET(reason, IPV6_NDISC_NS_OTHERHOST);
goto out;
}
}
if (is_router < 0)
is_router = READ_ONCE(idev->cnf.forwarding);
if (dad) {
ndisc_send_na(dev, &in6addr_linklocal_allnodes, &msg->target,
!!is_router, false, (ifp != NULL), true);
goto out;
}
if (inc)
NEIGH_CACHE_STAT_INC(&nd_tbl, rcv_probes_mcast);
else
NEIGH_CACHE_STAT_INC(&nd_tbl, rcv_probes_ucast);
/*
* update / create cache entry
* for the source address
*/
neigh = __neigh_lookup(&nd_tbl, saddr, dev,
!inc || lladdr || !dev->addr_len);
if (neigh)
ndisc_update(dev, neigh, lladdr, NUD_STALE,
NEIGH_UPDATE_F_WEAK_OVERRIDE|
NEIGH_UPDATE_F_OVERRIDE,
NDISC_NEIGHBOUR_SOLICITATION, &ndopts);
if (neigh || !dev->header_ops) {
ndisc_send_na(dev, saddr, &msg->target, !!is_router,
true, (ifp != NULL && inc), inc);
if (neigh)
neigh_release(neigh);
reason = SKB_CONSUMED;
}
out:
if (ifp)
in6_ifa_put(ifp);
else
in6_dev_put(idev);
return reason;
}
static int accept_untracked_na(struct net_device *dev, struct in6_addr *saddr)
{
struct inet6_dev *idev = __in6_dev_get(dev);
switch (READ_ONCE(idev->cnf.accept_untracked_na)) {
case 0: /* Don't accept untracked na (absent in neighbor cache) */
return 0;
case 1: /* Create new entries from na if currently untracked */
return 1;
case 2: /* Create new entries from untracked na only if saddr is in the
* same subnet as an address configured on the interface that
* received the na
*/
return !!ipv6_chk_prefix(saddr, dev);
default:
return 0;
}
}
static enum skb_drop_reason ndisc_recv_na(struct sk_buff *skb)
{
struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb);
struct in6_addr *saddr = &ipv6_hdr(skb)->saddr;
const struct in6_addr *daddr = &ipv6_hdr(skb)->daddr;
u8 *lladdr = NULL;
u32 ndoptlen = skb_tail_pointer(skb) - (skb_transport_header(skb) +
offsetof(struct nd_msg, opt));
struct ndisc_options ndopts;
struct net_device *dev = skb->dev;
struct inet6_dev *idev = __in6_dev_get(dev);
struct inet6_ifaddr *ifp;
struct neighbour *neigh;
SKB_DR(reason);
u8 new_state;
if (skb->len < sizeof(struct nd_msg))
return SKB_DROP_REASON_PKT_TOO_SMALL;
if (ipv6_addr_is_multicast(&msg->target)) {
ND_PRINTK(2, warn, "NA: target address is multicast\n");
return reason;
}
if (ipv6_addr_is_multicast(daddr) &&
msg->icmph.icmp6_solicited) {
ND_PRINTK(2, warn, "NA: solicited NA is multicasted\n");
return reason;
}
/* For some 802.11 wireless deployments (and possibly other networks),
* there will be a NA proxy and unsolicitd packets are attacks
* and thus should not be accepted.
* drop_unsolicited_na takes precedence over accept_untracked_na
*/
if (!msg->icmph.icmp6_solicited && idev &&
READ_ONCE(idev->cnf.drop_unsolicited_na))
return reason;
if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts))
return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS;
if (ndopts.nd_opts_tgt_lladdr) {
lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, dev);
if (!lladdr) {
ND_PRINTK(2, warn,
"NA: invalid link-layer address length\n");
return reason;
}
}
ifp = ipv6_get_ifaddr(dev_net(dev), &msg->target, dev, 1);
if (ifp) {
if (skb->pkt_type != PACKET_LOOPBACK
&& (ifp->flags & IFA_F_TENTATIVE)) {
addrconf_dad_failure(skb, ifp);
return reason;
}
/* What should we make now? The advertisement
is invalid, but ndisc specs say nothing
about it. It could be misconfiguration, or
an smart proxy agent tries to help us :-)
We should not print the error if NA has been
received from loopback - it is just our own
unsolicited advertisement.
*/
if (skb->pkt_type != PACKET_LOOPBACK)
ND_PRINTK(1, warn,
"NA: %pM advertised our address %pI6c on %s!\n",
eth_hdr(skb)->h_source, &ifp->addr, ifp->idev->dev->name);
in6_ifa_put(ifp);
return reason;
}
neigh = neigh_lookup(&nd_tbl, &msg->target, dev);
/* RFC 9131 updates original Neighbour Discovery RFC 4861.
* NAs with Target LL Address option without a corresponding
* entry in the neighbour cache can now create a STALE neighbour
* cache entry on routers.
*
* entry accept fwding solicited behaviour
* ------- ------ ------ --------- ----------------------
* present X X 0 Set state to STALE
* present X X 1 Set state to REACHABLE
* absent 0 X X Do nothing
* absent 1 0 X Do nothing
* absent 1 1 X Add a new STALE entry
*
* Note that we don't do a (daddr == all-routers-mcast) check.
*/
new_state = msg->icmph.icmp6_solicited ? NUD_REACHABLE : NUD_STALE;
if (!neigh && lladdr && idev && READ_ONCE(idev->cnf.forwarding)) {
if (accept_untracked_na(dev, saddr)) {
neigh = neigh_create(&nd_tbl, &msg->target, dev);
new_state = NUD_STALE;
}
}
if (neigh && !IS_ERR(neigh)) {
u8 old_flags = neigh->flags;
struct net *net = dev_net(dev);
if (READ_ONCE(neigh->nud_state) & NUD_FAILED)
goto out;
/*
* Don't update the neighbor cache entry on a proxy NA from
* ourselves because either the proxied node is off link or it
* has already sent a NA to us.
*/
if (lladdr && !memcmp(lladdr, dev->dev_addr, dev->addr_len) &&
READ_ONCE(net->ipv6.devconf_all->forwarding) &&
READ_ONCE(net->ipv6.devconf_all->proxy_ndp) &&
pneigh_lookup(&nd_tbl, net, &msg->target, dev, 0)) {
/* XXX: idev->cnf.proxy_ndp */
goto out;
}
ndisc_update(dev, neigh, lladdr,
new_state,
NEIGH_UPDATE_F_WEAK_OVERRIDE|
(msg->icmph.icmp6_override ? NEIGH_UPDATE_F_OVERRIDE : 0)|
NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
(msg->icmph.icmp6_router ? NEIGH_UPDATE_F_ISROUTER : 0),
NDISC_NEIGHBOUR_ADVERTISEMENT, &ndopts);
if ((old_flags & ~neigh->flags) & NTF_ROUTER) {
/*
* Change: router to host
*/
rt6_clean_tohost(dev_net(dev), saddr);
}
reason = SKB_CONSUMED;
out:
neigh_release(neigh);
}
return reason;
}
static enum skb_drop_reason ndisc_recv_rs(struct sk_buff *skb)
{
struct rs_msg *rs_msg = (struct rs_msg *)skb_transport_header(skb);
unsigned long ndoptlen = skb->len - sizeof(*rs_msg);
struct neighbour *neigh;
struct inet6_dev *idev;
const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr;
struct ndisc_options ndopts;
u8 *lladdr = NULL;
SKB_DR(reason);
if (skb->len < sizeof(*rs_msg))
return SKB_DROP_REASON_PKT_TOO_SMALL;
idev = __in6_dev_get(skb->dev);
if (!idev) {
ND_PRINTK(1, err, "RS: can't find in6 device\n");
return reason;
}
/* Don't accept RS if we're not in router mode */
if (!READ_ONCE(idev->cnf.forwarding))
goto out;
/*
* Don't update NCE if src = ::;
* this implies that the source node has no ip address assigned yet.
*/
if (ipv6_addr_any(saddr))
goto out;
/* Parse ND options */
if (!ndisc_parse_options(skb->dev, rs_msg->opt, ndoptlen, &ndopts))
return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS;
if (ndopts.nd_opts_src_lladdr) {
lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr,
skb->dev);
if (!lladdr)
goto out;
}
neigh = __neigh_lookup(&nd_tbl, saddr, skb->dev, 1);
if (neigh) {
ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
NEIGH_UPDATE_F_WEAK_OVERRIDE|
NEIGH_UPDATE_F_OVERRIDE|
NEIGH_UPDATE_F_OVERRIDE_ISROUTER,
NDISC_ROUTER_SOLICITATION, &ndopts);
neigh_release(neigh);
reason = SKB_CONSUMED;
}
out:
return reason;
}
static void ndisc_ra_useropt(struct sk_buff *ra, struct nd_opt_hdr *opt)
{
struct icmp6hdr *icmp6h = (struct icmp6hdr *)skb_transport_header(ra);
struct sk_buff *skb;
struct nlmsghdr *nlh;
struct nduseroptmsg *ndmsg;
struct net *net = dev_net(ra->dev);
int err;
int base_size = NLMSG_ALIGN(sizeof(struct nduseroptmsg)
+ (opt->nd_opt_len << 3));
size_t msg_size = base_size + nla_total_size(sizeof(struct in6_addr));
skb = nlmsg_new(msg_size, GFP_ATOMIC);
if (!skb) {
err = -ENOBUFS;
goto errout;
}
nlh = nlmsg_put(skb, 0, 0, RTM_NEWNDUSEROPT, base_size, 0);
if (!nlh) {
goto nla_put_failure;
}
ndmsg = nlmsg_data(nlh);
ndmsg->nduseropt_family = AF_INET6;
ndmsg->nduseropt_ifindex = ra->dev->ifindex;
ndmsg->nduseropt_icmp_type = icmp6h->icmp6_type;
ndmsg->nduseropt_icmp_code = icmp6h->icmp6_code;
ndmsg->nduseropt_opts_len = opt->nd_opt_len << 3;
memcpy(ndmsg + 1, opt, opt->nd_opt_len << 3);
if (nla_put_in6_addr(skb, NDUSEROPT_SRCADDR, &ipv6_hdr(ra)->saddr))
goto nla_put_failure;
nlmsg_end(skb, nlh);
2009-02-25 07:18:28 +00:00
rtnl_notify(skb, net, 0, RTNLGRP_ND_USEROPT, NULL, GFP_ATOMIC);
return;
nla_put_failure:
nlmsg_free(skb);
err = -EMSGSIZE;
errout:
rtnl_set_sk_err(net, RTNLGRP_ND_USEROPT, err);
}
static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb)
{
struct ra_msg *ra_msg = (struct ra_msg *)skb_transport_header(skb);
bool send_ifinfo_notify = false;
struct neighbour *neigh = NULL;
struct ndisc_options ndopts;
struct fib6_info *rt = NULL;
struct inet6_dev *in6_dev;
struct fib6_table *table;
net: allow user to set metric on default route learned via Router Advertisement For IPv4, default route is learned via DHCPv4 and user is allowed to change metric using config etc/network/interfaces. But for IPv6, default route can be learned via RA, for which, currently a fixed metric value 1024 is used. Ideally, user should be able to configure metric on default route for IPv6 similar to IPv4. This patch adds sysctl for the same. Logs: For IPv4: Config in etc/network/interfaces: auto eth0 iface eth0 inet dhcp metric 4261413864 IPv4 Kernel Route Table: $ ip route list default via 172.21.47.1 dev eth0 metric 4261413864 FRR Table, if a static route is configured: [In real scenario, it is useful to prefer BGP learned default route over DHCPv4 default route.] Codes: K - kernel route, C - connected, S - static, R - RIP, O - OSPF, I - IS-IS, B - BGP, P - PIM, E - EIGRP, N - NHRP, T - Table, v - VNC, V - VNC-Direct, A - Babel, D - SHARP, > - selected route, * - FIB route S>* 0.0.0.0/0 [20/0] is directly connected, eth0, 00:00:03 K 0.0.0.0/0 [254/1000] via 172.21.47.1, eth0, 6d08h51m i.e. User can prefer Default Router learned via Routing Protocol in IPv4. Similar behavior is not possible for IPv6, without this fix. After fix [for IPv6]: sudo sysctl -w net.ipv6.conf.eth0.net.ipv6.conf.eth0.ra_defrtr_metric=1996489705 IP monitor: [When IPv6 RA is received] default via fe80::xx16:xxxx:feb3:ce8e dev eth0 proto ra metric 1996489705 pref high Kernel IPv6 routing table $ ip -6 route list default via fe80::be16:65ff:feb3:ce8e dev eth0 proto ra metric 1996489705 expires 21sec hoplimit 64 pref high FRR Table, if a static route is configured: [In real scenario, it is useful to prefer BGP learned default route over IPv6 RA default route.] Codes: K - kernel route, C - connected, S - static, R - RIPng, O - OSPFv3, I - IS-IS, B - BGP, N - NHRP, T - Table, v - VNC, V - VNC-Direct, A - Babel, D - SHARP, > - selected route, * - FIB route S>* ::/0 [20/0] is directly connected, eth0, 00:00:06 K ::/0 [119/1001] via fe80::xx16:xxxx:feb3:ce8e, eth0, 6d07h43m If the metric is changed later, the effect will be seen only when next IPv6 RA is received, because the default route must be fully controlled by RA msg. Below metric is changed from 1996489705 to 1996489704. $ sudo sysctl -w net.ipv6.conf.eth0.ra_defrtr_metric=1996489704 net.ipv6.conf.eth0.ra_defrtr_metric = 1996489704 IP monitor: [On next IPv6 RA msg, Kernel deletes prev route and installs new route with updated metric] Deleted default via fe80::xx16:xxxx:feb3:ce8e dev eth0 proto ra metric 1996489705 expires 3sec hoplimit 64 pref high default via fe80::xx16:xxxx:feb3:ce8e dev eth0 proto ra metric 1996489704 pref high Signed-off-by: Praveen Chaudhary <pchaudhary@linkedin.com> Signed-off-by: Zhenggen Xu <zxu@linkedin.com> Reviewed-by: David Ahern <dsahern@kernel.org> Link: https://lore.kernel.org/r/20210125214430.24079-1-pchaudhary@linkedin.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-01-25 21:44:30 +00:00
u32 defrtr_usr_metric;
unsigned int pref = 0;
__u32 old_if_flags;
struct net *net;
SKB_DR(reason);
int lifetime;
int optlen;
__u8 *opt = (__u8 *)(ra_msg + 1);
optlen = (skb_tail_pointer(skb) - skb_transport_header(skb)) -
sizeof(struct ra_msg);
ND_PRINTK(2, info,
"RA: %s, dev: %s\n",
__func__, skb->dev->name);
if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) {
ND_PRINTK(2, warn, "RA: source address is not link-local\n");
return reason;
}
if (optlen < 0)
return SKB_DROP_REASON_PKT_TOO_SMALL;
#ifdef CONFIG_IPV6_NDISC_NODETYPE
if (skb->ndisc_nodetype == NDISC_NODETYPE_HOST) {
ND_PRINTK(2, warn, "RA: from host or unauthorized router\n");
return reason;
}
#endif
in6_dev = __in6_dev_get(skb->dev);
if (!in6_dev) {
ND_PRINTK(0, err, "RA: can't find inet6 device for %s\n",
skb->dev->name);
return reason;
}
if (!ndisc_parse_options(skb->dev, opt, optlen, &ndopts))
return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS;
if (!ipv6_accept_ra(in6_dev)) {
ND_PRINTK(2, info,
"RA: %s, did not accept ra for dev: %s\n",
__func__, skb->dev->name);
goto skip_linkparms;
}
#ifdef CONFIG_IPV6_NDISC_NODETYPE
/* skip link-specific parameters from interior routers */
if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) {
ND_PRINTK(2, info,
"RA: %s, nodetype is NODEFAULT, dev: %s\n",
__func__, skb->dev->name);
goto skip_linkparms;
}
#endif
if (in6_dev->if_flags & IF_RS_SENT) {
/*
* flag that an RA was received after an RS was sent
* out on this interface.
*/
in6_dev->if_flags |= IF_RA_RCVD;
}
/*
* Remember the managed/otherconf flags from most recently
* received RA message (RFC 2462) -- yoshfuji
*/
old_if_flags = in6_dev->if_flags;
in6_dev->if_flags = (in6_dev->if_flags & ~(IF_RA_MANAGED |
IF_RA_OTHERCONF)) |
(ra_msg->icmph.icmp6_addrconf_managed ?
IF_RA_MANAGED : 0) |
(ra_msg->icmph.icmp6_addrconf_other ?
IF_RA_OTHERCONF : 0);
if (old_if_flags != in6_dev->if_flags)
send_ifinfo_notify = true;
if (!READ_ONCE(in6_dev->cnf.accept_ra_defrtr)) {
ND_PRINTK(2, info,
"RA: %s, defrtr is false for dev: %s\n",
__func__, skb->dev->name);
goto skip_defrtr;
}
net: change accept_ra_min_rtr_lft to affect all RA lifetimes accept_ra_min_rtr_lft only considered the lifetime of the default route and discarded entire RAs accordingly. This change renames accept_ra_min_rtr_lft to accept_ra_min_lft, and applies the value to individual RA sections; in particular, router lifetime, PIO preferred lifetime, and RIO lifetime. If any of those lifetimes are lower than the configured value, the specific RA section is ignored. In order for the sysctl to be useful to Android, it should really apply to all lifetimes in the RA, since that is what determines the minimum frequency at which RAs must be processed by the kernel. Android uses hardware offloads to drop RAs for a fraction of the minimum of all lifetimes present in the RA (some networks have very frequent RAs (5s) with high lifetimes (2h)). Despite this, we have encountered networks that set the router lifetime to 30s which results in very frequent CPU wakeups. Instead of disabling IPv6 (and dropping IPv6 ethertype in the WiFi firmware) entirely on such networks, it seems better to ignore the misconfigured routers while still processing RAs from other IPv6 routers on the same network (i.e. to support IoT applications). The previous implementation dropped the entire RA based on router lifetime. This turned out to be hard to expand to the other lifetimes present in the RA in a consistent manner; dropping the entire RA based on RIO/PIO lifetimes would essentially require parsing the whole thing twice. Fixes: 1671bcfd76fd ("net: add sysctl accept_ra_min_rtr_lft") Cc: Lorenzo Colitti <lorenzo@google.com> Signed-off-by: Patrick Rohr <prohr@google.com> Reviewed-by: Maciej Żenczykowski <maze@google.com> Reviewed-by: David Ahern <dsahern@kernel.org> Link: https://lore.kernel.org/r/20230726230701.919212-1-prohr@google.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2023-07-26 23:07:01 +00:00
lifetime = ntohs(ra_msg->icmph.icmp6_rt_lifetime);
if (lifetime != 0 &&
lifetime < READ_ONCE(in6_dev->cnf.accept_ra_min_lft)) {
net: change accept_ra_min_rtr_lft to affect all RA lifetimes accept_ra_min_rtr_lft only considered the lifetime of the default route and discarded entire RAs accordingly. This change renames accept_ra_min_rtr_lft to accept_ra_min_lft, and applies the value to individual RA sections; in particular, router lifetime, PIO preferred lifetime, and RIO lifetime. If any of those lifetimes are lower than the configured value, the specific RA section is ignored. In order for the sysctl to be useful to Android, it should really apply to all lifetimes in the RA, since that is what determines the minimum frequency at which RAs must be processed by the kernel. Android uses hardware offloads to drop RAs for a fraction of the minimum of all lifetimes present in the RA (some networks have very frequent RAs (5s) with high lifetimes (2h)). Despite this, we have encountered networks that set the router lifetime to 30s which results in very frequent CPU wakeups. Instead of disabling IPv6 (and dropping IPv6 ethertype in the WiFi firmware) entirely on such networks, it seems better to ignore the misconfigured routers while still processing RAs from other IPv6 routers on the same network (i.e. to support IoT applications). The previous implementation dropped the entire RA based on router lifetime. This turned out to be hard to expand to the other lifetimes present in the RA in a consistent manner; dropping the entire RA based on RIO/PIO lifetimes would essentially require parsing the whole thing twice. Fixes: 1671bcfd76fd ("net: add sysctl accept_ra_min_rtr_lft") Cc: Lorenzo Colitti <lorenzo@google.com> Signed-off-by: Patrick Rohr <prohr@google.com> Reviewed-by: Maciej Żenczykowski <maze@google.com> Reviewed-by: David Ahern <dsahern@kernel.org> Link: https://lore.kernel.org/r/20230726230701.919212-1-prohr@google.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2023-07-26 23:07:01 +00:00
ND_PRINTK(2, info,
"RA: router lifetime (%ds) is too short: %s\n",
lifetime, skb->dev->name);
goto skip_defrtr;
}
/* Do not accept RA with source-addr found on local machine unless
* accept_ra_from_local is set to true.
*/
net = dev_net(in6_dev->dev);
if (!READ_ONCE(in6_dev->cnf.accept_ra_from_local) &&
ipv6_chk_addr(net, &ipv6_hdr(skb)->saddr, in6_dev->dev, 0)) {
ND_PRINTK(2, info,
"RA from local address detected on dev: %s: default router ignored\n",
skb->dev->name);
goto skip_defrtr;
}
#ifdef CONFIG_IPV6_ROUTER_PREF
pref = ra_msg->icmph.icmp6_router_pref;
/* 10b is handled as if it were 00b (medium) */
if (pref == ICMPV6_ROUTER_PREF_INVALID ||
!READ_ONCE(in6_dev->cnf.accept_ra_rtr_pref))
pref = ICMPV6_ROUTER_PREF_MEDIUM;
#endif
ipv6: Plumb support for nexthop object in a fib6_info Add struct nexthop and nh_list list_head to fib6_info. nh_list is the fib6_info side of the nexthop <-> fib_info relationship. Since a fib6_info referencing a nexthop object can not have 'sibling' entries (the old way of doing multipath routes), the nh_list is a union with fib6_siblings. Add f6i_list list_head to 'struct nexthop' to track fib6_info entries using a nexthop instance. Update __remove_nexthop_fib to walk f6_list and delete fib entries using the nexthop. Add a few nexthop helpers for use when a nexthop is added to fib6_info: - nexthop_fib6_nh - return first fib6_nh in a nexthop object - fib6_info_nh_dev moved to nexthop.h and updated to use nexthop_fib6_nh if the fib6_info references a nexthop object - nexthop_path_fib6_result - similar to ipv4, select a path within a multipath nexthop object. If the nexthop is a blackhole, set fib6_result type to RTN_BLACKHOLE, and set the REJECT flag Update the fib6_info references to check for nh and take a different path as needed: - rt6_qualify_for_ecmp - if a fib entry uses a nexthop object it can NOT be coalesced with other fib entries into a multipath route - rt6_duplicate_nexthop - use nexthop_cmp if either fib6_info references a nexthop - addrconf (host routes), RA's and info entries (anything configured via ndisc) does not use nexthop objects - fib6_info_destroy_rcu - put reference to nexthop object - fib6_purge_rt - drop fib6_info from f6i_list - fib6_select_path - update to use the new nexthop_path_fib6_result when fib entry uses a nexthop object - rt6_device_match - update to catch use of nexthop object as a blackhole and set fib6_type and flags. - ip6_route_info_create - don't add space for fib6_nh if fib entry is going to reference a nexthop object, take a reference to nexthop object, disallow use of source routing - rt6_nlmsg_size - add space for RTA_NH_ID - add rt6_fill_node_nexthop to add nexthop data on a dump As with ipv4, most of the changes push existing code into the else branch of whether the fib entry uses a nexthop object. Update the nexthop code to walk f6i_list on a nexthop deleted to remove fib entries referencing it. Signed-off-by: David Ahern <dsahern@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-06-04 03:19:52 +00:00
/* routes added from RAs do not use nexthop objects */
rt = rt6_get_dflt_router(net, &ipv6_hdr(skb)->saddr, skb->dev);
if (rt) {
neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6,
rt->fib6_nh->fib_nh_dev, NULL,
&ipv6_hdr(skb)->saddr);
if (!neigh) {
ND_PRINTK(0, err,
"RA: %s got default router without neighbour\n",
__func__);
fib6_info_release(rt);
return reason;
}
}
net: allow user to set metric on default route learned via Router Advertisement For IPv4, default route is learned via DHCPv4 and user is allowed to change metric using config etc/network/interfaces. But for IPv6, default route can be learned via RA, for which, currently a fixed metric value 1024 is used. Ideally, user should be able to configure metric on default route for IPv6 similar to IPv4. This patch adds sysctl for the same. Logs: For IPv4: Config in etc/network/interfaces: auto eth0 iface eth0 inet dhcp metric 4261413864 IPv4 Kernel Route Table: $ ip route list default via 172.21.47.1 dev eth0 metric 4261413864 FRR Table, if a static route is configured: [In real scenario, it is useful to prefer BGP learned default route over DHCPv4 default route.] Codes: K - kernel route, C - connected, S - static, R - RIP, O - OSPF, I - IS-IS, B - BGP, P - PIM, E - EIGRP, N - NHRP, T - Table, v - VNC, V - VNC-Direct, A - Babel, D - SHARP, > - selected route, * - FIB route S>* 0.0.0.0/0 [20/0] is directly connected, eth0, 00:00:03 K 0.0.0.0/0 [254/1000] via 172.21.47.1, eth0, 6d08h51m i.e. User can prefer Default Router learned via Routing Protocol in IPv4. Similar behavior is not possible for IPv6, without this fix. After fix [for IPv6]: sudo sysctl -w net.ipv6.conf.eth0.net.ipv6.conf.eth0.ra_defrtr_metric=1996489705 IP monitor: [When IPv6 RA is received] default via fe80::xx16:xxxx:feb3:ce8e dev eth0 proto ra metric 1996489705 pref high Kernel IPv6 routing table $ ip -6 route list default via fe80::be16:65ff:feb3:ce8e dev eth0 proto ra metric 1996489705 expires 21sec hoplimit 64 pref high FRR Table, if a static route is configured: [In real scenario, it is useful to prefer BGP learned default route over IPv6 RA default route.] Codes: K - kernel route, C - connected, S - static, R - RIPng, O - OSPFv3, I - IS-IS, B - BGP, N - NHRP, T - Table, v - VNC, V - VNC-Direct, A - Babel, D - SHARP, > - selected route, * - FIB route S>* ::/0 [20/0] is directly connected, eth0, 00:00:06 K ::/0 [119/1001] via fe80::xx16:xxxx:feb3:ce8e, eth0, 6d07h43m If the metric is changed later, the effect will be seen only when next IPv6 RA is received, because the default route must be fully controlled by RA msg. Below metric is changed from 1996489705 to 1996489704. $ sudo sysctl -w net.ipv6.conf.eth0.ra_defrtr_metric=1996489704 net.ipv6.conf.eth0.ra_defrtr_metric = 1996489704 IP monitor: [On next IPv6 RA msg, Kernel deletes prev route and installs new route with updated metric] Deleted default via fe80::xx16:xxxx:feb3:ce8e dev eth0 proto ra metric 1996489705 expires 3sec hoplimit 64 pref high default via fe80::xx16:xxxx:feb3:ce8e dev eth0 proto ra metric 1996489704 pref high Signed-off-by: Praveen Chaudhary <pchaudhary@linkedin.com> Signed-off-by: Zhenggen Xu <zxu@linkedin.com> Reviewed-by: David Ahern <dsahern@kernel.org> Link: https://lore.kernel.org/r/20210125214430.24079-1-pchaudhary@linkedin.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-01-25 21:44:30 +00:00
/* Set default route metric as specified by user */
defrtr_usr_metric = in6_dev->cnf.ra_defrtr_metric;
/* delete the route if lifetime is 0 or if metric needs change */
if (rt && (lifetime == 0 || rt->fib6_metric != defrtr_usr_metric)) {
ip6_del_rt(net, rt, false);
rt = NULL;
}
net: allow user to set metric on default route learned via Router Advertisement For IPv4, default route is learned via DHCPv4 and user is allowed to change metric using config etc/network/interfaces. But for IPv6, default route can be learned via RA, for which, currently a fixed metric value 1024 is used. Ideally, user should be able to configure metric on default route for IPv6 similar to IPv4. This patch adds sysctl for the same. Logs: For IPv4: Config in etc/network/interfaces: auto eth0 iface eth0 inet dhcp metric 4261413864 IPv4 Kernel Route Table: $ ip route list default via 172.21.47.1 dev eth0 metric 4261413864 FRR Table, if a static route is configured: [In real scenario, it is useful to prefer BGP learned default route over DHCPv4 default route.] Codes: K - kernel route, C - connected, S - static, R - RIP, O - OSPF, I - IS-IS, B - BGP, P - PIM, E - EIGRP, N - NHRP, T - Table, v - VNC, V - VNC-Direct, A - Babel, D - SHARP, > - selected route, * - FIB route S>* 0.0.0.0/0 [20/0] is directly connected, eth0, 00:00:03 K 0.0.0.0/0 [254/1000] via 172.21.47.1, eth0, 6d08h51m i.e. User can prefer Default Router learned via Routing Protocol in IPv4. Similar behavior is not possible for IPv6, without this fix. After fix [for IPv6]: sudo sysctl -w net.ipv6.conf.eth0.net.ipv6.conf.eth0.ra_defrtr_metric=1996489705 IP monitor: [When IPv6 RA is received] default via fe80::xx16:xxxx:feb3:ce8e dev eth0 proto ra metric 1996489705 pref high Kernel IPv6 routing table $ ip -6 route list default via fe80::be16:65ff:feb3:ce8e dev eth0 proto ra metric 1996489705 expires 21sec hoplimit 64 pref high FRR Table, if a static route is configured: [In real scenario, it is useful to prefer BGP learned default route over IPv6 RA default route.] Codes: K - kernel route, C - connected, S - static, R - RIPng, O - OSPFv3, I - IS-IS, B - BGP, N - NHRP, T - Table, v - VNC, V - VNC-Direct, A - Babel, D - SHARP, > - selected route, * - FIB route S>* ::/0 [20/0] is directly connected, eth0, 00:00:06 K ::/0 [119/1001] via fe80::xx16:xxxx:feb3:ce8e, eth0, 6d07h43m If the metric is changed later, the effect will be seen only when next IPv6 RA is received, because the default route must be fully controlled by RA msg. Below metric is changed from 1996489705 to 1996489704. $ sudo sysctl -w net.ipv6.conf.eth0.ra_defrtr_metric=1996489704 net.ipv6.conf.eth0.ra_defrtr_metric = 1996489704 IP monitor: [On next IPv6 RA msg, Kernel deletes prev route and installs new route with updated metric] Deleted default via fe80::xx16:xxxx:feb3:ce8e dev eth0 proto ra metric 1996489705 expires 3sec hoplimit 64 pref high default via fe80::xx16:xxxx:feb3:ce8e dev eth0 proto ra metric 1996489704 pref high Signed-off-by: Praveen Chaudhary <pchaudhary@linkedin.com> Signed-off-by: Zhenggen Xu <zxu@linkedin.com> Reviewed-by: David Ahern <dsahern@kernel.org> Link: https://lore.kernel.org/r/20210125214430.24079-1-pchaudhary@linkedin.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-01-25 21:44:30 +00:00
ND_PRINTK(3, info, "RA: rt: %p lifetime: %d, metric: %d, for dev: %s\n",
rt, lifetime, defrtr_usr_metric, skb->dev->name);
if (!rt && lifetime) {
ND_PRINTK(3, info, "RA: adding default router\n");
if (neigh)
neigh_release(neigh);
rt = rt6_add_dflt_router(net, &ipv6_hdr(skb)->saddr,
skb->dev, pref, defrtr_usr_metric,
lifetime);
if (!rt) {
ND_PRINTK(0, err,
"RA: %s failed to add default route\n",
__func__);
return reason;
}
neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6,
rt->fib6_nh->fib_nh_dev, NULL,
&ipv6_hdr(skb)->saddr);
if (!neigh) {
ND_PRINTK(0, err,
"RA: %s got default router without neighbour\n",
__func__);
fib6_info_release(rt);
return reason;
}
neigh->flags |= NTF_ROUTER;
} else if (rt && IPV6_EXTRACT_PREF(rt->fib6_flags) != pref) {
struct nl_info nlinfo = {
.nl_net = net,
};
rt->fib6_flags = (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
inet6_rt_notify(RTM_NEWROUTE, rt, &nlinfo, NLM_F_REPLACE);
}
if (rt) {
table = rt->fib6_table;
spin_lock_bh(&table->tb6_lock);
fib6_set_expires(rt, jiffies + (HZ * lifetime));
fib6_add_gc_list(rt);
spin_unlock_bh(&table->tb6_lock);
}
if (READ_ONCE(in6_dev->cnf.accept_ra_min_hop_limit) < 256 &&
ra_msg->icmph.icmp6_hop_limit) {
if (READ_ONCE(in6_dev->cnf.accept_ra_min_hop_limit) <=
ra_msg->icmph.icmp6_hop_limit) {
WRITE_ONCE(in6_dev->cnf.hop_limit,
ra_msg->icmph.icmp6_hop_limit);
fib6_metric_set(rt, RTAX_HOPLIMIT,
ra_msg->icmph.icmp6_hop_limit);
} else {
ND_PRINTK(2, warn, "RA: Got route advertisement with lower hop_limit than minimum\n");
}
}
skip_defrtr:
/*
* Update Reachable Time and Retrans Timer
*/
if (in6_dev->nd_parms) {
unsigned long rtime = ntohl(ra_msg->retrans_timer);
if (rtime && rtime/1000 < MAX_SCHEDULE_TIMEOUT/HZ) {
rtime = (rtime*HZ)/1000;
if (rtime < HZ/100)
rtime = HZ/100;
NEIGH_VAR_SET(in6_dev->nd_parms, RETRANS_TIME, rtime);
in6_dev->tstamp = jiffies;
send_ifinfo_notify = true;
}
rtime = ntohl(ra_msg->reachable_time);
if (rtime && rtime/1000 < MAX_SCHEDULE_TIMEOUT/(3*HZ)) {
rtime = (rtime*HZ)/1000;
if (rtime < HZ/10)
rtime = HZ/10;
if (rtime != NEIGH_VAR(in6_dev->nd_parms, BASE_REACHABLE_TIME)) {
NEIGH_VAR_SET(in6_dev->nd_parms,
BASE_REACHABLE_TIME, rtime);
NEIGH_VAR_SET(in6_dev->nd_parms,
GC_STALETIME, 3 * rtime);
in6_dev->nd_parms->reachable_time = neigh_rand_reach_time(rtime);
in6_dev->tstamp = jiffies;
send_ifinfo_notify = true;
}
}
}
skip_linkparms:
/*
* Process options.
*/
if (!neigh)
neigh = __neigh_lookup(&nd_tbl, &ipv6_hdr(skb)->saddr,
skb->dev, 1);
if (neigh) {
u8 *lladdr = NULL;
if (ndopts.nd_opts_src_lladdr) {
lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr,
skb->dev);
if (!lladdr) {
ND_PRINTK(2, warn,
"RA: invalid link-layer address length\n");
goto out;
}
}
ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
NEIGH_UPDATE_F_WEAK_OVERRIDE|
NEIGH_UPDATE_F_OVERRIDE|
NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
NEIGH_UPDATE_F_ISROUTER,
NDISC_ROUTER_ADVERTISEMENT, &ndopts);
reason = SKB_CONSUMED;
}
if (!ipv6_accept_ra(in6_dev)) {
ND_PRINTK(2, info,
"RA: %s, accept_ra is false for dev: %s\n",
__func__, skb->dev->name);
goto out;
}
#ifdef CONFIG_IPV6_ROUTE_INFO
if (!READ_ONCE(in6_dev->cnf.accept_ra_from_local) &&
ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr,
in6_dev->dev, 0)) {
ND_PRINTK(2, info,
"RA from local address detected on dev: %s: router info ignored.\n",
skb->dev->name);
goto skip_routeinfo;
}
if (READ_ONCE(in6_dev->cnf.accept_ra_rtr_pref) && ndopts.nd_opts_ri) {
struct nd_opt_hdr *p;
for (p = ndopts.nd_opts_ri;
p;
p = ndisc_next_option(p, ndopts.nd_opts_ri_end)) {
struct route_info *ri = (struct route_info *)p;
#ifdef CONFIG_IPV6_NDISC_NODETYPE
if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT &&
ri->prefix_len == 0)
continue;
#endif
if (ri->prefix_len == 0 &&
!READ_ONCE(in6_dev->cnf.accept_ra_defrtr))
continue;
net: change accept_ra_min_rtr_lft to affect all RA lifetimes accept_ra_min_rtr_lft only considered the lifetime of the default route and discarded entire RAs accordingly. This change renames accept_ra_min_rtr_lft to accept_ra_min_lft, and applies the value to individual RA sections; in particular, router lifetime, PIO preferred lifetime, and RIO lifetime. If any of those lifetimes are lower than the configured value, the specific RA section is ignored. In order for the sysctl to be useful to Android, it should really apply to all lifetimes in the RA, since that is what determines the minimum frequency at which RAs must be processed by the kernel. Android uses hardware offloads to drop RAs for a fraction of the minimum of all lifetimes present in the RA (some networks have very frequent RAs (5s) with high lifetimes (2h)). Despite this, we have encountered networks that set the router lifetime to 30s which results in very frequent CPU wakeups. Instead of disabling IPv6 (and dropping IPv6 ethertype in the WiFi firmware) entirely on such networks, it seems better to ignore the misconfigured routers while still processing RAs from other IPv6 routers on the same network (i.e. to support IoT applications). The previous implementation dropped the entire RA based on router lifetime. This turned out to be hard to expand to the other lifetimes present in the RA in a consistent manner; dropping the entire RA based on RIO/PIO lifetimes would essentially require parsing the whole thing twice. Fixes: 1671bcfd76fd ("net: add sysctl accept_ra_min_rtr_lft") Cc: Lorenzo Colitti <lorenzo@google.com> Signed-off-by: Patrick Rohr <prohr@google.com> Reviewed-by: Maciej Żenczykowski <maze@google.com> Reviewed-by: David Ahern <dsahern@kernel.org> Link: https://lore.kernel.org/r/20230726230701.919212-1-prohr@google.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2023-07-26 23:07:01 +00:00
if (ri->lifetime != 0 &&
ntohl(ri->lifetime) < READ_ONCE(in6_dev->cnf.accept_ra_min_lft))
net: change accept_ra_min_rtr_lft to affect all RA lifetimes accept_ra_min_rtr_lft only considered the lifetime of the default route and discarded entire RAs accordingly. This change renames accept_ra_min_rtr_lft to accept_ra_min_lft, and applies the value to individual RA sections; in particular, router lifetime, PIO preferred lifetime, and RIO lifetime. If any of those lifetimes are lower than the configured value, the specific RA section is ignored. In order for the sysctl to be useful to Android, it should really apply to all lifetimes in the RA, since that is what determines the minimum frequency at which RAs must be processed by the kernel. Android uses hardware offloads to drop RAs for a fraction of the minimum of all lifetimes present in the RA (some networks have very frequent RAs (5s) with high lifetimes (2h)). Despite this, we have encountered networks that set the router lifetime to 30s which results in very frequent CPU wakeups. Instead of disabling IPv6 (and dropping IPv6 ethertype in the WiFi firmware) entirely on such networks, it seems better to ignore the misconfigured routers while still processing RAs from other IPv6 routers on the same network (i.e. to support IoT applications). The previous implementation dropped the entire RA based on router lifetime. This turned out to be hard to expand to the other lifetimes present in the RA in a consistent manner; dropping the entire RA based on RIO/PIO lifetimes would essentially require parsing the whole thing twice. Fixes: 1671bcfd76fd ("net: add sysctl accept_ra_min_rtr_lft") Cc: Lorenzo Colitti <lorenzo@google.com> Signed-off-by: Patrick Rohr <prohr@google.com> Reviewed-by: Maciej Żenczykowski <maze@google.com> Reviewed-by: David Ahern <dsahern@kernel.org> Link: https://lore.kernel.org/r/20230726230701.919212-1-prohr@google.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2023-07-26 23:07:01 +00:00
continue;
if (ri->prefix_len < READ_ONCE(in6_dev->cnf.accept_ra_rt_info_min_plen))
continue;
if (ri->prefix_len > READ_ONCE(in6_dev->cnf.accept_ra_rt_info_max_plen))
continue;
rt6_route_rcv(skb->dev, (u8 *)p, (p->nd_opt_len) << 3,
&ipv6_hdr(skb)->saddr);
}
}
skip_routeinfo:
#endif
#ifdef CONFIG_IPV6_NDISC_NODETYPE
/* skip link-specific ndopts from interior routers */
if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) {
ND_PRINTK(2, info,
"RA: %s, nodetype is NODEFAULT (interior routes), dev: %s\n",
__func__, skb->dev->name);
goto out;
}
#endif
if (READ_ONCE(in6_dev->cnf.accept_ra_pinfo) && ndopts.nd_opts_pi) {
struct nd_opt_hdr *p;
for (p = ndopts.nd_opts_pi;
p;
p = ndisc_next_option(p, ndopts.nd_opts_pi_end)) {
addrconf_prefix_rcv(skb->dev, (u8 *)p,
(p->nd_opt_len) << 3,
ndopts.nd_opts_src_lladdr != NULL);
}
}
if (ndopts.nd_opts_mtu && READ_ONCE(in6_dev->cnf.accept_ra_mtu)) {
__be32 n;
u32 mtu;
memcpy(&n, ((u8 *)(ndopts.nd_opts_mtu+1))+2, sizeof(mtu));
mtu = ntohl(n);
ipv6: add IFLA_INET6_RA_MTU to expose mtu value The kernel provides a "/proc/sys/net/ipv6/conf/<iface>/mtu" file, which can temporarily record the mtu value of the last received RA message when the RA mtu value is lower than the interface mtu, but this proc has following limitations: (1) when the interface mtu (/sys/class/net/<iface>/mtu) is updeated, mtu6 (/proc/sys/net/ipv6/conf/<iface>/mtu) will be updated to the value of interface mtu; (2) mtu6 (/proc/sys/net/ipv6/conf/<iface>/mtu) only affect ipv6 connection, and not affect ipv4. Therefore, when the mtu option is carried in the RA message, there will be a problem that the user sometimes cannot obtain RA mtu value correctly by reading mtu6. After this patch set, if a RA message carries the mtu option, you can send a netlink msg which nlmsg_type is RTM_GETLINK, and then by parsing the attribute of IFLA_INET6_RA_MTU to get the mtu value carried in the RA message received on the inet6 device. In addition, you can also get a link notification when ra_mtu is updated so it doesn't have to poll. In this way, if the MTU values that the device receives from the network in the PCO IPv4 and the RA IPv6 procedures are different, the user can obtain the correct ipv6 ra_mtu value and compare the value of ra_mtu and ipv4 mtu, then the device can use the lower MTU value for both IPv4 and IPv6. Signed-off-by: Rocco Yue <rocco.yue@mediatek.com> Reviewed-by: David Ahern <dsahern@kernel.org> Link: https://lore.kernel.org/r/20210827150412.9267-1-rocco.yue@mediatek.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-08-27 15:04:12 +00:00
if (in6_dev->ra_mtu != mtu) {
in6_dev->ra_mtu = mtu;
send_ifinfo_notify = true;
}
if (mtu < IPV6_MIN_MTU || mtu > skb->dev->mtu) {
ND_PRINTK(2, warn, "RA: invalid mtu: %d\n", mtu);
} else if (READ_ONCE(in6_dev->cnf.mtu6) != mtu) {
WRITE_ONCE(in6_dev->cnf.mtu6, mtu);
fib6_metric_set(rt, RTAX_MTU, mtu);
rt6_mtu_change(skb->dev, mtu);
}
}
if (ndopts.nd_useropts) {
struct nd_opt_hdr *p;
for (p = ndopts.nd_useropts;
p;
p = ndisc_next_useropt(skb->dev, p,
ndopts.nd_useropts_end)) {
ndisc_ra_useropt(skb, p);
}
}
if (ndopts.nd_opts_tgt_lladdr || ndopts.nd_opts_rh) {
ND_PRINTK(2, warn, "RA: invalid RA options\n");
}
out:
ipv6: add IFLA_INET6_RA_MTU to expose mtu value The kernel provides a "/proc/sys/net/ipv6/conf/<iface>/mtu" file, which can temporarily record the mtu value of the last received RA message when the RA mtu value is lower than the interface mtu, but this proc has following limitations: (1) when the interface mtu (/sys/class/net/<iface>/mtu) is updeated, mtu6 (/proc/sys/net/ipv6/conf/<iface>/mtu) will be updated to the value of interface mtu; (2) mtu6 (/proc/sys/net/ipv6/conf/<iface>/mtu) only affect ipv6 connection, and not affect ipv4. Therefore, when the mtu option is carried in the RA message, there will be a problem that the user sometimes cannot obtain RA mtu value correctly by reading mtu6. After this patch set, if a RA message carries the mtu option, you can send a netlink msg which nlmsg_type is RTM_GETLINK, and then by parsing the attribute of IFLA_INET6_RA_MTU to get the mtu value carried in the RA message received on the inet6 device. In addition, you can also get a link notification when ra_mtu is updated so it doesn't have to poll. In this way, if the MTU values that the device receives from the network in the PCO IPv4 and the RA IPv6 procedures are different, the user can obtain the correct ipv6 ra_mtu value and compare the value of ra_mtu and ipv4 mtu, then the device can use the lower MTU value for both IPv4 and IPv6. Signed-off-by: Rocco Yue <rocco.yue@mediatek.com> Reviewed-by: David Ahern <dsahern@kernel.org> Link: https://lore.kernel.org/r/20210827150412.9267-1-rocco.yue@mediatek.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-08-27 15:04:12 +00:00
/* Send a notify if RA changed managed/otherconf flags or
* timer settings or ra_mtu value
*/
if (send_ifinfo_notify)
inet6_ifinfo_notify(RTM_NEWLINK, in6_dev);
fib6_info_release(rt);
if (neigh)
neigh_release(neigh);
return reason;
}
static enum skb_drop_reason ndisc_redirect_rcv(struct sk_buff *skb)
{
struct rd_msg *msg = (struct rd_msg *)skb_transport_header(skb);
u32 ndoptlen = skb_tail_pointer(skb) - (skb_transport_header(skb) +
offsetof(struct rd_msg, opt));
struct ndisc_options ndopts;
SKB_DR(reason);
u8 *hdr;
#ifdef CONFIG_IPV6_NDISC_NODETYPE
switch (skb->ndisc_nodetype) {
case NDISC_NODETYPE_HOST:
case NDISC_NODETYPE_NODEFAULT:
ND_PRINTK(2, warn,
"Redirect: from host or unauthorized router\n");
return reason;
}
#endif
if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) {
ND_PRINTK(2, warn,
"Redirect: source address is not link-local\n");
return reason;
}
if (!ndisc_parse_options(skb->dev, msg->opt, ndoptlen, &ndopts))
return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS;
if (!ndopts.nd_opts_rh) {
ip6_redirect_no_header(skb, dev_net(skb->dev),
skb->dev->ifindex);
return reason;
}
hdr = (u8 *)ndopts.nd_opts_rh;
hdr += 8;
if (!pskb_pull(skb, hdr - skb_transport_header(skb)))
return SKB_DROP_REASON_PKT_TOO_SMALL;
return icmpv6_notify(skb, NDISC_REDIRECT, 0, 0);
}
static void ndisc_fill_redirect_hdr_option(struct sk_buff *skb,
struct sk_buff *orig_skb,
int rd_len)
{
u8 *opt = skb_put(skb, rd_len);
memset(opt, 0, 8);
*(opt++) = ND_OPT_REDIRECT_HDR;
*(opt++) = (rd_len >> 3);
opt += 6;
ipv6: fix access to non-linear packet in ndisc_fill_redirect_hdr_option() Fix the following slab-out-of-bounds kasan report in ndisc_fill_redirect_hdr_option when the incoming ipv6 packet is not linear and the accessed data are not in the linear data region of orig_skb. [ 1503.122508] ================================================================== [ 1503.122832] BUG: KASAN: slab-out-of-bounds in ndisc_send_redirect+0x94e/0x990 [ 1503.123036] Read of size 1184 at addr ffff8800298ab6b0 by task netperf/1932 [ 1503.123220] CPU: 0 PID: 1932 Comm: netperf Not tainted 4.16.0-rc2+ #124 [ 1503.123347] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.10.2-2.fc27 04/01/2014 [ 1503.123527] Call Trace: [ 1503.123579] <IRQ> [ 1503.123638] print_address_description+0x6e/0x280 [ 1503.123849] kasan_report+0x233/0x350 [ 1503.123946] memcpy+0x1f/0x50 [ 1503.124037] ndisc_send_redirect+0x94e/0x990 [ 1503.125150] ip6_forward+0x1242/0x13b0 [...] [ 1503.153890] Allocated by task 1932: [ 1503.153982] kasan_kmalloc+0x9f/0xd0 [ 1503.154074] __kmalloc_track_caller+0xb5/0x160 [ 1503.154198] __kmalloc_reserve.isra.41+0x24/0x70 [ 1503.154324] __alloc_skb+0x130/0x3e0 [ 1503.154415] sctp_packet_transmit+0x21a/0x1810 [ 1503.154533] sctp_outq_flush+0xc14/0x1db0 [ 1503.154624] sctp_do_sm+0x34e/0x2740 [ 1503.154715] sctp_primitive_SEND+0x57/0x70 [ 1503.154807] sctp_sendmsg+0xaa6/0x1b10 [ 1503.154897] sock_sendmsg+0x68/0x80 [ 1503.154987] ___sys_sendmsg+0x431/0x4b0 [ 1503.155078] __sys_sendmsg+0xa4/0x130 [ 1503.155168] do_syscall_64+0x171/0x3f0 [ 1503.155259] entry_SYSCALL_64_after_hwframe+0x42/0xb7 [ 1503.155436] Freed by task 1932: [ 1503.155527] __kasan_slab_free+0x134/0x180 [ 1503.155618] kfree+0xbc/0x180 [ 1503.155709] skb_release_data+0x27f/0x2c0 [ 1503.155800] consume_skb+0x94/0xe0 [ 1503.155889] sctp_chunk_put+0x1aa/0x1f0 [ 1503.155979] sctp_inq_pop+0x2f8/0x6e0 [ 1503.156070] sctp_assoc_bh_rcv+0x6a/0x230 [ 1503.156164] sctp_inq_push+0x117/0x150 [ 1503.156255] sctp_backlog_rcv+0xdf/0x4a0 [ 1503.156346] __release_sock+0x142/0x250 [ 1503.156436] release_sock+0x80/0x180 [ 1503.156526] sctp_sendmsg+0xbb0/0x1b10 [ 1503.156617] sock_sendmsg+0x68/0x80 [ 1503.156708] ___sys_sendmsg+0x431/0x4b0 [ 1503.156799] __sys_sendmsg+0xa4/0x130 [ 1503.156889] do_syscall_64+0x171/0x3f0 [ 1503.156980] entry_SYSCALL_64_after_hwframe+0x42/0xb7 [ 1503.157158] The buggy address belongs to the object at ffff8800298ab600 which belongs to the cache kmalloc-1024 of size 1024 [ 1503.157444] The buggy address is located 176 bytes inside of 1024-byte region [ffff8800298ab600, ffff8800298aba00) [ 1503.157702] The buggy address belongs to the page: [ 1503.157820] page:ffffea0000a62a00 count:1 mapcount:0 mapping:0000000000000000 index:0x0 compound_mapcount: 0 [ 1503.158053] flags: 0x4000000000008100(slab|head) [ 1503.158171] raw: 4000000000008100 0000000000000000 0000000000000000 00000001800e000e [ 1503.158350] raw: dead000000000100 dead000000000200 ffff880036002600 0000000000000000 [ 1503.158523] page dumped because: kasan: bad access detected [ 1503.158698] Memory state around the buggy address: [ 1503.158816] ffff8800298ab900: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 1503.158988] ffff8800298ab980: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 1503.159165] >ffff8800298aba00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 1503.159338] ^ [ 1503.159436] ffff8800298aba80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 1503.159610] ffff8800298abb00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 1503.159785] ================================================================== [ 1503.159964] Disabling lock debugging due to kernel taint The test scenario to trigger the issue consists of 4 devices: - H0: data sender, connected to LAN0 - H1: data receiver, connected to LAN1 - GW0 and GW1: routers between LAN0 and LAN1. Both of them have an ethernet connection on LAN0 and LAN1 On H{0,1} set GW0 as default gateway while on GW0 set GW1 as next hop for data from LAN0 to LAN1. Moreover create an ip6ip6 tunnel between H0 and H1 and send 3 concurrent data streams (TCP/UDP/SCTP) from H0 to H1 through ip6ip6 tunnel (send buffer size is set to 16K). While data streams are active flush the route cache on HA multiple times. I have not been able to identify a given commit that introduced the issue since, using the reproducer described above, the kasan report has been triggered from 4.14 and I have not gone back further. Reported-by: Jianlin Shi <jishi@redhat.com> Reviewed-by: Stefano Brivio <sbrivio@redhat.com> Reviewed-by: Eric Dumazet <edumazet@google.com> Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2018-03-08 16:00:02 +00:00
skb_copy_bits(orig_skb, skb_network_offset(orig_skb), opt,
rd_len - 8);
}
void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
{
struct net_device *dev = skb->dev;
struct net *net = dev_net(dev);
struct sock *sk = net->ipv6.ndisc_sk;
int optlen = 0;
struct inet_peer *peer;
struct sk_buff *buff;
struct rd_msg *msg;
struct in6_addr saddr_buf;
struct rt6_info *rt;
struct dst_entry *dst;
struct flowi6 fl6;
int rd_len;
u8 ha_buf[MAX_ADDR_LEN], *ha = NULL,
ops_data_buf[NDISC_OPS_REDIRECT_DATA_SPACE], *ops_data = NULL;
bool ret;
if (netif_is_l3_master(skb->dev)) {
dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
if (!dev)
return;
}
if (ipv6_get_lladdr(dev, &saddr_buf, IFA_F_TENTATIVE)) {
ND_PRINTK(2, warn, "Redirect: no link-local address on %s\n",
dev->name);
return;
}
if (!ipv6_addr_equal(&ipv6_hdr(skb)->daddr, target) &&
[IPv6]: Fix ICMPv6 redirect handling with target multicast address When the ICMPv6 Target address is multicast, Linux processes the redirect instead of dropping it. The problem is in this code in ndisc_redirect_rcv(): if (ipv6_addr_equal(dest, target)) { on_link = 1; } else if (!(ipv6_addr_type(target) & IPV6_ADDR_LINKLOCAL)) { ND_PRINTK2(KERN_WARNING "ICMPv6 Redirect: target address is not link-local.\n"); return; } This second check will succeed if the Target address is, for example, FF02::1 because it has link-local scope. Instead, it should be checking if it's a unicast link-local address, as stated in RFC 2461/4861 Section 8.1: - The ICMP Target Address is either a link-local address (when redirected to a router) or the same as the ICMP Destination Address (when redirected to the on-link destination). I know this doesn't explicitly say unicast link-local address, but it's implied. This bug is preventing Linux kernels from achieving IPv6 Logo Phase II certification because of a recent error that was found in the TAHI test suite - Neighbor Disovery suite test 206 (v6LC.2.3.6_G) had the multicast address in the Destination field instead of Target field, so we were passing the test. This won't be the case anymore. The patch below fixes this problem, and also fixes ndisc_send_redirect() to not send an invalid redirect with a multicast address in the Target field. I re-ran the TAHI Neighbor Discovery section to make sure Linux passes all 245 tests now. Signed-off-by: Brian Haley <brian.haley@hp.com> Acked-by: David L Stevens <dlstevens@us.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2007-10-08 07:12:05 +00:00
ipv6_addr_type(target) != (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
ND_PRINTK(2, warn,
"Redirect: target address is not link-local unicast\n");
[IPV6]: fix BUG of ndisc_send_redirect() When I tested IPv6 redirect function about kernel 2.6.19.1, and found that the kernel can send redirect packets whose target address is global address, and the target is not the actual endpoint of communication. But the criteria conform to RFC2461, the target address defines as following: Target Address An IP address that is a better first hop to use for he ICMP Destination Address. When the target is the actual endpoint of communication, i.e., the destination is a neighbor, the Target Address field MUST contain the same value as the ICMP Destination Address field. Otherwise the target is a better first-hop router and the Target Address MUST be the router's link-local address so that hosts can uniquely identify routers. According to this definition, when a router redirect to a host, the target address either the better first-hop router's link-local address or the same as the ICMP destination address field. But the function of ndisc_send_redirect() in net/ipv6/ndisc.c, does not check the target address correctly. There is another definition about receive Redirect message in RFC2461: 8.1. Validation of Redirect Messages A host MUST silently discard any received Redirect message that does not satisfy all of the following validity checks: ...... - The ICMP Target Address is either a link-local address (when redirected to a router) or the same as the ICMP Destination Address (when redirected to the on-link destination). ...... And the receive redirect function of ndisc_redirect_rcv() implemented this definition, checks the target address correctly. if (ipv6_addr_equal(dest, target)) { on_link = 1; } else if (!(ipv6_addr_type(target) & IPV6_ADDR_LINKLOCAL)) { ND_PRINTK2(KERN_WARNING "ICMPv6 Redirect: target address is not link-local.\n"); return; } So, I think the send redirect function must check the target address also. Signed-off-by: Li Yewang <lyw@nanjing-fnst.com> Acked-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2007-01-30 22:33:20 +00:00
return;
}
icmpv6_flow_init(sk, &fl6, NDISC_REDIRECT,
&saddr_buf, &ipv6_hdr(skb)->saddr, dev->ifindex);
dst = ip6_route_output(net, NULL, &fl6);
if (dst->error) {
dst_release(dst);
return;
}
dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
if (IS_ERR(dst))
return;
rt = dst_rt6_info(dst);
if (rt->rt6i_flags & RTF_GATEWAY) {
ND_PRINTK(2, warn,
"Redirect: destination is not a neighbour\n");
goto release;
}
peer = inet_getpeer_v6(net->ipv6.peers, &ipv6_hdr(skb)->saddr, 1);
ret = inet_peer_xrlim_allow(peer, 1*HZ);
if (peer)
inet_putpeer(peer);
if (!ret)
goto release;
if (dev->addr_len) {
struct neighbour *neigh = dst_neigh_lookup(skb_dst(skb), target);
if (!neigh) {
ND_PRINTK(2, warn,
"Redirect: no neigh for target address\n");
goto release;
}
read_lock_bh(&neigh->lock);
if (neigh->nud_state & NUD_VALID) {
memcpy(ha_buf, neigh->ha, dev->addr_len);
read_unlock_bh(&neigh->lock);
ha = ha_buf;
optlen += ndisc_redirect_opt_addr_space(dev, neigh,
ops_data_buf,
&ops_data);
} else
read_unlock_bh(&neigh->lock);
neigh_release(neigh);
}
rd_len = min_t(unsigned int,
IPV6_MIN_MTU - sizeof(struct ipv6hdr) - sizeof(*msg) - optlen,
skb->len + 8);
rd_len &= ~0x7;
optlen += rd_len;
buff = ndisc_alloc_skb(dev, sizeof(*msg) + optlen);
if (!buff)
goto release;
msg = skb_put(buff, sizeof(*msg));
*msg = (struct rd_msg) {
.icmph = {
.icmp6_type = NDISC_REDIRECT,
},
.target = *target,
.dest = ipv6_hdr(skb)->daddr,
};
/*
* include target_address option
*/
if (ha)
ndisc_fill_redirect_addr_option(buff, ha, ops_data);
/*
* build redirect option and copy skb over to the new packet.
*/
if (rd_len)
ndisc_fill_redirect_hdr_option(buff, skb, rd_len);
skb_dst_set(buff, dst);
ndisc_send_skb(buff, &ipv6_hdr(skb)->saddr, &saddr_buf);
return;
release:
dst_release(dst);
}
static void pndisc_redo(struct sk_buff *skb)
{
enum skb_drop_reason reason = ndisc_recv_ns(skb);
kfree_skb_reason(skb, reason);
}
static int ndisc_is_multicast(const void *pkey)
{
return ipv6_addr_is_multicast((struct in6_addr *)pkey);
}
static bool ndisc_suppress_frag_ndisc(struct sk_buff *skb)
{
struct inet6_dev *idev = __in6_dev_get(skb->dev);
if (!idev)
return true;
if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED &&
READ_ONCE(idev->cnf.suppress_frag_ndisc)) {
net_warn_ratelimited("Received fragmented ndisc packet. Carefully consider disabling suppress_frag_ndisc.\n");
return true;
}
return false;
}
enum skb_drop_reason ndisc_rcv(struct sk_buff *skb)
{
struct nd_msg *msg;
SKB_DR(reason);
if (ndisc_suppress_frag_ndisc(skb))
return SKB_DROP_REASON_IPV6_NDISC_FRAG;
if (skb_linearize(skb))
return SKB_DROP_REASON_NOMEM;
msg = (struct nd_msg *)skb_transport_header(skb);
__skb_push(skb, skb->data - skb_transport_header(skb));
if (ipv6_hdr(skb)->hop_limit != 255) {
ND_PRINTK(2, warn, "NDISC: invalid hop-limit: %d\n",
ipv6_hdr(skb)->hop_limit);
return SKB_DROP_REASON_IPV6_NDISC_HOP_LIMIT;
}
if (msg->icmph.icmp6_code != 0) {
ND_PRINTK(2, warn, "NDISC: invalid ICMPv6 code: %d\n",
msg->icmph.icmp6_code);
return SKB_DROP_REASON_IPV6_NDISC_BAD_CODE;
}
switch (msg->icmph.icmp6_type) {
case NDISC_NEIGHBOUR_SOLICITATION:
memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
reason = ndisc_recv_ns(skb);
break;
case NDISC_NEIGHBOUR_ADVERTISEMENT:
reason = ndisc_recv_na(skb);
break;
case NDISC_ROUTER_SOLICITATION:
reason = ndisc_recv_rs(skb);
break;
case NDISC_ROUTER_ADVERTISEMENT:
reason = ndisc_router_discovery(skb);
break;
case NDISC_REDIRECT:
reason = ndisc_redirect_rcv(skb);
break;
}
return reason;
}
static int ndisc_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct netdev_notifier_change_info *change_info;
struct net *net = dev_net(dev);
struct inet6_dev *idev;
bool evict_nocarrier;
switch (event) {
case NETDEV_CHANGEADDR:
neigh_changeaddr(&nd_tbl, dev);
fib6_run_gc(0, net, false);
fallthrough;
case NETDEV_UP:
idev = in6_dev_get(dev);
if (!idev)
break;
if (READ_ONCE(idev->cnf.ndisc_notify) ||
READ_ONCE(net->ipv6.devconf_all->ndisc_notify))
ndisc_send_unsol_na(dev);
in6_dev_put(idev);
break;
case NETDEV_CHANGE:
idev = in6_dev_get(dev);
if (!idev)
evict_nocarrier = true;
else {
evict_nocarrier = READ_ONCE(idev->cnf.ndisc_evict_nocarrier) &&
READ_ONCE(net->ipv6.devconf_all->ndisc_evict_nocarrier);
in6_dev_put(idev);
}
change_info = ptr;
if (change_info->flags_changed & IFF_NOARP)
neigh_changeaddr(&nd_tbl, dev);
if (evict_nocarrier && !netif_carrier_ok(dev))
neigh_carrier_down(&nd_tbl, dev);
break;
case NETDEV_DOWN:
neigh_ifdown(&nd_tbl, dev);
fib6_run_gc(0, net, false);
break;
case NETDEV_NOTIFY_PEERS:
ndisc_send_unsol_na(dev);
break;
default:
break;
}
return NOTIFY_DONE;
}
static struct notifier_block ndisc_netdev_notifier = {
.notifier_call = ndisc_netdev_event,
.priority = ADDRCONF_NOTIFY_PRIORITY - 5,
};
#ifdef CONFIG_SYSCTL
static void ndisc_warn_deprecated_sysctl(struct ctl_table *ctl,
const char *func, const char *dev_name)
{
static char warncomm[TASK_COMM_LEN];
static int warned;
if (strcmp(warncomm, current->comm) && warned < 5) {
strcpy(warncomm, current->comm);
pr_warn("process `%s' is using deprecated sysctl (%s) net.ipv6.neigh.%s.%s - use net.ipv6.neigh.%s.%s_ms instead\n",
warncomm, func,
dev_name, ctl->procname,
dev_name, ctl->procname);
warned++;
}
}
int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
struct net_device *dev = ctl->extra1;
struct inet6_dev *idev;
int ret;
if ((strcmp(ctl->procname, "retrans_time") == 0) ||
(strcmp(ctl->procname, "base_reachable_time") == 0))
ndisc_warn_deprecated_sysctl(ctl, "syscall", dev ? dev->name : "default");
if (strcmp(ctl->procname, "retrans_time") == 0)
ret = neigh_proc_dointvec(ctl, write, buffer, lenp, ppos);
else if (strcmp(ctl->procname, "base_reachable_time") == 0)
ret = neigh_proc_dointvec_jiffies(ctl, write,
buffer, lenp, ppos);
else if ((strcmp(ctl->procname, "retrans_time_ms") == 0) ||
(strcmp(ctl->procname, "base_reachable_time_ms") == 0))
ret = neigh_proc_dointvec_ms_jiffies(ctl, write,
buffer, lenp, ppos);
else
ret = -1;
if (write && ret == 0 && dev && (idev = in6_dev_get(dev)) != NULL) {
if (ctl->data == &NEIGH_VAR(idev->nd_parms, BASE_REACHABLE_TIME))
idev->nd_parms->reachable_time =
neigh_rand_reach_time(NEIGH_VAR(idev->nd_parms, BASE_REACHABLE_TIME));
WRITE_ONCE(idev->tstamp, jiffies);
inet6_ifinfo_notify(RTM_NEWLINK, idev);
in6_dev_put(idev);
}
return ret;
}
#endif
static int __net_init ndisc_net_init(struct net *net)
{
struct ipv6_pinfo *np;
struct sock *sk;
int err;
err = inet_ctl_sock_create(&sk, PF_INET6,
SOCK_RAW, IPPROTO_ICMPV6, net);
if (err < 0) {
ND_PRINTK(0, err,
"NDISC: Failed to initialize the control socket (err %d)\n",
err);
return err;
}
net->ipv6.ndisc_sk = sk;
np = inet6_sk(sk);
np->hop_limit = 255;
/* Do not loopback ndisc messages */
inet6_clear_bit(MC6_LOOP, sk);
return 0;
}
static void __net_exit ndisc_net_exit(struct net *net)
{
inet_ctl_sock_destroy(net->ipv6.ndisc_sk);
}
static struct pernet_operations ndisc_net_ops = {
.init = ndisc_net_init,
.exit = ndisc_net_exit,
};
int __init ndisc_init(void)
{
int err;
err = register_pernet_subsys(&ndisc_net_ops);
if (err)
return err;
/*
* Initialize the neighbour table
*/
neigh_table_init(NEIGH_ND_TABLE, &nd_tbl);
#ifdef CONFIG_SYSCTL
err = neigh_sysctl_register(NULL, &nd_tbl.parms,
ndisc_ifinfo_sysctl_change);
if (err)
goto out_unregister_pernet;
out:
#endif
return err;
#ifdef CONFIG_SYSCTL
out_unregister_pernet:
unregister_pernet_subsys(&ndisc_net_ops);
goto out;
#endif
}
int __init ndisc_late_init(void)
{
return register_netdevice_notifier(&ndisc_netdev_notifier);
}
void ndisc_late_cleanup(void)
{
unregister_netdevice_notifier(&ndisc_netdev_notifier);
}
void ndisc_cleanup(void)
{
#ifdef CONFIG_SYSCTL
neigh_sysctl_unregister(&nd_tbl.parms);
#endif
neigh_table_clear(NEIGH_ND_TABLE, &nd_tbl);
unregister_pernet_subsys(&ndisc_net_ops);
}