From 2a7851bffb008ff4882eee673da74718997b4265 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 17 May 2013 03:56:10 +0000 Subject: [PATCH 1/5] netfilter: add nf_ipv6_ops hook to fix xt_addrtype with IPv6 Quoting https://bugzilla.netfilter.org/show_bug.cgi?id=812: [ ip6tables -m addrtype ] When I tried to use in the nat/PREROUTING it messes up the routing cache even if the rule didn't matched at all. [..] If I remove the --limit-iface-in from the non-working scenario, so just use the -m addrtype --dst-type LOCAL it works! This happens when LOCAL type matching is requested with --limit-iface-in, and the default ipv6 route is via the interface the packet we test arrived on. Because xt_addrtype uses ip6_route_output, the ipv6 routing implementation creates an unwanted cached entry, and the packet won't make it to the real/expected destination. Silently ignoring --limit-iface-in makes the routing work but it breaks rule matching (--dst-type LOCAL with limit-iface-in is supposed to only match if the dst address is configured on the incoming interface; without --limit-iface-in it will match if the address is reachable via lo). The test should call ipv6_chk_addr() instead. However, this would add a link-time dependency on ipv6. There are two possible solutions: 1) Revert the commit that moved ipt_addrtype to xt_addrtype, and put ipv6 specific code into ip6t_addrtype. 2) add new "nf_ipv6_ops" struct to register pointers to ipv6 functions. While the former might seem preferable, Pablo pointed out that there are more xt modules with link-time dependeny issues regarding ipv6, so lets go for 2). Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_ipv6.h | 16 ++++++++++++++++ include/net/addrconf.h | 2 +- net/ipv6/addrconf.c | 2 +- net/ipv6/netfilter.c | 7 +++++++ net/netfilter/core.c | 2 ++ net/netfilter/xt_addrtype.c | 29 +++++++++++++++++------------ 6 files changed, 44 insertions(+), 14 deletions(-) diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h index 98ffb54988b6..2d4df6ce043e 100644 --- a/include/linux/netfilter_ipv6.h +++ b/include/linux/netfilter_ipv6.h @@ -17,6 +17,22 @@ extern __sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook, extern int ipv6_netfilter_init(void); extern void ipv6_netfilter_fini(void); + +/* + * Hook functions for ipv6 to allow xt_* modules to be built-in even + * if IPv6 is a module. + */ +struct nf_ipv6_ops { + int (*chk_addr)(struct net *net, const struct in6_addr *addr, + const struct net_device *dev, int strict); +}; + +extern const struct nf_ipv6_ops __rcu *nf_ipv6_ops; +static inline const struct nf_ipv6_ops *nf_get_ipv6_ops(void) +{ + return rcu_dereference(nf_ipv6_ops); +} + #else /* CONFIG_NETFILTER */ static inline int ipv6_netfilter_init(void) { return 0; } static inline void ipv6_netfilter_fini(void) { return; } diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 84a6440f1f19..21f702704f24 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -65,7 +65,7 @@ extern int addrconf_set_dstaddr(struct net *net, extern int ipv6_chk_addr(struct net *net, const struct in6_addr *addr, - struct net_device *dev, + const struct net_device *dev, int strict); #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index d1ab6ab29a55..d1b2d8034b54 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1487,7 +1487,7 @@ static int ipv6_count_addresses(struct inet6_dev *idev) } int ipv6_chk_addr(struct net *net, const struct in6_addr *addr, - struct net_device *dev, int strict) + const struct net_device *dev, int strict) { struct inet6_ifaddr *ifp; unsigned int hash = inet6_addr_hash(addr); diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 72836f40b730..95f3f1da0d7f 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -186,6 +187,10 @@ static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook, return csum; }; +static const struct nf_ipv6_ops ipv6ops = { + .chk_addr = ipv6_chk_addr, +}; + static const struct nf_afinfo nf_ip6_afinfo = { .family = AF_INET6, .checksum = nf_ip6_checksum, @@ -198,6 +203,7 @@ static const struct nf_afinfo nf_ip6_afinfo = { int __init ipv6_netfilter_init(void) { + RCU_INIT_POINTER(nf_ipv6_ops, &ipv6ops); return nf_register_afinfo(&nf_ip6_afinfo); } @@ -206,5 +212,6 @@ int __init ipv6_netfilter_init(void) */ void ipv6_netfilter_fini(void) { + RCU_INIT_POINTER(nf_ipv6_ops, NULL); nf_unregister_afinfo(&nf_ip6_afinfo); } diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 07c865a31a3d..857ca9f35177 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -30,6 +30,8 @@ static DEFINE_MUTEX(afinfo_mutex); const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly; EXPORT_SYMBOL(nf_afinfo); +const struct nf_ipv6_ops __rcu *nf_ipv6_ops __read_mostly; +EXPORT_SYMBOL_GPL(nf_ipv6_ops); int nf_register_afinfo(const struct nf_afinfo *afinfo) { diff --git a/net/netfilter/xt_addrtype.c b/net/netfilter/xt_addrtype.c index 49c5ff7f6dd6..68ff29f60867 100644 --- a/net/netfilter/xt_addrtype.c +++ b/net/netfilter/xt_addrtype.c @@ -22,6 +22,7 @@ #include #endif +#include #include #include @@ -33,12 +34,12 @@ MODULE_ALIAS("ip6t_addrtype"); #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) static u32 match_lookup_rt6(struct net *net, const struct net_device *dev, - const struct in6_addr *addr) + const struct in6_addr *addr, u16 mask) { const struct nf_afinfo *afinfo; struct flowi6 flow; struct rt6_info *rt; - u32 ret; + u32 ret = 0; int route_err; memset(&flow, 0, sizeof(flow)); @@ -49,12 +50,19 @@ static u32 match_lookup_rt6(struct net *net, const struct net_device *dev, rcu_read_lock(); afinfo = nf_get_afinfo(NFPROTO_IPV6); - if (afinfo != NULL) - route_err = afinfo->route(net, (struct dst_entry **)&rt, - flowi6_to_flowi(&flow), !!dev); - else - route_err = 1; + if (afinfo != NULL) { + const struct nf_ipv6_ops *v6ops; + if (dev && (mask & XT_ADDRTYPE_LOCAL)) { + v6ops = nf_get_ipv6_ops(); + if (v6ops && v6ops->chk_addr(net, addr, dev, true)) + ret = XT_ADDRTYPE_LOCAL; + } + route_err = afinfo->route(net, (struct dst_entry **)&rt, + flowi6_to_flowi(&flow), false); + } else { + route_err = 1; + } rcu_read_unlock(); if (route_err) @@ -62,15 +70,12 @@ static u32 match_lookup_rt6(struct net *net, const struct net_device *dev, if (rt->rt6i_flags & RTF_REJECT) ret = XT_ADDRTYPE_UNREACHABLE; - else - ret = 0; - if (rt->rt6i_flags & RTF_LOCAL) + if (dev == NULL && rt->rt6i_flags & RTF_LOCAL) ret |= XT_ADDRTYPE_LOCAL; if (rt->rt6i_flags & RTF_ANYCAST) ret |= XT_ADDRTYPE_ANYCAST; - dst_release(&rt->dst); return ret; } @@ -90,7 +95,7 @@ static bool match_type6(struct net *net, const struct net_device *dev, if ((XT_ADDRTYPE_LOCAL | XT_ADDRTYPE_ANYCAST | XT_ADDRTYPE_UNREACHABLE) & mask) - return !!(mask & match_lookup_rt6(net, dev, addr)); + return !!(mask & match_lookup_rt6(net, dev, addr, mask)); return true; } From 4f36ea6eed2081340c7a7aa98c73187ecfccebff Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Thu, 23 May 2013 01:50:46 +0000 Subject: [PATCH 2/5] netfilter: ipt_ULOG: fix non-null terminated string in the nf_log path If nf_log uses ipt_ULOG as logging output, we can deliver non-null terminated strings to user-space since the maximum length of the prefix that is passed by nf_log is NF_LOG_PREFIXLEN but pm->prefix is 32 bytes long (ULOG_PREFIX_LEN). This is actually happening already from nf_conntrack_tcp if ipt_ULOG is used, since it is passing strings longer than 32 bytes. Signed-off-by: Chen Gang Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/ipt_ULOG.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index cf08218ddbcf..ff4b781b1056 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -231,8 +231,10 @@ static void ipt_ulog_packet(struct net *net, put_unaligned(tv.tv_usec, &pm->timestamp_usec); put_unaligned(skb->mark, &pm->mark); pm->hook = hooknum; - if (prefix != NULL) - strncpy(pm->prefix, prefix, sizeof(pm->prefix)); + if (prefix != NULL) { + strncpy(pm->prefix, prefix, sizeof(pm->prefix) - 1); + pm->prefix[sizeof(pm->prefix) - 1] = '\0'; + } else if (loginfo->prefix[0] != '\0') strncpy(pm->prefix, loginfo->prefix, sizeof(pm->prefix)); else From dc7b3eb900aab02e5cafbca3948d005be13fb4a5 Mon Sep 17 00:00:00 2001 From: Grzegorz Lyczba Date: Mon, 13 May 2013 23:56:24 +0200 Subject: [PATCH 3/5] ipvs: Fix reuse connection if real server is dead Expire cached connection for new TCP/SCTP connection if real server is down. Otherwise, IPVS uses the dead server for the reused connection, instead of a new working one. Signed-off-by: Grzegorz Lyczba Acked-by: Hans Schillstrom Acked-by: Julian Anastasov Signed-off-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_core.c | 35 +++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 085b5880ab0d..05565d2b3a61 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1001,6 +1001,32 @@ static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len) return th->rst; } +static inline bool is_new_conn(const struct sk_buff *skb, + struct ip_vs_iphdr *iph) +{ + switch (iph->protocol) { + case IPPROTO_TCP: { + struct tcphdr _tcph, *th; + + th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph); + if (th == NULL) + return false; + return th->syn; + } + case IPPROTO_SCTP: { + sctp_chunkhdr_t *sch, schunk; + + sch = skb_header_pointer(skb, iph->len + sizeof(sctp_sctphdr_t), + sizeof(schunk), &schunk); + if (sch == NULL) + return false; + return sch->type == SCTP_CID_INIT; + } + default: + return false; + } +} + /* Handle response packets: rewrite addresses and send away... */ static unsigned int @@ -1612,6 +1638,15 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) * Check if the packet belongs to an existing connection entry */ cp = pp->conn_in_get(af, skb, &iph, 0); + + if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp && cp->dest && + unlikely(!atomic_read(&cp->dest->weight)) && !iph.fragoffs && + is_new_conn(skb, &iph)) { + ip_vs_conn_expire_now(cp); + __ip_vs_conn_put(cp); + cp = NULL; + } + if (unlikely(!cp) && !iph.fragoffs) { /* No (second) fragments need to enter here, as nf_defrag_ipv6 * replayed fragment zero will already have created the cp From d660164d79b67f879db35a7d61e47d3b99bc714e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Kube=C4=8Dek?= Date: Tue, 28 May 2013 22:37:03 +0000 Subject: [PATCH 4/5] netfilter: xt_LOG: fix mark logging for IPv6 packets In dump_ipv6_packet(), the "recurse" parameter is zero only if dumping contents of a packet embedded into an ICMPv6 error message. Therefore we want to log packet mark if recurse is non-zero, not when it is zero. Signed-off-by: Michal Kubecek Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_LOG.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/xt_LOG.c b/net/netfilter/xt_LOG.c index 491c7d821a0b..5ab24843370a 100644 --- a/net/netfilter/xt_LOG.c +++ b/net/netfilter/xt_LOG.c @@ -737,7 +737,7 @@ static void dump_ipv6_packet(struct sbuff *m, dump_sk_uid_gid(m, skb->sk); /* Max length: 16 "MARK=0xFFFFFFFF " */ - if (!recurse && skb->mark) + if (recurse && skb->mark) sb_add(m, "MARK=0x%x ", skb->mark); } From a70b9641e6a90d6821e4354a2c2fede74015db29 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 29 May 2013 13:33:51 +0100 Subject: [PATCH 5/5] ipvs: ip_vs_sh: fix build kfree_rcu() requires offsetof(..., rcu_head) < 4096, which can get violated with a sufficiently high CONFIG_IP_VS_SH_TAB_BITS. Signed-off-by: Jan Beulich Signed-off-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_sh.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c index 0df269d7c99f..a65edfe4b16c 100644 --- a/net/netfilter/ipvs/ip_vs_sh.c +++ b/net/netfilter/ipvs/ip_vs_sh.c @@ -67,8 +67,8 @@ struct ip_vs_sh_bucket { #define IP_VS_SH_TAB_MASK (IP_VS_SH_TAB_SIZE - 1) struct ip_vs_sh_state { - struct ip_vs_sh_bucket buckets[IP_VS_SH_TAB_SIZE]; struct rcu_head rcu_head; + struct ip_vs_sh_bucket buckets[IP_VS_SH_TAB_SIZE]; }; /*