From 0d98da5d845e0d0293055913ce65c9904b3b902a Mon Sep 17 00:00:00 2001 From: Gao feng Date: Thu, 7 Mar 2013 17:20:46 +0000 Subject: [PATCH 1/7] netfilter: nf_conntrack: register pernet subsystem before register L4 proto In (c296bb4 netfilter: nf_conntrack: refactor l4proto support for netns) the l4proto gre/dccp/udplite/sctp registration happened before the pernet subsystem, which is wrong. Register pernet subsystem before register L4proto since after register L4proto, init_conntrack may try to access the resources which allocated in register_pernet_subsys. Reported-by: Alexey Dobriyan Cc: Alexey Dobriyan Signed-off-by: Gao feng Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_proto_dccp.c | 12 ++++++------ net/netfilter/nf_conntrack_proto_gre.c | 12 ++++++------ net/netfilter/nf_conntrack_proto_sctp.c | 12 ++++++------ net/netfilter/nf_conntrack_proto_udplite.c | 12 ++++++------ 4 files changed, 24 insertions(+), 24 deletions(-) diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index 432f95780003..ba65b2041eb4 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -969,6 +969,10 @@ static int __init nf_conntrack_proto_dccp_init(void) { int ret; + ret = register_pernet_subsys(&dccp_net_ops); + if (ret < 0) + goto out_pernet; + ret = nf_ct_l4proto_register(&dccp_proto4); if (ret < 0) goto out_dccp4; @@ -977,16 +981,12 @@ static int __init nf_conntrack_proto_dccp_init(void) if (ret < 0) goto out_dccp6; - ret = register_pernet_subsys(&dccp_net_ops); - if (ret < 0) - goto out_pernet; - return 0; -out_pernet: - nf_ct_l4proto_unregister(&dccp_proto6); out_dccp6: nf_ct_l4proto_unregister(&dccp_proto4); out_dccp4: + unregister_pernet_subsys(&dccp_net_ops); +out_pernet: return ret; } diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index bd7d01d9c7e7..155ce9f8a0db 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -420,18 +420,18 @@ static int __init nf_ct_proto_gre_init(void) { int ret; - ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_gre4); - if (ret < 0) - goto out_gre4; - ret = register_pernet_subsys(&proto_gre_net_ops); if (ret < 0) goto out_pernet; + ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_gre4); + if (ret < 0) + goto out_gre4; + return 0; -out_pernet: - nf_ct_l4proto_unregister(&nf_conntrack_l4proto_gre4); out_gre4: + unregister_pernet_subsys(&proto_gre_net_ops); +out_pernet: return ret; } diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 480f616d5936..ec83536def9a 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -888,6 +888,10 @@ static int __init nf_conntrack_proto_sctp_init(void) { int ret; + ret = register_pernet_subsys(&sctp_net_ops); + if (ret < 0) + goto out_pernet; + ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_sctp4); if (ret < 0) goto out_sctp4; @@ -896,16 +900,12 @@ static int __init nf_conntrack_proto_sctp_init(void) if (ret < 0) goto out_sctp6; - ret = register_pernet_subsys(&sctp_net_ops); - if (ret < 0) - goto out_pernet; - return 0; -out_pernet: - nf_ct_l4proto_unregister(&nf_conntrack_l4proto_sctp6); out_sctp6: nf_ct_l4proto_unregister(&nf_conntrack_l4proto_sctp4); out_sctp4: + unregister_pernet_subsys(&sctp_net_ops); +out_pernet: return ret; } diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c index 157489581c31..ca969f6273f7 100644 --- a/net/netfilter/nf_conntrack_proto_udplite.c +++ b/net/netfilter/nf_conntrack_proto_udplite.c @@ -371,6 +371,10 @@ static int __init nf_conntrack_proto_udplite_init(void) { int ret; + ret = register_pernet_subsys(&udplite_net_ops); + if (ret < 0) + goto out_pernet; + ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_udplite4); if (ret < 0) goto out_udplite4; @@ -379,16 +383,12 @@ static int __init nf_conntrack_proto_udplite_init(void) if (ret < 0) goto out_udplite6; - ret = register_pernet_subsys(&udplite_net_ops); - if (ret < 0) - goto out_pernet; - return 0; -out_pernet: - nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udplite6); out_udplite6: nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udplite4); out_udplite4: + unregister_pernet_subsys(&udplite_net_ops); +out_pernet: return ret; } From bae99f7a1d372374aaf9ed8910f3b825da995b36 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 14 Mar 2013 06:03:18 +0000 Subject: [PATCH 2/7] netfilter: nfnetlink_queue: fix incorrect initialization of copy range field 2^16 = 0xffff, not 0xfffff (note the extra 'f'). Not dangerous since you adjust it to min_t(data_len, skb->len) just after on. Reported-by: Eric Dumazet Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_queue_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c index 858fd52c1040..1cb48540f86a 100644 --- a/net/netfilter/nfnetlink_queue_core.c +++ b/net/netfilter/nfnetlink_queue_core.c @@ -112,7 +112,7 @@ instance_create(u_int16_t queue_num, int portid) inst->queue_num = queue_num; inst->peer_portid = portid; inst->queue_maxlen = NFQNL_QMAX_DEFAULT; - inst->copy_range = 0xfffff; + inst->copy_range = 0xffff; inst->copy_mode = NFQNL_COPY_NONE; spin_lock_init(&inst->lock); INIT_LIST_HEAD(&inst->queue_list); From a82783c91d5dce680dbd290ebf301a520b0e72a5 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 11 Mar 2013 20:11:01 +0000 Subject: [PATCH 3/7] netfilter: ip6t_NPT: restrict to mangle table As the translation is stateless, using it in nat table doesn't work (only initial packet is translated). filter table OUTPUT works but won't re-route the packet after translation. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv6/netfilter/ip6t_NPT.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv6/netfilter/ip6t_NPT.c b/net/ipv6/netfilter/ip6t_NPT.c index 83acc1405a18..33608c610276 100644 --- a/net/ipv6/netfilter/ip6t_NPT.c +++ b/net/ipv6/netfilter/ip6t_NPT.c @@ -114,6 +114,7 @@ ip6t_dnpt_tg(struct sk_buff *skb, const struct xt_action_param *par) static struct xt_target ip6t_npt_target_reg[] __read_mostly = { { .name = "SNPT", + .table = "mangle", .target = ip6t_snpt_tg, .targetsize = sizeof(struct ip6t_npt_tginfo), .checkentry = ip6t_npt_checkentry, @@ -124,6 +125,7 @@ static struct xt_target ip6t_npt_target_reg[] __read_mostly = { }, { .name = "DNPT", + .table = "mangle", .target = ip6t_dnpt_tg, .targetsize = sizeof(struct ip6t_npt_tginfo), .checkentry = ip6t_npt_checkentry, From cf2e39429c245245db889fffdfbdf3f889a6cb22 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sat, 9 Mar 2013 23:25:06 +0200 Subject: [PATCH 4/7] ipvs: fix sctp chunk length order Fix wrong but non-fatal access to chunk length. sch->length should be in network order, next chunk should be aligned to 4 bytes. Problem noticed in sparse output. Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_proto_sctp.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index ae8ec6f27688..cd1d7298f7ba 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -906,7 +906,7 @@ set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, sctp_chunkhdr_t _sctpch, *sch; unsigned char chunk_type; int event, next_state; - int ihl; + int ihl, cofs; #ifdef CONFIG_IP_VS_IPV6 ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr); @@ -914,8 +914,8 @@ set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, ihl = ip_hdrlen(skb); #endif - sch = skb_header_pointer(skb, ihl + sizeof(sctp_sctphdr_t), - sizeof(_sctpch), &_sctpch); + cofs = ihl + sizeof(sctp_sctphdr_t); + sch = skb_header_pointer(skb, cofs, sizeof(_sctpch), &_sctpch); if (sch == NULL) return; @@ -933,10 +933,12 @@ set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, */ if ((sch->type == SCTP_CID_COOKIE_ECHO) || (sch->type == SCTP_CID_COOKIE_ACK)) { - sch = skb_header_pointer(skb, (ihl + sizeof(sctp_sctphdr_t) + - sch->length), sizeof(_sctpch), &_sctpch); - if (sch) { - if (sch->type == SCTP_CID_ABORT) + int clen = ntohs(sch->length); + + if (clen >= sizeof(sctp_chunkhdr_t)) { + sch = skb_header_pointer(skb, cofs + ALIGN(clen, 4), + sizeof(_sctpch), &_sctpch); + if (sch && sch->type == SCTP_CID_ABORT) chunk_type = sch->type; } } From 0c12582fbcdea0cbb0dfd224e1c5f9a8428ffa18 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sat, 9 Mar 2013 23:25:04 +0200 Subject: [PATCH 5/7] ipvs: add backup_only flag to avoid loops Dmitry Akindinov is reporting for a problem where SYNs are looping between the master and backup server when the backup server is used as real server in DR mode and has IPVS rules to function as director. Even when the backup function is enabled we continue to forward traffic and schedule new connections when the current master is using the backup server as real server. While this is not a problem for NAT, for DR and TUN method the backup server can not determine if a request comes from client or from director. To avoid such loops add new sysctl flag backup_only. It can be needed for DR/TUN setups that do not need backup and director function at the same time. When the backup function is enabled we stop any forwarding and pass the traffic to the local stack (real server mode). The flag disables the director function when the backup function is enabled. For setups that enable backup function for some virtual services and director function for other virtual services there should be another more complex solution to support DR/TUN mode, may be to assign per-virtual service syncid value, so that we can differentiate the requests. Reported-by: Dmitry Akindinov Tested-by: German Myzovsky Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- Documentation/networking/ipvs-sysctl.txt | 7 +++++++ include/net/ip_vs.h | 12 ++++++++++++ net/netfilter/ipvs/ip_vs_core.c | 12 ++++++++---- net/netfilter/ipvs/ip_vs_ctl.c | 7 +++++++ 4 files changed, 34 insertions(+), 4 deletions(-) diff --git a/Documentation/networking/ipvs-sysctl.txt b/Documentation/networking/ipvs-sysctl.txt index f2a2488f1bf3..9573d0c48c6e 100644 --- a/Documentation/networking/ipvs-sysctl.txt +++ b/Documentation/networking/ipvs-sysctl.txt @@ -15,6 +15,13 @@ amemthresh - INTEGER enabled and the variable is automatically set to 2, otherwise the strategy is disabled and the variable is set to 1. +backup_only - BOOLEAN + 0 - disabled (default) + not 0 - enabled + + If set, disable the director function while the server is + in backup mode to avoid packet loops for DR/TUN methods. + conntrack - BOOLEAN 0 - disabled (default) not 0 - enabled diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 68c69d54d392..fce8e6b66d55 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -976,6 +976,7 @@ struct netns_ipvs { int sysctl_sync_retries; int sysctl_nat_icmp_send; int sysctl_pmtu_disc; + int sysctl_backup_only; /* ip_vs_lblc */ int sysctl_lblc_expiration; @@ -1067,6 +1068,12 @@ static inline int sysctl_pmtu_disc(struct netns_ipvs *ipvs) return ipvs->sysctl_pmtu_disc; } +static inline int sysctl_backup_only(struct netns_ipvs *ipvs) +{ + return ipvs->sync_state & IP_VS_STATE_BACKUP && + ipvs->sysctl_backup_only; +} + #else static inline int sysctl_sync_threshold(struct netns_ipvs *ipvs) @@ -1114,6 +1121,11 @@ static inline int sysctl_pmtu_disc(struct netns_ipvs *ipvs) return 1; } +static inline int sysctl_backup_only(struct netns_ipvs *ipvs) +{ + return 0; +} + #endif /* diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 47edf5a40a59..18b4bc55fa3d 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1577,7 +1577,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) } /* ipvs enabled in this netns ? */ net = skb_net(skb); - if (!net_ipvs(net)->enable) + ipvs = net_ipvs(net); + if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) return NF_ACCEPT; ip_vs_fill_iph_skb(af, skb, &iph); @@ -1654,7 +1655,6 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) } IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet"); - ipvs = net_ipvs(net); /* Check the server status */ if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { /* the destination server is not available */ @@ -1815,13 +1815,15 @@ ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb, { int r; struct net *net; + struct netns_ipvs *ipvs; if (ip_hdr(skb)->protocol != IPPROTO_ICMP) return NF_ACCEPT; /* ipvs enabled in this netns ? */ net = skb_net(skb); - if (!net_ipvs(net)->enable) + ipvs = net_ipvs(net); + if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) return NF_ACCEPT; return ip_vs_in_icmp(skb, &r, hooknum); @@ -1835,6 +1837,7 @@ ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb, { int r; struct net *net; + struct netns_ipvs *ipvs; struct ip_vs_iphdr iphdr; ip_vs_fill_iph_skb(AF_INET6, skb, &iphdr); @@ -1843,7 +1846,8 @@ ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb, /* ipvs enabled in this netns ? */ net = skb_net(skb); - if (!net_ipvs(net)->enable) + ipvs = net_ipvs(net); + if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) return NF_ACCEPT; return ip_vs_in_icmp_v6(skb, &r, hooknum, &iphdr); diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index c68198bf9128..9e2d1cccd1eb 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1808,6 +1808,12 @@ static struct ctl_table vs_vars[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "backup_only", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #ifdef CONFIG_IP_VS_DEBUG { .procname = "debug_level", @@ -3741,6 +3747,7 @@ static int __net_init ip_vs_control_net_init_sysctl(struct net *net) tbl[idx++].data = &ipvs->sysctl_nat_icmp_send; ipvs->sysctl_pmtu_disc = 1; tbl[idx++].data = &ipvs->sysctl_pmtu_disc; + tbl[idx++].data = &ipvs->sysctl_backup_only; ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl); From bf93ad72cd8cfabe66a7b3d66236a1266d357189 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sat, 9 Mar 2013 23:25:05 +0200 Subject: [PATCH 6/7] ipvs: remove extra rcu lock In 3.7 we added code that uses ipv4_update_pmtu but after commit c5ae7d4192 (ipv4: must use rcu protection while calling fib_lookup) the RCU lock is not needed. Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_core.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 18b4bc55fa3d..61f49d241712 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1394,10 +1394,8 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) skb_reset_network_header(skb); IP_VS_DBG(12, "ICMP for IPIP %pI4->%pI4: mtu=%u\n", &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, mtu); - rcu_read_lock(); ipv4_update_pmtu(skb, dev_net(skb->dev), mtu, 0, 0, 0, 0); - rcu_read_unlock(); /* Client uses PMTUD? */ if (!(cih->frag_off & htons(IP_DF))) goto ignore_ipip; From 3dd6664fac7e6041bfc8756ae9e8c78f59108cd9 Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Tue, 19 Mar 2013 13:09:59 +0000 Subject: [PATCH 7/7] netfilter: remove unused "config IP_NF_QUEUE" Kconfig symbol IP_NF_QUEUE is unused since commit d16cf20e2f2f13411eece7f7fb72c17d141c4a84 ("netfilter: remove ip_queue support"). Let's remove it too. Signed-off-by: Paul Bolle Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/Kconfig | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index ce2d43e1f09f..0d755c50994b 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -36,19 +36,6 @@ config NF_CONNTRACK_PROC_COMPAT If unsure, say Y. -config IP_NF_QUEUE - tristate "IP Userspace queueing via NETLINK (OBSOLETE)" - depends on NETFILTER_ADVANCED - help - Netfilter has the ability to queue packets to user space: the - netlink device can be used to access them using this driver. - - This option enables the old IPv4-only "ip_queue" implementation - which has been obsoleted by the new "nfnetlink_queue" code (see - CONFIG_NETFILTER_NETLINK_QUEUE). - - To compile it as a module, choose M here. If unsure, say N. - config IP_NF_IPTABLES tristate "IP tables support (required for filtering/masq/NAT)" default m if NETFILTER_ADVANCED=n