ipv4: Add FIB nexthop exceptions.
In a regime where we have subnetted route entries, we need a way to keep persistent state about destination-specific learned values such as redirects and PMTU values. This is implemented here via nexthop exceptions. The initial implementation is a 2048-entry hash table with reclaiming starting at chain length 5. A more sophisticated scheme can be devised if that proves necessary. Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
parent
6700c2709c
commit
4895c771c7
@ -18,6 +18,7 @@
|
||||
|
||||
#include <net/flow.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/rcupdate.h>
|
||||
#include <net/fib_rules.h>
|
||||
#include <net/inetpeer.h>
|
||||
|
||||
@ -46,6 +47,22 @@ struct fib_config {
|
||||
|
||||
struct fib_info;
|
||||
|
||||
struct fib_nh_exception {
|
||||
struct fib_nh_exception __rcu *fnhe_next;
|
||||
__be32 fnhe_daddr;
|
||||
u32 fnhe_pmtu;
|
||||
u32 fnhe_gw;
|
||||
unsigned long fnhe_expires;
|
||||
unsigned long fnhe_stamp;
|
||||
};
|
||||
|
||||
struct fnhe_hash_bucket {
|
||||
struct fib_nh_exception __rcu *chain;
|
||||
};
|
||||
|
||||
/* Number of fnhe_hash_bucket slots allocated for a nexthop's exception table. */
#define FNHE_HASH_SIZE 2048
|
||||
/* Chain depth above which the oldest entry is recycled instead of allocating a new one. */
#define FNHE_RECLAIM_DEPTH 5
|
||||
|
||||
struct fib_nh {
|
||||
struct net_device *nh_dev;
|
||||
struct hlist_node nh_hash;
|
||||
@ -63,6 +80,7 @@ struct fib_nh {
|
||||
__be32 nh_gw;
|
||||
__be32 nh_saddr;
|
||||
int nh_saddr_genid;
|
||||
struct fnhe_hash_bucket *nh_exceptions;
|
||||
};
|
||||
|
||||
/*
|
||||
|
@ -140,6 +140,27 @@ const struct fib_prop fib_props[RTN_MAX + 1] = {
|
||||
},
|
||||
};
|
||||
|
||||
static void free_nh_exceptions(struct fib_nh *nh)
|
||||
{
|
||||
struct fnhe_hash_bucket *hash = nh->nh_exceptions;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < FNHE_HASH_SIZE; i++) {
|
||||
struct fib_nh_exception *fnhe;
|
||||
|
||||
fnhe = rcu_dereference(hash[i].chain);
|
||||
while (fnhe) {
|
||||
struct fib_nh_exception *next;
|
||||
|
||||
next = rcu_dereference(fnhe->fnhe_next);
|
||||
kfree(fnhe);
|
||||
|
||||
fnhe = next;
|
||||
}
|
||||
}
|
||||
kfree(hash);
|
||||
}
|
||||
|
||||
/* Release a nexthop info record */
|
||||
static void free_fib_info_rcu(struct rcu_head *head)
|
||||
{
|
||||
@ -148,6 +169,8 @@ static void free_fib_info_rcu(struct rcu_head *head)
|
||||
change_nexthops(fi) {
|
||||
if (nexthop_nh->nh_dev)
|
||||
dev_put(nexthop_nh->nh_dev);
|
||||
if (nexthop_nh->nh_exceptions)
|
||||
free_nh_exceptions(nexthop_nh);
|
||||
} endfor_nexthops(fi);
|
||||
|
||||
release_net(fi->fib_net);
|
||||
|
256
net/ipv4/route.c
256
net/ipv4/route.c
@ -1275,14 +1275,130 @@ static void rt_del(unsigned int hash, struct rtable *rt)
|
||||
spin_unlock_bh(rt_hash_lock_addr(hash));
|
||||
}
|
||||
|
||||
static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
|
||||
/*
 * Initialise @fl4 as an output flow using the source and destination
 * addresses found in @iph.
 *
 * When @sk is non-NULL, the socket's bound device, mark, TOS and protocol
 * replace the @oif, @mark, @tos and @prot arguments.
 */
static void __build_flow_key(struct flowi4 *fl4, struct sock *sk,
			     const struct iphdr *iph,
			     int oif, u8 tos,
			     u8 prot, u32 mark, int flow_flags)
{
	if (sk != NULL) {
		const struct inet_sock *inet = inet_sk(sk);

		if (inet->hdrincl)
			prot = IPPROTO_RAW;
		else
			prot = sk->sk_protocol;
		tos = RT_CONN_FLAGS(sk);
		mark = sk->sk_mark;
		oif = sk->sk_bound_dev_if;
	}

	flowi4_init_output(fl4, oif, mark, tos, RT_SCOPE_UNIVERSE, prot,
			   flow_flags, iph->daddr, iph->saddr, 0, 0);
}
|
||||
|
||||
static void build_skb_flow_key(struct flowi4 *fl4, struct sk_buff *skb, struct sock *sk)
|
||||
{
|
||||
const struct iphdr *iph = ip_hdr(skb);
|
||||
int oif = skb->dev->ifindex;
|
||||
u8 tos = RT_TOS(iph->tos);
|
||||
u8 prot = iph->protocol;
|
||||
u32 mark = skb->mark;
|
||||
|
||||
__build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
|
||||
}
|
||||
|
||||
static void build_sk_flow_key(struct flowi4 *fl4, struct sock *sk)
|
||||
{
|
||||
const struct inet_sock *inet = inet_sk(sk);
|
||||
struct ip_options_rcu *inet_opt;
|
||||
__be32 daddr = inet->inet_daddr;
|
||||
|
||||
rcu_read_lock();
|
||||
inet_opt = rcu_dereference(inet->inet_opt);
|
||||
if (inet_opt && inet_opt->opt.srr)
|
||||
daddr = inet_opt->opt.faddr;
|
||||
flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
|
||||
RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
|
||||
inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
|
||||
inet_sk_flowi_flags(sk),
|
||||
daddr, inet->inet_saddr, 0, 0);
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
/*
 * Build a flow key from the packet when one is supplied, otherwise from the
 * socket alone.
 */
static void ip_rt_build_flow_key(struct flowi4 *fl4, struct sock *sk,
				 struct sk_buff *skb)
{
	if (!skb) {
		build_sk_flow_key(fl4, sk);
		return;
	}

	build_skb_flow_key(fl4, skb, sk);
}
|
||||
|
||||
/*
 * Held (with spin_lock_bh) around find_or_create_fnhe() and the subsequent
 * update of the returned exception entry.
 */
static DEFINE_SPINLOCK(fnhe_lock);
|
||||
|
||||
/*
 * Return the entry with the oldest fnhe_stamp in the chain of bucket @hash.
 *
 * The chain head is dereferenced unconditionally, so the bucket must not be
 * empty.  @daddr is not used.
 */
static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash, __be32 daddr)
{
	struct fib_nh_exception *oldest = rcu_dereference(hash->chain);
	struct fib_nh_exception *cur = rcu_dereference(oldest->fnhe_next);

	while (cur) {
		/* time_before() keeps the comparison correct across jiffies wrap. */
		if (time_before(cur->fnhe_stamp, oldest->fnhe_stamp))
			oldest = cur;
		cur = rcu_dereference(cur->fnhe_next);
	}

	return oldest;
}
|
||||
|
||||
/*
 * Look up the exception entry for @daddr on nexthop @nh, creating it if it
 * does not exist.
 *
 * The hash table is allocated on first use.  When the target chain is
 * already deeper than FNHE_RECLAIM_DEPTH, the entry with the oldest stamp
 * is recycled for @daddr instead of allocating a new one.
 *
 * Returns the entry with fnhe_stamp refreshed, or NULL if an allocation
 * failed.  The visible callers hold fnhe_lock around this call and around
 * their update of the returned entry.
 */
static struct fib_nh_exception *find_or_create_fnhe(struct fib_nh *nh, __be32 daddr)
{
	struct fnhe_hash_bucket *hash = nh->nh_exceptions;
	struct fib_nh_exception *fnhe;
	int depth;
	u32 hval;

	if (!hash) {
		hash = nh->nh_exceptions = kzalloc(FNHE_HASH_SIZE * sizeof(*hash),
						   GFP_ATOMIC);
		if (!hash)
			return NULL;
	}

	hval = (__force u32) daddr;
	hval ^= (hval >> 11) ^ (hval >> 22);
	/*
	 * Reduce to a valid bucket index; without this the folded value can
	 * index far beyond the table.  Relies on FNHE_HASH_SIZE being a
	 * power of two.
	 */
	hval &= (FNHE_HASH_SIZE - 1);
	hash += hval;	/* from here on, hash points at the target bucket */

	depth = 0;
	for (fnhe = rcu_dereference(hash->chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			goto out;
		depth++;
	}

	if (depth > FNHE_RECLAIM_DEPTH) {
		/* hash already points at the bucket: do not offset it again. */
		fnhe = fnhe_oldest(hash, daddr);

		/*
		 * The recycled entry described another destination; drop
		 * its learned values so they are not applied to @daddr.
		 */
		fnhe->fnhe_pmtu = 0;
		fnhe->fnhe_gw = 0;
		fnhe->fnhe_expires = 0;
		goto out_daddr;
	}

	fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
	if (!fnhe)
		return NULL;

	/* Link the new entry at the head of the chain. */
	fnhe->fnhe_next = hash->chain;
	rcu_assign_pointer(hash->chain, fnhe);

out_daddr:
	fnhe->fnhe_daddr = daddr;
out:
	fnhe->fnhe_stamp = jiffies;
	return fnhe;
}
|
||||
|
||||
static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4)
|
||||
{
|
||||
__be32 new_gw = icmp_hdr(skb)->un.gateway;
|
||||
__be32 old_gw = ip_hdr(skb)->saddr;
|
||||
struct net_device *dev = skb->dev;
|
||||
struct in_device *in_dev;
|
||||
struct fib_result res;
|
||||
struct neighbour *n;
|
||||
struct rtable *rt;
|
||||
struct net *net;
|
||||
|
||||
switch (icmp_hdr(skb)->code & 7) {
|
||||
@ -1296,7 +1412,6 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf
|
||||
return;
|
||||
}
|
||||
|
||||
rt = (struct rtable *) dst;
|
||||
if (rt->rt_gateway != old_gw)
|
||||
return;
|
||||
|
||||
@ -1320,11 +1435,21 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf
|
||||
goto reject_redirect;
|
||||
}
|
||||
|
||||
n = ipv4_neigh_lookup(dst, NULL, &new_gw);
|
||||
n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
|
||||
if (n) {
|
||||
if (!(n->nud_state & NUD_VALID)) {
|
||||
neigh_event_send(n, NULL);
|
||||
} else {
|
||||
if (fib_lookup(net, fl4, &res) == 0) {
|
||||
struct fib_nh *nh = &FIB_RES_NH(res);
|
||||
struct fib_nh_exception *fnhe;
|
||||
|
||||
spin_lock_bh(&fnhe_lock);
|
||||
fnhe = find_or_create_fnhe(nh, fl4->daddr);
|
||||
if (fnhe)
|
||||
fnhe->fnhe_gw = new_gw;
|
||||
spin_unlock_bh(&fnhe_lock);
|
||||
}
|
||||
rt->rt_gateway = new_gw;
|
||||
rt->rt_flags |= RTCF_REDIRECTED;
|
||||
call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
|
||||
@ -1349,6 +1474,17 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf
|
||||
;
|
||||
}
|
||||
|
||||
static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
|
||||
{
|
||||
struct rtable *rt;
|
||||
struct flowi4 fl4;
|
||||
|
||||
rt = (struct rtable *) dst;
|
||||
|
||||
ip_rt_build_flow_key(&fl4, sk, skb);
|
||||
__ip_do_redirect(rt, skb, &fl4);
|
||||
}
|
||||
|
||||
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
|
||||
{
|
||||
struct rtable *rt = (struct rtable *)dst;
|
||||
@ -1508,33 +1644,51 @@ out: kfree_skb(skb);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
|
||||
struct sk_buff *skb, u32 mtu)
|
||||
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
|
||||
{
|
||||
struct rtable *rt = (struct rtable *) dst;
|
||||
|
||||
dst_confirm(dst);
|
||||
struct fib_result res;
|
||||
|
||||
if (mtu < ip_rt_min_pmtu)
|
||||
mtu = ip_rt_min_pmtu;
|
||||
|
||||
if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) {
|
||||
struct fib_nh *nh = &FIB_RES_NH(res);
|
||||
struct fib_nh_exception *fnhe;
|
||||
|
||||
spin_lock_bh(&fnhe_lock);
|
||||
fnhe = find_or_create_fnhe(nh, fl4->daddr);
|
||||
if (fnhe) {
|
||||
fnhe->fnhe_pmtu = mtu;
|
||||
fnhe->fnhe_expires = jiffies + ip_rt_mtu_expires;
|
||||
}
|
||||
spin_unlock_bh(&fnhe_lock);
|
||||
}
|
||||
rt->rt_pmtu = mtu;
|
||||
dst_set_expires(&rt->dst, ip_rt_mtu_expires);
|
||||
}
|
||||
|
||||
/*
 * PMTU update handler taking a dst_entry: builds a flow key from @skb (or
 * from @sk when @skb is NULL) and passes the route, cast from @dst, together
 * with @mtu to __ip_rt_update_pmtu().
 */
static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			      struct sk_buff *skb, u32 mtu)
{
	struct flowi4 fl4;

	ip_rt_build_flow_key(&fl4, sk, skb);
	__ip_rt_update_pmtu((struct rtable *) dst, &fl4, mtu);
}
|
||||
|
||||
void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
|
||||
int oif, u32 mark, u8 protocol, int flow_flags)
|
||||
{
|
||||
const struct iphdr *iph = (const struct iphdr *)skb->data;
|
||||
const struct iphdr *iph = (const struct iphdr *) skb->data;
|
||||
struct flowi4 fl4;
|
||||
struct rtable *rt;
|
||||
|
||||
flowi4_init_output(&fl4, oif, mark, RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
|
||||
protocol, flow_flags,
|
||||
iph->daddr, iph->saddr, 0, 0);
|
||||
__build_flow_key(&fl4, NULL, iph, oif,
|
||||
RT_TOS(iph->tos), protocol, mark, flow_flags);
|
||||
rt = __ip_route_output_key(net, &fl4);
|
||||
if (!IS_ERR(rt)) {
|
||||
ip_rt_update_pmtu(&rt->dst, NULL, skb, mtu);
|
||||
__ip_rt_update_pmtu(rt, &fl4, mtu);
|
||||
ip_rt_put(rt);
|
||||
}
|
||||
}
|
||||
@ -1542,27 +1696,31 @@ EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
|
||||
|
||||
void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
|
||||
{
|
||||
const struct inet_sock *inet = inet_sk(sk);
|
||||
const struct iphdr *iph = (const struct iphdr *) skb->data;
|
||||
struct flowi4 fl4;
|
||||
struct rtable *rt;
|
||||
|
||||
return ipv4_update_pmtu(skb, sock_net(sk), mtu,
|
||||
sk->sk_bound_dev_if, sk->sk_mark,
|
||||
inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
|
||||
inet_sk_flowi_flags(sk));
|
||||
__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
|
||||
rt = __ip_route_output_key(sock_net(sk), &fl4);
|
||||
if (!IS_ERR(rt)) {
|
||||
__ip_rt_update_pmtu(rt, &fl4, mtu);
|
||||
ip_rt_put(rt);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
|
||||
|
||||
void ipv4_redirect(struct sk_buff *skb, struct net *net,
|
||||
int oif, u32 mark, u8 protocol, int flow_flags)
|
||||
{
|
||||
const struct iphdr *iph = (const struct iphdr *)skb->data;
|
||||
const struct iphdr *iph = (const struct iphdr *) skb->data;
|
||||
struct flowi4 fl4;
|
||||
struct rtable *rt;
|
||||
|
||||
flowi4_init_output(&fl4, oif, mark, RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
|
||||
protocol, flow_flags, iph->daddr, iph->saddr, 0, 0);
|
||||
__build_flow_key(&fl4, NULL, iph, oif,
|
||||
RT_TOS(iph->tos), protocol, mark, flow_flags);
|
||||
rt = __ip_route_output_key(net, &fl4);
|
||||
if (!IS_ERR(rt)) {
|
||||
ip_do_redirect(&rt->dst, NULL, skb);
|
||||
__ip_do_redirect(rt, skb, &fl4);
|
||||
ip_rt_put(rt);
|
||||
}
|
||||
}
|
||||
@ -1570,12 +1728,16 @@ EXPORT_SYMBOL_GPL(ipv4_redirect);
|
||||
|
||||
void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
|
||||
{
|
||||
const struct inet_sock *inet = inet_sk(sk);
|
||||
const struct iphdr *iph = (const struct iphdr *) skb->data;
|
||||
struct flowi4 fl4;
|
||||
struct rtable *rt;
|
||||
|
||||
return ipv4_redirect(skb, sock_net(sk), sk->sk_bound_dev_if,
|
||||
sk->sk_mark,
|
||||
inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
|
||||
inet_sk_flowi_flags(sk));
|
||||
__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
|
||||
rt = __ip_route_output_key(sock_net(sk), &fl4);
|
||||
if (!IS_ERR(rt)) {
|
||||
__ip_do_redirect(rt, skb, &fl4);
|
||||
ip_rt_put(rt);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
|
||||
|
||||
@ -1722,14 +1884,46 @@ static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
|
||||
dst_init_metrics(&rt->dst, fi->fib_metrics, true);
|
||||
}
|
||||
|
||||
/*
 * Apply any exception recorded on @nh for @daddr to the route @rt.
 *
 * If a matching entry exists: an unexpired path MTU is copied into
 * rt->rt_pmtu and the route's expiry is set to the entry's remaining
 * lifetime, a recorded gateway replaces rt->rt_gateway, and the entry's
 * stamp is refreshed.
 *
 * @nh->nh_exceptions is indexed unconditionally, so the caller must have
 * checked that it is non-NULL.
 */
static void rt_bind_exception(struct rtable *rt, struct fib_nh *nh, __be32 daddr)
{
	struct fnhe_hash_bucket *hash = nh->nh_exceptions;
	struct fib_nh_exception *fnhe;
	u32 hval;

	hval = (__force u32) daddr;
	hval ^= (hval >> 11) ^ (hval >> 22);
	/*
	 * Reduce to a valid bucket index; the table has only FNHE_HASH_SIZE
	 * (a power of two) buckets.
	 */
	hval &= (FNHE_HASH_SIZE - 1);

	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr != daddr)
			continue;

		if (fnhe->fnhe_pmtu) {
			unsigned long expires = fnhe->fnhe_expires;
			/* Sample jiffies once so the test and the delta agree. */
			unsigned long now = jiffies;

			if (time_before(now, expires)) {
				rt->rt_pmtu = fnhe->fnhe_pmtu;
				/*
				 * dst_set_expires() is given a timeout relative
				 * to now, i.e. the time left until expiry.
				 */
				dst_set_expires(&rt->dst, expires - now);
			}
		}
		if (fnhe->fnhe_gw)
			rt->rt_gateway = fnhe->fnhe_gw;
		fnhe->fnhe_stamp = jiffies;
		break;
	}
}
|
||||
|
||||
static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
|
||||
const struct fib_result *res,
|
||||
struct fib_info *fi, u16 type, u32 itag)
|
||||
{
|
||||
if (fi) {
|
||||
if (FIB_RES_GW(*res) &&
|
||||
FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
|
||||
rt->rt_gateway = FIB_RES_GW(*res);
|
||||
struct fib_nh *nh = &FIB_RES_NH(*res);
|
||||
|
||||
if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
|
||||
rt->rt_gateway = nh->nh_gw;
|
||||
if (unlikely(nh->nh_exceptions))
|
||||
rt_bind_exception(rt, nh, fl4->daddr);
|
||||
rt_init_metrics(rt, fl4, fi);
|
||||
#ifdef CONFIG_IP_ROUTE_CLASSID
|
||||
rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
|
||||
|
Loading…
Reference in New Issue
Block a user