2756f68c31
This patch adds a new port attribute - IFLA_BRPORT_BACKUP_PORT, which allows to set a backup port to be used for known unicast traffic if the port has gone carrier down. The backup pointer is rcu protected and set only under RTNL, a counter is maintained so when deleting a port we know how many other ports reference it as a backup and we remove it from all. Also the pointer is in the first cache line which is hot at the time of the check and thus in the common case we only add one more test. The backup port will be used only for the non-flooding case since it's a part of the bridge and the flooded packets will be forwarded to it anyway. To remove the forwarding just send a 0/non-existing backup port. This is used to avoid numerous scalability problems when using MLAG most notably if we have thousands of fdbs one would need to change all of them on port carrier going down which takes too long and causes a storm of fdb notifications (and again when the port comes back up). In a Multi-chassis Link Aggregation setup usually hosts are connected to two different switches which act as a single logical switch. Those switches usually have a control and backup link between them called peerlink which might be used for communication in case a host loses connectivity to one of them. We need a fast way to failover in case a host port goes down and currently none of the solutions (like bond) cannot fulfill the requirements because the participating ports are actually the "master" devices and must have the same peerlink as their backup interface and at the same time all of them must participate in the bridge device. As Roopa noted it's normal practice in routing called fast re-route where a precalculated backup path is used when the main one is down. Another use case of this is with EVPN, having a single vxlan device which is backup of every port. Due to the nature of master devices it's not currently possible to use one device as a backup for many and still have all of them participate in the bridge (which is master itself). More detailed information about MLAG is available at the link below. https://docs.cumulusnetworks.com/display/DOCS/Multi-Chassis+Link+Aggregation+-+MLAG Further explanation and a diagram by Roopa: Two switches acting in a MLAG pair are connected by the peerlink interface which is a bridge port. the config on one of the switches looks like the below. The other switch also has a similar config. eth0 is connected to one port on the server. And the server is connected to both switches. br0 -- team0---eth0 | -- switch-peerlink Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net>
334 lines
7.6 KiB
C
334 lines
7.6 KiB
C
/*
|
|
* Forwarding decision
|
|
* Linux ethernet bridge
|
|
*
|
|
* Authors:
|
|
* Lennert Buytenhek <buytenh@gnu.org>
|
|
*
|
|
* This program is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU General Public License
|
|
* as published by the Free Software Foundation; either version
|
|
* 2 of the License, or (at your option) any later version.
|
|
*/
|
|
|
|
#include <linux/err.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/kernel.h>
|
|
#include <linux/netdevice.h>
|
|
#include <linux/netpoll.h>
|
|
#include <linux/skbuff.h>
|
|
#include <linux/if_vlan.h>
|
|
#include <linux/netfilter_bridge.h>
|
|
#include "br_private.h"
|
|
|
|
/* Don't forward packets to originating port or forwarding disabled */
|
|
static inline int should_deliver(const struct net_bridge_port *p,
|
|
const struct sk_buff *skb)
|
|
{
|
|
struct net_bridge_vlan_group *vg;
|
|
|
|
vg = nbp_vlan_group_rcu(p);
|
|
return ((p->flags & BR_HAIRPIN_MODE) || skb->dev != p->dev) &&
|
|
br_allowed_egress(vg, skb) && p->state == BR_STATE_FORWARDING &&
|
|
nbp_switchdev_allowed_egress(p, skb) &&
|
|
!br_skb_isolated(p, skb);
|
|
}
|
|
|
|
int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
|
|
{
|
|
if (!is_skb_forwardable(skb->dev, skb))
|
|
goto drop;
|
|
|
|
skb_push(skb, ETH_HLEN);
|
|
br_drop_fake_rtable(skb);
|
|
|
|
if (skb->ip_summed == CHECKSUM_PARTIAL &&
|
|
(skb->protocol == htons(ETH_P_8021Q) ||
|
|
skb->protocol == htons(ETH_P_8021AD))) {
|
|
int depth;
|
|
|
|
if (!__vlan_get_protocol(skb, skb->protocol, &depth))
|
|
goto drop;
|
|
|
|
skb_set_network_header(skb, depth);
|
|
}
|
|
|
|
dev_queue_xmit(skb);
|
|
|
|
return 0;
|
|
|
|
drop:
|
|
kfree_skb(skb);
|
|
return 0;
|
|
}
|
|
EXPORT_SYMBOL_GPL(br_dev_queue_push_xmit);
|
|
|
|
int br_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
|
|
{
|
|
return NF_HOOK(NFPROTO_BRIDGE, NF_BR_POST_ROUTING,
|
|
net, sk, skb, NULL, skb->dev,
|
|
br_dev_queue_push_xmit);
|
|
|
|
}
|
|
EXPORT_SYMBOL_GPL(br_forward_finish);
|
|
|
|
static void __br_forward(const struct net_bridge_port *to,
|
|
struct sk_buff *skb, bool local_orig)
|
|
{
|
|
struct net_bridge_vlan_group *vg;
|
|
struct net_device *indev;
|
|
struct net *net;
|
|
int br_hook;
|
|
|
|
vg = nbp_vlan_group_rcu(to);
|
|
skb = br_handle_vlan(to->br, to, vg, skb);
|
|
if (!skb)
|
|
return;
|
|
|
|
indev = skb->dev;
|
|
skb->dev = to->dev;
|
|
if (!local_orig) {
|
|
if (skb_warn_if_lro(skb)) {
|
|
kfree_skb(skb);
|
|
return;
|
|
}
|
|
br_hook = NF_BR_FORWARD;
|
|
skb_forward_csum(skb);
|
|
net = dev_net(indev);
|
|
} else {
|
|
if (unlikely(netpoll_tx_running(to->br->dev))) {
|
|
if (!is_skb_forwardable(skb->dev, skb)) {
|
|
kfree_skb(skb);
|
|
} else {
|
|
skb_push(skb, ETH_HLEN);
|
|
br_netpoll_send_skb(to, skb);
|
|
}
|
|
return;
|
|
}
|
|
br_hook = NF_BR_LOCAL_OUT;
|
|
net = dev_net(skb->dev);
|
|
indev = NULL;
|
|
}
|
|
|
|
NF_HOOK(NFPROTO_BRIDGE, br_hook,
|
|
net, NULL, skb, indev, skb->dev,
|
|
br_forward_finish);
|
|
}
|
|
|
|
static int deliver_clone(const struct net_bridge_port *prev,
|
|
struct sk_buff *skb, bool local_orig)
|
|
{
|
|
struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev;
|
|
|
|
skb = skb_clone(skb, GFP_ATOMIC);
|
|
if (!skb) {
|
|
dev->stats.tx_dropped++;
|
|
return -ENOMEM;
|
|
}
|
|
|
|
__br_forward(prev, skb, local_orig);
|
|
return 0;
|
|
}
|
|
|
|
/**
|
|
* br_forward - forward a packet to a specific port
|
|
* @to: destination port
|
|
* @skb: packet being forwarded
|
|
* @local_rcv: packet will be received locally after forwarding
|
|
* @local_orig: packet is locally originated
|
|
*
|
|
* Should be called with rcu_read_lock.
|
|
*/
|
|
void br_forward(const struct net_bridge_port *to,
|
|
struct sk_buff *skb, bool local_rcv, bool local_orig)
|
|
{
|
|
if (unlikely(!to))
|
|
goto out;
|
|
|
|
/* redirect to backup link if the destination port is down */
|
|
if (rcu_access_pointer(to->backup_port) && !netif_carrier_ok(to->dev)) {
|
|
struct net_bridge_port *backup_port;
|
|
|
|
backup_port = rcu_dereference(to->backup_port);
|
|
if (unlikely(!backup_port))
|
|
goto out;
|
|
to = backup_port;
|
|
}
|
|
|
|
if (should_deliver(to, skb)) {
|
|
if (local_rcv)
|
|
deliver_clone(to, skb, local_orig);
|
|
else
|
|
__br_forward(to, skb, local_orig);
|
|
return;
|
|
}
|
|
|
|
out:
|
|
if (!local_rcv)
|
|
kfree_skb(skb);
|
|
}
|
|
EXPORT_SYMBOL_GPL(br_forward);
|
|
|
|
static struct net_bridge_port *maybe_deliver(
|
|
struct net_bridge_port *prev, struct net_bridge_port *p,
|
|
struct sk_buff *skb, bool local_orig)
|
|
{
|
|
int err;
|
|
|
|
if (!should_deliver(p, skb))
|
|
return prev;
|
|
|
|
if (!prev)
|
|
goto out;
|
|
|
|
err = deliver_clone(prev, skb, local_orig);
|
|
if (err)
|
|
return ERR_PTR(err);
|
|
|
|
out:
|
|
return p;
|
|
}
|
|
|
|
/* called under rcu_read_lock */
|
|
void br_flood(struct net_bridge *br, struct sk_buff *skb,
|
|
enum br_pkt_type pkt_type, bool local_rcv, bool local_orig)
|
|
{
|
|
u8 igmp_type = br_multicast_igmp_type(skb);
|
|
struct net_bridge_port *prev = NULL;
|
|
struct net_bridge_port *p;
|
|
|
|
list_for_each_entry_rcu(p, &br->port_list, list) {
|
|
/* Do not flood unicast traffic to ports that turn it off, nor
|
|
* other traffic if flood off, except for traffic we originate
|
|
*/
|
|
switch (pkt_type) {
|
|
case BR_PKT_UNICAST:
|
|
if (!(p->flags & BR_FLOOD))
|
|
continue;
|
|
break;
|
|
case BR_PKT_MULTICAST:
|
|
if (!(p->flags & BR_MCAST_FLOOD) && skb->dev != br->dev)
|
|
continue;
|
|
break;
|
|
case BR_PKT_BROADCAST:
|
|
if (!(p->flags & BR_BCAST_FLOOD) && skb->dev != br->dev)
|
|
continue;
|
|
break;
|
|
}
|
|
|
|
/* Do not flood to ports that enable proxy ARP */
|
|
if (p->flags & BR_PROXYARP)
|
|
continue;
|
|
if ((p->flags & (BR_PROXYARP_WIFI | BR_NEIGH_SUPPRESS)) &&
|
|
BR_INPUT_SKB_CB(skb)->proxyarp_replied)
|
|
continue;
|
|
|
|
prev = maybe_deliver(prev, p, skb, local_orig);
|
|
if (IS_ERR(prev))
|
|
goto out;
|
|
if (prev == p)
|
|
br_multicast_count(p->br, p, skb, igmp_type,
|
|
BR_MCAST_DIR_TX);
|
|
}
|
|
|
|
if (!prev)
|
|
goto out;
|
|
|
|
if (local_rcv)
|
|
deliver_clone(prev, skb, local_orig);
|
|
else
|
|
__br_forward(prev, skb, local_orig);
|
|
return;
|
|
|
|
out:
|
|
if (!local_rcv)
|
|
kfree_skb(skb);
|
|
}
|
|
|
|
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
|
|
static void maybe_deliver_addr(struct net_bridge_port *p, struct sk_buff *skb,
|
|
const unsigned char *addr, bool local_orig)
|
|
{
|
|
struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev;
|
|
const unsigned char *src = eth_hdr(skb)->h_source;
|
|
|
|
if (!should_deliver(p, skb))
|
|
return;
|
|
|
|
/* Even with hairpin, no soliloquies - prevent breaking IPv6 DAD */
|
|
if (skb->dev == p->dev && ether_addr_equal(src, addr))
|
|
return;
|
|
|
|
skb = skb_copy(skb, GFP_ATOMIC);
|
|
if (!skb) {
|
|
dev->stats.tx_dropped++;
|
|
return;
|
|
}
|
|
|
|
if (!is_broadcast_ether_addr(addr))
|
|
memcpy(eth_hdr(skb)->h_dest, addr, ETH_ALEN);
|
|
|
|
__br_forward(p, skb, local_orig);
|
|
}
|
|
|
|
/* called with rcu_read_lock */
|
|
void br_multicast_flood(struct net_bridge_mdb_entry *mdst,
|
|
struct sk_buff *skb,
|
|
bool local_rcv, bool local_orig)
|
|
{
|
|
struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev;
|
|
u8 igmp_type = br_multicast_igmp_type(skb);
|
|
struct net_bridge *br = netdev_priv(dev);
|
|
struct net_bridge_port *prev = NULL;
|
|
struct net_bridge_port_group *p;
|
|
struct hlist_node *rp;
|
|
|
|
rp = rcu_dereference(hlist_first_rcu(&br->router_list));
|
|
p = mdst ? rcu_dereference(mdst->ports) : NULL;
|
|
while (p || rp) {
|
|
struct net_bridge_port *port, *lport, *rport;
|
|
|
|
lport = p ? p->port : NULL;
|
|
rport = hlist_entry_safe(rp, struct net_bridge_port, rlist);
|
|
|
|
if ((unsigned long)lport > (unsigned long)rport) {
|
|
port = lport;
|
|
|
|
if (port->flags & BR_MULTICAST_TO_UNICAST) {
|
|
maybe_deliver_addr(lport, skb, p->eth_addr,
|
|
local_orig);
|
|
goto delivered;
|
|
}
|
|
} else {
|
|
port = rport;
|
|
}
|
|
|
|
prev = maybe_deliver(prev, port, skb, local_orig);
|
|
delivered:
|
|
if (IS_ERR(prev))
|
|
goto out;
|
|
if (prev == port)
|
|
br_multicast_count(port->br, port, skb, igmp_type,
|
|
BR_MCAST_DIR_TX);
|
|
|
|
if ((unsigned long)lport >= (unsigned long)port)
|
|
p = rcu_dereference(p->next);
|
|
if ((unsigned long)rport >= (unsigned long)port)
|
|
rp = rcu_dereference(hlist_next_rcu(rp));
|
|
}
|
|
|
|
if (!prev)
|
|
goto out;
|
|
|
|
if (local_rcv)
|
|
deliver_clone(prev, skb, local_orig);
|
|
else
|
|
__br_forward(prev, skb, local_orig);
|
|
return;
|
|
|
|
out:
|
|
if (!local_rcv)
|
|
kfree_skb(skb);
|
|
}
|
|
#endif
|