Revision 78c906e430b13d30a8cfbdef4ccbbe1686841a9e authored by Vlad Buslov on 31 August 2020, 13:17:29 UTC, committed by Saeed Mahameed on 05 November 2020, 20:17:05 UTC
In functions mlx5e_route_lookup_ipv{4|6}() route_dev can be arbitrary net
device and not necessary mlx5 eswitch port representor. As such, in order
to ensure that route_dev is not destroyed concurrent the code needs either
explicitly take reference to the device before releasing reference to
rtable instance or ensure that caller holds rtnl lock. First approach is
chosen as a fix since rtnl lock dependency was intentionally removed from
mlx5 TC layer.

To prevent unprotected usage of route_dev in encap code take a reference to
the device before releasing rt. Don't save direct pointer to the device in
mlx5_encap_entry structure and use ifindex instead. Modify users of
route_dev pointer to properly obtain the net device instance from its
ifindex.

Fixes: 61086f391044 ("net/mlx5e: Protect encap hash table with mutex")
Fixes: 6707f74be862 ("net/mlx5e: Update hw flows when encap source mac changed")
Signed-off-by: Vlad Buslov <vladbu@nvidia.com>
Reviewed-by: Roi Dayan <roid@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
1 parent e68e28b
Raw File
dst.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * net/core/dst.c	Protocol independent destination cache.
 *
 * Authors:		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 */

#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/workqueue.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/string.h>
#include <linux/types.h>
#include <net/net_namespace.h>
#include <linux/sched.h>
#include <linux/prefetch.h>
#include <net/lwtunnel.h>
#include <net/xfrm.h>

#include <net/dst.h>
#include <net/dst_metadata.h>

int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL(dst_discard_out);

const struct dst_metrics dst_default_metrics = {
	/* This initializer is needed to force linker to place this variable
	 * into const section. Otherwise it might end into bss section.
	 * We really want to avoid false sharing on this variable, and catch
	 * any writes on it.
	 */
	.refcnt = REFCOUNT_INIT(1),
};
EXPORT_SYMBOL(dst_default_metrics);

void dst_init(struct dst_entry *dst, struct dst_ops *ops,
	      struct net_device *dev, int initial_ref, int initial_obsolete,
	      unsigned short flags)
{
	dst->dev = dev;
	if (dev)
		dev_hold(dev);
	dst->ops = ops;
	dst_init_metrics(dst, dst_default_metrics.metrics, true);
	dst->expires = 0UL;
#ifdef CONFIG_XFRM
	dst->xfrm = NULL;
#endif
	dst->input = dst_discard;
	dst->output = dst_discard_out;
	dst->error = 0;
	dst->obsolete = initial_obsolete;
	dst->header_len = 0;
	dst->trailer_len = 0;
#ifdef CONFIG_IP_ROUTE_CLASSID
	dst->tclassid = 0;
#endif
	dst->lwtstate = NULL;
	atomic_set(&dst->__refcnt, initial_ref);
	dst->__use = 0;
	dst->lastuse = jiffies;
	dst->flags = flags;
	if (!(flags & DST_NOCOUNT))
		dst_entries_add(ops, 1);
}
EXPORT_SYMBOL(dst_init);

void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
		int initial_ref, int initial_obsolete, unsigned short flags)
{
	struct dst_entry *dst;

	if (ops->gc &&
	    !(flags & DST_NOCOUNT) &&
	    dst_entries_get_fast(ops) > ops->gc_thresh) {
		if (ops->gc(ops)) {
			pr_notice_ratelimited("Route cache is full: consider increasing sysctl net.ipv6.route.max_size.\n");
			return NULL;
		}
	}

	dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC);
	if (!dst)
		return NULL;

	dst_init(dst, ops, dev, initial_ref, initial_obsolete, flags);

	return dst;
}
EXPORT_SYMBOL(dst_alloc);

struct dst_entry *dst_destroy(struct dst_entry * dst)
{
	struct dst_entry *child = NULL;

	smp_rmb();

#ifdef CONFIG_XFRM
	if (dst->xfrm) {
		struct xfrm_dst *xdst = (struct xfrm_dst *) dst;

		child = xdst->child;
	}
#endif
	if (!(dst->flags & DST_NOCOUNT))
		dst_entries_add(dst->ops, -1);

	if (dst->ops->destroy)
		dst->ops->destroy(dst);
	if (dst->dev)
		dev_put(dst->dev);

	lwtstate_put(dst->lwtstate);

	if (dst->flags & DST_METADATA)
		metadata_dst_free((struct metadata_dst *)dst);
	else
		kmem_cache_free(dst->ops->kmem_cachep, dst);

	dst = child;
	if (dst)
		dst_release_immediate(dst);
	return NULL;
}
EXPORT_SYMBOL(dst_destroy);

static void dst_destroy_rcu(struct rcu_head *head)
{
	struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head);

	dst = dst_destroy(dst);
}

/* Operations to mark dst as DEAD and clean up the net device referenced
 * by dst:
 * 1. put the dst under blackhole interface and discard all tx/rx packets
 *    on this route.
 * 2. release the net_device
 * This function should be called when removing routes from the fib tree
 * in preparation for a NETDEV_DOWN/NETDEV_UNREGISTER event and also to
 * make the next dst_ops->check() fail.
 */
void dst_dev_put(struct dst_entry *dst)
{
	struct net_device *dev = dst->dev;

	dst->obsolete = DST_OBSOLETE_DEAD;
	if (dst->ops->ifdown)
		dst->ops->ifdown(dst, dev, true);
	dst->input = dst_discard;
	dst->output = dst_discard_out;
	dst->dev = blackhole_netdev;
	dev_hold(dst->dev);
	dev_put(dev);
}
EXPORT_SYMBOL(dst_dev_put);

void dst_release(struct dst_entry *dst)
{
	if (dst) {
		int newrefcnt;

		newrefcnt = atomic_dec_return(&dst->__refcnt);
		if (WARN_ONCE(newrefcnt < 0, "dst_release underflow"))
			net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
					     __func__, dst, newrefcnt);
		if (!newrefcnt)
			call_rcu(&dst->rcu_head, dst_destroy_rcu);
	}
}
EXPORT_SYMBOL(dst_release);

void dst_release_immediate(struct dst_entry *dst)
{
	if (dst) {
		int newrefcnt;

		newrefcnt = atomic_dec_return(&dst->__refcnt);
		if (WARN_ONCE(newrefcnt < 0, "dst_release_immediate underflow"))
			net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
					     __func__, dst, newrefcnt);
		if (!newrefcnt)
			dst_destroy(dst);
	}
}
EXPORT_SYMBOL(dst_release_immediate);

u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old)
{
	struct dst_metrics *p = kmalloc(sizeof(*p), GFP_ATOMIC);

	if (p) {
		struct dst_metrics *old_p = (struct dst_metrics *)__DST_METRICS_PTR(old);
		unsigned long prev, new;

		refcount_set(&p->refcnt, 1);
		memcpy(p->metrics, old_p->metrics, sizeof(p->metrics));

		new = (unsigned long) p;
		prev = cmpxchg(&dst->_metrics, old, new);

		if (prev != old) {
			kfree(p);
			p = (struct dst_metrics *)__DST_METRICS_PTR(prev);
			if (prev & DST_METRICS_READ_ONLY)
				p = NULL;
		} else if (prev & DST_METRICS_REFCOUNTED) {
			if (refcount_dec_and_test(&old_p->refcnt))
				kfree(old_p);
		}
	}
	BUILD_BUG_ON(offsetof(struct dst_metrics, metrics) != 0);
	return (u32 *)p;
}
EXPORT_SYMBOL(dst_cow_metrics_generic);

/* Caller asserts that dst_metrics_read_only(dst) is false.  */
void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old)
{
	unsigned long prev, new;

	new = ((unsigned long) &dst_default_metrics) | DST_METRICS_READ_ONLY;
	prev = cmpxchg(&dst->_metrics, old, new);
	if (prev == old)
		kfree(__DST_METRICS_PTR(old));
}
EXPORT_SYMBOL(__dst_destroy_metrics_generic);

static struct dst_ops md_dst_ops = {
	.family =		AF_UNSPEC,
};

static int dst_md_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	WARN_ONCE(1, "Attempting to call output on metadata dst\n");
	kfree_skb(skb);
	return 0;
}

static int dst_md_discard(struct sk_buff *skb)
{
	WARN_ONCE(1, "Attempting to call input on metadata dst\n");
	kfree_skb(skb);
	return 0;
}

static void __metadata_dst_init(struct metadata_dst *md_dst,
				enum metadata_type type, u8 optslen)

{
	struct dst_entry *dst;

	dst = &md_dst->dst;
	dst_init(dst, &md_dst_ops, NULL, 1, DST_OBSOLETE_NONE,
		 DST_METADATA | DST_NOCOUNT);

	dst->input = dst_md_discard;
	dst->output = dst_md_discard_out;

	memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst));
	md_dst->type = type;
}

struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type,
					gfp_t flags)
{
	struct metadata_dst *md_dst;

	md_dst = kmalloc(sizeof(*md_dst) + optslen, flags);
	if (!md_dst)
		return NULL;

	__metadata_dst_init(md_dst, type, optslen);

	return md_dst;
}
EXPORT_SYMBOL_GPL(metadata_dst_alloc);

void metadata_dst_free(struct metadata_dst *md_dst)
{
#ifdef CONFIG_DST_CACHE
	if (md_dst->type == METADATA_IP_TUNNEL)
		dst_cache_destroy(&md_dst->u.tun_info.dst_cache);
#endif
	kfree(md_dst);
}
EXPORT_SYMBOL_GPL(metadata_dst_free);

struct metadata_dst __percpu *
metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags)
{
	int cpu;
	struct metadata_dst __percpu *md_dst;

	md_dst = __alloc_percpu_gfp(sizeof(struct metadata_dst) + optslen,
				    __alignof__(struct metadata_dst), flags);
	if (!md_dst)
		return NULL;

	for_each_possible_cpu(cpu)
		__metadata_dst_init(per_cpu_ptr(md_dst, cpu), type, optslen);

	return md_dst;
}
EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu);

void metadata_dst_free_percpu(struct metadata_dst __percpu *md_dst)
{
#ifdef CONFIG_DST_CACHE
	int cpu;

	for_each_possible_cpu(cpu) {
		struct metadata_dst *one_md_dst = per_cpu_ptr(md_dst, cpu);

		if (one_md_dst->type == METADATA_IP_TUNNEL)
			dst_cache_destroy(&one_md_dst->u.tun_info.dst_cache);
	}
#endif
	free_percpu(md_dst);
}
EXPORT_SYMBOL_GPL(metadata_dst_free_percpu);
back to top