Revision cfdd28beb3205dbd1e91571516807199c8ab84ca authored by Nikolay Aleksandrov on 17 February 2016, 17:00:31 UTC, committed by David S. Miller on 19 February 2016, 20:29:26 UTC
When I used netdev_for_each_lower_dev in commit bad531623253 ("vrf:
remove slave queue and private slave struct") I thought that it acts
like netdev_for_each_lower_private and can be used to remove the current
device from the list while walking, but unfortunately it acts more like
netdev_for_each_lower_private_rcu and doesn't allow it. The difference
is where the "iter" points to, right now it points to the current element
and that makes it impossible to remove it. Change the logic to be
similar to netdev_for_each_lower_private and make it point to the "next"
element so we can safely delete the current one. VRF is the only such
user right now, there's no change for the read-only users.

Here's what can happen now:
[98423.249858] general protection fault: 0000 [#1] SMP
[98423.250175] Modules linked in: vrf bridge(O) stp llc nfsd auth_rpcgss
oid_registry nfs_acl nfs lockd grace sunrpc crct10dif_pclmul
crc32_pclmul crc32c_intel ghash_clmulni_intel jitterentropy_rng
sha256_generic hmac drbg ppdev aesni_intel aes_x86_64 glue_helper lrw
gf128mul ablk_helper cryptd evdev serio_raw pcspkr virtio_balloon
parport_pc parport i2c_piix4 i2c_core virtio_console acpi_cpufreq button
9pnet_virtio 9p 9pnet fscache ipv6 autofs4 ext4 crc16 mbcache jbd2 sg
virtio_blk virtio_net sr_mod cdrom e1000 ata_generic ehci_pci uhci_hcd
ehci_hcd usbcore usb_common virtio_pci ata_piix libata floppy
virtio_ring virtio scsi_mod [last unloaded: bridge]
[98423.255040] CPU: 1 PID: 14173 Comm: ip Tainted: G           O
4.5.0-rc2+ #81
[98423.255386] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS 1.8.1-20150318_183358- 04/01/2014
[98423.255777] task: ffff8800547f5540 ti: ffff88003428c000 task.ti:
ffff88003428c000
[98423.256123] RIP: 0010:[<ffffffff81514f3e>]  [<ffffffff81514f3e>]
netdev_lower_get_next+0x1e/0x30
[98423.256534] RSP: 0018:ffff88003428f940  EFLAGS: 00010207
[98423.256766] RAX: 0002000100000004 RBX: ffff880054ff9000 RCX:
0000000000000000
[98423.257039] RDX: ffff88003428f8b8 RSI: ffff88003428f950 RDI:
ffff880054ff90c0
[98423.257287] RBP: ffff88003428f940 R08: 0000000000000000 R09:
0000000000000000
[98423.257537] R10: 0000000000000001 R11: 0000000000000000 R12:
ffff88003428f9e0
[98423.257802] R13: ffff880054a5fd00 R14: ffff88003428f970 R15:
0000000000000001
[98423.258055] FS:  00007f3d76881700(0000) GS:ffff88005d000000(0000)
knlGS:0000000000000000
[98423.258418] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[98423.258650] CR2: 00007ffe5951ffa8 CR3: 0000000052077000 CR4:
00000000000406e0
[98423.258902] Stack:
[98423.259075]  ffff88003428f960 ffffffffa0442636 0002000100000004
ffff880054ff9000
[98423.259647]  ffff88003428f9b0 ffffffff81518205 ffff880054ff9000
ffff88003428f978
[98423.260208]  ffff88003428f978 ffff88003428f9e0 ffff88003428f9e0
ffff880035b35f00
[98423.260739] Call Trace:
[98423.260920]  [<ffffffffa0442636>] vrf_dev_uninit+0x76/0xa0 [vrf]
[98423.261156]  [<ffffffff81518205>]
rollback_registered_many+0x205/0x390
[98423.261401]  [<ffffffff815183ec>] unregister_netdevice_many+0x1c/0x70
[98423.261641]  [<ffffffff8153223c>] rtnl_delete_link+0x3c/0x50
[98423.271557]  [<ffffffff815335bb>] rtnl_dellink+0xcb/0x1d0
[98423.271800]  [<ffffffff811cd7da>] ? __inc_zone_state+0x4a/0x90
[98423.272049]  [<ffffffff815337b4>] rtnetlink_rcv_msg+0x84/0x200
[98423.272279]  [<ffffffff810cfe7d>] ? trace_hardirqs_on+0xd/0x10
[98423.272513]  [<ffffffff8153370b>] ? rtnetlink_rcv+0x1b/0x40
[98423.272755]  [<ffffffff81533730>] ? rtnetlink_rcv+0x40/0x40
[98423.272983]  [<ffffffff8155d6e7>] netlink_rcv_skb+0x97/0xb0
[98423.273209]  [<ffffffff8153371a>] rtnetlink_rcv+0x2a/0x40
[98423.273476]  [<ffffffff8155ce8b>] netlink_unicast+0x11b/0x1a0
[98423.273710]  [<ffffffff8155d2f1>] netlink_sendmsg+0x3e1/0x610
[98423.273947]  [<ffffffff814fbc98>] sock_sendmsg+0x38/0x70
[98423.274175]  [<ffffffff814fc253>] ___sys_sendmsg+0x2e3/0x2f0
[98423.274416]  [<ffffffff810d841e>] ? do_raw_spin_unlock+0xbe/0x140
[98423.274658]  [<ffffffff811e1bec>] ? handle_mm_fault+0x26c/0x2210
[98423.274894]  [<ffffffff811e19cd>] ? handle_mm_fault+0x4d/0x2210
[98423.275130]  [<ffffffff81269611>] ? __fget_light+0x91/0xb0
[98423.275365]  [<ffffffff814fcd42>] __sys_sendmsg+0x42/0x80
[98423.275595]  [<ffffffff814fcd92>] SyS_sendmsg+0x12/0x20
[98423.275827]  [<ffffffff81611bb6>] entry_SYSCALL_64_fastpath+0x16/0x7a
[98423.276073] Code: c3 31 c0 5d c3 0f 1f 84 00 00 00 00 00 66 66 66 66
90 48 8b 06 55 48 81 c7 c0 00 00 00 48 89 e5 48 8b 00 48 39 f8 74 09 48
89 06 <48> 8b 40 e8 5d c3 31 c0 5d c3 0f 1f 84 00 00 00 00 00 66 66 66
[98423.279639] RIP  [<ffffffff81514f3e>] netdev_lower_get_next+0x1e/0x30
[98423.279920]  RSP <ffff88003428f940>

CC: David Ahern <dsa@cumulusnetworks.com>
CC: David S. Miller <davem@davemloft.net>
CC: Roopa Prabhu <roopa@cumulusnetworks.com>
CC: Vlad Yasevich <vyasevic@redhat.com>
Fixes: bad531623253 ("vrf: remove slave queue and private slave struct")
Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Reviewed-by: David Ahern <dsa@cumulusnetworks.com>
Tested-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
1 parent f3bb237
Raw File
irq_work.c
/*
 * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra
 *
 * Provides a framework for enqueueing and running callbacks from hardirq
 * context. The enqueueing is NMI-safe.
 */

#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/irq_work.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/irqflags.h>
#include <linux/sched.h>
#include <linux/tick.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/smp.h>
#include <asm/processor.h>


static DEFINE_PER_CPU(struct llist_head, raised_list);
static DEFINE_PER_CPU(struct llist_head, lazy_list);

/*
 * Claim the entry so that no one else will poke at it.
 */
static bool irq_work_claim(struct irq_work *work)
{
	unsigned long flags, oflags, nflags;

	/*
	 * Start with our best wish as a premise but only trust any
	 * flag value after cmpxchg() result.
	 */
	flags = work->flags & ~IRQ_WORK_PENDING;
	for (;;) {
		nflags = flags | IRQ_WORK_FLAGS;
		oflags = cmpxchg(&work->flags, flags, nflags);
		if (oflags == flags)
			break;
		if (oflags & IRQ_WORK_PENDING)
			return false;
		flags = oflags;
		cpu_relax();
	}

	return true;
}

void __weak arch_irq_work_raise(void)
{
	/*
	 * Lame architectures will get the timer tick callback
	 */
}

#ifdef CONFIG_SMP
/*
 * Enqueue the irq_work @work on @cpu unless it's already pending
 * somewhere.
 *
 * Can be re-enqueued while the callback is still in progress.
 */
bool irq_work_queue_on(struct irq_work *work, int cpu)
{
	/* All work should have been flushed before going offline */
	WARN_ON_ONCE(cpu_is_offline(cpu));

	/* Arch remote IPI send/receive backend aren't NMI safe */
	WARN_ON_ONCE(in_nmi());

	/* Only queue if not already pending */
	if (!irq_work_claim(work))
		return false;

	if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
		arch_send_call_function_single_ipi(cpu);

	return true;
}
EXPORT_SYMBOL_GPL(irq_work_queue_on);
#endif

/* Enqueue the irq work @work on the current CPU */
bool irq_work_queue(struct irq_work *work)
{
	/* Only queue if not already pending */
	if (!irq_work_claim(work))
		return false;

	/* Queue the entry and raise the IPI if needed. */
	preempt_disable();

	/* If the work is "lazy", handle it from next tick if any */
	if (work->flags & IRQ_WORK_LAZY) {
		if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
		    tick_nohz_tick_stopped())
			arch_irq_work_raise();
	} else {
		if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
			arch_irq_work_raise();
	}

	preempt_enable();

	return true;
}
EXPORT_SYMBOL_GPL(irq_work_queue);

bool irq_work_needs_cpu(void)
{
	struct llist_head *raised, *lazy;

	raised = this_cpu_ptr(&raised_list);
	lazy = this_cpu_ptr(&lazy_list);

	if (llist_empty(raised) || arch_irq_work_has_interrupt())
		if (llist_empty(lazy))
			return false;

	/* All work should have been flushed before going offline */
	WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));

	return true;
}

static void irq_work_run_list(struct llist_head *list)
{
	unsigned long flags;
	struct irq_work *work;
	struct llist_node *llnode;

	BUG_ON(!irqs_disabled());

	if (llist_empty(list))
		return;

	llnode = llist_del_all(list);
	while (llnode != NULL) {
		work = llist_entry(llnode, struct irq_work, llnode);

		llnode = llist_next(llnode);

		/*
		 * Clear the PENDING bit, after this point the @work
		 * can be re-used.
		 * Make it immediately visible so that other CPUs trying
		 * to claim that work don't rely on us to handle their data
		 * while we are in the middle of the func.
		 */
		flags = work->flags & ~IRQ_WORK_PENDING;
		xchg(&work->flags, flags);

		work->func(work);
		/*
		 * Clear the BUSY bit and return to the free state if
		 * no-one else claimed it meanwhile.
		 */
		(void)cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
	}
}

/*
 * hotplug calls this through:
 *  hotplug_cfd() -> flush_smp_call_function_queue()
 */
void irq_work_run(void)
{
	irq_work_run_list(this_cpu_ptr(&raised_list));
	irq_work_run_list(this_cpu_ptr(&lazy_list));
}
EXPORT_SYMBOL_GPL(irq_work_run);

void irq_work_tick(void)
{
	struct llist_head *raised = this_cpu_ptr(&raised_list);

	if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
		irq_work_run_list(raised);
	irq_work_run_list(this_cpu_ptr(&lazy_list));
}

/*
 * Synchronize against the irq_work @entry, ensures the entry is not
 * currently in use.
 */
void irq_work_sync(struct irq_work *work)
{
	WARN_ON_ONCE(irqs_disabled());

	while (work->flags & IRQ_WORK_BUSY)
		cpu_relax();
}
EXPORT_SYMBOL_GPL(irq_work_sync);
back to top