https://github.com/torvalds/linux
Revision 21b5944350052d2583e82dd59b19a9ba94a007f0 authored by Eric W. Biederman on 19 December 2017, 17:27:56 UTC, committed by David S. Miller on 20 December 2017, 17:42:22 UTC
(I can trivially verify that that idr_remove in cleanup_net happens
 after the network namespace count has dropped to zero --EWB)

Function get_net_ns_by_id() does not check for net::count
after it has found a peer in netns_ids idr.

It may dereference a peer after its count has already been
decremented for the final time. This leads to a double free
and memory corruption:

put_net(peer)                                   rtnl_lock()
atomic_dec_and_test(&peer->count) [count=0]     ...
__put_net(peer)                                 get_net_ns_by_id(net, id)
  spin_lock(&cleanup_list_lock)
  list_add(&net->cleanup_list, &cleanup_list)
  spin_unlock(&cleanup_list_lock)
queue_work()                                      peer = idr_find(&net->netns_ids, id)
  |                                               get_net(peer) [count=1]
  |                                               ...
  |                                               (use after final put)
  v                                               ...
  cleanup_net()                                   ...
    spin_lock(&cleanup_list_lock)                 ...
    list_replace_init(&cleanup_list, ..)          ...
    spin_unlock(&cleanup_list_lock)               ...
    ...                                           ...
    ...                                           put_net(peer)
    ...                                             atomic_dec_and_test(&peer->count) [count=0]
    ...                                               spin_lock(&cleanup_list_lock)
    ...                                               list_add(&net->cleanup_list, &cleanup_list)
    ...                                               spin_unlock(&cleanup_list_lock)
    ...                                             queue_work()
    ...                                           rtnl_unlock()
    rtnl_lock()                                   ...
    for_each_net(tmp) {                           ...
      id = __peernet2id(tmp, peer)                ...
      spin_lock_irq(&tmp->nsid_lock)              ...
      idr_remove(&tmp->netns_ids, id)             ...
      ...                                         ...
      net_drop_ns()                               ...
	net_free(peer)                            ...
    }                                             ...
  |
  v
  cleanup_net()
    ...
    (Second free of peer)

Also, put_net() on the right CPU may be reordered with
list_replace_init(&cleanup_list, ..) on the left CPU, and then
cleanup_list will be corrupted.

Since cleanup_net() is executed in a worker thread, while
put_net(peer) can happen anywhere, there should be
enough time for a concurrent get_net_ns_by_id() to pick
the peer up, so the race does not seem unlikely.
The patch fixes the problem in the standard way.

(Also, there is a possible problem in peernet2id_alloc(), which requires
a check for net::count under nsid_lock and maybe_get_net(peer), but
in the current stable kernel it's used under rtnl_lock() and it has to be
safe. Openvswitch has begun to use peernet2id_alloc(), and possibly it should
be fixed too. This is not in the stable kernel yet, so I'll send
a separate message to netdev@ later).

Cc: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Fixes: 0c7aecd4bde4 "netns: add rtnl cmd to add and get peer netns ids"
Reviewed-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Reviewed-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
1 parent eda9873
Raw File
Tip revision: 21b5944350052d2583e82dd59b19a9ba94a007f0 authored by Eric W. Biederman on 19 December 2017, 17:27:56 UTC
net: Fix double free and memory corruption in get_net_ns_by_id()
Tip revision: 21b5944
blk-mq-sched.h
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BLK_MQ_SCHED_H
#define BLK_MQ_SCHED_H

#include "blk-mq.h"
#include "blk-mq-tag.h"

/*
 * Out-of-line blk-mq scheduler entry points. Only the prototypes live in
 * this header; the definitions are elsewhere, so the notes below describe
 * the signatures and are otherwise inferred from names only.
 */

/* @exit is a caller-supplied callback invoked with a hardware context. */
void blk_mq_sched_free_hctx_data(struct request_queue *q,
				 void (*exit)(struct blk_mq_hw_ctx *));

void blk_mq_sched_assign_ioc(struct request *rq, struct bio *bio);

/*
 * Insertion/merge helpers. NOTE(review): the bool returns presumably mean
 * "a merge happened" -- confirm against the definitions.
 * __blk_mq_sched_bio_merge() is the unchecked helper wrapped by the inline
 * blk_mq_sched_bio_merge() in this header.
 */
void blk_mq_sched_request_inserted(struct request *rq);
bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
				struct request **merged_request);
bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio);
bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq);
void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);

/* Single-request and list-of-requests insertion variants. */
void blk_mq_sched_insert_request(struct request *rq, bool at_head,
				 bool run_queue, bool async, bool can_block);
void blk_mq_sched_insert_requests(struct request_queue *q,
				  struct blk_mq_ctx *ctx,
				  struct list_head *list, bool run_queue_async);

void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);

/*
 * Scheduler setup/teardown for a queue and for individual hardware
 * contexts. NOTE(review): the int returns are presumably 0 or a negative
 * errno, per kernel convention -- confirm.
 */
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e);
void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e);

int blk_mq_sched_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
			   unsigned int hctx_idx);
void blk_mq_sched_exit_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
			    unsigned int hctx_idx);

int blk_mq_sched_init(struct request_queue *q);

/*
 * Attempt to merge @bio on queue @q.
 *
 * Returns false without calling __blk_mq_sched_bio_merge() when
 * blk_queue_nomerges(q) is set or bio_mergeable(bio) is false; otherwise
 * returns the result of __blk_mq_sched_bio_merge(q, bio).
 */
static inline bool
blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
{
	/* Short-circuit order matches the checks: queue flag, bio, then merge. */
	return !blk_queue_nomerges(q) && bio_mergeable(bio) &&
	       __blk_mq_sched_bio_merge(q, bio);
}

/*
 * Ask the queue's elevator whether @bio may be merged into @rq.
 *
 * Returns true when @q has no elevator or the elevator type provides no
 * ops.mq.allow_merge hook; otherwise returns the hook's result.
 */
static inline bool
blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq,
			 struct bio *bio)
{
	struct elevator_queue *eq = q->elevator;

	/* No elevator, or no hook: merging is allowed by default. */
	if (!eq || !eq->type->ops.mq.allow_merge)
		return true;

	return eq->type->ops.mq.allow_merge(q, rq, bio);
}

/*
 * Invoke the elevator's ops.mq.completed_request hook for @rq, if the
 * request's queue has an elevator and that elevator type sets the hook.
 * Does nothing otherwise.
 */
static inline void blk_mq_sched_completed_request(struct request *rq)
{
	struct elevator_queue *eq = rq->q->elevator;

	if (!eq || !eq->type->ops.mq.completed_request)
		return;

	eq->type->ops.mq.completed_request(rq);
}

/*
 * Invoke the elevator's ops.mq.started_request hook for @rq, if the
 * request's queue has an elevator and that elevator type sets the hook.
 * Does nothing otherwise.
 */
static inline void blk_mq_sched_started_request(struct request *rq)
{
	struct elevator_queue *eq = rq->q->elevator;

	if (!eq || !eq->type->ops.mq.started_request)
		return;

	eq->type->ops.mq.started_request(rq);
}

/*
 * Invoke the elevator's ops.mq.requeue_request hook for @rq, if the
 * request's queue has an elevator and that elevator type sets the hook.
 * Does nothing otherwise.
 */
static inline void blk_mq_sched_requeue_request(struct request *rq)
{
	struct elevator_queue *eq = rq->q->elevator;

	if (!eq || !eq->type->ops.mq.requeue_request)
		return;

	eq->type->ops.mq.requeue_request(rq);
}

/*
 * Query the elevator's ops.mq.has_work hook for @hctx.
 *
 * Returns false when the hardware context's queue has no elevator or the
 * elevator type provides no has_work hook; otherwise returns the hook's
 * result.
 */
static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx)
{
	struct elevator_queue *eq = hctx->queue->elevator;

	if (!eq || !eq->type->ops.mq.has_work)
		return false;

	return eq->type->ops.mq.has_work(hctx);
}

/*
 * Report whether the BLK_MQ_S_SCHED_RESTART bit is set in @hctx->state.
 */
static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx)
{
	if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
		return true;

	return false;
}

#endif
back to top