Revision 88e8ac11d2ea3acc003cf01bb5a38c8aa76c3cfd authored by Charan Teja Reddy on 21 August 2020, 00:42:27 UTC, committed by Linus Torvalds on 21 August 2020, 16:52:53 UTC
The following race is observed with the repeated online, offline and a delay between two successive online of memory blocks of movable zone. P1 P2 Online the first memory block in the movable zone. The pcp struct values are initialized to default values,i.e., pcp->high = 0 & pcp->batch = 1. Allocate the pages from the movable zone. Try to Online the second memory block in the movable zone thus it entered the online_pages() but yet to call zone_pcp_update(). This process is entered into the exit path thus it tries to release the order-0 pages to pcp lists through free_unref_page_commit(). As pcp->high = 0, pcp->count = 1 proceed to call the function free_pcppages_bulk(). Update the pcp values thus the new pcp values are like, say, pcp->high = 378, pcp->batch = 63. Read the pcp's batch value using READ_ONCE() and pass the same to free_pcppages_bulk(), pcp values passed here are, batch = 63, count = 1. Since num of pages in the pcp lists are less than ->batch, then it will stuck in while(list_empty(list)) loop with interrupts disabled thus a core hung. Avoid this by ensuring free_pcppages_bulk() is called with proper count of pcp list pages. The mentioned race is some what easily reproducible without [1] because pcp's are not updated for the first memory block online and thus there is a enough race window for P2 between alloc+free and pcp struct values update through onlining of second memory block. With [1], the race still exists but it is very narrow as we update the pcp struct values for the first memory block online itself. This is not limited to the movable zone, it could also happen in cases with the normal zone (e.g., hotplug to a node that only has DMA memory, or no other memory yet). [1]: https://patchwork.kernel.org/patch/11696389/ Fixes: 5f8dcc21211a ("page-allocator: split per-cpu list into one-list-per-migrate-type") Signed-off-by: Charan Teja Reddy <charante@codeaurora.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Acked-by: David Hildenbrand <david@redhat.com> Acked-by: David Rientjes <rientjes@google.com> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Michal Hocko <mhocko@suse.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Vinayak Menon <vinmenon@codeaurora.org> Cc: <stable@vger.kernel.org> [2.6+] Link: http://lkml.kernel.org/r/1597150703-19003-1-git-send-email-charante@codeaurora.org Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent e08d3fd
pm.c
// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
*
* Copyright (c) 2019, Intel Corporation.
*/
#define pr_fmt(fmt) "MPTCP: " fmt
#include <linux/kernel.h>
#include <net/tcp.h>
#include <net/mptcp.h>
#include "protocol.h"
/* path manager command handlers */
int mptcp_pm_announce_addr(struct mptcp_sock *msk,
const struct mptcp_addr_info *addr)
{
pr_debug("msk=%p, local_id=%d", msk, addr->id);
msk->pm.local = *addr;
WRITE_ONCE(msk->pm.addr_signal, true);
return 0;
}
int mptcp_pm_remove_addr(struct mptcp_sock *msk, u8 local_id)
{
return -ENOTSUPP;
}
int mptcp_pm_remove_subflow(struct mptcp_sock *msk, u8 remote_id)
{
return -ENOTSUPP;
}
/* path manager event handlers */
void mptcp_pm_new_connection(struct mptcp_sock *msk, int server_side)
{
struct mptcp_pm_data *pm = &msk->pm;
pr_debug("msk=%p, token=%u side=%d", msk, msk->token, server_side);
WRITE_ONCE(pm->server_side, server_side);
}
bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk)
{
struct mptcp_pm_data *pm = &msk->pm;
int ret;
pr_debug("msk=%p subflows=%d max=%d allow=%d", msk, pm->subflows,
pm->subflows_max, READ_ONCE(pm->accept_subflow));
/* try to avoid acquiring the lock below */
if (!READ_ONCE(pm->accept_subflow))
return false;
spin_lock_bh(&pm->lock);
ret = pm->subflows < pm->subflows_max;
if (ret && ++pm->subflows == pm->subflows_max)
WRITE_ONCE(pm->accept_subflow, false);
spin_unlock_bh(&pm->lock);
return ret;
}
/* return true if the new status bit is currently cleared, that is, this event
* can be server, eventually by an already scheduled work
*/
static bool mptcp_pm_schedule_work(struct mptcp_sock *msk,
enum mptcp_pm_status new_status)
{
pr_debug("msk=%p status=%x new=%lx", msk, msk->pm.status,
BIT(new_status));
if (msk->pm.status & BIT(new_status))
return false;
msk->pm.status |= BIT(new_status);
if (schedule_work(&msk->work))
sock_hold((struct sock *)msk);
return true;
}
void mptcp_pm_fully_established(struct mptcp_sock *msk)
{
struct mptcp_pm_data *pm = &msk->pm;
pr_debug("msk=%p", msk);
/* try to avoid acquiring the lock below */
if (!READ_ONCE(pm->work_pending))
return;
spin_lock_bh(&pm->lock);
if (READ_ONCE(pm->work_pending))
mptcp_pm_schedule_work(msk, MPTCP_PM_ESTABLISHED);
spin_unlock_bh(&pm->lock);
}
void mptcp_pm_connection_closed(struct mptcp_sock *msk)
{
pr_debug("msk=%p", msk);
}
void mptcp_pm_subflow_established(struct mptcp_sock *msk,
struct mptcp_subflow_context *subflow)
{
struct mptcp_pm_data *pm = &msk->pm;
pr_debug("msk=%p", msk);
if (!READ_ONCE(pm->work_pending))
return;
spin_lock_bh(&pm->lock);
if (READ_ONCE(pm->work_pending))
mptcp_pm_schedule_work(msk, MPTCP_PM_SUBFLOW_ESTABLISHED);
spin_unlock_bh(&pm->lock);
}
void mptcp_pm_subflow_closed(struct mptcp_sock *msk, u8 id)
{
pr_debug("msk=%p", msk);
}
void mptcp_pm_add_addr_received(struct mptcp_sock *msk,
const struct mptcp_addr_info *addr)
{
struct mptcp_pm_data *pm = &msk->pm;
pr_debug("msk=%p remote_id=%d accept=%d", msk, addr->id,
READ_ONCE(pm->accept_addr));
/* avoid acquiring the lock if there is no room for fouther addresses */
if (!READ_ONCE(pm->accept_addr))
return;
spin_lock_bh(&pm->lock);
/* be sure there is something to signal re-checking under PM lock */
if (READ_ONCE(pm->accept_addr) &&
mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED))
pm->remote = *addr;
spin_unlock_bh(&pm->lock);
}
/* path manager helpers */
bool mptcp_pm_addr_signal(struct mptcp_sock *msk, unsigned int remaining,
struct mptcp_addr_info *saddr)
{
int ret = false;
spin_lock_bh(&msk->pm.lock);
/* double check after the lock is acquired */
if (!mptcp_pm_should_signal(msk))
goto out_unlock;
if (remaining < mptcp_add_addr_len(msk->pm.local.family))
goto out_unlock;
*saddr = msk->pm.local;
WRITE_ONCE(msk->pm.addr_signal, false);
ret = true;
out_unlock:
spin_unlock_bh(&msk->pm.lock);
return ret;
}
int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc)
{
return mptcp_pm_nl_get_local_id(msk, skc);
}
void mptcp_pm_data_init(struct mptcp_sock *msk)
{
msk->pm.add_addr_signaled = 0;
msk->pm.add_addr_accepted = 0;
msk->pm.local_addr_used = 0;
msk->pm.subflows = 0;
WRITE_ONCE(msk->pm.work_pending, false);
WRITE_ONCE(msk->pm.addr_signal, false);
WRITE_ONCE(msk->pm.accept_addr, false);
WRITE_ONCE(msk->pm.accept_subflow, false);
msk->pm.status = 0;
spin_lock_init(&msk->pm.lock);
mptcp_pm_nl_data_init(msk);
}
void __init mptcp_pm_init(void)
{
mptcp_pm_nl_init();
}
Computing file changes ...