/*
 * Retrieved from https://github.com/torvalds/linux (raw file: scif_mmap.c)
 * Tip revision: 9c763584b7c8911106bb77af7e648bef09af9d80 authored by
 * Linus Torvalds on 20 November 2016, 21:52:19 UTC (Linux 4.9-rc6)
 */
/*
 * Intel MIC Platform Software Stack (MPSS)
 *
 * Copyright(c) 2015 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2, as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * Intel SCIF driver.
 *
 */
#include "scif_main.h"

/*
 * struct scif_vma_info - Information about a remote memory mapping
 *			  created via scif_mmap(..)
 * @vma: VM area struct
 * @list: link to list of active vmas
 *
 * One instance is allocated per tracked VMA by scif_insert_vma() and linked
 * onto the endpoint's rma_info.vma_list under ep->lock. scif_delete_vma()
 * unlinks and frees the entry again. __scif_zap_mmaps() walks the list to
 * zap the page table entries of every tracked VMA.
 */
struct scif_vma_info {
	struct vm_area_struct *vma;
	struct list_head list;
};

void scif_recv_munmap(struct scif_dev *scifdev, struct scifmsg *msg)
{
	struct scif_rma_req req;
	struct scif_window *window = NULL;
	struct scif_window *recv_window =
		(struct scif_window *)msg->payload[0];
	struct scif_endpt *ep;

	ep = (struct scif_endpt *)recv_window->ep;
	req.out_window = &window;
	req.offset = recv_window->offset;
	req.prot = recv_window->prot;
	req.nr_bytes = recv_window->nr_pages << PAGE_SHIFT;
	req.type = SCIF_WINDOW_FULL;
	req.head = &ep->rma_info.reg_list;
	msg->payload[0] = ep->remote_ep;

	mutex_lock(&ep->rma_info.rma_lock);
	/* Does a valid window exist? */
	if (scif_query_window(&req)) {
		dev_err(&scifdev->sdev->dev,
			"%s %d -ENXIO\n", __func__, __LINE__);
		msg->uop = SCIF_UNREGISTER_ACK;
		goto error;
	}

	scif_put_window(window, window->nr_pages);

	if (!window->ref_count) {
		atomic_inc(&ep->rma_info.tw_refcount);
		ep->rma_info.async_list_del = 1;
		list_del_init(&window->list);
		scif_free_window_offset(ep, window, window->offset);
	}
error:
	mutex_unlock(&ep->rma_info.rma_lock);
	if (window && !window->ref_count)
		scif_queue_for_cleanup(window, &scif_info.rma);
}

/*
 * Remove valid remote memory mappings created via scif_mmap(..) from the
 * process address space since the remote node is lost
 */
static void __scif_zap_mmaps(struct scif_endpt *ep)
{
	struct list_head *item;
	struct scif_vma_info *info;
	struct vm_area_struct *vma;
	unsigned long size;

	spin_lock(&ep->lock);
	list_for_each(item, &ep->rma_info.vma_list) {
		info = list_entry(item, struct scif_vma_info, list);
		vma = info->vma;
		size = vma->vm_end - vma->vm_start;
		zap_vma_ptes(vma, vma->vm_start, size);
		dev_dbg(scif_info.mdev.this_device,
			"%s ep %p zap vma %p size 0x%lx\n",
			__func__, ep, info->vma, size);
	}
	spin_unlock(&ep->lock);
}

/*
 * Traverse the list of endpoints for a particular remote node and
 * zap valid remote memory mappings since the remote node is lost
 */
static void _scif_zap_mmaps(int node, struct list_head *head)
{
	struct scif_endpt *ep;
	struct list_head *item;

	mutex_lock(&scif_info.connlock);
	list_for_each(item, head) {
		ep = list_entry(item, struct scif_endpt, list);
		if (ep->remote_dev->node == node)
			__scif_zap_mmaps(ep);
	}
	mutex_unlock(&scif_info.connlock);
}

/*
 * Wrapper for removing remote memory mappings for a particular node. This API
 * is called by peer nodes as part of handling a lost node. Both the
 * connected and the disconnected endpoint lists are processed, in that order.
 */
void scif_zap_mmaps(int node)
{
	struct list_head *ep_lists[] = {
		&scif_info.connected,
		&scif_info.disconnected,
	};
	unsigned int i;

	for (i = 0; i < ARRAY_SIZE(ep_lists); i++)
		_scif_zap_mmaps(node, ep_lists[i]);
}

/*
 * This API is only called while handling a lost node:
 * a) Remote node is dead.
 * b) Remote memory mappings have been zapped
 * So we can traverse the remote_reg_list without any locks. Since
 * the window has not yet been unregistered we can drop the ref count
 * and queue it to the cleanup thread.
 */
static void __scif_cleanup_rma_for_zombies(struct scif_endpt *ep)
{
	struct list_head *pos, *tmp;
	struct scif_window *window;

	list_for_each_safe(pos, tmp, &ep->rma_info.remote_reg_list) {
		window = list_entry(pos, struct scif_window, list);
		if (window->ref_count)
			scif_put_window(window, window->nr_pages);
		else
			dev_err(scif_info.mdev.this_device,
				"%s %d unexpected\n",
				__func__, __LINE__);
		if (!window->ref_count) {
			atomic_inc(&ep->rma_info.tw_refcount);
			list_del_init(&window->list);
			scif_queue_for_cleanup(window, &scif_info.rma);
		}
	}
}

/* Cleanup remote registration lists for zombie endpoints */
void scif_cleanup_rma_for_zombies(int node)
{
	struct scif_endpt *ep;
	struct list_head *item;

	mutex_lock(&scif_info.eplock);
	list_for_each(item, &scif_info.zombie) {
		ep = list_entry(item, struct scif_endpt, list);
		if (ep->remote_dev && ep->remote_dev->node == node)
			__scif_cleanup_rma_for_zombies(ep);
	}
	mutex_unlock(&scif_info.eplock);
	flush_work(&scif_info.misc_work);
}

/*
 * Insert the VMA into the per endpoint VMA list.
 *
 * A tracking entry is allocated with GFP_KERNEL and appended to
 * ep->rma_info.vma_list under ep->lock.
 *
 * Return: 0 on success, -ENOMEM if the tracking entry cannot be allocated.
 */
static int scif_insert_vma(struct scif_endpt *ep, struct vm_area_struct *vma)
{
	struct scif_vma_info *entry = kzalloc(sizeof(*entry), GFP_KERNEL);

	if (!entry)
		return -ENOMEM;

	entry->vma = vma;

	spin_lock(&ep->lock);
	list_add_tail(&entry->list, &ep->rma_info.vma_list);
	spin_unlock(&ep->lock);

	return 0;
}

/* Delete the VMA from the per endpoint VMA list */
static void scif_delete_vma(struct scif_endpt *ep, struct vm_area_struct *vma)
{
	struct list_head *item;
	struct scif_vma_info *info;

	spin_lock(&ep->lock);
	list_for_each(item, &ep->rma_info.vma_list) {
		info = list_entry(item, struct scif_vma_info, list);
		if (info->vma == vma) {
			list_del(&info->list);
			kfree(info);
			break;
		}
	}
	spin_unlock(&ep->lock);
}

/*
 * Translate an address obtained for a remote window into the address that
 * is handed to mmap users.
 *
 * If the DMA address is card relative then the aperture base has to be
 * added for mmap to work correctly. That is done only when the remote
 * device is not this node itself, an aperture is present and card_rel_da
 * is set; in every other case @phys is returned unchanged.
 */
static phys_addr_t scif_get_phys(phys_addr_t phys, struct scif_endpt *ep)
{
	struct scif_dev *rdev = (struct scif_dev *)ep->remote_dev;
	struct scif_hw_dev *hw = rdev->sdev;
	phys_addr_t aper_base;

	if (scifdev_self(rdev) || !hw->aper || !hw->card_rel_da)
		return phys;

	aper_base = hw->aper->pa;
	return aper_base + phys;
}

/**
 * scif_get_pages - Describe the pages backing a range of a remote window
 * @epd: endpoint descriptor
 * @offset: start of the range; must be non-negative and page aligned
 * @len: length of the range in bytes; must be non-zero and page aligned
 * @pages: out parameter receiving a newly allocated struct scif_range
 *
 * The range must be covered by a single window on the endpoint's
 * remote_reg_list. On success *pages describes nr_pages = len >> PAGE_SHIFT
 * pages: phys_addr[] holds the per page address after scif_get_phys()
 * translation and, on the management node with a non-loopback peer, va[]
 * holds the matching aperture virtual address. The window's reference count
 * is raised by nr_pages and the window is stored as the range's cookie, which
 * scif_put_pages() later uses to drop the references again.
 *
 * Once the arguments have been validated *pages is set to NULL, so on any
 * later failure the caller sees NULL and nothing is left allocated.
 *
 * Return: 0 on success, the error reported by scif_verify_epd() or
 * scif_query_window(), -EINVAL for a malformed offset/len, or -ENOMEM when
 * an allocation fails.
 */
int scif_get_pages(scif_epd_t epd, off_t offset, size_t len,
		   struct scif_range **pages)
{
	struct scif_endpt *ep = (struct scif_endpt *)epd;
	struct scif_rma_req req;
	struct scif_window *window = NULL;
	int nr_pages, err, i;

	dev_dbg(scif_info.mdev.this_device,
		"SCIFAPI get_pinned_pages: ep %p offset 0x%lx len 0x%lx\n",
		ep, offset, len);
	err = scif_verify_epd(ep);
	if (err)
		return err;

	if (!len || (offset < 0) ||
	    (offset + len < offset) ||
	    (ALIGN(offset, PAGE_SIZE) != offset) ||
	    (ALIGN(len, PAGE_SIZE) != len))
		return -EINVAL;

	nr_pages = len >> PAGE_SHIFT;

	/*
	 * The common error path frees *pages whenever it is non-NULL. Clear
	 * it before the first point that can jump there, otherwise a failed
	 * window lookup would dereference and free whatever value the caller
	 * happened to leave in *pages.
	 */
	*pages = NULL;

	req.out_window = &window;
	req.offset = offset;
	req.prot = 0;
	req.nr_bytes = len;
	req.type = SCIF_WINDOW_SINGLE;
	req.head = &ep->rma_info.remote_reg_list;

	mutex_lock(&ep->rma_info.rma_lock);
	/* Does a valid window exist? */
	err = scif_query_window(&req);
	if (err) {
		dev_err(&ep->remote_dev->sdev->dev,
			"%s %d err %d\n", __func__, __LINE__, err);
		goto error;
	}

	/* Allocate scif_range */
	*pages = kzalloc(sizeof(**pages), GFP_KERNEL);
	if (!*pages) {
		err = -ENOMEM;
		goto error;
	}

	/* Allocate phys addr array */
	(*pages)->phys_addr = scif_zalloc(nr_pages * sizeof(dma_addr_t));
	if (!((*pages)->phys_addr)) {
		err = -ENOMEM;
		goto error;
	}

	if (scif_is_mgmt_node() && !scifdev_self(ep->remote_dev)) {
		/* Allocate virtual address array */
		(*pages)->va = scif_zalloc(nr_pages * sizeof(void *));
		if (!(*pages)->va) {
			err = -ENOMEM;
			goto error;
		}
	}
	/* Populate the values */
	(*pages)->cookie = window;
	(*pages)->nr_pages = nr_pages;
	(*pages)->prot_flags = window->prot;

	for (i = 0; i < nr_pages; i++) {
		(*pages)->phys_addr[i] =
			__scif_off_to_dma_addr(window, offset +
					       (i * PAGE_SIZE));
		(*pages)->phys_addr[i] = scif_get_phys((*pages)->phys_addr[i],
							ep);
		/* va[] exists only in the case it was allocated for above */
		if (scif_is_mgmt_node() && !scifdev_self(ep->remote_dev))
			(*pages)->va[i] =
				ep->remote_dev->sdev->aper->va +
				(*pages)->phys_addr[i] -
				ep->remote_dev->sdev->aper->pa;
	}

	/* Hold the window for as long as the caller keeps the range */
	scif_get_window(window, nr_pages);
error:
	mutex_unlock(&ep->rma_info.rma_lock);
	if (err) {
		if (*pages) {
			scif_free((*pages)->phys_addr,
				  nr_pages * sizeof(dma_addr_t));
			scif_free((*pages)->va,
				  nr_pages * sizeof(void *));
			kfree(*pages);
			*pages = NULL;
		}
		dev_err(&ep->remote_dev->sdev->dev,
			"%s %d err %d\n", __func__, __LINE__, err);
	}
	return err;
}
EXPORT_SYMBOL_GPL(scif_get_pages);

int scif_put_pages(struct scif_range *pages)
{
	struct scif_endpt *ep;
	struct scif_window *window;
	struct scifmsg msg;

	if (!pages || !pages->cookie)
		return -EINVAL;

	window = pages->cookie;

	if (!window || window->magic != SCIFEP_MAGIC)
		return -EINVAL;

	ep = (struct scif_endpt *)window->ep;
	/*
	 * If the state is SCIFEP_CONNECTED or SCIFEP_DISCONNECTED then the
	 * callee should be allowed to release references to the pages,
	 * else the endpoint was not connected in the first place,
	 * hence the ENOTCONN.
	 */
	if (ep->state != SCIFEP_CONNECTED && ep->state != SCIFEP_DISCONNECTED)
		return -ENOTCONN;

	mutex_lock(&ep->rma_info.rma_lock);

	scif_put_window(window, pages->nr_pages);

	/* Initiate window destruction if ref count is zero */
	if (!window->ref_count) {
		list_del(&window->list);
		mutex_unlock(&ep->rma_info.rma_lock);
		scif_drain_dma_intr(ep->remote_dev->sdev,
				    ep->rma_info.dma_chan);
		/* Inform the peer about this window being destroyed. */
		msg.uop = SCIF_MUNMAP;
		msg.src = ep->port;
		msg.payload[0] = window->peer_window;
		/* No error handling for notification messages */
		scif_nodeqp_send(ep->remote_dev, &msg);
		/* Destroy this window from the peer's registered AS */
		scif_destroy_remote_window(window);
	} else {
		mutex_unlock(&ep->rma_info.rma_lock);
	}

	scif_free(pages->phys_addr, pages->nr_pages * sizeof(dma_addr_t));
	scif_free(pages->va, pages->nr_pages * sizeof(void *));
	kfree(pages);
	return 0;
}
EXPORT_SYMBOL_GPL(scif_put_pages);

/*
 * scif_rma_list_mmap:
 *
 * Traverse the remote registration list starting from start_window:
 * 1) Create VtoP mappings via remap_pfn_range(..), one page at a time.
 * 2) Once step 1) has completed successfully for the whole range, traverse
 *    the same windows again and bump their reference counts.
 * If remap_pfn_range(..) fails, the error is logged and returned and no
 * reference count has been touched.
 * RMA lock must be held.
 */
static int scif_rma_list_mmap(struct scif_window *start_window, s64 offset,
			      int nr_pages, struct vm_area_struct *vma)
{
	struct scif_endpt *ep = (struct scif_endpt *)start_window->ep;
	struct list_head *head = &ep->rma_info.remote_reg_list;
	struct scif_window_iter iter;
	struct scif_window *window;
	size_t contig_bytes = 0;
	s64 cur_offset, window_end;
	int remaining, chunk, page, err;
	dma_addr_t dma;

	might_sleep();

	/* Pass 1: map every page of the requested range into the vma */
	cur_offset = offset;
	remaining = nr_pages;
	window = start_window;
	list_for_each_entry_from(window, head, list) {
		window_end = window->offset +
			(window->nr_pages << PAGE_SHIFT);
		/* Pages taken from this window: up to its end, or what is left */
		chunk = min_t(int,
			      (window_end - cur_offset) >> PAGE_SHIFT,
			      remaining);
		scif_init_window_iter(window, &iter);
		for (page = 0; page < chunk;
		     page++, cur_offset += PAGE_SIZE) {
			dma = scif_off_to_dma_addr(window, cur_offset,
						   &contig_bytes, &iter);
			dma = scif_get_phys(dma, ep);
			err = remap_pfn_range(vma,
					      vma->vm_start +
					      cur_offset - offset,
					      dma >> PAGE_SHIFT,
					      PAGE_SIZE,
					      vma->vm_page_prot);
			if (err) {
				dev_err(scif_info.mdev.this_device,
					"%s %d err %d\n",
					__func__, __LINE__, err);
				return err;
			}
		}
		remaining -= chunk;
		if (!remaining)
			break;
	}

	/*
	 * Pass 2: no more failures expected. Bump up the ref count for all
	 * the windows. A second traversal from start_window is used so that
	 * an error hit part way through pass 1 leaves every count untouched.
	 */
	cur_offset = offset;
	remaining = nr_pages;
	window = start_window;
	list_for_each_entry_from(window, head, list) {
		window_end = window->offset +
			(window->nr_pages << PAGE_SHIFT);
		chunk = min_t(int,
			      (window_end - cur_offset) >> PAGE_SHIFT,
			      remaining);
		scif_get_window(window, chunk);
		remaining -= chunk;
		cur_offset += (chunk << PAGE_SHIFT);
		if (!remaining)
			break;
	}

	return 0;
}

/*
 * scif_rma_list_munmap:
 *
 * Traverse the remote registration list starting from start_window:
 * 1) Decrement the ref count by the number of pages of the range that fall
 *    into each window.
 * 2) If the ref count drops to zero then drain DMA, send a SCIF_MUNMAP
 *    message to the peer, unlink the window and destroy it.
 * RMA lock must be held.
 */
static void scif_rma_list_munmap(struct scif_window *start_window,
				 s64 offset, int nr_pages)
{
	struct scif_endpt *ep = (struct scif_endpt *)start_window->ep;
	struct list_head *head = &ep->rma_info.remote_reg_list;
	struct scif_window *window = start_window, *next;
	s64 cur_offset = offset, window_end;
	int remaining = nr_pages, chunk;
	struct scifmsg msg;

	/* The same message header is reused for every window */
	msg.uop = SCIF_MUNMAP;
	msg.src = ep->port;

	list_for_each_entry_safe_from(window, next, head, list) {
		window_end = window->offset +
			(window->nr_pages << PAGE_SHIFT);
		chunk = min_t(int,
			      (window_end - cur_offset) >> PAGE_SHIFT,
			      remaining);
		scif_put_window(window, chunk);
		if (!window->ref_count) {
			scif_drain_dma_intr(ep->remote_dev->sdev,
					    ep->rma_info.dma_chan);
			/* Inform the peer about this munmap */
			msg.payload[0] = window->peer_window;
			/* No error handling for Notification messages. */
			scif_nodeqp_send(ep->remote_dev, &msg);
			list_del(&window->list);
			/* Destroy this window from the peer's registered AS */
			scif_destroy_remote_window(window);
		}
		remaining -= chunk;
		cur_offset += (chunk << PAGE_SHIFT);
		if (!remaining)
			break;
	}
}

/*
 * The private data field of each VMA used to mmap a remote window
 * points to an instance of struct vma_pvt.
 *
 * The instance is allocated zeroed in scif_mmap(), which also initializes
 * the reference count. scif_vma_open() takes an additional reference and
 * scif_munmap() drops one; vma_pvt_release() frees the instance when the
 * last reference goes away.
 *
 * NOTE(review): nothing in this file assigns @offset or @valid_offset, so
 * after kzalloc() they stay 0/false and scif_munmap() derives the offset
 * from vma->vm_pgoff - confirm whether they are set elsewhere.
 */
struct vma_pvt {
	struct scif_endpt *ep;	/* End point for remote window */
	s64 offset;		/* offset within remote window */
	bool valid_offset;	/* offset is valid only if the original
				 * mmap request was for a single page
				 * else the offset within the vma is
				 * the correct offset
				 */
	struct kref ref;	/* reference count, see vma_pvt_release() */
};

/* kref release callback: free the vma_pvt that embeds @ref */
static void vma_pvt_release(struct kref *ref)
{
	kfree(container_of(ref, struct vma_pvt, ref));
}

/**
 * scif_vma_open - VMA open driver callback
 * @vma: VMM memory area.
 * The open method is called by the kernel to allow the subsystem implementing
 * the VMA to initialize the area. This method is invoked any time a new
 * reference to the VMA is made (when a process forks, for example).
 * The one exception happens when the VMA is first created by mmap;
 * in this case, the driver's mmap method is called instead.
 * This function is also invoked when an existing VMA is split by the kernel
 * due to a call to munmap on a subset of the VMA resulting in two VMAs.
 * The kernel invokes this function only on one of the two VMAs.
 *
 * The VMA is added to the endpoint's VMA list and a reference is taken on
 * the shared vma_pvt. The callback returns void, so a failure to track the
 * VMA cannot be propagated; it is logged instead, because an untracked VMA
 * is not on the list that __scif_zap_mmaps() walks.
 */
static void scif_vma_open(struct vm_area_struct *vma)
{
	struct vma_pvt *vmapvt = vma->vm_private_data;
	int err;

	dev_dbg(scif_info.mdev.this_device,
		"SCIFAPI vma open: vma_start 0x%lx vma_end 0x%lx\n",
		vma->vm_start, vma->vm_end);
	err = scif_insert_vma(vmapvt->ep, vma);
	if (err)
		dev_err(scif_info.mdev.this_device,
			"%s %d err %d\n", __func__, __LINE__, err);
	/* The reference is taken regardless; scif_munmap() always drops one */
	kref_get(&vmapvt->ref);
}

/**
 * scif_munmap - VMA close driver callback.
 * @vma: VMM memory area.
 * When an area is destroyed, the kernel calls its close operation.
 * Note that there's no usage count associated with VMA's; the area
 * is opened and closed exactly once by each process that uses it.
 *
 * The windows covering the VMA are looked up on the endpoint's
 * remote_reg_list and their references are dropped under the RMA lock.
 * Afterwards the VMA is detached from the driver: vm_ops and the private
 * data are cleared, the vma_pvt reference is dropped and the VMA is removed
 * from the endpoint's VMA list.
 */
static void scif_munmap(struct vm_area_struct *vma)
{
	struct vma_pvt *vmapvt = vma->vm_private_data;
	struct scif_window *window = NULL;
	struct scif_rma_req req;
	struct scif_endpt *ep;
	int nr_pages = vma_pages(vma);
	s64 offset;
	int err;

	might_sleep();
	dev_dbg(scif_info.mdev.this_device,
		"SCIFAPI munmap: vma_start 0x%lx vma_end 0x%lx\n",
		vma->vm_start, vma->vm_end);

	ep = vmapvt->ep;
	/* Use the recorded offset when it is marked valid, else the vma's */
	if (vmapvt->valid_offset)
		offset = vmapvt->offset;
	else
		offset = (vma->vm_pgoff) << PAGE_SHIFT;
	dev_dbg(scif_info.mdev.this_device,
		"SCIFAPI munmap: ep %p nr_pages 0x%x offset 0x%llx\n",
		ep, nr_pages, offset);

	req.head = &ep->rma_info.remote_reg_list;
	req.type = SCIF_WINDOW_PARTIAL;
	req.offset = offset;
	req.nr_bytes = vma->vm_end - vma->vm_start;
	req.prot = vma->vm_flags & (VM_READ | VM_WRITE);
	req.out_window = &window;

	mutex_lock(&ep->rma_info.rma_lock);
	err = scif_query_window(&req);
	if (!err)
		scif_rma_list_munmap(window, offset, nr_pages);
	else
		dev_err(scif_info.mdev.this_device,
			"%s %d err %d\n", __func__, __LINE__, err);
	mutex_unlock(&ep->rma_info.rma_lock);

	/*
	 * The kernel probably zeroes these out but we still want
	 * to clean up our own mess just in case.
	 */
	vma->vm_ops = NULL;
	vma->vm_private_data = NULL;
	kref_put(&vmapvt->ref, vma_pvt_release);
	scif_delete_vma(ep, vma);
}

/*
 * VMA callbacks installed by scif_mmap() on every VMA it maps:
 * scif_vma_open() runs as the open operation and scif_munmap() as the
 * close operation.
 */
static const struct vm_operations_struct scif_vm_ops = {
	.open = scif_vma_open,
	.close = scif_munmap,
};

/**
 * scif_mmap - Map pages in virtual address space to a remote window.
 * @vma: VMM memory area.
 * @epd: endpoint descriptor
 *
 * The VMA is first added to the endpoint's VMA list and a vma_pvt is
 * allocated for it. The windows covering the requested range are then looked
 * up on the endpoint's remote_reg_list and mapped under the RMA lock. On
 * success the driver's VMA callbacks and private data are installed; on
 * failure the vma_pvt is freed and the VMA is removed from the list again.
 *
 * Return: Upon successful completion, scif_mmap() returns zero
 * else an apt error is returned as documented in scif.h
 */
int scif_mmap(struct vm_area_struct *vma, scif_epd_t epd)
{
	struct scif_endpt *ep = (struct scif_endpt *)epd;
	s64 start_offset = vma->vm_pgoff << PAGE_SHIFT;
	int nr_pages = vma_pages(vma);
	struct scif_window *window = NULL;
	struct scif_rma_req req;
	struct vma_pvt *vmapvt;
	int err;

	dev_dbg(scif_info.mdev.this_device,
		"SCIFAPI mmap: ep %p start_offset 0x%llx nr_pages 0x%x\n",
		ep, start_offset, nr_pages);
	err = scif_verify_epd(ep);
	if (err)
		return err;

	might_sleep();

	err = scif_insert_vma(ep, vma);
	if (err)
		return err;

	vmapvt = kzalloc(sizeof(*vmapvt), GFP_KERNEL);
	if (!vmapvt) {
		err = -ENOMEM;
		goto remove_vma;
	}
	vmapvt->ep = ep;
	kref_init(&vmapvt->ref);

	req.head = &ep->rma_info.remote_reg_list;
	req.type = SCIF_WINDOW_PARTIAL;
	req.offset = start_offset;
	req.nr_bytes = vma->vm_end - vma->vm_start;
	req.prot = vma->vm_flags & (VM_READ | VM_WRITE);
	req.out_window = &window;

	mutex_lock(&ep->rma_info.rma_lock);
	/* Does a valid window exist? */
	err = scif_query_window(&req);
	if (err) {
		dev_err(&ep->remote_dev->sdev->dev,
			"%s %d err %d\n", __func__, __LINE__, err);
		goto unlock_fail;
	}

	/* The default page protection is kept for loopback */
	if (!scifdev_self(ep->remote_dev))
		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);

	/*
	 * VM_DONTCOPY - Do not copy this vma on fork
	 * VM_DONTEXPAND - Cannot expand with mremap()
	 * VM_DONTDUMP - Do not include in the core dump
	 * VM_PFNMAP - Page-ranges managed without "struct page"
	 * VM_IO - Memory mapped I/O or similar
	 *
	 * We do not want to copy this VMA automatically on a fork() or
	 * expand it due to mremap(), since the VMA is actually backed by
	 * physical pages in the remote node's physical memory and not via
	 * a struct page. VM_IO and VM_PFNMAP are set only when the peer is
	 * not this node itself.
	 */
	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;

	if (!scifdev_self(ep->remote_dev))
		vma->vm_flags |= VM_IO | VM_PFNMAP;

	/* Map this range of windows */
	err = scif_rma_list_mmap(window, start_offset, nr_pages, vma);
	if (err) {
		dev_err(&ep->remote_dev->sdev->dev,
			"%s %d err %d\n", __func__, __LINE__, err);
		goto unlock_fail;
	}

	/* Set up the driver call back */
	vma->vm_ops = &scif_vm_ops;
	vma->vm_private_data = vmapvt;
	mutex_unlock(&ep->rma_info.rma_lock);
	return 0;

unlock_fail:
	mutex_unlock(&ep->rma_info.rma_lock);
	kfree(vmapvt);
	dev_err(&ep->remote_dev->sdev->dev,
		"%s %d err %d\n", __func__, __LINE__, err);
remove_vma:
	scif_delete_vma(ep, vma);
	return err;
}
/* back to top */