Revision a0fc7fafeee3a6678887d8c8e86c47def4572526 authored by Alexander Root on 04 February 2021, 01:13:53 UTC, committed by GitHub on 04 February 2021, 01:13:53 UTC
* add fixes to overflow analysis in bounds inference

Co-authored-by: Steven Johnson <srj@google.com>
1 parent dadbcbf
cuda.cpp
#include "HalideRuntimeCuda.h"
#include "device_buffer_utils.h"
#include "device_interface.h"
#include "gpu_context_common.h"
#include "mini_cuda.h"
#include "printer.h"
#include "scoped_mutex_lock.h"
#include "scoped_spin_lock.h"
namespace Halide {
namespace Runtime {
namespace Internal {
namespace Cuda {
// Define the function pointers for the CUDA API.
#define CUDA_FN(ret, fn, args) WEAK ret(CUDAAPI *fn) args;
#define CUDA_FN_OPTIONAL(ret, fn, args) WEAK ret(CUDAAPI *fn) args;
#define CUDA_FN_3020(ret, fn, fn_3020, args) WEAK ret(CUDAAPI *fn) args;
#define CUDA_FN_4000(ret, fn, fn_4000, args) WEAK ret(CUDAAPI *fn) args;
#include "cuda_functions.h"
#undef CUDA_FN
#undef CUDA_FN_OPTIONAL
#undef CUDA_FN_3020
#undef CUDA_FN_4000
// The default implementation of halide_cuda_get_symbol attempts to load
// the CUDA shared library/DLL, and then get the symbol from it.
WEAK void *lib_cuda = nullptr;
volatile ScopedSpinLock::AtomicFlag WEAK lib_cuda_lock = 0;
extern "C" WEAK void *halide_cuda_get_symbol(void *user_context, const char *name) {
// Only try to load the library if the symbol isn't already available.
// Even if lib_cuda is nullptr, halide_get_library_symbol may find the
// symbol already linked into the process.
void *symbol = halide_get_library_symbol(lib_cuda, name);
if (symbol) {
return symbol;
}
const char *lib_names[] = {
#ifdef WINDOWS
"nvcuda.dll",
#else
"libcuda.so",
"libcuda.dylib",
"/Library/Frameworks/CUDA.framework/CUDA",
#endif
};
for (size_t i = 0; i < sizeof(lib_names) / sizeof(lib_names[0]); i++) {
lib_cuda = halide_load_library(lib_names[i]);
if (lib_cuda) {
debug(user_context) << " Loaded CUDA runtime library: " << lib_names[i] << "\n";
break;
}
}
return halide_get_library_symbol(lib_cuda, name);
}
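// An illustrative sketch, guarded out of the build, of how a client might
// override the WEAK halide_cuda_get_symbol above -- for example to resolve
// driver entry points without loading libcuda dynamically. The helper
// my_static_cuda_lookup is hypothetical.
#if 0
extern "C" void *my_static_cuda_lookup(const char *name);  // hypothetical resolver

extern "C" void *halide_cuda_get_symbol(void *user_context, const char *name) {
    // Resolve CUDA driver entry points from a client-provided table
    // instead of dlopen'ing the shared library.
    return my_static_cuda_lookup(name);
}
#endif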
template<typename T>
ALWAYS_INLINE T get_cuda_symbol(void *user_context, const char *name, bool optional = false) {
T s = (T)halide_cuda_get_symbol(user_context, name);
if (!optional && !s) {
error(user_context) << "CUDA API not found: " << name << "\n";
}
return s;
}
// Load a CUDA shared object/dll and get the CUDA API function pointers from it.
WEAK void load_libcuda(void *user_context) {
debug(user_context) << " load_libcuda (user_context: " << user_context << ")\n";
halide_assert(user_context, cuInit == nullptr);
#define CUDA_FN(ret, fn, args) fn = get_cuda_symbol<ret(CUDAAPI *) args>(user_context, #fn);
#define CUDA_FN_OPTIONAL(ret, fn, args) fn = get_cuda_symbol<ret(CUDAAPI *) args>(user_context, #fn, true);
#define CUDA_FN_3020(ret, fn, fn_3020, args) fn = get_cuda_symbol<ret(CUDAAPI *) args>(user_context, #fn_3020);
#define CUDA_FN_4000(ret, fn, fn_4000, args) fn = get_cuda_symbol<ret(CUDAAPI *) args>(user_context, #fn_4000);
#include "cuda_functions.h"
#undef CUDA_FN
#undef CUDA_FN_OPTIONAL
#undef CUDA_FN_3020
#undef CUDA_FN_4000
}
// Call load_libcuda() if the CUDA library has not yet been loaded.
// This function is thread-safe.
// Note that initialization might fail. The caller can detect such failure by checking whether cuInit is nullptr.
WEAK void ensure_libcuda_init(void *user_context) {
ScopedSpinLock spinlock(&lib_cuda_lock);
if (!cuInit) {
load_libcuda(user_context);
}
}
extern WEAK halide_device_interface_t cuda_device_interface;
WEAK const char *get_error_name(CUresult error);
WEAK CUresult create_cuda_context(void *user_context, CUcontext *ctx);
// A cuda context defined in this module with weak linkage
CUcontext WEAK context = nullptr;
// This lock protects the above context variable.
WEAK halide_mutex context_lock;
// A free list, used when allocations are being cached.
WEAK struct FreeListItem {
CUdeviceptr ptr;
CUcontext ctx;
CUstream stream;
size_t size;
FreeListItem *next;
} *free_list = nullptr;
WEAK halide_mutex free_list_lock;
} // namespace Cuda
} // namespace Internal
} // namespace Runtime
} // namespace Halide
using namespace Halide::Runtime::Internal;
using namespace Halide::Runtime::Internal::Cuda;
extern "C" {
// The default implementation of halide_cuda_acquire_context uses the global
// pointers above, and serializes access with a spin lock.
// Overriding implementations of acquire/release must implement the following
// behavior:
// - halide_cuda_acquire_context should always store a valid context/command
// queue in ctx/q, or return an error code.
// - A call to halide_cuda_acquire_context is followed by a matching call to
// halide_cuda_release_context. halide_cuda_acquire_context should block while a
// previous call (if any) has not yet been released via halide_cuda_release_context.
WEAK int halide_cuda_acquire_context(void *user_context, CUcontext *ctx, bool create = true) {
// TODO: Should we use a more "assertive" assert? These asserts do
// not block execution on failure.
halide_assert(user_context, ctx != nullptr);
// If the context has not been initialized, initialize it now.
halide_assert(user_context, &context != nullptr);
// Note that this null-check of the context is *not* locked with
// respect to device_release, so we may get a non-null context
// that's in the process of being destroyed. Things will go badly
// in general if you call device_release while other Halide code
// is running though.
CUcontext local_val = context;
if (local_val == nullptr) {
if (!create) {
*ctx = nullptr;
return 0;
}
{
ScopedMutexLock lock(&context_lock);
local_val = context;
if (local_val == nullptr) {
CUresult error = create_cuda_context(user_context, &local_val);
if (error != CUDA_SUCCESS) {
return error;
}
}
// Normally in double-checked locking you need a release
// fence here that synchronizes with an acquire fence
// above to ensure context is fully constructed before
// assigning to the global, but there's no way that
// create_cuda_context can access the "context" global, so
// we should be OK just storing to it here.
context = local_val;
} // lock
}
*ctx = local_val;
return 0;
}
WEAK int halide_cuda_release_context(void *user_context) {
return 0;
}
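// An illustrative sketch, guarded out of the build, of an override pair that
// satisfies the contract documented above halide_cuda_acquire_context:
// acquire stores a valid context or returns an error, and blocks until any
// previous acquisition has been released. The globals my_app_context and
// my_context_guard are hypothetical and owned by the client application.
#if 0
static CUcontext my_app_context = nullptr;  // created by the app at startup
static halide_mutex my_context_guard = {};  // serializes acquire/release

extern "C" int halide_cuda_acquire_context(void *user_context, CUcontext *ctx, bool create) {
    halide_mutex_lock(&my_context_guard);  // held until release_context runs
    if (my_app_context == nullptr) {
        halide_mutex_unlock(&my_context_guard);
        *ctx = nullptr;
        // With create == true the contract requires a context or an error.
        return create ? halide_error_code_generic_error : 0;
    }
    *ctx = my_app_context;
    return 0;
}

extern "C" int halide_cuda_release_context(void *user_context) {
    halide_mutex_unlock(&my_context_guard);
    return 0;
}
#endif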
// Return the stream to use for executing kernels and synchronization. Only called
// for versions of cuda which support streams. Default is to use the main stream
// for the context (nullptr stream). The context is passed in for convenience, but
// any scoping must be handled by the halide_cuda_acquire_context/
// halide_cuda_release_context pair, not by this call.
WEAK int halide_cuda_get_stream(void *user_context, CUcontext ctx, CUstream *stream) {
// There are two default streams we could use. Stream 0 (the legacy
// default stream) is fully synchronous. CU_STREAM_PER_THREAD (handle
// value 2) gives a separate non-blocking stream per thread.
*stream = nullptr;
return 0;
}
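// An illustrative sketch, guarded out of the build: because
// halide_cuda_get_stream is WEAK, a client can route Halide's kernels and
// copies onto its own stream, e.g. one created with
// cuStreamCreate(&s, CU_STREAM_NON_BLOCKING). The global my_app_stream is
// hypothetical and must outlive all Halide calls that use it.
#if 0
static CUstream my_app_stream = nullptr;  // created once by the application

extern "C" int halide_cuda_get_stream(void *user_context, CUcontext ctx, CUstream *stream) {
    *stream = my_app_stream;  // all Halide GPU work then queues here
    return 0;
}
#endif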
} // extern "C"
namespace Halide {
namespace Runtime {
namespace Internal {
namespace Cuda {
// Helper object to acquire and release the cuda context.
class Context {
void *user_context;
public:
CUcontext context;
int error;
// The constructor sets 'error' if one occurs.
ALWAYS_INLINE Context(void *user_context)
: user_context(user_context),
context(nullptr),
error(CUDA_SUCCESS) {
#ifdef DEBUG_RUNTIME
halide_start_clock(user_context);
#endif
error = halide_cuda_acquire_context(user_context, &context);
if (error != 0) {
return;
}
// The default acquire_context loads libcuda as a
// side-effect. However, if acquire_context has been
// overridden, we may still need to load libcuda.
ensure_libcuda_init(user_context);
halide_assert(user_context, context != nullptr);
halide_assert(user_context, cuInit != nullptr);
error = cuCtxPushCurrent(context);
}
ALWAYS_INLINE ~Context() {
if (error == 0) {
CUcontext old;
cuCtxPopCurrent(&old);
}
halide_cuda_release_context(user_context);
}
};
WEAK Halide::Internal::GPUCompilationCache<CUcontext, CUmodule> compilation_cache;
WEAK CUresult create_cuda_context(void *user_context, CUcontext *ctx) {
// Initialize CUDA
ensure_libcuda_init(user_context);
if (!cuInit) {
error(user_context) << "Could not find cuda system libraries";
return CUDA_ERROR_FILE_NOT_FOUND;
}
CUresult err = cuInit(0);
if (err != CUDA_SUCCESS) {
error(user_context) << "CUDA: cuInit failed: "
<< get_error_name(err);
return err;
}
// Make sure we have a device
int deviceCount = 0;
err = cuDeviceGetCount(&deviceCount);
if (err != CUDA_SUCCESS) {
error(user_context) << "CUDA: cuGetDeviceCount failed: "
<< get_error_name(err);
return err;
}
if (deviceCount <= 0) {
halide_error(user_context, "CUDA: No devices available");
return CUDA_ERROR_NO_DEVICE;
}
int device = halide_get_gpu_device(user_context);
if (device == -1 && deviceCount == 1) {
device = 0;
} else if (device == -1) {
debug(user_context) << "CUDA: Multiple CUDA devices detected. Selecting the one with the most cores.\n";
int best_core_count = 0;
for (int i = 0; i < deviceCount; i++) {
CUdevice dev;
CUresult status = cuDeviceGet(&dev, i);
if (status != CUDA_SUCCESS) {
debug(user_context) << " Failed to get device " << i << "\n";
continue;
}
int core_count = 0;
status = cuDeviceGetAttribute(&core_count, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
if (status != CUDA_SUCCESS) {
continue;
}
debug(user_context) << " Device " << i << " has " << core_count << " cores\n";
if (core_count >= best_core_count) {
device = i;
best_core_count = core_count;
}
}
}
// Get device
CUdevice dev;
CUresult status = cuDeviceGet(&dev, device);
if (status != CUDA_SUCCESS) {
halide_error(user_context, "CUDA: Failed to get device\n");
return status;
}
debug(user_context) << " Got device " << dev << "\n";
// Dump device attributes
#ifdef DEBUG_RUNTIME
{
char name[256];
name[0] = 0;
err = cuDeviceGetName(name, 256, dev);
debug(user_context) << " " << name << "\n";
if (err != CUDA_SUCCESS) {
error(user_context) << "CUDA: cuDeviceGetName failed: "
<< get_error_name(err);
return err;
}
size_t memory = 0;
err = cuDeviceTotalMem(&memory, dev);
debug(user_context) << " total memory: " << (int)(memory >> 20) << " MB\n";
if (err != CUDA_SUCCESS) {
error(user_context) << "CUDA: cuDeviceTotalMem failed: "
<< get_error_name(err);
return err;
}
// Declare variables for other state we want to query.
int max_threads_per_block = 0, warp_size = 0, num_cores = 0;
int max_block_size[] = {0, 0, 0};
int max_grid_size[] = {0, 0, 0};
int max_shared_mem = 0, max_constant_mem = 0;
int cc_major = 0, cc_minor = 0;
struct {
int *dst;
CUdevice_attribute attr;
} attrs[] = {
{&max_threads_per_block, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK},
{&warp_size, CU_DEVICE_ATTRIBUTE_WARP_SIZE},
{&num_cores, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT},
{&max_block_size[0], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X},
{&max_block_size[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y},
{&max_block_size[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z},
{&max_grid_size[0], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X},
{&max_grid_size[1], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y},
{&max_grid_size[2], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z},
{&max_shared_mem, CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK},
{&max_constant_mem, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY},
{&cc_major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR},
{&cc_minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR},
{nullptr, CU_DEVICE_ATTRIBUTE_MAX}};
// Do all the queries.
for (int i = 0; attrs[i].dst; i++) {
err = cuDeviceGetAttribute(attrs[i].dst, attrs[i].attr, dev);
if (err != CUDA_SUCCESS) {
error(user_context)
<< "CUDA: cuDeviceGetAttribute failed ("
<< get_error_name(err)
<< ") for attribute " << (int)attrs[i].attr;
return err;
}
}
// threads per core is a function of the compute capability
int threads_per_core;
switch (cc_major) {
case 1:
threads_per_core = 8;
break;
case 2:
threads_per_core = (cc_minor == 0 ? 32 : 48);
break;
case 3:
threads_per_core = 192;
break;
case 5:
threads_per_core = 128;
break;
case 6:
threads_per_core = (cc_minor == 0 ? 64 : 128);
break;
case 7:
threads_per_core = 64;
break;
default:
threads_per_core = 0;
break;
}
debug(user_context)
<< " max threads per block: " << max_threads_per_block << "\n"
<< " warp size: " << warp_size << "\n"
<< " max block size: " << max_block_size[0]
<< " " << max_block_size[1] << " " << max_block_size[2] << "\n"
<< " max grid size: " << max_grid_size[0]
<< " " << max_grid_size[1] << " " << max_grid_size[2] << "\n"
<< " max shared memory per block: " << max_shared_mem << "\n"
<< " max constant memory per block: " << max_constant_mem << "\n"
<< " compute capability " << cc_major << "." << cc_minor << "\n"
<< " cuda cores: " << num_cores << " x " << threads_per_core
<< " = " << num_cores * threads_per_core << "\n";
}
#endif
// Create context
debug(user_context) << " cuCtxCreate " << dev << " -> ";
err = cuCtxCreate(ctx, 0, dev);
if (err != CUDA_SUCCESS) {
debug(user_context) << get_error_name(err) << "\n";
error(user_context) << "CUDA: cuCtxCreate failed: "
<< get_error_name(err);
return err;
} else {
unsigned int version = 0;
cuCtxGetApiVersion(*ctx, &version);
debug(user_context) << *ctx << "(" << version << ")\n";
}
// Creation automatically pushes the context, but we'll pop to allow the caller
// to decide when to push.
CUcontext dummy;
err = cuCtxPopCurrent(&dummy);
if (err != CUDA_SUCCESS) {
error(user_context) << "CUDA: cuCtxPopCurrent failed: "
<< get_error_name(err);
return err;
}
return CUDA_SUCCESS;
}
// This feature may be useful during CUDA backend or runtime
// development. It does not seem to find many errors in general Halide
// use and causes false positives in at least one environment, where
// it prevents using debug mode with cuda.
#define ENABLE_POINTER_VALIDATION 0
WEAK bool validate_device_pointer(void *user_context, halide_buffer_t *buf, size_t size = 0) {
// The technique using cuPointerGetAttribute and CU_POINTER_ATTRIBUTE_CONTEXT
// requires that unified virtual addressing be enabled, which is not the case
// for 32-bit processes on Mac OS X. So for now, as a total hack, just return true
// in 32-bit. This could of course be wrong the other way for cards that only
// support 32-bit addressing in 64-bit processes, but I expect those cards do not
// support unified addressing at all.
// TODO: figure out a way to validate pointers in all cases if strictly necessary.
#if defined(BITS_32) || !ENABLE_POINTER_VALIDATION
return true;
#else
if (buf->device == 0) {
return true;
}
CUdeviceptr dev_ptr = (CUdeviceptr)buf->device;
CUcontext ctx;
CUresult result = cuPointerGetAttribute(&ctx, CU_POINTER_ATTRIBUTE_CONTEXT, dev_ptr);
if (result != CUDA_SUCCESS) {
error(user_context) << "Bad device pointer " << (void *)dev_ptr
<< ": cuPointerGetAttribute returned "
<< get_error_name(result);
return false;
}
return true;
#endif
}
WEAK CUmodule compile_kernel(void *user_context, const char *ptx_src, int size) {
debug(user_context) << "CUDA: compile_kernel cuModuleLoadData " << (void *)ptx_src << ", " << size << " -> ";
CUjit_option options[] = {CU_JIT_MAX_REGISTERS};
unsigned int max_regs_per_thread = 64;
// A hack to enable control over max register count for
// testing. This should be surfaced in the schedule somehow
// instead.
char *regs = getenv("HL_CUDA_JIT_MAX_REGISTERS");
if (regs) {
max_regs_per_thread = atoi(regs);
}
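// For example, running a pipeline as
//   HL_CUDA_JIT_MAX_REGISTERS=32 ./my_pipeline
// (binary name illustrative) caps each thread at 32 registers, which can
// raise occupancy at the cost of register spills.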
void *optionValues[] = {(void *)(uintptr_t)max_regs_per_thread};
CUmodule loaded_module;
CUresult err = cuModuleLoadDataEx(&loaded_module, ptx_src, 1, options, optionValues);
if (err != CUDA_SUCCESS) {
error(user_context) << "CUDA: cuModuleLoadData failed: "
<< get_error_name(err);
return nullptr;
} else {
debug(user_context) << (void *)(loaded_module) << "\n";
}
return loaded_module;
}
} // namespace Cuda
} // namespace Internal
} // namespace Runtime
} // namespace Halide
extern "C" {
WEAK int halide_cuda_initialize_kernels(void *user_context, void **state_ptr, const char *ptx_src, int size) {
debug(user_context) << "CUDA: halide_cuda_initialize_kernels (user_context: " << user_context
<< ", state_ptr: " << state_ptr
<< ", ptx_src: " << (void *)ptx_src
<< ", size: " << size << "\n";
Context ctx(user_context);
if (ctx.error != 0) {
return ctx.error;
}
#ifdef DEBUG_RUNTIME
uint64_t t_before = halide_current_time_ns(user_context);
#endif
CUmodule loaded_module;
if (!compilation_cache.kernel_state_setup(user_context, state_ptr, ctx.context, loaded_module,
compile_kernel, user_context, ptx_src, size)) {
return halide_error_code_generic_error;
}
halide_assert(user_context, loaded_module != nullptr);
#ifdef DEBUG_RUNTIME
uint64_t t_after = halide_current_time_ns(user_context);
debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 << " ms\n";
#endif
return 0;
}
WEAK void halide_cuda_finalize_kernels(void *user_context, void *state_ptr) {
Context ctx(user_context);
if (ctx.error == 0) {
compilation_cache.release_hold(user_context, ctx.context, state_ptr);
}
}
WEAK int halide_cuda_release_unused_device_allocations(void *user_context) {
FreeListItem *to_free;
{
ScopedMutexLock lock(&free_list_lock);
to_free = free_list;
free_list = nullptr;
}
while (to_free) {
debug(user_context) << " cuMemFree " << (void *)(to_free->ptr) << "\n";
cuMemFree(to_free->ptr);
FreeListItem *next = to_free->next;
free(to_free);
to_free = next;
}
return 0;
}
namespace Halide {
namespace Runtime {
namespace Internal {
WEAK halide_device_allocation_pool cuda_allocation_pool;
WEAK __attribute__((constructor)) void register_cuda_allocation_pool() {
cuda_allocation_pool.release_unused = &halide_cuda_release_unused_device_allocations;
halide_register_device_allocation_pool(&cuda_allocation_pool);
}
ALWAYS_INLINE uint64_t quantize_allocation_size(uint64_t sz) {
int z = __builtin_clzll(sz);
if (z < 60) {
sz--;
sz = sz >> (60 - z);
sz++;
sz = sz << (60 - z);
}
return sz;
}
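// Worked example of the rounding above: the size is rounded up so that only
// its top four significant bits can be set. For sz = 100000 (a 17-bit
// number), z = 47, so the low 13 bits are rounded away:
//   (((100000 - 1) >> 13) + 1) << 13 == 13 << 13 == 106496.
// Sizes already of that form map to themselves, e.g. 106496 -> 106496.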
} // namespace Internal
} // namespace Runtime
} // namespace Halide
WEAK int halide_cuda_device_free(void *user_context, halide_buffer_t *buf) {
// halide_device_free, at present, can be exposed to clients and they
// should be allowed to call halide_device_free on any halide_buffer_t
// including ones that have never been used with a GPU.
if (buf->device == 0) {
return 0;
}
CUdeviceptr dev_ptr = (CUdeviceptr)buf->device;
debug(user_context)
<< "CUDA: halide_cuda_device_free (user_context: " << user_context
<< ", buf: " << buf << ")\n";
Context ctx(user_context);
if (ctx.error != CUDA_SUCCESS) {
return ctx.error;
}
#ifdef DEBUG_RUNTIME
uint64_t t_before = halide_current_time_ns(user_context);
#endif
halide_assert(user_context, validate_device_pointer(user_context, buf));
CUresult err = CUDA_SUCCESS;
if (halide_can_reuse_device_allocations(user_context)) {
debug(user_context) << " caching allocation for later use: " << (void *)(dev_ptr) << "\n";
FreeListItem *item = (FreeListItem *)malloc(sizeof(FreeListItem));
item->ctx = ctx.context;
item->size = quantize_allocation_size(buf->size_in_bytes());
item->ptr = dev_ptr;
if (cuStreamSynchronize) {
// We don't want to use a buffer freed on one stream on
// another, as there are no synchronization guarantees and
// everything is async.
int result = halide_cuda_get_stream(user_context, ctx.context, &item->stream);
if (result != 0) {
error(user_context) << "CUDA: In halide_cuda_device_free, halide_cuda_get_stream returned " << result << "\n";
}
} else {
item->stream = nullptr;
}
{
ScopedMutexLock lock(&free_list_lock);
item->next = free_list;
free_list = item;
}
} else {
debug(user_context) << " cuMemFree " << (void *)(dev_ptr) << "\n";
err = cuMemFree(dev_ptr);
// If cuMemFree fails, it isn't likely to succeed later, so just drop
// the reference.
}
buf->device_interface->impl->release_module();
buf->device_interface = nullptr;
buf->device = 0;
if (err != CUDA_SUCCESS) {
// We may be called as a destructor, so don't raise an error here.
return err;
}
#ifdef DEBUG_RUNTIME
uint64_t t_after = halide_current_time_ns(user_context);
debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 << " ms\n";
#endif
return 0;
}
WEAK int halide_cuda_device_release(void *user_context) {
debug(user_context)
<< "CUDA: halide_cuda_device_release (user_context: " << user_context << ")\n";
// If we haven't even loaded libcuda, don't load it just to quit.
if (!cuInit) {
return 0;
}
int err;
CUcontext ctx;
err = halide_cuda_acquire_context(user_context, &ctx, false);
if (err != CUDA_SUCCESS) {
return err;
}
if (ctx) {
// It's possible that this is being called from the destructor of
// a static variable, in which case the driver may already be
// shutting down.
err = cuCtxPushCurrent(ctx);
if (err == CUDA_SUCCESS) {
err = cuCtxSynchronize();
}
halide_assert(user_context, err == CUDA_SUCCESS || err == CUDA_ERROR_DEINITIALIZED);
// Dump the contents of the free list, ignoring errors.
halide_cuda_release_unused_device_allocations(user_context);
compilation_cache.delete_context(user_context, ctx, cuModuleUnload);
CUcontext old_ctx;
cuCtxPopCurrent(&old_ctx);
// Only destroy the context if we own it
{
ScopedMutexLock lock(&context_lock);
if (ctx == context) {
debug(user_context) << " cuCtxDestroy " << context << "\n";
err = cuProfilerStop();
err = cuCtxDestroy(context);
halide_assert(user_context, err == CUDA_SUCCESS || err == CUDA_ERROR_DEINITIALIZED);
context = nullptr;
}
} // lock
}
halide_cuda_release_context(user_context);
return 0;
}
WEAK int halide_cuda_device_malloc(void *user_context, halide_buffer_t *buf) {
debug(user_context)
<< "CUDA: halide_cuda_device_malloc (user_context: " << user_context
<< ", buf: " << buf << ")\n";
Context ctx(user_context);
if (ctx.error != CUDA_SUCCESS) {
return ctx.error;
}
size_t size = buf->size_in_bytes();
if (halide_can_reuse_device_allocations(user_context)) {
size = quantize_allocation_size(size);
}
halide_assert(user_context, size != 0);
if (buf->device) {
// This buffer already has a device allocation
halide_assert(user_context, validate_device_pointer(user_context, buf, size));
return 0;
}
// Check that all strides are non-negative.
for (int i = 0; i < buf->dimensions; i++) {
halide_assert(user_context, buf->dim[i].stride >= 0);
}
debug(user_context) << " allocating " << *buf << "\n";
#ifdef DEBUG_RUNTIME
uint64_t t_before = halide_current_time_ns(user_context);
#endif
CUdeviceptr p = 0;
FreeListItem *to_free = nullptr;
if (halide_can_reuse_device_allocations(user_context)) {
CUstream stream = nullptr;
if (cuStreamSynchronize != nullptr) {
int result = halide_cuda_get_stream(user_context, ctx.context, &stream);
if (result != 0) {
error(user_context) << "CUDA: In halide_cuda_device_malloc, halide_cuda_get_stream returned " << result << "\n";
}
}
ScopedMutexLock lock(&free_list_lock);
// Best-fit allocation. There are three tunable constants
// here. A bucket is claimed if the size requested is at least
// 7/8 of the size of the bucket. We keep at most 32 unused
// allocations. We round up each allocation size to its top 4
// most significant bits (see quantize_allocation_size).
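// Worked example: a request of size 7500 may claim a cached 8192-byte
// item, since 7168 (== 8192 / 8 * 7) <= 7500 <= 8192, while a request
// of size 7000 leaves too much slop and cannot claim it.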
FreeListItem *best = nullptr, *item = free_list;
FreeListItem **best_prev = nullptr, **prev_ptr = &free_list;
int depth = 0;
while (item) {
if ((size <= item->size) && // Fits
(size >= (item->size / 8) * 7) && // Not too much slop
(ctx.context == item->ctx) && // Same cuda context
(stream == item->stream) && // Can only safely re-use on the same stream on which it was freed
((best == nullptr) || (best->size > item->size))) { // Better than previous best fit
best = item;
best_prev = prev_ptr;
prev_ptr = &item->next;
item = item->next;
} else if (depth > 32) {
// Allocations after here have not been used for a
// long time. Just detach the rest of the free list
// and defer the actual cuMemFree calls until after we
// release the free_list_lock.
to_free = item;
*prev_ptr = nullptr;
item = nullptr;
break;
} else {
prev_ptr = &item->next;
item = item->next;
}
depth++;
}
if (best) {
p = best->ptr;
*best_prev = best->next;
free(best);
}
}
while (to_free) {
FreeListItem *next = to_free->next;
cuMemFree(to_free->ptr);
free(to_free);
to_free = next;
}
if (!p) {
debug(user_context) << " cuMemAlloc " << (uint64_t)size << " -> ";
// Quantize all allocation sizes to the top 4 bits, to make
// reuse likelier. Wastes on average 4% memory per allocation.
CUresult err = cuMemAlloc(&p, size);
if (err == CUDA_ERROR_OUT_OF_MEMORY) {
halide_cuda_release_unused_device_allocations(user_context);
err = cuMemAlloc(&p, size);
}
if (err != CUDA_SUCCESS) {
debug(user_context) << get_error_name(err) << "\n";
error(user_context) << "CUDA: cuMemAlloc failed: "
<< get_error_name(err);
return err;
} else {
debug(user_context) << (void *)p << "\n";
}
}
halide_assert(user_context, p);
buf->device = p;
buf->device_interface = &cuda_device_interface;
buf->device_interface->impl->use_module();
#ifdef DEBUG_RUNTIME
uint64_t t_after = halide_current_time_ns(user_context);
debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 << " ms\n";
#endif
return 0;
}
namespace {
WEAK int cuda_do_multidimensional_copy(void *user_context, const device_copy &c,
uint64_t src, uint64_t dst, int d, bool from_host, bool to_host) {
if (d > MAX_COPY_DIMS) {
error(user_context) << "Buffer has too many dimensions to copy to/from GPU\n";
return -1;
} else if (d == 0) {
CUresult err = CUDA_SUCCESS;
const char *copy_name;
debug(user_context) << " from " << (from_host ? "host" : "device")
<< " to " << (to_host ? "host" : "device") << ", "
<< (void *)src << " -> " << (void *)dst << ", " << c.chunk_size << " bytes\n";
if (!from_host && to_host) {
debug(user_context) << "cuMemcpyDtoH(" << (void *)dst << ", " << (void *)src << ", " << c.chunk_size << ")\n";
copy_name = "cuMemcpyDtoH";
err = cuMemcpyDtoH((void *)dst, (CUdeviceptr)src, c.chunk_size);
} else if (from_host && !to_host) {
debug(user_context) << "cuMemcpyHtoD(" << (void *)dst << ", " << (void *)src << ", " << c.chunk_size << ")\n";
copy_name = "cuMemcpyHtoD";
err = cuMemcpyHtoD((CUdeviceptr)dst, (void *)src, c.chunk_size);
} else if (!from_host && !to_host) {
debug(user_context) << "cuMemcpyDtoD(" << (void *)dst << ", " << (void *)src << ", " << c.chunk_size << ")\n";
copy_name = "cuMemcpyDtoD";
err = cuMemcpyDtoD((CUdeviceptr)dst, (CUdeviceptr)src, c.chunk_size);
} else if (dst != src) {
debug(user_context) << "memcpy(" << (void *)dst << ", " << (void *)src << ", " << c.chunk_size << ")\n";
// Could reach here if a user called directly into the
// cuda API for a device->host copy on a source buffer
// with device_dirty = false.
memcpy((void *)dst, (void *)src, c.chunk_size);
}
if (err != CUDA_SUCCESS) {
error(user_context) << "CUDA: " << copy_name << " failed: " << get_error_name(err);
return (int)err;
}
} else {
ssize_t src_off = 0, dst_off = 0;
for (int i = 0; i < (int)c.extent[d - 1]; i++) {
int err = cuda_do_multidimensional_copy(user_context, c, src + src_off, dst + dst_off, d - 1, from_host, to_host);
dst_off += c.dst_stride_bytes[d - 1];
src_off += c.src_stride_bytes[d - 1];
if (err) {
return err;
}
}
}
return 0;
}
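// Worked example of the recursion above: when called with d == 3, the top
// level loops c.extent[2] times, each iteration advancing src/dst by the
// dimension-2 strides and recursing with d == 2; levels 2 and 1 do the
// same over c.extent[1] and c.extent[0]; and each d == 0 call issues one
// copy of c.chunk_size contiguous bytes.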
} // namespace
WEAK int halide_cuda_buffer_copy(void *user_context, struct halide_buffer_t *src,
const struct halide_device_interface_t *dst_device_interface,
struct halide_buffer_t *dst) {
// We only handle copies to cuda or to host
halide_assert(user_context, dst_device_interface == nullptr ||
dst_device_interface == &cuda_device_interface);
if ((src->device_dirty() || src->host == nullptr) &&
src->device_interface != &cuda_device_interface) {
halide_assert(user_context, dst_device_interface == &cuda_device_interface);
// This is handled at the higher level.
return halide_error_code_incompatible_device_interface;
}
bool from_host = (src->device_interface != &cuda_device_interface) ||
(src->device == 0) ||
(src->host_dirty() && src->host != nullptr);
bool to_host = !dst_device_interface;
halide_assert(user_context, from_host || src->device);
halide_assert(user_context, to_host || dst->device);
device_copy c = make_buffer_copy(src, from_host, dst, to_host);
int err = 0;
{
Context ctx(user_context);
if (ctx.error != CUDA_SUCCESS) {
return ctx.error;
}
debug(user_context)
<< "CUDA: halide_cuda_buffer_copy (user_context: " << user_context
<< ", src: " << src << ", dst: " << dst << ")\n";
#ifdef DEBUG_RUNTIME
uint64_t t_before = halide_current_time_ns(user_context);
if (!from_host) {
halide_assert(user_context, validate_device_pointer(user_context, src));
}
if (!to_host) {
halide_assert(user_context, validate_device_pointer(user_context, dst));
}
#endif
err = cuda_do_multidimensional_copy(user_context, c, c.src + c.src_begin, c.dst, dst->dimensions, from_host, to_host);
#ifdef DEBUG_RUNTIME
uint64_t t_after = halide_current_time_ns(user_context);
debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 << " ms\n";
#endif
}
return err;
}
namespace {
WEAK int cuda_device_crop_from_offset(const struct halide_buffer_t *src,
int64_t offset,
struct halide_buffer_t *dst) {
dst->device = src->device + offset;
dst->device_interface = src->device_interface;
dst->set_device_dirty(src->device_dirty());
return 0;
}
} // namespace
WEAK int halide_cuda_device_crop(void *user_context, const struct halide_buffer_t *src,
struct halide_buffer_t *dst) {
debug(user_context)
<< "CUDA: halide_cuda_device_crop (user_context: " << user_context
<< ", src: " << src << ", dst: " << dst << ")\n";
// Pointer arithmetic works fine.
const int64_t offset = calc_device_crop_byte_offset(src, dst);
return cuda_device_crop_from_offset(src, offset, dst);
}
WEAK int halide_cuda_device_slice(void *user_context, const struct halide_buffer_t *src,
int slice_dim, int slice_pos,
struct halide_buffer_t *dst) {
debug(user_context)
<< "CUDA: halide_cuda_device_slice (user_context: " << user_context
<< ", src: " << src << ", slice_dim " << slice_dim << ", slice_pos "
<< slice_pos << ", dst: " << dst << ")\n";
// Pointer arithmetic works fine.
const int64_t offset = calc_device_slice_byte_offset(src, slice_dim, slice_pos);
return cuda_device_crop_from_offset(src, offset, dst);
}
WEAK int halide_cuda_device_release_crop(void *user_context, struct halide_buffer_t *dst) {
debug(user_context)
<< "CUDA: halide_cuda_release_crop (user_context: " << user_context
<< ", dst: " << dst << ")\n";
return 0;
}
WEAK int halide_cuda_copy_to_device(void *user_context, halide_buffer_t *buf) {
return halide_cuda_buffer_copy(user_context, buf, &cuda_device_interface, buf);
}
WEAK int halide_cuda_copy_to_host(void *user_context, halide_buffer_t *buf) {
return halide_cuda_buffer_copy(user_context, buf, nullptr, buf);
}
// Used to generate correct timings when tracing
WEAK int halide_cuda_device_sync(void *user_context, struct halide_buffer_t *) {
debug(user_context)
<< "CUDA: halide_cuda_device_sync (user_context: " << user_context << ")\n";
Context ctx(user_context);
if (ctx.error != CUDA_SUCCESS) {
return ctx.error;
}
#ifdef DEBUG_RUNTIME
uint64_t t_before = halide_current_time_ns(user_context);
#endif
CUresult err;
if (cuStreamSynchronize != nullptr) {
CUstream stream;
int result = halide_cuda_get_stream(user_context, ctx.context, &stream);
if (result != 0) {
error(user_context) << "CUDA: In halide_cuda_device_sync, halide_cuda_get_stream returned " << result << "\n";
}
err = cuStreamSynchronize(stream);
} else {
err = cuCtxSynchronize();
}
if (err != CUDA_SUCCESS) {
error(user_context) << "CUDA: cuCtxSynchronize failed: "
<< get_error_name(err);
return err;
}
#ifdef DEBUG_RUNTIME
uint64_t t_after = halide_current_time_ns(user_context);
debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 << " ms\n";
#endif
return 0;
}
WEAK int halide_cuda_run(void *user_context,
void *state_ptr,
const char *entry_name,
int blocksX, int blocksY, int blocksZ,
int threadsX, int threadsY, int threadsZ,
int shared_mem_bytes,
size_t arg_sizes[],
void *args[],
int8_t arg_is_buffer[],
int num_attributes,
float *vertex_buffer,
int num_coords_dim0,
int num_coords_dim1) {
debug(user_context) << "CUDA: halide_cuda_run ("
<< "user_context: " << user_context << ", "
<< "entry: " << entry_name << ", "
<< "blocks: " << blocksX << "x" << blocksY << "x" << blocksZ << ", "
<< "threads: " << threadsX << "x" << threadsY << "x" << threadsZ << ", "
<< "shmem: " << shared_mem_bytes << "\n";
CUresult err;
Context ctx(user_context);
if (ctx.error != CUDA_SUCCESS) {
return ctx.error;
}
debug(user_context) << "Got context.\n";
#ifdef DEBUG_RUNTIME
uint64_t t_before = halide_current_time_ns(user_context);
#endif
CUmodule mod{};
bool found = compilation_cache.lookup(ctx.context, state_ptr, mod);
halide_assert(user_context, found && mod != nullptr);
debug(user_context) << "Got module " << mod << "\n";
CUfunction f;
err = cuModuleGetFunction(&f, mod, entry_name);
debug(user_context) << "Got function " << f << "\n";
if (err != CUDA_SUCCESS) {
error(user_context) << "CUDA: cuModuleGetFunction failed: "
<< get_error_name(err);
return err;
}
size_t num_args = 0;
while (arg_sizes[num_args] != 0) {
debug(user_context) << " halide_cuda_run " << (int)num_args
<< " " << (int)arg_sizes[num_args]
<< " [" << (*((void **)args[num_args])) << " ...] "
<< arg_is_buffer[num_args] << "\n";
num_args++;
}
// We need storage for both the arg and the pointer to it if it
// has to be translated.
void **translated_args = (void **)malloc((num_args + 1) * sizeof(void *));
uint64_t *dev_handles = (uint64_t *)malloc(num_args * sizeof(uint64_t));
for (size_t i = 0; i <= num_args; i++) { // Get nullptr at end.
if (arg_is_buffer[i]) {
halide_assert(user_context, arg_sizes[i] == sizeof(uint64_t));
dev_handles[i] = ((halide_buffer_t *)args[i])->device;
translated_args[i] = &(dev_handles[i]);
debug(user_context) << " halide_cuda_run translated arg" << (int)i
<< " [" << (*((void **)translated_args[i])) << " ...]\n";
} else {
translated_args[i] = args[i];
}
}
CUstream stream = nullptr;
// We use the presence of this routine in the cuda driver library
// as a test for stream support in this cuda implementation.
if (cuStreamSynchronize != nullptr) {
int result = halide_cuda_get_stream(user_context, ctx.context, &stream);
if (result != 0) {
error(user_context) << "CUDA: In halide_cuda_run, halide_cuda_get_stream returned " << result << "\n";
free(dev_handles);
free(translated_args);
return result;
}
}
err = cuLaunchKernel(f,
blocksX, blocksY, blocksZ,
threadsX, threadsY, threadsZ,
shared_mem_bytes,
stream,
translated_args,
nullptr);
free(dev_handles);
free(translated_args);
if (err != CUDA_SUCCESS) {
error(user_context) << "CUDA: cuLaunchKernel failed: "
<< get_error_name(err);
return err;
}
#ifdef DEBUG_RUNTIME
err = cuCtxSynchronize();
if (err != CUDA_SUCCESS) {
error(user_context) << "CUDA: cuCtxSynchronize failed: "
<< get_error_name(err);
return err;
}
uint64_t t_after = halide_current_time_ns(user_context);
debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 << " ms\n";
#endif
return 0;
}
WEAK int halide_cuda_device_and_host_malloc(void *user_context, struct halide_buffer_t *buf) {
return halide_default_device_and_host_malloc(user_context, buf, &cuda_device_interface);
}
WEAK int halide_cuda_device_and_host_free(void *user_context, struct halide_buffer_t *buf) {
return halide_default_device_and_host_free(user_context, buf, &cuda_device_interface);
}
WEAK int halide_cuda_wrap_device_ptr(void *user_context, struct halide_buffer_t *buf, uint64_t device_ptr) {
halide_assert(user_context, buf->device == 0);
if (buf->device != 0) {
return -2;
}
buf->device = device_ptr;
buf->device_interface = &cuda_device_interface;
buf->device_interface->impl->use_module();
#ifdef DEBUG_RUNTIME
if (!validate_device_pointer(user_context, buf)) {
buf->device_interface->impl->release_module();
buf->device = 0;
buf->device_interface = nullptr;
return -3;
}
#endif
return 0;
}
WEAK int halide_cuda_detach_device_ptr(void *user_context, struct halide_buffer_t *buf) {
if (buf->device == 0) {
return 0;
}
halide_assert(user_context, buf->device_interface == &cuda_device_interface);
buf->device_interface->impl->release_module();
buf->device = 0;
buf->device_interface = nullptr;
return 0;
}
WEAK uintptr_t halide_cuda_get_device_ptr(void *user_context, struct halide_buffer_t *buf) {
if (buf->device == 0) {
return 0;
}
halide_assert(user_context, buf->device_interface == &cuda_device_interface);
return (uintptr_t)buf->device;
}
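// An illustrative sketch, guarded out of the build, of the wrap/detach
// lifecycle above: adopt a device pointer allocated outside Halide, use the
// buffer, then detach it before freeing the memory ourselves. The shape and
// function name are arbitrary illustrations.
#if 0
void wrap_external_allocation(void *user_context) {
    CUdeviceptr raw = 0;
    if (cuMemAlloc(&raw, 256 * 256 * sizeof(float)) != CUDA_SUCCESS) {
        return;
    }
    halide_dimension_t shape[2] = {{0, 256, 1}, {0, 256, 256}};
    halide_buffer_t buf = {};
    buf.dimensions = 2;
    buf.dim = shape;
    buf.type = halide_type_t(halide_type_float, 32);
    halide_cuda_wrap_device_ptr(user_context, &buf, (uint64_t)raw);
    // ... run pipelines or copies against buf ...
    halide_cuda_detach_device_ptr(user_context, &buf);  // Halide lets go
    cuMemFree(raw);  // ownership stayed with us throughout
}
#endif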
WEAK const halide_device_interface_t *halide_cuda_device_interface() {
return &cuda_device_interface;
}
WEAK int halide_cuda_compute_capability(void *user_context, int *major, int *minor) {
if (!lib_cuda && !cuInit) {
// If cuda can't be found, we want to return 0, 0 and it's not
// considered an error. So we should be very careful about
// looking for libcuda without tripping any errors in the rest
// of this runtime.
void *sym = halide_cuda_get_symbol(user_context, "cuInit");
if (!sym) {
*major = *minor = 0;
return 0;
}
}
{
Context ctx(user_context);
if (ctx.error != 0) {
return ctx.error;
}
CUresult err;
CUdevice dev;
err = cuCtxGetDevice(&dev);
if (err != CUDA_SUCCESS) {
error(user_context)
<< "CUDA: cuCtxGetDevice failed ("
<< Halide::Runtime::Internal::Cuda::get_error_name(err)
<< ")";
return err;
}
err = cuDeviceGetAttribute(major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev);
if (err == CUDA_SUCCESS) {
err = cuDeviceGetAttribute(minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev);
}
if (err != CUDA_SUCCESS) {
error(user_context)
<< "CUDA: cuDeviceGetAttribute failed ("
<< Halide::Runtime::Internal::Cuda::get_error_name(err)
<< ")";
return err;
}
}
return 0;
}
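// An illustrative sketch, guarded out of the build, of using the entry
// point above to gate a feature on the device's compute capability. The
// helper name and threshold are arbitrary illustrations.
#if 0
bool device_is_sm70_or_newer(void *user_context) {
    int major = 0, minor = 0;
    if (halide_cuda_compute_capability(user_context, &major, &minor) != 0) {
        return false;  // treat errors as "not supported"
    }
    // major == 0, minor == 0 means no CUDA driver or device was found.
    return major >= 7;  // Volta (sm_70) or newer
}
#endif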
namespace {
WEAK __attribute__((destructor)) void halide_cuda_cleanup() {
compilation_cache.release_all(nullptr, cuModuleUnload);
halide_cuda_device_release(nullptr);
}
} // namespace
} // extern "C" linkage
namespace Halide {
namespace Runtime {
namespace Internal {
namespace Cuda {
WEAK const char *get_error_name(CUresult err) {
switch (err) {
case CUDA_SUCCESS:
return "CUDA_SUCCESS";
case CUDA_ERROR_INVALID_VALUE:
return "CUDA_ERROR_INVALID_VALUE";
case CUDA_ERROR_OUT_OF_MEMORY:
return "CUDA_ERROR_OUT_OF_MEMORY";
case CUDA_ERROR_NOT_INITIALIZED:
return "CUDA_ERROR_NOT_INITIALIZED";
case CUDA_ERROR_DEINITIALIZED:
return "CUDA_ERROR_DEINITIALIZED";
case CUDA_ERROR_PROFILER_DISABLED:
return "CUDA_ERROR_PROFILER_DISABLED";
case CUDA_ERROR_PROFILER_NOT_INITIALIZED:
return "CUDA_ERROR_PROFILER_NOT_INITIALIZED";
case CUDA_ERROR_PROFILER_ALREADY_STARTED:
return "CUDA_ERROR_PROFILER_ALREADY_STARTED";
case CUDA_ERROR_PROFILER_ALREADY_STOPPED:
return "CUDA_ERROR_PROFILER_ALREADY_STOPPED";
case CUDA_ERROR_NO_DEVICE:
return "CUDA_ERROR_NO_DEVICE";
case CUDA_ERROR_INVALID_DEVICE:
return "CUDA_ERROR_INVALID_DEVICE";
case CUDA_ERROR_INVALID_IMAGE:
return "CUDA_ERROR_INVALID_IMAGE";
case CUDA_ERROR_INVALID_CONTEXT:
return "CUDA_ERROR_INVALID_CONTEXT";
case CUDA_ERROR_CONTEXT_ALREADY_CURRENT:
return "CUDA_ERROR_CONTEXT_ALREADY_CURRENT";
case CUDA_ERROR_MAP_FAILED:
return "CUDA_ERROR_MAP_FAILED";
case CUDA_ERROR_UNMAP_FAILED:
return "CUDA_ERROR_UNMAP_FAILED";
case CUDA_ERROR_ARRAY_IS_MAPPED:
return "CUDA_ERROR_ARRAY_IS_MAPPED";
case CUDA_ERROR_ALREADY_MAPPED:
return "CUDA_ERROR_ALREADY_MAPPED";
case CUDA_ERROR_NO_BINARY_FOR_GPU:
return "CUDA_ERROR_NO_BINARY_FOR_GPU";
case CUDA_ERROR_ALREADY_ACQUIRED:
return "CUDA_ERROR_ALREADY_ACQUIRED";
case CUDA_ERROR_NOT_MAPPED:
return "CUDA_ERROR_NOT_MAPPED";
case CUDA_ERROR_NOT_MAPPED_AS_ARRAY:
return "CUDA_ERROR_NOT_MAPPED_AS_ARRAY";
case CUDA_ERROR_NOT_MAPPED_AS_POINTER:
return "CUDA_ERROR_NOT_MAPPED_AS_POINTER";
case CUDA_ERROR_ECC_UNCORRECTABLE:
return "CUDA_ERROR_ECC_UNCORRECTABLE";
case CUDA_ERROR_UNSUPPORTED_LIMIT:
return "CUDA_ERROR_UNSUPPORTED_LIMIT";
case CUDA_ERROR_CONTEXT_ALREADY_IN_USE:
return "CUDA_ERROR_CONTEXT_ALREADY_IN_USE";
case CUDA_ERROR_PEER_ACCESS_UNSUPPORTED:
return "CUDA_ERROR_PEER_ACCESS_UNSUPPORTED";
case CUDA_ERROR_INVALID_PTX:
return "CUDA_ERROR_INVALID_PTX";
case CUDA_ERROR_INVALID_GRAPHICS_CONTEXT:
return "CUDA_ERROR_INVALID_GRAPHICS_CONTEXT";
case CUDA_ERROR_NVLINK_UNCORRECTABLE:
return "CUDA_ERROR_NVLINK_UNCORRECTABLE";
case CUDA_ERROR_JIT_COMPILER_NOT_FOUND:
return "CUDA_ERROR_JIT_COMPILER_NOT_FOUND";
case CUDA_ERROR_INVALID_SOURCE:
return "CUDA_ERROR_INVALID_SOURCE";
case CUDA_ERROR_FILE_NOT_FOUND:
return "CUDA_ERROR_FILE_NOT_FOUND";
case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND:
return "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND";
case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED:
return "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED";
case CUDA_ERROR_OPERATING_SYSTEM:
return "CUDA_ERROR_OPERATING_SYSTEM";
case CUDA_ERROR_INVALID_HANDLE:
return "CUDA_ERROR_INVALID_HANDLE";
case CUDA_ERROR_NOT_FOUND:
return "CUDA_ERROR_NOT_FOUND";
case CUDA_ERROR_NOT_READY:
return "CUDA_ERROR_NOT_READY";
case CUDA_ERROR_ILLEGAL_ADDRESS:
return "CUDA_ERROR_ILLEGAL_ADDRESS";
case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES:
return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES";
case CUDA_ERROR_LAUNCH_TIMEOUT:
return "CUDA_ERROR_LAUNCH_TIMEOUT";
case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING:
return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING";
case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED:
return "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED";
case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED:
return "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED";
case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE:
return "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE";
case CUDA_ERROR_CONTEXT_IS_DESTROYED:
return "CUDA_ERROR_CONTEXT_IS_DESTROYED";
// A trap instruction produces the error below, which is how we codegen asserts on the GPU
case CUDA_ERROR_ILLEGAL_INSTRUCTION:
return "Illegal instruction or Halide assertion failure inside kernel";
case CUDA_ERROR_MISALIGNED_ADDRESS:
return "CUDA_ERROR_MISALIGNED_ADDRESS";
case CUDA_ERROR_INVALID_ADDRESS_SPACE:
return "CUDA_ERROR_INVALID_ADDRESS_SPACE";
case CUDA_ERROR_INVALID_PC:
return "CUDA_ERROR_INVALID_PC";
case CUDA_ERROR_LAUNCH_FAILED:
return "CUDA_ERROR_LAUNCH_FAILED";
case CUDA_ERROR_NOT_PERMITTED:
return "CUDA_ERROR_NOT_PERMITTED";
case CUDA_ERROR_NOT_SUPPORTED:
return "CUDA_ERROR_NOT_SUPPORTED";
case CUDA_ERROR_UNKNOWN:
return "CUDA_ERROR_UNKNOWN";
default:
// This is unfortunate as usually get_error_name is called in the middle of
// an error print, but dropping the number on the floor is worse.
error(nullptr) << "Unknown cuda error " << err << "\n";
return "<Unknown error>";
}
}
WEAK halide_device_interface_impl_t cuda_device_interface_impl = {
halide_use_jit_module,
halide_release_jit_module,
halide_cuda_device_malloc,
halide_cuda_device_free,
halide_cuda_device_sync,
halide_cuda_device_release,
halide_cuda_copy_to_host,
halide_cuda_copy_to_device,
halide_cuda_device_and_host_malloc,
halide_cuda_device_and_host_free,
halide_cuda_buffer_copy,
halide_cuda_device_crop,
halide_cuda_device_slice,
halide_cuda_device_release_crop,
halide_cuda_wrap_device_ptr,
halide_cuda_detach_device_ptr,
};
WEAK halide_device_interface_t cuda_device_interface = {
halide_device_malloc,
halide_device_free,
halide_device_sync,
halide_device_release,
halide_copy_to_host,
halide_copy_to_device,
halide_device_and_host_malloc,
halide_device_and_host_free,
halide_buffer_copy,
halide_device_crop,
halide_device_slice,
halide_device_release_crop,
halide_device_wrap_native,
halide_device_detach_native,
halide_cuda_compute_capability,
&cuda_device_interface_impl};
} // namespace Cuda
} // namespace Internal
} // namespace Runtime
} // namespace Halide