Content - 064622e3e2d641ef34ad4426d8ba151a0e1161c8 - 328d19a/cuda.cpp

cuda.cpp
#include "HalideRuntimeCuda.h"
#include "device_buffer_utils.h"
#include "device_interface.h"
#include "printer.h"
#include "mini_cuda.h"

#define INLINE inline __attribute__((always_inline))

namespace Halide { namespace Runtime { namespace Internal { namespace Cuda {

// Define the function pointers for the CUDA API.
#define CUDA_FN(ret, fn, args) WEAK ret (CUDAAPI *fn)args;
#define CUDA_FN_3020(ret, fn, fn_3020, args) WEAK ret (CUDAAPI *fn)args;
#define CUDA_FN_4000(ret, fn, fn_4000, args) WEAK ret (CUDAAPI *fn)args;
#include "cuda_functions.h"

// The default implementation of halide_cuda_get_symbol attempts to load
// the CUDA shared library/DLL, and then get the symbol from it.
WEAK void *lib_cuda = NULL;

extern "C" WEAK void *halide_cuda_get_symbol(void *user_context, const char *name) {
    // Only try to load the library if we can't already get the symbol
    // from the library. Even if the library is NULL, the symbols may
    // already be available in the process.
    void *symbol = halide_get_library_symbol(lib_cuda, name);
    if (symbol) {
        return symbol;
    }

    const char *lib_names[] = {
#ifdef WINDOWS
        "nvcuda.dll",
#else
        "libcuda.so",
        "libcuda.dylib",
        "/Library/Frameworks/CUDA.framework/CUDA",
#endif
    };
    for (size_t i = 0; i < sizeof(lib_names) / sizeof(lib_names[0]); i++) {
        lib_cuda = halide_load_library(lib_names[i]);
        if (lib_cuda) {
            debug(user_context) << "    Loaded CUDA runtime library: " << lib_names[i] << "\n";
            break;
        }
    }

    return halide_get_library_symbol(lib_cuda, name);
}

template <typename T>
INLINE T get_cuda_symbol(void *user_context, const char *name) {
    T s = (T)halide_cuda_get_symbol(user_context, name);
    if (!s) {
        error(user_context) << "CUDA API not found: " << name << "\n";
    }
    return s;
}

// Load a CUDA shared object/dll and get the CUDA API function pointers from it.
WEAK void load_libcuda(void *user_context) {
    debug(user_context) << "    load_libcuda (user_context: " << user_context << ")\n";
    halide_assert(user_context, cuInit == NULL);

    #define CUDA_FN(ret, fn, args) fn = get_cuda_symbol<ret (CUDAAPI *)args>(user_context, #fn);
    #define CUDA_FN_3020(ret, fn, fn_3020, args) fn = get_cuda_symbol<ret (CUDAAPI *)args>(user_context, #fn_3020);
    #define CUDA_FN_4000(ret, fn, fn_4000, args) fn = get_cuda_symbol<ret (CUDAAPI *)args>(user_context, #fn_4000);
    #include "cuda_functions.h"
}

extern WEAK halide_device_interface_t cuda_device_interface;

WEAK const char *get_error_name(CUresult error);
WEAK CUresult create_cuda_context(void *user_context, CUcontext *ctx);

// A cuda context defined in this module with weak linkage
CUcontext WEAK context = 0;
volatile int WEAK thread_lock = 0;

}}}} // namespace Halide::Runtime::Internal::Cuda

using namespace Halide::Runtime::Internal;
using namespace Halide::Runtime::Internal::Cuda;

extern "C" {

// The default implementation of halide_cuda_acquire_context uses the global
// pointers above, and serializes access with a spin lock.
// Overriding implementations of acquire/release must implement the following
// behavior:
// - halide_cuda_acquire_context should always store a valid context/command
//   queue in ctx/q, or return an error code.
// - A call to halide_cuda_acquire_context is followed by a matching call to
//   halide_cuda_release_context. halide_cuda_acquire_context should block while a
//   previous call (if any) has not yet been released via halide_cuda_release_context.
WEAK int halide_cuda_acquire_context(void *user_context, CUcontext *ctx, bool create = true) {
    // TODO: Should we use a more "assertive" assert? these asserts do
    // not block execution on failure.
    halide_assert(user_context, ctx != NULL);

    halide_assert(user_context, &thread_lock != NULL);
    while (__sync_lock_test_and_set(&thread_lock, 1)) { }

    // If the context has not been initialized, initialize it now.
    halide_assert(user_context, &context != NULL);
    if (context == NULL && create) {
        CUresult error = create_cuda_context(user_context, &context);
        if (error != CUDA_SUCCESS) {
            __sync_lock_release(&thread_lock);
            return error;
        }
    }

    *ctx = context;
    return 0;
}

WEAK int halide_cuda_release_context(void *user_context) {
    __sync_lock_release(&thread_lock);
    return 0;
}

} // extern "C"

namespace Halide { namespace Runtime { namespace Internal { namespace Cuda {

// Helper object to acquire and release the cuda context.
class Context {
    void *user_context;

public:
    CUcontext context;
    int error;

    // Constructor sets 'error' if any occurs.
    Context(void *user_context) : user_context(user_context),
                                  context(NULL),
                                  error(CUDA_SUCCESS) {
        if (cuInit == NULL) {
            load_libcuda(user_context);
        }

#ifdef DEBUG_RUNTIME
        halide_start_clock(user_context);
#endif
        error = halide_cuda_acquire_context(user_context, &context);
        halide_assert(user_context, context != NULL);
        if (error != 0) {
            return;
        }

        error = cuCtxPushCurrent(context);
    }

    ~Context() {
        CUcontext old;
        cuCtxPopCurrent(&old);

        halide_cuda_release_context(user_context);
    }
};

// Structure to hold the state of a module attached to the context.
// Also used as a linked-list to keep track of all the different
// modules that are attached to a context in order to release them all
// when then context is released.
struct module_state {
    CUmodule module;
    module_state *next;
};
WEAK module_state *state_list = NULL;

WEAK CUresult create_cuda_context(void *user_context, CUcontext *ctx) {
    // Initialize CUDA
    CUresult err = cuInit(0);
    if (err != CUDA_SUCCESS) {
        error(user_context) << "CUDA: cuInit failed: "
                            << get_error_name(err);
        return err;
    }

    // Make sure we have a device
    int deviceCount = 0;
    err = cuDeviceGetCount(&deviceCount);
    if (err != CUDA_SUCCESS) {
        error(user_context) << "CUDA: cuGetDeviceCount failed: "
                            << get_error_name(err);
        return err;
    }
    if (deviceCount <= 0) {
        halide_error(user_context, "CUDA: No devices available");
        return CUDA_ERROR_NO_DEVICE;
    }

    int device = halide_get_gpu_device(user_context);
    if (device == -1 && deviceCount == 1) {
        device = 0;
    } else if (device == -1) {
        debug(user_context) << "CUDA: Multiple CUDA devices detected. Selecting the one with the most cores.\n";
        int best_core_count = 0;
        for (int i = 0; i < deviceCount; i++) {
            CUdevice dev;
            CUresult status = cuDeviceGet(&dev, i);
            if (status != CUDA_SUCCESS) {
                debug(user_context) << "      Failed to get device " << i << "\n";
                continue;
            }
            int core_count = 0;
            status = cuDeviceGetAttribute(&core_count, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
            debug(user_context) << "      Device " << i << " has " << core_count << " cores\n";
            if (status != CUDA_SUCCESS) {
                continue;
            }
            if (core_count >= best_core_count) {
                device = i;
                best_core_count = core_count;
            }
        }
    }

    // Get device
    CUdevice dev;
    CUresult status = cuDeviceGet(&dev, device);
    if (status != CUDA_SUCCESS) {
        halide_error(user_context, "CUDA: Failed to get device\n");
        return status;
    }

    debug(user_context) <<  "    Got device " << dev << "\n";

    // Dump device attributes
    #ifdef DEBUG_RUNTIME
    {
        char name[256];
        name[0] = 0;
        err = cuDeviceGetName(name, 256, dev);
        debug(user_context) << "      " << name << "\n";

        if (err != CUDA_SUCCESS) {
            error(user_context) << "CUDA: cuDeviceGetName failed: "
                                << get_error_name(err);
            return err;
        }

        size_t memory = 0;
        err = cuDeviceTotalMem(&memory, dev);
        debug(user_context) << "      total memory: " << (int)(memory >> 20) << " MB\n";

        if (err != CUDA_SUCCESS) {
            error(user_context) << "CUDA: cuDeviceTotalMem failed: "
                                << get_error_name(err);
            return err;
        }

        // Declare variables for other state we want to query.
        int max_threads_per_block = 0, warp_size = 0, num_cores = 0;
        int max_block_size[] = {0, 0, 0};
        int max_grid_size[] = {0, 0, 0};
        int max_shared_mem = 0, max_constant_mem = 0;
        int cc_major = 0, cc_minor = 0;

        struct {int *dst; CUdevice_attribute attr;} attrs[] = {
            {&max_threads_per_block, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK},
            {&warp_size,             CU_DEVICE_ATTRIBUTE_WARP_SIZE},
            {&num_cores,             CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT},
            {&max_block_size[0],     CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X},
            {&max_block_size[1],     CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y},
            {&max_block_size[2],     CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z},
            {&max_grid_size[0],      CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X},
            {&max_grid_size[1],      CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y},
            {&max_grid_size[2],      CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z},
            {&max_shared_mem,        CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK},
            {&max_constant_mem,      CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY},
            {&cc_major,              CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR},
            {&cc_minor,              CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR},
            {NULL,                   CU_DEVICE_ATTRIBUTE_MAX}};

        // Do all the queries.
        for (int i = 0; attrs[i].dst; i++) {
            err = cuDeviceGetAttribute(attrs[i].dst, attrs[i].attr, dev);
            if (err != CUDA_SUCCESS) {
                error(user_context)
                    << "CUDA: cuDeviceGetAttribute failed ("
                    << get_error_name(err)
                    << ") for attribute " << (int)attrs[i].attr;
                return err;
            }
        }

        // threads per core is a function of the compute capability
        int threads_per_core = (cc_major == 1 ? 8 :
                                cc_major == 2 ? (cc_minor == 0 ? 32 : 48) :
                                cc_major == 3 ? 192 :
                                cc_major == 5 ? 128 : 0);

        debug(user_context)
            << "      max threads per block: " << max_threads_per_block << "\n"
            << "      warp size: " << warp_size << "\n"
            << "      max block size: " << max_block_size[0]
            << " " << max_block_size[1] << " " << max_block_size[2] << "\n"
            << "      max grid size: " << max_grid_size[0]
            << " " << max_grid_size[1] << " " << max_grid_size[2] << "\n"
            << "      max shared memory per block: " << max_shared_mem << "\n"
            << "      max constant memory per block: " << max_constant_mem << "\n"
            << "      compute capability " << cc_major << "." << cc_minor << "\n"
            << "      cuda cores: " << num_cores << " x " << threads_per_core << " = " << threads_per_core << "\n";
    }
    #endif

    // Create context
    debug(user_context) <<  "    cuCtxCreate " << dev << " -> ";
    err = cuCtxCreate(ctx, 0, dev);
    if (err != CUDA_SUCCESS) {
        debug(user_context) << get_error_name(err) << "\n";
        error(user_context) << "CUDA: cuCtxCreate failed: "
                            << get_error_name(err);
        return err;
    } else {
        unsigned int version = 0;
        cuCtxGetApiVersion(*ctx, &version);
        debug(user_context) << *ctx << "(" << version << ")\n";
    }
    // Creation automatically pushes the context, but we'll pop to allow the caller
    // to decide when to push.
    err = cuCtxPopCurrent(&context);
    if (err != CUDA_SUCCESS) {
      error(user_context) << "CUDA: cuCtxPopCurrent failed: "
                          << get_error_name(err);
      return err;
    }

    return CUDA_SUCCESS;
}


WEAK bool validate_device_pointer(void *user_context, buffer_t* buf, size_t size=0) {
// The technique using cuPointerGetAttribute and CU_POINTER_ATTRIBUTE_CONTEXT
// requires unified virtual addressing is enabled and that is not the case
// for 32-bit processes on Mac OS X. So for now, as a total hack, just return true
// in 32-bit. This could of course be wrong the other way for cards that only
// support 32-bit addressing in 64-bit processes, but I expect those cards do not
// support unified addressing at all.
// TODO: figure out a way to validate pointers in all cases if strictly necessary.
#ifdef BITS_32
    return true;
#else
    if (buf->dev == 0)
        return true;

    CUdeviceptr dev_ptr = (CUdeviceptr)halide_get_device_handle(buf->dev);

    CUcontext ctx;
    CUresult result = cuPointerGetAttribute(&ctx, CU_POINTER_ATTRIBUTE_CONTEXT, dev_ptr);
    if (result) {
        error(user_context) << "Bad device pointer " << (void *)dev_ptr
                            << ": cuPointerGetAttribute returned "
                            << get_error_name(result);
        return false;
    }
    return true;
#endif
}

}}}} // namespace Halide::Runtime::Internal

extern "C" {
WEAK int halide_cuda_initialize_kernels(void *user_context, void **state_ptr, const char* ptx_src, int size) {
    debug(user_context) << "CUDA: halide_cuda_initialize_kernels (user_context: " << user_context
                        << ", state_ptr: " << state_ptr
                        << ", ptx_src: " << (void *)ptx_src
                        << ", size: " << size << "\n";

    Context ctx(user_context);
    if (ctx.error != 0) {
        return ctx.error;
    }

    #ifdef DEBUG_RUNTIME
    uint64_t t_before = halide_current_time_ns(user_context);
    #endif

    // Create the state object if necessary. This only happens once, regardless
    // of how many times halide_initialize_kernels/halide_release is called.
    // halide_release traverses this list and releases the module objects, but
    // it does not modify the list nodes created/inserted here.
    module_state **state = (module_state**)state_ptr;
    if (!(*state)) {
        *state = (module_state*)malloc(sizeof(module_state));
        (*state)->module = NULL;
        (*state)->next = state_list;
        state_list = *state;
    }

    // Create the module itself if necessary.
    if (!(*state)->module) {
        debug(user_context) <<  "    cuModuleLoadData " << (void *)ptx_src << ", " << size << " -> ";

        CUjit_option options[] = { CU_JIT_MAX_REGISTERS };
        unsigned int max_regs_per_thread = 64;

        // A hack to enable control over max register count for
        // testing. This should be surfaced in the schedule somehow
        // instead.
        char *regs = getenv("HL_CUDA_JIT_MAX_REGISTERS");
        if (regs) {
            max_regs_per_thread = atoi(regs);
        }
        void *optionValues[] = { (void*)(uintptr_t) max_regs_per_thread };
        CUresult err = cuModuleLoadDataEx(&(*state)->module, ptx_src, 1, options, optionValues);

        if (err != CUDA_SUCCESS) {
            debug(user_context) << get_error_name(err) << "\n";
            error(user_context) << "CUDA: cuModuleLoadData failed: "
                                << get_error_name(err);
            return err;
        } else {
            debug(user_context) << (void *)((*state)->module) << "\n";
        }
    }

    #ifdef DEBUG_RUNTIME
    uint64_t t_after = halide_current_time_ns(user_context);
    debug(user_context) << "    Time: " << (t_after - t_before) / 1.0e6 << " ms\n";
    #endif

    return 0;
}

WEAK int halide_cuda_device_free(void *user_context, buffer_t* buf) {
    // halide_device_free, at present, can be exposed to clients and they
    // should be allowed to call halide_device_free on any buffer_t
    // including ones that have never been used with a GPU.
    if (buf->dev == 0) {
        return 0;
    }

    CUdeviceptr dev_ptr = (CUdeviceptr)halide_get_device_handle(buf->dev);

    debug(user_context)
        <<  "CUDA: halide_cuda_device_free (user_context: " << user_context
        << ", buf: " << buf << ")\n";

    Context ctx(user_context);
    if (ctx.error != CUDA_SUCCESS)
        return ctx.error;

    #ifdef DEBUG_RUNTIME
    uint64_t t_before = halide_current_time_ns(user_context);
    #endif

    halide_assert(user_context, validate_device_pointer(user_context, buf));

    debug(user_context) <<  "    cuMemFree " << (void *)(dev_ptr) << "\n";
    CUresult err = cuMemFree(dev_ptr);
    // If cuMemFree fails, it isn't likely to succeed later, so just drop
    // the reference.
    halide_delete_device_wrapper(buf->dev);
    buf->dev = 0;
    if (err != CUDA_SUCCESS) {
        // We may be called as a destructor, so don't raise an error here.
        return err;
    }

    #ifdef DEBUG_RUNTIME
    uint64_t t_after = halide_current_time_ns(user_context);
    debug(user_context) << "    Time: " << (t_after - t_before) / 1.0e6 << " ms\n";
    #endif

    return 0;
}

WEAK int halide_cuda_device_release(void *user_context) {
    debug(user_context)
        << "CUDA: halide_cuda_device_release (user_context: " <<  user_context << ")\n";

    int err;
    CUcontext ctx;
    err = halide_cuda_acquire_context(user_context, &ctx, false);
    if (err != CUDA_SUCCESS) {
        return err;
    }

    if (ctx) {
        // It's possible that this is being called from the destructor of
        // a static variable, in which case the driver may already be
        // shutting down.
        err = cuCtxPushCurrent(ctx);
        if (err != CUDA_SUCCESS) {
            err = cuCtxSynchronize();
        }
        halide_assert(user_context, err == CUDA_SUCCESS || err == CUDA_ERROR_DEINITIALIZED);

        // Unload the modules attached to this context. Note that the list
        // nodes themselves are not freed, only the module objects are
        // released. Subsequent calls to halide_init_kernels might re-create
        // the program object using the same list node to store the module
        // object.
        module_state *state = state_list;
        while (state) {
            if (state->module) {
                debug(user_context) << "    cuModuleUnload " << state->module << "\n";
                err = cuModuleUnload(state->module);
                halide_assert(user_context, err == CUDA_SUCCESS || err == CUDA_ERROR_DEINITIALIZED);
                state->module = 0;
            }
            state = state->next;
        }

        CUcontext old_ctx;
        cuCtxPopCurrent(&old_ctx);

        // Only destroy the context if we own it
        if (ctx == context) {
            debug(user_context) << "    cuCtxDestroy " << context << "\n";
            err = cuProfilerStop();
            err = cuCtxDestroy(context);
            halide_assert(user_context, err == CUDA_SUCCESS || err == CUDA_ERROR_DEINITIALIZED);
            context = NULL;
        }
    }

    halide_cuda_release_context(user_context);

    return 0;
}

WEAK int halide_cuda_device_malloc(void *user_context, buffer_t *buf) {
    debug(user_context)
        << "CUDA: halide_cuda_device_malloc (user_context: " << user_context
        << ", buf: " << buf << ")\n";

    Context ctx(user_context);
    if (ctx.error != CUDA_SUCCESS) {
        return ctx.error;
    }

    size_t size = buf_size(buf);
    halide_assert(user_context, size != 0);
    if (buf->dev) {
        // This buffer already has a device allocation
        halide_assert(user_context, validate_device_pointer(user_context, buf, size));
        return 0;
    }

    halide_assert(user_context, buf->stride[0] >= 0 && buf->stride[1] >= 0 &&
                                buf->stride[2] >= 0 && buf->stride[3] >= 0);

    debug(user_context) << "    allocating buffer of " << (uint64_t)size << " bytes, "
                        << "extents: "
                        << buf->extent[0] << "x"
                        << buf->extent[1] << "x"
                        << buf->extent[2] << "x"
                        << buf->extent[3] << " "
                        << "strides: "
                        << buf->stride[0] << "x"
                        << buf->stride[1] << "x"
                        << buf->stride[2] << "x"
                        << buf->stride[3] << " "
                        << "(" << buf->elem_size << " bytes per element)\n";

    #ifdef DEBUG_RUNTIME
    uint64_t t_before = halide_current_time_ns(user_context);
    #endif

    CUdeviceptr p;
    debug(user_context) << "    cuMemAlloc " << (uint64_t)size << " -> ";
    CUresult err = cuMemAlloc(&p, size);
    if (err != CUDA_SUCCESS) {
        debug(user_context) << get_error_name(err) << "\n";
        error(user_context) << "CUDA: cuMemAlloc failed: "
                            << get_error_name(err);
        return err;
    } else {
        debug(user_context) << (void *)p << "\n";
    }
    halide_assert(user_context, p);
    buf->dev = halide_new_device_wrapper((uint64_t)p, &cuda_device_interface);
    if (buf->dev == 0) {
        error(user_context) << "CUDA: out of memory allocating device wrapper.\n";
        cuMemFree(p);
        return -1;
    }

    #ifdef DEBUG_RUNTIME
    uint64_t t_after = halide_current_time_ns(user_context);
    debug(user_context) << "    Time: " << (t_after - t_before) / 1.0e6 << " ms\n";
    #endif

    return 0;
}

WEAK int halide_cuda_copy_to_device(void *user_context, buffer_t* buf) {
    debug(user_context)
        <<  "CUDA: halide_cuda_copy_to_device (user_context: " << user_context
        << ", buf: " << buf << ")\n";

    Context ctx(user_context);
    if (ctx.error != CUDA_SUCCESS) {
        return ctx.error;
    }

    #ifdef DEBUG_RUNTIME
    uint64_t t_before = halide_current_time_ns(user_context);
    #endif

    halide_assert(user_context, buf->host && buf->dev);
    halide_assert(user_context, validate_device_pointer(user_context, buf));

    device_copy c = make_host_to_device_copy(buf);

    // TODO: Is this 32-bit or 64-bit? Leaving signed for now
    // in case negative strides.
    for (int w = 0; w < (int)c.extent[3]; w++) {
        for (int z = 0; z < (int)c.extent[2]; z++) {
            for (int y = 0; y < (int)c.extent[1]; y++) {
                for (int x = 0; x < (int)c.extent[0]; x++) {
                    uint64_t off = (x * c.stride_bytes[0] +
                                    y * c.stride_bytes[1] +
                                    z * c.stride_bytes[2] +
                                    w * c.stride_bytes[3]);
                    void *src = (void *)(c.src + off);
                    CUdeviceptr dst = (CUdeviceptr)(c.dst + off);
                    uint64_t size = c.chunk_size;
                    debug(user_context) << "    cuMemcpyHtoD "
                                        << "(" << x << ", " << y << ", " << z << ", " << w << "), "
                                        << src << " -> " << (void *)dst << ", " << size << " bytes\n";
                    CUresult err = cuMemcpyHtoD(dst, src, size);
                    if (err != CUDA_SUCCESS) {
                        error(user_context) << "CUDA: cuMemcpyHtoD failed: "
                                            << get_error_name(err);
                        return err;
                    }
                }
            }
        }
    }


    #ifdef DEBUG_RUNTIME
    uint64_t t_after = halide_current_time_ns(user_context);
    debug(user_context) << "    Time: " << (t_after - t_before) / 1.0e6 << " ms\n";
    #endif

    return 0;
}

WEAK int halide_cuda_copy_to_host(void *user_context, buffer_t* buf) {
    debug(user_context)
        << "CUDA: halide_cuda_copy_to_host (user_context: " << user_context
        << ", buf: " << buf << ")\n";

    Context ctx(user_context);
    if (ctx.error != CUDA_SUCCESS) {
        return ctx.error;
    }

    #ifdef DEBUG_RUNTIME
    uint64_t t_before = halide_current_time_ns(user_context);
    #endif

    halide_assert(user_context, buf->dev && buf->dev);
    halide_assert(user_context, validate_device_pointer(user_context, buf));

    device_copy c = make_device_to_host_copy(buf);

    // TODO: Is this 32-bit or 64-bit? Leaving signed for now
    // in case negative strides.
    for (int w = 0; w < (int)c.extent[3]; w++) {
        for (int z = 0; z < (int)c.extent[2]; z++) {
            for (int y = 0; y < (int)c.extent[1]; y++) {
                for (int x = 0; x < (int)c.extent[0]; x++) {
                    uint64_t off = (x * c.stride_bytes[0] +
                                    y * c.stride_bytes[1] +
                                    z * c.stride_bytes[2] +
                                    w * c.stride_bytes[3]);
                    CUdeviceptr src = (CUdeviceptr)(c.src + off);
                    void *dst = (void *)(c.dst + off);
                    uint64_t size = c.chunk_size;

                    debug(user_context) << "    cuMemcpyDtoH "
                                        << "(" << x << ", " << y << ", " << z << ", " << w << "), "
                                        << (void *)src << " -> " << dst << ", " << size << " bytes\n";

                    CUresult err = cuMemcpyDtoH(dst, src, size);
                    if (err != CUDA_SUCCESS) {
                        error(user_context) << "CUDA: cuMemcpyDtoH failed: "
                                            << get_error_name(err);
                        return err;
                    }
                }
            }
        }
    }

    #ifdef DEBUG_RUNTIME
    uint64_t t_after = halide_current_time_ns(user_context);
    debug(user_context) << "    Time: " << (t_after - t_before) / 1.0e6 << " ms\n";
    #endif

    return 0;
}

// Used to generate correct timings when tracing
WEAK int halide_cuda_device_sync(void *user_context, struct buffer_t *) {
    debug(user_context)
        << "CUDA: halide_cuda_device_sync (user_context: " << user_context << ")\n";

    Context ctx(user_context);
    if (ctx.error != CUDA_SUCCESS) {
        return ctx.error;
    }

    #ifdef DEBUG_RUNTIME
    uint64_t t_before = halide_current_time_ns(user_context);
    #endif

    CUresult err = cuCtxSynchronize();
    if (err != CUDA_SUCCESS) {
        error(user_context) << "CUDA: cuCtxSynchronize failed: "
                            << get_error_name(err);
        return err;
    }

    #ifdef DEBUG_RUNTIME
    uint64_t t_after = halide_current_time_ns(user_context);
    debug(user_context) << "    Time: " << (t_after - t_before) / 1.0e6 << " ms\n";
    #endif

    return 0;
}

WEAK int halide_cuda_run(void *user_context,
                         void *state_ptr,
                         const char* entry_name,
                         int blocksX, int blocksY, int blocksZ,
                         int threadsX, int threadsY, int threadsZ,
                         int shared_mem_bytes,
                         size_t arg_sizes[],
                         void* args[],
                         int8_t arg_is_buffer[],
                         int num_attributes,
                         float* vertex_buffer,
                         int num_coords_dim0,
                         int num_coords_dim1) {

    debug(user_context) << "CUDA: halide_cuda_run ("
                        << "user_context: " << user_context << ", "
                        << "entry: " << entry_name << ", "
                        << "blocks: " << blocksX << "x" << blocksY << "x" << blocksZ << ", "
                        << "threads: " << threadsX << "x" << threadsY << "x" << threadsZ << ", "
                        << "shmem: " << shared_mem_bytes << "\n";

    CUresult err;
    Context ctx(user_context);
    if (ctx.error != CUDA_SUCCESS) {
        return ctx.error;
    }

    debug(user_context) << "Got context.\n";

    #ifdef DEBUG_RUNTIME
    uint64_t t_before = halide_current_time_ns(user_context);
    #endif

    halide_assert(user_context, state_ptr);
    CUmodule mod = ((module_state*)state_ptr)->module;
    debug(user_context) << "Got module " << mod << "\n";
    halide_assert(user_context, mod);
    CUfunction f;
    err = cuModuleGetFunction(&f, mod, entry_name);
    debug(user_context) << "Got function " << f << "\n";
    if (err != CUDA_SUCCESS) {
        error(user_context) << "CUDA: cuModuleGetFunction failed: "
                            << get_error_name(err);
        return err;
    }

    size_t num_args = 0;
    while (arg_sizes[num_args] != 0) {
        debug(user_context) << "    halide_cuda_run " << (int)num_args
                            << " " << (int)arg_sizes[num_args]
                            << " [" << (*((void **)args[num_args])) << " ...] "
                            << arg_is_buffer[num_args] << "\n";
        num_args++;
    }

    // We need storage for both the arg and the pointer to it if if
    // has to be translated.
    void** translated_args = (void **)malloc((num_args + 1) * sizeof(void *));
    uint64_t *dev_handles = (uint64_t *)malloc(num_args * sizeof(uint64_t));
    for (size_t i = 0; i <= num_args; i++) { // Get NULL at end.
        if (arg_is_buffer[i]) {
            halide_assert(user_context, arg_sizes[i] == sizeof(uint64_t));
            dev_handles[i] = halide_get_device_handle(*(uint64_t *)args[i]);
            translated_args[i] = &(dev_handles[i]);
            debug(user_context) << "    halide_cuda_run translated arg" << (int)i
                                << " [" << (*((void **)translated_args[i])) << " ...]\n";
        } else {
            translated_args[i] = args[i];
        }
    }

    err = cuLaunchKernel(f,
                         blocksX,  blocksY,  blocksZ,
                         threadsX, threadsY, threadsZ,
                         shared_mem_bytes,
                         NULL, // stream
                         translated_args,
                         NULL);
    free(dev_handles);
    free(translated_args);
    if (err != CUDA_SUCCESS) {
        error(user_context) << "CUDA: cuLaunchKernel failed: "
                            << get_error_name(err);
        return err;
    }

    #ifdef DEBUG_RUNTIME
    err = cuCtxSynchronize();
    if (err != CUDA_SUCCESS) {
        error(user_context) << "CUDA: cuCtxSynchronize failed: "
                            << get_error_name(err);
        return err;
    }
    uint64_t t_after = halide_current_time_ns(user_context);
    debug(user_context) << "    Time: " << (t_after - t_before) / 1.0e6 << " ms\n";
    #endif
    return 0;
}

WEAK int halide_cuda_device_and_host_malloc(void *user_context, struct buffer_t *buf) {
    return halide_default_device_and_host_malloc(user_context, buf, &cuda_device_interface);
}

WEAK int halide_cuda_device_and_host_free(void *user_context, struct buffer_t *buf) {
    return halide_default_device_and_host_free(user_context, buf, &cuda_device_interface);
}

WEAK int halide_cuda_wrap_device_ptr(void *user_context, struct buffer_t *buf, uintptr_t device_ptr) {
    halide_assert(user_context, buf->dev == 0);
    if (buf->dev != 0) {
        return -2;
    }
    buf->dev = halide_new_device_wrapper(device_ptr, &cuda_device_interface);
    if (buf->dev == 0) {
        return -1;
    }
#if DEBUG_RUNTIME
    if (!validate_device_pointer(user_context, buf)) {
        halide_delete_device_wrapper(buf->dev);
        buf->dev = 0;
        return -3;
    }
#endif
    return 0;
}

WEAK uintptr_t halide_cuda_detach_device_ptr(void *user_context, struct buffer_t *buf) {
    if (buf->dev == NULL) {
        return 0;
    }
    halide_assert(user_context, halide_get_device_interface(buf->dev) == &cuda_device_interface);
    uint64_t dev_ptr = halide_get_device_handle(buf->dev);
    halide_delete_device_wrapper(buf->dev);
    buf->dev = 0;
    return (uintptr_t)dev_ptr;
}

WEAK uintptr_t halide_cuda_get_device_ptr(void *user_context, struct buffer_t *buf) {
    if (buf->dev == NULL) {
        return 0;
    }
    halide_assert(user_context, halide_get_device_interface(buf->dev) == &cuda_device_interface);
    uint64_t dev_ptr = halide_get_device_handle(buf->dev);
    return (uintptr_t)dev_ptr;
}

WEAK const halide_device_interface_t *halide_cuda_device_interface() {
    return &cuda_device_interface;
}

namespace {
__attribute__((destructor))
WEAK void halide_cuda_cleanup() {
    halide_cuda_device_release(NULL);
}
}

} // extern "C" linkage

namespace Halide { namespace Runtime { namespace Internal { namespace Cuda {

WEAK const char *get_error_name(CUresult error) {
    switch(error) {
    case CUDA_SUCCESS: return "CUDA_SUCCESS";
    case CUDA_ERROR_INVALID_VALUE: return "CUDA_ERROR_INVALID_VALUE";
    case CUDA_ERROR_OUT_OF_MEMORY: return "CUDA_ERROR_OUT_OF_MEMORY";
    case CUDA_ERROR_NOT_INITIALIZED: return "CUDA_ERROR_NOT_INITIALIZED";
    case CUDA_ERROR_NO_DEVICE: return "CUDA_ERROR_NO_DEVICE";
    case CUDA_ERROR_INVALID_DEVICE: return "CUDA_ERROR_INVALID_DEVICE";
    case CUDA_ERROR_INVALID_IMAGE: return "CUDA_ERROR_INVALID_IMAGE";
    case CUDA_ERROR_INVALID_CONTEXT: return "CUDA_ERROR_INVALID_CONTEXT";
    case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: return "CUDA_ERROR_CONTEXT_ALREADY_CURRENT";
    case CUDA_ERROR_MAP_FAILED: return "CUDA_ERROR_MAP_FAILED";
    case CUDA_ERROR_UNMAP_FAILED: return "CUDA_ERROR_UNMAP_FAILED";
    case CUDA_ERROR_ARRAY_IS_MAPPED: return "CUDA_ERROR_ARRAY_IS_MAPPED";
    case CUDA_ERROR_ALREADY_MAPPED: return "CUDA_ERROR_ALREADY_MAPPED";
    case CUDA_ERROR_NO_BINARY_FOR_GPU: return "CUDA_ERROR_NO_BINARY_FOR_GPU";
    case CUDA_ERROR_ALREADY_ACQUIRED: return "CUDA_ERROR_ALREADY_ACQUIRED";
    case CUDA_ERROR_NOT_MAPPED: return "CUDA_ERROR_NOT_MAPPED";
    case CUDA_ERROR_INVALID_SOURCE: return "CUDA_ERROR_INVALID_SOURCE";
    case CUDA_ERROR_FILE_NOT_FOUND: return "CUDA_ERROR_FILE_NOT_FOUND";
    case CUDA_ERROR_INVALID_HANDLE: return "CUDA_ERROR_INVALID_HANDLE";
    case CUDA_ERROR_NOT_FOUND: return "CUDA_ERROR_NOT_FOUND";
    case CUDA_ERROR_NOT_READY: return "CUDA_ERROR_NOT_READY";
    case CUDA_ERROR_LAUNCH_FAILED: return "CUDA_ERROR_LAUNCH_FAILED";
    case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES";
    case CUDA_ERROR_LAUNCH_TIMEOUT: return "CUDA_ERROR_LAUNCH_TIMEOUT";
    case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING";
    case CUDA_ERROR_UNKNOWN: return "CUDA_ERROR_UNKNOWN";
    // A trap instruction produces the below error, which is how we codegen asserts on GPU
    case CUDA_ERROR_ILLEGAL_INSTRUCTION:
        return "Illegal instruction or Halide assertion failure inside kernel";
    default: return "<Unknown error>";
    }
}

WEAK halide_device_interface_t cuda_device_interface = {
    halide_use_jit_module,
    halide_release_jit_module,
    halide_cuda_device_malloc,
    halide_cuda_device_free,
    halide_cuda_device_sync,
    halide_cuda_device_release,
    halide_cuda_copy_to_host,
    halide_cuda_copy_to_device,
    halide_cuda_device_and_host_malloc,
    halide_cuda_device_and_host_free,
};

}}}} // namespace Halide::Runtime::Internal::Cuda