Revision 8b9c0814b421281f85078830cb47e83fc4143881 authored by Alex Reinking on 02 September 2022, 01:55:43 UTC, committed by GitHub on 02 September 2022, 01:55:43 UTC
1. TargetExportScript was running into an Xcode bug with its handling of linker flags. Now using XCODE_ATTRIBUTE_EXPORTED_SYMBOLS_LIST as a workaround. 2. Added a missing dependency in Python module definition code. Fixes #6987
1 parent ce2e7f3
FuseGPUThreadLoops.h
#ifndef HALIDE_FUSE_GPU_THREAD_LOOPS_H
#define HALIDE_FUSE_GPU_THREAD_LOOPS_H
/** \file
 * Defines the lowering pass that fuses and normalizes loops over GPU
 * threads to target CUDA, OpenCL, and Metal.
 */
#include "Expr.h"
namespace Halide {
namespace Internal {
/** Rewrite all GPU loops to have a min of zero. Takes the statement
 * to rewrite by const reference and returns the rewritten statement. */
Stmt zero_gpu_loop_mins(const Stmt &s);
/** Converts Halide's GPGPU IR to the OpenCL/CUDA/Metal model. Takes
 * the statement to convert and returns the converted statement.
 *
 * Within every loop over GPU block indices, the inner loops over
 * thread indices are fused into a single loop (with predication to
 * turn off threads). If conditions between GPU blocks are pushed to
 * the innermost GPU threads.
 *
 * Also injects synchronization points as needed, and hoists
 * shared allocations at the block level out into a single shared
 * memory array, and heap allocations into a slice of a global pool
 * allocated outside the kernel. */
Stmt fuse_gpu_thread_loops(Stmt s);
} // namespace Internal
} // namespace Halide
#endif
Computing file changes ...