Revision 8b9c0814b421281f85078830cb47e83fc4143881 authored by Alex Reinking on 02 September 2022, 01:55:43 UTC, committed by GitHub on 02 September 2022, 01:55:43 UTC
1. TargetExportScript was running into an Xcode bug with its handling of
   linker flags. Now using XCODE_ATTRIBUTE_EXPORTED_SYMBOLS_LIST as a
   workaround.
2. Added a missing dependency in Python module definition code.

Fixes #6987
1 parent ce2e7f3
Raw File
FuseGPUThreadLoops.h
#ifndef HALIDE_FUSE_GPU_THREAD_LOOPS_H
#define HALIDE_FUSE_GPU_THREAD_LOOPS_H

/** \file
 * Defines the lowering pass that fuses and normalizes loops over gpu
 * threads to target CUDA, OpenCL, and Metal.
 */

#include "Expr.h"

namespace Halide {
namespace Internal {

/** Rewrite all GPU loops to have a min of zero. */
Stmt zero_gpu_loop_mins(const Stmt &s);

/** Converts Halide's GPGPU IR to the OpenCL/CUDA/Metal model. Within
 * every loop over gpu block indices, fuse the inner loops over thread
 * indices into a single loop (with predication to turn off
 * threads). Push if conditions between GPU blocks to the innermost GPU threads.
 * Also injects synchronization points as needed, and hoists
 * shared allocations at the block level out into a single shared
 * memory array, and heap allocations into a slice of a global pool
 * allocated outside the kernel. */
Stmt fuse_gpu_thread_loops(Stmt s);

}  // namespace Internal
}  // namespace Halide

#endif
back to top