Revision 9a94756d01d9071ff1610bfc4cb170bd47f701a8 authored by Alexander Root on 21 July 2022, 15:01:16 UTC, committed by GitHub on 21 July 2022, 15:01:16 UTC
* use pmaddubsw 8-bit horizontal widening adds * add SSE3 versions too * add pmaddubsw tests
1 parent 967c3bf
LowerWarpShuffles.h
#ifndef HALIDE_LOWER_WARP_SHUFFLES_H
#define HALIDE_LOWER_WARP_SHUFFLES_H
/** \file
* Defines the lowering pass that injects CUDA warp shuffle
* instructions to access storage outside of a GPULane loop.
*/
#include "Expr.h"
namespace Halide {
namespace Internal {
/** Rewrite accesses to things stored outside the loop over GPU lanes
 * so that they use NVIDIA's warp shuffle instructions.
 *
 * \param s The statement to rewrite.
 * \param t The compilation target. NOTE(review): how the pass uses the
 * target is not visible from this header; check the definition.
 * \return The rewritten statement (presumably \p s with the affected
 * accesses replaced; confirm against the implementation).
 */
Stmt lower_warp_shuffles(Stmt s, const Target &t);
} // namespace Internal
} // namespace Halide
#endif
![swh spinner](/static/img/swh-spinner.gif)
Computing file changes ...