// slang-emit-cuda.cpp
#include "slang-emit-cuda.h"

#include "../core/slang-writer.h"
#include "slang-emit-source-writer.h"
#include "slang-mangled-lexer.h"

#include <assert.h>

namespace Slang
{

void CUDAExtensionTracker::finalize()
{
    if (isBaseTypeRequired(BaseType::Half))
    {
        // The cuda_fp16.hpp header indicates the need is for version 5.3, but when that is
        // tried NVRTC says it cannot load builtins.
        // The lowest version that this does work for is 6.0, so that's what we use here.
        // https://docs.nvidia.com/cuda/nvrtc/index.html#group__options
        requireSMVersion(SemanticVersion(6, 0));
    }
}

UnownedStringSlice CUDASourceEmitter::getBuiltinTypeName(IROp op)
{
    switch (op)
    {
        case kIROp_VoidType:    return UnownedStringSlice("void");
        case kIROp_BoolType:    return UnownedStringSlice("bool");

        case kIROp_Int8Type:    return UnownedStringSlice("char");
        case kIROp_Int16Type:   return UnownedStringSlice("short");
        case kIROp_IntType:     return UnownedStringSlice("int");
        case kIROp_Int64Type:   return UnownedStringSlice("longlong");

        case kIROp_UInt8Type:   return UnownedStringSlice("uchar");
        case kIROp_UInt16Type:  return UnownedStringSlice("ushort");
        case kIROp_UIntType:    return UnownedStringSlice("uint");
        case kIROp_UInt64Type:  return UnownedStringSlice("ulonglong");

#if SLANG_PTR_IS_64
        case kIROp_IntPtrType:  return UnownedStringSlice("int64_t");
        case kIROp_UIntPtrType: return UnownedStringSlice("uint64_t");
#else
        case kIROp_IntPtrType:  return UnownedStringSlice("int");
        case kIROp_UIntPtrType: return UnownedStringSlice("uint");
#endif

        case kIROp_HalfType:
        {
            m_extensionTracker->requireBaseType(BaseType::Half);
            return UnownedStringSlice("__half");
        }

        case kIROp_FloatType:   return UnownedStringSlice("float");
        case kIROp_DoubleType:  return UnownedStringSlice("double");

        default:                return UnownedStringSlice();
    }
}

UnownedStringSlice CUDASourceEmitter::getVectorPrefix(IROp op)
{
    switch (op)
    {
        case kIROp_BoolType:    return UnownedStringSlice("bool");

        case kIROp_Int8Type:    return UnownedStringSlice("char");
        case kIROp_Int16Type:   return UnownedStringSlice("short");
        case kIROp_IntType:     return UnownedStringSlice("int");
        case kIROp_Int64Type:   return UnownedStringSlice("longlong");

        case kIROp_UInt8Type:   return UnownedStringSlice("uchar");
        case kIROp_UInt16Type:  return UnownedStringSlice("ushort");
        case kIROp_UIntType:    return UnownedStringSlice("uint");
        case kIROp_UInt64Type:  return UnownedStringSlice("ulonglong");

        case kIROp_HalfType:
        {
            m_extensionTracker->requireBaseType(BaseType::Half);
            return UnownedStringSlice("__half");
        }

        case kIROp_FloatType:   return UnownedStringSlice("float");
        case kIROp_DoubleType:  return UnownedStringSlice("double");

        default:                return UnownedStringSlice();
    }
}

void CUDASourceEmitter::emitTempModifiers(IRInst* temp)
{
    CPPSourceEmitter::emitTempModifiers(temp);
    // A temporary at module (global) scope needs the __device__ qualifier.
    if (as<IRModuleInst>(temp->getParent()))
    {
        m_writer->emit("__device__ ");
    }
}

SlangResult CUDASourceEmitter::_calcCUDATextureTypeName(IRTextureTypeBase* texType, StringBuilder& outName)
{
    // Not clear how to do this yet
    if (texType->isMultisample())
    {
        return SLANG_FAIL;
    }

    switch (texType->getAccess())
    {
        case SLANG_RESOURCE_ACCESS_READ:
        {
            outName << "CUtexObject";
            return SLANG_OK;
        }
        case SLANG_RESOURCE_ACCESS_READ_WRITE:
        {
            outName << "CUsurfObject";
            return SLANG_OK;
        }
        default:
            break;
    }
    return SLANG_FAIL;
}
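// For illustration (not part of the original source): with the mappings above, a Slang
// `float4` keeps the name `float4` (vector prefix "float" plus the element count), a read-only
// `Texture2D` is named `CUtexObject`, and a read-write `RWTexture2D` is named `CUsurfObject`
// in the emitted CUDA; multisample textures are rejected.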
SlangResult CUDASourceEmitter::calcTypeName(IRType* type, CodeGenTarget target, StringBuilder& out)
{
    SLANG_UNUSED(target);

    // The names CUDA produces are all compatible with 'C' (ie they aren't templated types)
    SLANG_ASSERT(target == CodeGenTarget::CUDASource || target == CodeGenTarget::CSource);

    switch (type->getOp())
    {
        case kIROp_VectorType:
        {
            auto vecType = static_cast<IRVectorType*>(type);
            auto vecCount = int(getIntVal(vecType->getElementCount()));
            const IROp elemType = vecType->getElementType()->getOp();

            UnownedStringSlice prefix = getVectorPrefix(elemType);
            if (prefix.getLength() <= 0)
            {
                return SLANG_FAIL;
            }
            out << prefix << vecCount;
            return SLANG_OK;
        }
        case kIROp_TensorViewType:
        {
            out << "TensorView";
            return SLANG_OK;
        }
        default:
        {
            if (isNominalOp(type->getOp()))
            {
                out << getName(type);
                return SLANG_OK;
            }

            if (IRBasicType::isaImpl(type->getOp()))
            {
                out << getBuiltinTypeName(type->getOp());
                return SLANG_OK;
            }

            if (auto texType = as<IRTextureTypeBase>(type))
            {
                // We don't support TextureSampler, so ignore that
                if (texType->getOp() != kIROp_TextureSamplerType)
                {
                    return _calcCUDATextureTypeName(texType, out);
                }
            }

            switch (type->getOp())
            {
                case kIROp_SamplerStateType:           out << "SamplerState"; return SLANG_OK;
                case kIROp_SamplerComparisonStateType: out << "SamplerComparisonState"; return SLANG_OK;
                default: break;
            }
            break;
        }
    }

    if (auto untypedBufferType = as<IRUntypedBufferResourceType>(type))
    {
        switch (untypedBufferType->getOp())
        {
            case kIROp_RaytracingAccelerationStructureType:
            {
                m_writer->emit("OptixTraversableHandle");
                return SLANG_OK;
            }
            default:
                break;
        }
    }

    return Super::calcTypeName(type, target, out);
}

void CUDASourceEmitter::emitLayoutSemanticsImpl(IRInst* inst, char const* uniformSemanticSpelling)
{
    Super::emitLayoutSemanticsImpl(inst, uniformSemanticSpelling);
}

void CUDASourceEmitter::emitParameterGroupImpl(IRGlobalParam* varDecl, IRUniformParameterGroupType* type)
{
    auto elementType = type->getElementType();

    m_writer->emit("extern \"C\" __constant__ ");
    emitType(elementType, "SLANG_globalParams");
    m_writer->emit(";\n");

    m_writer->emit("#define ");
    m_writer->emit(getName(varDecl));
    m_writer->emit(" (&SLANG_globalParams)\n");
}

void CUDASourceEmitter::emitEntryPointAttributesImpl(IRFunc* irFunc, IREntryPointDecoration* entryPointDecor)
{
    SLANG_UNUSED(irFunc);
    SLANG_UNUSED(entryPointDecor);
}

void CUDASourceEmitter::emitFunctionPreambleImpl(IRInst* inst)
{
    if (!inst)
        return;

    // NOTE: the decoration types checked below were lost from the copy this file was recovered
    // from; they are assumed from the qualifier emitted in each branch.
    if (inst->findDecoration<IREntryPointDecoration>())
    {
        m_writer->emit("extern \"C\" __global__ ");
        return;
    }
    if (inst->findDecoration<IRCudaKernelDecoration>())
    {
        m_writer->emit("__global__ ");
    }
    else if (inst->findDecoration<IRCudaHostDecoration>())
    {
        m_writer->emit("__host__ ");
    }
    else
    {
        m_writer->emit("__device__ ");
    }
}
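// For illustration (hypothetical names, not from the original source): for a global parameter
// block whose element type is emitted as `GlobalParams_0` and whose IR name is `globalParams_0`,
// emitParameterGroupImpl above produces roughly:
//
//     extern "C" __constant__ GlobalParams_0 SLANG_globalParams;
//     #define globalParams_0 (&SLANG_globalParams)
//
// while emitFunctionPreambleImpl prefixes an entry-point definition with `extern "C" __global__`
// and ordinary functions with `__device__`.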
String CUDASourceEmitter::generateEntryPointNameImpl(IREntryPointDecoration* entryPointDecor)
{
    // We have an entry-point function in the IR module, which we
    // will want to emit as a `__global__` function in the generated
    // CUDA C++.
    //
    // The most common case will be a compute kernel, in which case
    // we will emit the function more or less as-is, including
    // using its original name as the name of the global symbol.
    //
    String funcName = Super::generateEntryPointNameImpl(entryPointDecor);
    String globalSymbolName = funcName;

    // We also support emitting ray tracing kernels for use with
    // OptiX, and in that case the name of the global symbol
    // must be prefixed to indicate to the OptiX runtime what
    // stage it is to be compiled for.
    //
    auto stage = entryPointDecor->getProfile().getStage();
    switch (stage)
    {
    default:
        break;

#define CASE(STAGE, PREFIX) \
    case Stage::STAGE:      \
        globalSymbolName = #PREFIX + funcName; \
        break

        // OptiX 7 Guide, Section 6.1 (Program input)
        //
        // > The input PTX should include one or more NVIDIA OptiX programs.
        // > The type of program affects how the program can be used during
        // > the execution of the pipeline. These program types are specified
        // > by prefixing the program name with the following:
        //
        // > Program type          Function name prefix
        CASE(RayGeneration,       __raygen__);
        CASE(Intersection,        __intersection__);
        CASE(AnyHit,              __anyhit__);
        CASE(ClosestHit,          __closesthit__);
        CASE(Miss,                __miss__);
        CASE(Callable,            __direct_callable__);
        //
        // There are two stages (or "program types") supported by OptiX
        // that Slang currently cannot target:
        //
        // CASE(ContinuationCallable, __continuation_callable__);
        // CASE(Exception,            __exception__);
        //

#undef CASE
    }

    return globalSymbolName;
}
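// For illustration (hypothetical entry-point name): a closest-hit entry point named `chMain`
// is emitted under the global symbol `__closesthit__chMain`, which is how the OptiX runtime
// identifies the program type; a compute entry point simply keeps its original name.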
void CUDASourceEmitter::emitGlobalRTTISymbolPrefix()
{
    m_writer->emit("__constant__ ");
}

void CUDASourceEmitter::emitLoopControlDecorationImpl(IRLoopControlDecoration* decl)
{
    if (decl->getMode() == kIRLoopControl_Unroll)
    {
        m_writer->emit("#pragma unroll\n");
    }
}

void CUDASourceEmitter::_emitInitializerListValue(IRType* dstType, IRInst* value)
{
    // When constructing a matrix or vector from a single value this is handled by the default path

    switch (value->getOp())
    {
        case kIROp_MakeVector:
        case kIROp_MakeMatrix:
        {
            IRType* type = value->getDataType();

            // If the types are the same, we can just break the value down and emit its operands directly
            if (dstType == type)
            {
                if (auto vecType = as<IRVectorType>(type))
                {
                    if (UInt(getIntVal(vecType->getElementCount())) == value->getOperandCount())
                    {
                        emitType(type);
                        _emitInitializerList(vecType->getElementType(), value->getOperands(), value->getOperandCount());
                        return;
                    }
                }
                else if (auto matType = as<IRMatrixType>(type))
                {
                    const Index colCount = Index(getIntVal(matType->getColumnCount()));
                    const Index rowCount = Index(getIntVal(matType->getRowCount()));

                    // TODO(JS): If num cols = 1, then it *doesn't* actually return a vector.
                    // That could be argued is an error because we want swizzling or [] to work.
                    IRBuilder builder(matType->getModule());
                    builder.setInsertBefore(matType);

                    const Index operandCount = Index(value->getOperandCount());

                    // Can initialize with vectors.
                    // For now special case if the rowVectorType is not actually a vector (when elementSize == 1)
                    if (operandCount == rowCount)
                    {
                        // Emit the braces for the Matrix struct, and then each row vector on its own line.
                        emitType(matType);
                        m_writer->emit("{\n");
                        m_writer->indent();
                        for (Index i = 0; i < rowCount; ++i)
                        {
                            if (i != 0)
                                m_writer->emit(",\n");
                            emitType(matType->getElementType());
                            m_writer->emit(colCount);
                            _emitInitializerList(matType->getElementType(), value->getOperand(i)->getOperands(), colCount);
                        }
                        m_writer->dedent();
                        m_writer->emit("\n}");
                        return;
                    }
                    else if (operandCount == rowCount * colCount)
                    {
                        // Handle if all elements are explicitly defined
                        IRType* elementType = matType->getElementType();
                        IRUse* operands = value->getOperands();

                        // Emit the braces for the Matrix struct, and the elements of each row on its own line.
                        emitType(matType);
                        m_writer->emit("{\n");
                        m_writer->indent();
                        for (Index i = 0; i < rowCount; ++i)
                        {
                            if (i != 0)
                                m_writer->emit(",\n");
                            _emitInitializerListContent(elementType, operands, colCount);
                            operands += colCount;
                        }
                        m_writer->dedent();
                        m_writer->emit("\n}");
                        return;
                    }
                }
            }
            break;
        }
    }

    // All other cases just use the default emitting - this might not work for arrays defined at global scope on CUDA though
    emitOperand(value, getInfo(EmitOp::General));
}

void CUDASourceEmitter::_emitInitializerListContent(IRType* elementType, IRUse* operands, Index operandCount)
{
    for (Index i = 0; i < operandCount; ++i)
    {
        if (i != 0)
            m_writer->emit(", ");
        _emitInitializerListValue(elementType, operands[i].get());
    }
}

void CUDASourceEmitter::_emitInitializerList(IRType* elementType, IRUse* operands, Index operandCount)
{
    m_writer->emit("{\n");
    m_writer->indent();
    _emitInitializerListContent(elementType, operands, operandCount);
    m_writer->dedent();
    m_writer->emit("\n}");
}
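// For illustration (not from the original source): the helpers above emit a brace-initializer
// body with its contents on an indented line, so an array value built from three float
// literals comes out roughly as
//
//     {
//         1.0f, 2.0f, 3.0f
//     }
//
// and a matrix built from row vectors nests one such list per row inside the outer braces.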
void CUDASourceEmitter::emitIntrinsicCallExprImpl(
    IRCall* inst,
    IRTargetIntrinsicDecoration* targetIntrinsic,
    EmitOpInfo const& inOuterPrec)
{
    if (targetIntrinsic->getDefinition().startsWith("__half"))
        m_extensionTracker->requireBaseType(BaseType::Half);

    Super::emitIntrinsicCallExprImpl(inst, targetIntrinsic, inOuterPrec);
}

bool CUDASourceEmitter::tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inOuterPrec)
{
    switch (inst->getOp())
    {
        case kIROp_MakeVector:
        case kIROp_MakeVectorFromScalar:
        {
            m_writer->emit("make_");
            emitType(inst->getDataType());
            m_writer->emit("(");
            bool isFirst = true;
            char xyzwNames[] = "xyzw";
            for (UInt i = 0; i < inst->getOperandCount(); i++)
            {
                auto arg = inst->getOperand(i);
                if (auto vectorType = as<IRVectorType>(arg->getDataType()))
                {
                    // Flatten a vector operand into its individual components.
                    for (int j = 0; j < cast<IRIntLit>(vectorType->getElementCount())->getValue(); j++)
                    {
                        if (isFirst)
                            isFirst = false;
                        else
                            m_writer->emit(", ");
                        auto outerPrec = getInfo(EmitOp::General);
                        auto prec = getInfo(EmitOp::Postfix);
                        emitOperand(arg, leftSide(outerPrec, prec));
                        m_writer->emit(".");
                        m_writer->emitChar(xyzwNames[j]);
                    }
                }
                else
                {
                    if (isFirst)
                        isFirst = false;
                    else
                        m_writer->emit(", ");
                    emitOperand(arg, getInfo(EmitOp::General));
                }
            }
            m_writer->emit(")");
            return true;
        }
        case kIROp_FloatCast:
        case kIROp_CastIntToFloat:
        case kIROp_IntCast:
        case kIROp_CastFloatToInt:
        {
            if (auto dstVectorType = as<IRVectorType>(inst->getDataType()))
            {
                m_writer->emit("make_");
                emitType(inst->getDataType());
                m_writer->emit("(");
                bool isFirst = true;
                char xyzwNames[] = "xyzw";
                for (UInt i = 0; i < inst->getOperandCount(); i++)
                {
                    auto arg = inst->getOperand(i);
                    if (auto vectorType = as<IRVectorType>(arg->getDataType()))
                    {
                        for (int j = 0; j < cast<IRIntLit>(vectorType->getElementCount())->getValue(); j++)
                        {
                            if (isFirst)
                                isFirst = false;
                            else
                                m_writer->emit(", ");
                            m_writer->emit("(");
                            emitType(dstVectorType->getElementType());
                            m_writer->emit(")");
                            auto outerPrec = getInfo(EmitOp::General);
                            auto prec = getInfo(EmitOp::Postfix);
                            emitOperand(arg, leftSide(outerPrec, prec));
                            m_writer->emit(".");
                            m_writer->emitChar(xyzwNames[j]);
                        }
                    }
                    else
                    {
                        if (isFirst)
                            isFirst = false;
                        else
                            m_writer->emit(", ");
                        m_writer->emit("(");
                        emitType(dstVectorType->getElementType());
                        m_writer->emit(")");
                        emitOperand(arg, getInfo(EmitOp::General));
                    }
                }
                m_writer->emit(")");
                return true;
            }
            else if (auto matrixType = as<IRMatrixType>(inst->getDataType()))
            {
                m_writer->emit("make");
                emitType(inst->getDataType());
                m_writer->emit("(");
                for (UInt i = 0; i < inst->getOperandCount(); i++)
                {
                    auto arg = inst->getOperand(i);
                    if (i > 0)
                        m_writer->emit(", ");
                    emitOperand(arg, getInfo(EmitOp::General));
                }
                m_writer->emit(")");
                return true;
            }
            return false;
        }
        case kIROp_MakeMatrix:
        case kIROp_MakeMatrixFromScalar:
        case kIROp_MatrixReshape:
        {
            m_writer->emit("make");
            emitType(inst->getDataType());
            m_writer->emit("(");
            for (UInt i = 0; i < inst->getOperandCount(); i++)
            {
                auto arg = inst->getOperand(i);
                if (i > 0)
                    m_writer->emit(", ");
                emitOperand(arg, getInfo(EmitOp::General));
            }
            m_writer->emit(")");
            return true;
        }
        case kIROp_MakeArray:
        {
            IRType* dataType = inst->getDataType();
            IRArrayType* arrayType = as<IRArrayType>(dataType);
            IRType* elementType = arrayType->getElementType();

            // Emit braces for the FixedArray struct.
            _emitInitializerList(elementType, inst->getOperands(), Index(inst->getOperandCount()));
            return true;
        }
        case kIROp_WaveMaskBallot:
        {
            m_extensionTracker->requireSMVersion(SemanticVersion(7, 0));

            m_writer->emit("__ballot_sync(");
            emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
            m_writer->emit(", ");
            emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
            m_writer->emit(")");
            return true;
        }
        case kIROp_WaveMaskMatch:
        {
            m_extensionTracker->requireSMVersion(SemanticVersion(7, 0));

            m_writer->emit("__match_any_sync(");
            emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
            m_writer->emit(", ");
            emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
            m_writer->emit(")");
            return true;
        }
        case kIROp_GetOptiXRayPayloadPtr:
        {
            m_writer->emit("(");
            emitType(inst->getDataType());
            m_writer->emit(")getOptiXRayPayloadPtr()");
            return true;
        }
        case kIROp_GetOptiXHitAttribute:
        {
            auto typeToFetch = inst->getOperand(0);
            auto idxInst = as<IRIntLit>(inst->getOperand(1));
            IRIntegerValue idx = idxInst->getValue();
            if (typeToFetch->getOp() == kIROp_FloatType)
            {
                m_writer->emit("__int_as_float(optixGetAttribute_");
            }
            else
            {
                m_writer->emit("optixGetAttribute_");
            }
            m_writer->emit(idx);
            if (typeToFetch->getOp() == kIROp_FloatType)
            {
                m_writer->emit("())");
            }
            else
            {
                m_writer->emit("()");
            }
            return true;
        }
        case kIROp_GetOptiXSbtDataPtr:
        {
            m_writer->emit("((");
            emitType(inst->getDataType());
            m_writer->emit(")optixGetSbtDataPointer())");
            return true;
        }
        case kIROp_DispatchKernel:
        {
            auto dispatchInst = as<IRDispatchKernel>(inst);
            emitOperand(dispatchInst->getBaseFn(), getInfo(EmitOp::Atomic));
            m_writer->emit("<<<");
            emitOperand(dispatchInst->getThreadGroupSize(), getInfo(EmitOp::General));
            m_writer->emit(", ");
            emitOperand(dispatchInst->getDispatchSize(), getInfo(EmitOp::General));
            m_writer->emit(">>>(");
            for (UInt i = 0; i < dispatchInst->getArgCount(); i++)
            {
                if (i > 0)
                    m_writer->emit(", ");
                emitOperand(dispatchInst->getArg(i), getInfo(EmitOp::General));
            }
            m_writer->emit(")");
            return true;
        }
        default:
            break;
    }

    return Super::tryEmitInstExprImpl(inst, inOuterPrec);
}
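// For illustration (hypothetical names, not from the original source): a `kIROp_DispatchKernel`
// whose base function is `addKernel` and which takes two arguments is emitted as
//
//     addKernel<<<threadGroupSize, dispatchSize>>>(arg0, arg1)
//
// while the wave-mask ops above lower to `__ballot_sync`/`__match_any_sync`, which is why they
// bump the required SM version to 7.0.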
void CUDASourceEmitter::handleRequiredCapabilitiesImpl(IRInst* inst)
{
    // Does this function declare any requirements on CUDA capabilities
    // that should affect output?
    for (auto decoration : inst->getDecorations())
    {
        if (auto smDecoration = as<IRRequireCUDASMVersionDecoration>(decoration))
        {
            SemanticVersion version;
            version.setFromInteger(SemanticVersion::IntegerType(smDecoration->getCUDASMVersion()));
            m_extensionTracker->requireSMVersion(version);
        }
    }
}

void CUDASourceEmitter::emitVectorTypeNameImpl(IRType* elementType, IRIntegerValue elementCount)
{
    m_writer->emit(getVectorPrefix(elementType->getOp()));
    m_writer->emit(elementCount);
}

void CUDASourceEmitter::emitSimpleTypeImpl(IRType* type)
{
    switch (type->getOp())
    {
        case kIROp_VectorType:
        {
            auto vectorType = as<IRVectorType>(type);
            m_writer->emit(getVectorPrefix(vectorType->getElementType()->getOp()));
            m_writer->emit(as<IRIntLit>(vectorType->getElementCount())->getValue());
            break;
        }
        default:
            m_writer->emit(_getTypeName(type));
            break;
    }
}

void CUDASourceEmitter::emitRateQualifiersImpl(IRRate* rate)
{
    if (as<IRGroupSharedRate>(rate))
    {
        m_writer->emit("__shared__ ");
    }
}

void CUDASourceEmitter::emitSimpleFuncParamsImpl(IRFunc* func)
{
    m_writer->emit("(");

    bool hasEmittedParam = false;
    auto firstParam = func->getFirstParam();
    for (auto pp = firstParam; pp; pp = pp->getNextParam())
    {
        auto varLayout = getVarLayout(pp);
        if (varLayout && varLayout->findSystemValueSemanticAttr())
        {
            // If it has a system-value semantic, don't output it; it will be accessed via a global
            continue;
        }

        if (hasEmittedParam)
            m_writer->emit(", ");

        emitSimpleFuncParamImpl(pp);
        hasEmittedParam = true;
    }

    m_writer->emit(")");
}

void CUDASourceEmitter::emitSimpleFuncImpl(IRFunc* func)
{
    // Skip the CPP impl, as it does some processing we don't need here for entry points.
    CLikeSourceEmitter::emitSimpleFuncImpl(func);
}

void CUDASourceEmitter::emitSimpleValueImpl(IRInst* inst)
{
    // Make sure we convert float to half when emitting a half literal to avoid
    // overload ambiguity errors from CUDA.
    if (inst->getOp() == kIROp_FloatLit)
    {
        if (inst->getDataType()->getOp() == kIROp_HalfType)
        {
            m_writer->emit("__half(");
            CLikeSourceEmitter::emitSimpleValueImpl(inst);
            m_writer->emit(")");
            return;
        }
    }
    CLikeSourceEmitter::emitSimpleValueImpl(inst);
}

void CUDASourceEmitter::emitSemanticsImpl(IRInst* inst)
{
    Super::emitSemanticsImpl(inst);
}

void CUDASourceEmitter::emitInterpolationModifiersImpl(IRInst* varInst, IRType* valueType, IRVarLayout* layout)
{
    Super::emitInterpolationModifiersImpl(varInst, valueType, layout);
}

void CUDASourceEmitter::emitVarDecorationsImpl(IRInst* varDecl)
{
    Super::emitVarDecorationsImpl(varDecl);
}

void CUDASourceEmitter::emitMatrixLayoutModifiersImpl(IRVarLayout* layout)
{
    Super::emitMatrixLayoutModifiersImpl(layout);
}

void CUDASourceEmitter::emitPreModuleImpl()
{
    SourceWriter* writer = getSourceWriter();

    // Emit generated types/functions
    writer->emit("\n");
}

bool CUDASourceEmitter::tryEmitGlobalParamImpl(IRGlobalParam* varDecl, IRType* varType)
{
    // A global shader parameter in the IR for CUDA output will
    // either be the unique constant buffer that wraps all the
    // global-scope parameters in the original code (which is
    // handled as a special case before this routine would be
    // called), or it is one of the system-defined varying inputs
    // like `threadIdx`. We don't need to emit anything in the
    // output code for the latter case, so we emit nothing here
    // and return `true` so that the default handling is skipped.
    //
    SLANG_UNUSED(varDecl);
    SLANG_UNUSED(varType);
    return true;
}
void CUDASourceEmitter::emitModuleImpl(IRModule* module, DiagnosticSink* sink)
{
    CLikeSourceEmitter::emitModuleImpl(module, sink);

    // Emit all witness table definitions.
    _emitWitnessTableDefinitions();
}

} // namespace Slang