[clang] [llvm] [SPIRV] GPU intrinsics (PR #131190)
Jon Chesterfield via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 13 13:09:44 PDT 2025
================
@@ -0,0 +1,501 @@
+//===- LowerGPUIntrinsic.cpp ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Lower the llvm.gpu intrinsics to target-specific code sequences.
+// Can be called from clang if building for a specific GPU or from the backend
+// as part of a SPIRV lowering pipeline. The initial pass can lower to amdgcn
+// or nvptx; adding further architectures means adding a column to the lookup
+// table, and adding further intrinsics means adding a row.
+//
+// The idea is for the intrinsics to represent a thin abstraction over the
+// different GPU architectures. In particular, code compiled to spirv-- without
+// specifying a specific target can be specialised at JIT time, at which point
+// this pass will rewrite those intrinsics to ones that the current backend
+// knows.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LowerGPUIntrinsic.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/IntrinsicsNVPTX.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/TargetParser/Triple.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+#define DEBUG_TYPE "lower-gpu-intrinsic"
+
+using namespace llvm;
+
+namespace {
+
+// For each intrinsic, specify what function to call to lower it
+typedef bool (*lowerFunction)(Module &M, IRBuilder<> &, Intrinsic::ID from,
+ CallBase *CI);
+
+// Simple lowering, directly replace the intrinsic with a different one
+// with the same type, and optionally refine range metadata on the return value
+template <Intrinsic::ID To>
+bool S(Module &M, IRBuilder<> &, Intrinsic::ID from, CallBase *CI) {
+
+ static_assert(To != Intrinsic::not_intrinsic);
+ Intrinsic::ID GenericID = from;
+ Intrinsic::ID SpecificID = To;
+
+ bool Changed = false;
+ Function *Generic = Intrinsic::getDeclarationIfExists(&M, GenericID);
+ auto *Specific = Intrinsic::getOrInsertDeclaration(&M, SpecificID);
+
+ if ((Generic->getType() != Specific->getType()) ||
+ (Generic->getReturnType() != Specific->getReturnType()))
+ report_fatal_error("LowerGPUIntrinsic: Inconsistent types between "
+ "intrinsics in lookup table");
+
+ CI->setCalledFunction(Specific);
+ Changed = true;
+
+ return Changed;
+}
+
+// Replace intrinsic call with a linear sequence of instructions
+typedef Value *(*builder)(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+ CallBase *CI);
+
+template <builder F>
+bool B(Module &M, IRBuilder<> &Builder, Intrinsic::ID from, CallBase *CI) {
+ bool Changed = false;
+
+ Builder.SetInsertPoint(CI);
+
+ Value *replacement = F(M, Builder, from, CI);
+ if (replacement) {
+ CI->replaceAllUsesWith(replacement);
+ CI->eraseFromParent();
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+template <Intrinsic::ID Numerator, Intrinsic::ID Denominator>
+Value *intrinsicRatio(Module &M, IRBuilder<> &Builder, Intrinsic::ID,
+ CallBase *) {
+ Value *N = Builder.CreateIntrinsic(Numerator, {}, {});
+ Value *D = Builder.CreateIntrinsic(Denominator, {}, {});
+ return Builder.CreateUDiv(N, D);
+}
+
+namespace amdgpu {
+Value *lane_mask(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+ CallBase *CI) {
+ auto &Ctx = M.getContext();
+ return Builder.CreateIntrinsic(
+ Intrinsic::amdgcn_ballot, {Type::getInt64Ty(Ctx)},
+ {ConstantInt::get(Type::getInt1Ty(Ctx), true)});
+}
+
+Value *lane_id(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+ CallBase *CI) {
+ auto &Ctx = M.getContext();
+ Constant *M1 = ConstantInt::get(Type::getInt32Ty(Ctx), -1);
+ Constant *Z = ConstantInt::get(Type::getInt32Ty(Ctx), 0);
+
+ CallInst *Lo =
+ Builder.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {}, {M1, Z});
+ return Builder.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {M1, Lo});
+}
+
+Value *first_lane(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+ CallBase *CI) {
+ auto &Ctx = M.getContext();
+ return Builder.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane,
+ {Type::getInt32Ty(Ctx)},
+ {CI->getArgOperand(1)});
+}
+
+Value *shuffle_idx(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+ CallBase *CI) {
+ auto &Ctx = M.getContext();
+
+ Value *idx = CI->getArgOperand(1);
+ Value *x = CI->getArgOperand(2);
+ Value *width = CI->getArgOperand(3);
+
+ Value *id = Builder.CreateIntrinsic(Intrinsic::gpu_lane_id, {}, {});
+
+ Value *n = Builder.CreateSub(ConstantInt::get(Type::getInt32Ty(Ctx), 0),
+ width, "not");
+ Value *a = Builder.CreateAnd(id, n, "and");
+ Value *add = Builder.CreateAdd(a, idx, "add");
+ Value *shl =
+ Builder.CreateShl(add, ConstantInt::get(Type::getInt32Ty(Ctx), 2), "shl");
+ return Builder.CreateIntrinsic(Intrinsic::amdgcn_ds_bpermute, {}, {shl, x});
+}
+
+Value *ballot(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+ CallBase *CI) {
+ auto &Ctx = M.getContext();
+
+ Value *C =
+ Builder.CreateIntrinsic(Intrinsic::amdgcn_ballot, {Type::getInt64Ty(Ctx)},
+ {CI->getArgOperand(1)});
+
+ return Builder.CreateAnd(C, CI->getArgOperand(0));
+}
+
+Value *sync_threads(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+ CallBase *CI) {
+ auto &Ctx = M.getContext();
+ Builder.CreateIntrinsic(Intrinsic::amdgcn_s_barrier, {}, {});
+
+ Value *F = Builder.CreateFence(AtomicOrdering::SequentiallyConsistent,
+ Ctx.getOrInsertSyncScopeID("workgroup"));
+
+ return F;
+}
+
+Value *sync_lane(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+ CallBase *CI) {
+ return Builder.CreateIntrinsic(Intrinsic::amdgcn_wave_barrier, {}, {});
+}
+
+Value *thread_suspend(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+ CallBase *CI) {
+
+ auto &Ctx = M.getContext();
+ return Builder.CreateIntrinsic(Intrinsic::amdgcn_s_sleep, {},
+ {ConstantInt::get(Type::getInt32Ty(Ctx), 2)});
+}
+
+Value *dispatch_ptr(IRBuilder<> &Builder) {
+ CallInst *Call =
+ Builder.CreateIntrinsic(Intrinsic::amdgcn_dispatch_ptr, {}, {});
+ Call->addRetAttr(
+ Attribute::getWithDereferenceableBytes(Call->getContext(), 64));
+ Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(4)));
+ return Call;
+}
+
+Value *implicit_arg_ptr(IRBuilder<> &Builder) {
+ CallInst *Call =
+ Builder.CreateIntrinsic(Intrinsic::amdgcn_implicitarg_ptr, {}, {});
+ Call->addRetAttr(
+ Attribute::getWithDereferenceableBytes(Call->getContext(), 256));
+ Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(8)));
+ return Call;
+}
+
+template <unsigned Index>
+Value *grid_size(Module &M, IRBuilder<> &Builder, Intrinsic::ID, CallBase *) {
+ auto &Ctx = M.getContext();
+ const unsigned XOffset = 12;
+ auto *DP = dispatch_ptr(Builder);
+
+ // Indexing the HSA kernel_dispatch_packet struct.
+ auto *Offset = ConstantInt::get(Type::getInt32Ty(Ctx), XOffset + Index * 4);
+ auto *GEP = Builder.CreateGEP(Type::getInt8Ty(Ctx), DP, Offset);
+ auto *LD = Builder.CreateLoad(Type::getInt32Ty(Ctx), GEP);
+ llvm::MDBuilder MDB(Ctx);
+ // Known non-zero.
+ LD->setMetadata(llvm::LLVMContext::MD_range,
+ MDB.createRange(APInt(32, 1), APInt::getZero(32)));
+ LD->setMetadata(llvm::LLVMContext::MD_invariant_load,
+ llvm::MDNode::get(Ctx, {}));
+ return LD;
+}
+
+template <int Index>
+Value *WGSize(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+ CallBase *CI) {
+
+ // Note: "__oclc_ABI_version" is supposed to be emitted and initialized by
+ // clang during compilation of user code.
+ StringRef Name = "__oclc_ABI_version";
----------------
JonChesterfield wrote:
That's the lowering of the intrinsic to amdgpu, so it's going to be amdgpu-specific. Prior to the lowering, which can happen at JIT time, you've got llvm.gpu.num_threads_x or __builtin_gpu_num_threads_x.
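
To make the lookup-table idea concrete, here is a rough sketch (illustrative only; the field names, the exact columns, and the nvptx intrinsic chosen here are assumptions and need not match the patch) of how a generic llvm.gpu intrinsic could be wired to the per-target lowerings above, reusing the lowerFunction typedef and the S<>/B<> helpers from this file:

  // Hypothetical table layout, not the patch's actual one.
  struct IntrinsicMap {
    Intrinsic::ID Generic; // the llvm.gpu.* intrinsic being lowered
    lowerFunction AMDGPU;  // lowering used when targeting amdgcn
    lowerFunction NVPTX;   // lowering used when targeting nvptx
  };

  static const IntrinsicMap Table[] = {
      // llvm.gpu.lane.id: built from mbcnt_lo/hi on amdgcn, a plain
      // intrinsic-for-intrinsic swap on nvptx (assuming
      // nvvm_read_ptx_sreg_laneid as the target here).
      {Intrinsic::gpu_lane_id, B<amdgpu::lane_id>,
       S<Intrinsic::nvvm_read_ptx_sreg_laneid>},
  };

Before a column is picked, IR built for spirv-- just carries the generic call, so the same bitcode can be specialised to whichever target the JIT ends up on.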
https://github.com/llvm/llvm-project/pull/131190