[clang] [llvm] [mlir] [Sema] Fix bug in builtin AS override (PR #138141)
via cfe-commits
cfe-commits at lists.llvm.org
Thu May 1 07:23:44 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-mlir-gpu
Author: Joe Nash (Sisyph)
<details>
<summary>Changes</summary>
Fix the logic in rewriteBuiltinFunctionDecl to work when the builtin
has a pointer parameter with an address space and one without a fixed
address space. A builtin fitting these criteria was recently added.
Change the attribute string to perform type checking on it, so without
the sema change compilation would fail with a wrong number of arguments
error.
---
Patch is 58.15 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/138141.diff
24 Files Affected:
- (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+1)
- (modified) clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp (+5)
- (modified) clang/lib/Sema/SemaAMDGPU.cpp (+1)
- (modified) clang/lib/Sema/SemaExpr.cpp (+2-4)
- (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl (+30)
- (added) clang/test/CodeGenOpenCL/builtins-amdgcn-load-to-lds.cl (+60)
- (added) clang/test/SemaOpenCL/builtins-amdgcn-load-to-lds-err.cl (+25)
- (modified) llvm/docs/AMDGPUUsage.rst (+9-1)
- (modified) llvm/docs/ReleaseNotes.md (+8)
- (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+20)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (+5)
- (modified) llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp (+20)
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+2)
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+7-1)
- (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.gfx950.ll (+128)
- (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll (+220)
- (modified) llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll (+18)
- (modified) mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td (+7-5)
- (modified) mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td (+29-6)
- (modified) mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp (+7-8)
- (modified) mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp (+15-6)
- (modified) mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir (+51-16)
- (modified) mlir/test/Dialect/LLVMIR/rocdl.mlir (+10-7)
- (modified) mlir/test/Target/LLVMIR/rocdl.mlir (+7-4)
``````````diff
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 39fef9e4601f8..802b4be42419d 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -257,6 +257,7 @@ TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2bf16, "V2sV2s*0V2s", "t", "at
TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_v2bf16, "V2sV2s*1V2s", "t", "atomic-global-pk-add-bf16-inst")
TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2bf16, "V2sV2s*3V2s", "t", "atomic-ds-pk-add-16-insts")
TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2f16, "V2hV2h*3V2h", "t", "atomic-ds-pk-add-16-insts")
+TARGET_BUILTIN(__builtin_amdgcn_load_to_lds, "vv*v*3IUiIiIUi", "", "vmem-to-lds-load-insts")
TARGET_BUILTIN(__builtin_amdgcn_global_load_lds, "vv*1v*3IUiIiIUi", "t", "vmem-to-lds-load-insts")
//===----------------------------------------------------------------------===//
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index ad012d98635ff..d1c722c9dc610 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -564,6 +564,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
llvm::Function *F = CGM.getIntrinsic(IID, {LoadTy});
return Builder.CreateCall(F, {Addr});
}
+ case AMDGPU::BI__builtin_amdgcn_load_to_lds: {
+ // Should this have asan instrumentation?
+ return emitBuiltinWithOneOverloadedType<5>(*this, E,
+ Intrinsic::amdgcn_load_to_lds);
+ }
case AMDGPU::BI__builtin_amdgcn_get_fpenv: {
Function *F = CGM.getIntrinsic(Intrinsic::get_fpenv,
{llvm::Type::getInt64Ty(getLLVMContext())});
diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
index a6366aceec2a6..e6414a623b929 100644
--- a/clang/lib/Sema/SemaAMDGPU.cpp
+++ b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -36,6 +36,7 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID,
switch (BuiltinID) {
case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_load_lds:
+ case AMDGPU::BI__builtin_amdgcn_load_to_lds:
case AMDGPU::BI__builtin_amdgcn_global_load_lds: {
constexpr const int SizeIdx = 2;
llvm::APSInt Size;
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 283d910a09d54..1be03327ae915 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -6311,7 +6311,8 @@ static FunctionDecl *rewriteBuiltinFunctionDecl(Sema *Sema, ASTContext &Context,
return nullptr;
Expr *Arg = ArgRes.get();
QualType ArgType = Arg->getType();
- if (!ParamType->isPointerType() || ParamType.hasAddressSpace() ||
+ if (!ParamType->isPointerType() ||
+ ParamType->getPointeeType().hasAddressSpace() ||
!ArgType->isPointerType() ||
!ArgType->getPointeeType().hasAddressSpace() ||
isPtrSizeAddressSpace(ArgType->getPointeeType().getAddressSpace())) {
@@ -6320,9 +6321,6 @@ static FunctionDecl *rewriteBuiltinFunctionDecl(Sema *Sema, ASTContext &Context,
}
QualType PointeeType = ParamType->getPointeeType();
- if (PointeeType.hasAddressSpace())
- continue;
-
NeedsNewDecl = true;
LangAS AS = ArgType->getPointeeType().getAddressSpace();
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
index 8251d6c213e3d..3d81893553c65 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
@@ -1766,6 +1766,36 @@ void test_cvt_sr_f16_f32(global half2 *out, float src, uint seed)
*out = __builtin_amdgcn_cvt_sr_f16_f32(*out, src, seed, 1);
}
+// CHECK-LABEL: @test_load_to_lds_96(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
+// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 12, i32 0, i32 0)
+// CHECK-NEXT: ret void
+//
+void test_load_to_lds_96(global void* src, local void *dst) {
+ __builtin_amdgcn_load_to_lds(src, dst, /*size=*/12, /*offset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_load_to_lds_128(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
+// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 16, i32 0, i32 0)
+// CHECK-NEXT: ret void
+//
+void test_load_to_lds_128(global void* src, local void *dst) {
+ __builtin_amdgcn_load_to_lds(src, dst, /*size=*/16, /*offset=*/0, /*aux=*/0);
+}
+
// CHECK-LABEL: @test_global_load_lds_96(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-load-to-lds.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-load-to-lds.cl
new file mode 100644
index 0000000000000..6cdedb33bdd80
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-load-to-lds.cl
@@ -0,0 +1,60 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx900 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx942 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck %s
+// REQUIRES: amdgpu-registered-target
+
+typedef unsigned int u32;
+typedef unsigned short u16;
+typedef unsigned char u8;
+
+// CHECK-LABEL: @test_load_to_lds_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
+// CHECK-NEXT: [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 4, i32 0, i32 0)
+// CHECK-NEXT: ret void
+//
+void test_load_to_lds_u32(global u32* src, local u32 *dst) {
+ __builtin_amdgcn_load_to_lds(src, dst, /*size=*/4, /*offset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_load_to_lds_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
+// CHECK-NEXT: [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 2, i32 0, i32 0)
+// CHECK-NEXT: ret void
+//
+void test_load_to_lds_u16(global u16* src, local u16 *dst) {
+ __builtin_amdgcn_load_to_lds(src, dst, /*size=*/2, /*offset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_load_to_lds_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
+// CHECK-NEXT: [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[SRC_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr [[DST_ADDR_ASCAST]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 1, i32 0, i32 0)
+// CHECK-NEXT: ret void
+//
+void test_load_to_lds_u8(global u8* src, local u8 *dst) {
+ __builtin_amdgcn_load_to_lds(src, dst, /*size=*/1, /*offset=*/0, /*aux=*/0);
+}
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-load-to-lds-err.cl b/clang/test/SemaOpenCL/builtins-amdgcn-load-to-lds-err.cl
new file mode 100644
index 0000000000000..d93d724212077
--- /dev/null
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-load-to-lds-err.cl
@@ -0,0 +1,25 @@
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx900 -S -verify=gfx,expected -o - %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx942 -S -verify=gfx,expected -o - %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -S -verify=gfx,expected -o - %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx950 -S -verify=gfx950,expected -o - %s
+// REQUIRES: amdgpu-registered-target
+
+typedef unsigned int u32;
+
+void test_load_to_lds_unsupported_size(global u32* src, local u32 *dst, u32 size, u32 offset, u32 aux) {
+ __builtin_amdgcn_load_to_lds(src, dst, size, /*offset=*/0, /*aux=*/0); // expected-error{{argument to '__builtin_amdgcn_load_to_lds' must be a constant integer}}
+ __builtin_amdgcn_load_to_lds(src, dst, /*size=*/4, offset, /*aux=*/0); // expected-error{{argument to '__builtin_amdgcn_load_to_lds' must be a constant integer}}
+ __builtin_amdgcn_load_to_lds(src, dst, /*size=*/4, /*offset=*/0, aux); // expected-error{{argument to '__builtin_amdgcn_load_to_lds' must be a constant integer}}
+ __builtin_amdgcn_load_to_lds(src, dst, /*size=*/5, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}}
+ __builtin_amdgcn_load_to_lds(src, dst, /*size=*/0, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}}
+ __builtin_amdgcn_load_to_lds(src, dst, /*size=*/3, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}}
+ __builtin_amdgcn_load_to_lds(src, dst, /*size=*/12, /*offset=*/0, /*aux=*/0); // gfx-error{{invalid size value}} gfx-note {{size must be 1, 2, or 4}}
+ __builtin_amdgcn_load_to_lds(src, dst, /*size=*/16, /*offset=*/0, /*aux=*/0); // gfx-error{{invalid size value}} gfx-note {{size must be 1, 2, or 4}}
+ __builtin_amdgcn_load_to_lds(src, dst, /*size=*/-1, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}}
+}
+
+__attribute__((target("gfx950-insts")))
+void test_load_to_lds_via_target_feature(global u32* src, local u32 *dst, u32 size, u32 offset, u32 aux) {
+ __builtin_amdgcn_load_to_lds(src, dst, /*size=*/12, /*offset=*/0, /*aux=*/0);
+ __builtin_amdgcn_load_to_lds(src, dst, /*size=*/16, /*offset=*/0, /*aux=*/0);
+}
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index d1535960a0257..3ee0f8cae3fc2 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1216,7 +1216,15 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
The format is a 64-bit concatenation of the MODE and TRAPSTS registers.
:ref:`llvm.set.fpenv<int_set_fpenv>` Sets the floating point environment to the specifies state.
-
+ llvm.amdgcn.load.to.lds.p<1/7> Loads values from global memory (either in the form of a global pointer
+ or a raw fat buffer pointer) to LDS. The size of the data copied can be 1, 2,
+ or 4 bytes (and gfx950 also allows 12 or 16 bytes). The LDS pointer
+ argument should be wavefront-uniform; the global pointer need not be.
+ The LDS pointer is implicitly offset by 4 * lane_id bytes for sizes <= 4 bytes
+ and 16 * lane_id bytes for larger sizes. This lowers to `global_load_lds`,
+ `buffer_load_* ... lds`, or `global_load__* ... lds` depending on address
+ space and architecture. `amdgcn.global.load.lds` has the same semantics as
+ `amdgcn.load.to.lds.p1`.
llvm.amdgcn.readfirstlane Provides direct access to v_readfirstlane_b32. Returns the value in
the lowest active lane of the input operand. Currently implemented
for i16, i32, float, half, bfloat, <2 x i16>, <2 x half>, <2 x bfloat>,
diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index 6fb206e4df188..d86fc74fe2889 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -102,6 +102,14 @@ Changes to the AMDGPU Backend
* Bump the default `.amdhsa_code_object_version` to 6. ROCm 6.3 is required to run any program compiled with COV6.
+* Add a new `amdgcn.load.to.lds` intrinsic that wraps the existing global.load.lds
+intrinsic and has the same semantics. This intrinsic allows using buffer fat pointers
+(`ptr addrspace(7)`) as arguments, allowing loads to LDS from these pointers to be
+represented in the IR without needing to use buffer resource intrinsics directly.
+This intrinsic is exposed to Clang as `__builtin_amdgcn_load_to_lds`, though
+buffer fat pointers are not yet enabled in Clang. Migration to this intrinsic is
+optional, and there are no plans to deprecate `amdgcn.global.load.lds`.
+
Changes to the ARM Backend
--------------------------
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index a57eb4a6dba49..7939ef0cf8620 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2641,6 +2641,26 @@ def int_amdgcn_perm :
// GFX9 Intrinsics
//===----------------------------------------------------------------------===//
+/// This is a general-purpose intrinsic for all operations that take a pointer,
+/// a base location in LDS, and a data size and use them to perform a gather to LDS.
+/// This allows abstracting over both global pointers (address space 1) and
+/// the buffer-resource-wrapper pointers (address space 7 and 9).
+/// TODO: add support for address space 5 and scratch_load_lds.
+class AMDGPULoadToLDS :
+ Intrinsic <
+ [],
+ [llvm_anyptr_ty, // Base pointer to load from. Varies per lane.
+ LLVMQualPointerType<3>, // LDS base pointer to store to. Must be wave-uniform.
+ llvm_i32_ty, // Data byte size: 1/2/4 (/12/16 for gfx950)
+ llvm_i32_ty, // imm offset (applied to both input and LDS address)
+ llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = sc0,
+ // bit 1 = sc1,
+ // bit 4 = scc))
+ [IntrWillReturn, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
+ ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree],
+ "", [SDNPMemOperand]>;
+def int_amdgcn_load_to_lds : AMDGPULoadToLDS;
+
class AMDGPUGlobalLoadLDS :
ClangBuiltin<"__builtin_amdgcn_global_load_lds">,
Intrinsic <
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 2fa03e3964207..907b5b7e705d7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2335,6 +2335,11 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
case Intrinsic::amdgcn_struct_buffer_load_lds:
case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
return selectBufferLoadLds(I);
+ // Until we can store both the address space of the global and the LDS
+ // arguments by having two MachineMemOperands on an intrinsic, we just trust
+ // that the argument is a global pointer (buffer pointers have been handled by
+ // a LLVM IR-level lowering).
+ case Intrinsic::amdgcn_load_to_lds:
case Intrinsic::amdgcn_global_load_lds:
return selectGlobalLoadLds(I);
case Intrinsic::amdgcn_exp_compr:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 7163ad2aa7dca..f86aafdf08f9a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -2167,6 +2167,7 @@ static bool isRemovablePointerIntrinsic(Intrinsic::ID IID) {
case Intrinsic::memset:
case Intrinsic::memset_inline:
case Intrinsic::experimental_memset_pattern:
+ case Intrinsic::amdgcn_load_to_lds:
return true;
}
}
@@ -2255,6 +2256,25 @@ PtrParts SplitPtrStructs::visitIntrinsicInst(IntrinsicInst &I) {
SplitUsers.insert(&I);
return {NewRsrc, Off};
}
+ case Intrinsic::amdgcn_load_to_lds: {
+ Value *Ptr = I.getArgOperand(0);
+ if (!isSplitFatPtr(Ptr->getType()))
+ return {nullptr, nullptr};
+ IRB.SetInsertPoint(&I);
+ auto [Rsrc, Off] = getPtrParts(Ptr);
+ Value *LDSPtr = I.getArgOperand(1);
+ Value *LoadSize = I.getArgOperand(2);
+ Value *ImmOff = I.getArgOperand(3);
+ Value *Aux = I.getArgOperand(4);
+ Value *SOffset = IRB.getInt32(0);
+ Instruction *NewLoad = IRB.CreateIntrinsic(
+ Intrinsic::amdgcn_raw_ptr_buffer_load_lds, {},
+ {Rsrc, LDSPtr, LoadSize, Off, SOffset, ImmOff, Aux});
+ copyMetadata(NewLoad, &I);
+ SplitUsers.insert(&I);
+ I.replaceAllUsesWith(NewLoad);
+ return {nullptr, nullptr};
+ }
}
return {nullptr, nullptr};
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 1d0e81db5a5db..6085c8d584af2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3312,6 +3312,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
constrainOpWithReadfirstlane(B, MI, 6); // soffset
return;
}
+ case Intrinsic::amdgcn_load_to_lds:
case Intrinsic::amdgcn_global_load_lds: {
applyDefaultMapping(OpdMapper);
constrainOpWithR...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/138141
More information about the cfe-commits
mailing list