[llvm-branch-commits] [clang] [llvm] AMDGPU: Add support for load transpose instructions for gfx950 (PR #117378)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Fri Nov 22 13:12:45 PST 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Matt Arsenault (arsenm)
<details>
<summary>Changes</summary>
This patch support for intrinsics in clang, as well as assembly
instructions in the backend.
Co-authored-by: Sirish Pande <Sirish.Pande@<!-- -->amd.com>
---
Patch is 28.39 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/117378.diff
13 Files Affected:
- (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+5)
- (modified) clang/lib/CodeGen/CGBuiltin.cpp (+17-2)
- (added) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-read-tr.cl (+50)
- (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+4)
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+4)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td (+5)
- (modified) llvm/lib/Target/AMDGPU/DSInstructions.td (+33)
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+9-1)
- (modified) llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll (+44)
- (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll (+108)
- (modified) llvm/test/MC/AMDGPU/gfx950-unsupported.s (+77)
- (added) llvm/test/MC/AMDGPU/gfx950_asm_read_tr.s (+34)
- (added) llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_ds_read_tr.txt (+37)
``````````diff
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 548bcc8ad55f48..a42ad56ce4f998 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -462,6 +462,11 @@ TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x64_fp8_fp8, "V16fV4iV8iV16fiIiI
TARGET_BUILTIN(__builtin_amdgcn_permlane16_swap, "V2UiUiUiIbIb", "nc", "permlane16-swap")
TARGET_BUILTIN(__builtin_amdgcn_permlane32_swap, "V2UiUiUiIbIb", "nc", "permlane32-swap")
+TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr4_b64_v2i32, "V2iV2i*3", "nc", "gfx950-insts")
+TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr6_b96_v3i32, "V3iV3i*3", "nc", "gfx950-insts")
+TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr8_b64_v2i32, "V2iV2i*3", "nc", "gfx950-insts")
+TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr16_b64_v4i16, "V4sV4s*3", "nc", "gfx950-insts")
+
//===----------------------------------------------------------------------===//
// GFX12+ only builtins.
//===----------------------------------------------------------------------===//
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 8f754953d28998..d8c13584fa7aec 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -19697,8 +19697,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
- case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16: {
-
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
+ case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
+ case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32:
+ case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32:
+ case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16: {
Intrinsic::ID IID;
switch (BuiltinID) {
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
@@ -19713,6 +19716,18 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
IID = Intrinsic::amdgcn_global_load_tr_b128;
break;
+ case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
+ IID = Intrinsic::amdgcn_ds_read_tr4_b64;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32:
+ IID = Intrinsic::amdgcn_ds_read_tr8_b64;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32:
+ IID = Intrinsic::amdgcn_ds_read_tr6_b96;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16:
+ IID = Intrinsic::amdgcn_ds_read_tr16_b64;
+ break;
}
llvm::Type *LoadTy = ConvertType(E->getType());
llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-read-tr.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-read-tr.cl
new file mode 100644
index 00000000000000..39fa46d5845f42
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-read-tr.cl
@@ -0,0 +1,50 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx950 -emit-llvm -o - %s | FileCheck --check-prefix=GFX950 %s
+
+typedef int v2i __attribute__((ext_vector_type(2)));
+typedef int v3i __attribute__((ext_vector_type(3)));
+typedef short v4s __attribute__((ext_vector_type(4)));
+
+// GFX950-LABEL: define dso_local <2 x i32> @test_amdgcn_ds_read_b64_tr_b4_v2i32(
+// GFX950-SAME: ptr addrspace(3) nocapture noundef readonly [[INPTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// GFX950-NEXT: entry:
+// GFX950-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32(ptr addrspace(3) [[INPTR]])
+// GFX950-NEXT: ret <2 x i32> [[TMP0]]
+//
+v2i test_amdgcn_ds_read_b64_tr_b4_v2i32(local v2i* inptr)
+{
+ return __builtin_amdgcn_ds_read_tr4_b64_v2i32(inptr);
+}
+
+// GFX950-LABEL: define dso_local <3 x i32> @test_amdgcn_ds_read_b96_tr_b6_v3i32(
+// GFX950-SAME: ptr addrspace(3) nocapture noundef readonly [[INPTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// GFX950-NEXT: entry:
+// GFX950-NEXT: [[TMP0:%.*]] = tail call <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32(ptr addrspace(3) [[INPTR]])
+// GFX950-NEXT: ret <3 x i32> [[TMP0]]
+//
+v3i test_amdgcn_ds_read_b96_tr_b6_v3i32(local v3i* inptr)
+{
+ return __builtin_amdgcn_ds_read_tr6_b96_v3i32(inptr);
+}
+
+// GFX950-LABEL: define dso_local <2 x i32> @test_amdgcn_ds_read_b64_tr_b8_v2i32(
+// GFX950-SAME: ptr addrspace(3) nocapture noundef readonly [[INPTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// GFX950-NEXT: entry:
+// GFX950-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32(ptr addrspace(3) [[INPTR]])
+// GFX950-NEXT: ret <2 x i32> [[TMP0]]
+//
+v2i test_amdgcn_ds_read_b64_tr_b8_v2i32(local v2i* inptr)
+{
+ return __builtin_amdgcn_ds_read_tr8_b64_v2i32(inptr);
+}
+
+// GFX950-LABEL: define dso_local <4 x i16> @test_amdgcn_ds_read_b64_tr_b16_v2i16(
+// GFX950-SAME: ptr addrspace(3) nocapture noundef readonly [[INPTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// GFX950-NEXT: entry:
+// GFX950-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.ds.read.tr16.b64.v4i16(ptr addrspace(3) [[INPTR]])
+// GFX950-NEXT: ret <4 x i16> [[TMP0]]
+//
+v4s test_amdgcn_ds_read_b64_tr_b16_v2i16(local v4s* inptr)
+{
+ return __builtin_amdgcn_ds_read_tr16_b64_v4i16(inptr);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 73599851f50003..b46fe668ea7afd 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2726,6 +2726,10 @@ class AMDGPULoadIntrinsic<LLVMType ptr_ty>:
def int_amdgcn_global_load_tr_b64 : AMDGPULoadIntrinsic<global_ptr_ty>;
def int_amdgcn_global_load_tr_b128 : AMDGPULoadIntrinsic<global_ptr_ty>;
+def int_amdgcn_ds_read_tr4_b64 : AMDGPULoadIntrinsic<local_ptr_ty>;
+def int_amdgcn_ds_read_tr6_b96 : AMDGPULoadIntrinsic<local_ptr_ty>;
+def int_amdgcn_ds_read_tr8_b64 : AMDGPULoadIntrinsic<local_ptr_ty>;
+def int_amdgcn_ds_read_tr16_b64 : AMDGPULoadIntrinsic<local_ptr_ty>;
// i32 @llvm.amdgcn.wave.id()
def int_amdgcn_wave_id :
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 8c050348f753bb..b3a6f0fd09ea02 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4967,6 +4967,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
case Intrinsic::amdgcn_global_load_tr_b64:
case Intrinsic::amdgcn_global_load_tr_b128:
+ case Intrinsic::amdgcn_ds_read_tr4_b64:
+ case Intrinsic::amdgcn_ds_read_tr6_b96:
+ case Intrinsic::amdgcn_ds_read_tr8_b64:
+ case Intrinsic::amdgcn_ds_read_tr16_b64:
return getDefaultMappingAllVGPR(MI);
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap: {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index bc8b373d06e01a..10175557fadc71 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -345,6 +345,11 @@ def : SourceOfDivergence<intr>;
def : SourceOfDivergence<int_amdgcn_global_load_tr_b64>;
def : SourceOfDivergence<int_amdgcn_global_load_tr_b128>;
+def : SourceOfDivergence<int_amdgcn_ds_read_tr4_b64>;
+def : SourceOfDivergence<int_amdgcn_ds_read_tr6_b96>;
+def : SourceOfDivergence<int_amdgcn_ds_read_tr8_b64>;
+def : SourceOfDivergence<int_amdgcn_ds_read_tr16_b64>;
+
// The dummy boolean output is divergent from the IR's perspective,
// but the mask results are uniform. These produce a divergent and
// uniform result, so the returned struct is collectively divergent.
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 061ffda2498f45..7cbd6d2dc62097 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -294,6 +294,12 @@ multiclass DS_1A_RET_mc<string opName, RegisterClass rc = VGPR_32, bit HasTiedOu
}
}
+multiclass DS_1A_RET_NoM0<string opName, RegisterClass rc = VGPR_32> {
+ let has_m0_read = 0 in {
+ def "" : DS_1A_RET<opName, rc>;
+ }
+}
+
class DS_1A_RET_Tied<string opName, RegisterClass rc = VGPR_32> :
DS_1A_RET<opName, rc, 1>;
@@ -744,6 +750,13 @@ multiclass DSAtomicRetNoRetPatIntrinsic_mc<DS_Pseudo inst, DS_Pseudo noRetInst,
defm : DSAtomicRetNoRetPatIntrinsic_mc<DS_COND_SUB_RTN_U32, DS_COND_SUB_U32, i32, "int_amdgcn_atomic_cond_sub_u32">;
} // let SubtargetPredicate = isGFX12Plus
+let WaveSizePredicate = isWave64, SubtargetPredicate = HasGFX950Insts, mayStore = 0 in {
+ defm DS_READ_B64_TR_B4 : DS_1A_RET_NoM0<"ds_read_b64_tr_b4", VReg_64>;
+ defm DS_READ_B64_TR_B8 : DS_1A_RET_NoM0<"ds_read_b64_tr_b8", VReg_64>;
+ defm DS_READ_B64_TR_B16 : DS_1A_RET_NoM0<"ds_read_b64_tr_b16", VReg_64>;
+ defm DS_READ_B96_TR_B6 : DS_1A_RET_NoM0<"ds_read_b96_tr_b6", VReg_96>;
+}
+
//===----------------------------------------------------------------------===//
// DS Patterns
//===----------------------------------------------------------------------===//
@@ -1179,6 +1192,18 @@ def : GCNPat <
sub0)
>;
+class DSLoadTrPat <DS_Pseudo inst, ValueType vt, SDPatternOperator node> : GCNPat <
+ (vt (node (DS1Addr1Offset i32:$ptr, i32:$offset))),
+ (inst $ptr, Offset:$offset, (i1 0))
+>;
+
+let SubtargetPredicate = HasGFX950Insts in {
+ def : DSLoadTrPat <DS_READ_B64_TR_B4, v2i32, int_amdgcn_ds_read_tr4_b64>;
+ def : DSLoadTrPat <DS_READ_B64_TR_B8, v2i32, int_amdgcn_ds_read_tr8_b64>;
+ def : DSLoadTrPat <DS_READ_B96_TR_B6, v3i32, int_amdgcn_ds_read_tr6_b96>;
+ def : DSLoadTrPat <DS_READ_B64_TR_B16, v4i16, int_amdgcn_ds_read_tr16_b64>;
+}
+
//===----------------------------------------------------------------------===//
// Target-specific instruction encodings.
//===----------------------------------------------------------------------===//
@@ -1748,3 +1773,11 @@ def DS_PK_ADD_F16_vi : DS_Real_vi<0x17, DS_PK_ADD_F16>;
def DS_PK_ADD_RTN_F16_vi : DS_Real_vi<0xb7, DS_PK_ADD_RTN_F16>;
def DS_PK_ADD_BF16_vi : DS_Real_vi<0x18, DS_PK_ADD_BF16>;
def DS_PK_ADD_RTN_BF16_vi : DS_Real_vi<0xb8, DS_PK_ADD_RTN_BF16>;
+
+//===----------------------------------------------------------------------===//
+// GFX950.
+//===----------------------------------------------------------------------===//
+def DS_READ_B64_TR_B4_vi : DS_Real_vi<0x0e0, DS_READ_B64_TR_B4>;
+def DS_READ_B96_TR_B6_vi : DS_Real_vi<0x0e1, DS_READ_B96_TR_B6>;
+def DS_READ_B64_TR_B8_vi : DS_Real_vi<0x0e2, DS_READ_B64_TR_B8>;
+def DS_READ_B64_TR_B16_vi : DS_Real_vi<0x0e3, DS_READ_B64_TR_B16>;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index e520dfff1016b2..b30cf437877608 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1382,7 +1382,11 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return true;
}
case Intrinsic::amdgcn_global_load_tr_b64:
- case Intrinsic::amdgcn_global_load_tr_b128: {
+ case Intrinsic::amdgcn_global_load_tr_b128:
+ case Intrinsic::amdgcn_ds_read_tr4_b64:
+ case Intrinsic::amdgcn_ds_read_tr6_b96:
+ case Intrinsic::amdgcn_ds_read_tr8_b64:
+ case Intrinsic::amdgcn_ds_read_tr16_b64: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(CI.getType());
Info.ptrVal = CI.getOperand(0);
@@ -1477,6 +1481,10 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
case Intrinsic::amdgcn_atomic_cond_sub_u32:
case Intrinsic::amdgcn_ds_append:
case Intrinsic::amdgcn_ds_consume:
+ case Intrinsic::amdgcn_ds_read_tr4_b64:
+ case Intrinsic::amdgcn_ds_read_tr6_b96:
+ case Intrinsic::amdgcn_ds_read_tr8_b64:
+ case Intrinsic::amdgcn_ds_read_tr16_b64:
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap:
case Intrinsic::amdgcn_flat_atomic_fmax_num:
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
index 0dfd1d880f9cf2..fb1420ee340043 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
@@ -261,6 +261,50 @@ bb:
ret void
}
+declare <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32(ptr addrspace(3))
+
+; CHECK: DIVERGENT: %tmp0 = call <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32(ptr addrspace(3) %gep)
+define amdgpu_kernel void @ds_read_b64_tr4_v2i32(ptr addrspace(3) %addr, ptr addrspace(1) %out) {
+bb:
+ %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
+ %tmp0 = call <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32(ptr addrspace(3) %gep)
+ store <2 x i32> %tmp0, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+declare <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32(ptr addrspace(3))
+
+; CHECK: DIVERGENT: %tmp0 = call <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32(ptr addrspace(3) %gep)
+define amdgpu_kernel void @ds_read_b96_tr6_v3i32(ptr addrspace(3) %addr, ptr addrspace(1) %out) {
+bb:
+ %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
+ %tmp0 = call <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32(ptr addrspace(3) %gep)
+ store <3 x i32> %tmp0, ptr addrspace(1) %out, align 16
+ ret void
+}
+
+declare <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32(ptr addrspace(3))
+
+; CHECK: DIVERGENT: %tmp0 = call <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32(ptr addrspace(3) %gep)
+define amdgpu_kernel void @ds_read_b64_tr8_v2i32(ptr addrspace(3) %addr, ptr addrspace(1) %out) {
+bb:
+ %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
+ %tmp0 = call <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32(ptr addrspace(3) %gep)
+ store <2 x i32> %tmp0, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+declare <4 x i16> @llvm.amdgcn.ds.read.tr16.b64.v4i16(ptr addrspace(3))
+
+; CHECK: DIVERGENT: %tmp0 = call <4 x i16> @llvm.amdgcn.ds.read.tr16.b64.v4i16(ptr addrspace(3) %gep)
+define amdgpu_kernel void @ds_read_b64_tr_b16_v4i16(ptr addrspace(3) %addr, ptr addrspace(1) %out) {
+bb:
+ %gep = getelementptr i64, ptr addrspace(3) %addr, i16 4
+ %tmp0 = call <4 x i16> @llvm.amdgcn.ds.read.tr16.b64.v4i16(ptr addrspace(3) %gep)
+ store <4 x i16> %tmp0, ptr addrspace(1) %out, align 16
+ ret void
+}
+
declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half>, <8 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)
declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half>, <8 x half>, <16 x float>, i32 immarg, i32 immarg, i32 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll
new file mode 100644
index 00000000000000..0689af0d56268d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll
@@ -0,0 +1,108 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX950-SDAG %s
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX950-GISEL %s
+
+declare <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32.p3(ptr addrspace(3))
+declare <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32.p3(ptr addrspace(3))
+declare <3 x i32> @llvm.amdgcn.ds.read.tr6.b64.v3i32.p3(ptr addrspace(3))
+declare <4 x i16> @llvm.amdgcn.ds.read.tr16.b64.v4i16.p3(ptr addrspace(3))
+
+define amdgpu_ps void @ds_read_b64_tr_b4(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
+; GFX950-SDAG-LABEL: ds_read_b64_tr_b4:
+; GFX950-SDAG: ; %bb.0: ; %entry
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: ds_read_b64_tr_b4 v[0:1], v0 offset:32
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: ds_read_b64_tr_b4:
+; GFX950-GISEL: ; %bb.0: ; %entry
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: ds_read_b64_tr_b4 v[0:1], v0 offset:32
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
+; GFX950-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
+ %val = call <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32.p3(ptr addrspace(3) %gep)
+ store <2 x i32> %val, ptr addrspace(1) %use
+ ret void
+}
+
+define amdgpu_ps void @ds_read_b96_tr_b6(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
+; GFX950-SDAG-LABEL: ds_read_b96_tr_b6:
+; GFX950-SDAG: ; %bb.0: ; %entry
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-SDAG-NEXT: ds_read_b96_tr_b6 v[0:2], v0 offset:32
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: global_store_dwordx3 v[4:5], v[0:2], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: ds_read_b96_tr_b6:
+; GFX950-GISEL: ; %bb.0: ; %entry
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: ds_read_b96_tr_b6 v[0:2], v0 offset:32
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: global_store_dwordx3 v[4:5], v[0:2], off
+; GFX950-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
+ %val = call <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32.p3(ptr addrspace(3) %gep)
+ store <3 x i32> %val, ptr addrspace(1) %use
+ ret void
+}
+
+define amdgpu_ps void @ds_read_b64_tr_b8(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
+; GFX950-SDAG-LABEL: ds_read_b64_tr_b8:
+; GFX950-SDAG: ; %bb.0: ; %entry
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: ds_read_b64_tr_b8 v[0:1], v0 offset:32
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: ds_read_b64_tr_b8:
+; GFX950-GISEL: ; %bb.0: ; %entry
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: ds_read_b64_tr_b8 v[0:1], v0 offset:32
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
+; GFX950-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
+ %val = call <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32.p3(ptr addrspace(3) %gep)
+ store <2 x i32> %val, ptr addrspace(1) %use
+ ret void
+}
+
+define amdgpu_ps void @ds_read_b64_tr_b16(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
+; GFX950-SDAG-LABEL: ds_read_b64_tr_b16:
+; GFX950-SDAG: ; %bb.0: ; %entry
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: ds_read_b64_tr_b16 v[0:1], v0 offset:32
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: ds_read_b64_tr_b16:
+; GFX950-GISEL: ; %bb.0: ; %entry
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: ds_read_b64_tr_b16 v[0:1], v0 offset:32
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
+; GFX950-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
+ %val = call <4 x i16> @llvm.amdgcn.ds.read.tr16.b64.v4i16.p3(ptr addrspace(3) %gep)
+ store <4 x i16> %val, ptr addrspace(1) %use
+ ret void
+}
diff --git a/llvm/test/MC/AMDGPU/gfx950-unsupported.s b/llvm/test/MC/AMDGPU/gfx950-unsupported.s
index f8bbd40b700fd8..225784177ae185 100644
--- a/llvm/test/MC/AMDGPU/gfx950-unsupported.s
+++ b/llvm/test/MC/AMDGPU/gfx950-unsupported...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/117378
More information about the llvm-branch-commits
mailing list