[clang] [llvm] AMDGPU: Add support for load transpose instructions for gfx950 (PR #117378)
Matt Arsenault via cfe-commits
cfe-commits at lists.llvm.org
Mon Nov 25 09:33:44 PST 2024
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/117378
>From 0faa36e6240f21d77d94e981712f4b57bc4b5d11 Mon Sep 17 00:00:00 2001
From: Sirish Pande <Sirish.Pande at amd.com>
Date: Wed, 7 Feb 2024 10:12:03 -0600
Subject: [PATCH] AMDGPU: Add support for load transpose instructions for
gfx950
This patch support for intrinsics in clang, as well as assembly
instructions in the backend.
Co-authored-by: Sirish Pande <Sirish.Pande at amd.com>
---
clang/include/clang/Basic/BuiltinsAMDGPU.def | 5 +
clang/lib/CodeGen/CGBuiltin.cpp | 19 ++-
.../builtins-amdgcn-gfx950-read-tr.cl | 50 ++++++++
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 4 +
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 4 +
.../Target/AMDGPU/AMDGPUSearchableTables.td | 5 +
llvm/lib/Target/AMDGPU/DSInstructions.td | 33 ++++++
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 10 +-
.../UniformityAnalysis/AMDGPU/intrinsics.ll | 44 +++++++
.../AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll | 108 ++++++++++++++++++
llvm/test/MC/AMDGPU/gfx950-unsupported.s | 77 +++++++++++++
llvm/test/MC/AMDGPU/gfx950_asm_read_tr.s | 34 ++++++
.../AMDGPU/gfx950_dasm_ds_read_tr.txt | 37 ++++++
13 files changed, 427 insertions(+), 3 deletions(-)
create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-read-tr.cl
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll
create mode 100644 llvm/test/MC/AMDGPU/gfx950_asm_read_tr.s
create mode 100644 llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_ds_read_tr.txt
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 548bcc8ad55f48..a42ad56ce4f998 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -462,6 +462,11 @@ TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x64_fp8_fp8, "V16fV4iV8iV16fiIiI
TARGET_BUILTIN(__builtin_amdgcn_permlane16_swap, "V2UiUiUiIbIb", "nc", "permlane16-swap")
TARGET_BUILTIN(__builtin_amdgcn_permlane32_swap, "V2UiUiUiIbIb", "nc", "permlane32-swap")
+TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr4_b64_v2i32, "V2iV2i*3", "nc", "gfx950-insts")
+TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr6_b96_v3i32, "V3iV3i*3", "nc", "gfx950-insts")
+TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr8_b64_v2i32, "V2iV2i*3", "nc", "gfx950-insts")
+TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr16_b64_v4i16, "V4sV4s*3", "nc", "gfx950-insts")
+
//===----------------------------------------------------------------------===//
// GFX12+ only builtins.
//===----------------------------------------------------------------------===//
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 3db439c87fa326..91b70b4fdf3d20 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -19697,8 +19697,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
- case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16: {
-
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
+ case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
+ case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32:
+ case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32:
+ case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16: {
Intrinsic::ID IID;
switch (BuiltinID) {
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
@@ -19713,6 +19716,18 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
IID = Intrinsic::amdgcn_global_load_tr_b128;
break;
+ case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
+ IID = Intrinsic::amdgcn_ds_read_tr4_b64;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32:
+ IID = Intrinsic::amdgcn_ds_read_tr8_b64;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32:
+ IID = Intrinsic::amdgcn_ds_read_tr6_b96;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16:
+ IID = Intrinsic::amdgcn_ds_read_tr16_b64;
+ break;
}
llvm::Type *LoadTy = ConvertType(E->getType());
llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-read-tr.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-read-tr.cl
new file mode 100644
index 00000000000000..39fa46d5845f42
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-read-tr.cl
@@ -0,0 +1,50 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx950 -emit-llvm -o - %s | FileCheck --check-prefix=GFX950 %s
+
+typedef int v2i __attribute__((ext_vector_type(2)));
+typedef int v3i __attribute__((ext_vector_type(3)));
+typedef short v4s __attribute__((ext_vector_type(4)));
+
+// GFX950-LABEL: define dso_local <2 x i32> @test_amdgcn_ds_read_b64_tr_b4_v2i32(
+// GFX950-SAME: ptr addrspace(3) nocapture noundef readonly [[INPTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// GFX950-NEXT: entry:
+// GFX950-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32(ptr addrspace(3) [[INPTR]])
+// GFX950-NEXT: ret <2 x i32> [[TMP0]]
+//
+v2i test_amdgcn_ds_read_b64_tr_b4_v2i32(local v2i* inptr)
+{
+ return __builtin_amdgcn_ds_read_tr4_b64_v2i32(inptr);
+}
+
+// GFX950-LABEL: define dso_local <3 x i32> @test_amdgcn_ds_read_b96_tr_b6_v3i32(
+// GFX950-SAME: ptr addrspace(3) nocapture noundef readonly [[INPTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// GFX950-NEXT: entry:
+// GFX950-NEXT: [[TMP0:%.*]] = tail call <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32(ptr addrspace(3) [[INPTR]])
+// GFX950-NEXT: ret <3 x i32> [[TMP0]]
+//
+v3i test_amdgcn_ds_read_b96_tr_b6_v3i32(local v3i* inptr)
+{
+ return __builtin_amdgcn_ds_read_tr6_b96_v3i32(inptr);
+}
+
+// GFX950-LABEL: define dso_local <2 x i32> @test_amdgcn_ds_read_b64_tr_b8_v2i32(
+// GFX950-SAME: ptr addrspace(3) nocapture noundef readonly [[INPTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// GFX950-NEXT: entry:
+// GFX950-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32(ptr addrspace(3) [[INPTR]])
+// GFX950-NEXT: ret <2 x i32> [[TMP0]]
+//
+v2i test_amdgcn_ds_read_b64_tr_b8_v2i32(local v2i* inptr)
+{
+ return __builtin_amdgcn_ds_read_tr8_b64_v2i32(inptr);
+}
+
+// GFX950-LABEL: define dso_local <4 x i16> @test_amdgcn_ds_read_b64_tr_b16_v2i16(
+// GFX950-SAME: ptr addrspace(3) nocapture noundef readonly [[INPTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// GFX950-NEXT: entry:
+// GFX950-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.ds.read.tr16.b64.v4i16(ptr addrspace(3) [[INPTR]])
+// GFX950-NEXT: ret <4 x i16> [[TMP0]]
+//
+v4s test_amdgcn_ds_read_b64_tr_b16_v2i16(local v4s* inptr)
+{
+ return __builtin_amdgcn_ds_read_tr16_b64_v4i16(inptr);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 73599851f50003..b46fe668ea7afd 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2726,6 +2726,10 @@ class AMDGPULoadIntrinsic<LLVMType ptr_ty>:
def int_amdgcn_global_load_tr_b64 : AMDGPULoadIntrinsic<global_ptr_ty>;
def int_amdgcn_global_load_tr_b128 : AMDGPULoadIntrinsic<global_ptr_ty>;
+def int_amdgcn_ds_read_tr4_b64 : AMDGPULoadIntrinsic<local_ptr_ty>;
+def int_amdgcn_ds_read_tr6_b96 : AMDGPULoadIntrinsic<local_ptr_ty>;
+def int_amdgcn_ds_read_tr8_b64 : AMDGPULoadIntrinsic<local_ptr_ty>;
+def int_amdgcn_ds_read_tr16_b64 : AMDGPULoadIntrinsic<local_ptr_ty>;
// i32 @llvm.amdgcn.wave.id()
def int_amdgcn_wave_id :
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 8c050348f753bb..b3a6f0fd09ea02 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4967,6 +4967,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
case Intrinsic::amdgcn_global_load_tr_b64:
case Intrinsic::amdgcn_global_load_tr_b128:
+ case Intrinsic::amdgcn_ds_read_tr4_b64:
+ case Intrinsic::amdgcn_ds_read_tr6_b96:
+ case Intrinsic::amdgcn_ds_read_tr8_b64:
+ case Intrinsic::amdgcn_ds_read_tr16_b64:
return getDefaultMappingAllVGPR(MI);
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap: {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index bc8b373d06e01a..10175557fadc71 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -345,6 +345,11 @@ def : SourceOfDivergence<intr>;
def : SourceOfDivergence<int_amdgcn_global_load_tr_b64>;
def : SourceOfDivergence<int_amdgcn_global_load_tr_b128>;
+def : SourceOfDivergence<int_amdgcn_ds_read_tr4_b64>;
+def : SourceOfDivergence<int_amdgcn_ds_read_tr6_b96>;
+def : SourceOfDivergence<int_amdgcn_ds_read_tr8_b64>;
+def : SourceOfDivergence<int_amdgcn_ds_read_tr16_b64>;
+
// The dummy boolean output is divergent from the IR's perspective,
// but the mask results are uniform. These produce a divergent and
// uniform result, so the returned struct is collectively divergent.
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 061ffda2498f45..7cbd6d2dc62097 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -294,6 +294,12 @@ multiclass DS_1A_RET_mc<string opName, RegisterClass rc = VGPR_32, bit HasTiedOu
}
}
+multiclass DS_1A_RET_NoM0<string opName, RegisterClass rc = VGPR_32> {
+ let has_m0_read = 0 in {
+ def "" : DS_1A_RET<opName, rc>;
+ }
+}
+
class DS_1A_RET_Tied<string opName, RegisterClass rc = VGPR_32> :
DS_1A_RET<opName, rc, 1>;
@@ -744,6 +750,13 @@ multiclass DSAtomicRetNoRetPatIntrinsic_mc<DS_Pseudo inst, DS_Pseudo noRetInst,
defm : DSAtomicRetNoRetPatIntrinsic_mc<DS_COND_SUB_RTN_U32, DS_COND_SUB_U32, i32, "int_amdgcn_atomic_cond_sub_u32">;
} // let SubtargetPredicate = isGFX12Plus
+let WaveSizePredicate = isWave64, SubtargetPredicate = HasGFX950Insts, mayStore = 0 in {
+ defm DS_READ_B64_TR_B4 : DS_1A_RET_NoM0<"ds_read_b64_tr_b4", VReg_64>;
+ defm DS_READ_B64_TR_B8 : DS_1A_RET_NoM0<"ds_read_b64_tr_b8", VReg_64>;
+ defm DS_READ_B64_TR_B16 : DS_1A_RET_NoM0<"ds_read_b64_tr_b16", VReg_64>;
+ defm DS_READ_B96_TR_B6 : DS_1A_RET_NoM0<"ds_read_b96_tr_b6", VReg_96>;
+}
+
//===----------------------------------------------------------------------===//
// DS Patterns
//===----------------------------------------------------------------------===//
@@ -1179,6 +1192,18 @@ def : GCNPat <
sub0)
>;
+class DSLoadTrPat <DS_Pseudo inst, ValueType vt, SDPatternOperator node> : GCNPat <
+ (vt (node (DS1Addr1Offset i32:$ptr, i32:$offset))),
+ (inst $ptr, Offset:$offset, (i1 0))
+>;
+
+let SubtargetPredicate = HasGFX950Insts in {
+ def : DSLoadTrPat <DS_READ_B64_TR_B4, v2i32, int_amdgcn_ds_read_tr4_b64>;
+ def : DSLoadTrPat <DS_READ_B64_TR_B8, v2i32, int_amdgcn_ds_read_tr8_b64>;
+ def : DSLoadTrPat <DS_READ_B96_TR_B6, v3i32, int_amdgcn_ds_read_tr6_b96>;
+ def : DSLoadTrPat <DS_READ_B64_TR_B16, v4i16, int_amdgcn_ds_read_tr16_b64>;
+}
+
//===----------------------------------------------------------------------===//
// Target-specific instruction encodings.
//===----------------------------------------------------------------------===//
@@ -1748,3 +1773,11 @@ def DS_PK_ADD_F16_vi : DS_Real_vi<0x17, DS_PK_ADD_F16>;
def DS_PK_ADD_RTN_F16_vi : DS_Real_vi<0xb7, DS_PK_ADD_RTN_F16>;
def DS_PK_ADD_BF16_vi : DS_Real_vi<0x18, DS_PK_ADD_BF16>;
def DS_PK_ADD_RTN_BF16_vi : DS_Real_vi<0xb8, DS_PK_ADD_RTN_BF16>;
+
+//===----------------------------------------------------------------------===//
+// GFX950.
+//===----------------------------------------------------------------------===//
+def DS_READ_B64_TR_B4_vi : DS_Real_vi<0x0e0, DS_READ_B64_TR_B4>;
+def DS_READ_B96_TR_B6_vi : DS_Real_vi<0x0e1, DS_READ_B96_TR_B6>;
+def DS_READ_B64_TR_B8_vi : DS_Real_vi<0x0e2, DS_READ_B64_TR_B8>;
+def DS_READ_B64_TR_B16_vi : DS_Real_vi<0x0e3, DS_READ_B64_TR_B16>;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index f3b5e6985e8e0d..d35bb15ac6566a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1382,7 +1382,11 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return true;
}
case Intrinsic::amdgcn_global_load_tr_b64:
- case Intrinsic::amdgcn_global_load_tr_b128: {
+ case Intrinsic::amdgcn_global_load_tr_b128:
+ case Intrinsic::amdgcn_ds_read_tr4_b64:
+ case Intrinsic::amdgcn_ds_read_tr6_b96:
+ case Intrinsic::amdgcn_ds_read_tr8_b64:
+ case Intrinsic::amdgcn_ds_read_tr16_b64: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(CI.getType());
Info.ptrVal = CI.getOperand(0);
@@ -1477,6 +1481,10 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
case Intrinsic::amdgcn_atomic_cond_sub_u32:
case Intrinsic::amdgcn_ds_append:
case Intrinsic::amdgcn_ds_consume:
+ case Intrinsic::amdgcn_ds_read_tr4_b64:
+ case Intrinsic::amdgcn_ds_read_tr6_b96:
+ case Intrinsic::amdgcn_ds_read_tr8_b64:
+ case Intrinsic::amdgcn_ds_read_tr16_b64:
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap:
case Intrinsic::amdgcn_flat_atomic_fmax_num:
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
index 0dfd1d880f9cf2..fb1420ee340043 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
@@ -261,6 +261,50 @@ bb:
ret void
}
+declare <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32(ptr addrspace(3))
+
+; CHECK: DIVERGENT: %tmp0 = call <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32(ptr addrspace(3) %gep)
+define amdgpu_kernel void @ds_read_b64_tr4_v2i32(ptr addrspace(3) %addr, ptr addrspace(1) %out) {
+bb:
+ %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
+ %tmp0 = call <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32(ptr addrspace(3) %gep)
+ store <2 x i32> %tmp0, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+declare <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32(ptr addrspace(3))
+
+; CHECK: DIVERGENT: %tmp0 = call <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32(ptr addrspace(3) %gep)
+define amdgpu_kernel void @ds_read_b96_tr6_v3i32(ptr addrspace(3) %addr, ptr addrspace(1) %out) {
+bb:
+ %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
+ %tmp0 = call <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32(ptr addrspace(3) %gep)
+ store <3 x i32> %tmp0, ptr addrspace(1) %out, align 16
+ ret void
+}
+
+declare <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32(ptr addrspace(3))
+
+; CHECK: DIVERGENT: %tmp0 = call <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32(ptr addrspace(3) %gep)
+define amdgpu_kernel void @ds_read_b64_tr8_v2i32(ptr addrspace(3) %addr, ptr addrspace(1) %out) {
+bb:
+ %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
+ %tmp0 = call <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32(ptr addrspace(3) %gep)
+ store <2 x i32> %tmp0, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+declare <4 x i16> @llvm.amdgcn.ds.read.tr16.b64.v4i16(ptr addrspace(3))
+
+; CHECK: DIVERGENT: %tmp0 = call <4 x i16> @llvm.amdgcn.ds.read.tr16.b64.v4i16(ptr addrspace(3) %gep)
+define amdgpu_kernel void @ds_read_b64_tr_b16_v4i16(ptr addrspace(3) %addr, ptr addrspace(1) %out) {
+bb:
+ %gep = getelementptr i64, ptr addrspace(3) %addr, i16 4
+ %tmp0 = call <4 x i16> @llvm.amdgcn.ds.read.tr16.b64.v4i16(ptr addrspace(3) %gep)
+ store <4 x i16> %tmp0, ptr addrspace(1) %out, align 16
+ ret void
+}
+
declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half>, <8 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)
declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half>, <8 x half>, <16 x float>, i32 immarg, i32 immarg, i32 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll
new file mode 100644
index 00000000000000..0689af0d56268d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll
@@ -0,0 +1,108 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX950-SDAG %s
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX950-GISEL %s
+
+declare <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32.p3(ptr addrspace(3))
+declare <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32.p3(ptr addrspace(3))
+declare <3 x i32> @llvm.amdgcn.ds.read.tr6.b64.v3i32.p3(ptr addrspace(3))
+declare <4 x i16> @llvm.amdgcn.ds.read.tr16.b64.v4i16.p3(ptr addrspace(3))
+
+define amdgpu_ps void @ds_read_b64_tr_b4(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
+; GFX950-SDAG-LABEL: ds_read_b64_tr_b4:
+; GFX950-SDAG: ; %bb.0: ; %entry
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: ds_read_b64_tr_b4 v[0:1], v0 offset:32
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: ds_read_b64_tr_b4:
+; GFX950-GISEL: ; %bb.0: ; %entry
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: ds_read_b64_tr_b4 v[0:1], v0 offset:32
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
+; GFX950-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
+ %val = call <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32.p3(ptr addrspace(3) %gep)
+ store <2 x i32> %val, ptr addrspace(1) %use
+ ret void
+}
+
+define amdgpu_ps void @ds_read_b96_tr_b6(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
+; GFX950-SDAG-LABEL: ds_read_b96_tr_b6:
+; GFX950-SDAG: ; %bb.0: ; %entry
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-SDAG-NEXT: ds_read_b96_tr_b6 v[0:2], v0 offset:32
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: global_store_dwordx3 v[4:5], v[0:2], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: ds_read_b96_tr_b6:
+; GFX950-GISEL: ; %bb.0: ; %entry
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: ds_read_b96_tr_b6 v[0:2], v0 offset:32
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: global_store_dwordx3 v[4:5], v[0:2], off
+; GFX950-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
+ %val = call <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32.p3(ptr addrspace(3) %gep)
+ store <3 x i32> %val, ptr addrspace(1) %use
+ ret void
+}
+
+define amdgpu_ps void @ds_read_b64_tr_b8(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
+; GFX950-SDAG-LABEL: ds_read_b64_tr_b8:
+; GFX950-SDAG: ; %bb.0: ; %entry
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: ds_read_b64_tr_b8 v[0:1], v0 offset:32
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: ds_read_b64_tr_b8:
+; GFX950-GISEL: ; %bb.0: ; %entry
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: ds_read_b64_tr_b8 v[0:1], v0 offset:32
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
+; GFX950-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
+ %val = call <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32.p3(ptr addrspace(3) %gep)
+ store <2 x i32> %val, ptr addrspace(1) %use
+ ret void
+}
+
+define amdgpu_ps void @ds_read_b64_tr_b16(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
+; GFX950-SDAG-LABEL: ds_read_b64_tr_b16:
+; GFX950-SDAG: ; %bb.0: ; %entry
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: ds_read_b64_tr_b16 v[0:1], v0 offset:32
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: ds_read_b64_tr_b16:
+; GFX950-GISEL: ; %bb.0: ; %entry
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: ds_read_b64_tr_b16 v[0:1], v0 offset:32
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
+; GFX950-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
+ %val = call <4 x i16> @llvm.amdgcn.ds.read.tr16.b64.v4i16.p3(ptr addrspace(3) %gep)
+ store <4 x i16> %val, ptr addrspace(1) %use
+ ret void
+}
diff --git a/llvm/test/MC/AMDGPU/gfx950-unsupported.s b/llvm/test/MC/AMDGPU/gfx950-unsupported.s
index f8bbd40b700fd8..225784177ae185 100644
--- a/llvm/test/MC/AMDGPU/gfx950-unsupported.s
+++ b/llvm/test/MC/AMDGPU/gfx950-unsupported.s
@@ -1,4 +1,5 @@
// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx950 %s 2>&1 | FileCheck -check-prefix=ERR %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx950 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck -check-prefix=W32-ERR %s
//===----------------------------------------------------------------------===//
// v_mfma_f32_32x32x4_xf32
@@ -177,3 +178,79 @@ v_mfma_f32_16x16x8_xf32 a[0:3], v[0:3], v[0:3], a[4:7]
v_mfma_f32_16x16x8_xf32 v[0:3], a[0:3], a[0:3], v[4:7]
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+//===----------------------------------------------------------------------===//
+// ds_read_b64_tr_b4
+//===----------------------------------------------------------------------===//
+ds_read_b64_tr_b4 v[1:2], v0
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid register class: vgpr tuples must be 64 bit aligned
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+ds_read_b64_tr_b4 v1, v0
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+ds_read_b64_tr_b4 v[0:1], s0
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+ds_read_b64_tr_b4 v[2:3], v2 offset:-64
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: expected a 16-bit unsigned offset
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+//===----------------------------------------------------------------------===//
+//ds_read_b64_tr_b8
+//===----------------------------------------------------------------------===//
+ds_read_b64_tr_b8 v[1:2], v0
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid register class: vgpr tuples must be 64 bit aligned
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+ds_read_b64_tr_b8 v1, v0
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+ds_read_b64_tr_b8 v[0:1], s0
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+ds_read_b64_tr_b8 v[2:3], v2 offset:-64
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: expected a 16-bit unsigned offset
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+//===----------------------------------------------------------------------===//
+// ds_read_b64_tr_b16
+//===----------------------------------------------------------------------===//
+ds_read_b64_tr_b16 v[1:2], v0
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid register class: vgpr tuples must be 64 bit aligned
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+ds_read_b64_tr_b16 v1, v0
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+ds_read_b64_tr_b16 v[0:1], s0
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+ds_read_b64_tr_b16 v[2:3], v2 offset:-64
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: expected a 16-bit unsigned offset
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+//===----------------------------------------------------------------------===//
+// ds_read_b96_tr_b6
+//===----------------------------------------------------------------------===//
+ds_read_b96_tr_b6 v[1:3], v0
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid register class: vgpr tuples must be 64 bit aligned
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+ds_read_b96_tr_b6 v1, v0
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+ds_read_b96_tr_b6 v[0:3], s0
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+ds_read_b96_tr_b6 v[2:4], v2 offset:-64
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: expected a 16-bit unsigned offset
+// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx950_asm_read_tr.s b/llvm/test/MC/AMDGPU/gfx950_asm_read_tr.s
new file mode 100644
index 00000000000000..93d015f790c862
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx950_asm_read_tr.s
@@ -0,0 +1,34 @@
+// RUN: llvm-mc -arch=amdgcn -mcpu=gfx950 -show-encoding %s | FileCheck --check-prefix=GFX950 %s
+// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx940 %s 2>&1 | FileCheck --check-prefix=GFX940-ERR --implicit-check-not=error: %s
+
+ds_read_b64_tr_b4 v[0:1], v1
+// GFX940-ERR: [[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+// GFX950: encoding: [0x00,0x00,0xc0,0xd9,0x01,0x00,0x00,0x00]
+
+ds_read_b64_tr_b4 v[2:3], v3 offset:64
+// GFX940-ERR: [[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+// GFX950: encoding: [0x40,0x00,0xc0,0xd9,0x03,0x00,0x00,0x02]
+
+ds_read_b64_tr_b8 v[0:1], v1
+// GFX940-ERR: [[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+// GFX950: encoding: [0x00,0x00,0xc4,0xd9,0x01,0x00,0x00,0x00]
+
+ds_read_b64_tr_b8 v[2:3], v3 offset:64
+// GFX940-ERR: [[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+// GFX950: encoding: [0x40,0x00,0xc4,0xd9,0x03,0x00,0x00,0x02]
+
+ds_read_b64_tr_b16 v[0:1], v1
+// GFX940-ERR: [[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+// GFX950: encoding: [0x00,0x00,0xc6,0xd9,0x01,0x00,0x00,0x00]
+
+ds_read_b64_tr_b16 v[2:3], v3 offset:64
+// GFX940-ERR: [[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+// GFX950: encoding: [0x40,0x00,0xc6,0xd9,0x03,0x00,0x00,0x02]
+
+ds_read_b96_tr_b6 v[0:2], v0
+// GFX940-ERR: [[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+// GFX950: encoding: [0x00,0x00,0xc2,0xd9,0x00,0x00,0x00,0x00]
+
+ds_read_b96_tr_b6 v[2:4], v2 offset:64
+// GFX940-ERR: [[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+// GFX950: encoding: [0x40,0x00,0xc2,0xd9,0x02,0x00,0x00,0x02]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_ds_read_tr.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_ds_read_tr.txt
new file mode 100644
index 00000000000000..1efd2d7b996d48
--- /dev/null
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_ds_read_tr.txt
@@ -0,0 +1,37 @@
+# RUN: llvm-mc -arch=amdgcn -mcpu=gfx950 -disassemble -show-encoding %s | FileCheck -check-prefix=GFX950 %s
+
+# GFX950: ds_read_b64_tr_b4 v[0:1], v0 ; encoding: [0x00,0x00,0xc0,0xd9,0x00,0x00,0x00,0x00]
+0x00,0x00,0xc0,0xd9,0x00,0x00,0x00,0x00
+
+# GFX950: ds_read_b64_tr_b4 v[2:3], v2 ; encoding: [0x00,0x00,0xc0,0xd9,0x02,0x00,0x00,0x02]
+0x00,0x00,0xc0,0xd9,0x02,0x00,0x00,0x02
+
+# GFX950: ds_read_b64_tr_b4 v[2:3], v2 offset:64 ; encoding: [0x40,0x00,0xc0,0xd9,0x02,0x00,0x00,0x02]
+0x40,0x00,0xc0,0xd9,0x02,0x00,0x00,0x02
+
+# GFX950: ds_read_b64_tr_b8 v[0:1], v0 ; encoding: [0x00,0x00,0xc4,0xd9,0x00,0x00,0x00,0x00]
+0x00,0x00,0xc4,0xd9,0x00,0x00,0x00,0x00
+
+# GFX950: ds_read_b64_tr_b8 v[2:3], v2 ; encoding: [0x00,0x00,0xc4,0xd9,0x02,0x00,0x00,0x02]
+0x00,0x00,0xc4,0xd9,0x02,0x00,0x00,0x02
+
+# GFX950: ds_read_b64_tr_b8 v[2:3], v2 offset:64 ; encoding: [0x40,0x00,0xc4,0xd9,0x02,0x00,0x00,0x02]
+0x40,0x00,0xc4,0xd9,0x02,0x00,0x00,0x02
+
+# GFX950: ds_read_b64_tr_b16 v[0:1], v0 ; encoding: [0x00,0x00,0xc6,0xd9,0x00,0x00,0x00,0x00]
+0x00,0x00,0xc6,0xd9,0x00,0x00,0x00,0x00
+
+# GFX950: ds_read_b64_tr_b16 v[2:3], v2 ; encoding: [0x00,0x00,0xc6,0xd9,0x02,0x00,0x00,0x02]
+0x00,0x00,0xc6,0xd9,0x02,0x00,0x00,0x02
+
+# GFX950: ds_read_b64_tr_b16 v[2:3], v2 offset:64 ; encoding: [0x40,0x00,0xc6,0xd9,0x02,0x00,0x00,0x02]
+0x40,0x00,0xc6,0xd9,0x02,0x00,0x00,0x02
+
+# GFX950: ds_read_b96_tr_b6 v[0:2], v0 ; encoding: [0x00,0x00,0xc2,0xd9,0x00,0x00,0x00,0x00]
+0x00,0x00,0xc2,0xd9,0x00,0x00,0x00,0x00
+
+# GFX950: ds_read_b96_tr_b6 v[2:4], v2 ; encoding: [0x00,0x00,0xc2,0xd9,0x02,0x00,0x00,0x02]
+0x00,0x00,0xc2,0xd9,0x02,0x00,0x00,0x02
+
+# GFX950: ds_read_b96_tr_b6 v[2:4], v2 offset:64 ; encoding: [0x40,0x00,0xc2,0xd9,0x02,0x00,0x00,0x02]
+0x40,0x00,0xc2,0xd9,0x02,0x00,0x00,0x02
More information about the cfe-commits
mailing list