[clang] [llvm] AMDGPU: Implement intrinsic/builtins for gfx1250 load transpose instructions (PR #146289)
Changpeng Fang via llvm-commits
llvm-commits at lists.llvm.org
Sun Jun 29 12:42:37 PDT 2025
https://github.com/changpeng created https://github.com/llvm/llvm-project/pull/146289
None
>From fc2039dcf338f04977b2a0b43e8714cb5eb0f440 Mon Sep 17 00:00:00 2001
From: Changpeng Fang <changpeng.fang at amd.com>
Date: Fri, 27 Jun 2025 14:59:33 -0700
Subject: [PATCH] AMDGPU: Implement intrinsic/builtins for gfx1250 load
transpose instructions
---
clang/include/clang/Basic/BuiltinsAMDGPU.def | 13 +
clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 36 ++
.../builtins-amdgcn-gfx1250-load-tr.cl | 130 +++++++
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 6 +
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 6 +
.../Target/AMDGPU/AMDGPUSearchableTables.td | 6 +
llvm/lib/Target/AMDGPU/DSInstructions.td | 17 +-
llvm/lib/Target/AMDGPU/FLATInstructions.td | 5 +
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 12 +
.../UniformityAnalysis/AMDGPU/intrinsics.ll | 72 ++++
.../AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll | 322 ++++++++++++++++++
11 files changed, 622 insertions(+), 3 deletions(-)
create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-load-tr.cl
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 1d1f5a4ee3f9f..4e28f3bb7ef81 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -640,6 +640,19 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_f16_f32, "V2hV2hfUiIb", "nc", "f32-to-f16
// GFX1250+ only builtins.
//===----------------------------------------------------------------------===//
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr4_b64_v2i32, "V2iV2i*1", "nc", "transpose-load-f4f6-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr8_b64_v2i32, "V2iV2i*1", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr6_b96_v3i32, "V3iV3i*1", "nc", "transpose-load-f4f6-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr16_b128_v8i16, "V8sV8s*1", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr16_b128_v8f16, "V8hV8h*1", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr16_b128_v8bf16, "V8yV8y*1", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr4_b64_v2i32, "V2iV2i*3", "nc", "transpose-load-f4f6-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr8_b64_v2i32, "V2iV2i*3", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr6_b96_v3i32, "V3iV3i*3", "nc", "transpose-load-f4f6-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr16_b128_v8i16, "V8sV8s*3", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr16_b128_v8f16, "V8hV8h*3", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr16_b128_v8bf16, "V8yV8y*3", "nc", "gfx1250-insts,wavefrontsize32")
+
TARGET_BUILTIN(__builtin_amdgcn_s_setprio_inc_wg, "vIs", "n", "setprio-inc-wg-inst")
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f16_fp8, "V2hs", "nc", "gfx1250-insts")
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index 982e5cd37ffd1..f09b3b92c4ea0 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -545,6 +545,18 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr4_b64_v2i32:
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr8_b64_v2i32:
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr6_b96_v3i32:
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8i16:
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8f16:
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8bf16:
+ case AMDGPU::BI__builtin_amdgcn_ds_load_tr4_b64_v2i32:
+ case AMDGPU::BI__builtin_amdgcn_ds_load_tr8_b64_v2i32:
+ case AMDGPU::BI__builtin_amdgcn_ds_load_tr6_b96_v3i32:
+ case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8i16:
+ case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8f16:
+ case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8bf16:
case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32:
case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32:
@@ -555,6 +567,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
switch (BuiltinID) {
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr8_b64_v2i32:
IID = Intrinsic::amdgcn_global_load_tr_b64;
break;
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
@@ -563,8 +576,31 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8i16:
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8f16:
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8bf16:
IID = Intrinsic::amdgcn_global_load_tr_b128;
break;
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr4_b64_v2i32:
+ IID = Intrinsic::amdgcn_global_load_tr4_b64;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr6_b96_v3i32:
+ IID = Intrinsic::amdgcn_global_load_tr6_b96;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_ds_load_tr4_b64_v2i32:
+ IID = Intrinsic::amdgcn_ds_load_tr4_b64;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_ds_load_tr6_b96_v3i32:
+ IID = Intrinsic::amdgcn_ds_load_tr6_b96;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_ds_load_tr8_b64_v2i32:
+ IID = Intrinsic::amdgcn_ds_load_tr8_b64;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8i16:
+ case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8f16:
+ case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8bf16:
+ IID = Intrinsic::amdgcn_ds_load_tr16_b128;
+ break;
case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32:
IID = Intrinsic::amdgcn_ds_read_tr4_b64;
break;
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-load-tr.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-load-tr.cl
new file mode 100644
index 0000000000000..1e3a88a41f90e
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-load-tr.cl
@@ -0,0 +1,130 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -target-feature +wavefrontsize32 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1250
+
+typedef int v2i __attribute__((ext_vector_type(2)));
+typedef int v3i __attribute__((ext_vector_type(3)));
+typedef int v4i __attribute__((ext_vector_type(4)));
+typedef short v8s __attribute__((ext_vector_type(8)));
+typedef half v8h __attribute__((ext_vector_type(8)));
+typedef __bf16 v8y __attribute__((ext_vector_type(8)));
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_tr4_b64_v2i32(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.global.load.tr4.b64.v2i32(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1250-NEXT: ret <2 x i32> [[TMP0]]
+//
+v2i test_amdgcn_global_load_tr4_b64_v2i32(global v2i* inptr)
+{
+ return __builtin_amdgcn_global_load_tr4_b64_v2i32(inptr);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_tr8_b64_v2i32(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1250-NEXT: ret <2 x i32> [[TMP0]]
+//
+v2i test_amdgcn_global_load_tr8_b64_v2i32(global v2i* inptr)
+{
+ return __builtin_amdgcn_global_load_tr8_b64_v2i32(inptr);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_tr6_b96_v3i32(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <3 x i32> @llvm.amdgcn.global.load.tr6.b96.v3i32(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1250-NEXT: ret <3 x i32> [[TMP0]]
+//
+v3i test_amdgcn_global_load_tr6_b96_v3i32(global v3i* inptr)
+{
+ return __builtin_amdgcn_global_load_tr6_b96_v3i32(inptr);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_tr16_b128_v8i16(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1250-NEXT: ret <8 x i16> [[TMP0]]
+//
+v8s test_amdgcn_global_load_tr16_b128_v8i16(global v8s* inptr)
+{
+ return __builtin_amdgcn_global_load_tr16_b128_v8i16(inptr);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_tr16_b128_v8f16(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1250-NEXT: ret <8 x half> [[TMP0]]
+//
+v8h test_amdgcn_global_load_tr16_b128_v8f16(global v8h* inptr)
+{
+ return __builtin_amdgcn_global_load_tr16_b128_v8f16(inptr);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_tr16_b128_v8bf16(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.amdgcn.global.load.tr.b128.v8bf16(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1250-NEXT: ret <8 x bfloat> [[TMP0]]
+//
+v8y test_amdgcn_global_load_tr16_b128_v8bf16(global v8y* inptr)
+{
+ return __builtin_amdgcn_global_load_tr16_b128_v8bf16(inptr);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_ds_load_tr4_b64_v2i32(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.ds.load.tr4.b64.v2i32(ptr addrspace(3) [[INPTR:%.*]])
+// CHECK-GFX1250-NEXT: ret <2 x i32> [[TMP0]]
+//
+v2i test_amdgcn_ds_load_tr4_b64_v2i32(local v2i* inptr)
+{
+ return __builtin_amdgcn_ds_load_tr4_b64_v2i32(inptr);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_ds_load_tr8_b64_v2i32(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32(ptr addrspace(3) [[INPTR:%.*]])
+// CHECK-GFX1250-NEXT: ret <2 x i32> [[TMP0]]
+//
+v2i test_amdgcn_ds_load_tr8_b64_v2i32(local v2i* inptr)
+{
+ return __builtin_amdgcn_ds_load_tr8_b64_v2i32(inptr);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_ds_load_tr6_b96_v3i32(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <3 x i32> @llvm.amdgcn.ds.load.tr6.b96.v3i32(ptr addrspace(3) [[INPTR:%.*]])
+// CHECK-GFX1250-NEXT: ret <3 x i32> [[TMP0]]
+//
+v3i test_amdgcn_ds_load_tr6_b96_v3i32(local v3i* inptr)
+{
+ return __builtin_amdgcn_ds_load_tr6_b96_v3i32(inptr);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_ds_load_tr16_b128_v8i16(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.ds.load.tr16.b128.v8i16(ptr addrspace(3) [[INPTR:%.*]])
+// CHECK-GFX1250-NEXT: ret <8 x i16> [[TMP0]]
+//
+v8s test_amdgcn_ds_load_tr16_b128_v8i16(local v8s* inptr)
+{
+ return __builtin_amdgcn_ds_load_tr16_b128_v8i16(inptr);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_ds_load_tr16_b128_v8f16(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) [[INPTR:%.*]])
+// CHECK-GFX1250-NEXT: ret <8 x half> [[TMP0]]
+//
+v8h test_amdgcn_ds_load_tr16_b128_v8f16(local v8h* inptr)
+{
+ return __builtin_amdgcn_ds_load_tr16_b128_v8f16(inptr);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_ds_load_tr16_b128_v8bf16(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.amdgcn.ds.load.tr16.b128.v8bf16(ptr addrspace(3) [[INPTR:%.*]])
+// CHECK-GFX1250-NEXT: ret <8 x bfloat> [[TMP0]]
+//
+v8y test_amdgcn_ds_load_tr16_b128_v8bf16(local v8y* inptr)
+{
+ return __builtin_amdgcn_ds_load_tr16_b128_v8bf16(inptr);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 6f974c97361de..ce37702b91486 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2996,6 +2996,12 @@ class AMDGPULoadIntrinsic<LLVMType ptr_ty>:
def int_amdgcn_global_load_tr_b64 : AMDGPULoadIntrinsic<global_ptr_ty>;
def int_amdgcn_global_load_tr_b128 : AMDGPULoadIntrinsic<global_ptr_ty>;
+def int_amdgcn_global_load_tr4_b64 : AMDGPULoadIntrinsic<global_ptr_ty>;
+def int_amdgcn_global_load_tr6_b96 : AMDGPULoadIntrinsic<global_ptr_ty>;
+def int_amdgcn_ds_load_tr8_b64 : AMDGPULoadIntrinsic<local_ptr_ty>;
+def int_amdgcn_ds_load_tr16_b128 : AMDGPULoadIntrinsic<local_ptr_ty>;
+def int_amdgcn_ds_load_tr4_b64 : AMDGPULoadIntrinsic<local_ptr_ty>;
+def int_amdgcn_ds_load_tr6_b96 : AMDGPULoadIntrinsic<local_ptr_ty>;
def int_amdgcn_ds_read_tr4_b64 : AMDGPULoadIntrinsic<local_ptr_ty>;
def int_amdgcn_ds_read_tr6_b96 : AMDGPULoadIntrinsic<local_ptr_ty>;
def int_amdgcn_ds_read_tr8_b64 : AMDGPULoadIntrinsic<local_ptr_ty>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index b20760c356263..6874657a4ffe7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -5105,6 +5105,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
case Intrinsic::amdgcn_global_load_tr_b64:
case Intrinsic::amdgcn_global_load_tr_b128:
+ case Intrinsic::amdgcn_global_load_tr4_b64:
+ case Intrinsic::amdgcn_global_load_tr6_b96:
+ case Intrinsic::amdgcn_ds_load_tr8_b64:
+ case Intrinsic::amdgcn_ds_load_tr16_b128:
+ case Intrinsic::amdgcn_ds_load_tr4_b64:
+ case Intrinsic::amdgcn_ds_load_tr6_b96:
case Intrinsic::amdgcn_ds_read_tr4_b64:
case Intrinsic::amdgcn_ds_read_tr6_b96:
case Intrinsic::amdgcn_ds_read_tr8_b64:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 3b62dcf3c92cd..1f6002a3c6a20 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -344,6 +344,12 @@ def : SourceOfDivergence<intr>;
def : SourceOfDivergence<int_amdgcn_global_load_tr_b64>;
def : SourceOfDivergence<int_amdgcn_global_load_tr_b128>;
+def : SourceOfDivergence<int_amdgcn_global_load_tr4_b64>;
+def : SourceOfDivergence<int_amdgcn_global_load_tr6_b96>;
+def : SourceOfDivergence<int_amdgcn_ds_load_tr8_b64>;
+def : SourceOfDivergence<int_amdgcn_ds_load_tr16_b128>;
+def : SourceOfDivergence<int_amdgcn_ds_load_tr4_b64>;
+def : SourceOfDivergence<int_amdgcn_ds_load_tr6_b96>;
def : SourceOfDivergence<int_amdgcn_ds_read_tr4_b64>;
def : SourceOfDivergence<int_amdgcn_ds_read_tr6_b96>;
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 445ba9a26d336..f824253ce0f35 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -789,12 +789,12 @@ let WaveSizePredicate = isWave32, mayStore = 0 in {
let OtherPredicates = [HasTransposeLoadF4F6Insts] in {
defm DS_LOAD_TR4_B64 : DS_1A_RET_NoM0<"ds_load_tr4_b64", VReg_64>;
defm DS_LOAD_TR6_B96 : DS_1A_RET_NoM0<"ds_load_tr6_b96", VReg_96>;
-} // let OtherPredicates = [HasTransposeLoadF4F6Insts]
+} // End OtherPredicates = [HasTransposeLoadF4F6Insts]
defm DS_LOAD_TR8_B64 : DS_1A_RET_NoM0<"ds_load_tr8_b64", VReg_64>;
defm DS_LOAD_TR16_B128 : DS_1A_RET_NoM0<"ds_load_tr16_b128", VReg_128>;
-} // let WaveSizePredicate = isWave32, mayStore = 0
+} // End WaveSizePredicate = isWave32, mayStore = 0
-} // let SubtargetPredicate = isGFX1250Plus
+} // End SubtargetPredicate = isGFX1250Plus
let WaveSizePredicate = isWave64, SubtargetPredicate = HasGFX950Insts, mayStore = 0 in {
defm DS_READ_B64_TR_B4 : DS_1A_RET_NoM0<"ds_read_b64_tr_b4", VReg_64>;
@@ -1276,6 +1276,17 @@ class DSLoadTrPat <DS_Pseudo inst, ValueType vt, SDPatternOperator node> : GCNPa
(inst $ptr, Offset:$offset, (i1 0))
>;
+let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX1250Plus in {
+let OtherPredicates = [HasTransposeLoadF4F6Insts] in {
+ def : DSLoadTrPat <DS_LOAD_TR4_B64, v2i32, int_amdgcn_ds_load_tr4_b64>;
+ def : DSLoadTrPat <DS_LOAD_TR6_B96, v3i32, int_amdgcn_ds_load_tr6_b96>;
+} // End OtherPredicates = [HasTransposeLoadF4F6Insts]
+
+ def : DSLoadTrPat <DS_LOAD_TR8_B64, v2i32, int_amdgcn_ds_load_tr8_b64>;
+ foreach vt = [v8i16, v8f16, v8bf16] in
+ def : DSLoadTrPat <DS_LOAD_TR16_B128, vt, int_amdgcn_ds_load_tr16_b128>;
+} // End WaveSizePredicate = isWave32, SubtargetPredicate = isGFX1250Plus
+
let SubtargetPredicate = HasGFX950Insts in {
def : DSLoadTrPat <DS_READ_B64_TR_B4, v2i32, int_amdgcn_ds_read_tr4_b64>;
def : DSLoadTrPat <DS_READ_B64_TR_B8, v2i32, int_amdgcn_ds_read_tr8_b64>;
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index c4db88b6e5105..dc6dbcef1f033 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1848,6 +1848,11 @@ let WaveSizePredicate = isWave64, OtherPredicates = [isGFX12PlusNot12_50] in {
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr_b128, vt>;
}
+let WaveSizePredicate = isWave32, OtherPredicates = [HasTransposeLoadF4F6Insts] in {
+ defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR4_B64, int_amdgcn_global_load_tr4_b64, v2i32>;
+ defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR6_B96, int_amdgcn_global_load_tr6_b96, v3i32>;
+}
+
let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in {
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 8d7dcf8c4a064..bb1de58e04fbc 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1444,6 +1444,12 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
MachineMemOperand::MOVolatile;
return true;
}
+ case Intrinsic::amdgcn_ds_load_tr6_b96:
+ case Intrinsic::amdgcn_ds_load_tr4_b64:
+ case Intrinsic::amdgcn_ds_load_tr8_b64:
+ case Intrinsic::amdgcn_ds_load_tr16_b128:
+ case Intrinsic::amdgcn_global_load_tr6_b96:
+ case Intrinsic::amdgcn_global_load_tr4_b64:
case Intrinsic::amdgcn_global_load_tr_b64:
case Intrinsic::amdgcn_global_load_tr_b128:
case Intrinsic::amdgcn_ds_read_tr4_b64:
@@ -1548,6 +1554,10 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
case Intrinsic::amdgcn_atomic_cond_sub_u32:
case Intrinsic::amdgcn_ds_append:
case Intrinsic::amdgcn_ds_consume:
+ case Intrinsic::amdgcn_ds_load_tr8_b64:
+ case Intrinsic::amdgcn_ds_load_tr16_b128:
+ case Intrinsic::amdgcn_ds_load_tr4_b64:
+ case Intrinsic::amdgcn_ds_load_tr6_b96:
case Intrinsic::amdgcn_ds_read_tr4_b64:
case Intrinsic::amdgcn_ds_read_tr6_b96:
case Intrinsic::amdgcn_ds_read_tr8_b64:
@@ -1562,6 +1572,8 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
case Intrinsic::amdgcn_global_load_tr_b64:
case Intrinsic::amdgcn_global_load_tr_b128:
+ case Intrinsic::amdgcn_global_load_tr4_b64:
+ case Intrinsic::amdgcn_global_load_tr6_b96:
Ptr = II->getArgOperand(0);
break;
case Intrinsic::amdgcn_load_to_lds:
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
index 1028cc9ebb342..bd7464577b7db 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
@@ -261,6 +261,70 @@ bb:
ret void
}
+; CHECK: DIVERGENT: %tmp0 = call <2 x i32> @llvm.amdgcn.global.load.tr4.b64.v2i32(ptr addrspace(1) %addr)
+define amdgpu_kernel void @global_load_tr4_b64_v2i32(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
+bb:
+ %tmp0 = call <2 x i32> @llvm.amdgcn.global.load.tr4.b64.v2i32(ptr addrspace(1) %addr)
+ store <2 x i32> %tmp0, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+; CHECK: DIVERGENT: %tmp0 = call <3 x i32> @llvm.amdgcn.global.load.tr6.b96.v3i32(ptr addrspace(1) %addr)
+define amdgpu_kernel void @global_load_tr6_b96_v3i32(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
+bb:
+ %tmp0 = call <3 x i32> @llvm.amdgcn.global.load.tr6.b96.v3i32(ptr addrspace(1) %addr)
+ store <3 x i32> %tmp0, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+; CHECK: DIVERGENT: %tmp0 = call <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32(ptr addrspace(3) %addr)
+define amdgpu_kernel void @ds_load_tr8_b64_v2i32(ptr addrspace(3) %addr, ptr addrspace(1) %out) {
+bb:
+ %tmp0 = call <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32(ptr addrspace(3) %addr)
+ store <2 x i32> %tmp0, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+; CHECK: DIVERGENT: %tmp0 = call <2 x i32> @llvm.amdgcn.ds.load.tr4.b64.v2i32(ptr addrspace(3) %addr)
+define amdgpu_kernel void @ds_load_tr4_b64_v2i32(ptr addrspace(3) %addr, ptr addrspace(1) %out) {
+bb:
+ %tmp0 = call <2 x i32> @llvm.amdgcn.ds.load.tr4.b64.v2i32(ptr addrspace(3) %addr)
+ store <2 x i32> %tmp0, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+; CHECK: DIVERGENT: %tmp0 = call <3 x i32> @llvm.amdgcn.ds.load.tr6.b96.v3i32(ptr addrspace(3) %addr)
+define amdgpu_kernel void @ds_load_tr6_b96_v3i32(ptr addrspace(3) %addr, ptr addrspace(1) %out) {
+bb:
+ %tmp0 = call <3 x i32> @llvm.amdgcn.ds.load.tr6.b96.v3i32(ptr addrspace(3) %addr)
+ store <3 x i32> %tmp0, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+; CHECK: DIVERGENT: %tmp0 = call <8 x i16> @llvm.amdgcn.ds.load.tr16.b128.v8i16(ptr addrspace(3) %addr)
+define amdgpu_kernel void @ds_load_tr16_b128_v8i16(ptr addrspace(3) %addr, ptr addrspace(1) %out) {
+bb:
+ %tmp0 = call <8 x i16> @llvm.amdgcn.ds.load.tr16.b128.v8i16(ptr addrspace(3) %addr)
+ store <8 x i16> %tmp0, ptr addrspace(1) %out, align 16
+ ret void
+}
+
+; CHECK: DIVERGENT: %tmp0 = call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) %addr)
+define amdgpu_kernel void @ds_load_tr16_b128_v8f16(ptr addrspace(3) %addr, ptr addrspace(1) %out) {
+bb:
+ %tmp0 = call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) %addr)
+ store <8 x half> %tmp0, ptr addrspace(1) %out, align 16
+ ret void
+}
+
+; CHECK: DIVERGENT: %tmp0 = call <8 x bfloat> @llvm.amdgcn.ds.load.tr16.b128.v8bf16(ptr addrspace(3) %addr)
+define amdgpu_kernel void @ds_load_tr16_b128_v8bf16(ptr addrspace(3) %addr, ptr addrspace(1) %out) {
+bb:
+ %tmp0 = call <8 x bfloat> @llvm.amdgcn.ds.load.tr16.b128.v8bf16(ptr addrspace(3) %addr)
+ store <8 x bfloat> %tmp0, ptr addrspace(1) %out, align 16
+ ret void
+}
+
declare <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32(ptr addrspace(3))
; CHECK: DIVERGENT: %tmp0 = call <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32(ptr addrspace(3) %gep)
@@ -563,6 +627,14 @@ declare i32 @llvm.amdgcn.global.load.tr.b64.i32(ptr addrspace(1))
declare <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16(ptr addrspace(1))
declare <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16(ptr addrspace(1))
declare <4 x bfloat> @llvm.amdgcn.global.load.tr.b128.v4bf16(ptr addrspace(1))
+declare <2 x i32> @llvm.amdgcn.global.load.tr4.b64.v2i32(ptr addrspace(1))
+declare <3 x i32> @llvm.amdgcn.global.load.tr6.b96.v3i32(ptr addrspace(1))
+declare <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32(ptr addrspace(3))
+declare <2 x i32> @llvm.amdgcn.ds.load.tr4.b64.v2i32(ptr addrspace(3))
+declare <3 x i32> @llvm.amdgcn.ds.load.tr6.b96.v3i32(ptr addrspace(3))
+declare <8 x i16> @llvm.amdgcn.ds.load.tr16.b128.v8i16(ptr addrspace(3))
+declare <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3))
+declare <8 x bfloat> @llvm.amdgcn.ds.load.tr16.b128.v8bf16(ptr addrspace(3))
declare i32 @llvm.amdgcn.dead.i32()
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll
new file mode 100644
index 0000000000000..d91b03ca4461d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll
@@ -0,0 +1,322 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s
+
+declare <2 x i32> @llvm.amdgcn.global.load.tr4.b64.v2i32.p1(ptr addrspace(1))
+declare <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32.p1(ptr addrspace(1))
+declare <3 x i32> @llvm.amdgcn.global.load.tr6.b96.v3i32.p1(ptr addrspace(1))
+declare <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16.p1(ptr addrspace(1))
+declare <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16.p1(ptr addrspace(1))
+declare <8 x bfloat> @llvm.amdgcn.global.load.tr.b128.v8bf16.p1(ptr addrspace(1))
+
+declare <2 x i32> @llvm.amdgcn.ds.load.tr4.b64.v2i32.p3(ptr addrspace(3))
+declare <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32.p3(ptr addrspace(3))
+declare <3 x i32> @llvm.amdgcn.ds.load.tr6.b96.v3i32.p3(ptr addrspace(3))
+declare <8 x i16> @llvm.amdgcn.ds.load.tr16.b128.v8i16.p3(ptr addrspace(3))
+declare <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16.p3(ptr addrspace(3))
+declare <8 x bfloat> @llvm.amdgcn.ds.load.tr16.b128.v8bf16.p3(ptr addrspace(3))
+
+
+define amdgpu_ps void @global_load_tr4_b64_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
+; GFX1250-LABEL: global_load_tr4_b64_vaddr:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_load_tr4_b64 v[0:1], v[0:1], off offset:32
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b64 v[2:3], v[0:1], off
+; GFX1250-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
+ %val = call <2 x i32> @llvm.amdgcn.global.load.tr4.b64.v2i32.p1(ptr addrspace(1) %gep)
+ store <2 x i32> %val, ptr addrspace(1) %use
+ ret void
+}
+
+define amdgpu_ps void @global_load_tr4_b64_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) {
+; GFX1250-LABEL: global_load_tr4_b64_saddr:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: global_load_tr4_b64 v[2:3], v2, s[0:1] offset:32
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1250-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
+ %val = call <2 x i32> @llvm.amdgcn.global.load.tr4.b64.v2i32.p1(ptr addrspace(1) %gep)
+ store <2 x i32> %val, ptr addrspace(1) %use
+ ret void
+}
+
+define amdgpu_ps void @global_load_tr8_b64_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
+; GFX1250-LABEL: global_load_tr8_b64_vaddr:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_load_tr8_b64 v[0:1], v[0:1], off offset:32
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b64 v[2:3], v[0:1], off
+; GFX1250-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
+ %val = call <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32.p1(ptr addrspace(1) %gep)
+ store <2 x i32> %val, ptr addrspace(1) %use
+ ret void
+}
+
+define amdgpu_ps void @global_load_tr8_b64_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) {
+; GFX1250-LABEL: global_load_tr8_b64_saddr:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: global_load_tr8_b64 v[2:3], v2, s[0:1] offset:32
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX1250-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
+ %val = call <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32.p1(ptr addrspace(1) %gep)
+ store <2 x i32> %val, ptr addrspace(1) %use
+ ret void
+}
+
+define amdgpu_ps void @global_load_tr6_b96_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
+; GFX1250-LABEL: global_load_tr6_b96_vaddr:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_load_tr6_b96 v[4:6], v[0:1], off offset:32
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b96 v[2:3], v[4:6], off
+; GFX1250-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
+ %val = call <3 x i32> @llvm.amdgcn.global.load.tr6.b96.v3i32.p1(ptr addrspace(1) %gep)
+ store <3 x i32> %val, ptr addrspace(1) %use
+ ret void
+}
+
+define amdgpu_ps void @global_load_tr6_b96_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) {
+; GFX1250-LABEL: global_load_tr6_b96_saddr:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: global_load_tr6_b96 v[2:4], v2, s[0:1] offset:32
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b96 v[0:1], v[2:4], off
+; GFX1250-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
+ %val = call <3 x i32> @llvm.amdgcn.global.load.tr6.b96.v3i32.p1(ptr addrspace(1) %gep)
+ store <3 x i32> %val, ptr addrspace(1) %use
+ ret void
+}
+
+define amdgpu_ps void @global_load_tr16_b128_v8i16_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
+; GFX1250-LABEL: global_load_tr16_b128_v8i16_vaddr:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_load_tr16_b128 v[4:7], v[0:1], off offset:32
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX1250-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
+ %val = call <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16.p1(ptr addrspace(1) %gep)
+ store <8 x i16> %val, ptr addrspace(1) %use
+ ret void
+}
+
+define amdgpu_ps void @global_load_tr16_b128_v8i16_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) {
+; GFX1250-LABEL: global_load_tr16_b128_v8i16_saddr:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: global_load_tr16_b128 v[2:5], v2, s[0:1] offset:32
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX1250-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
+ %val = call <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16.p1(ptr addrspace(1) %gep)
+ store <8 x i16> %val, ptr addrspace(1) %use
+ ret void
+}
+
+define amdgpu_ps void @global_load_tr16_b128_v8f16_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
+; GFX1250-LABEL: global_load_tr16_b128_v8f16_vaddr:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_load_tr16_b128 v[4:7], v[0:1], off offset:32
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX1250-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
+ %val = call <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16.p1(ptr addrspace(1) %gep)
+ store <8 x half> %val, ptr addrspace(1) %use
+ ret void
+}
+
+define amdgpu_ps void @global_load_tr16_b128_v8f16_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) {
+; GFX1250-LABEL: global_load_tr16_b128_v8f16_saddr:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: global_load_tr16_b128 v[2:5], v2, s[0:1] offset:32
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX1250-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
+ %val = call <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16.p1(ptr addrspace(1) %gep)
+ store <8 x half> %val, ptr addrspace(1) %use
+ ret void
+}
+
+define amdgpu_ps void @global_load_tr16_b128_v8bf16_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
+; GFX1250-LABEL: global_load_tr16_b128_v8bf16_vaddr:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_load_tr16_b128 v[4:7], v[0:1], off offset:32
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX1250-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
+ %val = call <8 x bfloat> @llvm.amdgcn.global.load.tr.b128.v8bf16.p1(ptr addrspace(1) %gep)
+ store <8 x bfloat> %val, ptr addrspace(1) %use
+ ret void
+}
+
+; <8 x bfloat> transpose load with a scalar (inreg) base; SADDR form with a
+; zero VGPR offset and the 32-byte GEP folded into offset:32.
+define amdgpu_ps void @global_load_tr16_b128_v8bf16_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) {
+; GFX1250-LABEL: global_load_tr16_b128_v8bf16_saddr:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: global_load_tr16_b128 v[2:5], v2, s[0:1] offset:32
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b128 v[0:1], v[2:5], off
+; GFX1250-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
+ %val = call <8 x bfloat> @llvm.amdgcn.global.load.tr.b128.v8bf16.p1(ptr addrspace(1) %gep)
+ store <8 x bfloat> %val, ptr addrspace(1) %use
+ ret void
+}
+
+; LDS (addrspace(3)) tr4_b64 transpose load returning <2 x i32>; SDAG and
+; GISEL differ only in how they shuffle the store address into a VGPR pair.
+define amdgpu_ps void @ds_load_tr4_b64(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
+; GFX1250-SDAG-LABEL: ds_load_tr4_b64:
+; GFX1250-SDAG: ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-SDAG-NEXT: ds_load_tr4_b64 v[0:1], v0 offset:32
+; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
+; GFX1250-SDAG-NEXT: global_store_b64 v[2:3], v[0:1], off
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: ds_load_tr4_b64:
+; GFX1250-GISEL: ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
+; GFX1250-GISEL-NEXT: ds_load_tr4_b64 v[0:1], v0 offset:32
+; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX1250-GISEL-NEXT: global_store_b64 v[4:5], v[0:1], off
+; GFX1250-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
+ %val = call <2 x i32> @llvm.amdgcn.ds.load.tr4.b64.v2i32.p3(ptr addrspace(3) %gep)
+ store <2 x i32> %val, ptr addrspace(1) %use
+ ret void
+}
+
+; LDS tr8_b64 transpose load returning <2 x i32>; the 32-byte GEP becomes the
+; DS instruction's immediate offset.
+define amdgpu_ps void @ds_load_tr8_b64(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
+; GFX1250-SDAG-LABEL: ds_load_tr8_b64:
+; GFX1250-SDAG: ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-SDAG-NEXT: ds_load_tr8_b64 v[0:1], v0 offset:32
+; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
+; GFX1250-SDAG-NEXT: global_store_b64 v[2:3], v[0:1], off
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: ds_load_tr8_b64:
+; GFX1250-GISEL: ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
+; GFX1250-GISEL-NEXT: ds_load_tr8_b64 v[0:1], v0 offset:32
+; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX1250-GISEL-NEXT: global_store_b64 v[4:5], v[0:1], off
+; GFX1250-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
+ %val = call <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32.p3(ptr addrspace(3) %gep)
+ store <2 x i32> %val, ptr addrspace(1) %use
+ ret void
+}
+
+; LDS tr6_b96 transpose load returning <3 x i32> (96-bit result, stored with
+; global_store_b96).
+define amdgpu_ps void @ds_load_tr6_b96(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
+; GFX1250-SDAG-LABEL: ds_load_tr6_b96:
+; GFX1250-SDAG: ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
+; GFX1250-SDAG-NEXT: ds_load_tr6_b96 v[0:2], v0 offset:32
+; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
+; GFX1250-SDAG-NEXT: global_store_b96 v[4:5], v[0:2], off
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: ds_load_tr6_b96:
+; GFX1250-GISEL: ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
+; GFX1250-GISEL-NEXT: ds_load_tr6_b96 v[0:2], v0 offset:32
+; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX1250-GISEL-NEXT: global_store_b96 v[4:5], v[0:2], off
+; GFX1250-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
+ %val = call <3 x i32> @llvm.amdgcn.ds.load.tr6.b96.v3i32.p3(ptr addrspace(3) %gep)
+ store <3 x i32> %val, ptr addrspace(1) %use
+ ret void
+}
+
+; LDS tr16_b128 transpose load returning <8 x i16> (128-bit result in
+; v[0:3]).
+define amdgpu_ps void @ds_load_tr16_b128_v8i16(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
+; GFX1250-SDAG-LABEL: ds_load_tr16_b128_v8i16:
+; GFX1250-SDAG: ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
+; GFX1250-SDAG-NEXT: ds_load_tr16_b128 v[0:3], v0 offset:32
+; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
+; GFX1250-SDAG-NEXT: global_store_b128 v[4:5], v[0:3], off
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: ds_load_tr16_b128_v8i16:
+; GFX1250-GISEL: ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
+; GFX1250-GISEL-NEXT: ds_load_tr16_b128 v[0:3], v0 offset:32
+; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX1250-GISEL-NEXT: global_store_b128 v[4:5], v[0:3], off
+; GFX1250-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
+ %val = call <8 x i16> @llvm.amdgcn.ds.load.tr16.b128.v8i16.p3(ptr addrspace(3) %gep)
+ store <8 x i16> %val, ptr addrspace(1) %use
+ ret void
+}
+
+; LDS tr16_b128 transpose load returning <8 x half>; codegen is the same as
+; the v8i16 variant since only the element type of the 128-bit result differs.
+define amdgpu_ps void @ds_load_tr16_b128_v8f16(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
+; GFX1250-SDAG-LABEL: ds_load_tr16_b128_v8f16:
+; GFX1250-SDAG: ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
+; GFX1250-SDAG-NEXT: ds_load_tr16_b128 v[0:3], v0 offset:32
+; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
+; GFX1250-SDAG-NEXT: global_store_b128 v[4:5], v[0:3], off
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: ds_load_tr16_b128_v8f16:
+; GFX1250-GISEL: ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
+; GFX1250-GISEL-NEXT: ds_load_tr16_b128 v[0:3], v0 offset:32
+; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX1250-GISEL-NEXT: global_store_b128 v[4:5], v[0:3], off
+; GFX1250-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
+ %val = call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16.p3(ptr addrspace(3) %gep)
+ store <8 x half> %val, ptr addrspace(1) %use
+ ret void
+}
+
+; LDS tr16_b128 transpose load returning <8 x bfloat>. This one carries a
+; common GFX1250 prefix rather than separate SDAG/GISEL checks — presumably
+; both selectors emit identical code here; confirm with the generator script.
+define amdgpu_ps void @ds_load_tr16_b128_v8bf16(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
+; GFX1250-LABEL: ds_load_tr16_b128_v8bf16:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
+; GFX1250-NEXT: ds_load_tr16_b128 v[0:3], v0 offset:32
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: global_store_b128 v[4:5], v[0:3], off
+; GFX1250-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
+ %val = call <8 x bfloat> @llvm.amdgcn.ds.load.tr16.b128.v8bf16.p3(ptr addrspace(3) %gep)
+ store <8 x bfloat> %val, ptr addrspace(1) %use
+ ret void
+}
More information about the llvm-commits
mailing list