[llvm] [clang] [AMDGPU] Add global_load_tr for GFX12 (PR #77772)
Piotr Sobczak via cfe-commits
cfe-commits at lists.llvm.org
Tue Jan 16 03:47:15 PST 2024
https://github.com/piotrAMD updated https://github.com/llvm/llvm-project/pull/77772
>From 1b2085465dd0988459a4c71dab6cd65b1de065be Mon Sep 17 00:00:00 2001
From: Piotr Sobczak <piotr.sobczak at amd.com>
Date: Thu, 11 Jan 2024 14:52:59 +0100
Subject: [PATCH 1/5] [AMDGPU] Add global_load_tr for GFX12
Support new amdgcn_global_load_tr instructions for load with transpose.
* MC layer support for GLOBAL_LOAD_TR_B64/GLOBAL_LOAD_TR_B128
* Intrinsics int_amdgcn_global_load_tr_b64/int_amdgcn_global_load_tr_b128
* Clang builtins amdgcn_global_load_tr_b64/amdgcn_global_load_tr_b128
---
clang/include/clang/Basic/BuiltinsAMDGPU.def | 7 ++
clang/lib/CodeGen/CGBuiltin.cpp | 45 ++++++++
...uiltins-amdgcn-global-load-tr-gfx11-err.cl | 26 +++++
...ins-amdgcn-global-load-tr-gfx12-w32-err.cl | 15 +++
...ins-amdgcn-global-load-tr-gfx12-w64-err.cl | 16 +++
.../builtins-amdgcn-global-load-tr-w32.cl | 48 ++++++++
.../builtins-amdgcn-global-load-tr-w64.cl | 47 ++++++++
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 21 ++++
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 2 +
.../Disassembler/AMDGPUDisassembler.cpp | 4 +
llvm/lib/Target/AMDGPU/FLATInstructions.td | 33 ++++++
.../AMDGPU/llvm.amdgcn.global.load.tr-w32.ll | 106 ++++++++++++++++++
.../AMDGPU/llvm.amdgcn.global.load.tr-w64.ll | 106 ++++++++++++++++++
llvm/test/MC/AMDGPU/gfx11_unsupported.s | 6 +
.../test/MC/AMDGPU/gfx12_asm_global_load_tr.s | 103 +++++++++++++++++
.../AMDGPU/gfx12_dasm_global_load_tr.txt | 34 ++++++
16 files changed, 619 insertions(+)
create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl
create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl
create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl
create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl
create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
create mode 100644 llvm/test/MC/AMDGPU/gfx12_asm_global_load_tr.s
create mode 100644 llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_global_load_tr.txt
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index e562ef04a30194e..098c309f8085375 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -423,6 +423,13 @@ TARGET_BUILTIN(__builtin_amdgcn_s_wakeup_barrier, "vi", "n", "gfx12-insts")
TARGET_BUILTIN(__builtin_amdgcn_s_barrier_leave, "b", "n", "gfx12-insts")
TARGET_BUILTIN(__builtin_amdgcn_s_get_barrier_state, "Uii", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_v2i32, "V2iV2i*1", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8i16, "V8sV8s*1", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8f16, "V8hV8h*1", "nc", "gfx12-insts,wavefrontsize32")
+
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_i32, "ii*1", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4i16, "V4sV4s*1", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4f16, "V4hV4h*1", "nc", "gfx12-insts,wavefrontsize64")
#undef BUILTIN
#undef TARGET_BUILTIN
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 998fcc3af581753..dc634b1c388f46f 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18178,6 +18178,51 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
llvm::Function *F = CGM.getIntrinsic(IID, {ArgTy});
return Builder.CreateCall(F, {Addr, Val, ZeroI32, ZeroI32, ZeroI1});
}
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16: {
+ // Lower each global_load_tr builtin to a call of the matching overloaded intrinsic.
+ Intrinsic::ID IID; // amdgcn_global_load_tr_b64 or amdgcn_global_load_tr_b128, chosen below.
+ llvm::Type *ArgTy; // Overload type handed to getIntrinsic; mirrors the builtin's name suffix.
+ switch (BuiltinID) { // No default: these cases must mirror the outer labels so IID/ArgTy are always set.
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32: // <2 x i32>
+ ArgTy = llvm::FixedVectorType::get(
+ llvm::Type::getInt32Ty(getLLVMContext()), 2);
+ IID = Intrinsic::amdgcn_global_load_tr_b64;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32: // scalar i32
+ ArgTy = llvm::Type::getInt32Ty(getLLVMContext());
+ IID = Intrinsic::amdgcn_global_load_tr_b64;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16: // <8 x i16>
+ ArgTy = llvm::FixedVectorType::get(
+ llvm::Type::getInt16Ty(getLLVMContext()), 8);
+ IID = Intrinsic::amdgcn_global_load_tr_b128;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16: // <8 x half>
+ ArgTy = llvm::FixedVectorType::get(
+ llvm::Type::getHalfTy(getLLVMContext()), 8);
+ IID = Intrinsic::amdgcn_global_load_tr_b128;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16: // <4 x i16>
+ ArgTy = llvm::FixedVectorType::get(
+ llvm::Type::getInt16Ty(getLLVMContext()), 4);
+ IID = Intrinsic::amdgcn_global_load_tr_b128;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16: // <4 x half>
+ ArgTy = llvm::FixedVectorType::get(
+ llvm::Type::getHalfTy(getLLVMContext()), 4);
+ IID = Intrinsic::amdgcn_global_load_tr_b128;
+ break;
+ }
+ // The builtin's single operand is forwarded unchanged as the intrinsic's only argument.
+ llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
+ llvm::Function *F = CGM.getIntrinsic(IID, {ArgTy});
+ return Builder.CreateCall(F, {Addr});
+ }
case AMDGPU::BI__builtin_amdgcn_read_exec:
return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, false);
case AMDGPU::BI__builtin_amdgcn_read_exec_lo:
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl
new file mode 100644
index 000000000000000..10e2325cdea75cd
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl
@@ -0,0 +1,26 @@
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx1100 -emit-llvm \
+// RUN: -verify -S -o - %s
+
+// REQUIRES: amdgpu-registered-target
+
+typedef int v2i __attribute__((ext_vector_type(2)));
+typedef half v8h __attribute__((ext_vector_type(8)));
+typedef short v8s __attribute__((ext_vector_type(8)));
+
+typedef half v4h __attribute__((ext_vector_type(4)));
+typedef short v4s __attribute__((ext_vector_type(4)));
+
+
+
+void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr,
+ global int* int_inptr, global v4s* v4s_inptr, global v4h* v4h_inptr)
+{
+ v2i out_1 = __builtin_amdgcn_global_load_tr_b64_v2i32(v2i_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_v2i32' needs target feature gfx12-insts,wavefrontsize32}}
+ v8s out_2 = __builtin_amdgcn_global_load_tr_b128_v8i16(v8s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8i16' needs target feature gfx12-insts,wavefrontsize32}}
+ v8h out_3 = __builtin_amdgcn_global_load_tr_b128_v8f16(v8h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8f16' needs target feature gfx12-insts,wavefrontsize32}}
+
+ int out_4 = __builtin_amdgcn_global_load_tr_b64_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_i32' needs target feature gfx12-insts,wavefrontsize64}}
+ v4s out_5 = __builtin_amdgcn_global_load_tr_b128_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4i16' needs target feature gfx12-insts,wavefrontsize64}}
+ v4h out_6 = __builtin_amdgcn_global_load_tr_b128_v4f16(v4h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4f16' needs target feature gfx12-insts,wavefrontsize64}}
+}
+
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl
new file mode 100644
index 000000000000000..299a793a7b31e13
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl
@@ -0,0 +1,15 @@
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 -target-feature +wavefrontsize32 -emit-llvm \
+// RUN: -verify -S -o - %s
+
+// REQUIRES: amdgpu-registered-target
+
+typedef half v4h __attribute__((ext_vector_type(4)));
+typedef short v4s __attribute__((ext_vector_type(4)));
+
+void amdgcn_global_load_tr(global int* int_inptr, global v4s* v4s_inptr, global v4h* v4h_inptr)
+{
+ int out_4 = __builtin_amdgcn_global_load_tr_b64_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_i32' needs target feature gfx12-insts,wavefrontsize64}}
+ v4s out_5 = __builtin_amdgcn_global_load_tr_b128_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4i16' needs target feature gfx12-insts,wavefrontsize64}}
+ v4h out_6 = __builtin_amdgcn_global_load_tr_b128_v4f16(v4h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4f16' needs target feature gfx12-insts,wavefrontsize64}}
+}
+
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl
new file mode 100644
index 000000000000000..79f374af240c7e4
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl
@@ -0,0 +1,16 @@
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 -target-feature +wavefrontsize64 -emit-llvm \
+// RUN: -verify -S -o - %s
+
+// REQUIRES: amdgpu-registered-target
+
+typedef int v2i __attribute__((ext_vector_type(2)));
+typedef half v8h __attribute__((ext_vector_type(8)));
+typedef short v8s __attribute__((ext_vector_type(8)));
+
+void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr)
+{
+ v2i out_1 = __builtin_amdgcn_global_load_tr_b64_v2i32(v2i_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_v2i32' needs target feature gfx12-insts,wavefrontsize32}}
+ v8s out_2 = __builtin_amdgcn_global_load_tr_b128_v8i16(v8s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8i16' needs target feature gfx12-insts,wavefrontsize32}}
+ v8h out_3 = __builtin_amdgcn_global_load_tr_b128_v8f16(v8h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8f16' needs target feature gfx12-insts,wavefrontsize32}}
+}
+
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl
new file mode 100644
index 000000000000000..df523827e668d48
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl
@@ -0,0 +1,48 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
+
+typedef int v2i __attribute__((ext_vector_type(2)));
+typedef half v8h __attribute__((ext_vector_type(8)));
+typedef short v8s __attribute__((ext_vector_type(8)));
+
+// Wave32
+
+//
+// amdgcn_global_load_tr_b64
+//
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b64_v2i32(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1200-NEXT: ret <2 x i32> [[TMP0]]
+//
+v2i test_amdgcn_global_load_tr_b64_v2i32(global v2i* inptr)
+{
+ return __builtin_amdgcn_global_load_tr_b64_v2i32(inptr);
+}
+
+//
+// amdgcn_global_load_tr_b128
+//
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v8i16(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1200-NEXT: ret <8 x i16> [[TMP0]]
+//
+v8s test_amdgcn_global_load_tr_b128_v8i16(global v8s* inptr)
+{
+ return __builtin_amdgcn_global_load_tr_b128_v8i16(inptr);
+}
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v8f16(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1200-NEXT: ret <8 x half> [[TMP0]]
+//
+v8h test_amdgcn_global_load_tr_b128_v8f16(global v8h* inptr)
+{
+ return __builtin_amdgcn_global_load_tr_b128_v8f16(inptr);
+}
+
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl
new file mode 100644
index 000000000000000..06b512164073774
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl
@@ -0,0 +1,47 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
+
+typedef half v4h __attribute__((ext_vector_type(4)));
+typedef short v4s __attribute__((ext_vector_type(4)));
+
+// Wave64
+
+//
+// amdgcn_global_load_tr_b64
+//
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b64_i32(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.global.load.tr.b64.i32(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1200-NEXT: ret i32 [[TMP0]]
+//
+int test_amdgcn_global_load_tr_b64_i32(global int* inptr)
+{
+ return __builtin_amdgcn_global_load_tr_b64_i32(inptr);
+}
+
+//
+// amdgcn_global_load_tr_b128
+//
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v4i16(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1200-NEXT: ret <4 x i16> [[TMP0]]
+//
+v4s test_amdgcn_global_load_tr_b128_v4i16(global v4s* inptr)
+{
+ return __builtin_amdgcn_global_load_tr_b128_v4i16(inptr);
+}
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v4f16(
+// CHECK-GFX1200-NEXT: entry:
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1200-NEXT: ret <4 x half> [[TMP0]]
+//
+v4h test_amdgcn_global_load_tr_b128_v4f16(global v4h* inptr)
+{
+ return __builtin_amdgcn_global_load_tr_b128_v4f16(inptr);
+}
+
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index e5596258847f9f1..ad850c9c31490cc 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2496,6 +2496,27 @@ def int_amdgcn_flat_atomic_fmax_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_global_atomic_fmin_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_global_atomic_fmax_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
+class AMDGPUGlobalLoadTr<LLVMType data_ty> :
+ Intrinsic<
+ [data_ty],
+ [global_ptr_ty],
+ [IntrReadMem, IntrWillReturn, IntrConvergent, NoCapture<ArgIndex<0>>, IntrNoCallback, IntrNoFree],
+ "",
+ [SDNPMemOperand]
+ >;
+
+// Wave32
+// <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32(ptr addrspace(1))
+// <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16(ptr addrspace(1))
+// <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16(ptr addrspace(1))
+// Wave64
+// i32 @llvm.amdgcn.global.load.tr.b64.i32(ptr addrspace(1))
+// <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16(ptr addrspace(1))
+// <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16(ptr addrspace(1))
+
+def int_amdgcn_global_load_tr_b64 : AMDGPUGlobalLoadTr<llvm_any_ty>;
+def int_amdgcn_global_load_tr_b128 : AMDGPUGlobalLoadTr<llvm_any_ty>;
+
//===----------------------------------------------------------------------===//
// Deep learning intrinsics.
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 391c2b9ec256eac..0cfab44a7a0354e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4837,6 +4837,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
+ case Intrinsic::amdgcn_global_load_tr_b64:
+ case Intrinsic::amdgcn_global_load_tr_b128:
return getDefaultMappingAllVGPR(MI);
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap:
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 9dff3f6c2efd025..441032a37dfd9ee 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -544,6 +544,10 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
Res = tryDecodeInst(DecoderTableGFX1296, MI, DecW, Address, CS);
if (Res)
break;
+
+ Res = tryDecodeInst(DecoderTableGFX12W6496, MI, DecW, Address, CS);
+ if (Res)
+ break;
}
// Reinitialize Bytes
Bytes = Bytes_.slice(0, MaxInstBytesNum);
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 16a8b770e0577d5..47c3d806e487e34 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -995,6 +995,17 @@ defm SCRATCH_LOAD_LDS_DWORD : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_d
} // End SubtargetPredicate = HasFlatScratchInsts
+let SubtargetPredicate = isGFX12Plus in {
+ let WaveSizePredicate = isWave32 in {
+ defm GLOBAL_LOAD_TR_B128_w32 : FLAT_Global_Load_Pseudo <"global_load_tr_b128_w32", VReg_128>;
+ defm GLOBAL_LOAD_TR_B64_w32 : FLAT_Global_Load_Pseudo <"global_load_tr_b64_w32", VReg_64>;
+ }
+ let WaveSizePredicate = isWave64 in {
+ defm GLOBAL_LOAD_TR_B128_w64 : FLAT_Global_Load_Pseudo <"global_load_tr_b128_w64", VReg_64>;
+ defm GLOBAL_LOAD_TR_B64_w64 : FLAT_Global_Load_Pseudo <"global_load_tr_b64_w64", VGPR_32>;
+ }
+} // End SubtargetPredicate = isGFX12Plus
+
let SubtargetPredicate = isGFX10Plus, is_flat_global = 1 in {
defm GLOBAL_ATOMIC_FCMPSWAP :
FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap", VGPR_32, f32, v2f32, VReg_64>;
@@ -1559,6 +1570,17 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", "atomic_load_xor_global", i
let OtherPredicates = [isGFX12Plus] in {
defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_ORDERED_ADD_B64", "int_amdgcn_global_atomic_ordered_add_b64", i64, i64, /* isIntr */ 1>;
+
+ let WaveSizePredicate = isWave32 in {
+ defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w32, int_amdgcn_global_load_tr_b64, v2i32>;
+ defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr_b128, v8i16>;
+ defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr_b128, v8f16>;
+ }
+ let WaveSizePredicate = isWave64 in {
+ defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w64, int_amdgcn_global_load_tr_b64, i32>;
+ defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr_b128, v4i16>;
+ defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr_b128, v4f16>;
+ }
}
let OtherPredicates = [isGFX10Plus] in {
@@ -2686,6 +2708,17 @@ defm GLOBAL_ATOMIC_DEC_U64 : VGLOBAL_Real_Atomics_gfx12<0x04d, "GLOBAL_A
defm GLOBAL_ATOMIC_MIN_NUM_F32 : VGLOBAL_Real_Atomics_gfx12<0x051, "GLOBAL_ATOMIC_FMIN", "global_atomic_min_num_f32", true, "global_atomic_min_f32">;
defm GLOBAL_ATOMIC_MAX_NUM_F32 : VGLOBAL_Real_Atomics_gfx12<0x052, "GLOBAL_ATOMIC_FMAX", "global_atomic_max_num_f32", true, "global_atomic_max_f32">;
defm GLOBAL_ATOMIC_ADD_F32 : VGLOBAL_Real_Atomics_gfx12<0x056, "GLOBAL_ATOMIC_ADD_F32", "global_atomic_add_f32">;
+
+let WaveSizePredicate = isWave32, DecoderNamespace = "GFX12" in {
+ defm GLOBAL_LOAD_TR_B128_w32 : VGLOBAL_Real_AllAddr_gfx12<0x057, "GLOBAL_LOAD_TR_B128_w32", "global_load_tr_b128">;
+ defm GLOBAL_LOAD_TR_B64_w32 : VGLOBAL_Real_AllAddr_gfx12<0x058, "GLOBAL_LOAD_TR_B64_w32", "global_load_tr_b64">;
+}
+
+let WaveSizePredicate = isWave64, DecoderNamespace = "GFX12W64" in {
+ defm GLOBAL_LOAD_TR_B128_w64 : VGLOBAL_Real_AllAddr_gfx12<0x057, "GLOBAL_LOAD_TR_B128_w64", "global_load_tr_b128">;
+ defm GLOBAL_LOAD_TR_B64_w64 : VGLOBAL_Real_AllAddr_gfx12<0x058, "GLOBAL_LOAD_TR_B64_w64", "global_load_tr_b64">;
+}
+
defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : VGLOBAL_Real_Atomics_gfx12<0x073, "GLOBAL_ATOMIC_ORDERED_ADD_B64", "global_atomic_ordered_add_b64">;
defm GLOBAL_INV : VFLAT_Real_Base_gfx12<0x02b, "GLOBAL_INV", "global_inv">;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
new file mode 100644
index 000000000000000..89a9138d4d2c626
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
@@ -0,0 +1,106 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-SDAG-W32 %s
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-GISEL-W32 %s
+
+declare <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32.p1(ptr addrspace(1))
+declare <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16.p1(ptr addrspace(1))
+declare <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16.p1(ptr addrspace(1))
+
+define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
+; GFX12-SDAG-W32-LABEL: global_load_tr_b64:
+; GFX12-SDAG-W32: ; %bb.0: ; %entry
+; GFX12-SDAG-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-SDAG-W32-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-SDAG-W32-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-W32-NEXT: global_load_tr_b64 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT
+; GFX12-SDAG-W32-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-W32-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-SDAG-W32-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-SDAG-W32-NEXT: s_nop 0
+; GFX12-SDAG-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-W32-NEXT: s_endpgm
+;
+; GFX12-GISEL-W32-LABEL: global_load_tr_b64:
+; GFX12-GISEL-W32: ; %bb.0: ; %entry
+; GFX12-GISEL-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-GISEL-W32-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-W32-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-W32-NEXT: global_load_tr_b64 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT
+; GFX12-GISEL-W32-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-W32-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-GISEL-W32-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-GISEL-W32-NEXT: s_nop 0
+; GFX12-GISEL-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-W32-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
+ %val = call <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32.p1(ptr addrspace(1) %gep)
+ store <2 x i32> %val, ptr addrspace(1) %use
+ ret void
+}
+
+define amdgpu_kernel void @global_load_tr_b128_i16(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
+; GFX12-SDAG-W32-LABEL: global_load_tr_b128_i16:
+; GFX12-SDAG-W32: ; %bb.0: ; %entry
+; GFX12-SDAG-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-SDAG-W32-NEXT: v_mov_b32_e32 v4, 0
+; GFX12-SDAG-W32-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 th:TH_LOAD_NT
+; GFX12-SDAG-W32-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-W32-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-SDAG-W32-NEXT: global_store_b128 v4, v[0:3], s[2:3]
+; GFX12-SDAG-W32-NEXT: s_nop 0
+; GFX12-SDAG-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-W32-NEXT: s_endpgm
+;
+; GFX12-GISEL-W32-LABEL: global_load_tr_b128_i16:
+; GFX12-GISEL-W32: ; %bb.0: ; %entry
+; GFX12-GISEL-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-GISEL-W32-NEXT: v_mov_b32_e32 v4, 0
+; GFX12-GISEL-W32-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 th:TH_LOAD_NT
+; GFX12-GISEL-W32-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-W32-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-GISEL-W32-NEXT: global_store_b128 v4, v[0:3], s[2:3]
+; GFX12-GISEL-W32-NEXT: s_nop 0
+; GFX12-GISEL-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-W32-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
+ %val = call <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16.p1(ptr addrspace(1) %gep)
+ store <8 x i16> %val, ptr addrspace(1) %use
+ ret void
+}
+
+define amdgpu_kernel void @global_load_tr_b128_half(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
+; GFX12-SDAG-W32-LABEL: global_load_tr_b128_half:
+; GFX12-SDAG-W32: ; %bb.0: ; %entry
+; GFX12-SDAG-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-SDAG-W32-NEXT: v_mov_b32_e32 v4, 0
+; GFX12-SDAG-W32-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 th:TH_LOAD_NT
+; GFX12-SDAG-W32-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-W32-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-SDAG-W32-NEXT: global_store_b128 v4, v[0:3], s[2:3]
+; GFX12-SDAG-W32-NEXT: s_nop 0
+; GFX12-SDAG-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-W32-NEXT: s_endpgm
+;
+; GFX12-GISEL-W32-LABEL: global_load_tr_b128_half:
+; GFX12-GISEL-W32: ; %bb.0: ; %entry
+; GFX12-GISEL-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-GISEL-W32-NEXT: v_mov_b32_e32 v4, 0
+; GFX12-GISEL-W32-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 th:TH_LOAD_NT
+; GFX12-GISEL-W32-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-W32-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-GISEL-W32-NEXT: global_store_b128 v4, v[0:3], s[2:3]
+; GFX12-GISEL-W32-NEXT: s_nop 0
+; GFX12-GISEL-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-W32-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
+ %val = call <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16.p1(ptr addrspace(1) %gep)
+ store <8 x half> %val, ptr addrspace(1) %use
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
new file mode 100644
index 000000000000000..73dc4fa506c7ba0
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
@@ -0,0 +1,106 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-SDAG-W64 %s
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-GISEL-W64 %s
+
+declare i32 @llvm.amdgcn.global.load.tr.b64.i32.p1(ptr addrspace(1))
+declare <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16.p1(ptr addrspace(1))
+declare <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16.p1(ptr addrspace(1))
+
+define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
+; GFX12-SDAG-W64-LABEL: global_load_tr_b64:
+; GFX12-SDAG-W64: ; %bb.0: ; %entry
+; GFX12-SDAG-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-SDAG-W64-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-SDAG-W64-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-W64-NEXT: global_load_tr_b64 v1, v0, s[0:1] offset:32 th:TH_LOAD_NT
+; GFX12-SDAG-W64-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-W64-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-SDAG-W64-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX12-SDAG-W64-NEXT: s_nop 0
+; GFX12-SDAG-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-W64-NEXT: s_endpgm
+;
+; GFX12-GISEL-W64-LABEL: global_load_tr_b64:
+; GFX12-GISEL-W64: ; %bb.0: ; %entry
+; GFX12-GISEL-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-GISEL-W64-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-GISEL-W64-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-W64-NEXT: global_load_tr_b64 v1, v0, s[0:1] offset:32 th:TH_LOAD_NT
+; GFX12-GISEL-W64-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-W64-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-GISEL-W64-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX12-GISEL-W64-NEXT: s_nop 0
+; GFX12-GISEL-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-W64-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
+ %val = call i32 @llvm.amdgcn.global.load.tr.b64.i32.p1(ptr addrspace(1) %gep)
+ store i32 %val, ptr addrspace(1) %use
+ ret void
+}
+
+define amdgpu_kernel void @global_load_tr_b128_i16(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
+; GFX12-SDAG-W64-LABEL: global_load_tr_b128_i16:
+; GFX12-SDAG-W64: ; %bb.0: ; %entry
+; GFX12-SDAG-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-SDAG-W64-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-SDAG-W64-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT
+; GFX12-SDAG-W64-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-W64-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-SDAG-W64-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-SDAG-W64-NEXT: s_nop 0
+; GFX12-SDAG-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-W64-NEXT: s_endpgm
+;
+; GFX12-GISEL-W64-LABEL: global_load_tr_b128_i16:
+; GFX12-GISEL-W64: ; %bb.0: ; %entry
+; GFX12-GISEL-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-GISEL-W64-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-W64-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT
+; GFX12-GISEL-W64-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-W64-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-GISEL-W64-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-GISEL-W64-NEXT: s_nop 0
+; GFX12-GISEL-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-W64-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
+ %val = call <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16.p1(ptr addrspace(1) %gep)
+ store <4 x i16> %val, ptr addrspace(1) %use
+ ret void
+}
+
+define amdgpu_kernel void @global_load_tr_b128_half(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
+; GFX12-SDAG-W64-LABEL: global_load_tr_b128_half:
+; GFX12-SDAG-W64: ; %bb.0: ; %entry
+; GFX12-SDAG-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-SDAG-W64-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-SDAG-W64-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT
+; GFX12-SDAG-W64-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-W64-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-SDAG-W64-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-SDAG-W64-NEXT: s_nop 0
+; GFX12-SDAG-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-W64-NEXT: s_endpgm
+;
+; GFX12-GISEL-W64-LABEL: global_load_tr_b128_half:
+; GFX12-GISEL-W64: ; %bb.0: ; %entry
+; GFX12-GISEL-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-GISEL-W64-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-W64-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT
+; GFX12-GISEL-W64-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-W64-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-GISEL-W64-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-GISEL-W64-NEXT: s_nop 0
+; GFX12-GISEL-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-W64-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
+ %val = call <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16.p1(ptr addrspace(1) %gep)
+ store <4 x half> %val, ptr addrspace(1) %use
+ ret void
+}
diff --git a/llvm/test/MC/AMDGPU/gfx11_unsupported.s b/llvm/test/MC/AMDGPU/gfx11_unsupported.s
index e01eb05e85588d2..ab7e97b482da9d9 100644
--- a/llvm/test/MC/AMDGPU/gfx11_unsupported.s
+++ b/llvm/test/MC/AMDGPU/gfx11_unsupported.s
@@ -211,6 +211,12 @@ global_load_lds_ubyte v[2:3], off
global_load_lds_ushort v[2:3], off
// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+global_load_tr_b128 v[1:4], v5, s[2:3]
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_load_tr_b64 v[1:2], v[3:4], off
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
image_atomic_fcmpswap v[1:2], v2, s[12:19] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm
// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_global_load_tr.s b/llvm/test/MC/AMDGPU/gfx12_asm_global_load_tr.s
new file mode 100644
index 000000000000000..597e0d29b43afd6
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_global_load_tr.s
@@ -0,0 +1,103 @@
+// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W64 %s
+// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W32 %s
+// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s
+// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s
+
+global_load_tr_b128 v[1:4], v0, s[0:1] offset:-64
+// W64-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// W32: encoding: [0x00,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+
+global_load_tr_b128 v[1:4], v0, s[0:1] offset:64
+// W64-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// W32: encoding: [0x00,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+
+global_load_tr_b128 v[1:4], v5, s[2:3]
+// W64-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// W32: encoding: [0x02,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00]
+
+global_load_tr_b128 v[1:4], v[0:1], off offset:-64
+// W64-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// W32: encoding: [0x7c,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+
+global_load_tr_b128 v[1:4], v[0:1], off offset:64
+// W64-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// W32: encoding: [0x7c,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+
+global_load_tr_b128 v[1:4], v[5:6], off
+// W64-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// W32: encoding: [0x7c,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00]
+
+global_load_tr_b64 v[1:2], v0, s[0:1] offset:-64
+// W64-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// W32: encoding: [0x00,0x00,0x16,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+
+global_load_tr_b64 v[1:2], v0, s[0:1] offset:64
+// W64-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// W32: encoding: [0x00,0x00,0x16,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+
+global_load_tr_b64 v[1:2], v3, s[2:3]
+// W64-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// W32: encoding: [0x02,0x00,0x16,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00]
+
+global_load_tr_b64 v[1:2], v[0:1], off offset:-64
+// W64-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// W32: encoding: [0x7c,0x00,0x16,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+
+global_load_tr_b64 v[1:2], v[0:1], off offset:64
+// W64-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// W32: encoding: [0x7c,0x00,0x16,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+
+global_load_tr_b64 v[1:2], v[3:4], off
+// W64-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// W32: encoding: [0x7c,0x00,0x16,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00]
+
+
+
+global_load_tr_b128 v[1:2], v0, s[0:1] offset:-64
+// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// W64: encoding: [0x00,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+
+global_load_tr_b128 v[1:2], v0, s[0:1] offset:64
+// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// W64: encoding: [0x00,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+
+global_load_tr_b128 v[1:2], v5, s[2:3]
+// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// W64: encoding: [0x02,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00]
+
+global_load_tr_b128 v[1:2], v[0:1], off offset:-64
+// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// W64: encoding: [0x7c,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+
+global_load_tr_b128 v[1:2], v[0:1], off offset:64
+// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// W64: encoding: [0x7c,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+
+global_load_tr_b128 v[1:2], v[5:6], off
+// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// W64: encoding: [0x7c,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00]
+
+global_load_tr_b64 v1, v0, s[0:1] offset:-64
+// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// W64: encoding: [0x00,0x00,0x16,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+
+global_load_tr_b64 v1, v0, s[0:1] offset:64
+// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// W64: encoding: [0x00,0x00,0x16,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+
+global_load_tr_b64 v1, v3, s[2:3]
+// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// W64: encoding: [0x02,0x00,0x16,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00]
+
+global_load_tr_b64 v1, v[0:1], off offset:-64
+// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// W64: encoding: [0x7c,0x00,0x16,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff]
+
+global_load_tr_b64 v1, v[0:1], off offset:64
+// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// W64: encoding: [0x7c,0x00,0x16,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+
+global_load_tr_b64 v1, v[3:4], off
+// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+// W64: encoding: [0x7c,0x00,0x16,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00]
+
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_global_load_tr.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_global_load_tr.txt
new file mode 100644
index 000000000000000..e8498d4aef0a1bd
--- /dev/null
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_global_load_tr.txt
@@ -0,0 +1,34 @@
+# RUN: llvm-mc -disassemble -arch=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W64 %s
+# RUN: llvm-mc -disassemble -arch=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W32 %s
+
+# W32: global_load_tr_b128 v[1:4], v0, s[0:1] offset:64 ; encoding: [0x00,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+# W64: global_load_tr_b128 v[1:2], v0, s[0:1] offset:64 ; encoding: [0x00,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+0x00,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00
+
+# W32: global_load_tr_b128 v[1:4], v5, s[2:3] ; encoding: [0x02,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00]
+# W64: global_load_tr_b128 v[1:2], v5, s[2:3] ; encoding: [0x02,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00]
+0x02,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00
+
+# W32: global_load_tr_b128 v[1:4], v[0:1], off offset:64 ; encoding: [0x7c,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+# W64: global_load_tr_b128 v[1:2], v[0:1], off offset:64 ; encoding: [0x7c,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+0x7c,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00
+
+# W32: global_load_tr_b128 v[1:4], v[5:6], off ; encoding: [0x7c,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00]
+# W64: global_load_tr_b128 v[1:2], v[5:6], off ; encoding: [0x7c,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00]
+0x7c,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00
+
+# W32: global_load_tr_b64 v[1:2], v0, s[0:1] offset:64 ; encoding: [0x00,0x00,0x16,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+# W64: global_load_tr_b64 v1, v0, s[0:1] offset:64 ; encoding: [0x00,0x00,0x16,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+0x00,0x00,0x16,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00
+
+# W32: global_load_tr_b64 v[1:2], v3, s[2:3] ; encoding: [0x02,0x00,0x16,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00]
+# W64: global_load_tr_b64 v1, v3, s[2:3] ; encoding: [0x02,0x00,0x16,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00]
+0x02,0x00,0x16,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00
+
+# W32: global_load_tr_b64 v[1:2], v[0:1], off offset:64 ; encoding: [0x7c,0x00,0x16,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+# W64: global_load_tr_b64 v1, v[0:1], off offset:64 ; encoding: [0x7c,0x00,0x16,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
+0x7c,0x00,0x16,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00
+
+# W32: global_load_tr_b64 v[1:2], v[3:4], off ; encoding: [0x7c,0x00,0x16,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00]
+# W64: global_load_tr_b64 v1, v[3:4], off ; encoding: [0x7c,0x00,0x16,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00]
+0x7c,0x00,0x16,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00
>From b49e50fc0162daadb163c9773ea9d23e76196daf Mon Sep 17 00:00:00 2001
From: Piotr Sobczak <piotr.sobczak at amd.com>
Date: Fri, 12 Jan 2024 13:33:57 +0100
Subject: [PATCH 2/5] Common up intrinsic variants
---
clang/include/clang/Basic/BuiltinsAMDGPU.def | 12 ++---
clang/lib/CodeGen/CGBuiltin.cpp | 45 ++++++++-----------
...uiltins-amdgcn-global-load-tr-gfx11-err.cl | 12 ++---
...ins-amdgcn-global-load-tr-gfx12-w32-err.cl | 6 +--
...ins-amdgcn-global-load-tr-gfx12-w64-err.cl | 6 +--
.../builtins-amdgcn-global-load-tr-w32.cl | 28 ++++++------
.../builtins-amdgcn-global-load-tr-w64.cl | 28 ++++++------
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 15 +++----
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 3 +-
llvm/lib/Target/AMDGPU/FLATInstructions.td | 12 ++---
.../AMDGPU/llvm.amdgcn.global.load.tr-w32.ll | 12 ++---
.../AMDGPU/llvm.amdgcn.global.load.tr-w64.ll | 12 ++---
12 files changed, 91 insertions(+), 100 deletions(-)
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 098c309f8085375..9b745819454aef6 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -423,13 +423,13 @@ TARGET_BUILTIN(__builtin_amdgcn_s_wakeup_barrier, "vi", "n", "gfx12-insts")
TARGET_BUILTIN(__builtin_amdgcn_s_barrier_leave, "b", "n", "gfx12-insts")
TARGET_BUILTIN(__builtin_amdgcn_s_get_barrier_state, "Uii", "n", "gfx12-insts")
-TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_v2i32, "V2iV2i*1", "nc", "gfx12-insts,wavefrontsize32")
-TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8i16, "V8sV8s*1", "nc", "gfx12-insts,wavefrontsize32")
-TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8f16, "V8hV8h*1", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v2i32, "V2iV2i*1", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v8i16, "V8sV8s*1", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v8f16, "V8hV8h*1", "nc", "gfx12-insts,wavefrontsize32")
-TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_i32, "ii*1", "nc", "gfx12-insts,wavefrontsize64")
-TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4i16, "V4sV4s*1", "nc", "gfx12-insts,wavefrontsize64")
-TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4f16, "V4hV4h*1", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_i32, "ii*1", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v4i16, "V4sV4s*1", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v4f16, "V4hV4h*1", "nc", "gfx12-insts,wavefrontsize64")
#undef BUILTIN
#undef TARGET_BUILTIN
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index dc634b1c388f46f..f9794ebd6be33bc 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18178,49 +18178,42 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
llvm::Function *F = CGM.getIntrinsic(IID, {ArgTy});
return Builder.CreateCall(F, {Addr, Val, ZeroI32, ZeroI32, ZeroI1});
}
- case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
- case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
- case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
- case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
- case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
- case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16: {
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr_i32:
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr_v2i32:
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr_v4f16:
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr_v4i16:
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr_v8f16:
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr_v8i16: {
- Intrinsic::ID IID;
llvm::Type *ArgTy;
switch (BuiltinID) {
- case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
- ArgTy = llvm::FixedVectorType::get(
- llvm::Type::getInt32Ty(getLLVMContext()), 2);
- IID = Intrinsic::amdgcn_global_load_tr_b64;
- break;
- case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr_i32:
ArgTy = llvm::Type::getInt32Ty(getLLVMContext());
- IID = Intrinsic::amdgcn_global_load_tr_b64;
break;
- case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr_v2i32:
ArgTy = llvm::FixedVectorType::get(
- llvm::Type::getInt16Ty(getLLVMContext()), 8);
- IID = Intrinsic::amdgcn_global_load_tr_b128;
+ llvm::Type::getInt32Ty(getLLVMContext()), 2);
break;
- case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr_v4f16:
ArgTy = llvm::FixedVectorType::get(
- llvm::Type::getHalfTy(getLLVMContext()), 8);
- IID = Intrinsic::amdgcn_global_load_tr_b128;
+ llvm::Type::getHalfTy(getLLVMContext()), 4);
break;
- case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr_v4i16:
ArgTy = llvm::FixedVectorType::get(
llvm::Type::getInt16Ty(getLLVMContext()), 4);
- IID = Intrinsic::amdgcn_global_load_tr_b128;
break;
- case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16:
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr_v8f16:
ArgTy = llvm::FixedVectorType::get(
- llvm::Type::getHalfTy(getLLVMContext()), 4);
- IID = Intrinsic::amdgcn_global_load_tr_b128;
+ llvm::Type::getHalfTy(getLLVMContext()), 8);
+ break;
+ case AMDGPU::BI__builtin_amdgcn_global_load_tr_v8i16:
+ ArgTy = llvm::FixedVectorType::get(
+ llvm::Type::getInt16Ty(getLLVMContext()), 8);
break;
}
llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
- llvm::Function *F = CGM.getIntrinsic(IID, {ArgTy});
+ llvm::Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_global_load_tr, {ArgTy});
return Builder.CreateCall(F, {Addr});
}
case AMDGPU::BI__builtin_amdgcn_read_exec:
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl
index 10e2325cdea75cd..f7afb7cb97edad0 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl
@@ -15,12 +15,12 @@ typedef short v4s __attribute__((ext_vector_type(4)));
void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr,
global int* int_inptr, global v4s* v4s_inptr, global v4h* v4h_inptr)
{
- v2i out_1 = __builtin_amdgcn_global_load_tr_b64_v2i32(v2i_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_v2i32' needs target feature gfx12-insts,wavefrontsize32}}
- v8s out_2 = __builtin_amdgcn_global_load_tr_b128_v8i16(v8s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8i16' needs target feature gfx12-insts,wavefrontsize32}}
- v8h out_3 = __builtin_amdgcn_global_load_tr_b128_v8f16(v8h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8f16' needs target feature gfx12-insts,wavefrontsize32}}
+ v2i out_1 = __builtin_amdgcn_global_load_tr_v2i32(v2i_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v2i32' needs target feature gfx12-insts,wavefrontsize32}}
+ v8s out_2 = __builtin_amdgcn_global_load_tr_v8i16(v8s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v8i16' needs target feature gfx12-insts,wavefrontsize32}}
+ v8h out_3 = __builtin_amdgcn_global_load_tr_v8f16(v8h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v8f16' needs target feature gfx12-insts,wavefrontsize32}}
- int out_4 = __builtin_amdgcn_global_load_tr_b64_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_i32' needs target feature gfx12-insts,wavefrontsize64}}
- v4s out_5 = __builtin_amdgcn_global_load_tr_b128_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4i16' needs target feature gfx12-insts,wavefrontsize64}}
- v4h out_6 = __builtin_amdgcn_global_load_tr_b128_v4f16(v4h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4f16' needs target feature gfx12-insts,wavefrontsize64}}
+ int out_4 = __builtin_amdgcn_global_load_tr_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_i32' needs target feature gfx12-insts,wavefrontsize64}}
+ v4s out_5 = __builtin_amdgcn_global_load_tr_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v4i16' needs target feature gfx12-insts,wavefrontsize64}}
+ v4h out_6 = __builtin_amdgcn_global_load_tr_v4f16(v4h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v4f16' needs target feature gfx12-insts,wavefrontsize64}}
}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl
index 299a793a7b31e13..04ac0a66db7ce74 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl
@@ -8,8 +8,8 @@ typedef short v4s __attribute__((ext_vector_type(4)));
void amdgcn_global_load_tr(global int* int_inptr, global v4s* v4s_inptr, global v4h* v4h_inptr)
{
- int out_4 = __builtin_amdgcn_global_load_tr_b64_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_i32' needs target feature gfx12-insts,wavefrontsize64}}
- v4s out_5 = __builtin_amdgcn_global_load_tr_b128_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4i16' needs target feature gfx12-insts,wavefrontsize64}}
- v4h out_6 = __builtin_amdgcn_global_load_tr_b128_v4f16(v4h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4f16' needs target feature gfx12-insts,wavefrontsize64}}
+ int out_4 = __builtin_amdgcn_global_load_tr_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_i32' needs target feature gfx12-insts,wavefrontsize64}}
+ v4s out_5 = __builtin_amdgcn_global_load_tr_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v4i16' needs target feature gfx12-insts,wavefrontsize64}}
+ v4h out_6 = __builtin_amdgcn_global_load_tr_v4f16(v4h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v4f16' needs target feature gfx12-insts,wavefrontsize64}}
}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl
index 79f374af240c7e4..113b54b853a9f48 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl
@@ -9,8 +9,8 @@ typedef short v8s __attribute__((ext_vector_type(8)));
void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr)
{
- v2i out_1 = __builtin_amdgcn_global_load_tr_b64_v2i32(v2i_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_v2i32' needs target feature gfx12-insts,wavefrontsize32}}
- v8s out_2 = __builtin_amdgcn_global_load_tr_b128_v8i16(v8s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8i16' needs target feature gfx12-insts,wavefrontsize32}}
- v8h out_3 = __builtin_amdgcn_global_load_tr_b128_v8f16(v8h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8f16' needs target feature gfx12-insts,wavefrontsize32}}
+ v2i out_1 = __builtin_amdgcn_global_load_tr_v2i32(v2i_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v2i32' needs target feature gfx12-insts,wavefrontsize32}}
+ v8s out_2 = __builtin_amdgcn_global_load_tr_v8i16(v8s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v8i16' needs target feature gfx12-insts,wavefrontsize32}}
+ v8h out_3 = __builtin_amdgcn_global_load_tr_v8f16(v8h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v8f16' needs target feature gfx12-insts,wavefrontsize32}}
}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl
index df523827e668d48..b5fcad68a470204 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl
@@ -9,40 +9,40 @@ typedef short v8s __attribute__((ext_vector_type(8)));
// Wave32
//
-// amdgcn_global_load_tr_b64
+// amdgcn_global_load_tr
//
-// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b64_v2i32(
+// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_v2i32(
// CHECK-GFX1200-NEXT: entry:
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1) [[INPTR:%.*]])
// CHECK-GFX1200-NEXT: ret <2 x i32> [[TMP0]]
//
-v2i test_amdgcn_global_load_tr_b64_v2i32(global v2i* inptr)
+v2i test_amdgcn_global_load_tr_v2i32(global v2i* inptr)
{
- return __builtin_amdgcn_global_load_tr_b64_v2i32(inptr);
+ return __builtin_amdgcn_global_load_tr_v2i32(inptr);
}
//
-// amdgcn_global_load_tr_b128
+// amdgcn_global_load_tr
//
-// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v8i16(
+// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_v8i16(
// CHECK-GFX1200-NEXT: entry:
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1) [[INPTR:%.*]])
// CHECK-GFX1200-NEXT: ret <8 x i16> [[TMP0]]
//
-v8s test_amdgcn_global_load_tr_b128_v8i16(global v8s* inptr)
+v8s test_amdgcn_global_load_tr_v8i16(global v8s* inptr)
{
- return __builtin_amdgcn_global_load_tr_b128_v8i16(inptr);
+ return __builtin_amdgcn_global_load_tr_v8i16(inptr);
}
-// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v8f16(
+// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_v8f16(
// CHECK-GFX1200-NEXT: entry:
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.global.load.tr.v8f16(ptr addrspace(1) [[INPTR:%.*]])
// CHECK-GFX1200-NEXT: ret <8 x half> [[TMP0]]
//
-v8h test_amdgcn_global_load_tr_b128_v8f16(global v8h* inptr)
+v8h test_amdgcn_global_load_tr_v8f16(global v8h* inptr)
{
- return __builtin_amdgcn_global_load_tr_b128_v8f16(inptr);
+ return __builtin_amdgcn_global_load_tr_v8f16(inptr);
}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl
index 06b512164073774..9c48ac071b4d3fa 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl
@@ -8,40 +8,40 @@ typedef short v4s __attribute__((ext_vector_type(4)));
// Wave64
//
-// amdgcn_global_load_tr_b64
+// amdgcn_global_load_tr
//
-// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b64_i32(
+// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_i32(
// CHECK-GFX1200-NEXT: entry:
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.global.load.tr.b64.i32(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.global.load.tr.i32(ptr addrspace(1) [[INPTR:%.*]])
// CHECK-GFX1200-NEXT: ret i32 [[TMP0]]
//
-int test_amdgcn_global_load_tr_b64_i32(global int* inptr)
+int test_amdgcn_global_load_tr_i32(global int* inptr)
{
- return __builtin_amdgcn_global_load_tr_b64_i32(inptr);
+ return __builtin_amdgcn_global_load_tr_i32(inptr);
}
//
-// amdgcn_global_load_tr_b128
+// amdgcn_global_load_tr
//
-// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v4i16(
+// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_v4i16(
// CHECK-GFX1200-NEXT: entry:
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.global.load.tr.v4i16(ptr addrspace(1) [[INPTR:%.*]])
// CHECK-GFX1200-NEXT: ret <4 x i16> [[TMP0]]
//
-v4s test_amdgcn_global_load_tr_b128_v4i16(global v4s* inptr)
+v4s test_amdgcn_global_load_tr_v4i16(global v4s* inptr)
{
- return __builtin_amdgcn_global_load_tr_b128_v4i16(inptr);
+ return __builtin_amdgcn_global_load_tr_v4i16(inptr);
}
-// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v4f16(
+// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_v4f16(
// CHECK-GFX1200-NEXT: entry:
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.global.load.tr.v4f16(ptr addrspace(1) [[INPTR:%.*]])
// CHECK-GFX1200-NEXT: ret <4 x half> [[TMP0]]
//
-v4h test_amdgcn_global_load_tr_b128_v4f16(global v4h* inptr)
+v4h test_amdgcn_global_load_tr_v4f16(global v4h* inptr)
{
- return __builtin_amdgcn_global_load_tr_b128_v4f16(inptr);
+ return __builtin_amdgcn_global_load_tr_v4f16(inptr);
}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index ad850c9c31490cc..3a419b4b56caf2a 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2506,16 +2506,15 @@ class AMDGPUGlobalLoadTr<LLVMType data_ty> :
>;
// Wave32
-// <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32(ptr addrspace(1))
-// <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16(ptr addrspace(1))
-// <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16(ptr addrspace(1))
+// <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1)) -> global_load_tr_b64
+// <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1)) -> global_load_tr_b128
+// <8 x half> @llvm.amdgcn.global.load.tr.v8f16(ptr addrspace(1)) -> global_load_tr_b128
// Wave64
-// <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32(ptr addrspace(1))
-// <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16(ptr addrspace(1))
-// <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16(ptr addrspace(1))
+// i32 @llvm.amdgcn.global.load.tr.i32(ptr addrspace(1)) -> global_load_tr_b64
+// <4 x i16> @llvm.amdgcn.global.load.tr.v4i16(ptr addrspace(1)) -> global_load_tr_b128
+// <4 x half> @llvm.amdgcn.global.load.tr.v4f16(ptr addrspace(1)) -> global_load_tr_b128
-def int_amdgcn_global_load_tr_b64 : AMDGPUGlobalLoadTr<llvm_any_ty>;
-def int_amdgcn_global_load_tr_b128 : AMDGPUGlobalLoadTr<llvm_any_ty>;
+def int_amdgcn_global_load_tr : AMDGPUGlobalLoadTr<llvm_any_ty>;
//===----------------------------------------------------------------------===//
// Deep learning intrinsics.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 0cfab44a7a0354e..410dd352a8459a6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4837,8 +4837,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
- case Intrinsic::amdgcn_global_load_tr_b64:
- case Intrinsic::amdgcn_global_load_tr_b128:
+ case Intrinsic::amdgcn_global_load_tr:
return getDefaultMappingAllVGPR(MI);
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap:
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 47c3d806e487e34..c6df1c4132f9400 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1572,14 +1572,14 @@ let OtherPredicates = [isGFX12Plus] in {
defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_ORDERED_ADD_B64", "int_amdgcn_global_atomic_ordered_add_b64", i64, i64, /* isIntr */ 1>;
let WaveSizePredicate = isWave32 in {
- defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w32, int_amdgcn_global_load_tr_b64, v2i32>;
- defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr_b128, v8i16>;
- defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr_b128, v8f16>;
+ defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w32, int_amdgcn_global_load_tr, v2i32>;
+ defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr, v8i16>;
+ defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr, v8f16>;
}
let WaveSizePredicate = isWave64 in {
- defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w64, int_amdgcn_global_load_tr_b64, i32>;
- defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr_b128, v4i16>;
- defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr_b128, v4f16>;
+ defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w64, int_amdgcn_global_load_tr, i32>;
+ defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr, v4i16>;
+ defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr, v4f16>;
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
index 89a9138d4d2c626..398c84b9f79aec1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
@@ -2,9 +2,9 @@
; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-SDAG-W32 %s
; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-GISEL-W32 %s
-declare <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32.p1(ptr addrspace(1))
-declare <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16.p1(ptr addrspace(1))
-declare <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16.p1(ptr addrspace(1))
+declare <2 x i32> @llvm.amdgcn.global.load.tr.v2i32.p1(ptr addrspace(1))
+declare <8 x i16> @llvm.amdgcn.global.load.tr.v8i16.p1(ptr addrspace(1))
+declare <8 x half> @llvm.amdgcn.global.load.tr.v8f16.p1(ptr addrspace(1))
define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
; GFX12-SDAG-W32-LABEL: global_load_tr_b64:
@@ -34,7 +34,7 @@ define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrsp
; GFX12-GISEL-W32-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
- %val = call <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32.p1(ptr addrspace(1) %gep)
+ %val = call <2 x i32> @llvm.amdgcn.global.load.tr.v2i32.p1(ptr addrspace(1) %gep)
store <2 x i32> %val, ptr addrspace(1) %use
ret void
}
@@ -67,7 +67,7 @@ define amdgpu_kernel void @global_load_tr_b128_i16(ptr addrspace(1) %addr, ptr a
; GFX12-GISEL-W32-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
- %val = call <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16.p1(ptr addrspace(1) %gep)
+ %val = call <8 x i16> @llvm.amdgcn.global.load.tr.v8i16.p1(ptr addrspace(1) %gep)
store <8 x i16> %val, ptr addrspace(1) %use
ret void
}
@@ -100,7 +100,7 @@ define amdgpu_kernel void @global_load_tr_b128_half(ptr addrspace(1) %addr, ptr
; GFX12-GISEL-W32-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
- %val = call <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16.p1(ptr addrspace(1) %gep)
+ %val = call <8 x half> @llvm.amdgcn.global.load.tr.v8f16.p1(ptr addrspace(1) %gep)
store <8 x half> %val, ptr addrspace(1) %use
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
index 73dc4fa506c7ba0..04151cd2db2e0b8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
@@ -2,9 +2,9 @@
; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-SDAG-W64 %s
; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-GISEL-W64 %s
-declare i32 @llvm.amdgcn.global.load.tr.b64.i32.p1(ptr addrspace(1))
-declare <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16.p1(ptr addrspace(1))
-declare <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16.p1(ptr addrspace(1))
+declare i32 @llvm.amdgcn.global.load.tr.i32.p1(ptr addrspace(1))
+declare <4 x i16> @llvm.amdgcn.global.load.tr.v4i16.p1(ptr addrspace(1))
+declare <4 x half> @llvm.amdgcn.global.load.tr.v4f16.p1(ptr addrspace(1))
define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
; GFX12-SDAG-W64-LABEL: global_load_tr_b64:
@@ -34,7 +34,7 @@ define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrsp
; GFX12-GISEL-W64-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
- %val = call i32 @llvm.amdgcn.global.load.tr.b64.i32.p1(ptr addrspace(1) %gep)
+ %val = call i32 @llvm.amdgcn.global.load.tr.i32.p1(ptr addrspace(1) %gep)
store i32 %val, ptr addrspace(1) %use
ret void
}
@@ -67,7 +67,7 @@ define amdgpu_kernel void @global_load_tr_b128_i16(ptr addrspace(1) %addr, ptr a
; GFX12-GISEL-W64-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
- %val = call <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16.p1(ptr addrspace(1) %gep)
+ %val = call <4 x i16> @llvm.amdgcn.global.load.tr.v4i16.p1(ptr addrspace(1) %gep)
store <4 x i16> %val, ptr addrspace(1) %use
ret void
}
@@ -100,7 +100,7 @@ define amdgpu_kernel void @global_load_tr_b128_half(ptr addrspace(1) %addr, ptr
; GFX12-GISEL-W64-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
- %val = call <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16.p1(ptr addrspace(1) %gep)
+ %val = call <4 x half> @llvm.amdgcn.global.load.tr.v4f16.p1(ptr addrspace(1) %gep)
store <4 x half> %val, ptr addrspace(1) %use
ret void
}
>From 4659b3c2d07b1423a155c207fc237edd1c4e4934 Mon Sep 17 00:00:00 2001
From: Piotr Sobczak <piotr.sobczak at amd.com>
Date: Fri, 12 Jan 2024 13:58:38 +0100
Subject: [PATCH 3/5] Appease clang formatter
---
clang/lib/CodeGen/CGBuiltin.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index f9794ebd6be33bc..16b57a6ba51b019 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18213,7 +18213,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
}
llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
- llvm::Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_global_load_tr, {ArgTy});
+ llvm::Function *F =
+ CGM.getIntrinsic(Intrinsic::amdgcn_global_load_tr, {ArgTy});
return Builder.CreateCall(F, {Addr});
}
case AMDGPU::BI__builtin_amdgcn_read_exec:
>From 36f2a2ce6086ece4d942e00837103223f44136bd Mon Sep 17 00:00:00 2001
From: Piotr Sobczak <piotr.sobczak at amd.com>
Date: Mon, 15 Jan 2024 11:43:27 +0100
Subject: [PATCH 4/5] Support bfloat in the intrinsic
---
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 14 ++++----
llvm/lib/Target/AMDGPU/FLATInstructions.td | 2 ++
.../AMDGPU/llvm.amdgcn.global.load.tr-w32.ll | 34 +++++++++++++++++++
.../AMDGPU/llvm.amdgcn.global.load.tr-w64.ll | 34 +++++++++++++++++++
4 files changed, 78 insertions(+), 6 deletions(-)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 3a419b4b56caf2a..92fe24f15c27bf4 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2506,13 +2506,15 @@ class AMDGPUGlobalLoadTr<LLVMType data_ty> :
>;
// Wave32
-// <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1)) -> global_load_tr_b64
-// <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1)) -> global_load_tr_b128
-// <8 x half> @llvm.amdgcn.global.load.tr.v8f16(ptr addrspace(1)) -> global_load_tr_b128
+// <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1)) -> global_load_tr_b64
+// <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1)) -> global_load_tr_b128
+// <8 x half> @llvm.amdgcn.global.load.tr.v8f16(ptr addrspace(1)) -> global_load_tr_b128
+// <8 x bfloat> @llvm.amdgcn.global.load.tr.v8bf16(ptr addrspace(1)) -> global_load_tr_b128
// Wave64
-// i32 @llvm.amdgcn.global.load.tr.i32(ptr addrspace(1)) -> global_load_tr_b64
-// <4 x i16> @llvm.amdgcn.global.load.tr.v4i16(ptr addrspace(1)) -> global_load_tr_b128
-// <4 x half> @llvm.amdgcn.global.load.tr.v4f16(ptr addrspace(1)) -> global_load_tr_b128
+// i32 @llvm.amdgcn.global.load.tr.i32(ptr addrspace(1)) -> global_load_tr_b64
+// <4 x i16> @llvm.amdgcn.global.load.tr.v4i16(ptr addrspace(1)) -> global_load_tr_b128
+// <4 x half> @llvm.amdgcn.global.load.tr.v4f16(ptr addrspace(1)) -> global_load_tr_b128
+// <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16(ptr addrspace(1)) -> global_load_tr_b128
def int_amdgcn_global_load_tr : AMDGPUGlobalLoadTr<llvm_any_ty>;
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index c6df1c4132f9400..fd1ed6b2ef999d8 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1575,11 +1575,13 @@ let OtherPredicates = [isGFX12Plus] in {
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w32, int_amdgcn_global_load_tr, v2i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr, v8i16>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr, v8f16>;
+ defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr, v8bf16>;
}
let WaveSizePredicate = isWave64 in {
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w64, int_amdgcn_global_load_tr, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr, v4i16>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr, v4f16>;
+ defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr, v4bf16>;
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
index 398c84b9f79aec1..0c8aa734393be8d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
@@ -5,6 +5,7 @@
declare <2 x i32> @llvm.amdgcn.global.load.tr.v2i32.p1(ptr addrspace(1))
declare <8 x i16> @llvm.amdgcn.global.load.tr.v8i16.p1(ptr addrspace(1))
declare <8 x half> @llvm.amdgcn.global.load.tr.v8f16.p1(ptr addrspace(1))
+declare <8 x bfloat> @llvm.amdgcn.global.load.tr.v8bf16.p1(ptr addrspace(1))
define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
; GFX12-SDAG-W32-LABEL: global_load_tr_b64:
@@ -104,3 +105,36 @@ entry:
store <8 x half> %val, ptr addrspace(1) %use
ret void
}
+
+define amdgpu_kernel void @global_load_tr_b128_bfloat(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
+; GFX12-SDAG-W32-LABEL: global_load_tr_b128_bfloat:
+; GFX12-SDAG-W32: ; %bb.0: ; %entry
+; GFX12-SDAG-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-SDAG-W32-NEXT: v_mov_b32_e32 v4, 0
+; GFX12-SDAG-W32-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 th:TH_LOAD_NT
+; GFX12-SDAG-W32-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-W32-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-SDAG-W32-NEXT: global_store_b128 v4, v[0:3], s[2:3]
+; GFX12-SDAG-W32-NEXT: s_nop 0
+; GFX12-SDAG-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-W32-NEXT: s_endpgm
+;
+; GFX12-GISEL-W32-LABEL: global_load_tr_b128_bfloat:
+; GFX12-GISEL-W32: ; %bb.0: ; %entry
+; GFX12-GISEL-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-GISEL-W32-NEXT: v_mov_b32_e32 v4, 0
+; GFX12-GISEL-W32-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 th:TH_LOAD_NT
+; GFX12-GISEL-W32-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-W32-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-GISEL-W32-NEXT: global_store_b128 v4, v[0:3], s[2:3]
+; GFX12-GISEL-W32-NEXT: s_nop 0
+; GFX12-GISEL-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-W32-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
+ %val = call <8 x bfloat> @llvm.amdgcn.global.load.tr.v8bf16.p1(ptr addrspace(1) %gep)
+ store <8 x bfloat> %val, ptr addrspace(1) %use
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
index 04151cd2db2e0b8..6d77c7f08bb11d6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
@@ -5,6 +5,7 @@
declare i32 @llvm.amdgcn.global.load.tr.i32.p1(ptr addrspace(1))
declare <4 x i16> @llvm.amdgcn.global.load.tr.v4i16.p1(ptr addrspace(1))
declare <4 x half> @llvm.amdgcn.global.load.tr.v4f16.p1(ptr addrspace(1))
+declare <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16.p1(ptr addrspace(1))
define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
; GFX12-SDAG-W64-LABEL: global_load_tr_b64:
@@ -104,3 +105,36 @@ entry:
store <4 x half> %val, ptr addrspace(1) %use
ret void
}
+
+define amdgpu_kernel void @global_load_tr_b128_bfloat(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
+; GFX12-SDAG-W64-LABEL: global_load_tr_b128_bfloat:
+; GFX12-SDAG-W64: ; %bb.0: ; %entry
+; GFX12-SDAG-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-SDAG-W64-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-SDAG-W64-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT
+; GFX12-SDAG-W64-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-W64-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-SDAG-W64-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-SDAG-W64-NEXT: s_nop 0
+; GFX12-SDAG-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-W64-NEXT: s_endpgm
+;
+; GFX12-GISEL-W64-LABEL: global_load_tr_b128_bfloat:
+; GFX12-GISEL-W64: ; %bb.0: ; %entry
+; GFX12-GISEL-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-GISEL-W64-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-W64-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT
+; GFX12-GISEL-W64-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-W64-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-GISEL-W64-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-GISEL-W64-NEXT: s_nop 0
+; GFX12-GISEL-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-W64-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
+ %val = call <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16.p1(ptr addrspace(1) %gep)
+ store <4 x bfloat> %val, ptr addrspace(1) %use
+ ret void
+}
>From 5ca2539cd7695c1217fa2a2d311a8af6913b1a7a Mon Sep 17 00:00:00 2001
From: Piotr Sobczak <piotr.sobczak at amd.com>
Date: Tue, 16 Jan 2024 12:30:33 +0100
Subject: [PATCH 5/5] Drop the global from intrinsic and use llvm_anyptr_ty
---
clang/lib/CodeGen/CGBuiltin.cpp | 3 +--
.../builtins-amdgcn-global-load-tr-w32.cl | 6 ++---
.../builtins-amdgcn-global-load-tr-w64.cl | 6 ++---
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 24 +++++++++----------
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 2 +-
llvm/lib/Target/AMDGPU/FLATInstructions.td | 16 ++++++-------
.../AMDGPU/llvm.amdgcn.global.load.tr-w32.ll | 16 ++++++-------
.../AMDGPU/llvm.amdgcn.global.load.tr-w64.ll | 16 ++++++-------
8 files changed, 44 insertions(+), 45 deletions(-)
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 16b57a6ba51b019..c9d023a2e8ec9d6 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18211,10 +18211,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
llvm::Type::getInt16Ty(getLLVMContext()), 8);
break;
}
-
llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
llvm::Function *F =
- CGM.getIntrinsic(Intrinsic::amdgcn_global_load_tr, {ArgTy});
+ CGM.getIntrinsic(Intrinsic::amdgcn_load_tr, {ArgTy, Addr->getType()});
return Builder.CreateCall(F, {Addr});
}
case AMDGPU::BI__builtin_amdgcn_read_exec:
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl
index b5fcad68a470204..93eafcce021fbe4 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl
@@ -14,7 +14,7 @@ typedef short v8s __attribute__((ext_vector_type(8)));
// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_v2i32(
// CHECK-GFX1200-NEXT: entry:
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.load.tr.v2i32.p1(ptr addrspace(1) [[INPTR:%.*]])
// CHECK-GFX1200-NEXT: ret <2 x i32> [[TMP0]]
//
v2i test_amdgcn_global_load_tr_v2i32(global v2i* inptr)
@@ -28,7 +28,7 @@ v2i test_amdgcn_global_load_tr_v2i32(global v2i* inptr)
// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_v8i16(
// CHECK-GFX1200-NEXT: entry:
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.load.tr.v8i16.p1(ptr addrspace(1) [[INPTR:%.*]])
// CHECK-GFX1200-NEXT: ret <8 x i16> [[TMP0]]
//
v8s test_amdgcn_global_load_tr_v8i16(global v8s* inptr)
@@ -38,7 +38,7 @@ v8s test_amdgcn_global_load_tr_v8i16(global v8s* inptr)
// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_v8f16(
// CHECK-GFX1200-NEXT: entry:
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.global.load.tr.v8f16(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.load.tr.v8f16.p1(ptr addrspace(1) [[INPTR:%.*]])
// CHECK-GFX1200-NEXT: ret <8 x half> [[TMP0]]
//
v8h test_amdgcn_global_load_tr_v8f16(global v8h* inptr)
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl
index 9c48ac071b4d3fa..959705b1eefee37 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl
@@ -13,7 +13,7 @@ typedef short v4s __attribute__((ext_vector_type(4)));
// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_i32(
// CHECK-GFX1200-NEXT: entry:
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.global.load.tr.i32(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.load.tr.i32.p1(ptr addrspace(1) [[INPTR:%.*]])
// CHECK-GFX1200-NEXT: ret i32 [[TMP0]]
//
int test_amdgcn_global_load_tr_i32(global int* inptr)
@@ -27,7 +27,7 @@ int test_amdgcn_global_load_tr_i32(global int* inptr)
// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_v4i16(
// CHECK-GFX1200-NEXT: entry:
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.global.load.tr.v4i16(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.load.tr.v4i16.p1(ptr addrspace(1) [[INPTR:%.*]])
// CHECK-GFX1200-NEXT: ret <4 x i16> [[TMP0]]
//
v4s test_amdgcn_global_load_tr_v4i16(global v4s* inptr)
@@ -37,7 +37,7 @@ v4s test_amdgcn_global_load_tr_v4i16(global v4s* inptr)
// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_v4f16(
// CHECK-GFX1200-NEXT: entry:
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.global.load.tr.v4f16(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.load.tr.v4f16.p1(ptr addrspace(1) [[INPTR:%.*]])
// CHECK-GFX1200-NEXT: ret <4 x half> [[TMP0]]
//
v4h test_amdgcn_global_load_tr_v4f16(global v4h* inptr)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 92fe24f15c27bf4..cd2905fff41a7bb 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2496,27 +2496,27 @@ def int_amdgcn_flat_atomic_fmax_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_global_atomic_fmin_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_global_atomic_fmax_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
-class AMDGPUGlobalLoadTr<LLVMType data_ty> :
+class AMDGPULoadTr :
Intrinsic<
- [data_ty],
- [global_ptr_ty],
+ [llvm_any_ty],
+ [llvm_anyptr_ty],
[IntrReadMem, IntrWillReturn, IntrConvergent, NoCapture<ArgIndex<0>>, IntrNoCallback, IntrNoFree],
"",
[SDNPMemOperand]
>;
// Wave32
-// <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1)) -> global_load_tr_b64
-// <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1)) -> global_load_tr_b128
-// <8 x half> @llvm.amdgcn.global.load.tr.v8f16(ptr addrspace(1)) -> global_load_tr_b128
-// <8 x bfloat> @llvm.amdgcn.global.load.tr.v8bf16(ptr addrspace(1)) -> global_load_tr_b128
+// <2 x i32> @llvm.amdgcn.load.tr.v2i32.p1(ptr addrspace(1)) -> global_load_tr_b64
+// <8 x i16> @llvm.amdgcn.load.tr.v8i16.p1(ptr addrspace(1)) -> global_load_tr_b128
+// <8 x half> @llvm.amdgcn.load.tr.v8f16.p1(ptr addrspace(1)) -> global_load_tr_b128
+// <8 x bfloat> @llvm.amdgcn.load.tr.v8bf16.p1(ptr addrspace(1)) -> global_load_tr_b128
// Wave64
-// i32 @llvm.amdgcn.global.load.tr.i32(ptr addrspace(1)) -> global_load_tr_b64
-// <4 x i16> @llvm.amdgcn.global.load.tr.v4i16(ptr addrspace(1)) -> global_load_tr_b128
-// <4 x half> @llvm.amdgcn.global.load.tr.v4f16(ptr addrspace(1)) -> global_load_tr_b128
-// <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16(ptr addrspace(1)) -> global_load_tr_b128
+// i32 @llvm.amdgcn.load.tr.i32.p1(ptr addrspace(1)) -> global_load_tr_b64
+// <4 x i16> @llvm.amdgcn.load.tr.v4i16.p1(ptr addrspace(1)) -> global_load_tr_b128
+// <4 x half> @llvm.amdgcn.load.tr.v4f16.p1(ptr addrspace(1)) -> global_load_tr_b128
+// <4 x bfloat> @llvm.amdgcn.load.tr.v4bf16.p1(ptr addrspace(1)) -> global_load_tr_b128
-def int_amdgcn_global_load_tr : AMDGPUGlobalLoadTr<llvm_any_ty>;
+def int_amdgcn_load_tr : AMDGPULoadTr;
//===----------------------------------------------------------------------===//
// Deep learning intrinsics.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 410dd352a8459a6..1051204a1043cf5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4837,7 +4837,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
- case Intrinsic::amdgcn_global_load_tr:
+ case Intrinsic::amdgcn_load_tr:
return getDefaultMappingAllVGPR(MI);
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap:
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index fd1ed6b2ef999d8..b11f179f23a7dcd 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1572,16 +1572,16 @@ let OtherPredicates = [isGFX12Plus] in {
defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_ORDERED_ADD_B64", "int_amdgcn_global_atomic_ordered_add_b64", i64, i64, /* isIntr */ 1>;
let WaveSizePredicate = isWave32 in {
- defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w32, int_amdgcn_global_load_tr, v2i32>;
- defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr, v8i16>;
- defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr, v8f16>;
- defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr, v8bf16>;
+ defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w32, int_amdgcn_load_tr, v2i32>;
+ defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_load_tr, v8i16>;
+ defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_load_tr, v8f16>;
+ defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_load_tr, v8bf16>;
}
let WaveSizePredicate = isWave64 in {
- defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w64, int_amdgcn_global_load_tr, i32>;
- defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr, v4i16>;
- defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr, v4f16>;
- defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr, v4bf16>;
+ defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w64, int_amdgcn_load_tr, i32>;
+ defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_load_tr, v4i16>;
+ defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_load_tr, v4f16>;
+ defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_load_tr, v4bf16>;
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
index 0c8aa734393be8d..f655ab921e4d603 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
@@ -2,10 +2,10 @@
; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-SDAG-W32 %s
; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-GISEL-W32 %s
-declare <2 x i32> @llvm.amdgcn.global.load.tr.v2i32.p1(ptr addrspace(1))
-declare <8 x i16> @llvm.amdgcn.global.load.tr.v8i16.p1(ptr addrspace(1))
-declare <8 x half> @llvm.amdgcn.global.load.tr.v8f16.p1(ptr addrspace(1))
-declare <8 x bfloat> @llvm.amdgcn.global.load.tr.v8bf16.p1(ptr addrspace(1))
+declare <2 x i32> @llvm.amdgcn.load.tr.v2i32.p1(ptr addrspace(1))
+declare <8 x i16> @llvm.amdgcn.load.tr.v8i16.p1(ptr addrspace(1))
+declare <8 x half> @llvm.amdgcn.load.tr.v8f16.p1(ptr addrspace(1))
+declare <8 x bfloat> @llvm.amdgcn.load.tr.v8bf16.p1(ptr addrspace(1))
define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
; GFX12-SDAG-W32-LABEL: global_load_tr_b64:
@@ -35,7 +35,7 @@ define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrsp
; GFX12-GISEL-W32-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
- %val = call <2 x i32> @llvm.amdgcn.global.load.tr.v2i32.p1(ptr addrspace(1) %gep)
+ %val = call <2 x i32> @llvm.amdgcn.load.tr.v2i32.p1(ptr addrspace(1) %gep)
store <2 x i32> %val, ptr addrspace(1) %use
ret void
}
@@ -68,7 +68,7 @@ define amdgpu_kernel void @global_load_tr_b128_i16(ptr addrspace(1) %addr, ptr a
; GFX12-GISEL-W32-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
- %val = call <8 x i16> @llvm.amdgcn.global.load.tr.v8i16.p1(ptr addrspace(1) %gep)
+ %val = call <8 x i16> @llvm.amdgcn.load.tr.v8i16.p1(ptr addrspace(1) %gep)
store <8 x i16> %val, ptr addrspace(1) %use
ret void
}
@@ -101,7 +101,7 @@ define amdgpu_kernel void @global_load_tr_b128_half(ptr addrspace(1) %addr, ptr
; GFX12-GISEL-W32-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
- %val = call <8 x half> @llvm.amdgcn.global.load.tr.v8f16.p1(ptr addrspace(1) %gep)
+ %val = call <8 x half> @llvm.amdgcn.load.tr.v8f16.p1(ptr addrspace(1) %gep)
store <8 x half> %val, ptr addrspace(1) %use
ret void
}
@@ -134,7 +134,7 @@ define amdgpu_kernel void @global_load_tr_b128_bfloat(ptr addrspace(1) %addr, pt
; GFX12-GISEL-W32-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
- %val = call <8 x bfloat> @llvm.amdgcn.global.load.tr.v8bf16.p1(ptr addrspace(1) %gep)
+ %val = call <8 x bfloat> @llvm.amdgcn.load.tr.v8bf16.p1(ptr addrspace(1) %gep)
store <8 x bfloat> %val, ptr addrspace(1) %use
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
index 6d77c7f08bb11d6..88930661698dd37 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
@@ -2,10 +2,10 @@
; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-SDAG-W64 %s
; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-GISEL-W64 %s
-declare i32 @llvm.amdgcn.global.load.tr.i32.p1(ptr addrspace(1))
-declare <4 x i16> @llvm.amdgcn.global.load.tr.v4i16.p1(ptr addrspace(1))
-declare <4 x half> @llvm.amdgcn.global.load.tr.v4f16.p1(ptr addrspace(1))
-declare <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16.p1(ptr addrspace(1))
+declare i32 @llvm.amdgcn.load.tr.i32.p1(ptr addrspace(1))
+declare <4 x i16> @llvm.amdgcn.load.tr.v4i16.p1(ptr addrspace(1))
+declare <4 x half> @llvm.amdgcn.load.tr.v4f16.p1(ptr addrspace(1))
+declare <4 x bfloat> @llvm.amdgcn.load.tr.v4bf16.p1(ptr addrspace(1))
define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
; GFX12-SDAG-W64-LABEL: global_load_tr_b64:
@@ -35,7 +35,7 @@ define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrsp
; GFX12-GISEL-W64-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
- %val = call i32 @llvm.amdgcn.global.load.tr.i32.p1(ptr addrspace(1) %gep)
+ %val = call i32 @llvm.amdgcn.load.tr.i32.p1(ptr addrspace(1) %gep)
store i32 %val, ptr addrspace(1) %use
ret void
}
@@ -68,7 +68,7 @@ define amdgpu_kernel void @global_load_tr_b128_i16(ptr addrspace(1) %addr, ptr a
; GFX12-GISEL-W64-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
- %val = call <4 x i16> @llvm.amdgcn.global.load.tr.v4i16.p1(ptr addrspace(1) %gep)
+ %val = call <4 x i16> @llvm.amdgcn.load.tr.v4i16.p1(ptr addrspace(1) %gep)
store <4 x i16> %val, ptr addrspace(1) %use
ret void
}
@@ -101,7 +101,7 @@ define amdgpu_kernel void @global_load_tr_b128_half(ptr addrspace(1) %addr, ptr
; GFX12-GISEL-W64-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
- %val = call <4 x half> @llvm.amdgcn.global.load.tr.v4f16.p1(ptr addrspace(1) %gep)
+ %val = call <4 x half> @llvm.amdgcn.load.tr.v4f16.p1(ptr addrspace(1) %gep)
store <4 x half> %val, ptr addrspace(1) %use
ret void
}
@@ -134,7 +134,7 @@ define amdgpu_kernel void @global_load_tr_b128_bfloat(ptr addrspace(1) %addr, pt
; GFX12-GISEL-W64-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
- %val = call <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16.p1(ptr addrspace(1) %gep)
+ %val = call <4 x bfloat> @llvm.amdgcn.load.tr.v4bf16.p1(ptr addrspace(1) %gep)
store <4 x bfloat> %val, ptr addrspace(1) %use
ret void
}
More information about the cfe-commits mailing list