[clang] [llvm] [AMDGPU] Add suffix _d4 to tensor load/store with 4 groups D#, NFC (PR #184176)
Changpeng Fang via cfe-commits
cfe-commits at lists.llvm.org
Tue Mar 3 13:02:55 PST 2026
https://github.com/changpeng updated https://github.com/llvm/llvm-project/pull/184176
>From 0da7138b70532ac54dee9978075d4b3d246df248 Mon Sep 17 00:00:00 2001
From: Changpeng Fang <changpeng.fang at amd.com>
Date: Mon, 2 Mar 2026 08:57:08 -0800
Subject: [PATCH 1/3] [AMDGPU] Add suffix _D4 to tensor load/store with 4
groups D#, NFC
Rename TENSOR_LOAD_TO_LDS to TENSOR_LOAD_TO_LDS_D4
Rename TENSOR_STORE_FROM_LDS to TENSOR_STORE_FROM_LDS_D4
Also rename function names in a couple of tests to reflect this change.
---
...iltins-amdgcn-gfx1250-tensor-load-store.cl | 8 ++++----
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 2 +-
.../AMDGPU/AMDGPUInstructionSelector.cpp | 2 +-
llvm/lib/Target/AMDGPU/MIMGInstructions.td | 8 ++++----
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 12 +++++------
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 4 ++--
.../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 4 ++--
.../AMDGPU/llvm.amdgcn.tensor.load.store.ll | 20 +++++++++----------
8 files changed, 30 insertions(+), 30 deletions(-)
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-tensor-load-store.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-tensor-load-store.cl
index cb106805d24bd..5c97067cdd971 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-tensor-load-store.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-tensor-load-store.cl
@@ -8,12 +8,12 @@ typedef int v8i __attribute__((ext_vector_type(8)));
static v4i v4i_zeros = (v4i){0,0,0,0};
static v8i v8i_zeros = (v8i){0,0,0,0,0,0,0,0};
-// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_load_to_lds(
+// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_load_to_lds_d4(
// CHECK-GFX1250-NEXT: entry:
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> [[SG2:%.*]], <4 x i32> [[SG3:%.*]], <8 x i32> zeroinitializer, i32 0)
// CHECK-GFX1250-NEXT: ret void
//
-void test_amdgcn_tensor_load_to_lds(v4i sg0, v8i sg1, v4i sg2, v4i sg3)
+void test_amdgcn_tensor_load_to_lds_d4(v4i sg0, v8i sg1, v4i sg2, v4i sg3)
{
__builtin_amdgcn_tensor_load_to_lds(sg0, sg1, sg2, sg3, v8i_zeros, 0);
}
@@ -28,12 +28,12 @@ void test_amdgcn_tensor_load_to_lds_d2(v4i sg0, v8i sg1)
__builtin_amdgcn_tensor_load_to_lds(sg0, sg1, v4i_zeros, v4i_zeros, v8i_zeros, 27);
}
-// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_store_from_lds(
+// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_store_from_lds_d4(
// CHECK-GFX1250-NEXT: entry:
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> [[SG2:%.*]], <4 x i32> [[SG3:%.*]], <8 x i32> zeroinitializer, i32 22)
// CHECK-GFX1250-NEXT: ret void
//
-void test_amdgcn_tensor_store_from_lds(v4i sg0, v8i sg1, v4i sg2, v4i sg3)
+void test_amdgcn_tensor_store_from_lds_d4(v4i sg0, v8i sg1, v4i sg2, v4i sg3)
{
__builtin_amdgcn_tensor_store_from_lds(sg0, sg1, sg2, sg3, v8i_zeros, 22);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 4acda590ed5b6..a7324417ea151 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -3006,7 +3006,7 @@ void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N, unsigned IntrID) {
void AMDGPUDAGToDAGISel::SelectTensorLoadStore(SDNode *N, unsigned IntrID) {
bool IsLoad = IntrID == Intrinsic::amdgcn_tensor_load_to_lds;
unsigned Opc =
- IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS : AMDGPU::TENSOR_STORE_FROM_LDS;
+ IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_D4 : AMDGPU::TENSOR_STORE_FROM_LDS_D4;
SmallVector<SDValue, 7> TensorOps;
// First two groups
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 3ae638f14ee40..5a6676e58f23c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3794,7 +3794,7 @@ bool AMDGPUInstructionSelector::selectTensorLoadStore(MachineInstr &MI,
Intrinsic::ID IID) const {
bool IsLoad = IID == Intrinsic::amdgcn_tensor_load_to_lds;
unsigned Opc =
- IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS : AMDGPU::TENSOR_STORE_FROM_LDS;
+ IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_D4 : AMDGPU::TENSOR_STORE_FROM_LDS_D4;
int NumGroups = 4;
// A lamda function to check whether an operand is a vector of all 0s.
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index 0521e199c31dd..6b37a87ba44d0 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -2052,7 +2052,7 @@ def : MIMGG16Mapping<IMAGE_SAMPLE_C_CD_CL_O_nortn, IMAGE_SAMPLE_C_CD_CL_O_G16_no
class VIMAGE_TENSOR_Pseudo<string opName, bit _UpTo2D = 0> :
InstSI<(outs ), (ins ), "", []>,
- SIMCInstr<opName#!if(_UpTo2D, "_D2", ""), SIEncodingFamily.NONE> {
+ SIMCInstr<opName#!if(_UpTo2D, "_D2", "_D4"), SIEncodingFamily.NONE> {
let isPseudo = 1;
let isCodeGenOnly = 1;
@@ -2077,8 +2077,8 @@ class VIMAGE_TENSOR_Pseudo<string opName, bit _UpTo2D = 0> :
}
let SubtargetPredicate = isGFX125xOnly in {
-def TENSOR_LOAD_TO_LDS : VIMAGE_TENSOR_Pseudo<"tensor_load_to_lds">;
-def TENSOR_STORE_FROM_LDS : VIMAGE_TENSOR_Pseudo<"tensor_store_from_lds">;
+def TENSOR_LOAD_TO_LDS_D4 : VIMAGE_TENSOR_Pseudo<"tensor_load_to_lds">;
+def TENSOR_STORE_FROM_LDS_D4 : VIMAGE_TENSOR_Pseudo<"tensor_store_from_lds">;
def TENSOR_LOAD_TO_LDS_D2 : VIMAGE_TENSOR_Pseudo<"tensor_load_to_lds", 1>;
def TENSOR_STORE_FROM_LDS_D2 : VIMAGE_TENSOR_Pseudo<"tensor_store_from_lds", 1>;
} // End SubtargetPredicate = isGFX125xOnly.
@@ -2114,7 +2114,7 @@ class VIMAGE_TENSOR_Real <bits<8> op, VIMAGE_TENSOR_Pseudo ps, string opName = p
multiclass VIMAGE_TENSOR_Real_gfx1250<bits<8> op> {
let AssemblerPredicate = isGFX125xOnly, DecoderNamespace = "GFX1250" in {
- foreach DSuffix = ["_D2", ""] in {
+ foreach DSuffix = ["_D2", "_D4"] in {
defvar ps = !cast<VIMAGE_TENSOR_Pseudo>(NAME # DSuffix);
def DSuffix # _gfx1250 : VIMAGE_TENSOR_Real<op, ps, ps.Mnemonic>,
SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX1250>;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 41608cfb3457b..f24d2bd4678ad 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7530,12 +7530,12 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
return nullptr;
}
- // Legalize TENSOR_LOAD_TO_LDS, TENSOR_LOAD_TO_LDS_D2, TENSOR_STORE_FROM_LDS,
- // TENSOR_STORE_FROM_LDS_D2. All their operands are scalar.
- if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS ||
- MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_D2 ||
- MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS ||
- MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_D2) {
+ // Legalize TENSOR_LOAD_TO_LDS_D2/_D4, TENSOR_STORE_FROM_LDS_D2/_D4. All their
+ // operands are scalar.
+ if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_D2 ||
+ MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_D4 ||
+ MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_D2 ||
+ MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_D4) {
for (MachineOperand &Src : MI.explicit_operands()) {
if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index dd4c97f0c5746..2fb408c06d535 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -827,8 +827,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
unsigned Opc = MI.getOpcode();
// Exclude instructions that read FROM LDS (not write to it)
return isLDSDMA(MI) && Opc != AMDGPU::BUFFER_STORE_LDS_DWORD &&
- Opc != AMDGPU::TENSOR_STORE_FROM_LDS &&
- Opc != AMDGPU::TENSOR_STORE_FROM_LDS_D2;
+ Opc != AMDGPU::TENSOR_STORE_FROM_LDS_D2 &&
+ Opc != AMDGPU::TENSOR_STORE_FROM_LDS_D4;
}
static bool isSBarrierSCCWrite(unsigned Opcode) {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index c1337f27a0f70..1c4380d8cce43 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -773,8 +773,8 @@ bool isAsyncStore(unsigned Opc) {
}
bool isTensorStore(unsigned Opc) {
- return Opc == TENSOR_STORE_FROM_LDS_gfx1250 ||
- Opc == TENSOR_STORE_FROM_LDS_D2_gfx1250;
+ return Opc == TENSOR_STORE_FROM_LDS_D2_gfx1250 ||
+ Opc == TENSOR_STORE_FROM_LDS_D4_gfx1250;
}
unsigned getTemporalHintType(const MCInstrDesc TID) {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tensor.load.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tensor.load.store.ll
index a8bba2e384377..2ce533c299dce 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tensor.load.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tensor.load.store.ll
@@ -6,8 +6,8 @@
declare void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> %D0, <8 x i32> %D1, <4 x i32> %D2, <4 x i32> %D3, <8 x i32> %D4, i32 %cpol)
declare void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> %D0, <8 x i32> %D1, <4 x i32> %D2, <4 x i32> %D3, <8 x i32> %D4, i32 %cpol)
-define amdgpu_ps void @tensor_load_to_lds(<4 x i32> inreg %D0, <8 x i32> inreg %D1, <4 x i32> inreg %D2, <4 x i32> inreg %D3) {
-; GFX1250-LABEL: tensor_load_to_lds:
+define amdgpu_ps void @tensor_load_to_lds_d4(<4 x i32> inreg %D0, <8 x i32> inreg %D1, <4 x i32> inreg %D2, <4 x i32> inreg %D3) {
+; GFX1250-LABEL: tensor_load_to_lds_d4:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19]
@@ -16,8 +16,8 @@ define amdgpu_ps void @tensor_load_to_lds(<4 x i32> inreg %D0, <8 x i32> inreg %
ret void
}
-define amdgpu_ps void @tensor_load_to_lds_vector(<4 x i32> %D0, <8 x i32> %D1, <4 x i32> %D2, <4 x i32> %D3) {
-; GFX1250-SDAG-LABEL: tensor_load_to_lds_vector:
+define amdgpu_ps void @tensor_load_to_lds_d4_vector(<4 x i32> %D0, <8 x i32> %D1, <4 x i32> %D2, <4 x i32> %D3) {
+; GFX1250-SDAG-LABEL: tensor_load_to_lds_d4_vector:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v4
@@ -44,7 +44,7 @@ define amdgpu_ps void @tensor_load_to_lds_vector(<4 x i32> %D0, <8 x i32> %D1, <
; GFX1250-SDAG-NEXT: tensor_load_to_lds s[8:11], s[0:7], s[12:15], s[16:19]
; GFX1250-SDAG-NEXT: s_endpgm
;
-; GFX1250-GISEL-LABEL: tensor_load_to_lds_vector:
+; GFX1250-GISEL-LABEL: tensor_load_to_lds_d4_vector:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s8, v0
@@ -126,8 +126,8 @@ define amdgpu_ps void @tensor_load_to_lds_d2_vector(<4 x i32> %D0, <8 x i32> %D1
ret void
}
-define amdgpu_ps void @tensor_store_from_lds(<4 x i32> inreg %D0, <8 x i32> inreg %D1, <4 x i32> inreg %D2, <4 x i32> inreg %D3) {
-; GFX1250-LABEL: tensor_store_from_lds:
+define amdgpu_ps void @tensor_store_from_lds_d4(<4 x i32> inreg %D0, <8 x i32> inreg %D1, <4 x i32> inreg %D2, <4 x i32> inreg %D3) {
+; GFX1250-LABEL: tensor_store_from_lds_d4:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_STORE_NT_HT scope:SCOPE_DEV
@@ -136,8 +136,8 @@ define amdgpu_ps void @tensor_store_from_lds(<4 x i32> inreg %D0, <8 x i32> inre
ret void
}
-define amdgpu_ps void @tensor_store_from_lds_vector(<4 x i32> %D0, <8 x i32> %D1, <4 x i32> %D2, <4 x i32> %D3) {
-; GFX1250-SDAG-LABEL: tensor_store_from_lds_vector:
+define amdgpu_ps void @tensor_store_from_lds_d4_vector(<4 x i32> %D0, <8 x i32> %D1, <4 x i32> %D2, <4 x i32> %D3) {
+; GFX1250-SDAG-LABEL: tensor_store_from_lds_d4_vector:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v4
@@ -164,7 +164,7 @@ define amdgpu_ps void @tensor_store_from_lds_vector(<4 x i32> %D0, <8 x i32> %D1
; GFX1250-SDAG-NEXT: tensor_store_from_lds s[8:11], s[0:7], s[12:15], s[16:19] th:TH_STORE_NT_HT scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_endpgm
;
-; GFX1250-GISEL-LABEL: tensor_store_from_lds_vector:
+; GFX1250-GISEL-LABEL: tensor_store_from_lds_d4_vector:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s8, v0
>From 92aaddbce5e594c3b1477a67005b9bbbad9b514d Mon Sep 17 00:00:00 2001
From: Changpeng Fang <changpeng.fang at amd.com>
Date: Tue, 3 Mar 2026 12:53:11 -0800
Subject: [PATCH 2/3] [AMDGPU] Compiler invented suffixes should use lowercase
---
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 6 +++---
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 6 +++---
llvm/lib/Target/AMDGPU/MIMGInstructions.td | 12 ++++++------
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 10 +++++-----
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 4 ++--
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 4 ++--
6 files changed, 21 insertions(+), 21 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index a7324417ea151..cc2058a5a1d4a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -3006,7 +3006,7 @@ void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N, unsigned IntrID) {
void AMDGPUDAGToDAGISel::SelectTensorLoadStore(SDNode *N, unsigned IntrID) {
bool IsLoad = IntrID == Intrinsic::amdgcn_tensor_load_to_lds;
unsigned Opc =
- IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_D4 : AMDGPU::TENSOR_STORE_FROM_LDS_D4;
+ IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d4 : AMDGPU::TENSOR_STORE_FROM_LDS_d4;
SmallVector<SDValue, 7> TensorOps;
// First two groups
@@ -3018,8 +3018,8 @@ void AMDGPUDAGToDAGISel::SelectTensorLoadStore(SDNode *N, unsigned IntrID) {
SDValue Group3 = N->getOperand(5);
if (ISD::isBuildVectorAllZeros(Group2.getNode()) &&
ISD::isBuildVectorAllZeros(Group3.getNode())) {
- Opc = IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_D2
- : AMDGPU::TENSOR_STORE_FROM_LDS_D2;
+ Opc = IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d2
+ : AMDGPU::TENSOR_STORE_FROM_LDS_d2;
} else { // Has at least 4 groups
TensorOps.push_back(Group2); // D# group 2
TensorOps.push_back(Group3); // D# group 3
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 5a6676e58f23c..61b70dc7585aa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3794,7 +3794,7 @@ bool AMDGPUInstructionSelector::selectTensorLoadStore(MachineInstr &MI,
Intrinsic::ID IID) const {
bool IsLoad = IID == Intrinsic::amdgcn_tensor_load_to_lds;
unsigned Opc =
- IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_D4 : AMDGPU::TENSOR_STORE_FROM_LDS_D4;
+ IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d4 : AMDGPU::TENSOR_STORE_FROM_LDS_d4;
int NumGroups = 4;
// A lamda function to check whether an operand is a vector of all 0s.
@@ -3808,8 +3808,8 @@ bool AMDGPUInstructionSelector::selectTensorLoadStore(MachineInstr &MI,
// Use _D2 version if both group 2 and 3 are zero-initialized.
if (isAllZeros(MI.getOperand(3)) && isAllZeros(MI.getOperand(4))) {
NumGroups = 2;
- Opc = IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_D2
- : AMDGPU::TENSOR_STORE_FROM_LDS_D2;
+ Opc = IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d2
+ : AMDGPU::TENSOR_STORE_FROM_LDS_d2;
}
// TODO: Handle the fifth group: MI.getOpetand(5), which is silently ignored
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index 6b37a87ba44d0..03159cf9398ca 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -2052,7 +2052,7 @@ def : MIMGG16Mapping<IMAGE_SAMPLE_C_CD_CL_O_nortn, IMAGE_SAMPLE_C_CD_CL_O_G16_no
class VIMAGE_TENSOR_Pseudo<string opName, bit _UpTo2D = 0> :
InstSI<(outs ), (ins ), "", []>,
- SIMCInstr<opName#!if(_UpTo2D, "_D2", "_D4"), SIEncodingFamily.NONE> {
+ SIMCInstr<opName#!if(_UpTo2D, "_d2", "_d4"), SIEncodingFamily.NONE> {
let isPseudo = 1;
let isCodeGenOnly = 1;
@@ -2077,10 +2077,10 @@ class VIMAGE_TENSOR_Pseudo<string opName, bit _UpTo2D = 0> :
}
let SubtargetPredicate = isGFX125xOnly in {
-def TENSOR_LOAD_TO_LDS_D4 : VIMAGE_TENSOR_Pseudo<"tensor_load_to_lds">;
-def TENSOR_STORE_FROM_LDS_D4 : VIMAGE_TENSOR_Pseudo<"tensor_store_from_lds">;
-def TENSOR_LOAD_TO_LDS_D2 : VIMAGE_TENSOR_Pseudo<"tensor_load_to_lds", 1>;
-def TENSOR_STORE_FROM_LDS_D2 : VIMAGE_TENSOR_Pseudo<"tensor_store_from_lds", 1>;
+def TENSOR_LOAD_TO_LDS_d4 : VIMAGE_TENSOR_Pseudo<"tensor_load_to_lds">;
+def TENSOR_STORE_FROM_LDS_d4 : VIMAGE_TENSOR_Pseudo<"tensor_store_from_lds">;
+def TENSOR_LOAD_TO_LDS_d2 : VIMAGE_TENSOR_Pseudo<"tensor_load_to_lds", 1>;
+def TENSOR_STORE_FROM_LDS_d2 : VIMAGE_TENSOR_Pseudo<"tensor_store_from_lds", 1>;
} // End SubtargetPredicate = isGFX125xOnly.
class VIMAGE_TENSOR_Real <bits<8> op, VIMAGE_TENSOR_Pseudo ps, string opName = ps.Mnemonic> :
@@ -2114,7 +2114,7 @@ class VIMAGE_TENSOR_Real <bits<8> op, VIMAGE_TENSOR_Pseudo ps, string opName = p
multiclass VIMAGE_TENSOR_Real_gfx1250<bits<8> op> {
let AssemblerPredicate = isGFX125xOnly, DecoderNamespace = "GFX1250" in {
- foreach DSuffix = ["_D2", "_D4"] in {
+ foreach DSuffix = ["_d2", "_d4"] in {
defvar ps = !cast<VIMAGE_TENSOR_Pseudo>(NAME # DSuffix);
def DSuffix # _gfx1250 : VIMAGE_TENSOR_Real<op, ps, ps.Mnemonic>,
SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX1250>;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index f24d2bd4678ad..7d9e621110330 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7530,12 +7530,12 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
return nullptr;
}
- // Legalize TENSOR_LOAD_TO_LDS_D2/_D4, TENSOR_STORE_FROM_LDS_D2/_D4. All their
+ // Legalize TENSOR_LOAD_TO_LDS_d2/_d4, TENSOR_STORE_FROM_LDS_d2/_d4. All their
// operands are scalar.
- if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_D2 ||
- MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_D4 ||
- MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_D2 ||
- MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_D4) {
+ if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_d2 ||
+ MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_d4 ||
+ MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_d2 ||
+ MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_d4) {
for (MachineOperand &Src : MI.explicit_operands()) {
if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 2fb408c06d535..f363560784730 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -827,8 +827,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
unsigned Opc = MI.getOpcode();
// Exclude instructions that read FROM LDS (not write to it)
return isLDSDMA(MI) && Opc != AMDGPU::BUFFER_STORE_LDS_DWORD &&
- Opc != AMDGPU::TENSOR_STORE_FROM_LDS_D2 &&
- Opc != AMDGPU::TENSOR_STORE_FROM_LDS_D4;
+ Opc != AMDGPU::TENSOR_STORE_FROM_LDS_d2 &&
+ Opc != AMDGPU::TENSOR_STORE_FROM_LDS_d4;
}
static bool isSBarrierSCCWrite(unsigned Opcode) {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 1c4380d8cce43..865770031bb06 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -773,8 +773,8 @@ bool isAsyncStore(unsigned Opc) {
}
bool isTensorStore(unsigned Opc) {
- return Opc == TENSOR_STORE_FROM_LDS_D2_gfx1250 ||
- Opc == TENSOR_STORE_FROM_LDS_D4_gfx1250;
+ return Opc == TENSOR_STORE_FROM_LDS_d2_gfx1250 ||
+ Opc == TENSOR_STORE_FROM_LDS_d4_gfx1250;
}
unsigned getTemporalHintType(const MCInstrDesc TID) {
>From d2b15f3351d6b8a11506cf9af9604afe352377fe Mon Sep 17 00:00:00 2001
From: Changpeng Fang <changpeng.fang at amd.com>
Date: Tue, 3 Mar 2026 13:01:59 -0800
Subject: [PATCH 3/3] [AMDGPU] Fix instruction suffixes for
reg-coalescer-subreg-liveness.mir
---
.../CodeGen/AMDGPU/reg-coalescer-subreg-liveness.mir | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/reg-coalescer-subreg-liveness.mir b/llvm/test/CodeGen/AMDGPU/reg-coalescer-subreg-liveness.mir
index f098618018839..00c4ec981111e 100644
--- a/llvm/test/CodeGen/AMDGPU/reg-coalescer-subreg-liveness.mir
+++ b/llvm/test/CodeGen/AMDGPU/reg-coalescer-subreg-liveness.mir
@@ -18,7 +18,7 @@ body: |
; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 1
; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 0
; CHECK-NEXT: undef [[S_MOV_B32_1:%[0-9]+]].sub0:sgpr_256 = S_MOV_B32 0
- ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_D2 [[S_MOV_B32_]], [[S_MOV_B32_1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+ ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2 [[S_MOV_B32_]], [[S_MOV_B32_1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 1
; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0
; CHECK-NEXT: {{ $}}
@@ -27,8 +27,8 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub2:sgpr_128 = COPY [[S_MOV_B32_]].sub0
; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub3:sgpr_128 = COPY [[S_MOV_B32_]].sub0
- ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_D2 [[S_MOV_B32_]], [[S_MOV_B32_1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
- ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_D2 [[S_LOAD_DWORD_IMM]], [[S_MOV_B32_1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+ ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2 [[S_MOV_B32_]], [[S_MOV_B32_1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+ ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_d2 [[S_LOAD_DWORD_IMM]], [[S_MOV_B32_1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
; CHECK-NEXT: $vcc_lo = COPY $exec_lo
; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = S_MOV_B32 0
; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 1
@@ -47,7 +47,7 @@ body: |
undef %3.sub0:sgpr_128 = COPY %2
%4:sreg_32 = S_MOV_B32 0
undef %5.sub0:sgpr_256 = COPY %4
- TENSOR_LOAD_TO_LDS_D2 %3, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+ TENSOR_LOAD_TO_LDS_d2 %3, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
%6:sgpr_128 = COPY killed %3
%6.sub1:sgpr_128 = COPY killed %1
%7:sreg_32 = COPY $exec_lo
@@ -62,11 +62,11 @@ body: |
%11.sub1:sgpr_128 = COPY killed %10
%11.sub2:sgpr_128 = COPY %2
%11.sub3:sgpr_128 = COPY %2
- TENSOR_LOAD_TO_LDS_D2 killed %11, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+ TENSOR_LOAD_TO_LDS_d2 killed %11, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
%12:sreg_32 = COPY killed %9
%13:sgpr_128 = COPY %6
%13.sub2:sgpr_128 = COPY killed %12
- TENSOR_LOAD_TO_LDS_D2 killed %13, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+ TENSOR_LOAD_TO_LDS_d2 killed %13, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
$vcc_lo = COPY %7
%8:sreg_32 = COPY %4
%9:sreg_32 = COPY %2
More information about the cfe-commits
mailing list