[llvm] 2cb5241 - Revert "[AMDGPU][True16][CodeGen] FLAT_load using D16 pseudo instruction (#114500)"
Nikita Popov via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 18 08:16:22 PST 2025
Author: Nikita Popov
Date: 2025-02-18T17:16:12+01:00
New Revision: 2cb5241c7724cbf4fa02b25547e098bfc635aba9
URL: https://github.com/llvm/llvm-project/commit/2cb5241c7724cbf4fa02b25547e098bfc635aba9
DIFF: https://github.com/llvm/llvm-project/commit/2cb5241c7724cbf4fa02b25547e098bfc635aba9.diff
LOG: Revert "[AMDGPU][True16][CodeGen] FLAT_load using D16 pseudo instruction (#114500)"
This reverts commit f7a5f067885b7f6cc4a000c8392adf6b777a9108.
Fails to build with:
llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp:126:37: error: no member named 'OPERAND_LAST' in 'llvm::AMDGPU::OpName'
126 | uint16_t OpName = AMDGPU::OpName::OPERAND_LAST;
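For reference, a minimal self-contained sketch of this class of error, under the assumption (suggested by the diagnostic) that the AMDGPU::OpName scope no longer declares an OPERAND_LAST member at this point in the tree; the enum body below is an illustrative stand-in, not the real TableGen-generated definition:

  #include <cstdint>

  // Illustrative stand-in for the generated AMDGPU::OpName scope; assume
  // the real member list no longer contains an OPERAND_LAST sentinel.
  namespace llvm::AMDGPU::OpName {
  enum : uint16_t { vdst, vdata, data0 };
  } // namespace llvm::AMDGPU::OpName

  int main() {
    // The reverted code initialized a plain integer with the removed
    // sentinel, which the compiler rejects:
    //   uint16_t OpName = llvm::AMDGPU::OpName::OPERAND_LAST;
    //   error: no member named 'OPERAND_LAST' in 'llvm::AMDGPU::OpName'
    uint16_t Existing = llvm::AMDGPU::OpName::vdst; // members that exist are fine
    return Existing;
  }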
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h
llvm/lib/Target/AMDGPU/FLATInstructions.td
llvm/lib/Target/AMDGPU/SIInstrInfo.td
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
llvm/test/CodeGen/AMDGPU/flat-address-space.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index 5408e72528c78..38272b9d4840d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -114,63 +114,9 @@ bool AMDGPUMCInstLower::lowerOperand(const MachineOperand &MO,
llvm_unreachable("unknown operand type");
}
-// Lower true16 D16 Pseudo instruction to d16_lo/d16_hi MCInst based on
-// Dst/Data's .l/.h selection
-void AMDGPUMCInstLower::lowerT16D16Helper(const MachineInstr *MI,
- MCInst &OutMI) const {
- unsigned Opcode = MI->getOpcode();
- const auto *TII = static_cast<const SIInstrInfo*>(ST.getInstrInfo());
- const SIRegisterInfo &TRI = TII->getRegisterInfo();
- const auto *Info = AMDGPU::getT16D16Helper(Opcode);
-
- uint16_t OpName = AMDGPU::OpName::OPERAND_LAST;
- if (TII->isDS(Opcode)) {
- if (MI->mayLoad())
- OpName = llvm::AMDGPU::OpName::vdst;
- else if (MI->mayStore())
- OpName = llvm::AMDGPU::OpName::data0;
- else
- llvm_unreachable("LDS load or store expected");
- } else {
- OpName = AMDGPU::hasNamedOperand(Opcode, llvm::AMDGPU::OpName::vdata)
- ? llvm::AMDGPU::OpName::vdata
- : llvm::AMDGPU::OpName::vdst;
- }
-
- // select Dst/Data
- int VDstOrVDataIdx = AMDGPU::getNamedOperandIdx(Opcode, OpName);
- const MachineOperand &MIVDstOrVData = MI->getOperand(VDstOrVDataIdx);
-
- // select hi/lo MCInst
- bool IsHi = AMDGPU::isHi16Reg(MIVDstOrVData.getReg(), TRI);
- Opcode = IsHi ? Info->HiOp : Info->LoOp;
-
- int MCOpcode = TII->pseudoToMCOpcode(Opcode);
- assert(MCOpcode != -1 &&
- "Pseudo instruction doesn't have a target-specific version");
- OutMI.setOpcode(MCOpcode);
-
- // lower operands
- for (int I = 0, E = MI->getNumExplicitOperands(); I < E; I++) {
- const MachineOperand &MO = MI->getOperand(I);
- MCOperand MCOp;
- if (I == VDstOrVDataIdx)
- MCOp = MCOperand::createReg(TRI.get32BitRegister(MIVDstOrVData.getReg()));
- else
- lowerOperand(MO, MCOp);
- OutMI.addOperand(MCOp);
- }
-
- if (AMDGPU::hasNamedOperand(MCOpcode, AMDGPU::OpName::vdst_in)) {
- MCOperand MCOp;
- lowerOperand(MIVDstOrVData, MCOp);
- OutMI.addOperand(MCOp);
- }
-}
-
void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
unsigned Opcode = MI->getOpcode();
- const auto *TII = static_cast<const SIInstrInfo *>(ST.getInstrInfo());
+ const auto *TII = static_cast<const SIInstrInfo*>(ST.getInstrInfo());
// FIXME: Should be able to handle this with lowerPseudoInstExpansion. We
// need to select it to the subtarget specific version, and there's no way to
@@ -191,9 +137,6 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
Opcode == AMDGPU::SI_TCRETURN_GFX) {
// TODO: How to use branch immediate and avoid register+add?
Opcode = AMDGPU::S_SETPC_B64;
- } else if (AMDGPU::getT16D16Helper(Opcode)) {
- lowerT16D16Helper(MI, OutMI);
- return;
}
int MCOpcode = TII->pseudoToMCOpcode(Opcode);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h
index 5ddf1ca2ab06d..7176cc5d3439b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h
@@ -39,8 +39,6 @@ class AMDGPUMCInstLower {
/// Lower a MachineInstr to an MCInst
void lower(const MachineInstr *MI, MCInst &OutMI) const;
-
- void lowerT16D16Helper(const MachineInstr *MI, MCInst &OutMI) const;
};
namespace {
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index ea6e703eba5d9..8fa708b74dde3 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -16,12 +16,6 @@ let WantsRoot = true in {
def ScratchSVAddr : ComplexPattern<iPTR, 3, "SelectScratchSVAddr", [], [], -10>;
}
-class True16D16Table <string hiOp, string loOp> {
- Instruction T16Op = !cast<Instruction>(NAME);
- Instruction HiOp = !cast<Instruction>(hiOp);
- Instruction LoOp = !cast<Instruction>(loOp);
-}
-
//===----------------------------------------------------------------------===//
// FLAT classes
//===----------------------------------------------------------------------===//
@@ -232,12 +226,6 @@ class FLAT_Load_Pseudo <string opName, RegisterClass regClass,
let DisableEncoding = !if(HasTiedOutput, "$vdst_in", "");
}
-multiclass FLAT_Load_Pseudo_t16<string opName> {
- def "" : FLAT_Load_Pseudo<opName, VGPR_32, 1>;
- let True16Predicate = UseRealTrue16Insts in
- def _t16 : FLAT_Load_Pseudo<opName#"_t16", VGPR_16>, True16D16Table<NAME#"_HI", NAME>;
-}
-
class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass,
bit HasSaddr = 0, bit EnableSaddr = 0> : FLAT_Pseudo<
opName,
@@ -674,12 +662,12 @@ def FLAT_STORE_DWORDX3 : FLAT_Store_Pseudo <"flat_store_dwordx3", VReg_96>;
let SubtargetPredicate = HasD16LoadStore in {
let TiedSourceNotRead = 1 in {
+def FLAT_LOAD_UBYTE_D16 : FLAT_Load_Pseudo <"flat_load_ubyte_d16", VGPR_32, 1>;
def FLAT_LOAD_UBYTE_D16_HI : FLAT_Load_Pseudo <"flat_load_ubyte_d16_hi", VGPR_32, 1>;
-defm FLAT_LOAD_UBYTE_D16 : FLAT_Load_Pseudo_t16 <"flat_load_ubyte_d16">;
+def FLAT_LOAD_SBYTE_D16 : FLAT_Load_Pseudo <"flat_load_sbyte_d16", VGPR_32, 1>;
def FLAT_LOAD_SBYTE_D16_HI : FLAT_Load_Pseudo <"flat_load_sbyte_d16_hi", VGPR_32, 1>;
-defm FLAT_LOAD_SBYTE_D16 : FLAT_Load_Pseudo_t16 <"flat_load_sbyte_d16">;
+def FLAT_LOAD_SHORT_D16 : FLAT_Load_Pseudo <"flat_load_short_d16", VGPR_32, 1>;
def FLAT_LOAD_SHORT_D16_HI : FLAT_Load_Pseudo <"flat_load_short_d16_hi", VGPR_32, 1>;
-defm FLAT_LOAD_SHORT_D16 : FLAT_Load_Pseudo_t16 <"flat_load_short_d16">;
}
def FLAT_STORE_BYTE_D16_HI : FLAT_Store_Pseudo <"flat_store_byte_d16_hi", VGPR_32>;
@@ -1061,11 +1049,6 @@ class FlatLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> :
(inst $vaddr, $offset, 0, $in)
>;
-class FlatLoadPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (FlatOffset (i64 VReg_64:$vaddr), i32:$offset))),
- (inst $vaddr, $offset, (i32 0))
->;
-
class FlatSignedLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), vt:$in),
(inst $vaddr, $offset, 0, $in)
@@ -1388,29 +1371,16 @@ def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i16>;
+def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, i16>;
+def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i16>;
+def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_USHORT, extloadi16_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_USHORT, zextloadi16_flat, i32>;
+def : FlatLoadPat <FLAT_LOAD_USHORT, load_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_SSHORT, atomic_load_sext_16_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_DWORDX3, load_flat, v3i32>;
-foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
-let True16Predicate = p in {
- def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, i16>;
- def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i16>;
- def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i16>;
- def : FlatLoadPat <FLAT_LOAD_USHORT, load_flat, i16>;
- def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
- def : FlatStorePat <FLAT_STORE_SHORT, store_flat, i16>;
-}
-
-let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts in {
- def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, extloadi8_flat, i16>;
- def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, zextloadi8_flat, i16>;
- def : FlatLoadPat_D16_t16<FLAT_LOAD_SBYTE_D16_t16, sextloadi8_flat, i16>;
- def : FlatLoadPat_D16_t16<FLAT_LOAD_SHORT_D16_t16, load_flat, i16>;
-} // End let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts
-
def : FlatLoadPat <FLAT_LOAD_DWORD, atomic_load_32_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_DWORDX2, atomic_load_64_flat, i64>;
@@ -2791,11 +2761,3 @@ defm SCRATCH_STORE_SHORT_D16_HI : VSCRATCH_Real_AllAddr_gfx12<0x25, "scratch_
defm SCRATCH_LOAD_BLOCK : VSCRATCH_Real_AllAddr_gfx12<0x53>;
defm SCRATCH_STORE_BLOCK : VSCRATCH_Real_AllAddr_gfx12<0x54>;
-
-def True16D16Table : GenericTable {
- let FilterClass = "True16D16Table";
- let CppTypeName = "True16D16Info";
- let Fields = ["T16Op", "HiOp", "LoOp"];
- let PrimaryKey = ["T16Op"];
- let PrimaryKeyName = "getT16D16Helper";
-}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index e30e257da6873..4fd68b52b53bb 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2483,15 +2483,8 @@ class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
// Return an AGPR+VGPR operand class for the given VGPR register class.
class getLdStRegisterOperand<RegisterClass RC> {
- // This type of operands is only used in pseudo instructions helping
- // code generation and thus doesn't need encoding and decoding methods.
- // It also doesn't need to support AGPRs, because GFX908/A/40 do not
- // support True16.
- defvar VLdSt_16 = RegisterOperand<VGPR_16>;
-
RegisterOperand ret =
- !cond(!eq(RC.Size, 16) : VLdSt_16,
- !eq(RC.Size, 32) : AVLdSt_32,
+ !cond(!eq(RC.Size, 32) : AVLdSt_32,
!eq(RC.Size, 64) : AVLdSt_64,
!eq(RC.Size, 96) : AVLdSt_96,
!eq(RC.Size, 128) : AVLdSt_128,
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index c521d0dd3ad2d..59afcbed35294 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -430,7 +430,6 @@ struct FP4FP8DstByteSelInfo {
#define GET_VOPDPairs_IMPL
#define GET_VOPTrue16Table_DECL
#define GET_VOPTrue16Table_IMPL
-#define GET_True16D16Table_IMPL
#define GET_WMMAOpcode2AddrMappingTable_DECL
#define GET_WMMAOpcode2AddrMappingTable_IMPL
#define GET_WMMAOpcode3AddrMappingTable_DECL
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 103993e6435de..e458b6b9604b6 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -113,12 +113,6 @@ struct CvtScaleF32_F32F16ToF8F4_Info {
unsigned Opcode;
};
-struct True16D16Info {
- unsigned T16Op;
- unsigned HiOp;
- unsigned LoOp;
-};
-
#define GET_MIMGBaseOpcode_DECL
#define GET_MIMGDim_DECL
#define GET_MIMGEncoding_DECL
@@ -129,7 +123,6 @@ struct True16D16Info {
#define GET_MAIInstInfoTable_DECL
#define GET_isMFMA_F8F6F4Table_DECL
#define GET_isCvtScaleF32_F32F16ToF8F4Table_DECL
-#define GET_True16D16Table_DECL
#include "AMDGPUGenSearchableTables.inc"
namespace IsaInfo {
diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
index db9a89a2a7370..21a2ae80574e0 100644
--- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -3,10 +3,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,FLATSCR %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX10,GFX10_DEFAULT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GFX10,FLATSCR_GFX10 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode,+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode,-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch,+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch,-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GFX11 %s
define <2 x half> @chain_hi_to_lo_private() {
; GFX900-LABEL: chain_hi_to_lo_private:
@@ -158,23 +156,14 @@ define <2 x half> @chain_hi_to_lo_arithmatic(ptr addrspace(5) %base, half %in) {
; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v0, v1
; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-TRUE16-LABEL: chain_hi_to_lo_arithmatic:
-; GFX11-TRUE16: ; %bb.0: ; %bb
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, 1.0, v1.l
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v1, v0, off
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v1
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: chain_hi_to_lo_arithmatic:
-; GFX11-FAKE16: ; %bb.0: ; %bb
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_add_f16_e32 v1, 1.0, v1
-; GFX11-FAKE16-NEXT: scratch_load_d16_hi_b16 v1, v0, off
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v1
-; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: chain_hi_to_lo_arithmatic:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_f16_e32 v1, 1.0, v1
+; GFX11-NEXT: scratch_load_d16_hi_b16 v1, v0, off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
bb:
%arith_lo = fadd half %in, 1.0
%load_hi = load half, ptr addrspace(5) %base
@@ -372,31 +361,18 @@ define <2 x half> @chain_hi_to_lo_flat() {
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-TRUE16-LABEL: chain_hi_to_lo_flat:
-; GFX11-TRUE16: ; %bb.0: ; %bb
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 2
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1]
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: flat_load_d16_hi_b16 v0, v[1:2]
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: chain_hi_to_lo_flat:
-; GFX11-FAKE16: ; %bb.0: ; %bb
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 2
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-FAKE16-NEXT: flat_load_u16 v0, v[0:1]
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: flat_load_d16_hi_b16 v0, v[1:2]
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: chain_hi_to_lo_flat:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, 2
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: flat_load_u16 v0, v[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_load_d16_hi_b16 v0, v[1:2]
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
bb:
%gep_lo = getelementptr inbounds half, ptr null, i64 1
%load_lo = load half, ptr %gep_lo
@@ -427,23 +403,14 @@ define <2 x half> @chain_hi_to_lo_flat_different_bases(ptr %base_lo, ptr %base_h
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-TRUE16-LABEL: chain_hi_to_lo_flat_different_bases:
-; GFX11-TRUE16: ; %bb.0: ; %bb
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: flat_load_d16_b16 v0, v[0:1]
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: flat_load_d16_hi_b16 v0, v[2:3]
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: chain_hi_to_lo_flat_different_bases:
-; GFX11-FAKE16: ; %bb.0: ; %bb
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: flat_load_u16 v0, v[0:1]
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: flat_load_d16_hi_b16 v0, v[2:3]
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: chain_hi_to_lo_flat_different_bases:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_load_u16 v0, v[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_load_d16_hi_b16 v0, v[2:3]
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
bb:
%load_lo = load half, ptr %base_lo
%load_hi = load half, ptr %base_hi
@@ -897,31 +864,17 @@ define <2 x i16> @chain_hi_to_lo_flat_other_dep(ptr addrspace(0) %ptr) {
; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-TRUE16-LABEL: chain_hi_to_lo_flat_other_dep:
-; GFX11-TRUE16: ; %bb.0: ; %bb
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: flat_load_d16_b16 v2, v[0:1] offset:2 glc dlc
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: flat_load_d16_hi_b16 v0, v[0:1] glc dlc
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
-; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: chain_hi_to_lo_flat_other_dep:
-; GFX11-FAKE16: ; %bb.0: ; %bb
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: flat_load_u16 v2, v[0:1] offset:2 glc dlc
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: flat_load_d16_hi_b16 v0, v[0:1] glc dlc
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
-; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: chain_hi_to_lo_flat_other_dep:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_load_u16 v2, v[0:1] offset:2 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: flat_load_d16_hi_b16 v0, v[0:1] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
bb:
%gep_lo = getelementptr inbounds i16, ptr addrspace(0) %ptr, i64 1
%load_lo = load volatile i16, ptr addrspace(0) %gep_lo
diff --git a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll
index 91f9aa1c5fe3b..4c68b8d35260f 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll
@@ -2,9 +2,8 @@
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,CIVI %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,CIVI,CIVI-HSA %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10PLUS %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX10PLUS %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX11 %s
; GCN-LABEL: {{^}}store_flat_i32:
; GCN-DAG: s_load_{{dwordx2|b64}} s[[[LO_SREG:[0-9]+]]:[[HI_SREG:[0-9]+]]],
@@ -225,8 +224,7 @@ define amdgpu_kernel void @store_flat_i8_neg_offset(ptr %fptr, i8 %x) #0 {
; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:4095 glc{{$}}
; GFX10: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc dlc{{$}}
-; GFX11-TRUE16: flat_load_d16_u8 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:4095 glc dlc{{$}}
-; GFX11-FAKE16: flat_load_u8 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:4095 glc dlc{{$}}
+; GFX11: flat_load_u8 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:4095 glc dlc{{$}}
define amdgpu_kernel void @load_flat_i8_max_offset(ptr %fptr) #0 {
%fptr.offset = getelementptr inbounds i8, ptr %fptr, i64 4095
%val = load volatile i8, ptr %fptr.offset
@@ -236,9 +234,7 @@ define amdgpu_kernel void @load_flat_i8_max_offset(ptr %fptr) #0 {
; GCN-LABEL: {{^}}load_flat_i8_max_offset_p1:
; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
-; GFX10: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc dlc{{$}}
-; GFX11-TRUE16: flat_load_d16_u8 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc dlc{{$}}
-; GFX11-FAKE16: flat_load_u8 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc dlc{{$}}
+; GFX10PLUS: flat_load_{{ubyte|u8}} v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc dlc{{$}}
define amdgpu_kernel void @load_flat_i8_max_offset_p1(ptr %fptr) #0 {
%fptr.offset = getelementptr inbounds i8, ptr %fptr, i64 4096
%val = load volatile i8, ptr %fptr.offset