[llvm] ae059a1 - [AMDGPU][True16][CodeGen] support v_mov_b16 and v_swap_b16 in true16 format (#102198)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 8 13:53:02 PDT 2024
Author: Brox Chen
Date: 2024-08-08T16:52:59-04:00
New Revision: ae059a1f9f1e501b08a99cb636ec0869ec204c6f
URL: https://github.com/llvm/llvm-project/commit/ae059a1f9f1e501b08a99cb636ec0869ec204c6f
DIFF: https://github.com/llvm/llvm-project/commit/ae059a1f9f1e501b08a99cb636ec0869ec204c6f.diff
LOG: [AMDGPU][True16][CodeGen] support v_mov_b16 and v_swap_b16 in true16 format (#102198)
Support v_swap_b16 in true16 format.
Update the TableGen pattern and folding for v_mov_b16.
---------
Co-authored-by: guochen2 <guochen2 at amd.com>
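
For reference, a minimal sketch of the rewrite that SIShrinkInstructions::matchSwap now performs at 16-bit width after register allocation (the registers below are illustrative, not taken from this patch):

    ; before shrinking: a hypothetical post-RA three-mov swap idiom
    v_mov_b16_e32 v0.h, v0.l    ; t = x
    v_mov_b16_e32 v0.l, v1.l    ; x = y
    v_mov_b16_e32 v1.l, v0.h    ; y = t
    ; after shrinking (the first mov is also dropped when v0.h is dead):
    v_swap_b16 v0.l, v1.l
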
Added:
llvm/test/CodeGen/AMDGPU/v_swap_b16.ll
llvm/test/MC/AMDGPU/gfx11_asm_t16_err.s
Modified:
llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
llvm/lib/Target/AMDGPU/VOP1Instructions.td
llvm/test/CodeGen/AMDGPU/bf16.ll
llvm/test/CodeGen/AMDGPU/fadd.f16.ll
llvm/test/MC/AMDGPU/gfx11_asm_err.s
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 32ecf350db59cf..875738dad74ced 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1460,7 +1460,15 @@ bool SIFoldOperands::tryFoldFoldableCopy(
return false;
}
- MachineOperand &OpToFold = MI.getOperand(1);
+ MachineOperand *OpToFoldPtr;
+ if (MI.getOpcode() == AMDGPU::V_MOV_B16_t16_e64) {
+ // Folding when any src_modifiers are non-zero is unsupported
+ if (TII->hasAnyModifiersSet(MI))
+ return false;
+ OpToFoldPtr = &MI.getOperand(2);
+ } else
+ OpToFoldPtr = &MI.getOperand(1);
+ MachineOperand &OpToFold = *OpToFoldPtr;
bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
// FIXME: We could also be folding things like TargetIndexes.
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index b6dd4905fb61bb..8af5c364509f0e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3369,6 +3369,8 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) {
switch (MI.getOpcode()) {
+ case AMDGPU::V_MOV_B16_t16_e32:
+ case AMDGPU::V_MOV_B16_t16_e64:
case AMDGPU::V_MOV_B32_e32:
case AMDGPU::V_MOV_B32_e64:
case AMDGPU::V_MOV_B64_PSEUDO:
@@ -5639,7 +5641,9 @@ void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
unsigned RCID = get(MI.getOpcode()).operands()[OpIdx].RegClass;
const TargetRegisterClass *RC = RI.getRegClass(RCID);
unsigned Size = RI.getRegSizeInBits(*RC);
- unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO : AMDGPU::V_MOV_B32_e32;
+ unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
+ : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64
+ : AMDGPU::V_MOV_B32_e32;
if (MO.isReg())
Opcode = AMDGPU::COPY;
else if (RI.isSGPRClass(RC))
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 79bcf5e8cd30d4..155747551471e3 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -657,6 +657,7 @@ void SIShrinkInstructions::dropInstructionKeepingImpDefs(
// although requirements match the pass placement and it reduces code size too.
MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
+ MovT.getOpcode() == AMDGPU::V_MOV_B16_t16_e32 ||
MovT.getOpcode() == AMDGPU::COPY);
Register T = MovT.getOperand(0).getReg();
@@ -668,7 +669,12 @@ MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
Register X = Xop.getReg();
unsigned Xsub = Xop.getSubReg();
- unsigned Size = TII->getOpSize(MovT, 0) / 4;
+ unsigned Size = TII->getOpSize(MovT, 0);
+
+ // We can't match v_swap_b16 pre-RA, because VGPR_16_Lo128 registers
+ // are not allocatable.
+ if (Size == 2 && X.isVirtual())
+ return nullptr;
if (!TRI->isVGPR(*MRI, X))
return nullptr;
@@ -684,9 +690,9 @@ MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
KilledT = MovY->killsRegister(T, TRI);
if ((MovY->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
+ MovY->getOpcode() != AMDGPU::V_MOV_B16_t16_e32 &&
MovY->getOpcode() != AMDGPU::COPY) ||
- !MovY->getOperand(1).isReg() ||
- MovY->getOperand(1).getReg() != T ||
+ !MovY->getOperand(1).isReg() || MovY->getOperand(1).getReg() != T ||
MovY->getOperand(1).getSubReg() != Tsub)
continue;
@@ -714,6 +720,7 @@ MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
}
if (MovX ||
(I->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
+ I->getOpcode() != AMDGPU::V_MOV_B16_t16_e32 &&
I->getOpcode() != AMDGPU::COPY) ||
I->getOperand(0).getReg() != X ||
I->getOperand(0).getSubReg() != Xsub) {
@@ -721,7 +728,7 @@ MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
break;
}
- if (Size > 1 && (I->getNumImplicitOperands() > (I->isCopy() ? 0U : 1U)))
+ if (Size > 4 && (I->getNumImplicitOperands() > (I->isCopy() ? 0U : 1U)))
continue;
MovX = &*I;
@@ -730,23 +737,40 @@ MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
if (!MovX)
continue;
- LLVM_DEBUG(dbgs() << "Matched v_swap_b32:\n" << MovT << *MovX << *MovY);
+ LLVM_DEBUG(dbgs() << "Matched v_swap:\n" << MovT << *MovX << *MovY);
- for (unsigned I = 0; I < Size; ++I) {
- TargetInstrInfo::RegSubRegPair X1, Y1;
- X1 = getSubRegForIndex(X, Xsub, I);
- Y1 = getSubRegForIndex(Y, Ysub, I);
- MachineBasicBlock &MBB = *MovT.getParent();
+ MachineBasicBlock &MBB = *MovT.getParent();
+ SmallVector<MachineInstr *, 4> Swaps;
+ if (Size == 2) {
auto MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(),
- TII->get(AMDGPU::V_SWAP_B32))
- .addDef(X1.Reg, 0, X1.SubReg)
- .addDef(Y1.Reg, 0, Y1.SubReg)
- .addReg(Y1.Reg, 0, Y1.SubReg)
- .addReg(X1.Reg, 0, X1.SubReg).getInstr();
- if (MovX->hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
- // Drop implicit EXEC.
- MIB->removeOperand(MIB->getNumExplicitOperands());
- MIB->copyImplicitOps(*MBB.getParent(), *MovX);
+ TII->get(AMDGPU::V_SWAP_B16))
+ .addDef(X)
+ .addDef(Y)
+ .addReg(Y)
+ .addReg(X)
+ .getInstr();
+ Swaps.push_back(MIB);
+ } else {
+ assert(Size > 0 && Size % 4 == 0);
+ for (unsigned I = 0; I < Size / 4; ++I) {
+ TargetInstrInfo::RegSubRegPair X1, Y1;
+ X1 = getSubRegForIndex(X, Xsub, I);
+ Y1 = getSubRegForIndex(Y, Ysub, I);
+ auto MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(),
+ TII->get(AMDGPU::V_SWAP_B32))
+ .addDef(X1.Reg, 0, X1.SubReg)
+ .addDef(Y1.Reg, 0, Y1.SubReg)
+ .addReg(Y1.Reg, 0, Y1.SubReg)
+ .addReg(X1.Reg, 0, X1.SubReg)
+ .getInstr();
+ Swaps.push_back(MIB);
+ }
+ }
+ // Drop implicit EXEC.
+ if (MovX->hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
+ for (MachineInstr *Swap : Swaps) {
+ Swap->removeOperand(Swap->getNumExplicitOperands());
+ Swap->copyImplicitOps(*MBB.getParent(), *MovX);
}
}
MovX->eraseFromParent();
@@ -833,6 +857,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
}
if (ST->hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
+ MI.getOpcode() == AMDGPU::V_MOV_B16_t16_e32 ||
MI.getOpcode() == AMDGPU::COPY)) {
if (auto *NextMI = matchSwap(MI)) {
Next = NextMI->getIterator();
@@ -1023,7 +1048,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
MachineFunctionProperties::Property::NoVRegs))
continue;
- if (ST->hasTrue16BitInsts() && AMDGPU::isTrue16Inst(MI.getOpcode()) &&
+ if (ST->useRealTrue16Insts() && AMDGPU::isTrue16Inst(MI.getOpcode()) &&
!shouldShrinkTrue16(MI))
continue;
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 0a2e338b347871..34d12aa5e07835 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -751,7 +751,7 @@ let SubtargetPredicate = isGFX11Plus in {
let IsInvalidSingleUseConsumer = 1;
let IsInvalidSingleUseProducer = 1;
}
- defm V_MOV_B16_t16 : VOP1Inst<"v_mov_b16_t16", VOPProfile_True16<VOP_I16_I16>>;
+ defm V_MOV_B16 : VOP1Inst_t16<"v_mov_b16", VOP_I16_I16>;
defm V_NOT_B16 : VOP1Inst_t16<"v_not_b16", VOP_I16_I16>;
defm V_CVT_I32_I16 : VOP1Inst_t16<"v_cvt_i32_i16", VOP_I32_I16>;
defm V_CVT_U32_U16 : VOP1Inst_t16<"v_cvt_u32_u16", VOP_I32_I16>;
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index d732da1a67bc1f..970bb08e1838b2 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -2131,26 +2131,14 @@ define void @test_store_fpimm(ptr addrspace(1) %ptr0, ptr addrspace(1) %ptr1) {
; GFX10-NEXT: global_store_short v[2:3], v5, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11TRUE16-LABEL: test_store_fpimm:
-; GFX11TRUE16: ; %bb.0:
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, 0x3f80
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.h, 0x4228
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
-; GFX11TRUE16-NEXT: global_store_b16 v[0:1], v5, off
-; GFX11TRUE16-NEXT: global_store_b16 v[2:3], v4, off
-; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11FAKE16-LABEL: test_store_fpimm:
-; GFX11FAKE16: ; %bb.0:
-; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11FAKE16-NEXT: v_mov_b32_e32 v4, 0x3f80
-; GFX11FAKE16-NEXT: v_mov_b32_e32 v5, 0x4228
-; GFX11FAKE16-NEXT: global_store_b16 v[0:1], v4, off
-; GFX11FAKE16-NEXT: global_store_b16 v[2:3], v5, off
-; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: test_store_fpimm:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v4, 0x3f80
+; GFX11-NEXT: v_mov_b32_e32 v5, 0x4228
+; GFX11-NEXT: global_store_b16 v[0:1], v4, off
+; GFX11-NEXT: global_store_b16 v[2:3], v5, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
store bfloat 1.0, ptr addrspace(1) %ptr0
store bfloat 42.0, ptr addrspace(1) %ptr1
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
index 7352fcdd071d5b..9fe7544003568c 100644
--- a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
@@ -246,9 +246,7 @@ define amdgpu_kernel void @fadd_f16_imm_a(
; GFX11-SDAG-NEXT: s_mov_b32 s3, s7
; GFX11-SDAG-NEXT: buffer_load_u16 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: v_mov_b16_e32 v0.h, 0x3c00
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l
; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -264,9 +262,7 @@ define amdgpu_kernel void @fadd_f16_imm_a(
; GFX11-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX11-GISEL-NEXT: buffer_load_u16 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b16_e32 v0.h, 0x3c00
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
+; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l
; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -390,9 +386,7 @@ define amdgpu_kernel void @fadd_f16_imm_b(
; GFX11-SDAG-NEXT: s_mov_b32 s3, s7
; GFX11-SDAG-NEXT: buffer_load_u16 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: v_mov_b16_e32 v0.h, 0x4000
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, 2.0, v0.l
; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -408,9 +402,7 @@ define amdgpu_kernel void @fadd_f16_imm_b(
; GFX11-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX11-GISEL-NEXT: buffer_load_u16 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b16_e32 v0.h, 0x4000
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
+; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, 2.0, v0.l
; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll b/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll
new file mode 100644
index 00000000000000..1f36f7a0d9616e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll
@@ -0,0 +1,110 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-TRUE16 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-FAKE16 %s
+
+define half @swap(half %a, half %b, i32 %i) {
+; GFX11-TRUE16-LABEL: swap:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: .LBB0_1: ; %loop
+; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, -1, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_swap_b16 v0.l, v0.h
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB0_1
+; GFX11-TRUE16-NEXT: ; %bb.2: ; %ret
+; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: swap:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT: .LBB0_1: ; %loop
+; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_add_nc_u32 v2, -1, v2
+; GFX11-FAKE16-NEXT: v_swap_b32 v1, v0
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB0_1
+; GFX11-FAKE16-NEXT: ; %bb.2: ; %ret
+; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: swap:
+; GFX12-TRUE16: ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
+; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX12-TRUE16-NEXT: .LBB0_1: ; %loop
+; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v2, -1, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX12-TRUE16-NEXT: v_swap_b16 v0.l, v0.h
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB0_1
+; GFX12-TRUE16-NEXT: ; %bb.2: ; %ret
+; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: swap:
+; GFX12-FAKE16: ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX12-FAKE16-NEXT: .LBB0_1: ; %loop
+; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_add_nc_u32 v2, -1, v2
+; GFX12-FAKE16-NEXT: v_swap_b32 v1, v0
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB0_1
+; GFX12-FAKE16-NEXT: ; %bb.2: ; %ret
+; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v1
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
+entry:
+ br label %loop
+
+loop:
+ %x = phi half [%a, %entry], [%y, %loop]
+ %y = phi half [%b, %entry], [%x, %loop]
+ %i2 = phi i32 [%i, %entry], [%i3, %loop]
+
+ %i3 = sub i32 %i2, 1
+
+ %cmp = icmp eq i32 %i3, 0
+ br i1 %cmp, label %ret, label %loop
+
+ret:
+ ret half %x
+}
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_err.s
index 7f99afe0192599..68442b01bf7d90 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_err.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_err.s
@@ -169,21 +169,3 @@ s_load_b96 s[20:22], s[2:3], s0
s_buffer_load_b96 s[20:22], s[4:7], s0
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
-
-v_mov_b16 v0.l, s0.h
-// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
-
-v_mov_b16 v0.l, ttmp0.h
-// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
-
-v_mov_b16 v0.l, a0.h
-// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
-
-v_mov_b16 v0.l, s0.h
-// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
-
-v_mov_b16 v0.l, ttmp0.h
-// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
-
-v_mov_b16 v0.l, a0.h
-// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_t16_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_t16_err.s
new file mode 100644
index 00000000000000..aa2309dd7d5d7c
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_t16_err.s
@@ -0,0 +1,10 @@
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 %s 2>&1 | FileCheck --check-prefix=GFX11 --implicit-check-not=error: %s
+
+v_mov_b16 v0.l, s0.h
+// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_mov_b16 v0.l, ttmp0.h
+// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_mov_b16 v0.l, a0.h
+// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction