[llvm] [AMDGPU][True16][CodeGen] Implement sgpr folding in true16 (PR #128929)
Brox Chen via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 24 08:43:29 PDT 2025
https://github.com/broxigarchen updated https://github.com/llvm/llvm-project/pull/128929
>From 25ba83947e7cb47a94111de0e92d5341e88950f6 Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Wed, 26 Feb 2025 13:39:42 -0500
Subject: [PATCH 1/2] 16bit sgpr folding
---
llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 89 +++-
llvm/lib/Target/AMDGPU/VOP1Instructions.td | 1 +
llvm/test/CodeGen/AMDGPU/bf16.ll | 67 ++-
llvm/test/CodeGen/AMDGPU/bswap.ll | 145 ++++--
.../CodeGen/AMDGPU/extract_vector_elt-f16.ll | 84 ++--
llvm/test/CodeGen/AMDGPU/fabs.f16.ll | 6 +-
llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll | 4 +-
llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 268 ++++++------
llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll | 12 +-
llvm/test/CodeGen/AMDGPU/fp-classify.ll | 117 ++---
llvm/test/CodeGen/AMDGPU/fpext.f16.ll | 4 +-
llvm/test/CodeGen/AMDGPU/fptosi.f16.ll | 7 +-
llvm/test/CodeGen/AMDGPU/fptoui.f16.ll | 7 +-
llvm/test/CodeGen/AMDGPU/icmp.i16.ll | 22 +-
llvm/test/CodeGen/AMDGPU/imm16.ll | 56 +--
.../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll | 413 ++++++------------
.../CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll | 13 +-
.../llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll | 31 +-
.../llvm.amdgcn.raw.tbuffer.store.d16.ll | 70 +--
...lvm.amdgcn.struct.ptr.tbuffer.store.d16.ll | 34 +-
.../llvm.amdgcn.struct.tbuffer.store.d16.ll | 77 +---
.../CodeGen/AMDGPU/llvm.is.fpclass.f16.ll | 48 +-
llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll | 20 +-
llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll | 20 +-
llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll | 12 +-
llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll | 7 +-
26 files changed, 702 insertions(+), 932 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index cc15dd7cb495c..5319a88ecaee7 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -12,8 +12,11 @@
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineOperand.h"
@@ -576,6 +579,11 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
}
MachineOperand *New = Fold.OpToFold;
+ // TODO: Temporarily allow folding from SGPRs to 16-bit VGPRs.
+ // Rework once the VS_16 register class is updated to include proper
+ // 16-bit SGPRs instead of 32-bit ones.
+ if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg()))
+ Old.setSubReg(AMDGPU::NoSubRegister);
Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
Old.setIsUndef(New->isUndef());
return true;
@@ -947,9 +955,15 @@ void SIFoldOperandsImpl::foldOperand(
return;
// FIXME: Fold operands with subregs.
- if (UseOp->isReg() && OpToFold.isReg() &&
- (UseOp->isImplicit() || UseOp->getSubReg() != AMDGPU::NoSubRegister))
- return;
+ if (UseOp->isReg() && OpToFold.isReg()) {
+ if (UseOp->isImplicit())
+ return;
+ // Allow folding from SGPRs to 16-bit VGPRs.
+ if (UseOp->getSubReg() != AMDGPU::NoSubRegister &&
+ (UseOp->getSubReg() != AMDGPU::lo16 ||
+ !TRI->isSGPRReg(*MRI, OpToFold.getReg())))
+ return;
+ }
// Special case for REG_SEQUENCE: We can't fold literals into
// REG_SEQUENCE instructions, so we have to fold them into the
@@ -1040,6 +1054,14 @@ void SIFoldOperandsImpl::foldOperand(
}
}
+ // Widen sgpr_lo16 to sgpr_32 so that an immediate COPYd into it can be
+ // folded further, and is still legal if it is not folded further.
+ if (DestRC == &AMDGPU::SGPR_LO16RegClass) {
+ assert(ST->useRealTrue16Insts());
+ MRI->setRegClass(DestReg, &AMDGPU::SGPR_32RegClass);
+ DestRC = &AMDGPU::SGPR_32RegClass;
+ }
+
// In order to fold immediates into copies, we need to change the
// copy to a MOV.
@@ -1073,9 +1095,43 @@ void SIFoldOperandsImpl::foldOperand(
UseMI->getOperand(0).getReg().isVirtual() &&
!UseMI->getOperand(1).getSubReg()) {
LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI);
+ unsigned Size = TII->getOpSize(*UseMI, 1);
Register UseReg = OpToFold.getReg();
UseMI->getOperand(1).setReg(UseReg);
- UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
+ unsigned SubRegIdx = OpToFold.getSubReg();
+ // Hack to allow 32-bit SGPRs to be folded into True16 instructions.
+ // Remove this once 16-bit SGPRs (i.e. SGPR_LO16) are added to the
+ // VS_16RegClass.
+ //
+ // Excerpt from AMDGPUGenRegisterInfo.inc
+ // NoSubRegister, //0
+ // hi16, // 1
+ // lo16, // 2
+ // sub0, // 3
+ // ...
+ // sub1, // 11
+ // sub1_hi16, // 12
+ // sub1_lo16, // 13
+ static_assert(AMDGPU::sub1_hi16 == 12, "Subregister layout has changed");
+ if (Size == 2 && TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
+ TRI->isSGPRReg(*MRI, UseReg)) {
+ // Produce the 32-bit subregister index to which the 16-bit subregister
+ // is aligned.
+ if (SubRegIdx > AMDGPU::sub1) {
+ LaneBitmask M = TRI->getSubRegIndexLaneMask(SubRegIdx);
+ M |= M.getLane(M.getHighestLane() - 1);
+ SmallVector<unsigned, 4> Indexes;
+ TRI->getCoveringSubRegIndexes(TRI->getRegClassForReg(*MRI, UseReg), M,
+ Indexes);
+ assert(Indexes.size() == 1 && "Expected one 32-bit subreg to cover");
+ SubRegIdx = Indexes[0];
+ // 32-bit registers do not have a sub0 index
+ } else if (TII->getOpSize(*UseMI, 1) == 4)
+ SubRegIdx = 0;
+ else
+ SubRegIdx = AMDGPU::sub0;
+ }
+ UseMI->getOperand(1).setSubReg(SubRegIdx);
UseMI->getOperand(1).setIsKill(false);
CopiesToReplace.push_back(UseMI);
OpToFold.setIsKill(false);
@@ -1713,6 +1769,31 @@ bool SIFoldOperandsImpl::tryFoldFoldableCopy(
if (OpToFold.isReg() && !OpToFold.getReg().isVirtual())
return false;
+ // True16: Fix malformed 16-bit SGPR COPYs produced by peephole-opt.
+ // This code can be removed once proper 16-bit SGPRs are implemented.
+ // Example: Pre-peephole-opt
+ // %29:sgpr_lo16 = COPY %16.lo16:sreg_32
+ // %32:sreg_32 = COPY %29:sgpr_lo16
+ // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
+ // Post-peephole-opt and DCE
+ // %32:sreg_32 = COPY %16.lo16:sreg_32
+ // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
+ // After this transform
+ // %32:sreg_32 = COPY %16:sreg_32
+ // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
+ // After the fold operands pass
+ // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %16:sreg_32
+ if (MI.getOpcode() == AMDGPU::COPY && OpToFold.isReg() &&
+ OpToFold.getSubReg()) {
+ const TargetRegisterClass *DstRC =
+ MRI->getRegClass(MI.getOperand(0).getReg());
+ if (DstRC == &AMDGPU::SReg_32RegClass &&
+ DstRC == MRI->getRegClass(OpToFold.getReg())) {
+ assert(OpToFold.getSubReg() == AMDGPU::lo16);
+ OpToFold.setSubReg(0);
+ }
+ }
+
// Prevent folding operands backwards in the function. For example,
// the COPY opcode must not be replaced by 1 in this example:
//
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index def06c1e9a0d7..db5b1e5c9a035 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -776,6 +776,7 @@ let SubtargetPredicate = isGFX11Plus in {
// Restrict src0 to be VGPR
def V_PERMLANE64_B32 : VOP1_Pseudo<"v_permlane64_b32", VOP_MOVRELS,
[], /*VOP1Only=*/ 1>;
+ let isAsCheapAsAMove = 1 in
defm V_MOV_B16 : VOP1Inst_t16<"v_mov_b16", VOP_I16_I16>;
defm V_NOT_B16 : VOP1Inst_t16<"v_not_b16", VOP_I16_I16>;
defm V_CVT_I32_I16 : VOP1Inst_t16<"v_cvt_i32_i16", VOP_I32_I16>;
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 2ef88010bd157..0619415fdd370 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -38259,16 +38259,14 @@ define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
; GFX11TRUE16-LABEL: s_select_v2bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_lshr_b32 s2, s0, 16
-; GFX11TRUE16-NEXT: s_lshr_b32 s3, s1, 16
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s3
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s2
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s1
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, vcc_lo
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v1.h, vcc_lo
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s2
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s0
+; GFX11TRUE16-NEXT: s_lshr_b32 s0, s1, 16
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, s0, v1.l, vcc_lo
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, s1, v0.l, vcc_lo
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11TRUE16-NEXT: ; return to shader part epilog
;
@@ -38376,19 +38374,17 @@ define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
;
; GFX11TRUE16-LABEL: s_vselect_v2bf16:
; GFX11TRUE16: ; %bb.0:
-; GFX11TRUE16-NEXT: s_lshr_b32 s3, s1, 16
-; GFX11TRUE16-NEXT: s_lshr_b32 s4, s0, 16
+; GFX11TRUE16-NEXT: s_lshr_b32 s3, s0, 16
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 0, v1
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s3
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s4
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s1
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s2
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v1.h, vcc_lo
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s0
+; GFX11TRUE16-NEXT: s_lshr_b32 s0, s1, 16
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, s0, v0.l, s2
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, s1, v0.h, vcc_lo
+; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v1
; GFX11TRUE16-NEXT: ; return to shader part epilog
;
; GFX11FAKE16-LABEL: s_vselect_v2bf16:
@@ -40095,30 +40091,25 @@ define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat>
;
; GFX11TRUE16-LABEL: s_vselect_v4bf16:
; GFX11TRUE16: ; %bb.0:
-; GFX11TRUE16-NEXT: s_lshr_b32 s7, s3, 16
+; GFX11TRUE16-NEXT: s_lshr_b32 s7, s1, 16
+; GFX11TRUE16-NEXT: s_lshr_b32 s9, s0, 16
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 0, v1
-; GFX11TRUE16-NEXT: s_lshr_b32 s8, s1, 16
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s7
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s3
-; GFX11TRUE16-NEXT: s_lshr_b32 s3, s2, 16
-; GFX11TRUE16-NEXT: s_lshr_b32 s7, s0, 16
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 0, v2
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 0, v3
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s8
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s3
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, s7
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, s2
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, s0
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.h, s1
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s6
-; GFX11TRUE16-NEXT: v_cndmask_b16 v4.h, v1.h, v2.l, s4
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v4.l, v2.h, v3.l, vcc_lo
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v3.h, s5
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s7
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s9
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s1
+; GFX11TRUE16-NEXT: s_lshr_b32 s8, s3, 16
+; GFX11TRUE16-NEXT: s_lshr_b32 s0, s2, 16
+; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, s8, v0.l, s6
+; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, s0, v0.h, s4
+; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, s2, v1.l, vcc_lo
+; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, s3, v1.h, s5
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v4
-; GFX11TRUE16-NEXT: v_readfirstlane_b32 s1, v0
+; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11TRUE16-NEXT: v_readfirstlane_b32 s1, v2
; GFX11TRUE16-NEXT: ; return to shader part epilog
;
; GFX11FAKE16-LABEL: s_vselect_v4bf16:
diff --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll
index a95a1aba0c914..e70cd2400172d 100644
--- a/llvm/test/CodeGen/AMDGPU/bswap.ll
+++ b/llvm/test/CodeGen/AMDGPU/bswap.ll
@@ -303,18 +303,32 @@ define amdgpu_kernel void @test_bswap_i64(ptr addrspace(1) %out, ptr addrspace(1
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
-; GFX11-LABEL: test_bswap_i64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_perm_b32 v1, 0, s4, 0x10203
-; GFX11-NEXT: v_perm_b32 v0, 0, s5, 0x10203
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX11-NEXT: s_endpgm
+; GFX11-REAL16-LABEL: test_bswap_i64:
+; GFX11-REAL16: ; %bb.0:
+; GFX11-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-REAL16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-REAL16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-REAL16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-REAL16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-REAL16-NEXT: v_perm_b32 v0, 0, s2, 0x10203
+; GFX11-REAL16-NEXT: s_mov_b32 s2, -1
+; GFX11-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-REAL16-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-REAL16-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-REAL16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: test_bswap_i64:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, 0, s4, 0x10203
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, 0, s5, 0x10203
+; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-FAKE16-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %in, align 8
%bswap = call i64 @llvm.bswap.i64(i64 %val) nounwind readnone
store i64 %bswap, ptr addrspace(1) %out, align 8
@@ -364,20 +378,36 @@ define amdgpu_kernel void @test_bswap_v2i64(ptr addrspace(1) %out, ptr addrspace
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
-; GFX11-LABEL: test_bswap_v2i64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_perm_b32 v3, 0, s6, 0x10203
-; GFX11-NEXT: v_perm_b32 v2, 0, s7, 0x10203
-; GFX11-NEXT: v_perm_b32 v1, 0, s4, 0x10203
-; GFX11-NEXT: v_perm_b32 v0, 0, s5, 0x10203
-; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
-; GFX11-NEXT: s_endpgm
+; GFX11-REAL16-LABEL: test_bswap_v2i64:
+; GFX11-REAL16: ; %bb.0:
+; GFX11-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-REAL16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-REAL16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX11-REAL16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-REAL16-NEXT: s_mov_b32 s2, -1
+; GFX11-REAL16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-REAL16-NEXT: v_perm_b32 v0, 0, s4, 0x10203
+; GFX11-REAL16-NEXT: v_perm_b32 v2, 0, s6, 0x10203
+; GFX11-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-REAL16-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-REAL16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-REAL16-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-REAL16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: test_bswap_v2i64:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, 0, s6, 0x10203
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, 0, s7, 0x10203
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, 0, s4, 0x10203
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, 0, s5, 0x10203
+; GFX11-FAKE16-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-FAKE16-NEXT: s_endpgm
%val = load <2 x i64>, ptr addrspace(1) %in, align 16
%bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %val) nounwind readnone
store <2 x i64> %bswap, ptr addrspace(1) %out, align 16
@@ -445,26 +475,49 @@ define amdgpu_kernel void @test_bswap_v4i64(ptr addrspace(1) %out, ptr addrspace
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
; VI-NEXT: s_endpgm
;
-; GFX11-LABEL: test_bswap_v4i64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[8:11], s[4:5], 0x24
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_perm_b32 v7, 0, s6, 0x10203
-; GFX11-NEXT: v_perm_b32 v6, 0, s7, 0x10203
-; GFX11-NEXT: v_perm_b32 v5, 0, s4, 0x10203
-; GFX11-NEXT: v_perm_b32 v4, 0, s5, 0x10203
-; GFX11-NEXT: v_perm_b32 v3, 0, s2, 0x10203
-; GFX11-NEXT: v_perm_b32 v2, 0, s3, 0x10203
-; GFX11-NEXT: v_perm_b32 v1, 0, s0, 0x10203
-; GFX11-NEXT: v_perm_b32 v0, 0, s1, 0x10203
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[8:11], 0 offset:16
-; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[8:11], 0
-; GFX11-NEXT: s_endpgm
+; GFX11-REAL16-LABEL: test_bswap_v4i64:
+; GFX11-REAL16: ; %bb.0:
+; GFX11-REAL16-NEXT: s_load_b128 s[8:11], s[4:5], 0x24
+; GFX11-REAL16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-REAL16-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
+; GFX11-REAL16-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-REAL16-NEXT: s_mov_b32 s10, -1
+; GFX11-REAL16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-REAL16-NEXT: v_perm_b32 v0, 0, s4, 0x10203
+; GFX11-REAL16-NEXT: v_perm_b32 v2, 0, s6, 0x10203
+; GFX11-REAL16-NEXT: v_perm_b32 v4, 0, s0, 0x10203
+; GFX11-REAL16-NEXT: v_perm_b32 v6, 0, s2, 0x10203
+; GFX11-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-REAL16-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-REAL16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-REAL16-NEXT: v_mov_b32_e32 v5, v4
+; GFX11-REAL16-NEXT: v_mov_b32_e32 v7, v6
+; GFX11-REAL16-NEXT: s_clause 0x1
+; GFX11-REAL16-NEXT: buffer_store_b128 v[0:3], off, s[8:11], 0 offset:16
+; GFX11-REAL16-NEXT: buffer_store_b128 v[4:7], off, s[8:11], 0
+; GFX11-REAL16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: test_bswap_v4i64:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[8:11], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, 0, s6, 0x10203
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, 0, s7, 0x10203
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, 0, s4, 0x10203
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, 0, s5, 0x10203
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, 0, s2, 0x10203
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, 0, s3, 0x10203
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, 0, s0, 0x10203
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, 0, s1, 0x10203
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: buffer_store_b128 v[4:7], off, s[8:11], 0 offset:16
+; GFX11-FAKE16-NEXT: buffer_store_b128 v[0:3], off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_endpgm
%val = load <4 x i64>, ptr addrspace(1) %in, align 32
%bswap = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %val) nounwind readnone
store <4 x i64> %bswap, ptr addrspace(1) %out, align 32
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
index 14ddf7daad1c6..03cb3b28480c4 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
@@ -216,34 +216,19 @@ define amdgpu_kernel void @extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:2
; VI-NEXT: s_endpgm
;
-; GFX11-TRUE16-LABEL: extract_vector_elt_v3f16:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
-; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s2
-; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
-; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
-; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-TRUE16-NEXT: buffer_store_b16 v1, off, s[4:7], 0 offset:2
-; GFX11-TRUE16-NEXT: s_endpgm
-;
-; GFX11-FAKE16-LABEL: extract_vector_elt_v3f16:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
-; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2
-; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
-; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
-; GFX11-FAKE16-NEXT: s_clause 0x1
-; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-FAKE16-NEXT: buffer_store_b16 v1, off, s[4:7], 0 offset:2
-; GFX11-FAKE16-NEXT: s_endpgm
+; GFX11-LABEL: extract_vector_elt_v3f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v1, off, s[4:7], 0 offset:2
+; GFX11-NEXT: s_endpgm
%p0 = extractelement <3 x half> %foo, i32 0
%p1 = extractelement <3 x half> %foo, i32 2
%out1 = getelementptr half, ptr addrspace(1) %out, i32 1
@@ -284,35 +269,20 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(ptr addrspace(1) %ou
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
-; GFX11-TRUE16-LABEL: dynamic_extract_vector_elt_v3f16:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34
-; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s6, 4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: s_lshr_b64 s[2:3], s[2:3], s4
-; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
-; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
-; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
-; GFX11-TRUE16-NEXT: s_endpgm
-;
-; GFX11-FAKE16-LABEL: dynamic_extract_vector_elt_v3f16:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_clause 0x1
-; GFX11-FAKE16-NEXT: s_load_b32 s6, s[4:5], 0x34
-; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s6, 4
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: s_lshr_b64 s[2:3], s[2:3], s4
-; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1
-; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
-; GFX11-FAKE16-NEXT: s_endpgm
+; GFX11-LABEL: dynamic_extract_vector_elt_v3f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b32 s4, s6, 4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s4
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-NEXT: s_endpgm
%p0 = extractelement <3 x half> %foo, i32 %idx
%out1 = getelementptr half, ptr addrspace(1) %out, i32 1
store half %p0, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
index 365588eaec3ac..c0fa7b86cbe12 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
@@ -297,10 +297,8 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s2, 16
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.l, |v0.l|, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.l, |s2|, s3
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
index 9063af4351297..9ef48588a51ae 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -175,9 +175,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i1
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT: v_max_f16_e64 v0.l, s2, s2
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index 7e4b1259db3aa..9e5fedccddc7f 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -60,34 +60,19 @@ define amdgpu_kernel void @s_copysign_f16(ptr addrspace(1) %arg_out, half %mag,
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-TRUE16-LABEL: s_copysign_f16:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
-; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s2, 16
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
-; GFX11-TRUE16-NEXT: s_endpgm
-;
-; GFX11-FAKE16-LABEL: s_copysign_f16:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_clause 0x1
-; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
-; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s2, 16
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s3
-; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0
-; GFX11-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1]
-; GFX11-FAKE16-NEXT: s_endpgm
+; GFX11-LABEL: s_copysign_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshr_b32 s3, s2, 16
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v0, s3
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0
+; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-NEXT: s_endpgm
%out = call half @llvm.copysign.f16(half %mag, half %sign)
store half %out, ptr addrspace(1) %arg_out
ret void
@@ -1928,6 +1913,7 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
+<<<<<<< HEAD
; GFX11-TRUE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_clause 0x1
@@ -2044,6 +2030,66 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, s4
; GFX11-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
+=======
+; GFX11-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_and_b32 s5, s3, 0x1ff
+; GFX11-NEXT: s_lshr_b32 s6, s3, 8
+; GFX11-NEXT: s_or_b32 s2, s5, s2
+; GFX11-NEXT: s_and_b32 s5, s6, 0xffe
+; GFX11-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX11-NEXT: s_bfe_u32 s2, s3, 0xb0014
+; GFX11-NEXT: s_sub_i32 s3, 0x3f1, s2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_med3_i32 v1, s3, 0, 13
+; GFX11-NEXT: v_readfirstlane_b32 s3, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_readfirstlane_b32 s6, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_or_b32 s3, s5, s3
+; GFX11-NEXT: s_or_b32 s5, s3, 0x1000
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_lshr_b32 s7, s5, s6
+; GFX11-NEXT: s_lshl_b32 s6, s7, s6
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_cmp_lg_u32 s6, s5
+; GFX11-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-NEXT: s_addk_i32 s2, 0xfc10
+; GFX11-NEXT: s_or_b32 s5, s7, s5
+; GFX11-NEXT: s_lshl_b32 s6, s2, 12
+; GFX11-NEXT: s_or_b32 s6, s3, s6
+; GFX11-NEXT: s_cmp_lt_i32 s2, 1
+; GFX11-NEXT: s_cselect_b32 s5, s5, s6
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s6, s5, 7
+; GFX11-NEXT: s_cmp_gt_i32 s6, 5
+; GFX11-NEXT: s_cselect_b32 s7, 1, 0
+; GFX11-NEXT: s_cmp_eq_u32 s6, 3
+; GFX11-NEXT: s_cselect_b32 s6, 1, 0
+; GFX11-NEXT: s_lshr_b32 s5, s5, 2
+; GFX11-NEXT: s_or_b32 s6, s6, s7
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_i32 s5, s5, s6
+; GFX11-NEXT: s_cmp_lt_i32 s2, 31
+; GFX11-NEXT: s_movk_i32 s6, 0x7e00
+; GFX11-NEXT: s_cselect_b32 s5, s5, 0x7c00
+; GFX11-NEXT: s_cmp_lg_u32 s3, 0
+; GFX11-NEXT: s_cselect_b32 s3, s6, 0x7c00
+; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x40f
+; GFX11-NEXT: s_cselect_b32 s2, s3, s5
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0
+; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-NEXT: s_endpgm
+>>>>>>> 41d8a9928050 (16bit sgpr folding)
%mag.trunc = fptrunc double %mag to half
%result = call half @llvm.copysign.f16(half %mag.trunc, half %sign)
store half %result, ptr addrspace(1) %arg_out
@@ -2114,44 +2160,24 @@ define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-TRUE16-LABEL: s_copysign_v2f16:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s3
-; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s3, 16
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s3
-; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, v2, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-TRUE16-NEXT: global_store_b32 v2, v0, s[0:1]
-; GFX11-TRUE16-NEXT: s_endpgm
-;
-; GFX11-FAKE16-LABEL: s_copysign_v2f16:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s3
-; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s3, 16
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0
-; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, 16
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, s2, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-FAKE16-NEXT: global_store_b32 v2, v0, s[0:1]
-; GFX11-FAKE16-NEXT: s_endpgm
+; GFX11-LABEL: s_copysign_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, s3
+; GFX11-NEXT: s_lshr_b32 s3, s3, 16
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0
+; GFX11-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s2, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-NEXT: s_endpgm
%out = call <2 x half> @llvm.copysign.v2f16(<2 x half> %arg_mag, <2 x half> %arg_sign)
store <2 x half> %out, ptr addrspace(1) %arg_out
ret void
@@ -2244,24 +2270,23 @@ define amdgpu_kernel void @s_copysign_v3f16(ptr addrspace(1) %arg_out, <3 x half
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s2
-; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s2
; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 16
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s3
-; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, v2, v3
-; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v4, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_and_b32 v3, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3
+; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, s0, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, s1, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v1, 16, v3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_store_b16 v4, v0, s[4:5] offset:4
; GFX11-TRUE16-NEXT: global_store_b32 v4, v1, s[4:5]
@@ -2391,62 +2416,31 @@ define amdgpu_kernel void @s_copysign_v4f16(ptr addrspace(1) %arg_out, <4 x half
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
-; GFX11-TRUE16-LABEL: s_copysign_v4f16:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
-; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s2
-; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s1, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 16
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s2
-; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
-; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, v2, v3
-; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v4, v5
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, 0
-; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0x7fff, v6, v7
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v2, 16, v0
-; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v3, 16, v4
-; GFX11-TRUE16-NEXT: global_store_b64 v5, v[0:1], s[4:5]
-; GFX11-TRUE16-NEXT: s_endpgm
-;
-; GFX11-FAKE16-LABEL: s_copysign_v4f16:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_clause 0x1
-; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
-; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v0, s3
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s3, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, 16
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, s2
-; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, s1, v0
-; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, s0, v1
-; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s1, 16
-; GFX11-FAKE16-NEXT: s_lshr_b32 s0, s0, 16
-; GFX11-FAKE16-NEXT: v_bfi_b32 v2, 0x7fff, s6, v2
-; GFX11-FAKE16-NEXT: v_bfi_b32 v3, 0x7fff, s0, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v2, 16, v0
-; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v3, 16, v4
-; GFX11-FAKE16-NEXT: global_store_b64 v5, v[0:1], s[4:5]
-; GFX11-FAKE16-NEXT: s_endpgm
+; GFX11-LABEL: s_copysign_v4f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v0, s3
+; GFX11-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-NEXT: s_lshr_b32 s3, s3, 16
+; GFX11-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, s2
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s1, v0
+; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s0, v1
+; GFX11-NEXT: s_lshr_b32 s6, s1, 16
+; GFX11-NEXT: s_lshr_b32 s0, s0, 16
+; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, s6, v2
+; GFX11-NEXT: v_bfi_b32 v3, 0x7fff, s0, v3
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v0
+; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v4
+; GFX11-NEXT: global_store_b64 v5, v[0:1], s[4:5]
+; GFX11-NEXT: s_endpgm
%out = call <4 x half> @llvm.copysign.v4f16(<4 x half> %arg_mag, <4 x half> %arg_sign)
store <4 x half> %out, ptr addrspace(1) %arg_out
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
index 9642b36ecb7e8..67bec43078803 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
@@ -55,10 +55,8 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s2, 16
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_sub_f16_e64 v0.l, v0.l, |v0.h|
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_sub_f16_e64 v0.l, s3, |s2|
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
@@ -132,10 +130,8 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s2, 16
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.l, v0.l, -|v0.h|
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.l, s3, -|s2|
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/fp-classify.ll b/llvm/test/CodeGen/AMDGPU/fp-classify.ll
index 6a0d52962265d..498df8a65feda 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-classify.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-classify.ll
@@ -620,32 +620,18 @@ define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %ou
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
-; GFX11-TRUE16-LABEL: test_isinf_pattern_f16:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
-; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cmp_class_f16_e64 s2, v0.l, 0x204
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
-; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX11-TRUE16-NEXT: s_endpgm
-;
-; GFX11-FAKE16-LABEL: test_isinf_pattern_f16:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_clause 0x1
-; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
-; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_cmp_class_f16_e64 s2, s2, 0x204
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-FAKE16-NEXT: s_endpgm
+; GFX11-LABEL: test_isinf_pattern_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_cmp_class_f16_e64 s2, s2, 0x204
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_endpgm
%fabs = tail call half @llvm.fabs.f16(half %x) #1
%cmp = fcmp oeq half %fabs, 0xH7C00
%ext = zext i1 %cmp to i32
@@ -684,32 +670,18 @@ define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocaptur
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
-; GFX11-TRUE16-LABEL: test_isfinite_pattern_0_f16:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
-; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cmp_class_f16_e64 s2, v0.l, 0x1f8
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
-; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX11-TRUE16-NEXT: s_endpgm
-;
-; GFX11-FAKE16-LABEL: test_isfinite_pattern_0_f16:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_clause 0x1
-; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
-; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_cmp_class_f16_e64 s2, s2, 0x1f8
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-FAKE16-NEXT: s_endpgm
+; GFX11-LABEL: test_isfinite_pattern_0_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_cmp_class_f16_e64 s2, s2, 0x1f8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_endpgm
%ord = fcmp ord half %x, 0.0
%x.fabs = tail call half @llvm.fabs.f16(half %x) #1
%ninf = fcmp une half %x.fabs, 0xH7C00
@@ -747,32 +719,18 @@ define amdgpu_kernel void @test_isfinite_pattern_4_f16(ptr addrspace(1) nocaptur
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
-; GFX11-TRUE16-LABEL: test_isfinite_pattern_4_f16:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
-; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cmp_class_f16_e64 s2, v0.l, 0x1f8
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
-; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX11-TRUE16-NEXT: s_endpgm
-;
-; GFX11-FAKE16-LABEL: test_isfinite_pattern_4_f16:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_clause 0x1
-; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
-; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_cmp_class_f16_e64 s2, s2, 0x1f8
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-FAKE16-NEXT: s_endpgm
+; GFX11-LABEL: test_isfinite_pattern_4_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_cmp_class_f16_e64 s2, s2, 0x1f8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_endpgm
%ord = fcmp ord half %x, 0.0
%x.fabs = tail call half @llvm.fabs.f16(half %x) #1
%ninf = fcmp one half %x.fabs, 0xH7C00
@@ -786,3 +744,6 @@ declare half @llvm.fabs.f16(half) #1
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX11-FAKE16: {{.*}}
+; GFX11-TRUE16: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
index 59ba9b72e2911..fa358c92e07ea 100644
--- a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
@@ -393,10 +393,8 @@ define amdgpu_kernel void @s_fneg_fpext_f16_to_f32(ptr addrspace(1) %r, i32 %a)
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
index f84e14ea62273..97a94edc9205a 100644
--- a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
@@ -616,11 +616,10 @@ define amdgpu_kernel void @fptosi_f16_to_i1(ptr addrspace(1) %out, half %in) {
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
-; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
+; GFX11-TRUE16-NEXT: v_cmp_eq_f16_e64 s2, -1.0, s2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cmp_eq_f16_e32 vcc_lo, -1.0, v0.l
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX11-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
index bba3a23df11a5..72ddc32b2ba5c 100644
--- a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
@@ -614,11 +614,10 @@ define amdgpu_kernel void @fptoui_f16_to_i1(ptr addrspace(1) %out, half %in) {
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
-; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
+; GFX11-TRUE16-NEXT: v_cmp_eq_f16_e64 s2, 1.0, s2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cmp_eq_f16_e32 vcc_lo, 1.0, v0.l
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX11-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/icmp.i16.ll b/llvm/test/CodeGen/AMDGPU/icmp.i16.ll
index 77575c78fb349..6a4ae7f4e0d78 100644
--- a/llvm/test/CodeGen/AMDGPU/icmp.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/icmp.i16.ll
@@ -1,8 +1,6 @@
; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s| FileCheck -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s| FileCheck -check-prefixes=GCN,GFX11-FAKE16 %s
-; FIXME-TRUE16. In true16 flow, the codegen introduces addtional s2v copy and mov, and revert the operand order thus picking different cmp instructions
-; This should be corrected after addtional mov/copy is removed
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s| FileCheck -check-prefixes=GCN,GFX11-TRUE16 %s
;;;==========================================================================;;;
@@ -215,7 +213,7 @@ entry:
; VI: v_cmp_eq_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
; SI: v_cmp_eq_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
; GFX11-FAKE16: v_cmp_eq_u16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}
-; GFX11-TRUE16: v_cmp_eq_u16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}}
+; GFX11-TRUE16: v_cmp_eq_u16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}.{{(l|h)}}
define amdgpu_kernel void @i16_eq_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -233,7 +231,7 @@ entry:
; VI: v_cmp_ne_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
; SI: v_cmp_ne_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
; GFX11-FAKE16: v_cmp_ne_u16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}
-; GFX11-TRUE16: v_cmp_ne_u16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}}
+; GFX11-TRUE16: v_cmp_ne_u16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}.{{(l|h)}}
define amdgpu_kernel void @i16_ne_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -251,7 +249,7 @@ entry:
; VI: v_cmp_lt_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
; SI: v_cmp_lt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
; GFX11-FAKE16: v_cmp_lt_u16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}
-; GFX11-TRUE16: v_cmp_gt_u16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}}
+; GFX11-TRUE16: v_cmp_lt_u16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}.{{(l|h)}}
define amdgpu_kernel void @i16_ugt_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -269,7 +267,7 @@ entry:
; VI: v_cmp_le_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
; SI: v_cmp_le_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
; GFX11-FAKE16: v_cmp_le_u16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}
-; GFX11-TRUE16: v_cmp_ge_u16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}}
+; GFX11-TRUE16: v_cmp_le_u16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}.{{(l|h)}}
define amdgpu_kernel void @i16_uge_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -287,7 +285,7 @@ entry:
; VI: v_cmp_gt_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
; SI: v_cmp_gt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
; GFX11-FAKE16: v_cmp_gt_u16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}
-; GFX11-TRUE16: v_cmp_lt_u16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}}
+; GFX11-TRUE16: v_cmp_gt_u16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}.{{(l|h)}}
define amdgpu_kernel void @i16_ult_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -305,7 +303,7 @@ entry:
; VI: v_cmp_ge_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
; SI: v_cmp_ge_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
; GFX11-FAKE16: v_cmp_ge_u16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}
-; GFX11-TRUE16: v_cmp_le_u16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}}
+; GFX11-TRUE16: v_cmp_ge_u16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}.{{(l|h)}}
define amdgpu_kernel void @i16_ule_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -323,7 +321,7 @@ entry:
; VI: v_cmp_lt_i16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
; SI: v_cmp_lt_i32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
; GFX11-FAKE16: v_cmp_lt_i16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}
-; GFX11-TRUE16: v_cmp_gt_i16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}}
+; GFX11-TRUE16: v_cmp_lt_i16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}.{{(l|h)}}
define amdgpu_kernel void @i16_sgt_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -341,7 +339,7 @@ entry:
; VI: v_cmp_le_i16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
; SI: v_cmp_le_i32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
; GFX11-FAKE16: v_cmp_le_i16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}
-; GFX11-TRUE16: v_cmp_ge_i16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}}
+; GFX11-TRUE16: v_cmp_le_i16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}.{{(l|h)}}
define amdgpu_kernel void @i16_sge_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -359,7 +357,7 @@ entry:
; VI: v_cmp_gt_i16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
; SI: v_cmp_gt_i32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
; GFX11-FAKE16: v_cmp_gt_i16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}
-; GFX11-TRUE16: v_cmp_lt_i16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}}
+; GFX11-TRUE16: v_cmp_gt_i16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}.{{(l|h)}}
define amdgpu_kernel void @i16_slt_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -377,7 +375,7 @@ entry:
; VI: v_cmp_ge_i16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
; SI: v_cmp_ge_i32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
; GFX11-FAKE16: v_cmp_ge_i16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}
-; GFX11-TRUE16: v_cmp_le_i16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}}
+; GFX11-TRUE16: v_cmp_ge_i16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}.{{(l|h)}}
define amdgpu_kernel void @i16_sle_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/imm16.ll b/llvm/test/CodeGen/AMDGPU/imm16.ll
index bc4a8634dbe50..8ca87678a36f3 100644
--- a/llvm/test/CodeGen/AMDGPU/imm16.ll
+++ b/llvm/test/CodeGen/AMDGPU/imm16.ll
@@ -647,10 +647,8 @@ define amdgpu_kernel void @add_inline_imm_0.0_f16(ptr addrspace(1) %out, half %x
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e]
+; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s2, 0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x00,0x01,0x00]
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
-; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 0, v0.l ; encoding: [0x80,0x00,0x00,0x64]
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
@@ -715,10 +713,8 @@ define amdgpu_kernel void @add_inline_imm_0.5_f16(ptr addrspace(1) %out, half %x
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e]
+; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s2, 0.5 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe0,0x01,0x00]
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
-; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 0.5, v0.l ; encoding: [0xf0,0x00,0x00,0x64]
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
@@ -783,10 +779,8 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_f16(ptr addrspace(1) %out, hal
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e]
+; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s2, -0.5 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe2,0x01,0x00]
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
-; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, -0.5, v0.l ; encoding: [0xf1,0x00,0x00,0x64]
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
@@ -851,10 +845,8 @@ define amdgpu_kernel void @add_inline_imm_1.0_f16(ptr addrspace(1) %out, half %x
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e]
+; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s2, 1.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe4,0x01,0x00]
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
-; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l ; encoding: [0xf2,0x00,0x00,0x64]
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
@@ -919,10 +911,8 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_f16(ptr addrspace(1) %out, hal
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e]
+; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s2, -1.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe6,0x01,0x00]
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
-; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, -1.0, v0.l ; encoding: [0xf3,0x00,0x00,0x64]
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
@@ -987,10 +977,8 @@ define amdgpu_kernel void @add_inline_imm_2.0_f16(ptr addrspace(1) %out, half %x
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e]
+; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s2, 2.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe8,0x01,0x00]
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
-; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 2.0, v0.l ; encoding: [0xf4,0x00,0x00,0x64]
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
@@ -1055,10 +1043,8 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_f16(ptr addrspace(1) %out, hal
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e]
+; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s2, -2.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xea,0x01,0x00]
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
-; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, -2.0, v0.l ; encoding: [0xf5,0x00,0x00,0x64]
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
@@ -1123,10 +1109,8 @@ define amdgpu_kernel void @add_inline_imm_4.0_f16(ptr addrspace(1) %out, half %x
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e]
+; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s2, 4.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xec,0x01,0x00]
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
-; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 4.0, v0.l ; encoding: [0xf6,0x00,0x00,0x64]
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
@@ -1191,10 +1175,8 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f16(ptr addrspace(1) %out, hal
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e]
+; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s2, -4.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xee,0x01,0x00]
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
-; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, -4.0, v0.l ; encoding: [0xf7,0x00,0x00,0x64]
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
@@ -1455,10 +1437,8 @@ define amdgpu_kernel void @add_inline_imm_1_f16(ptr addrspace(1) %out, half %x)
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e]
+; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s2, 1 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x02,0x01,0x00]
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
-; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 1, v0.l ; encoding: [0x81,0x00,0x00,0x64]
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
@@ -1523,10 +1503,8 @@ define amdgpu_kernel void @add_inline_imm_2_f16(ptr addrspace(1) %out, half %x)
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e]
+; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s2, 2 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x04,0x01,0x00]
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
-; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 2, v0.l ; encoding: [0x82,0x00,0x00,0x64]
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
@@ -1591,10 +1569,8 @@ define amdgpu_kernel void @add_inline_imm_16_f16(ptr addrspace(1) %out, half %x)
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e]
+; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s2, 16 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x20,0x01,0x00]
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
-; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 16, v0.l ; encoding: [0x90,0x00,0x00,0x64]
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
@@ -1896,10 +1872,8 @@ define amdgpu_kernel void @add_inline_imm_63_f16(ptr addrspace(1) %out, half %x)
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e]
+; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s2, 63 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x7e,0x01,0x00]
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
-; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 63, v0.l ; encoding: [0xbf,0x00,0x00,0x64]
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
@@ -1964,10 +1938,8 @@ define amdgpu_kernel void @add_inline_imm_64_f16(ptr addrspace(1) %out, half %x)
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e]
+; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s2, 64 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x80,0x01,0x00]
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
-; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 64, v0.l ; encoding: [0xc0,0x00,0x00,0x64]
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index 49879f66dd852..04b6b33a3cc75 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -199,9 +199,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspac
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: s_mov_b32 s4, s2
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s2
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s3
; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-TRUE16-NEXT: ;;#ASMSTART
@@ -356,42 +354,23 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa
; CI-NEXT: ;;#ASMEND
; CI-NEXT: s_endpgm
;
-; GFX11-TRUE16-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x10
-; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s4, 16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: s_mov_b32 s4, s3
-; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, s4, s2
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-TRUE16-NEXT: ;;#ASMSTART
-; GFX11-TRUE16-NEXT: ; use s3
-; GFX11-TRUE16-NEXT: ;;#ASMEND
-; GFX11-TRUE16-NEXT: s_endpgm
-;
-; GFX11-FAKE16-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_clause 0x1
-; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x10
-; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s4, 16
-; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s2, s3, s2
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-FAKE16-NEXT: ;;#ASMSTART
-; GFX11-FAKE16-NEXT: ; use s3
-; GFX11-FAKE16-NEXT: ;;#ASMEND
-; GFX11-FAKE16-NEXT: s_endpgm
+; GFX11-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX11-NEXT: s_lshr_b32 s3, s4, 16
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_pack_lh_b32_b16 s2, s3, s2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s3
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: s_endpgm
%vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
%elt.hi = lshr i32 %elt.arg, 16
%elt = trunc i32 %elt.hi to i16
@@ -468,52 +447,27 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad
; CI-NEXT: ;;#ASMEND
; CI-NEXT: s_endpgm
;
-; GFX11-TRUE16-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x10
-; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s4, 16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: s_mov_b32 s4, s3
-; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 16
-; GFX11-TRUE16-NEXT: s_mov_b32 s5, s2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s5
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
-; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-TRUE16-NEXT: ;;#ASMSTART
-; GFX11-TRUE16-NEXT: ; use s3
-; GFX11-TRUE16-NEXT: ;;#ASMEND
-; GFX11-TRUE16-NEXT: ;;#ASMSTART
-; GFX11-TRUE16-NEXT: ; use s2
-; GFX11-TRUE16-NEXT: ;;#ASMEND
-; GFX11-TRUE16-NEXT: s_endpgm
-;
-; GFX11-FAKE16-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_clause 0x1
-; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x10
-; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s4, 16
-; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, 16
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s3, s2
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
-; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-FAKE16-NEXT: ;;#ASMSTART
-; GFX11-FAKE16-NEXT: ; use s3
-; GFX11-FAKE16-NEXT: ;;#ASMEND
-; GFX11-FAKE16-NEXT: ;;#ASMSTART
-; GFX11-FAKE16-NEXT: ; use s2
-; GFX11-FAKE16-NEXT: ;;#ASMEND
-; GFX11-FAKE16-NEXT: s_endpgm
+; GFX11-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX11-NEXT: s_lshr_b32 s3, s4, 16
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_pack_ll_b32_b16 s4, s3, s2
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s3
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: ;;#ASMSTART
+; GFX11-NEXT: ; use s2
+; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: s_endpgm
%vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
%elt.hi = lshr i32 %elt.arg, 16
%elt = trunc i32 %elt.hi to i16
@@ -1792,34 +1746,19 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr ad
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
;
-; GFX11-TRUE16-LABEL: v_insertelement_v4f16_0:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x30
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v2, s[2:3]
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s4
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v3, v0
-; GFX11-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-TRUE16-NEXT: s_endpgm
-;
-; GFX11-FAKE16-LABEL: v_insertelement_v4f16_0:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x30
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v2, s[2:3]
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0xffff, s4, v0
-; GFX11-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-FAKE16-NEXT: s_endpgm
+; GFX11-LABEL: v_insertelement_v4f16_0:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x30
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_bfi_b32 v0, 0xffff, s4, v0
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %tid.ext
@@ -1978,34 +1917,19 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr ad
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
;
-; GFX11-TRUE16-LABEL: v_insertelement_v4f16_2:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x30
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v2, s[2:3]
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s4
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v3, v1
-; GFX11-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-TRUE16-NEXT: s_endpgm
-;
-; GFX11-FAKE16-LABEL: v_insertelement_v4f16_2:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x30
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v2, s[2:3]
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0xffff, s4, v1
-; GFX11-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-FAKE16-NEXT: s_endpgm
+; GFX11-LABEL: v_insertelement_v4f16_2:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x30
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_bfi_b32 v1, 0xffff, s4, v1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %tid.ext
@@ -2164,34 +2088,19 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr ad
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
;
-; GFX11-TRUE16-LABEL: v_insertelement_v4i16_2:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x10
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v2, s[2:3]
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s4
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v3, v1
-; GFX11-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-TRUE16-NEXT: s_endpgm
-;
-; GFX11-FAKE16-LABEL: v_insertelement_v4i16_2:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x10
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v2, s[2:3]
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0xffff, s4, v1
-; GFX11-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-FAKE16-NEXT: s_endpgm
+; GFX11-LABEL: v_insertelement_v4i16_2:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_bfi_b32 v1, 0xffff, s4, v1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %in, i64 %tid.ext
@@ -2583,34 +2492,19 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad
; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; CI-NEXT: s_endpgm
;
-; GFX11-TRUE16-LABEL: v_insertelement_v8i16_6:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x10
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 4, v0
-; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_b128 v[0:3], v4, s[2:3]
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s4
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v5, v3
-; GFX11-TRUE16-NEXT: global_store_b128 v4, v[0:3], s[0:1]
-; GFX11-TRUE16-NEXT: s_endpgm
-;
-; GFX11-FAKE16-LABEL: v_insertelement_v8i16_6:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x10
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 4, v0
-; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: global_load_b128 v[0:3], v4, s[2:3]
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_bfi_b32 v3, 0xffff, s4, v3
-; GFX11-FAKE16-NEXT: global_store_b128 v4, v[0:3], s[0:1]
-; GFX11-FAKE16-NEXT: s_endpgm
+; GFX11-LABEL: v_insertelement_v8i16_6:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_bfi_b32 v3, 0xffff, s4, v3
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <8 x i16>, ptr addrspace(1) %in, i64 %tid.ext
@@ -2799,11 +2693,10 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out,
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 4, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 4, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_b128 v[0:3], v6, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_b128 v[0:3], v5, s[2:3]
; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 6
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s4
; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 7
; GFX11-TRUE16-NEXT: s_cselect_b32 s3, -1, 0
@@ -2816,23 +2709,23 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out,
; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 3
; GFX11-TRUE16-NEXT: s_cselect_b32 s9, -1, 0
; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 0
-; GFX11-TRUE16-NEXT: s_cselect_b32 s4, -1, 0
+; GFX11-TRUE16-NEXT: s_cselect_b32 s10, -1, 0
; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 1
; GFX11-TRUE16-NEXT: s_cselect_b32 s5, -1, 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v0
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v5.l, s2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v2.l, v5.l, s6
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v5.l, s8
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.l, v5.l, s4
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v7.l, v5.l, s3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v8.l, v5.l, s7
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v9.l, v5.l, s9
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v10.l, v5.l, s5
-; GFX11-TRUE16-NEXT: global_store_b128 v6, v[1:4], s[0:1]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, s4, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v2.l, s4, s6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, s4, s8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.l, s4, s10
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v6.l, s4, s3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v7.l, s4, s7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v8.l, s4, s9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v9.l, s4, s5
+; GFX11-TRUE16-NEXT: global_store_b128 v5, v[1:4], s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: v_insertelement_v8f16_dynamic:
@@ -3082,45 +2975,24 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a
; CI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; CI-NEXT: s_endpgm
;
-; GFX11-TRUE16-LABEL: v_insertelement_v16i16_6:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x10
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 5, v0
-; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: global_load_b128 v[0:3], v8, s[2:3]
-; GFX11-TRUE16-NEXT: global_load_b128 v[4:7], v8, s[2:3] offset:16
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s4
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v9, v3
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
-; GFX11-TRUE16-NEXT: global_store_b128 v8, v[0:3], s[0:1]
-; GFX11-TRUE16-NEXT: s_endpgm
-;
-; GFX11-FAKE16-LABEL: v_insertelement_v16i16_6:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x10
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 5, v0
-; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x1
-; GFX11-FAKE16-NEXT: global_load_b128 v[0:3], v8, s[2:3]
-; GFX11-FAKE16-NEXT: global_load_b128 v[4:7], v8, s[2:3] offset:16
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FAKE16-NEXT: v_bfi_b32 v3, 0xffff, s4, v3
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x1
-; GFX11-FAKE16-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
-; GFX11-FAKE16-NEXT: global_store_b128 v8, v[0:3], s[0:1]
-; GFX11-FAKE16-NEXT: s_endpgm
+; GFX11-LABEL: v_insertelement_v16i16_6:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 5, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b128 v[0:3], v8, s[2:3]
+; GFX11-NEXT: global_load_b128 v[4:7], v8, s[2:3] offset:16
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_bfi_b32 v3, 0xffff, s4, v3
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
+; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1]
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <16 x i16>, ptr addrspace(1) %in, i64 %tid.ext
@@ -3453,7 +3325,6 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out,
; GFX11-TRUE16-NEXT: global_load_b128 v[0:3], v12, s[2:3]
; GFX11-TRUE16-NEXT: global_load_b128 v[4:7], v12, s[2:3] offset:16
; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 6
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, s4
; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 7
; GFX11-TRUE16-NEXT: s_cselect_b32 s3, -1, 0
@@ -3482,35 +3353,35 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out,
; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 11
; GFX11-TRUE16-NEXT: s_cselect_b32 s17, -1, 0
; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 8
-; GFX11-TRUE16-NEXT: s_cselect_b32 s4, -1, 0
+; GFX11-TRUE16-NEXT: s_cselect_b32 s18, -1, 0
; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 9
; GFX11-TRUE16-NEXT: s_cselect_b32 s5, -1, 0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v3.l, v8.h, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v3.l, s4, s2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v2.l, v8.h, s6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v2.l, s4, s6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v1.l, v8.h, s8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v1.l, s4, s8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v1
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v0.l, v8.h, s10
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v0.l, s4, s10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v7.l, v8.h, s12
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v7.l, s4, s12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v6.l, v8.h, s14
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v6.l, s4, s14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v8.h, s16
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, s4, s16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.l, v8.h, s4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.l, s4, s18
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v7.l, v8.h, s13
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v6.l, v8.h, s15
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.l, v8.h, s17
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.h, v13.l, v8.h, s3
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v8.h, s5
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.h, v14.l, v8.h, s7
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v15.l, v8.h, s9
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v16.l, v8.h, s11
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v7.l, s4, s13
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v6.l, s4, s15
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.l, s4, s17
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.h, v13.l, s4, s3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, s4, s5
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.h, v14.l, s4, s7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v15.l, s4, s9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v16.l, s4, s11
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_store_b128 v12, v[0:3], s[0:1] offset:16
; GFX11-TRUE16-NEXT: global_store_b128 v12, v[8:11], s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll
index 07421afde7622..b77b2f7441a0c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll
@@ -78,20 +78,19 @@ define amdgpu_kernel void @v_alignbyte_b32_2(ptr addrspace(1) %out, ptr addrspac
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_b32 v2, v0, s[6:7] glc dlc
+; GFX11-TRUE16-NEXT: global_load_b32 v0, v0, s[6:7] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x3c
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_alignbyte_b32 v0, v1, v2, v0.l
-; GFX11-TRUE16-NEXT: global_store_b32 v3, v0, s[0:1]
+; GFX11-TRUE16-NEXT: v_alignbyte_b32 v0, v1, v0, s2
+; GFX11-TRUE16-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: v_alignbyte_b32_2:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll
index 839892e38db49..d8e2ce3728a9b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll
@@ -35,25 +35,15 @@ define amdgpu_kernel void @tbuffer_store_d16_x(ptr addrspace(8) %rsrc, half %dat
; GFX10-PACKED-NEXT: tbuffer_store_format_d16_x v0, off, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED]
; GFX10-PACKED-NEXT: s_endpgm
;
-; GFX11-PACKED-TRUE16-LABEL: tbuffer_store_d16_x:
-; GFX11-PACKED-TRUE16: ; %bb.0: ; %main_body
-; GFX11-PACKED-TRUE16-NEXT: s_clause 0x1
-; GFX11-PACKED-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34
-; GFX11-PACKED-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-PACKED-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-PACKED-TRUE16-NEXT: v_mov_b16_e32 v0.l, s6
-; GFX11-PACKED-TRUE16-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM]
-; GFX11-PACKED-TRUE16-NEXT: s_endpgm
-;
-; GFX11-PACKED-FAKE16-LABEL: tbuffer_store_d16_x:
-; GFX11-PACKED-FAKE16: ; %bb.0: ; %main_body
-; GFX11-PACKED-FAKE16-NEXT: s_clause 0x1
-; GFX11-PACKED-FAKE16-NEXT: s_load_b32 s6, s[4:5], 0x34
-; GFX11-PACKED-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-PACKED-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-PACKED-FAKE16-NEXT: v_mov_b32_e32 v0, s6
-; GFX11-PACKED-FAKE16-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM]
-; GFX11-PACKED-FAKE16-NEXT: s_endpgm
+; GFX11-PACKED-LABEL: tbuffer_store_d16_x:
+; GFX11-PACKED: ; %bb.0: ; %main_body
+; GFX11-PACKED-NEXT: s_clause 0x1
+; GFX11-PACKED-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s6
+; GFX11-PACKED-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM]
+; GFX11-PACKED-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.raw.ptr.tbuffer.store.f16(half %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 33, i32 0)
ret void
@@ -217,3 +207,6 @@ declare void @llvm.amdgcn.raw.ptr.tbuffer.store.f16(half, ptr addrspace(8), i32,
declare void @llvm.amdgcn.raw.ptr.tbuffer.store.v2f16(<2 x half>, ptr addrspace(8), i32, i32, i32, i32)
declare void @llvm.amdgcn.raw.ptr.tbuffer.store.v3f16(<3 x half>, ptr addrspace(8), i32, i32, i32, i32)
declare void @llvm.amdgcn.raw.ptr.tbuffer.store.v4f16(<4 x half>, ptr addrspace(8), i32, i32, i32, i32)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX11-PACKED-FAKE16: {{.*}}
+; GFX11-PACKED-TRUE16: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll
index c53c491c216e7..052f7f1c8310b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll
@@ -39,55 +39,25 @@ define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data) {
; GFX10-PACKED-NEXT: tbuffer_store_format_d16_x v0, off, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED]
; GFX10-PACKED-NEXT: s_endpgm
;
-; GFX11-PACKED-TRUE16-LABEL: tbuffer_store_d16_x:
-; GFX11-PACKED-TRUE16: ; %bb.0: ; %main_body
-; GFX11-PACKED-TRUE16-NEXT: s_clause 0x1
-; GFX11-PACKED-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34
-; GFX11-PACKED-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-PACKED-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-PACKED-TRUE16-NEXT: v_mov_b16_e32 v0.l, s6
-; GFX11-PACKED-TRUE16-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM]
-; GFX11-PACKED-TRUE16-NEXT: s_endpgm
-;
-; GFX11-PACKED-FAKE16-LABEL: tbuffer_store_d16_x:
-; GFX11-PACKED-FAKE16: ; %bb.0: ; %main_body
-; GFX11-PACKED-FAKE16-NEXT: s_clause 0x1
-; GFX11-PACKED-FAKE16-NEXT: s_load_b32 s6, s[4:5], 0x34
-; GFX11-PACKED-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-PACKED-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-PACKED-FAKE16-NEXT: v_mov_b32_e32 v0, s6
-; GFX11-PACKED-FAKE16-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM]
-; GFX11-PACKED-FAKE16-NEXT: s_endpgm
-;
-; GFX12-PACKED-SDAG-TRUE16-LABEL: tbuffer_store_d16_x:
-; GFX12-PACKED-SDAG-TRUE16: ; %bb.0: ; %main_body
-; GFX12-PACKED-SDAG-TRUE16-NEXT: s_clause 0x1
-; GFX12-PACKED-SDAG-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34
-; GFX12-PACKED-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-PACKED-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-PACKED-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, s6
-; GFX12-PACKED-SDAG-TRUE16-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM]
-; GFX12-PACKED-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-PACKED-SDAG-FAKE16-LABEL: tbuffer_store_d16_x:
-; GFX12-PACKED-SDAG-FAKE16: ; %bb.0: ; %main_body
-; GFX12-PACKED-SDAG-FAKE16-NEXT: s_clause 0x1
-; GFX12-PACKED-SDAG-FAKE16-NEXT: s_load_b32 s6, s[4:5], 0x34
-; GFX12-PACKED-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-PACKED-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-PACKED-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, s6
-; GFX12-PACKED-SDAG-FAKE16-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM]
-; GFX12-PACKED-SDAG-FAKE16-NEXT: s_endpgm
+; GFX11-PACKED-LABEL: tbuffer_store_d16_x:
+; GFX11-PACKED: ; %bb.0: ; %main_body
+; GFX11-PACKED-NEXT: s_clause 0x1
+; GFX11-PACKED-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s6
+; GFX11-PACKED-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM]
+; GFX11-PACKED-NEXT: s_endpgm
;
-; GFX12-PACKED-GISEL-LABEL: tbuffer_store_d16_x:
-; GFX12-PACKED-GISEL: ; %bb.0: ; %main_body
-; GFX12-PACKED-GISEL-NEXT: s_clause 0x1
-; GFX12-PACKED-GISEL-NEXT: s_load_b32 s6, s[4:5], 0x34
-; GFX12-PACKED-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-PACKED-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v0, s6
-; GFX12-PACKED-GISEL-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM]
-; GFX12-PACKED-GISEL-NEXT: s_endpgm
+; GFX12-PACKED-LABEL: tbuffer_store_d16_x:
+; GFX12-PACKED: ; %bb.0: ; %main_body
+; GFX12-PACKED-NEXT: s_clause 0x1
+; GFX12-PACKED-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-PACKED-NEXT: s_wait_kmcnt 0x0
+; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-PACKED-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM]
+; GFX12-PACKED-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.raw.tbuffer.store.f16(half %data, <4 x i32> %rsrc, i32 0, i32 0, i32 33, i32 0)
ret void
@@ -298,5 +268,9 @@ declare void @llvm.amdgcn.raw.tbuffer.store.v2f16(<2 x half>, <4 x i32>, i32, i3
declare void @llvm.amdgcn.raw.tbuffer.store.v3f16(<3 x half>, <4 x i32>, i32, i32, i32, i32)
declare void @llvm.amdgcn.raw.tbuffer.store.v4f16(<4 x half>, <4 x i32>, i32, i32, i32, i32)
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX11-PACKED-FAKE16: {{.*}}
+; GFX11-PACKED-TRUE16: {{.*}}
; GFX12-PACKED-GISEL-FAKE16: {{.*}}
; GFX12-PACKED-GISEL-TRUE16: {{.*}}
+; GFX12-PACKED-SDAG-FAKE16: {{.*}}
+; GFX12-PACKED-SDAG-TRUE16: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll
index 530ace778cdc9..fc8f8afa82c2d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll
@@ -38,27 +38,16 @@ define amdgpu_kernel void @tbuffer_store_d16_x(ptr addrspace(8) %rsrc, half %dat
; GFX10-PACKED-NEXT: tbuffer_store_format_d16_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen
; GFX10-PACKED-NEXT: s_endpgm
;
-; GFX11-PACKED-TRUE16-LABEL: tbuffer_store_d16_x:
-; GFX11-PACKED-TRUE16: ; %bb.0: ; %main_body
-; GFX11-PACKED-TRUE16-NEXT: s_clause 0x1
-; GFX11-PACKED-TRUE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x10
-; GFX11-PACKED-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-PACKED-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-PACKED-TRUE16-NEXT: v_mov_b16_e32 v0.l, s6
-; GFX11-PACKED-TRUE16-NEXT: v_mov_b32_e32 v1, s7
-; GFX11-PACKED-TRUE16-NEXT: tbuffer_store_d16_format_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen
-; GFX11-PACKED-TRUE16-NEXT: s_endpgm
-;
-; GFX11-PACKED-FAKE16-LABEL: tbuffer_store_d16_x:
-; GFX11-PACKED-FAKE16: ; %bb.0: ; %main_body
-; GFX11-PACKED-FAKE16-NEXT: s_clause 0x1
-; GFX11-PACKED-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x10
-; GFX11-PACKED-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-PACKED-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-PACKED-FAKE16-NEXT: v_mov_b32_e32 v0, s6
-; GFX11-PACKED-FAKE16-NEXT: v_mov_b32_e32 v1, s7
-; GFX11-PACKED-FAKE16-NEXT: tbuffer_store_d16_format_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen
-; GFX11-PACKED-FAKE16-NEXT: s_endpgm
+; GFX11-PACKED-LABEL: tbuffer_store_d16_x:
+; GFX11-PACKED: ; %bb.0: ; %main_body
+; GFX11-PACKED-NEXT: s_clause 0x1
+; GFX11-PACKED-NEXT: s_load_b64 s[6:7], s[4:5], 0x10
+; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s6
+; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s7
+; GFX11-PACKED-NEXT: tbuffer_store_d16_format_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen
+; GFX11-PACKED-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.struct.ptr.tbuffer.store.f16(half %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0)
ret void
@@ -242,3 +231,6 @@ declare void @llvm.amdgcn.struct.ptr.tbuffer.store.f16(half, ptr addrspace(8), i
declare void @llvm.amdgcn.struct.ptr.tbuffer.store.v2f16(<2 x half>, ptr addrspace(8), i32, i32, i32, i32, i32)
declare void @llvm.amdgcn.struct.ptr.tbuffer.store.v3f16(<3 x half>, ptr addrspace(8), i32, i32, i32, i32, i32)
declare void @llvm.amdgcn.struct.ptr.tbuffer.store.v4f16(<4 x half>, ptr addrspace(8), i32, i32, i32, i32, i32)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX11-PACKED-FAKE16: {{.*}}
+; GFX11-PACKED-TRUE16: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll
index bdb82999197d9..d025e7a15e25a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll
@@ -44,60 +44,27 @@ define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data, i32
; GFX10-PACKED-NEXT: tbuffer_store_format_d16_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen
; GFX10-PACKED-NEXT: s_endpgm
;
-; GFX11-PACKED-TRUE16-LABEL: tbuffer_store_d16_x:
-; GFX11-PACKED-TRUE16: ; %bb.0: ; %main_body
-; GFX11-PACKED-TRUE16-NEXT: s_clause 0x1
-; GFX11-PACKED-TRUE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x10
-; GFX11-PACKED-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-PACKED-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-PACKED-TRUE16-NEXT: v_mov_b16_e32 v0.l, s6
-; GFX11-PACKED-TRUE16-NEXT: v_mov_b32_e32 v1, s7
-; GFX11-PACKED-TRUE16-NEXT: tbuffer_store_d16_format_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen
-; GFX11-PACKED-TRUE16-NEXT: s_endpgm
-;
-; GFX11-PACKED-FAKE16-LABEL: tbuffer_store_d16_x:
-; GFX11-PACKED-FAKE16: ; %bb.0: ; %main_body
-; GFX11-PACKED-FAKE16-NEXT: s_clause 0x1
-; GFX11-PACKED-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x10
-; GFX11-PACKED-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-PACKED-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-PACKED-FAKE16-NEXT: v_mov_b32_e32 v0, s6
-; GFX11-PACKED-FAKE16-NEXT: v_mov_b32_e32 v1, s7
-; GFX11-PACKED-FAKE16-NEXT: tbuffer_store_d16_format_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen
-; GFX11-PACKED-FAKE16-NEXT: s_endpgm
-;
-; GFX12-PACKED-SDAG-TRUE16-LABEL: tbuffer_store_d16_x:
-; GFX12-PACKED-SDAG-TRUE16: ; %bb.0: ; %main_body
-; GFX12-PACKED-SDAG-TRUE16-NEXT: s_clause 0x1
-; GFX12-PACKED-SDAG-TRUE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x10
-; GFX12-PACKED-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX12-PACKED-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-PACKED-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, s6
-; GFX12-PACKED-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s7
-; GFX12-PACKED-SDAG-TRUE16-NEXT: tbuffer_store_d16_format_x v0, v1, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] idxen
-; GFX12-PACKED-SDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX12-PACKED-SDAG-FAKE16-LABEL: tbuffer_store_d16_x:
-; GFX12-PACKED-SDAG-FAKE16: ; %bb.0: ; %main_body
-; GFX12-PACKED-SDAG-FAKE16-NEXT: s_clause 0x1
-; GFX12-PACKED-SDAG-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x10
-; GFX12-PACKED-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX12-PACKED-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-PACKED-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, s6
-; GFX12-PACKED-SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, s7
-; GFX12-PACKED-SDAG-FAKE16-NEXT: tbuffer_store_d16_format_x v0, v1, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] idxen
-; GFX12-PACKED-SDAG-FAKE16-NEXT: s_endpgm
+; GFX11-PACKED-LABEL: tbuffer_store_d16_x:
+; GFX11-PACKED: ; %bb.0: ; %main_body
+; GFX11-PACKED-NEXT: s_clause 0x1
+; GFX11-PACKED-NEXT: s_load_b64 s[6:7], s[4:5], 0x10
+; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s6
+; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s7
+; GFX11-PACKED-NEXT: tbuffer_store_d16_format_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen
+; GFX11-PACKED-NEXT: s_endpgm
;
-; GFX12-PACKED-GISEL-LABEL: tbuffer_store_d16_x:
-; GFX12-PACKED-GISEL: ; %bb.0: ; %main_body
-; GFX12-PACKED-GISEL-NEXT: s_clause 0x1
-; GFX12-PACKED-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x10
-; GFX12-PACKED-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX12-PACKED-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v0, s6
-; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v1, s7
-; GFX12-PACKED-GISEL-NEXT: tbuffer_store_d16_format_x v0, v1, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] idxen
-; GFX12-PACKED-GISEL-NEXT: s_endpgm
+; GFX12-PACKED-LABEL: tbuffer_store_d16_x:
+; GFX12-PACKED: ; %bb.0: ; %main_body
+; GFX12-PACKED-NEXT: s_clause 0x1
+; GFX12-PACKED-NEXT: s_load_b64 s[6:7], s[4:5], 0x10
+; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX12-PACKED-NEXT: s_wait_kmcnt 0x0
+; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-PACKED-NEXT: v_mov_b32_e32 v1, s7
+; GFX12-PACKED-NEXT: tbuffer_store_d16_format_x v0, v1, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] idxen
+; GFX12-PACKED-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.struct.tbuffer.store.f16(half %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0)
ret void
@@ -331,5 +298,9 @@ declare void @llvm.amdgcn.struct.tbuffer.store.v2f16(<2 x half>, <4 x i32>, i32,
declare void @llvm.amdgcn.struct.tbuffer.store.v3f16(<3 x half>, <4 x i32>, i32, i32, i32, i32, i32)
declare void @llvm.amdgcn.struct.tbuffer.store.v4f16(<4 x half>, <4 x i32>, i32, i32, i32, i32, i32)
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX11-PACKED-FAKE16: {{.*}}
+; GFX11-PACKED-TRUE16: {{.*}}
; GFX12-PACKED-GISEL-FAKE16: {{.*}}
; GFX12-PACKED-GISEL-TRUE16: {{.*}}
+; GFX12-PACKED-SDAG-FAKE16: {{.*}}
+; GFX12-PACKED-SDAG-TRUE16: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
index 2378284a521f6..1e8dbb137af9e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
@@ -77,42 +77,17 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) {
; GFX10CHECK-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10CHECK-NEXT: s_endpgm
;
-; GFX11SELDAG-TRUE16-LABEL: sgpr_isnan_f16:
-; GFX11SELDAG-TRUE16: ; %bb.0:
-; GFX11SELDAG-TRUE16-NEXT: s_clause 0x1
-; GFX11SELDAG-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
-; GFX11SELDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0
-; GFX11SELDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11SELDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
-; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s2, v0.l, 3
-; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, s2
-; GFX11SELDAG-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX11SELDAG-TRUE16-NEXT: s_endpgm
-;
-; GFX11SELDAG-FAKE16-LABEL: sgpr_isnan_f16:
-; GFX11SELDAG-FAKE16: ; %bb.0:
-; GFX11SELDAG-FAKE16-NEXT: s_clause 0x1
-; GFX11SELDAG-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
-; GFX11SELDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11SELDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0
-; GFX11SELDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s2, s2, 3
-; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
-; GFX11SELDAG-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11SELDAG-FAKE16-NEXT: s_endpgm
-;
-; GFX11GLISEL-LABEL: sgpr_isnan_f16:
-; GFX11GLISEL: ; %bb.0:
-; GFX11GLISEL-NEXT: s_clause 0x1
-; GFX11GLISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
-; GFX11GLISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11GLISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX11GLISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11GLISEL-NEXT: v_cmp_class_f16_e64 s2, s2, 3
-; GFX11GLISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
-; GFX11GLISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11GLISEL-NEXT: s_endpgm
+; GFX11CHECK-LABEL: sgpr_isnan_f16:
+; GFX11CHECK: ; %bb.0:
+; GFX11CHECK-NEXT: s_clause 0x1
+; GFX11CHECK-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11CHECK-NEXT: v_mov_b32_e32 v0, 0
+; GFX11CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s2, s2, 3
+; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; GFX11CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11CHECK-NEXT: s_endpgm
%result = call i1 @llvm.is.fpclass.f16(half %x, i32 3)
%sext = sext i1 %result to i32
store i32 %sext, ptr addrspace(1) %out, align 4
@@ -4313,4 +4288,5 @@ attributes #0 = { "denormal-fp-math"="ieee,preserve-sign" }
; Maybe daz
attributes #1 = { "denormal-fp-math"="ieee,dynamic" }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX11GLISEL: {{.*}}
; GFX11SELDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
index afede06001736..37081fb853e74 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
@@ -1091,18 +1091,14 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX11-TRUE16-LABEL: s_maximum_v2f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s1
-; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s1, 16
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, s3
-; GFX11-TRUE16-NEXT: v_pk_max_f16 v2, s0, s1
-; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v1.l, v1.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v2.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s2, s0, s1
+; GFX11-TRUE16-NEXT: v_pk_max_f16 v0, s0, s1
+; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, s0, s1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v1.l, s0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
index a645a8ab5d2f8..48ce9598b1264 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
@@ -904,18 +904,14 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
; GFX11-TRUE16-LABEL: s_minimum_v2f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s1
-; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s1, 16
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s2
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, s3
-; GFX11-TRUE16-NEXT: v_pk_min_f16 v2, s0, s1
-; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v1.l, v1.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v2.l, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s2, s0, s1
+; GFX11-TRUE16-NEXT: v_pk_min_f16 v0, s0, s1
+; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, s0, s1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v1.l, s0
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
diff --git a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll
index d6c5c937fd83e..a8e068a4854e3 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll
@@ -802,14 +802,10 @@ define amdgpu_ps <2 x half> @s_constained_fsub_v2f16_fpexcept_strict(<2 x half>
;
; GFX11-SDAG-TRUE16-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, s3
-; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s0, s2, 16
-; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s1, s3, 16
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, s0
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, s1
-; GFX11-SDAG-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.l, v0.h
-; GFX11-SDAG-TRUE16-NEXT: v_sub_f16_e32 v1.l, v1.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT: v_sub_f16_e64 v0.l, s2, s3
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s0, s3, 16
+; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s1, s2, 16
+; GFX11-SDAG-TRUE16-NEXT: v_sub_f16_e64 v1.l, s1, s0
; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-SDAG-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX11-SDAG-TRUE16-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
index a0cfcf671ed0b..1cab175cf492d 100644
--- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
@@ -187,11 +187,8 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg
; SDAG-GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; SDAG-GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, 0
; SDAG-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
-; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, s3
-; SDAG-GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; SDAG-GFX12-TRUE16-NEXT: v_med3_i16 v0.l, v0.l, 0, 0xff
-; SDAG-GFX12-TRUE16-NEXT: v_med3_i16 v1.l, v0.h, 0, 0xff
+; SDAG-GFX12-TRUE16-NEXT: v_med3_i16 v0.l, s2, 0, 0xff
+; SDAG-GFX12-TRUE16-NEXT: v_med3_i16 v1.l, s3, 0, 0xff
; SDAG-GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; SDAG-GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SDAG-GFX12-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
>From 48153d90e33fc806103f2959b7403d9dee2f743f Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Tue, 18 Mar 2025 10:13:06 -0400
Subject: [PATCH 2/2] added mir test
---
llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 1 -
llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 190 ++++------------------
llvm/test/CodeGen/AMDGPU/true16-fold.mir | 60 +++++++
3 files changed, 95 insertions(+), 156 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/true16-fold.mir
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 5319a88ecaee7..a605a96af47e6 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -579,7 +579,6 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
}
MachineOperand *New = Fold.OpToFold;
- // TODO: Temporarily allow folding from SGPRs to 16-bit VGPRs.
// Rework once the VS_16 register class is updated to include proper
// 16-bit SGPRs instead of 32-bit ones.
if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg()))
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index 9e5fedccddc7f..4f77486794527 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -1913,124 +1913,6 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
-<<<<<<< HEAD
-; GFX11-TRUE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x34
-; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s3, 0x1ff
-; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s3, 8
-; GFX11-TRUE16-NEXT: s_or_b32 s2, s5, s2
-; GFX11-TRUE16-NEXT: s_and_b32 s5, s6, 0xffe
-; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
-; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
-; GFX11-TRUE16-NEXT: s_bfe_u32 s2, s3, 0xb0014
-; GFX11-TRUE16-NEXT: s_sub_i32 s3, 0x3f1, s2
-; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0xfc10
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, s5, v0
-; GFX11-TRUE16-NEXT: v_med3_i32 v1, s3, 0, 13
-; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s2, 12
-; GFX11-TRUE16-NEXT: s_cmp_lt_i32 s2, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x1000, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, v1, v3
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v2
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, s3, v0
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX11-TRUE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
-; GFX11-TRUE16-NEXT: s_cmp_lt_i32 s2, 31
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 7, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 2, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v2
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v2
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, 0x7e00
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, 0x7c00, v3 :: v_dual_add_nc_u32 v1, v1, v2
-; GFX11-TRUE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
-; GFX11-TRUE16-NEXT: s_cmpk_eq_i32 s2, 0x40f
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, 0x7c00, v1, vcc_lo
-; GFX11-TRUE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s4
-; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
-; GFX11-TRUE16-NEXT: s_endpgm
-;
-; GFX11-FAKE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_clause 0x1
-; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x34
-; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s3, 0x1ff
-; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s3, 8
-; GFX11-FAKE16-NEXT: s_or_b32 s2, s5, s2
-; GFX11-FAKE16-NEXT: s_and_b32 s5, s6, 0xffe
-; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
-; GFX11-FAKE16-NEXT: s_cselect_b32 s2, -1, 0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
-; GFX11-FAKE16-NEXT: s_bfe_u32 s2, s3, 0xb0014
-; GFX11-FAKE16-NEXT: s_sub_i32 s3, 0x3f1, s2
-; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0xfc10
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, s5, v0
-; GFX11-FAKE16-NEXT: v_med3_i32 v1, s3, 0, 13
-; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s2, 12
-; GFX11-FAKE16-NEXT: s_cmp_lt_i32 s2, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x1000, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v2
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, v1, v3
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v2
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, s3, v0
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
-; GFX11-FAKE16-NEXT: s_cmp_lt_i32 s2, 31
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 7, v1
-; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 2, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v2
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v2
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
-; GFX11-FAKE16-NEXT: s_cmpk_eq_i32 s2, 0x40f
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v3
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, 0x7e00 :: v_dual_add_nc_u32 v1, v1, v2
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7c00, v1, vcc_lo
-; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v2, vcc_lo
-; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v1, v0 :: v_dual_mov_b32 v1, 0
-; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, s4
-; GFX11-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1]
-; GFX11-FAKE16-NEXT: s_endpgm
-=======
; GFX11-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
@@ -2047,49 +1929,47 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
; GFX11-NEXT: s_bfe_u32 s2, s3, 0xb0014
; GFX11-NEXT: s_sub_i32 s3, 0x3f1, s2
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_med3_i32 v1, s3, 0, 13
-; GFX11-NEXT: v_readfirstlane_b32 s3, v0
-; GFX11-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_readfirstlane_b32 s6, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: s_or_b32 s3, s5, s3
-; GFX11-NEXT: s_or_b32 s5, s3, 0x1000
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_lshr_b32 s7, s5, s6
-; GFX11-NEXT: s_lshl_b32 s6, s7, s6
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_cmp_lg_u32 s6, s5
-; GFX11-NEXT: s_cselect_b32 s5, 1, 0
; GFX11-NEXT: s_addk_i32 s2, 0xfc10
-; GFX11-NEXT: s_or_b32 s5, s7, s5
-; GFX11-NEXT: s_lshl_b32 s6, s2, 12
-; GFX11-NEXT: s_or_b32 s6, s3, s6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, s5, v0
+; GFX11-NEXT: v_med3_i32 v1, s3, 0, 13
+; GFX11-NEXT: s_lshl_b32 s3, s2, 12
; GFX11-NEXT: s_cmp_lt_i32 s2, 1
-; GFX11-NEXT: s_cselect_b32 s5, s5, s6
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_b32 s6, s5, 7
-; GFX11-NEXT: s_cmp_gt_i32 s6, 5
-; GFX11-NEXT: s_cselect_b32 s7, 1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s6, 3
-; GFX11-NEXT: s_cselect_b32 s6, 1, 0
-; GFX11-NEXT: s_lshr_b32 s5, s5, 2
-; GFX11-NEXT: s_or_b32 s6, s6, s7
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_add_i32 s5, s5, s6
+; GFX11-NEXT: v_or_b32_e32 v2, 0x1000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, v1, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, v1, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT: v_or_b32_e32 v2, s3, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-NEXT: s_cmp_lt_i32 s2, 31
-; GFX11-NEXT: s_movk_i32 s6, 0x7e00
-; GFX11-NEXT: s_cselect_b32 s5, s5, 0x7c00
-; GFX11-NEXT: s_cmp_lg_u32 s3, 0
-; GFX11-NEXT: s_cselect_b32 s3, s6, 0x7c00
+; GFX11-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
+; GFX11-NEXT: v_and_b32_e32 v2, 7, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 2, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x40f
-; GFX11-NEXT: s_cselect_b32 s2, s3, s5
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX11-NEXT: v_dual_mov_b32 v2, 0x7e00 :: v_dual_add_nc_u32 v1, v1, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7c00, v1, vcc_lo
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v2, vcc_lo
+; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_cndmask_b32 v0, v1, v0 :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, s4
; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
->>>>>>> 41d8a9928050 (16bit sgpr folding)
%mag.trunc = fptrunc double %mag to half
%result = call half @llvm.copysign.f16(half %mag.trunc, half %sign)
store half %result, ptr addrspace(1) %arg_out
diff --git a/llvm/test/CodeGen/AMDGPU/true16-fold.mir b/llvm/test/CodeGen/AMDGPU/true16-fold.mir
new file mode 100644
index 0000000000000..ef6e4007b8f7a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/true16-fold.mir
@@ -0,0 +1,60 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass si-fold-operands -mattr=+real-true16 -o - %s | FileCheck %s
+
+---
+name: fold_16bit_subreg_1
+tracksRegLiveness: true
+registers:
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: fold_16bit_subreg_1
+ ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[V_CMP_EQ_F16_t16_e64_:%[0-9]+]]:sreg_32 = nofpexcept V_CMP_EQ_F16_t16_e64 0, killed [[DEF1]], 2, [[DEF]].sub1, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_EQ_F16_t16_e64_]]
+ %0:sreg_64_xexec = IMPLICIT_DEF
+ %1:sgpr_lo16 = COPY %0.sub1_lo16:sreg_64_xexec
+ %2:vgpr_16 = COPY %1:sgpr_lo16
+ %3:vgpr_16 = IMPLICIT_DEF
+ %4:sreg_32 = nofpexcept V_CMP_EQ_F16_t16_e64 0, killed %3:vgpr_16, 2, killed %2:vgpr_16, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: fold_16bit_subreg_0
+tracksRegLiveness: true
+registers:
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: fold_16bit_subreg_0
+ ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[V_CMP_EQ_F16_t16_e64_:%[0-9]+]]:sreg_32 = nofpexcept V_CMP_EQ_F16_t16_e64 0, killed [[DEF1]], 2, [[DEF]].sub0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_EQ_F16_t16_e64_]]
+ %0:sreg_64_xexec = IMPLICIT_DEF
+ %1:sgpr_lo16 = COPY %0.lo16:sreg_64_xexec
+ %2:vgpr_16 = COPY %1:sgpr_lo16
+ %3:vgpr_16 = IMPLICIT_DEF
+ %4:sreg_32 = nofpexcept V_CMP_EQ_F16_t16_e64 0, killed %3:vgpr_16, 2, killed %2:vgpr_16, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0, implicit %4
+...
+
+---
+name: sgpr_lo16
+tracksRegLiveness: true
+registers:
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: sgpr_lo16
+ ; CHECK: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[V_ALIGNBIT_B32_t16_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_t16_e64 0, [[DEF]], 0, killed [[DEF1]], 0, 30, 0, 0, implicit $exec
+ ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_ALIGNBIT_B32_t16_e64_]]
+ %0:sreg_32 = IMPLICIT_DEF
+ %1:sreg_32 = IMPLICIT_DEF
+ %2:sreg_32 = S_MOV_B32 30
+ %3:sgpr_lo16 = COPY %2.lo16:sreg_32
+ %4:vgpr_16 = COPY %3:sgpr_lo16
+ %5:vgpr_32 = V_ALIGNBIT_B32_t16_e64 0, %0:sreg_32, 0, killed %1:sreg_32, 0, killed %4:vgpr_16, 0, 0, implicit $exec
+ S_ENDPGM 0, implicit %5
+...
More information about the llvm-commits
mailing list