[llvm] [AMDGPU][True16][CodeGen] workaround sgpr folding in true16 (PR #128929)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 14 17:56:30 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Brox Chen (broxigarchen)
<details>
<summary>Changes</summary>
We haven't implemented 16-bit SGPRs yet. This patch allows 32-bit SGPRs to be folded into True16 instructions that take 16-bit values. This improves generated code quality.
---
Patch is 129.68 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/128929.diff
26 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIFoldOperands.cpp (+85-4)
- (modified) llvm/lib/Target/AMDGPU/VOP1Instructions.td (+1)
- (modified) llvm/test/CodeGen/AMDGPU/bf16.ll (+29-38)
- (modified) llvm/test/CodeGen/AMDGPU/bswap.ll (+99-46)
- (modified) llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll (+27-57)
- (modified) llvm/test/CodeGen/AMDGPU/fabs.f16.ll (+2-4)
- (modified) llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll (+1-3)
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (+128-255)
- (modified) llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll (+4-8)
- (modified) llvm/test/CodeGen/AMDGPU/fp-classify.ll (+39-78)
- (modified) llvm/test/CodeGen/AMDGPU/fpext.f16.ll (+1-3)
- (modified) llvm/test/CodeGen/AMDGPU/fptosi.f16.ll (+3-4)
- (modified) llvm/test/CodeGen/AMDGPU/fptoui.f16.ll (+3-4)
- (modified) llvm/test/CodeGen/AMDGPU/icmp.i16.ll (+10-12)
- (modified) llvm/test/CodeGen/AMDGPU/imm16.ll (+14-42)
- (modified) llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll (+142-271)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll (+6-7)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll (+12-19)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll (+22-48)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll (+13-21)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll (+24-53)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll (+12-36)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll (+8-12)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll (+8-12)
- (modified) llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll (+4-8)
- (modified) llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll (+2-5)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 91df516b80857..049228edc7ba8 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -12,8 +12,11 @@
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineOperand.h"
@@ -576,6 +579,11 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
}
MachineOperand *New = Fold.OpToFold;
+ // TODO: Temporarily allow folding from SGPRs to 16-bit VGPRs.
+ // Rework once the VS_16 register class is updated to include proper
+ // 16-bit SGPRs instead of 32-bit ones.
+ if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg()))
+ Old.setSubReg(AMDGPU::NoSubRegister);
Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
Old.setIsUndef(New->isUndef());
return true;
@@ -947,9 +955,15 @@ void SIFoldOperandsImpl::foldOperand(
return;
// FIXME: Fold operands with subregs.
- if (UseOp->isReg() && OpToFold.isReg() &&
- (UseOp->isImplicit() || UseOp->getSubReg() != AMDGPU::NoSubRegister))
- return;
+ if (UseOp->isReg() && OpToFold.isReg()) {
+ if (UseOp->isImplicit())
+ return;
+ // Allow folding from SGPRs to 16-bit VGPRs.
+ if (UseOp->getSubReg() != AMDGPU::NoSubRegister &&
+ (UseOp->getSubReg() != AMDGPU::lo16 ||
+ !TRI->isSGPRReg(*MRI, OpToFold.getReg())))
+ return;
+ }
// Special case for REG_SEQUENCE: We can't fold literals into
// REG_SEQUENCE instructions, so we have to fold them into the
@@ -1040,6 +1054,14 @@ void SIFoldOperandsImpl::foldOperand(
}
}
+ // Allow immediates COPYd into sgpr_lo16 to be further folded while
+ // still being legal if not further folded
+ if (DestRC == &AMDGPU::SGPR_LO16RegClass) {
+ assert(ST->useRealTrue16Insts());
+ MRI->setRegClass(DestReg, &AMDGPU::SGPR_32RegClass);
+ DestRC = &AMDGPU::SGPR_32RegClass;
+ }
+
// In order to fold immediates into copies, we need to change the
// copy to a MOV.
@@ -1073,9 +1095,43 @@ void SIFoldOperandsImpl::foldOperand(
UseMI->getOperand(0).getReg().isVirtual() &&
!UseMI->getOperand(1).getSubReg()) {
LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI);
+ unsigned Size = TII->getOpSize(*UseMI, 1);
Register UseReg = OpToFold.getReg();
UseMI->getOperand(1).setReg(UseReg);
- UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
+ unsigned SubRegIdx = OpToFold.getSubReg();
+ // Hack to allow 32-bit SGPRs to be folded into True16 instructions
+ // Remove this if 16-bit SGPRs (i.e. SGPR_LO16) are added to the
+ // VS_16RegClass
+ //
+ // Excerpt from AMDGPUGenRegisterInfo.inc
+ // NoSubRegister, //0
+ // hi16, // 1
+ // lo16, // 2
+ // sub0, // 3
+ // ...
+ // sub1, // 11
+ // sub1_hi16, // 12
+ // sub1_lo16, // 13
+ static_assert(AMDGPU::sub1_hi16 == 12, "Subregister layout has changed");
+ if (Size == 2 && TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
+ TRI->isSGPRReg(*MRI, UseReg)) {
+ // Produce the 32 bit subregister index to which the 16-bit subregister
+ // is aligned.
+ if (SubRegIdx > AMDGPU::sub1) {
+ LaneBitmask M = TRI->getSubRegIndexLaneMask(SubRegIdx);
+ M |= M.getLane(M.getHighestLane() - 1);
+ SmallVector<unsigned, 4> Indexes;
+ TRI->getCoveringSubRegIndexes(TRI->getRegClassForReg(*MRI, UseReg), M,
+ Indexes);
+ assert(Indexes.size() == 1 && "Expected one 32-bit subreg to cover");
+ SubRegIdx = Indexes[0];
+ // 32-bit registers do not have a sub0 index
+ } else if (TII->getOpSize(*UseMI, 1) == 4)
+ SubRegIdx = 0;
+ else
+ SubRegIdx = AMDGPU::sub0;
+ }
+ UseMI->getOperand(1).setSubReg(SubRegIdx);
UseMI->getOperand(1).setIsKill(false);
CopiesToReplace.push_back(UseMI);
OpToFold.setIsKill(false);
@@ -1713,6 +1769,31 @@ bool SIFoldOperandsImpl::tryFoldFoldableCopy(
if (OpToFold.isReg() && !OpToFold.getReg().isVirtual())
return false;
+ // True16: Fix malformed 16-bit sgpr COPY produced by peephole-opt
+ // Can remove this code if proper 16-bit SGPRs are implemented
+ // Example: Pre-peephole-opt
+ // %29:sgpr_lo16 = COPY %16.lo16:sreg_32
+ // %32:sreg_32 = COPY %29:sgpr_lo16
+ // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
+ // Post-peephole-opt and DCE
+ // %32:sreg_32 = COPY %16.lo16:sreg_32
+ // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
+ // After this transform
+ // %32:sreg_32 = COPY %16:sreg_32
+ // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
+ // After the fold operands pass
+ // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %16:sreg_32
+ if (MI.getOpcode() == AMDGPU::COPY && OpToFold.isReg() &&
+ OpToFold.getSubReg()) {
+ const TargetRegisterClass *DstRC =
+ MRI->getRegClass(MI.getOperand(0).getReg());
+ if (DstRC == &AMDGPU::SReg_32RegClass &&
+ DstRC == MRI->getRegClass(OpToFold.getReg())) {
+ assert(OpToFold.getSubReg() == AMDGPU::lo16);
+ OpToFold.setSubReg(0);
+ }
+ }
+
// Prevent folding operands backwards in the function. For example,
// the COPY opcode must not be replaced by 1 in this example:
//
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index def06c1e9a0d7..db5b1e5c9a035 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -776,6 +776,7 @@ let SubtargetPredicate = isGFX11Plus in {
// Restrict src0 to be VGPR
def V_PERMLANE64_B32 : VOP1_Pseudo<"v_permlane64_b32", VOP_MOVRELS,
[], /*VOP1Only=*/ 1>;
+ let isAsCheapAsAMove = 1 in
defm V_MOV_B16 : VOP1Inst_t16<"v_mov_b16", VOP_I16_I16>;
defm V_NOT_B16 : VOP1Inst_t16<"v_not_b16", VOP_I16_I16>;
defm V_CVT_I32_I16 : VOP1Inst_t16<"v_cvt_i32_i16", VOP_I32_I16>;
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index efcaa8807367b..ec7d7379a4233 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -38259,16 +38259,14 @@ define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
; GFX11TRUE16-LABEL: s_select_v2bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_lshr_b32 s2, s0, 16
-; GFX11TRUE16-NEXT: s_lshr_b32 s3, s1, 16
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s3
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s2
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s1
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, vcc_lo
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v1.h, vcc_lo
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s2
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s0
+; GFX11TRUE16-NEXT: s_lshr_b32 s0, s1, 16
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, s0, v1.l, vcc_lo
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, s1, v0.l, vcc_lo
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11TRUE16-NEXT: ; return to shader part epilog
;
@@ -38376,19 +38374,17 @@ define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
;
; GFX11TRUE16-LABEL: s_vselect_v2bf16:
; GFX11TRUE16: ; %bb.0:
-; GFX11TRUE16-NEXT: s_lshr_b32 s3, s1, 16
-; GFX11TRUE16-NEXT: s_lshr_b32 s4, s0, 16
+; GFX11TRUE16-NEXT: s_lshr_b32 s3, s0, 16
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 0, v1
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s3
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s4
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s1
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s2
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v1.h, vcc_lo
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s0
+; GFX11TRUE16-NEXT: s_lshr_b32 s0, s1, 16
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, s0, v0.l, s2
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, s1, v0.h, vcc_lo
+; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v1
; GFX11TRUE16-NEXT: ; return to shader part epilog
;
; GFX11FAKE16-LABEL: s_vselect_v2bf16:
@@ -40095,30 +40091,25 @@ define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat>
;
; GFX11TRUE16-LABEL: s_vselect_v4bf16:
; GFX11TRUE16: ; %bb.0:
-; GFX11TRUE16-NEXT: s_lshr_b32 s7, s3, 16
+; GFX11TRUE16-NEXT: s_lshr_b32 s7, s1, 16
+; GFX11TRUE16-NEXT: s_lshr_b32 s9, s0, 16
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 0, v1
-; GFX11TRUE16-NEXT: s_lshr_b32 s8, s1, 16
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s7
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s3
-; GFX11TRUE16-NEXT: s_lshr_b32 s3, s2, 16
-; GFX11TRUE16-NEXT: s_lshr_b32 s7, s0, 16
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 0, v2
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 0, v3
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s8
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s3
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, s7
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, s2
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, s0
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.h, s1
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s6
-; GFX11TRUE16-NEXT: v_cndmask_b16 v4.h, v1.h, v2.l, s4
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v4.l, v2.h, v3.l, vcc_lo
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v3.h, s5
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s7
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s9
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s1
+; GFX11TRUE16-NEXT: s_lshr_b32 s8, s3, 16
+; GFX11TRUE16-NEXT: s_lshr_b32 s0, s2, 16
+; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, s8, v0.l, s6
+; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, s0, v0.h, s4
+; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, s2, v1.l, vcc_lo
+; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, s3, v1.h, s5
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v4
-; GFX11TRUE16-NEXT: v_readfirstlane_b32 s1, v0
+; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11TRUE16-NEXT: v_readfirstlane_b32 s1, v2
; GFX11TRUE16-NEXT: ; return to shader part epilog
;
; GFX11FAKE16-LABEL: s_vselect_v4bf16:
diff --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll
index a95a1aba0c914..e70cd2400172d 100644
--- a/llvm/test/CodeGen/AMDGPU/bswap.ll
+++ b/llvm/test/CodeGen/AMDGPU/bswap.ll
@@ -303,18 +303,32 @@ define amdgpu_kernel void @test_bswap_i64(ptr addrspace(1) %out, ptr addrspace(1
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
-; GFX11-LABEL: test_bswap_i64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_perm_b32 v1, 0, s4, 0x10203
-; GFX11-NEXT: v_perm_b32 v0, 0, s5, 0x10203
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX11-NEXT: s_endpgm
+; GFX11-REAL16-LABEL: test_bswap_i64:
+; GFX11-REAL16: ; %bb.0:
+; GFX11-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-REAL16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-REAL16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-REAL16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-REAL16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-REAL16-NEXT: v_perm_b32 v0, 0, s2, 0x10203
+; GFX11-REAL16-NEXT: s_mov_b32 s2, -1
+; GFX11-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-REAL16-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-REAL16-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-REAL16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: test_bswap_i64:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, 0, s4, 0x10203
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, 0, s5, 0x10203
+; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-FAKE16-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %in, align 8
%bswap = call i64 @llvm.bswap.i64(i64 %val) nounwind readnone
store i64 %bswap, ptr addrspace(1) %out, align 8
@@ -364,20 +378,36 @@ define amdgpu_kernel void @test_bswap_v2i64(ptr addrspace(1) %out, ptr addrspace
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
-; GFX11-LABEL: test_bswap_v2i64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_perm_b32 v3, 0, s6, 0x10203
-; GFX11-NEXT: v_perm_b32 v2, 0, s7, 0x10203
-; GFX11-NEXT: v_perm_b32 v1, 0, s4, 0x10203
-; GFX11-NEXT: v_perm_b32 v0, 0, s5, 0x10203
-; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
-; GFX11-NEXT: s_endpgm
+; GFX11-REAL16-LABEL: test_bswap_v2i64:
+; GFX11-REAL16: ; %bb.0:
+; GFX11-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-REAL16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-REAL16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX11-REAL16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-REAL16-NEXT: s_mov_b32 s2, -1
+; GFX11-REAL16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-REAL16-NEXT: v_perm_b32 v0, 0, s4, 0x10203
+; GFX11-REAL16-NEXT: v_perm_b32 v2, 0, s6, 0x10203
+; GFX11-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-REAL16-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-REAL16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-REAL16-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-REAL16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: test_bswap_v2i64:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, 0, s6, 0x10203
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, 0, s7, 0x10203
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, 0, s4, 0x10203
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, 0, s5, 0x10203
+; GFX11-FAKE16-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-FAKE16-NEXT: s_endpgm
%val = load <2 x i64>, ptr addrspace(1) %in, align 16
%bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %val) nounwind readnone
store <2 x i64> %bswap, ptr addrspace(1) %out, align 16
@@ -445,26 +475,49 @@ define amdgpu_kernel void @test_bswap_v4i64(ptr addrspace(1) %out, ptr addrspace
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
; VI-NEXT: s_endpgm
;
-; GFX11-LABEL: test_bswap_v4i64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[8:11], s[4:5], 0x24
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_perm_b32 v7, 0, s6, 0x10203
-; GFX11-NEXT: v_perm_b32 v6, 0, s7, 0x10203
-; GFX11-NEXT: v_perm_b32 v5, 0, s4, 0x10203
-; GFX11-NEXT: v_perm_b32 v4, 0, s5, 0x10203
-; GFX11-NEXT: v_perm_b32 v3, 0, s2, 0x10203
-; GFX11-NEXT: v_perm_b32 v2, 0, s3, 0x10203
-; GFX11-NEXT: v_perm_b32 v1, 0, s0, 0x10203
-; GFX11-NEXT: v_perm_b32 v0, 0, s1, 0x10203
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[8:11], 0 offset:16
-; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[8:11], 0
-; GFX11-NEXT: s_endpgm
+; GFX11-REAL16-LABEL: test_bswap_v4i64:
+; GFX11-REAL16: ; %bb.0:
+; GFX11-REAL16-NEXT: s_load_b128 s[8:11], s[4:5], 0x24
+; GFX11-REAL16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-REAL16-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
+; GFX11-REAL16-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-REAL16-NEXT: s_mov_b32 s10, -1
+; GFX11-REAL16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-REAL16-NEXT: v_perm_b32 v0, 0, s4, 0x10203
+; GFX11-REAL16-NEXT: v_perm_b32 v2, 0, s6, 0x10203
+; GFX11-REAL16-NEXT: v_perm_b32 v4, 0, s0, 0x10203
+; GFX11-REAL16-NEXT: v_perm_b32 v6, 0, s2, 0x10203
+; GFX11-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-REAL16-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-REAL16-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-REAL16-NEXT: v_mov_b32_e32 v5, v4
+; GFX11-REAL16-NEXT: v_mov_b32_e32 v7, v6
+; GFX11-REAL16-NEXT: s_clause 0x1
+; GFX11-REAL16-NEXT: buffer_store_b128 v[0:3], off, s[8:11], 0 offset:16
+; GFX11-REAL16-NEXT: buffer_store_b128 v[4:7], off, s[8:11], 0
+; GFX11-REAL16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: test_bswap_v4i64:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[8:11], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, 0, s6, 0x10203
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, 0, s7, 0x10203
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, 0, s4, 0x10203
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, 0, s5, 0x10203
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, 0, s2, 0x10203
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, 0, s3, 0x10203
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, 0, s0, 0x10203
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, 0, s1, 0x10203
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: buffer_store_b128 v[4:7], off, s[8:11], 0 offset:16
+; GFX11-FAKE16-NEXT: buffer_store_b128 v[0:3], off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_endpgm
%val = load <4 x i64>, ptr addrspace(1) %in, align 32
%bswap = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %val) nounwind readnone
store <4 x i64> %bswap, ptr addrspace(1) %out, align 32
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
index 14ddf7daad1c6..03cb3b28480c4 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
@@ -216,34 +216,19 @@ define amdgpu_kernel void @extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:2
; VI-NEXT: s_endpgm
;
-; GFX11-TRUE16-LA...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/128929
More information about the llvm-commits
mailing list