[llvm] [AMDGPU][True16] added Pre-RA hint to improve copy elimination (PR #103366)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Aug 13 10:49:32 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Brox Chen (broxigarchen)
<details>
<summary>Changes</summary>
---
Patch is 27.39 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/103366.diff
7 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp (+43)
- (modified) llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp (+67)
- (modified) llvm/lib/Target/AMDGPU/SIRegisterInfo.h (+12)
- (modified) llvm/test/CodeGen/AMDGPU/fadd.f16.ll (+1-3)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll (+2-5)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll (+2-5)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll (+54-105)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index 4467836cffc56..c7f3fb872c1a0 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -22,11 +22,21 @@
/// although the same shall be possible with other register classes and
/// instructions if necessary.
///
+/// This pass also adds register allocation hints to COPY.
+/// The hints will be post-processed by SIRegisterInfo::getRegAllocationHints.
+/// When using True16, we often see COPY moving a 16-bit value between a VGPR_32
+/// This pass also adds register allocation hints to COPY.
+/// The hints will be post-processed by SIRegisterInfo::getRegAllocationHints.
+/// When using True16, we often see COPY moving a 16-bit value between a VGPR_32
+/// and a VGPR_16. If we use the VGPR_16 that corresponds to the lo16 bits of
+/// the VGPR_32, the COPY can be completely eliminated.
+///
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIRegisterInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"
@@ -236,5 +246,38 @@ bool GCNPreRAOptimizations::runOnMachineFunction(MachineFunction &MF) {
Changed |= processReg(Reg);
}
+ if (!ST.useRealTrue16Insts())
+ return Changed;
+
+ // Add RA hints to improve True16 COPY elimination.
+ for (const MachineBasicBlock &MBB : MF) {
+ for (const MachineInstr &MI : MBB) {
+ if (MI.getOpcode() != AMDGPU::COPY)
+ continue;
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+ if (Dst.isVirtual() &&
+ MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
+ Src.isPhysical() &&
+ TRI->getRegClassForReg(*MRI, Src) == &AMDGPU::VGPR_32RegClass)
+ MRI->setRegAllocationHint(Dst, 0, TRI->getSubReg(Src, AMDGPU::lo16));
+ if (Src.isVirtual() &&
+ MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass &&
+ Dst.isPhysical() &&
+ TRI->getRegClassForReg(*MRI, Dst) == &AMDGPU::VGPR_32RegClass)
+ MRI->setRegAllocationHint(Src, 0, TRI->getSubReg(Dst, AMDGPU::lo16));
+ if (!Dst.isVirtual() || !Src.isVirtual())
+ continue;
+ if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_32RegClass &&
+ MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass) {
+ MRI->setRegAllocationHint(Dst, AMDGPURI::Size32, Src);
+ MRI->setRegAllocationHint(Src, AMDGPURI::Size16, Dst);
+ }
+ if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
+ MRI->getRegClass(Src) == &AMDGPU::VGPR_32RegClass)
+ MRI->setRegAllocationHint(Dst, AMDGPURI::Size16, Src);
+ }
+ }
+
return Changed;
}
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index ee72837a50fc4..4e5fa782345cc 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -3327,6 +3327,73 @@ const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit);
}
+bool SIRegisterInfo::getRegAllocationHints(Register VirtReg,
+ ArrayRef<MCPhysReg> Order,
+ SmallVectorImpl<MCPhysReg> &Hints,
+ const MachineFunction &MF,
+ const VirtRegMap *VRM,
+ const LiveRegMatrix *Matrix) const {
+
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+ std::pair<unsigned, Register> Hint = MRI.getRegAllocationHint(VirtReg);
+
+ switch (Hint.first) {
+ case AMDGPURI::Size32: {
+ Register Paired = Hint.second;
+ assert(Paired);
+ Register PairedPhys;
+ if (Paired.isPhysical()) {
+ PairedPhys =
+ getMatchingSuperReg(Paired, AMDGPU::lo16, &AMDGPU::VGPR_32RegClass);
+ } else if (VRM && VRM->hasPhys(Paired)) {
+ PairedPhys = getMatchingSuperReg(VRM->getPhys(Paired), AMDGPU::lo16,
+ &AMDGPU::VGPR_32RegClass);
+ }
+
+ // Prefer the paired physreg.
+ if (PairedPhys)
+ // isLo(Paired) is implicitly true here from the API of
+ // getMatchingSuperReg.
+ Hints.push_back(PairedPhys);
+ return false;
+ }
+ case AMDGPURI::Size16: {
+ Register Paired = Hint.second;
+ assert(Paired);
+ Register PairedPhys;
+ if (Paired.isPhysical()) {
+ PairedPhys = TRI->getSubReg(Paired, AMDGPU::lo16);
+ } else if (VRM && VRM->hasPhys(Paired)) {
+ PairedPhys = TRI->getSubReg(VRM->getPhys(Paired), AMDGPU::lo16);
+ }
+
+ // First prefer the paired physreg.
+ if (PairedPhys)
+ Hints.push_back(PairedPhys);
+ else {
+ // Add all the lo16 physregs.
+ // When the Paired operand has not yet been assigned a physreg it is
+ // better to try putting VirtReg in a lo16 register, because possibly
+ // later Paired can be assigned to the overlapping register and the COPY
+ // can be eliminated.
+ for (MCPhysReg PhysReg : Order) {
+ if (PhysReg == PairedPhys || AMDGPU::isHi(PhysReg, *this))
+ continue;
+ if (AMDGPU::VGPR_16RegClass.contains(PhysReg) &&
+ !MRI.isReserved(PhysReg))
+ Hints.push_back(PhysReg);
+ }
+ }
+ return false;
+ }
+ default:
+ return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
+ VRM);
+ }
+}
+
MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
// Not a callee saved register.
return AMDGPU::SGPR30_SGPR31;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 88d5686720985..622e86d03048a 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -29,6 +29,13 @@ class LiveRegUnits;
class RegisterBank;
struct SGPRSpillBuilder;
+/// Register allocation hint types. Helps eliminate unneeded COPY with True16
+namespace AMDGPURI {
+
+enum { Size16 = 1, Size32 = 2 };
+
+} // end namespace AMDGPURI
+
class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
private:
const GCNSubtarget &ST;
@@ -326,6 +333,11 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
unsigned getRegPressureSetLimit(const MachineFunction &MF,
unsigned Idx) const override;
+ bool getRegAllocationHints(Register VirtReg, ArrayRef<MCPhysReg> Order,
+ SmallVectorImpl<MCPhysReg> &Hints,
+ const MachineFunction &MF, const VirtRegMap *VRM,
+ const LiveRegMatrix *Matrix) const override;
+
const int *getRegUnitPressureSets(unsigned RegUnit) const override;
MCRegister getReturnAddressReg(const MachineFunction &MF) const;
diff --git a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
index 1094b768f1bd1..64329c657b814 100644
--- a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
@@ -76,9 +76,7 @@ define amdgpu_kernel void @fadd_f16(
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[8:11], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
index 4faa482ede59f..24412cff02ff2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
@@ -164,12 +164,9 @@ define amdgpu_kernel void @ceil_v2f16(
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_ceil_f16_e32 v0.l, v0.l
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_ceil_f16_e32 v0.h, v1.l
-; GFX11-NEXT: v_mov_b16_e32 v1.l, v0.l
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX11-NEXT: v_ceil_f16_e32 v1.l, v1.l
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll
index 61f6c9f7f0e6f..c9d05c886d99b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll
@@ -165,12 +165,9 @@ define amdgpu_kernel void @floor_v2f16(
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_floor_f16_e32 v0.l, v0.l
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_floor_f16_e32 v0.h, v1.l
-; GFX11-NEXT: v_mov_b16_e32 v1.l, v0.l
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX11-NEXT: v_floor_f16_e32 v1.l, v1.l
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
index 72e86f1f6f999..4c3071be3d28f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
@@ -217,9 +217,8 @@ define half @test_ldexp_f16_i8(half %a, i8 %b) {
; GFX11-SDAG-TRUE16: ; %bb.0:
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v1.l
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_ldexp_f16_i8:
@@ -307,9 +306,7 @@ define half @test_ldexp_f16_i16(half %a, i16 %b) {
; GFX11-SDAG-TRUE16-LABEL: test_ldexp_f16_i16:
; GFX11-SDAG-TRUE16: ; %bb.0:
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v1.l
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_ldexp_f16_i16:
@@ -477,14 +474,11 @@ define <2 x half> @test_ldexp_v2f16_v2i32(<2 x half> %a, <2 x i32> %b) {
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v2, v2, s0, 0x7fff
; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v1, v1, s0, 0x7fff
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v3.l, v2.l
-; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v1.l
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v3.l, v2.l
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v2f16_v2i32:
@@ -546,13 +540,10 @@ define <2 x half> @test_ldexp_v2f16_v2i32(<2 x half> %a, <2 x i32> %b) {
; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v2, 0xffff8000, v2, v3
; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v1.l
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v4.l, v2.l
-; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v4.l, v2.l
+; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX11-GISEL-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-FAKE16-LABEL: test_ldexp_v2f16_v2i32:
@@ -610,12 +601,9 @@ define <2 x half> @test_ldexp_v2f16_v2i16(<2 x half> %a, <2 x i16> %b) {
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v1.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v3.l, v2.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v3.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v2f16_v2i16:
@@ -665,13 +653,10 @@ define <2 x half> @test_ldexp_v2f16_v2i16(<2 x half> %a, <2 x i16> %b) {
; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v1.l
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v2.l, v3.l
-; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v2.l, v3.l
+; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX11-GISEL-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-FAKE16-LABEL: test_ldexp_v2f16_v2i16:
@@ -740,16 +725,13 @@ define <3 x half> @test_ldexp_v3f16_v3i32(<3 x half> %a, <3 x i32> %b) {
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v3, v3, s0, 0x7fff
; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v2, v2, s0, 0x7fff
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v5.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l
-; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v2, v4, s0, 0x7fff
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v2.l
-; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v3, v0
+; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v2.l, v5.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v3, v4, s0, 0x7fff
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0, v2
+; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v3.l
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v3f16_v3i32:
@@ -820,15 +802,12 @@ define <3 x half> @test_ldexp_v3f16_v3i32(<3 x half> %a, <3 x i32> %b) {
; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v3, 0xffff8000, v3, v5
; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v6.l, v3.l
+; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v2.l, v6.l, v3.l
; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v3, 0xffff8000, v4, v5
-; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v3.l
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX11-GISEL-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v2
+; GFX11-GISEL-TRUE16-NEXT: v_lshl_or_b32 v0, v2, 16, v0
; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-FAKE16-LABEL: test_ldexp_v3f16_v3i32:
@@ -895,12 +874,9 @@ define <3 x half> @test_ldexp_v3f16_v3i16(<3 x half> %a, <3 x i16> %b) {
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v5.l, v4.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v2.l, v5.l, v4.l
+; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0, v2
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v3f16_v3i16:
@@ -958,13 +934,10 @@ define <3 x half> @test_ldexp_v3f16_v3i16(<3 x half> %a, <3 x i16> %b) {
; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l
; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v3.l
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v4.l, v5.l
-; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v2.l, v4.l, v5.l
+; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-TRUE16-NEXT: ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/103366
More information about the llvm-commits
mailing list