[llvm] [AMDGPU] Adding multiple use analysis to SIPeepholeSDWA (PR #94800)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Jun 7 13:23:57 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-globalisel
Author: Brian Favela (bfavela)
<details>
<summary>Changes</summary>
Allow for multiple uses of an operand where each instruction can be promoted to SDWA.
For instance:
; v_and_b32 v2, lit(0x0000ffff), v2
; v_and_b32 v3, 6, v2
; v_and_b32 v2, 1, v2
Can be folded to:
; v_and_b32 v3, 6, sel_lo(v2)
; v_and_b32 v2, 1, sel_lo(v2)
---
Patch is 188.70 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/94800.diff
19 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp (+59-15)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll (+14-13)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll (+154-154)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll (+178-181)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll (+26-29)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll (+155-161)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll (+154-160)
- (modified) llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll (+19-17)
- (modified) llvm/test/CodeGen/AMDGPU/fract-match.ll (+12-13)
- (modified) llvm/test/CodeGen/AMDGPU/fshr.ll (+45-47)
- (modified) llvm/test/CodeGen/AMDGPU/idiv-licm.ll (+42-44)
- (modified) llvm/test/CodeGen/AMDGPU/idot4u.ll (+13-18)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll (+14-11)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.frexp.ll (+19-23)
- (modified) llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll (+12-13)
- (modified) llvm/test/CodeGen/AMDGPU/permute_i8.ll (+255-290)
- (modified) llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll (+2-3)
- (modified) llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll (+96-9)
- (modified) llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll (+3-3)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 1fadd8ce45b1f..082aeeea2c7cc 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -37,20 +37,24 @@ STATISTIC(NumSDWAInstructionsPeepholed,
namespace {
+bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST,
+ const SIInstrInfo *TII);
class SDWAOperand;
class SDWADstOperand;
-class SIPeepholeSDWA : public MachineFunctionPass {
-public:
- using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;
+using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;
+// helper typedef to make code cleaner
+typedef MapVector<MachineInstr *, SDWAOperandsVector> SDWAOperandsMap;
+
+class SIPeepholeSDWA : public MachineFunctionPass {
private:
MachineRegisterInfo *MRI;
const SIRegisterInfo *TRI;
const SIInstrInfo *TII;
MapVector<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
- MapVector<MachineInstr *, SDWAOperandsVector> PotentialMatches;
+ SDWAOperandsMap PotentialMatches;
SmallVector<MachineInstr *, 8> ConvertedInstructions;
std::optional<int64_t> foldToImm(const MachineOperand &Op) const;
@@ -65,7 +69,6 @@ class SIPeepholeSDWA : public MachineFunctionPass {
bool runOnMachineFunction(MachineFunction &MF) override;
void matchSDWAOperands(MachineBasicBlock &MBB);
std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
- bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST) const;
void pseudoOpConvertToVOP2(MachineInstr &MI,
const GCNSubtarget &ST) const;
bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
@@ -93,7 +96,9 @@ class SDWAOperand {
virtual ~SDWAOperand() = default;
- virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0;
+ virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII,
+ const GCNSubtarget &ST,
+ SDWAOperandsMap *PotentialMatches = nullptr) = 0;
virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;
MachineOperand *getTargetOperand() const { return Target; }
@@ -126,7 +131,9 @@ class SDWASrcOperand : public SDWAOperand {
: SDWAOperand(TargetOp, ReplacedOp),
SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {}
- MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
+ MachineInstr *potentialToConvert(const SIInstrInfo *TII,
+ const GCNSubtarget &ST,
+ SDWAOperandsMap *PotentialMatches = nullptr) override;
bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
SdwaSel getSrcSel() const { return SrcSel; }
@@ -153,7 +160,9 @@ class SDWADstOperand : public SDWAOperand {
SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
: SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}
- MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
+ MachineInstr *potentialToConvert(const SIInstrInfo *TII,
+ const GCNSubtarget &ST,
+ SDWAOperandsMap *PotentialMatches = nullptr) override;
bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
SdwaSel getDstSel() const { return DstSel; }
@@ -327,7 +336,37 @@ uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
return Mods;
}
-MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) {
+MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
+ const GCNSubtarget &ST,
+ SDWAOperandsMap *PotentialMatches) {
+ // If PotentialMatches is not null, then fill out the map for all uses,
+ // if all can be converted
+ if (PotentialMatches != nullptr) {
+ MachineOperand *Reg = getReplacedOperand();
+ if (!Reg->isReg() || !Reg->isDef()) {
+ return nullptr;
+ }
+
+ for (MachineOperand &UseMO : getMRI()->use_nodbg_operands(Reg->getReg())) {
+ // If there exists a use of a subreg of Reg then return nullptr
+ if (!isSameReg(UseMO, *Reg))
+ return nullptr;
+
+ // Check that all instructions that use Reg can be converted
+ if (!isConvertibleToSDWA(*(UseMO.getParent()), ST, TII)) {
+ return nullptr;
+ }
+ }
+ // Now that it's guaranteed all uses are legal, iterate over the uses again
+ // to add them for later conversion.
+ for (MachineOperand &UseMO : getMRI()->use_nodbg_operands(Reg->getReg())) {
+ SDWAOperandsMap& potentialMatchesMap = *PotentialMatches;
+ MachineInstr* UseMI = UseMO.getParent();
+ potentialMatchesMap[UseMI].push_back(this);
+ }
+ return nullptr;
+ }
+
// For SDWA src operand potential instruction is one that use register
// defined by parent instruction
MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI());
@@ -420,7 +459,9 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
return true;
}
-MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) {
+MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII,
+ const GCNSubtarget &ST,
+ SDWAOperandsMap *PotentialMatches) {
// For SDWA dst operand potential instruction is one that defines register
// that this operand uses
MachineRegisterInfo *MRI = getMRI();
@@ -919,8 +960,10 @@ void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
MISucc.substituteRegister(CarryIn->getReg(), TRI->getVCC(), 0, *TRI);
}
-bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI,
- const GCNSubtarget &ST) const {
+namespace {
+bool isConvertibleToSDWA(MachineInstr &MI,
+ const GCNSubtarget &ST,
+ const SIInstrInfo* TII) {
// Check if this is already an SDWA instruction
unsigned Opc = MI.getOpcode();
if (TII->isSDWA(Opc))
@@ -980,6 +1023,7 @@ bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI,
return true;
}
+} // namespace
bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
const SDWAOperandsVector &SDWAOperands) {
@@ -1215,7 +1259,7 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
matchSDWAOperands(MBB);
for (const auto &OperandPair : SDWAOperands) {
const auto &Operand = OperandPair.second;
- MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
+ MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST);
if (PotentialMI &&
(PotentialMI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
PotentialMI->getOpcode() == AMDGPU::V_SUB_CO_U32_e64))
@@ -1228,8 +1272,8 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
for (const auto &OperandPair : SDWAOperands) {
const auto &Operand = OperandPair.second;
- MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
- if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) {
+ MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST, &PotentialMatches);
+ if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST, TII)) {
PotentialMatches[PotentialMI].push_back(Operand.get());
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
index 02781e763f44a..eb20178f9f4d8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
@@ -771,7 +771,8 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: v_mov_b32_e32 v6, 8
+; VI-NEXT: v_mov_b32_e32 v6, 9
+; VI-NEXT: v_mov_b32_e32 v7, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -779,28 +780,28 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v1, v[0:1]
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: v_mov_b32_e32 v2, 9
+; VI-NEXT: v_mov_b32_e32 v2, 0xff
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v1
-; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v1
+; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v1
+; VI-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v1
; VI-NEXT: v_add_u16_e32 v9, 9, v1
-; VI-NEXT: v_add_u16_sdwa v10, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-; VI-NEXT: v_add_u16_e32 v7, 9, v7
+; VI-NEXT: v_add_u16_sdwa v10, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
; VI-NEXT: v_add_u16_e32 v8, 9, v8
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_and_b32_e32 v10, 0xff, v10
-; VI-NEXT: v_lshlrev_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: v_and_b32_e32 v1, 0xff, v8
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: v_and_b32_e32 v6, 0xff, v6
+; VI-NEXT: v_lshlrev_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v10
; VI-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v10
+; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v6
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: v_or_b32_e32 v2, v0, v2
; VI-NEXT: v_mov_b32_e32 v0, s2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index 06930388901b0..4df5fa18e2942 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -1271,46 +1271,45 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
; GFX8-LABEL: v_fshl_v4i8:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_not_b32_e32 v7, v2
+; GFX8-NEXT: v_mov_b32_e32 v9, 1
+; GFX8-NEXT: v_and_b32_e32 v6, 7, v2
+; GFX8-NEXT: v_and_b32_e32 v7, 7, v7
+; GFX8-NEXT: v_lshrrev_b16_sdwa v10, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v2
-; GFX8-NEXT: v_and_b32_e32 v8, 7, v2
-; GFX8-NEXT: v_not_b32_e32 v2, v2
-; GFX8-NEXT: v_mov_b32_e32 v10, 1
-; GFX8-NEXT: v_and_b32_e32 v2, 7, v2
-; GFX8-NEXT: v_lshrrev_b16_sdwa v11, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT: v_lshlrev_b16_e32 v8, v8, v0
-; GFX8-NEXT: v_lshrrev_b16_e32 v2, v2, v11
+; GFX8-NEXT: v_lshlrev_b16_e32 v6, v6, v0
+; GFX8-NEXT: v_lshrrev_b16_e32 v7, v7, v10
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v1
-; GFX8-NEXT: v_or_b32_e32 v2, v8, v2
-; GFX8-NEXT: v_and_b32_e32 v8, 7, v5
+; GFX8-NEXT: v_or_b32_e32 v6, v6, v7
+; GFX8-NEXT: v_and_b32_e32 v7, 7, v5
; GFX8-NEXT: v_not_b32_e32 v5, v5
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0
; GFX8-NEXT: v_and_b32_e32 v5, 7, v5
-; GFX8-NEXT: v_lshrrev_b16_sdwa v4, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT: v_mov_b32_e32 v9, 0xff
-; GFX8-NEXT: v_lshlrev_b16_e32 v3, v8, v3
+; GFX8-NEXT: v_lshrrev_b16_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT: v_lshlrev_b16_e32 v3, v7, v3
; GFX8-NEXT: v_lshrrev_b16_e32 v4, v5, v4
+; GFX8-NEXT: v_mov_b32_e32 v8, 0xff
; GFX8-NEXT: v_or_b32_e32 v3, v3, v4
-; GFX8-NEXT: v_and_b32_e32 v4, 7, v6
-; GFX8-NEXT: v_not_b32_e32 v5, v6
-; GFX8-NEXT: v_and_b32_sdwa v6, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_and_b32_e32 v5, 7, v5
-; GFX8-NEXT: v_lshrrev_b16_e32 v6, 1, v6
-; GFX8-NEXT: v_lshlrev_b16_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_lshrrev_b16_e32 v5, v5, v6
-; GFX8-NEXT: v_not_b32_e32 v6, v7
-; GFX8-NEXT: v_or_b32_e32 v4, v4, v5
-; GFX8-NEXT: v_and_b32_e32 v5, 7, v7
-; GFX8-NEXT: v_and_b32_e32 v6, 7, v6
-; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-NEXT: v_lshrrev_b16_e32 v1, v6, v1
+; GFX8-NEXT: v_mov_b32_e32 v4, 7
+; GFX8-NEXT: v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_not_b32_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_and_b32_sdwa v8, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT: v_not_b32_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+; GFX8-NEXT: v_and_b32_e32 v7, 7, v7
+; GFX8-NEXT: v_lshrrev_b16_e32 v8, 1, v8
+; GFX8-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX8-NEXT: v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b16_e32 v7, v7, v8
+; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1
+; GFX8-NEXT: v_or_b32_e32 v5, v5, v7
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: v_mov_b32_e32 v1, 8
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v4
+; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v5
+; GFX8-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
@@ -1321,47 +1320,46 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
; GFX9-LABEL: v_fshl_v4i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_not_b32_e32 v7, v2
+; GFX9-NEXT: v_mov_b32_e32 v9, 1
+; GFX9-NEXT: v_and_b32_e32 v6, 7, v2
+; GFX9-NEXT: v_and_b32_e32 v7, 7, v7
+; GFX9-NEXT: v_lshrrev_b16_sdwa v10, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v2
-; GFX9-NEXT: v_and_b32_e32 v8, 7, v2
-; GFX9-NEXT: v_not_b32_e32 v2, v2
-; GFX9-NEXT: v_mov_b32_e32 v10, 1
-; GFX9-NEXT: v_and_b32_e32 v2, 7, v2
-; GFX9-NEXT: v_lshrrev_b16_sdwa v11, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT: v_lshlrev_b16_e32 v8, v8, v0
-; GFX9-NEXT: v_lshrrev_b16_e32 v2, v2, v11
+; GFX9-NEXT: v_lshlrev_b16_e32 v6, v6, v0
+; GFX9-NEXT: v_lshrrev_b16_e32 v7, v7, v10
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1
-; GFX9-NEXT: v_or_b32_e32 v2, v8, v2
-; GFX9-NEXT: v_and_b32_e32 v8, 7, v5
+; GFX9-NEXT: v_or_b32_e32 v6, v6, v7
+; GFX9-NEXT: v_and_b32_e32 v7, 7, v5
; GFX9-NEXT: v_not_b32_e32 v5, v5
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0
; GFX9-NEXT: v_and_b32_e32 v5, 7, v5
-; GFX9-NEXT: v_lshrrev_b16_sdwa v4, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT: v_mov_b32_e32 v9, 0xff
-; GFX9-NEXT: v_lshlrev_b16_e32 v3, v8, v3
+; GFX9-NEXT: v_lshrrev_b16_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, v7, v3
; GFX9-NEXT: v_lshrrev_b16_e32 v4, v5, v4
+; GFX9-NEXT: v_mov_b32_e32 v8, 0xff
; GFX9-NEXT: v_or_b32_e32 v3, v3, v4
-; GFX9-NEXT: v_and_b32_e32 v4, 7, v6
-; GFX9-NEXT: v_not_b32_e32 v5, v6
-; GFX9-NEXT: v_and_b32_sdwa v6, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_and_b32_e32 v5, 7, v5
-; GFX9-NEXT: v_lshrrev_b16_e32 v6, 1, v6
-; GFX9-NEXT: v_lshlrev_b16_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_lshrrev_b16_e32 v5, v5, v6
-; GFX9-NEXT: v_not_b32_e32 v6, v7
-; GFX9-NEXT: v_or_b32_e32 v4, v4, v5
-; GFX9-NEXT: v_and_b32_e32 v5, 7, v7
-; GFX9-NEXT: v_and_b32_e32 v6, 7, v6
-; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-NEXT: v_lshrrev_b16_e32 v1, v6, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 7
+; GFX9-NEXT: v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_not_b32_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_and_b32_sdwa v10, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT: v_not_b32_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+; GFX9-NEXT: v_and_b32_e32 v7, 7, v7
+; GFX9-NEXT: v_lshrrev_b16_e32 v10, 1, v10
+; GFX9-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-NEXT: v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshrrev_b16_e32 v7, v7, v10
+; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1
+; GFX9-NEXT: v_or_b32_e32 v5, v5, v7
; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v1, 8
; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT: v_and_or_b32 v1, v2, v9, v1
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v4
+; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v5
; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX9-NEXT: v_and_or_b32 v1, v6, v8, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 24, v0
; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0
@@ -1370,42 +1368,41 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
; GFX10-LABEL: v_fshl_v4i8:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v8, 8, v2
-; GFX10-NEXT: v_and_b32_e32 v10, 7, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v2
+; GFX10-NEXT: v_and_b32_e32 v9, 7, v2
+; GFX10-NEXT: v_and_b32_e...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/94800
More information about the llvm-commits
mailing list