[llvm] 7de6107 - Revert "[AMDGPU] Improve v_cmpx usage on GFX10.3."
Thomas Symalla via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 21 01:55:06 PDT 2022
Author: Thomas Symalla
Date: 2022-03-21T09:50:44+01:00
New Revision: 7de6107dce5388968e3914601d02cf70b9853e9c
URL: https://github.com/llvm/llvm-project/commit/7de6107dce5388968e3914601d02cf70b9853e9c
DIFF: https://github.com/llvm/llvm-project/commit/7de6107dce5388968e3914601d02cf70b9853e9c.diff
LOG: Revert "[AMDGPU] Improve v_cmpx usage on GFX10.3."
This reverts commits 011c64191ef9ccc6538d52f4b57f98f37d4ea36e and
e725e2afe02e18398525652c9bceda1eb055ea64.
Differential Revision: https://reviews.llvm.org/D122117
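For context, the reverted change taught SIOptimizeExecMasking to rewrite
post-RA sequences of the form

    v_cmp_lt_i32_e32 vcc_lo, 15, v0      ; compare, result into an SGPR
    s_and_saveexec_b32 s0, vcc_lo        ; s0 = exec, exec = exec & vcc_lo

into (on GFX10.3 and later)

    s_mov_b32 s0, exec_lo                ; save exec directly
    v_cmpx_lt_i32_e32 15, v0             ; compare and update EXEC in one step

to reduce pipeline stalls. This is a sketch reconstructed from the comments
and tests removed below; the registers and the immediate are illustrative.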
Added:
Modified:
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
llvm/lib/Target/AMDGPU/SIInstrInfo.h
llvm/lib/Target/AMDGPU/SIInstrInfo.td
llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
llvm/lib/Target/AMDGPU/VOPCInstructions.td
llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll
llvm/test/CodeGen/AMDGPU/wqm.ll
Removed:
llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll
llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.mir
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 5fd13e4d54f7b..5afd0bba095b2 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3856,19 +3856,18 @@ static void copyFlagsToImplicitVCC(MachineInstr &MI,
MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
unsigned Op32) const {
- MachineBasicBlock *MBB = MI.getParent();
+ MachineBasicBlock *MBB = MI.getParent();
MachineInstrBuilder Inst32 =
BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32))
.setMIFlags(MI.getFlags());
// Add the dst operand if the 32-bit encoding also has an explicit $vdst.
// For VOPC instructions, this is replaced by an implicit def of vcc.
- if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst) != -1) {
+ int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst);
+ if (Op32DstIdx != -1) {
// dst
Inst32.add(MI.getOperand(0));
- } else if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::sdst) != -1) {
- // VOPCX instructions won't be writing to an explicit dst, so this should
- // not fail for these instructions.
+ } else {
assert(((MI.getOperand(0).getReg() == AMDGPU::VCC) ||
(MI.getOperand(0).getReg() == AMDGPU::VCC_LO)) &&
"Unexpected case");
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 4b7589002a718..25d3f4a765e6b 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1263,10 +1263,6 @@ namespace AMDGPU {
LLVM_READONLY
int getMFMAEarlyClobberOp(uint16_t Opcode);
- /// \returns v_cmpx version of a v_cmp instruction.
- LLVM_READONLY
- int getVCMPXOpFromVCMP(uint16_t Opcode);
-
const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19);
const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index e0e99680a962b..30c67d92b4cde 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2669,15 +2669,6 @@ def getMFMAEarlyClobberOp : InstrMapping {
let ValueCols = [["0"]];
}
-// Maps a v_cmp instruction to its v_cmpx equivalent.
-def getVCMPXOpFromVCMP : InstrMapping {
- let FilterClass = "VCMPVCMPXTable";
- let RowFields = ["VCMPOp"];
- let ColFields = ["IsVCMPX"];
- let KeyCol = ["0"];
- let ValueCols = [["1"]];
-}
-
include "SIInstructions.td"
include "DSInstructions.td"
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
index 471a871891aec..9a4cc25f00085 100644
--- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
@@ -11,7 +11,6 @@
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"
-#include "llvm/CodeGen/LivePhysRegs.h"
using namespace llvm;
@@ -293,183 +292,6 @@ static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) {
return false;
}
-// Backwards-iterate from Origin (for n=MaxInstructions iterations) until either
-// the beginning of the BB is reached or Pred evaluates to true - which can be
-// an arbitrary condition based on the current MachineInstr, for instance a
-// target instruction. Breaks prematurely by returning nullptr if one of the
-// registers given in NonModifiableRegs is modified by the current instruction.
-static MachineInstr *
-findInstrBackwards(MachineInstr &Origin,
- std::function<bool(MachineInstr *)> Pred,
- ArrayRef<MCRegister> NonModifiableRegs,
- const SIRegisterInfo *TRI, unsigned MaxInstructions = 5) {
- MachineBasicBlock::reverse_iterator A = Origin.getReverseIterator(),
- E = Origin.getParent()->rend();
- unsigned CurrentIteration = 0;
-
- for (++A; CurrentIteration < MaxInstructions && A != E; ++A) {
- if (Pred(&*A))
- return &*A;
-
- for (MCRegister Reg : NonModifiableRegs) {
- if (A->modifiesRegister(Reg, TRI))
- return nullptr;
- }
-
- ++CurrentIteration;
- }
-
- return nullptr;
-}
-
-// Determine if a register Reg is not re-defined and still in use
-// in the range (Stop..BB.end].
-// It does so by calculating liveness backwards from the end of the BB until
-// either Stop or the beginning of the BB is reached.
-// After liveness is calculated, we can determine if Reg is still in use and not
-// defined in between the instructions.
-static bool isRegisterInUseAfter(MachineInstr &Stop, MCRegister Reg,
- const SIRegisterInfo *TRI,
- MachineRegisterInfo &MRI) {
- LivePhysRegs LR(*TRI);
- LR.addLiveOuts(*Stop.getParent());
-
- for (auto A = Stop.getParent()->rbegin();
- A != Stop.getParent()->rend() && A != Stop; ++A) {
- LR.stepBackward(*A);
- }
-
- return !LR.available(MRI, Reg);
-}
-
-// Tries to find an opportunity to optimize a v_cmp ..., s_and_saveexec sequence
-// by looking at an instance of an s_and_saveexec instruction. Returns a pointer
-// to the v_cmp instruction if it is safe to replace the sequence (see the
-// conditions in the function body). This is after register allocation, so some
-// checks on operand dependencies need to be considered.
-static MachineInstr *findPossibleVCMPVCMPXOptimization(
- MachineInstr &SaveExec, MCRegister Exec, const SIRegisterInfo *TRI,
- const SIInstrInfo *TII, MachineRegisterInfo &MRI) {
-
- MachineInstr *VCmp = nullptr;
-
- Register SaveExecDest = SaveExec.getOperand(0).getReg();
- if (!TRI->isSGPRReg(MRI, SaveExecDest))
- return nullptr;
-
- MachineOperand *SaveExecSrc0 =
- TII->getNamedOperand(SaveExec, AMDGPU::OpName::src0);
- if (!SaveExecSrc0->isReg())
- return nullptr;
-
- // Try to find the last v_cmp instruction that defs the saveexec input
- // operand without any write to Exec in between.
- VCmp = findInstrBackwards(
- SaveExec,
- [&](MachineInstr *Check) {
- return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 &&
- Check->modifiesRegister(SaveExecSrc0->getReg(), TRI);
- },
- {Exec, SaveExecSrc0->getReg()}, TRI);
-
- if (!VCmp)
- return nullptr;
-
- MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst);
- assert(VCmpDest && "Should have an sdst operand!");
-
- // Check if any of the v_cmp source operands is written by the saveexec.
- MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0);
- if (Src0->isReg() && TRI->isSGPRReg(MRI, Src0->getReg()) &&
- SaveExec.modifiesRegister(Src0->getReg(), TRI))
- return nullptr;
-
- MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1);
- if (Src1->isReg() && TRI->isSGPRReg(MRI, Src1->getReg()) &&
- SaveExec.modifiesRegister(Src1->getReg(), TRI))
- return nullptr;
-
- // Don't do the transformation if the destination operand is included in
- // its MBB live-outs, meaning it's used in any of its successors, leading
- // to incorrect code if the v_cmp and therefore the def of
- // the dest operand is removed.
- if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg()))
- return nullptr;
-
- // If the v_cmp target is in use after the s_and_saveexec, skip the
- // optimization.
- if (isRegisterInUseAfter(SaveExec, VCmpDest->getReg(), TRI,
- MRI))
- return nullptr;
-
- // Try to determine if there is a write to any of the VCmp
- // operands between the saveexec and the vcmp.
- // If yes, additional VGPR spilling might need to be inserted. In this case,
- // it's not worth replacing the instruction sequence.
- SmallVector<MCRegister, 2> NonDefRegs;
- if (Src0->isReg())
- NonDefRegs.push_back(Src0->getReg());
-
- if (Src1->isReg())
- NonDefRegs.push_back(Src1->getReg());
-
- if (!findInstrBackwards(
- SaveExec, [&](MachineInstr *Check) { return Check == VCmp; },
- NonDefRegs, TRI))
- return nullptr;
-
- return VCmp;
-}
-
-// Inserts the optimized s_mov_b32 / v_cmpx sequence based on the
-// operands extracted from a v_cmp ..., s_and_saveexec pattern.
-static bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
- MachineInstr &VCmp, MCRegister Exec,
- const SIInstrInfo *TII,
- const SIRegisterInfo *TRI,
- MachineRegisterInfo &MRI) {
- const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode());
-
- if (NewOpcode == -1)
- return false;
-
- MachineOperand *Src0 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src0);
- MachineOperand *Src1 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src1);
-
- Register MoveDest = SaveExecInstr.getOperand(0).getReg();
-
- MachineBasicBlock::instr_iterator InsertPosIt = SaveExecInstr.getIterator();
- if (!SaveExecInstr.uses().empty()) {
- bool isSGPR32 = TRI->getRegSizeInBits(MoveDest, MRI) == 32;
- unsigned MovOpcode = isSGPR32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
- BuildMI(*SaveExecInstr.getParent(), InsertPosIt,
- SaveExecInstr.getDebugLoc(), TII->get(MovOpcode), MoveDest)
- .addReg(Exec);
- }
-
- // Omit dst as V_CMPX is implicitly writing to EXEC.
- // Add dummy src and clamp modifiers, if needed.
- auto Builder = BuildMI(*VCmp.getParent(), std::next(InsertPosIt),
- VCmp.getDebugLoc(), TII->get(NewOpcode));
-
- if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0_modifiers) !=
- -1)
- Builder.addImm(0);
-
- Builder.add(*Src0);
-
- if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1_modifiers) !=
- -1)
- Builder.addImm(0);
-
- Builder.add(*Src1);
-
- if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) != -1)
- Builder.addImm(0);
-
- return true;
-}
-
bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
@@ -477,7 +299,6 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
const SIInstrInfo *TII = ST.getInstrInfo();
- MachineRegisterInfo *MRI = &MF.getRegInfo();
MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
// Optimize sequences emitted for control flow lowering. They are originally
@@ -641,45 +462,5 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
Changed = true;
}
- // After all s_op_saveexec instructions are inserted,
- // replace (on GFX10.3 and later)
- // v_cmp_* SGPR, IMM, VGPR
- // s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR
- // with
- // s_mov_b32 EXEC_SGPR_DEST, exec_lo
- // v_cmpx_* IMM, VGPR
- // to reduce pipeline stalls.
- if (ST.hasGFX10_3Insts()) {
- DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
- const unsigned AndSaveExecOpcode =
- ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
-
- for (MachineBasicBlock &MBB : MF) {
- for (MachineInstr &MI : MBB) {
- // Record relevant v_cmp / s_and_saveexec instruction pairs for
- // replacement.
- if (MI.getOpcode() != AndSaveExecOpcode)
- continue;
-
- if (MachineInstr *VCmp =
- findPossibleVCMPVCMPXOptimization(MI, Exec, TRI, TII, *MRI))
- SaveExecVCmpMapping[&MI] = VCmp;
- }
- }
-
- for (const auto &Entry : SaveExecVCmpMapping) {
- MachineInstr *SaveExecInstr = Entry.getFirst();
- MachineInstr *VCmpInstr = Entry.getSecond();
-
- if (optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec, TII,
- TRI, *MRI)) {
- SaveExecInstr->eraseFromParent();
- VCmpInstr->eraseFromParent();
-
- Changed = true;
- }
- }
- }
-
return Changed;
}
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 880df8bdb67ec..e437552c2afdc 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -728,27 +728,21 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
int Op32 = AMDGPU::getVOPe32(MI.getOpcode());
if (TII->isVOPC(Op32)) {
- MachineOperand &Op0 = MI.getOperand(0);
- if (Op0.isReg()) {
- // Exclude VOPCX instructions as these don't explicitly write a
- // dst.
- Register DstReg = Op0.getReg();
- if (DstReg.isVirtual()) {
- // VOPC instructions can only write to the VCC register. We can't
- // force them to use VCC here, because this is only one register and
- // cannot deal with sequences which would require multiple copies of
- // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
- //
- // So, instead of forcing the instruction to write to VCC, we
- // provide a hint to the register allocator to use VCC and then we
- // will run this pass again after RA and shrink it if it outputs to
- // VCC.
- MRI.setRegAllocationHint(DstReg, 0, VCCReg);
- continue;
- }
- if (DstReg != VCCReg)
- continue;
+ Register DstReg = MI.getOperand(0).getReg();
+ if (DstReg.isVirtual()) {
+ // VOPC instructions can only write to the VCC register. We can't
+ // force them to use VCC here, because this is only one register and
+ // cannot deal with sequences which would require multiple copies of
+ // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
+ //
+ // So, instead of forcing the instruction to write to VCC, we provide
+ // a hint to the register allocator to use VCC and then we will run
+ // this pass again after RA and shrink it if it outputs to VCC.
+ MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, VCCReg);
+ continue;
}
+ if (DstReg != VCCReg)
+ continue;
}
if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index 1220b5c8ac35d..c0cc91029d111 100644
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -205,11 +205,6 @@ class VCMPXNoSDstTable <bit has_sdst, string Name> {
string NoSDstOp = Name;
}
-class VCMPVCMPXTable <string Name> {
- bit IsVCMPX = 0;
- string VCMPOp = Name;
-}
-
multiclass VOPC_Pseudos <string opName,
VOPC_Profile P,
SDPatternOperator cond = COND_NULL,
@@ -218,8 +213,7 @@ multiclass VOPC_Pseudos <string opName,
def _e32 : VOPC_Pseudo <opName, P>,
Commutable_REV<revOp#"_e32", !eq(revOp, opName)>,
- VCMPXNoSDstTable<1, opName#"_e32">,
- VCMPVCMPXTable<opName#"_e32"> {
+ VCMPXNoSDstTable<1, opName#"_e32"> {
let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
let SchedRW = P.Schedule;
let isConvergent = DefExec;
@@ -229,8 +223,7 @@ multiclass VOPC_Pseudos <string opName,
def _e64 : VOP3_Pseudo<opName, P, getVOPCPat64<cond, P>.ret>,
Commutable_REV<revOp#"_e64", !eq(revOp, opName)>,
- VCMPXNoSDstTable<1, opName#"_e64">,
- VCMPVCMPXTable<opName#"_e64"> {
+ VCMPXNoSDstTable<1, opName#"_e64"> {
let Defs = !if(DefExec, [EXEC], []);
let SchedRW = P.Schedule;
let isCompare = 1;
@@ -255,27 +248,23 @@ multiclass VOPCX_Pseudos <string opName,
def _nosdst_e32 : VOPC_Pseudo <opName#"_nosdst", P_NoSDst, [], 0>,
Commutable_REV<revOp#"_nosdst_e32", !eq(revOp, opName)>,
- VCMPXNoSDstTable<0, opName#"_e32">,
- VCMPVCMPXTable<!subst("v_cmpx", "v_cmp", opName#"_e32")> {
+ VCMPXNoSDstTable<0, opName#"_e32"> {
let Defs = [EXEC];
let SchedRW = P_NoSDst.Schedule;
let isConvergent = 1;
let isCompare = 1;
let isCommutable = 1;
let SubtargetPredicate = HasNoSdstCMPX;
- let IsVCMPX = 1;
}
def _nosdst_e64 : VOP3_Pseudo<opName#"_nosdst", P_NoSDst>,
Commutable_REV<revOp#"_nosdst_e64", !eq(revOp, opName)>,
- VCMPXNoSDstTable<0, opName#"_e64">,
- VCMPVCMPXTable<!subst("v_cmpx", "v_cmp", opName#"_e64")> {
+ VCMPXNoSDstTable<0, opName#"_e64"> {
let Defs = [EXEC];
let SchedRW = P_NoSDst.Schedule;
let isCompare = 1;
let isCommutable = 1;
let SubtargetPredicate = HasNoSdstCMPX;
- let IsVCMPX = 1;
}
foreach _ = BoolToList<P_NoSDst.HasExtSDWA>.ret in
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll
index a2c35b97aef65..2fed25fff3d50 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll
@@ -6,7 +6,7 @@
; GCN-LABEL: long_forward_scc_branch_3f_offset_bug:
; GFX1030: s_cmp_lg_u32
-; GFX1030: s_cbranch_scc1 [[ENDBB:.LBB[0-9]+_[0-9]+]]
+; GFX1030-NEXT: s_cbranch_scc1 [[ENDBB:.LBB[0-9]+_[0-9]+]]
; GFX1010: s_cmp_lg_u32
; GFX1010-NEXT: s_cbranch_scc0 [[RELAX_BB:.LBB[0-9]+_[0-9]+]]
@@ -51,9 +51,9 @@ bb3:
}
; GCN-LABEL: {{^}}long_forward_exec_branch_3f_offset_bug:
-; GFX1030: s_mov_b32
-; GFX1030: v_cmpx_eq_u32
-; GFX1030: s_cbranch_execnz [[RELAX_BB:.LBB[0-9]+_[0-9]+]]
+; GFX1030: v_cmp_eq_u32
+; GFX1030: s_and_saveexec_b32
+; GFX1030-NEXT: s_cbranch_execnz [[RELAX_BB:.LBB[0-9]+_[0-9]+]]
; GFX1010: v_cmp_eq_u32
; GFX1010: s_and_saveexec_b32
diff --git a/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll
deleted file mode 100644
index f8aae95f62b12..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll
+++ /dev/null
@@ -1,167 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1010 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1030 %s
-
-; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_lt:
-; GFX1010: v_cmp_lt_i32_e32 vcc_lo, 15, v{{.*}}
-; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
-; GFX1030: s_mov_b32 s{{.*}}, exec_lo
-; GFX1030-NEXT: v_cmpx_lt_i32_e32 15, v{{.*}}
-define i32 @test_insert_vcmpx_pattern_lt(i32 %x) {
-entry:
- %bc = icmp slt i32 %x, 16
- br i1 %bc, label %endif, label %if
-
-if:
- %ret = shl i32 %x, 2
- ret i32 %ret
-
-endif:
- ret i32 %x
-}
-
-; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_gt:
-; GFX1010: v_cmp_gt_i32_e32 vcc_lo, 17, v{{.*}}
-; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
-; GFX1030: s_mov_b32 s{{.*}}, exec_lo
-; GFX1030-NEXT: v_cmpx_gt_i32_e32 17, v{{.*}}
-define i32 @test_insert_vcmpx_pattern_gt(i32 %x) {
-entry:
- %bc = icmp sgt i32 %x, 16
- br i1 %bc, label %endif, label %if
-
-if:
- %ret = shl i32 %x, 2
- ret i32 %ret
-
-endif:
- ret i32 %x
-}
-
-; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_eq:
-; GFX1010: v_cmp_ne_u32_e32 vcc_lo, 16, v{{.*}}
-; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
-; GFX1030: s_mov_b32 s{{.*}}, exec_lo
-; GFX1030-NEXT: v_cmpx_ne_u32_e32 16, v{{.*}}
-define i32 @test_insert_vcmpx_pattern_eq(i32 %x) {
-entry:
- %bc = icmp eq i32 %x, 16
- br i1 %bc, label %endif, label %if
-
-if:
- %ret = shl i32 %x, 2
- ret i32 %ret
-
-endif:
- ret i32 %x
-}
-
-; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_ne:
-; GFX1010: v_cmp_eq_u32_e32 vcc_lo, 16, v{{.*}}
-; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
-; GFX1030: s_mov_b32 s{{.*}}, exec_lo
-; GFX1030-NEXT: v_cmpx_eq_u32_e32 16, v{{.*}}
-define i32 @test_insert_vcmpx_pattern_ne(i32 %x) {
-entry:
- %bc = icmp ne i32 %x, 16
- br i1 %bc, label %endif, label %if
-
-if:
- %ret = shl i32 %x, 2
- ret i32 %ret
-
-endif:
- ret i32 %x
-}
-
-; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_le:
-; GFX1010: v_cmp_lt_i32_e32 vcc_lo, 16, v{{.*}}
-; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
-; GFX1030: s_mov_b32 s{{.*}}, exec_lo
-; GFX1030-NEXT: v_cmpx_lt_i32_e32 16, v{{.*}}
-define i32 @test_insert_vcmpx_pattern_le(i32 %x) {
-entry:
- %bc = icmp sle i32 %x, 16
- br i1 %bc, label %endif, label %if
-
-if:
- %ret = shl i32 %x, 2
- ret i32 %ret
-
-endif:
- ret i32 %x
-}
-
-; GCN-LABEL: {{^}}test_insert_vcmpx_pattern_ge:
-; GFX1010: v_cmp_gt_i32_e32 vcc_lo, 16, v{{.*}}
-; GFX1010-NEXT: s_and_saveexec_b32 s{{.*}}, vcc_lo
-; GFX1030: s_mov_b32 s{{.*}}, exec_lo
-; GFX1030-NEXT: v_cmpx_gt_i32_e32 16, v{{.*}}
-define i32 @test_insert_vcmpx_pattern_ge(i32 %x) {
-entry:
- %bc = icmp sge i32 %x, 16
- br i1 %bc, label %endif, label %if
-
-if:
- %ret = shl i32 %x, 2
- ret i32 %ret
-
-endif:
- ret i32 %x
-}
-
-declare amdgpu_gfx void @check_live_outs_helper(i64) #0
-
-; In cases where the output operand cannot be safely removed,
-; don't apply the v_cmpx transformation.
-
-; GCN-LABEL: {{^}}check_live_outs:
-; GFX1010: v_cmp_eq_u32_e64 s{{.*}}, v{{.*}}, v{{.*}}
-; GFX1010: s_and_saveexec_b32 s{{.*}}, s{{.*}}
-; GFX1030: v_cmp_eq_u32_e64 s{{.*}}, v{{.*}}, v{{.*}}
-; GFX1030: s_and_saveexec_b32 s{{.*}}, s{{.*}}
-define amdgpu_cs void @check_live_outs(i32 %a, i32 %b) {
- %cond = icmp eq i32 %a, %b
- %result = call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 32)
- br i1 %cond, label %l1, label %l2
-l1:
- call amdgpu_gfx void @check_live_outs_helper(i64 %result)
- br label %l2
-l2:
- ret void
-}
-
-; Omit the transformation if the s_and_saveexec instruction overwrites
-; any of the v_cmp source operands.
-
-; GCN-LABEL: check_saveexec_overwrites_vcmp_source:
-; GCN: ; %bb.1: ; %then
-; GFX1010: v_cmp_ge_i32_e32 vcc_lo, s[[A:[0-9]+]], v{{.*}}
-; GFX1010-NEXT: v_mov_b32_e32 {{.*}}, s[[A]]
-; GFX1010-NEXT: s_and_saveexec_b32 s[[A]], vcc_lo
-; GFX1030: v_cmp_ge_i32_e32 vcc_lo, s[[A:[0-9]+]], v{{.*}}
-; GFX1030-NEXT: v_mov_b32_e32 {{.*}}, s[[A]]
-; GFX1030-NEXT: s_and_saveexec_b32 s[[A]], vcc_lo
-define i32 @check_saveexec_overwrites_vcmp_source(i32 inreg %a, i32 inreg %b) {
-entry:
- %0 = icmp sge i32 %a, 0
- br i1 %0, label %if, label %then
-
-if:
- %1 = shl i32 %a, 2
- %2 = or i32 %1, %b
- ret i32 %2
-
-then:
- %3 = call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 32)
- %4 = trunc i64 %3 to i32
- %5 = icmp slt i32 %4, %b
- br i1 %5, label %after, label %end
-
-after:
- ret i32 %4
-
-end:
- ret i32 %a
-}
-
-declare i64 @llvm.amdgcn.icmp.i32(i32, i32, i32) #0
diff --git a/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.mir b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.mir
deleted file mode 100644
index 1a39ad862827c..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.mir
+++ /dev/null
@@ -1,24 +0,0 @@
-# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 %s -o - | FileCheck -check-prefix=GCN %s
-# RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 %s -o - | FileCheck -check-prefix=GCN %s
-
----
-
-# After the Optimize exec masking (post-RA) pass, there's a chance of v_cmpx instructions
-# being introduced whenever there's a sequence of v_cmp and s_and_saveexec instructions
-# which can be safely replaced in various cases.
-# However, it is not safe to do so when the generated code sequence would omit part of the EXEC mask,
-# which could occur when a subset of EXEC is used as an input operand in the v_cmp instruction.
-# The idea behind this test is to check if the subregisters are correctly handled here.
-
-# GCN-LABEL: vcmp_saveexec_to_mov_vcmpx_exec_subreg:
-# GCN: v_cmp_gt_u32_e64 s[[[SDST_LO:[0-9]+]]:[[SDST_HI:[0-9]+]]], exec_lo, v{{.*}}
-# GCN: s_and_saveexec_b64 s[[[EXEC_LO:[0-9]+]]:[[EXEC_HI:[0-9]+]]], s[[[SDST_LO]]:[[SDST_HI]]]
-name: vcmp_saveexec_to_mov_vcmpx_exec_subreg
-tracksRegLiveness: true
-body: |
- bb.0:
- liveins: $vgpr0
- renamable $sgpr0_sgpr1 = V_CMP_GT_U32_e64 $exec_lo, killed $vgpr0, implicit $exec
- $sgpr2_sgpr3 = COPY $exec, implicit-def $exec
- $sgpr2_sgpr3 = S_AND_B64 killed renamable $sgpr2_sgpr3, killed renamable $sgpr0_sgpr1, implicit-def dead $scc
- $exec = S_MOV_B64_term killed renamable $sgpr2_sgpr3
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index 7cfb43ba802ac..e462a460c93fa 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -1250,8 +1250,8 @@ define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
-; GFX10-W32-NEXT: v_cmpx_ne_u32_e32 0, v1
+; GFX10-W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
+; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
; GFX10-W32-NEXT: s_xor_b32 s13, exec_lo, s13
; GFX10-W32-NEXT: s_cbranch_execz .LBB23_2
; GFX10-W32-NEXT: ; %bb.1: ; %ELSE
@@ -1329,8 +1329,8 @@ define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
-; GFX10-W32-NEXT: v_cmpx_ne_u32_e32 0, v1
+; GFX10-W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
+; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
; GFX10-W32-NEXT: s_xor_b32 s13, exec_lo, s13
; GFX10-W32-NEXT: s_cbranch_execz .LBB24_2
; GFX10-W32-NEXT: ; %bb.1: ; %IF
@@ -1508,10 +1508,10 @@ define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: image_sample v1, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
+; GFX10-W32-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0, v1
; GFX10-W32-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen
-; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
; GFX10-W32-NEXT: ; implicit-def: $vgpr0
-; GFX10-W32-NEXT: v_cmpx_nlt_f32_e32 0, v1
+; GFX10-W32-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX10-W32-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX10-W32-NEXT: ; %bb.1: ; %ELSE
; GFX10-W32-NEXT: v_mul_f32_e32 v0, 4.0, v1
@@ -1577,8 +1577,8 @@ define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i3
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
-; GFX10-W32-NEXT: v_cmpx_eq_u32_e32 0, v1
+; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
; GFX10-W32-NEXT: s_cbranch_execz .LBB27_2
; GFX10-W32-NEXT: ; %bb.1: ; %IF
; GFX10-W32-NEXT: s_and_saveexec_b32 s14, s12
@@ -2960,9 +2960,9 @@ define amdgpu_ps float @test_strict_wqm_within_wqm(<8 x i32> inreg %rsrc, <4 x i
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s13
+; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
-; GFX10-W32-NEXT: v_cmpx_eq_u32_e32 0, v1
+; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
; GFX10-W32-NEXT: s_cbranch_execz .LBB46_2
; GFX10-W32-NEXT: ; %bb.1: ; %IF
; GFX10-W32-NEXT: s_mov_b32 s14, exec_lo