[llvm] fee4151 - AMDGPU/GlobalISel: Introduce post-legalize combiner
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 24 19:12:26 PST 2020
Author: Matt Arsenault
Date: 2020-02-24T22:12:12-05:00
New Revision: fee41517fe0f7ff9f0e204dd9200ebf32ca03cb8
URL: https://github.com/llvm/llvm-project/commit/fee41517fe0f7ff9f0e204dd9200ebf32ca03cb8
DIFF: https://github.com/llvm/llvm-project/commit/fee41517fe0f7ff9f0e204dd9200ebf32ca03cb8.diff
LOG: AMDGPU/GlobalISel: Introduce post-legalize combiner
The current set of custom combines is only really useful after
legalization, so move them there. There is a lot of overlap in the
boilerplate here, but I think we do want a pretty different set of
combines before and after legalization. I think we will want a lot of
overlap between the post-legalize combiner and a post-regbankselect combiner.
Added:
llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
Modified:
llvm/lib/Target/AMDGPU/AMDGPU.h
llvm/lib/Target/AMDGPU/AMDGPUCombine.td
llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
llvm/lib/Target/AMDGPU/CMakeLists.txt
llvm/test/CodeGen/AMDGPU/GlobalISel/combine-ashr-narrow.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/combine-lshr-narrow.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-narrow.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/fmax_legacy.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/fmin_legacy.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 1859cfaba5bc..bae5e4059c37 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -30,6 +30,8 @@ class Module;
// GlobalISel passes
void initializeAMDGPUPreLegalizerCombinerPass(PassRegistry &);
FunctionPass *createAMDGPUPreLegalizeCombiner(bool IsOptNone);
+void initializeAMDGPUPostLegalizerCombinerPass(PassRegistry &);
+FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone);
// R600 Passes
FunctionPass *createR600VectorRegMerger();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index 3f298fd1dafa..4063bb1fca2c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -26,7 +26,12 @@ def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>;
def AMDGPUPreLegalizerCombinerHelper: GICombinerHelper<
"AMDGPUGenPreLegalizerCombinerHelper", [all_combines,
- elide_br_by_inverting_cond,
- gfx6gfx7_combines]> {
+ elide_br_by_inverting_cond]> {
let DisableRuleOption = "amdgpuprelegalizercombiner-disable-rule";
}
+
+def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper<
+ "AMDGPUGenPostLegalizerCombinerHelper", [all_combines,
+ gfx6gfx7_combines]> {
+ let DisableRuleOption = "amdgpupostlegalizercombiner-disable-rule";
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
new file mode 100644
index 000000000000..8f43e6e9f0e4
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -0,0 +1,261 @@
+//=== lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass does combining of machine instructions at the generic MI level,
+// after the legalizer.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUTargetMachine.h"
+#include "AMDGPULegalizerInfo.h"
+#include "llvm/CodeGen/GlobalISel/Combiner.h"
+#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
+#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
+#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/Support/Debug.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+
+#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"
+
+using namespace llvm;
+using namespace MIPatternMatch;
+
+struct FMinFMaxLegacyInfo {
+ Register LHS;
+ Register RHS;
+ Register True;
+ Register False;
+ CmpInst::Predicate Pred;
+};
+
+// TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
+static bool matchFMinFMaxLegacy(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineFunction &MF, FMinFMaxLegacyInfo &Info) {
+ // FIXME: Combines should have subtarget predicates, and we shouldn't need
+ // this here.
+ if (!MF.getSubtarget<GCNSubtarget>().hasFminFmaxLegacy())
+ return false;
+
+ // FIXME: Type predicate on pattern
+ if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
+ return false;
+
+ Register Cond = MI.getOperand(1).getReg();
+ if (!MRI.hasOneNonDBGUse(Cond) ||
+ !mi_match(Cond, MRI,
+ m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
+ return false;
+
+ Info.True = MI.getOperand(2).getReg();
+ Info.False = MI.getOperand(3).getReg();
+
+ if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
+ !(Info.LHS == Info.False && Info.RHS == Info.True))
+ return false;
+
+ switch (Info.Pred) {
+ case CmpInst::FCMP_FALSE:
+ case CmpInst::FCMP_OEQ:
+ case CmpInst::FCMP_ONE:
+ case CmpInst::FCMP_ORD:
+ case CmpInst::FCMP_UNO:
+ case CmpInst::FCMP_UEQ:
+ case CmpInst::FCMP_UNE:
+ case CmpInst::FCMP_TRUE:
+ return false;
+ default:
+ return true;
+ }
+}
+
+static void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
+ const FMinFMaxLegacyInfo &Info) {
+
+ auto buildNewInst = [&MI](unsigned Opc, Register X, Register Y) {
+ MachineIRBuilder MIB(MI);
+ MIB.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
+ };
+
+ switch (Info.Pred) {
+ case CmpInst::FCMP_ULT:
+ case CmpInst::FCMP_ULE:
+ if (Info.LHS == Info.True)
+ buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
+ else
+ buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
+ break;
+ case CmpInst::FCMP_OLE:
+ case CmpInst::FCMP_OLT: {
+ // We need to permute the operands to get the correct NaN behavior. The
+ // selected operand is the second one based on the failing compare with NaN,
+ // so permute it based on the compare type the hardware uses.
+ if (Info.LHS == Info.True)
+ buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
+ else
+ buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
+ break;
+ }
+ case CmpInst::FCMP_UGE:
+ case CmpInst::FCMP_UGT: {
+ if (Info.LHS == Info.True)
+ buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
+ else
+ buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
+ break;
+ }
+ case CmpInst::FCMP_OGT:
+ case CmpInst::FCMP_OGE: {
+ if (Info.LHS == Info.True)
+ buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
+ else
+ buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
+ break;
+ }
+ default:
+ llvm_unreachable("predicate should not have matched");
+ }
+
+ MI.eraseFromParent();
+}
+
+
+#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
+#include "AMDGPUGenPostLegalizeGICombiner.inc"
+#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
+
+namespace {
+#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
+#include "AMDGPUGenPostLegalizeGICombiner.inc"
+#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
+
+class AMDGPUPostLegalizerCombinerInfo : public CombinerInfo {
+ GISelKnownBits *KB;
+ MachineDominatorTree *MDT;
+
+public:
+ AMDGPUGenPostLegalizerCombinerHelper Generated;
+
+ AMDGPUPostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
+ const AMDGPULegalizerInfo *LI,
+ GISelKnownBits *KB, MachineDominatorTree *MDT)
+ : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
+ /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
+ KB(KB), MDT(MDT) {
+ if (!Generated.parseCommandLineOption())
+ report_fatal_error("Invalid rule identifier");
+ }
+
+ virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
+ MachineIRBuilder &B) const override;
+};
+
+bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
+ MachineInstr &MI,
+ MachineIRBuilder &B) const {
+ CombinerHelper Helper(Observer, B, KB, MDT);
+
+ if (Generated.tryCombineAll(Observer, MI, B, Helper))
+ return true;
+
+ switch (MI.getOpcode()) {
+ case TargetOpcode::G_SHL:
+ case TargetOpcode::G_LSHR:
+ case TargetOpcode::G_ASHR:
+ // On some subtargets, 64-bit shift is a quarter rate instruction. In the
+ // common case, splitting this into a move and a 32-bit shift is faster and
+ // the same code size.
+ return Helper.tryCombineShiftToUnmerge(MI, 32);
+ }
+
+ return false;
+}
+
+#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
+#include "AMDGPUGenPostLegalizeGICombiner.inc"
+#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
+
+// Pass boilerplate
+// ================
+
+class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
+public:
+ static char ID;
+
+ AMDGPUPostLegalizerCombiner(bool IsOptNone = false);
+
+ StringRef getPassName() const override {
+ return "AMDGPUPostLegalizerCombiner";
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+private:
+ bool IsOptNone;
+};
+} // end anonymous namespace
+
+void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetPassConfig>();
+ AU.setPreservesCFG();
+ getSelectionDAGFallbackAnalysisUsage(AU);
+ AU.addRequired<GISelKnownBitsAnalysis>();
+ AU.addPreserved<GISelKnownBitsAnalysis>();
+ if (!IsOptNone) {
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ }
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
+ : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
+ initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
+}
+
+bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
+ if (MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::FailedISel))
+ return false;
+ auto *TPC = &getAnalysis<TargetPassConfig>();
+ const Function &F = MF.getFunction();
+ bool EnableOpt =
+ MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);
+
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const AMDGPULegalizerInfo *LI
+ = static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());
+
+ GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
+ MachineDominatorTree *MDT =
+ IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
+ AMDGPUPostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
+ F.hasMinSize(), LI, KB, MDT);
+ Combiner C(PCInfo, TPC);
+ return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
+}
+
+char AMDGPUPostLegalizerCombiner::ID = 0;
+INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
+ "Combine AMDGPU machine instrs after legalization",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
+INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
+ "Combine AMDGPU machine instrs after legalization", false,
+ false)
+
+namespace llvm {
+FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
+ return new AMDGPUPostLegalizerCombiner(IsOptNone);
+}
+} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
index 2757dde6f257..1c337afadd43 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
@@ -28,112 +28,13 @@
using namespace llvm;
using namespace MIPatternMatch;
-struct FMinFMaxLegacyInfo {
- Register LHS;
- Register RHS;
- Register True;
- Register False;
- CmpInst::Predicate Pred;
-};
-
-// TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
-static bool matchFMinFMaxLegacy(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineFunction &MF, FMinFMaxLegacyInfo &Info) {
- // FIXME: Combines should have subtarget predicates, and we shouldn't need
- // this here.
- if (!MF.getSubtarget<GCNSubtarget>().hasFminFmaxLegacy())
- return false;
-
- // FIXME: Type predicate on pattern
- if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
- return false;
-
- Register Cond = MI.getOperand(1).getReg();
- if (!MRI.hasOneNonDBGUse(Cond) ||
- !mi_match(Cond, MRI,
- m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
- return false;
-
- Info.True = MI.getOperand(2).getReg();
- Info.False = MI.getOperand(3).getReg();
-
- if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
- !(Info.LHS == Info.False && Info.RHS == Info.True))
- return false;
-
- switch (Info.Pred) {
- case CmpInst::FCMP_FALSE:
- case CmpInst::FCMP_OEQ:
- case CmpInst::FCMP_ONE:
- case CmpInst::FCMP_ORD:
- case CmpInst::FCMP_UNO:
- case CmpInst::FCMP_UEQ:
- case CmpInst::FCMP_UNE:
- case CmpInst::FCMP_TRUE:
- return false;
- default:
- return true;
- }
-}
-
-static void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
- const FMinFMaxLegacyInfo &Info) {
-
- auto buildNewInst = [&MI](unsigned Opc, Register X, Register Y) {
- MachineIRBuilder MIB(MI);
- MIB.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
- };
-
- switch (Info.Pred) {
- case CmpInst::FCMP_ULT:
- case CmpInst::FCMP_ULE:
- if (Info.LHS == Info.True)
- buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
- else
- buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
- break;
- case CmpInst::FCMP_OLE:
- case CmpInst::FCMP_OLT: {
- // We need to permute the operands to get the correct NaN behavior. The
- // selected operand is the second one based on the failing compare with NaN,
- // so permute it based on the compare type the hardware uses.
- if (Info.LHS == Info.True)
- buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
- else
- buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
- break;
- }
- case CmpInst::FCMP_UGE:
- case CmpInst::FCMP_UGT: {
- if (Info.LHS == Info.True)
- buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
- else
- buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
- break;
- }
- case CmpInst::FCMP_OGT:
- case CmpInst::FCMP_OGE: {
- if (Info.LHS == Info.True)
- buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
- else
- buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
- break;
- }
- default:
- llvm_unreachable("predicate should not have matched");
- }
-
- MI.eraseFromParent();
-}
-
-
#define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
-#include "AMDGPUGenGICombiner.inc"
+#include "AMDGPUGenPreLegalizeGICombiner.inc"
#undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
namespace {
#define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
-#include "AMDGPUGenGICombiner.inc"
+#include "AMDGPUGenPreLegalizeGICombiner.inc"
#undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
class AMDGPUPreLegalizerCombinerInfo : public CombinerInfo {
@@ -165,13 +66,6 @@ bool AMDGPUPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
return true;
switch (MI.getOpcode()) {
- case TargetOpcode::G_SHL:
- case TargetOpcode::G_LSHR:
- case TargetOpcode::G_ASHR:
- // On some subtargets, 64-bit shift is a quarter rate instruction. In the
- // common case, splitting this into a move and a 32-bit shift is faster and
- // the same code size.
- return Helper.tryCombineShiftToUnmerge(MI, 32);
case TargetOpcode::G_CONCAT_VECTORS:
return Helper.tryCombineConcatVectors(MI);
case TargetOpcode::G_SHUFFLE_VECTOR:
@@ -182,7 +76,7 @@ bool AMDGPUPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
}
#define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
-#include "AMDGPUGenGICombiner.inc"
+#include "AMDGPUGenPreLegalizeGICombiner.inc"
#undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
// Pass boilerplate
@@ -194,7 +88,9 @@ class AMDGPUPreLegalizerCombiner : public MachineFunctionPass {
AMDGPUPreLegalizerCombiner(bool IsOptNone = false);
- StringRef getPassName() const override { return "AMDGPUPreLegalizerCombiner"; }
+ StringRef getPassName() const override {
+ return "AMDGPUPreLegalizerCombiner";
+ }
bool runOnMachineFunction(MachineFunction &MF) override;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 624d099018ac..555b215d8e5e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -218,6 +218,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPULowerKernelAttributesPass(*PR);
initializeAMDGPULowerIntrinsicsPass(*PR);
initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
+ initializeAMDGPUPostLegalizerCombinerPass(*PR);
initializeAMDGPUPreLegalizerCombinerPass(*PR);
initializeAMDGPUPromoteAllocaPass(*PR);
initializeAMDGPUCodeGenPreparePass(*PR);
@@ -623,6 +624,7 @@ class GCNPassConfig final : public AMDGPUPassConfig {
bool addIRTranslator() override;
void addPreLegalizeMachineIR() override;
bool addLegalizeMachineIR() override;
+ void addPreRegBankSelect() override;
bool addRegBankSelect() override;
bool addGlobalInstructionSelect() override;
void addFastRegAlloc() override;
@@ -911,6 +913,11 @@ bool GCNPassConfig::addLegalizeMachineIR() {
return false;
}
+void GCNPassConfig::addPreRegBankSelect() {
+ bool IsOptNone = getOptLevel() == CodeGenOpt::None;
+ addPass(createAMDGPUPostLegalizeCombiner(IsOptNone));
+}
+
bool GCNPassConfig::addRegBankSelect() {
addPass(new RegBankSelect());
return false;
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index bce539dfb6bc..972d90db026a 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -15,8 +15,10 @@ tablegen(LLVM AMDGPUGenSubtargetInfo.inc -gen-subtarget)
set(LLVM_TARGET_DEFINITIONS AMDGPUGISel.td)
tablegen(LLVM AMDGPUGenGlobalISel.inc -gen-global-isel)
-tablegen(LLVM AMDGPUGenGICombiner.inc -gen-global-isel-combiner
+tablegen(LLVM AMDGPUGenPreLegalizeGICombiner.inc -gen-global-isel-combiner
-combiners="AMDGPUPreLegalizerCombinerHelper")
+tablegen(LLVM AMDGPUGenPostLegalizeGICombiner.inc -gen-global-isel-combiner
+ -combiners="AMDGPUPostLegalizerCombinerHelper")
set(LLVM_TARGET_DEFINITIONS R600.td)
tablegen(LLVM R600GenAsmWriter.inc -gen-asm-writer)
@@ -60,6 +62,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUMacroFusion.cpp
AMDGPUMCInstLower.cpp
AMDGPUOpenCLEnqueuedBlockLowering.cpp
+ AMDGPUPostLegalizerCombiner.cpp
AMDGPUPreLegalizerCombiner.cpp
AMDGPUPromoteAlloca.cpp
AMDGPUPropagateAttributes.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-ashr-narrow.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-ashr-narrow.mir
index bd044c77c38a..f57623ff6457 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-ashr-narrow.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-ashr-narrow.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
---
name: narrow_ashr_s64_32_s64amt
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-lshr-narrow.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-lshr-narrow.mir
index 3c41f3409166..c3f1093a3b15 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-lshr-narrow.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-lshr-narrow.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
---
name: narrow_lshr_s64_32_s64amt
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-narrow.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-narrow.mir
index 313303e155bf..41d0260c81f2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-narrow.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-narrow.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
---
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmax_legacy.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmax_legacy.ll
index 2ddc379c79f8..6ac4dc886db0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmax_legacy.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmax_legacy.ll
@@ -253,3 +253,24 @@ define double @v_test_fmax_legacy_ult_f64(double %a, double %b) {
%val = select i1 %cmp, double %b, double %a
ret double %val
}
+
+define <2 x float> @v_test_fmax_legacy_ogt_v2f32(<2 x float> %a, <2 x float> %b) {
+; GFX6-LABEL: v_test_fmax_legacy_ogt_v2f32:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_max_legacy_f32_e32 v0, v0, v2
+; GFX6-NEXT: v_max_legacy_f32_e32 v1, v1, v3
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_test_fmax_legacy_ogt_v2f32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v0, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v1, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+ %cmp = fcmp ogt <2 x float> %a, %b
+ %val = select <2 x i1> %cmp, <2 x float> %a, <2 x float> %b
+ ret <2 x float> %val
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmin_legacy.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmin_legacy.ll
index d11d05365fc7..23b7c9d129aa 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmin_legacy.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmin_legacy.ll
@@ -382,3 +382,24 @@ define float @v_test_fcmp_select_false(float %a, float %b) {
%val = select i1 %cmp, float %a, float %b
ret float %val
}
+
+define <2 x float> @v_test_fmin_legacy_ole_v2f32(<2 x float> %a, <2 x float> %b) {
+; GFX6-LABEL: v_test_fmin_legacy_ole_v2f32:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_min_legacy_f32_e32 v0, v0, v2
+; GFX6-NEXT: v_min_legacy_f32_e32 v1, v1, v3
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_test_fmin_legacy_ole_v2f32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_le_f32_e32 vcc, v0, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT: v_cmp_le_f32_e32 vcc, v1, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+ %cmp = fcmp ole <2 x float> %a, %b
+ %val = select <2 x i1> %cmp, <2 x float> %a, <2 x float> %b
+ ret <2 x float> %val
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll
index f4ede38b26aa..28e4684fface 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll
@@ -26,9 +26,7 @@ define amdgpu_kernel void @test(i32 addrspace(1)* %out) #1 {
; 10 + 9 (36 prepended implicit bytes) + 2(out pointer) = 21 = 0x15
-; OS-UNKNOWN: s_add_u32 s[[LO:[0-9]+]], s0, 44
-; OS-UNKNOWN-NEXT: s_addc_u32 s[[HI:[0-9]+]], s1, 0
-; OS-UNKNOWN-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[LO]]:[[HI]]{{\]}}, 0xa
+; OS-UNKNOWN: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x15
define amdgpu_kernel void @test_implicit(i32 addrspace(1)* %out) #1 {
%implicitarg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
%header.ptr = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll
index e23df92a3030..b968982585f0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll
@@ -286,30 +286,31 @@ define i32 @v_udiv_i32_pow2k_denom(i32 %num) {
; CHECK-LABEL: v_udiv_i32_pow2k_denom:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_movk_i32 s6, 0x1000
-; CHECK-NEXT: v_cvt_f32_u32_e32 v1, s6
-; CHECK-NEXT: v_rcp_iflag_f32_e32 v1, v1
-; CHECK-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1
-; CHECK-NEXT: v_cvt_u32_f32_e32 v1, v1
-; CHECK-NEXT: v_mul_lo_u32 v2, v1, s6
-; CHECK-NEXT: v_mul_hi_u32 v3, v1, s6
-; CHECK-NEXT: v_sub_i32_e32 v4, vcc, 0, v2
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; CHECK-NEXT: s_movk_i32 s4, 0x1000
+; CHECK-NEXT: v_mov_b32_e32 v1, 0x1000
+; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s4
+; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2
+; CHECK-NEXT: v_mul_f32_e32 v2, 0x4f800000, v2
+; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2
+; CHECK-NEXT: v_lshlrev_b32_e32 v3, 12, v2
+; CHECK-NEXT: v_mul_hi_u32 v4, v2, s4
+; CHECK-NEXT: v_sub_i32_e32 v5, vcc, 0, v3
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; CHECK-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; CHECK-NEXT: v_mul_hi_u32 v3, v3, v2
+; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v2, v3
+; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v3
; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; CHECK-NEXT: v_mul_hi_u32 v2, v2, v1
-; CHECK-NEXT: v_add_i32_e64 v3, s[4:5], v1, v2
-; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v2
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; CHECK-NEXT: v_mul_hi_u32 v1, v1, v0
-; CHECK-NEXT: v_mul_lo_u32 v2, v1, s6
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v1
-; CHECK-NEXT: v_subrev_i32_e32 v4, vcc, 1, v1
-; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v0, v2
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], s6, v5
+; CHECK-NEXT: v_mul_hi_u32 v2, v2, v0
+; CHECK-NEXT: v_lshlrev_b32_e32 v3, 12, v2
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v2
+; CHECK-NEXT: v_subrev_i32_e32 v5, vcc, 1, v2
+; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v0, v3
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v1
; CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc
-; CHECK-NEXT: v_cndmask_b32_e64 v0, v3, v1, s[4:5]
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v0, v4, v2, s[4:5]
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
; CHECK-NEXT: s_setpc_b64 s[30:31]
%result = udiv i32 %num, 4096
ret i32 %result
@@ -319,9 +320,9 @@ define <2 x i32> @v_udiv_v2i32_pow2k_denom(<2 x i32> %num) {
; CHECK-LABEL: v_udiv_v2i32_pow2k_denom:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_movk_i32 s8, 0x1000
+; CHECK-NEXT: s_movk_i32 s4, 0x1000
; CHECK-NEXT: v_mov_b32_e32 v2, 0x1000
-; CHECK-NEXT: v_cvt_f32_u32_e32 v3, s8
+; CHECK-NEXT: v_cvt_f32_u32_e32 v3, s4
; CHECK-NEXT: v_cvt_f32_u32_e32 v4, v2
; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3
; CHECK-NEXT: v_rcp_iflag_f32_e32 v4, v4
@@ -329,9 +330,9 @@ define <2 x i32> @v_udiv_v2i32_pow2k_denom(<2 x i32> %num) {
; CHECK-NEXT: v_mul_f32_e32 v4, 0x4f800000, v4
; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3
; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4
-; CHECK-NEXT: v_mul_lo_u32 v5, v3, s8
-; CHECK-NEXT: v_mul_hi_u32 v6, v3, s8
-; CHECK-NEXT: v_mul_lo_u32 v7, v4, v2
+; CHECK-NEXT: v_lshlrev_b32_e32 v5, 12, v3
+; CHECK-NEXT: v_mul_hi_u32 v6, v3, s4
+; CHECK-NEXT: v_lshlrev_b32_e32 v7, 12, v4
; CHECK-NEXT: v_mul_hi_u32 v8, v4, v2
; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v5
; CHECK-NEXT: v_sub_i32_e32 v10, vcc, 0, v7
@@ -349,17 +350,17 @@ define <2 x i32> @v_udiv_v2i32_pow2k_denom(<2 x i32> %num) {
; CHECK-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[4:5]
; CHECK-NEXT: v_mul_hi_u32 v3, v3, v0
; CHECK-NEXT: v_mul_hi_u32 v4, v4, v1
-; CHECK-NEXT: v_mul_lo_u32 v5, v3, s8
+; CHECK-NEXT: v_lshlrev_b32_e32 v5, 12, v3
; CHECK-NEXT: v_add_i32_e32 v6, vcc, 1, v3
; CHECK-NEXT: v_subrev_i32_e32 v7, vcc, 1, v3
-; CHECK-NEXT: v_mul_lo_u32 v8, v4, v2
+; CHECK-NEXT: v_lshlrev_b32_e32 v8, 12, v4
; CHECK-NEXT: v_add_i32_e32 v9, vcc, 1, v4
; CHECK-NEXT: v_subrev_i32_e32 v10, vcc, 1, v4
; CHECK-NEXT: v_sub_i32_e32 v11, vcc, v0, v5
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5
; CHECK-NEXT: v_sub_i32_e64 v0, s[4:5], v1, v8
; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v8
-; CHECK-NEXT: v_cmp_le_u32_e64 s[6:7], s8, v11
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[6:7], v11, v2
; CHECK-NEXT: v_cmp_ge_u32_e64 s[8:9], v0, v2
; CHECK-NEXT: s_and_b64 s[6:7], s[6:7], vcc
; CHECK-NEXT: v_cndmask_b32_e64 v0, v6, v3, s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
index 74ba97b6c95d..45ce6cdf4210 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
@@ -286,14 +286,14 @@ define i32 @v_urem_i32_pow2k_denom(i32 %num) {
; CHECK-LABEL: v_urem_i32_pow2k_denom:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_movk_i32 s6, 0x1000
+; CHECK-NEXT: s_movk_i32 s4, 0x1000
; CHECK-NEXT: v_mov_b32_e32 v1, 0x1000
-; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s6
+; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s4
; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2
; CHECK-NEXT: v_mul_f32_e32 v2, 0x4f800000, v2
; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2
-; CHECK-NEXT: v_mul_lo_u32 v3, v2, s6
-; CHECK-NEXT: v_mul_hi_u32 v4, v2, s6
+; CHECK-NEXT: v_lshlrev_b32_e32 v3, 12, v2
+; CHECK-NEXT: v_mul_hi_u32 v4, v2, s4
; CHECK-NEXT: v_sub_i32_e32 v5, vcc, 0, v3
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
; CHECK-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
@@ -302,9 +302,9 @@ define i32 @v_urem_i32_pow2k_denom(i32 %num) {
; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v3
; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; CHECK-NEXT: v_mul_hi_u32 v2, v2, v0
-; CHECK-NEXT: v_mul_lo_u32 v2, v2, s6
+; CHECK-NEXT: v_lshlrev_b32_e32 v2, 12, v2
; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v0, v2
-; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v3
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v1
; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v3, v1
; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v2
; CHECK-NEXT: v_sub_i32_e64 v0, s[6:7], v3, v1
@@ -320,9 +320,9 @@ define <2 x i32> @v_urem_v2i32_pow2k_denom(<2 x i32> %num) {
; CHECK-LABEL: v_urem_v2i32_pow2k_denom:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_movk_i32 s8, 0x1000
+; CHECK-NEXT: s_movk_i32 s4, 0x1000
; CHECK-NEXT: v_mov_b32_e32 v2, 0x1000
-; CHECK-NEXT: v_cvt_f32_u32_e32 v3, s8
+; CHECK-NEXT: v_cvt_f32_u32_e32 v3, s4
; CHECK-NEXT: v_cvt_f32_u32_e32 v4, v2
; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3
; CHECK-NEXT: v_rcp_iflag_f32_e32 v4, v4
@@ -330,9 +330,9 @@ define <2 x i32> @v_urem_v2i32_pow2k_denom(<2 x i32> %num) {
; CHECK-NEXT: v_mul_f32_e32 v4, 0x4f800000, v4
; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3
; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4
-; CHECK-NEXT: v_mul_lo_u32 v5, v3, s8
-; CHECK-NEXT: v_mul_hi_u32 v6, v3, s8
-; CHECK-NEXT: v_mul_lo_u32 v7, v4, v2
+; CHECK-NEXT: v_lshlrev_b32_e32 v5, 12, v3
+; CHECK-NEXT: v_mul_hi_u32 v6, v3, s4
+; CHECK-NEXT: v_lshlrev_b32_e32 v7, 12, v4
; CHECK-NEXT: v_mul_hi_u32 v8, v4, v2
; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v5
; CHECK-NEXT: v_sub_i32_e32 v10, vcc, 0, v7
@@ -350,11 +350,11 @@ define <2 x i32> @v_urem_v2i32_pow2k_denom(<2 x i32> %num) {
; CHECK-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[4:5]
; CHECK-NEXT: v_mul_hi_u32 v3, v3, v0
; CHECK-NEXT: v_mul_hi_u32 v4, v4, v1
-; CHECK-NEXT: v_mul_lo_u32 v3, v3, s8
-; CHECK-NEXT: v_mul_lo_u32 v4, v4, v2
+; CHECK-NEXT: v_lshlrev_b32_e32 v3, 12, v3
+; CHECK-NEXT: v_lshlrev_b32_e32 v4, 12, v4
; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v0, v3
; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v1, v4
-; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s8, v5
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v5, v2
; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v5, v2
; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3
; CHECK-NEXT: v_sub_i32_e64 v0, s[6:7], v5, v2
More information about the llvm-commits
mailing list