[llvm] [SimplifyCFG] Increase budget for FoldTwoEntryPHINode() if the branch is unpredictable. (PR #98495)

Tianqing Wang via llvm-commits llvm-commits at lists.llvm.org
Sun Jul 21 22:33:26 PDT 2024


https://github.com/tianqingw updated https://github.com/llvm/llvm-project/pull/98495

>From 6b5fb0fc8c94ad61bcabe48023b438976629e79a Mon Sep 17 00:00:00 2001
From: Tianqing Wang <tianqing.wang at intel.com>
Date: Thu, 11 Jul 2024 23:16:09 +0800
Subject: [PATCH 1/5] [SimplifyCFG] Increase budget for FoldTwoEntryPHINode()
 if the branch is unpredictable.

The `!unpredictable` metadata has been present for a long time, but
its usage in optimizations is still limited. This patch teaches
`FoldTwoEntryPHINode()` to be more aggressive with an unpredictable
branch to reduce mispredictions.

A TTI interface `getBranchMispredictPenalty()` is added to distinguish
between different hardware, so that we don't go too far on simpler
cores. For simplicity, only a naive x86 implementation is included for
the time being.
---
 .../llvm/Analysis/TargetTransformInfo.h       |  9 ++
 .../llvm/Analysis/TargetTransformInfoImpl.h   |  2 +
 llvm/lib/Analysis/TargetTransformInfo.cpp     |  4 +
 .../lib/Target/X86/X86TargetTransformInfo.cpp |  5 +
 llvm/lib/Target/X86/X86TargetTransformInfo.h  |  2 +
 llvm/lib/Transforms/Utils/SimplifyCFG.cpp     |  9 +-
 .../two-entry-phi-fold-unpredictable.ll       | 96 +++++++++++++++++++
 7 files changed, 124 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/Transforms/SimplifyCFG/two-entry-phi-fold-unpredictable.ll

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index dcdd9f82cde8e..9fbb6e1cff445 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -419,6 +419,11 @@ class TargetTransformInfo {
   /// this factor, it is very likely to be predicted correctly.
   BranchProbability getPredictableBranchThreshold() const;
 
+  // Returns an integer indicating how aggressive the target wants for
+  // eliminating unpredictable branches. A zero return value means extra
+  // optimization applied to them should be minimal.
+  unsigned getBranchMispredictPenalty() const;
+
   /// Return true if branch divergence exists.
   ///
   /// Branch divergence has a significantly negative impact on GPU performance
@@ -1820,6 +1825,7 @@ class TargetTransformInfo::Concept {
                                              ArrayRef<const Value *> Operands,
                                              TargetCostKind CostKind) = 0;
   virtual BranchProbability getPredictableBranchThreshold() = 0;
+  virtual unsigned getBranchMispredictPenalty() = 0;
   virtual bool hasBranchDivergence(const Function *F = nullptr) = 0;
   virtual bool isSourceOfDivergence(const Value *V) = 0;
   virtual bool isAlwaysUniform(const Value *V) = 0;
@@ -2228,6 +2234,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
   BranchProbability getPredictableBranchThreshold() override {
     return Impl.getPredictableBranchThreshold();
   }
+  unsigned getBranchMispredictPenalty() override {
+    return Impl.getBranchMispredictPenalty();
+  }
   bool hasBranchDivergence(const Function *F = nullptr) override {
     return Impl.hasBranchDivergence(F);
   }
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 01624de190d51..d4b65ee6f5bd0 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -99,6 +99,8 @@ class TargetTransformInfoImplBase {
     return BranchProbability(99, 100);
   }
 
+  unsigned getBranchMispredictPenalty() const { return 0; }
+
   bool hasBranchDivergence(const Function *F = nullptr) const { return false; }
 
   bool isSourceOfDivergence(const Value *V) const { return false; }
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index c175d1737e54b..0ee9c8ee0cdf8 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -279,6 +279,10 @@ BranchProbability TargetTransformInfo::getPredictableBranchThreshold() const {
              : TTIImpl->getPredictableBranchThreshold();
 }
 
+unsigned TargetTransformInfo::getBranchMispredictPenalty() const {
+  return TTIImpl->getBranchMispredictPenalty();
+}
+
 bool TargetTransformInfo::hasBranchDivergence(const Function *F) const {
   return TTIImpl->hasBranchDivergence(F);
 }
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 32a3683355b72..984586f4ae5f6 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -6756,3 +6756,8 @@ InstructionCost X86TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
     return AM.Scale != 0;
   return -1;
 }
+
+unsigned X86TTIImpl::getBranchMispredictPenalty() const {
+  // TODO: Hook MispredictPenalty of SchedMachineModel into this.
+  return 14;
+}
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 5eccb1aea308d..d2b5c093e7003 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -294,6 +294,8 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
   bool supportsEfficientVectorElementLoadStore() const;
   bool enableInterleavedAccessVectorization();
 
+  unsigned getBranchMispredictPenalty() const;
+
 private:
   bool supportsGather() const;
   InstructionCost getGSVectorCost(unsigned Opcode, TTI::TargetCostKind CostKind,
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 3fa3c0f1f52b0..128238bba10b0 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -3508,7 +3508,8 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
   // jump to one specific 'then' block (if we have two of them).
   // It isn't beneficial to speculatively execute the code
   // from the block that we know is predictably not entered.
-  if (!DomBI->getMetadata(LLVMContext::MD_unpredictable)) {
+  bool IsUnpredictable = DomBI->getMetadata(LLVMContext::MD_unpredictable);
+  if (!IsUnpredictable) {
     uint64_t TWeight, FWeight;
     if (extractBranchWeights(*DomBI, TWeight, FWeight) &&
         (TWeight + FWeight) != 0) {
@@ -3549,8 +3550,10 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
   // that need to be moved to the dominating block.
   SmallPtrSet<Instruction *, 4> AggressiveInsts;
   InstructionCost Cost = 0;
-  InstructionCost Budget =
-      TwoEntryPHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic;
+  unsigned Threshold = TwoEntryPHINodeFoldingThreshold;
+  if (IsUnpredictable)
+    Threshold += TTI.getBranchMispredictPenalty();
+  InstructionCost Budget = Threshold * TargetTransformInfo::TCC_Basic;
 
   bool Changed = false;
   for (BasicBlock::iterator II = BB->begin(); isa<PHINode>(II);) {
diff --git a/llvm/test/Transforms/SimplifyCFG/two-entry-phi-fold-unpredictable.ll b/llvm/test/Transforms/SimplifyCFG/two-entry-phi-fold-unpredictable.ll
new file mode 100644
index 0000000000000..88aa8a619207d
--- /dev/null
+++ b/llvm/test/Transforms/SimplifyCFG/two-entry-phi-fold-unpredictable.ll
@@ -0,0 +1,96 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
+; Two-entry phi nodes with unpredictable conditions may get increased budget for folding.
+; RUN: opt < %s -S -passes=simplifycfg | FileCheck --check-prefix=CHECK-NOFOLD %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 < %s -S -passes=simplifycfg | FileCheck --check-prefix=CHECK-FOLD %s
+
+define { <2 x float>, <2 x float> } @foo(float %speed, <2 x float> %velocity.coerce0, <2 x float> %velocity.coerce1) {
+; CHECK-NOFOLD-LABEL: define { <2 x float>, <2 x float> } @foo(
+; CHECK-NOFOLD-SAME: float [[SPEED:%.*]], <2 x float> [[VELOCITY_COERCE0:%.*]], <2 x float> [[VELOCITY_COERCE1:%.*]]) {
+; CHECK-NOFOLD-NEXT:  [[ENTRY:.*]]:
+; CHECK-NOFOLD-NEXT:    [[CMP:%.*]] = fcmp fast ogt float [[SPEED]], 0x3F747AE140000000
+; CHECK-NOFOLD-NEXT:    br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]], !unpredictable [[META0:![0-9]+]]
+; CHECK-NOFOLD:       [[IF_THEN]]:
+; CHECK-NOFOLD-NEXT:    [[VELOCITY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[VELOCITY_COERCE0]], i64 0
+; CHECK-NOFOLD-NEXT:    [[MUL_I_I_I_I:%.*]] = fmul fast float [[VELOCITY_SROA_0_0_VEC_EXTRACT]], [[VELOCITY_SROA_0_0_VEC_EXTRACT]]
+; CHECK-NOFOLD-NEXT:    [[VELOCITY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[VELOCITY_COERCE0]], i64 1
+; CHECK-NOFOLD-NEXT:    [[MUL8_I_I_I_I:%.*]] = fmul fast float [[VELOCITY_SROA_0_4_VEC_EXTRACT]], [[VELOCITY_SROA_0_4_VEC_EXTRACT]]
+; CHECK-NOFOLD-NEXT:    [[ADD_I_I_I_I:%.*]] = fadd fast float [[MUL8_I_I_I_I]], [[MUL_I_I_I_I]]
+; CHECK-NOFOLD-NEXT:    [[VELOCITY_SROA_14_8_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[VELOCITY_COERCE1]], i64 0
+; CHECK-NOFOLD-NEXT:    [[MUL13_I_I_I_I:%.*]] = fmul fast float [[VELOCITY_SROA_14_8_VEC_EXTRACT]], [[VELOCITY_SROA_14_8_VEC_EXTRACT]]
+; CHECK-NOFOLD-NEXT:    [[ADD14_I_I_I_I:%.*]] = fadd fast float [[ADD_I_I_I_I]], [[MUL13_I_I_I_I]]
+; CHECK-NOFOLD-NEXT:    [[TMP0:%.*]] = tail call fast noundef float @llvm.sqrt.f32(float [[ADD14_I_I_I_I]])
+; CHECK-NOFOLD-NEXT:    [[MUL_I_I_I:%.*]] = fdiv fast float 0x3FEFD70A40000000, [[TMP0]]
+; CHECK-NOFOLD-NEXT:    [[SUB_I:%.*]] = fmul fast float [[MUL_I_I_I]], [[VELOCITY_SROA_0_0_VEC_EXTRACT]]
+; CHECK-NOFOLD-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[SUB_I]], i64 0
+; CHECK-NOFOLD-NEXT:    [[SUB8_I:%.*]] = fmul fast float [[MUL_I_I_I]], [[VELOCITY_SROA_0_4_VEC_EXTRACT]]
+; CHECK-NOFOLD-NEXT:    [[VELOCITY_SROA_0_4_VEC_INSERT25:%.*]] = insertelement <2 x float> [[TMP1]], float [[SUB8_I]], i64 1
+; CHECK-NOFOLD-NEXT:    [[SUB13_I:%.*]] = fmul fast float [[MUL_I_I_I]], [[VELOCITY_SROA_14_8_VEC_EXTRACT]]
+; CHECK-NOFOLD-NEXT:    [[VELOCITY_SROA_14_8_VEC_INSERT35:%.*]] = insertelement <2 x float> [[VELOCITY_COERCE1]], float [[SUB13_I]], i64 0
+; CHECK-NOFOLD-NEXT:    br label %[[IF_END]]
+; CHECK-NOFOLD:       [[IF_END]]:
+; CHECK-NOFOLD-NEXT:    [[VELOCITY_SROA_0_0:%.*]] = phi nsz <2 x float> [ [[VELOCITY_SROA_0_4_VEC_INSERT25]], %[[IF_THEN]] ], [ zeroinitializer, %[[ENTRY]] ]
+; CHECK-NOFOLD-NEXT:    [[VELOCITY_SROA_14_0:%.*]] = phi nsz <2 x float> [ [[VELOCITY_SROA_14_8_VEC_INSERT35]], %[[IF_THEN]] ], [ zeroinitializer, %[[ENTRY]] ]
+; CHECK-NOFOLD-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> [[VELOCITY_SROA_0_0]], 0
+; CHECK-NOFOLD-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue { <2 x float>, <2 x float> } [[DOTFCA_0_INSERT]], <2 x float> [[VELOCITY_SROA_14_0]], 1
+; CHECK-NOFOLD-NEXT:    ret { <2 x float>, <2 x float> } [[DOTFCA_1_INSERT]]
+;
+; CHECK-FOLD-LABEL: define { <2 x float>, <2 x float> } @foo(
+; CHECK-FOLD-SAME: float [[SPEED:%.*]], <2 x float> [[VELOCITY_COERCE0:%.*]], <2 x float> [[VELOCITY_COERCE1:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-FOLD-NEXT:  [[ENTRY:.*:]]
+; CHECK-FOLD-NEXT:    [[CMP:%.*]] = fcmp fast ogt float [[SPEED]], 0x3F747AE140000000
+; CHECK-FOLD-NEXT:    [[VELOCITY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[VELOCITY_COERCE0]], i64 0
+; CHECK-FOLD-NEXT:    [[MUL_I_I_I_I:%.*]] = fmul fast float [[VELOCITY_SROA_0_0_VEC_EXTRACT]], [[VELOCITY_SROA_0_0_VEC_EXTRACT]]
+; CHECK-FOLD-NEXT:    [[VELOCITY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[VELOCITY_COERCE0]], i64 1
+; CHECK-FOLD-NEXT:    [[MUL8_I_I_I_I:%.*]] = fmul fast float [[VELOCITY_SROA_0_4_VEC_EXTRACT]], [[VELOCITY_SROA_0_4_VEC_EXTRACT]]
+; CHECK-FOLD-NEXT:    [[ADD_I_I_I_I:%.*]] = fadd fast float [[MUL8_I_I_I_I]], [[MUL_I_I_I_I]]
+; CHECK-FOLD-NEXT:    [[VELOCITY_SROA_14_8_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[VELOCITY_COERCE1]], i64 0
+; CHECK-FOLD-NEXT:    [[MUL13_I_I_I_I:%.*]] = fmul fast float [[VELOCITY_SROA_14_8_VEC_EXTRACT]], [[VELOCITY_SROA_14_8_VEC_EXTRACT]]
+; CHECK-FOLD-NEXT:    [[ADD14_I_I_I_I:%.*]] = fadd fast float [[ADD_I_I_I_I]], [[MUL13_I_I_I_I]]
+; CHECK-FOLD-NEXT:    [[TMP0:%.*]] = tail call fast float @llvm.sqrt.f32(float [[ADD14_I_I_I_I]])
+; CHECK-FOLD-NEXT:    [[MUL_I_I_I:%.*]] = fdiv fast float 0x3FEFD70A40000000, [[TMP0]]
+; CHECK-FOLD-NEXT:    [[SUB_I:%.*]] = fmul fast float [[MUL_I_I_I]], [[VELOCITY_SROA_0_0_VEC_EXTRACT]]
+; CHECK-FOLD-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[SUB_I]], i64 0
+; CHECK-FOLD-NEXT:    [[SUB8_I:%.*]] = fmul fast float [[MUL_I_I_I]], [[VELOCITY_SROA_0_4_VEC_EXTRACT]]
+; CHECK-FOLD-NEXT:    [[VELOCITY_SROA_0_4_VEC_INSERT25:%.*]] = insertelement <2 x float> [[TMP1]], float [[SUB8_I]], i64 1
+; CHECK-FOLD-NEXT:    [[SUB13_I:%.*]] = fmul fast float [[MUL_I_I_I]], [[VELOCITY_SROA_14_8_VEC_EXTRACT]]
+; CHECK-FOLD-NEXT:    [[VELOCITY_SROA_14_8_VEC_INSERT35:%.*]] = insertelement <2 x float> [[VELOCITY_COERCE1]], float [[SUB13_I]], i64 0
+; CHECK-FOLD-NEXT:    [[VELOCITY_SROA_0_0:%.*]] = select nsz i1 [[CMP]], <2 x float> [[VELOCITY_SROA_0_4_VEC_INSERT25]], <2 x float> zeroinitializer, !unpredictable [[META0:![0-9]+]]
+; CHECK-FOLD-NEXT:    [[VELOCITY_SROA_14_0:%.*]] = select nsz i1 [[CMP]], <2 x float> [[VELOCITY_SROA_14_8_VEC_INSERT35]], <2 x float> zeroinitializer, !unpredictable [[META0]]
+; CHECK-FOLD-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> [[VELOCITY_SROA_0_0]], 0
+; CHECK-FOLD-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue { <2 x float>, <2 x float> } [[DOTFCA_0_INSERT]], <2 x float> [[VELOCITY_SROA_14_0]], 1
+; CHECK-FOLD-NEXT:    ret { <2 x float>, <2 x float> } [[DOTFCA_1_INSERT]]
+;
+entry:
+  %cmp = fcmp fast ogt float %speed, 0x3F747AE140000000
+  br i1 %cmp, label %if.then, label %if.end, !unpredictable !0
+
+if.then:
+  %velocity.sroa.0.0.vec.extract = extractelement <2 x float> %velocity.coerce0, i64 0
+  %mul.i.i.i.i = fmul fast float %velocity.sroa.0.0.vec.extract, %velocity.sroa.0.0.vec.extract
+  %velocity.sroa.0.4.vec.extract = extractelement <2 x float> %velocity.coerce0, i64 1
+  %mul8.i.i.i.i = fmul fast float %velocity.sroa.0.4.vec.extract, %velocity.sroa.0.4.vec.extract
+  %add.i.i.i.i = fadd fast float %mul8.i.i.i.i, %mul.i.i.i.i
+  %velocity.sroa.14.8.vec.extract = extractelement <2 x float> %velocity.coerce1, i64 0
+  %mul13.i.i.i.i = fmul fast float %velocity.sroa.14.8.vec.extract, %velocity.sroa.14.8.vec.extract
+  %add14.i.i.i.i = fadd fast float %add.i.i.i.i, %mul13.i.i.i.i
+  %0 = tail call fast noundef float @llvm.sqrt.f32(float %add14.i.i.i.i)
+  %mul.i.i.i = fdiv fast float 0x3FEFD70A40000000, %0
+  %sub.i = fmul fast float %mul.i.i.i, %velocity.sroa.0.0.vec.extract
+  %1 = insertelement <2 x float> poison, float %sub.i, i64 0
+  %sub8.i = fmul fast float %mul.i.i.i, %velocity.sroa.0.4.vec.extract
+  %velocity.sroa.0.4.vec.insert25 = insertelement <2 x float> %1, float %sub8.i, i64 1
+  %sub13.i = fmul fast float %mul.i.i.i, %velocity.sroa.14.8.vec.extract
+  %velocity.sroa.14.8.vec.insert35 = insertelement <2 x float> %velocity.coerce1, float %sub13.i, i64 0
+  br label %if.end
+
+if.end:
+  %velocity.sroa.0.0 = phi nsz <2 x float> [ %velocity.sroa.0.4.vec.insert25, %if.then ], [ zeroinitializer, %entry ]
+  %velocity.sroa.14.0 = phi nsz <2 x float> [ %velocity.sroa.14.8.vec.insert35, %if.then ], [ zeroinitializer, %entry ]
+  %.fca.0.insert = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> %velocity.sroa.0.0, 0
+  %.fca.1.insert = insertvalue { <2 x float>, <2 x float> } %.fca.0.insert, <2 x float> %velocity.sroa.14.0, 1
+  ret { <2 x float>, <2 x float> } %.fca.1.insert
+}
+
+declare float @llvm.sqrt.f32(float)
+
+!0 = !{}

>From b6fa2d609527df7827ca312cd777abce52069061 Mon Sep 17 00:00:00 2001
From: Tianqing Wang <tianqing.wang at intel.com>
Date: Wed, 17 Jul 2024 15:58:29 +0800
Subject: [PATCH 2/5] Add debug log.

---
 llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 128238bba10b0..b3f03cdeafa3e 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -3623,8 +3623,9 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
              [](BasicBlock *IfBlock) { return IfBlock->hasAddressTaken(); }))
     return Changed;
 
-  LLVM_DEBUG(dbgs() << "FOUND IF CONDITION!  " << *IfCond
-                    << "  T: " << IfTrue->getName()
+  LLVM_DEBUG(dbgs() << "FOUND IF CONDITION!  " << *IfCond;
+             if (IsUnpredictable) dbgs() << " (unpredictable)";
+             dbgs() << "  T: " << IfTrue->getName()
                     << "  F: " << IfFalse->getName() << "\n");
 
   // If we can still promote the PHI nodes after this gauntlet of tests,

>From 7d1aff233792c0cc309cf1ee015f993cd2f0e7b7 Mon Sep 17 00:00:00 2001
From: Tianqing Wang <tianqing.wang at intel.com>
Date: Fri, 19 Jul 2024 02:06:56 +0800
Subject: [PATCH 3/5] Returns InstructionCost for getBranchMispredictPenalty().

---
 llvm/include/llvm/Analysis/TargetTransformInfo.h    | 13 +++++++------
 .../include/llvm/Analysis/TargetTransformInfoImpl.h |  2 +-
 llvm/lib/Analysis/TargetTransformInfo.cpp           |  2 +-
 llvm/lib/Target/X86/X86TargetTransformInfo.cpp      |  2 +-
 llvm/lib/Target/X86/X86TargetTransformInfo.h        |  2 +-
 llvm/lib/Transforms/Utils/SimplifyCFG.cpp           |  5 ++---
 6 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 9fbb6e1cff445..9f191b2bd51a7 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -419,10 +419,11 @@ class TargetTransformInfo {
   /// this factor, it is very likely to be predicted correctly.
   BranchProbability getPredictableBranchThreshold() const;
 
-  // Returns an integer indicating how aggressive the target wants for
-  // eliminating unpredictable branches. A zero return value means extra
-  // optimization applied to them should be minimal.
-  unsigned getBranchMispredictPenalty() const;
+  /// Returns estimated penalty of a branch misprediction in latency. Indicates
+  /// how aggressive the target wants for eliminating unpredictable branches. A
+  /// zero return value means extra optimization applied to them should be
+  /// minimal.
+  InstructionCost getBranchMispredictPenalty() const;
 
   /// Return true if branch divergence exists.
   ///
@@ -1825,7 +1826,7 @@ class TargetTransformInfo::Concept {
                                              ArrayRef<const Value *> Operands,
                                              TargetCostKind CostKind) = 0;
   virtual BranchProbability getPredictableBranchThreshold() = 0;
-  virtual unsigned getBranchMispredictPenalty() = 0;
+  virtual InstructionCost getBranchMispredictPenalty() = 0;
   virtual bool hasBranchDivergence(const Function *F = nullptr) = 0;
   virtual bool isSourceOfDivergence(const Value *V) = 0;
   virtual bool isAlwaysUniform(const Value *V) = 0;
@@ -2234,7 +2235,7 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
   BranchProbability getPredictableBranchThreshold() override {
     return Impl.getPredictableBranchThreshold();
   }
-  unsigned getBranchMispredictPenalty() override {
+  InstructionCost getBranchMispredictPenalty() override {
     return Impl.getBranchMispredictPenalty();
   }
   bool hasBranchDivergence(const Function *F = nullptr) override {
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index d4b65ee6f5bd0..721238e33a237 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -99,7 +99,7 @@ class TargetTransformInfoImplBase {
     return BranchProbability(99, 100);
   }
 
-  unsigned getBranchMispredictPenalty() const { return 0; }
+  InstructionCost getBranchMispredictPenalty() const { return 0; }
 
   bool hasBranchDivergence(const Function *F = nullptr) const { return false; }
 
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 0ee9c8ee0cdf8..9d650031cee8b 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -279,7 +279,7 @@ BranchProbability TargetTransformInfo::getPredictableBranchThreshold() const {
              : TTIImpl->getPredictableBranchThreshold();
 }
 
-unsigned TargetTransformInfo::getBranchMispredictPenalty() const {
+InstructionCost TargetTransformInfo::getBranchMispredictPenalty() const {
   return TTIImpl->getBranchMispredictPenalty();
 }
 
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 984586f4ae5f6..dc3ac80bdf5cf 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -6757,7 +6757,7 @@ InstructionCost X86TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
   return -1;
 }
 
-unsigned X86TTIImpl::getBranchMispredictPenalty() const {
+InstructionCost X86TTIImpl::getBranchMispredictPenalty() const {
   // TODO: Hook MispredictPenalty of SchedMachineModel into this.
   return 14;
 }
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index d2b5c093e7003..b619090e8e1e0 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -294,7 +294,7 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
   bool supportsEfficientVectorElementLoadStore() const;
   bool enableInterleavedAccessVectorization();
 
-  unsigned getBranchMispredictPenalty() const;
+  InstructionCost getBranchMispredictPenalty() const;
 
 private:
   bool supportsGather() const;
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index b3f03cdeafa3e..94ed8b17f96b0 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -3550,10 +3550,9 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
   // that need to be moved to the dominating block.
   SmallPtrSet<Instruction *, 4> AggressiveInsts;
   InstructionCost Cost = 0;
-  unsigned Threshold = TwoEntryPHINodeFoldingThreshold;
+  InstructionCost Budget = TwoEntryPHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic;
   if (IsUnpredictable)
-    Threshold += TTI.getBranchMispredictPenalty();
-  InstructionCost Budget = Threshold * TargetTransformInfo::TCC_Basic;
+    Budget += TTI.getBranchMispredictPenalty();
 
   bool Changed = false;
   for (BasicBlock::iterator II = BB->begin(); isa<PHINode>(II);) {

>From b0df97907ffe44676a70235edff49ee9077ca899 Mon Sep 17 00:00:00 2001
From: Tianqing Wang <tianqing.wang at intel.com>
Date: Fri, 19 Jul 2024 02:12:23 +0800
Subject: [PATCH 4/5] clang-format.

---
 llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 94ed8b17f96b0..baef29dada701 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -3550,7 +3550,8 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
   // that need to be moved to the dominating block.
   SmallPtrSet<Instruction *, 4> AggressiveInsts;
   InstructionCost Cost = 0;
-  InstructionCost Budget = TwoEntryPHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic;
+  InstructionCost Budget =
+      TwoEntryPHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic;
   if (IsUnpredictable)
     Budget += TTI.getBranchMispredictPenalty();
 

>From c64d6378503f8045cd119f7941a607ebfe78db11 Mon Sep 17 00:00:00 2001
From: Tianqing Wang <tianqing.wang at intel.com>
Date: Mon, 22 Jul 2024 13:33:01 +0800
Subject: [PATCH 5/5] Canonicalize the test.

---
 .../two-entry-phi-fold-unpredictable.ll       | 164 +++++++++---------
 1 file changed, 84 insertions(+), 80 deletions(-)

diff --git a/llvm/test/Transforms/SimplifyCFG/two-entry-phi-fold-unpredictable.ll b/llvm/test/Transforms/SimplifyCFG/two-entry-phi-fold-unpredictable.ll
index 88aa8a619207d..0bce8e3ed7dd3 100644
--- a/llvm/test/Transforms/SimplifyCFG/two-entry-phi-fold-unpredictable.ll
+++ b/llvm/test/Transforms/SimplifyCFG/two-entry-phi-fold-unpredictable.ll
@@ -3,92 +3,96 @@
 ; RUN: opt < %s -S -passes=simplifycfg | FileCheck --check-prefix=CHECK-NOFOLD %s
 ; RUN: opt -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 < %s -S -passes=simplifycfg | FileCheck --check-prefix=CHECK-FOLD %s
 
-define { <2 x float>, <2 x float> } @foo(float %speed, <2 x float> %velocity.coerce0, <2 x float> %velocity.coerce1) {
+define { <2 x float>, <2 x float> } @foo(float %arg, <2 x float> %arg1, <2 x float> %arg2) {
 ; CHECK-NOFOLD-LABEL: define { <2 x float>, <2 x float> } @foo(
-; CHECK-NOFOLD-SAME: float [[SPEED:%.*]], <2 x float> [[VELOCITY_COERCE0:%.*]], <2 x float> [[VELOCITY_COERCE1:%.*]]) {
-; CHECK-NOFOLD-NEXT:  [[ENTRY:.*]]:
-; CHECK-NOFOLD-NEXT:    [[CMP:%.*]] = fcmp fast ogt float [[SPEED]], 0x3F747AE140000000
-; CHECK-NOFOLD-NEXT:    br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]], !unpredictable [[META0:![0-9]+]]
-; CHECK-NOFOLD:       [[IF_THEN]]:
-; CHECK-NOFOLD-NEXT:    [[VELOCITY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[VELOCITY_COERCE0]], i64 0
-; CHECK-NOFOLD-NEXT:    [[MUL_I_I_I_I:%.*]] = fmul fast float [[VELOCITY_SROA_0_0_VEC_EXTRACT]], [[VELOCITY_SROA_0_0_VEC_EXTRACT]]
-; CHECK-NOFOLD-NEXT:    [[VELOCITY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[VELOCITY_COERCE0]], i64 1
-; CHECK-NOFOLD-NEXT:    [[MUL8_I_I_I_I:%.*]] = fmul fast float [[VELOCITY_SROA_0_4_VEC_EXTRACT]], [[VELOCITY_SROA_0_4_VEC_EXTRACT]]
-; CHECK-NOFOLD-NEXT:    [[ADD_I_I_I_I:%.*]] = fadd fast float [[MUL8_I_I_I_I]], [[MUL_I_I_I_I]]
-; CHECK-NOFOLD-NEXT:    [[VELOCITY_SROA_14_8_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[VELOCITY_COERCE1]], i64 0
-; CHECK-NOFOLD-NEXT:    [[MUL13_I_I_I_I:%.*]] = fmul fast float [[VELOCITY_SROA_14_8_VEC_EXTRACT]], [[VELOCITY_SROA_14_8_VEC_EXTRACT]]
-; CHECK-NOFOLD-NEXT:    [[ADD14_I_I_I_I:%.*]] = fadd fast float [[ADD_I_I_I_I]], [[MUL13_I_I_I_I]]
-; CHECK-NOFOLD-NEXT:    [[TMP0:%.*]] = tail call fast noundef float @llvm.sqrt.f32(float [[ADD14_I_I_I_I]])
-; CHECK-NOFOLD-NEXT:    [[MUL_I_I_I:%.*]] = fdiv fast float 0x3FEFD70A40000000, [[TMP0]]
-; CHECK-NOFOLD-NEXT:    [[SUB_I:%.*]] = fmul fast float [[MUL_I_I_I]], [[VELOCITY_SROA_0_0_VEC_EXTRACT]]
-; CHECK-NOFOLD-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[SUB_I]], i64 0
-; CHECK-NOFOLD-NEXT:    [[SUB8_I:%.*]] = fmul fast float [[MUL_I_I_I]], [[VELOCITY_SROA_0_4_VEC_EXTRACT]]
-; CHECK-NOFOLD-NEXT:    [[VELOCITY_SROA_0_4_VEC_INSERT25:%.*]] = insertelement <2 x float> [[TMP1]], float [[SUB8_I]], i64 1
-; CHECK-NOFOLD-NEXT:    [[SUB13_I:%.*]] = fmul fast float [[MUL_I_I_I]], [[VELOCITY_SROA_14_8_VEC_EXTRACT]]
-; CHECK-NOFOLD-NEXT:    [[VELOCITY_SROA_14_8_VEC_INSERT35:%.*]] = insertelement <2 x float> [[VELOCITY_COERCE1]], float [[SUB13_I]], i64 0
-; CHECK-NOFOLD-NEXT:    br label %[[IF_END]]
-; CHECK-NOFOLD:       [[IF_END]]:
-; CHECK-NOFOLD-NEXT:    [[VELOCITY_SROA_0_0:%.*]] = phi nsz <2 x float> [ [[VELOCITY_SROA_0_4_VEC_INSERT25]], %[[IF_THEN]] ], [ zeroinitializer, %[[ENTRY]] ]
-; CHECK-NOFOLD-NEXT:    [[VELOCITY_SROA_14_0:%.*]] = phi nsz <2 x float> [ [[VELOCITY_SROA_14_8_VEC_INSERT35]], %[[IF_THEN]] ], [ zeroinitializer, %[[ENTRY]] ]
-; CHECK-NOFOLD-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> [[VELOCITY_SROA_0_0]], 0
-; CHECK-NOFOLD-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue { <2 x float>, <2 x float> } [[DOTFCA_0_INSERT]], <2 x float> [[VELOCITY_SROA_14_0]], 1
-; CHECK-NOFOLD-NEXT:    ret { <2 x float>, <2 x float> } [[DOTFCA_1_INSERT]]
+; CHECK-NOFOLD-SAME: float [[ARG:%.*]], <2 x float> [[ARG1:%.*]], <2 x float> [[ARG2:%.*]]) {
+; CHECK-NOFOLD-NEXT:  [[BB:.*]]:
+; CHECK-NOFOLD-NEXT:    [[I:%.*]] = fcmp fast ogt float [[ARG]], 0x3F747AE140000000
+; CHECK-NOFOLD-NEXT:    br i1 [[I]], label %[[BB3:.*]], label %[[BB20:.*]], !unpredictable [[META0:![0-9]+]]
+; CHECK-NOFOLD:       [[BB3]]:
+; CHECK-NOFOLD-NEXT:    [[I4:%.*]] = extractelement <2 x float> [[ARG1]], i64 0
+; CHECK-NOFOLD-NEXT:    [[I5:%.*]] = fmul fast float [[I4]], [[I4]]
+; CHECK-NOFOLD-NEXT:    [[I6:%.*]] = extractelement <2 x float> [[ARG1]], i64 1
+; CHECK-NOFOLD-NEXT:    [[I7:%.*]] = fmul fast float [[I6]], [[I6]]
+; CHECK-NOFOLD-NEXT:    [[I8:%.*]] = fadd fast float [[I7]], [[I5]]
+; CHECK-NOFOLD-NEXT:    [[I9:%.*]] = extractelement <2 x float> [[ARG2]], i64 0
+; CHECK-NOFOLD-NEXT:    [[I10:%.*]] = fmul fast float [[I9]], [[I9]]
+; CHECK-NOFOLD-NEXT:    [[I11:%.*]] = fadd fast float [[I8]], [[I10]]
+; CHECK-NOFOLD-NEXT:    [[I12:%.*]] = tail call fast noundef float @llvm.sqrt.f32(float [[I11]])
+; CHECK-NOFOLD-NEXT:    [[I13:%.*]] = fdiv fast float 0x3FEFD70A40000000, [[I12]]
+; CHECK-NOFOLD-NEXT:    [[I14:%.*]] = fmul fast float [[I13]], [[I4]]
+; CHECK-NOFOLD-NEXT:    [[I15:%.*]] = insertelement <2 x float> poison, float [[I14]], i64 0
+; CHECK-NOFOLD-NEXT:    [[I16:%.*]] = fmul fast float [[I13]], [[I6]]
+; CHECK-NOFOLD-NEXT:    [[I17:%.*]] = insertelement <2 x float> [[I15]], float [[I16]], i64 1
+; CHECK-NOFOLD-NEXT:    [[I18:%.*]] = fmul fast float [[I13]], [[I9]]
+; CHECK-NOFOLD-NEXT:    [[I19:%.*]] = insertelement <2 x float> [[ARG2]], float [[I18]], i64 0
+; CHECK-NOFOLD-NEXT:    br label %[[BB20]]
+; CHECK-NOFOLD:       [[BB20]]:
+; CHECK-NOFOLD-NEXT:    [[I21:%.*]] = phi nsz <2 x float> [ [[I17]], %[[BB3]] ], [ zeroinitializer, %[[BB]] ]
+; CHECK-NOFOLD-NEXT:    [[I22:%.*]] = phi nsz <2 x float> [ [[I19]], %[[BB3]] ], [ zeroinitializer, %[[BB]] ]
+; CHECK-NOFOLD-NEXT:    [[I23:%.*]] = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> [[I21]], 0
+; CHECK-NOFOLD-NEXT:    [[I24:%.*]] = insertvalue { <2 x float>, <2 x float> } [[I23]], <2 x float> [[I22]], 1
+; CHECK-NOFOLD-NEXT:    ret { <2 x float>, <2 x float> } [[I24]]
 ;
 ; CHECK-FOLD-LABEL: define { <2 x float>, <2 x float> } @foo(
-; CHECK-FOLD-SAME: float [[SPEED:%.*]], <2 x float> [[VELOCITY_COERCE0:%.*]], <2 x float> [[VELOCITY_COERCE1:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-FOLD-NEXT:  [[ENTRY:.*:]]
-; CHECK-FOLD-NEXT:    [[CMP:%.*]] = fcmp fast ogt float [[SPEED]], 0x3F747AE140000000
-; CHECK-FOLD-NEXT:    [[VELOCITY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[VELOCITY_COERCE0]], i64 0
-; CHECK-FOLD-NEXT:    [[MUL_I_I_I_I:%.*]] = fmul fast float [[VELOCITY_SROA_0_0_VEC_EXTRACT]], [[VELOCITY_SROA_0_0_VEC_EXTRACT]]
-; CHECK-FOLD-NEXT:    [[VELOCITY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[VELOCITY_COERCE0]], i64 1
-; CHECK-FOLD-NEXT:    [[MUL8_I_I_I_I:%.*]] = fmul fast float [[VELOCITY_SROA_0_4_VEC_EXTRACT]], [[VELOCITY_SROA_0_4_VEC_EXTRACT]]
-; CHECK-FOLD-NEXT:    [[ADD_I_I_I_I:%.*]] = fadd fast float [[MUL8_I_I_I_I]], [[MUL_I_I_I_I]]
-; CHECK-FOLD-NEXT:    [[VELOCITY_SROA_14_8_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[VELOCITY_COERCE1]], i64 0
-; CHECK-FOLD-NEXT:    [[MUL13_I_I_I_I:%.*]] = fmul fast float [[VELOCITY_SROA_14_8_VEC_EXTRACT]], [[VELOCITY_SROA_14_8_VEC_EXTRACT]]
-; CHECK-FOLD-NEXT:    [[ADD14_I_I_I_I:%.*]] = fadd fast float [[ADD_I_I_I_I]], [[MUL13_I_I_I_I]]
-; CHECK-FOLD-NEXT:    [[TMP0:%.*]] = tail call fast float @llvm.sqrt.f32(float [[ADD14_I_I_I_I]])
-; CHECK-FOLD-NEXT:    [[MUL_I_I_I:%.*]] = fdiv fast float 0x3FEFD70A40000000, [[TMP0]]
-; CHECK-FOLD-NEXT:    [[SUB_I:%.*]] = fmul fast float [[MUL_I_I_I]], [[VELOCITY_SROA_0_0_VEC_EXTRACT]]
-; CHECK-FOLD-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[SUB_I]], i64 0
-; CHECK-FOLD-NEXT:    [[SUB8_I:%.*]] = fmul fast float [[MUL_I_I_I]], [[VELOCITY_SROA_0_4_VEC_EXTRACT]]
-; CHECK-FOLD-NEXT:    [[VELOCITY_SROA_0_4_VEC_INSERT25:%.*]] = insertelement <2 x float> [[TMP1]], float [[SUB8_I]], i64 1
-; CHECK-FOLD-NEXT:    [[SUB13_I:%.*]] = fmul fast float [[MUL_I_I_I]], [[VELOCITY_SROA_14_8_VEC_EXTRACT]]
-; CHECK-FOLD-NEXT:    [[VELOCITY_SROA_14_8_VEC_INSERT35:%.*]] = insertelement <2 x float> [[VELOCITY_COERCE1]], float [[SUB13_I]], i64 0
-; CHECK-FOLD-NEXT:    [[VELOCITY_SROA_0_0:%.*]] = select nsz i1 [[CMP]], <2 x float> [[VELOCITY_SROA_0_4_VEC_INSERT25]], <2 x float> zeroinitializer, !unpredictable [[META0:![0-9]+]]
-; CHECK-FOLD-NEXT:    [[VELOCITY_SROA_14_0:%.*]] = select nsz i1 [[CMP]], <2 x float> [[VELOCITY_SROA_14_8_VEC_INSERT35]], <2 x float> zeroinitializer, !unpredictable [[META0]]
-; CHECK-FOLD-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> [[VELOCITY_SROA_0_0]], 0
-; CHECK-FOLD-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue { <2 x float>, <2 x float> } [[DOTFCA_0_INSERT]], <2 x float> [[VELOCITY_SROA_14_0]], 1
-; CHECK-FOLD-NEXT:    ret { <2 x float>, <2 x float> } [[DOTFCA_1_INSERT]]
+; CHECK-FOLD-SAME: float [[ARG:%.*]], <2 x float> [[ARG1:%.*]], <2 x float> [[ARG2:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-FOLD-NEXT:  [[BB:.*]]:
+; CHECK-FOLD-NEXT:    [[I:%.*]] = fcmp fast ogt float [[ARG]], 0x3F747AE140000000
+; CHECK-FOLD-NEXT:    br i1 [[I]], label %[[BB3:.*]], label %[[BB20:.*]], !unpredictable [[META0:![0-9]+]]
+; CHECK-FOLD:       [[BB3]]:
+; CHECK-FOLD-NEXT:    [[I4:%.*]] = extractelement <2 x float> [[ARG1]], i64 0
+; CHECK-FOLD-NEXT:    [[I5:%.*]] = fmul fast float [[I4]], [[I4]]
+; CHECK-FOLD-NEXT:    [[I6:%.*]] = extractelement <2 x float> [[ARG1]], i64 1
+; CHECK-FOLD-NEXT:    [[I7:%.*]] = fmul fast float [[I6]], [[I6]]
+; CHECK-FOLD-NEXT:    [[I8:%.*]] = fadd fast float [[I7]], [[I5]]
+; CHECK-FOLD-NEXT:    [[I9:%.*]] = extractelement <2 x float> [[ARG2]], i64 0
+; CHECK-FOLD-NEXT:    [[I10:%.*]] = fmul fast float [[I9]], [[I9]]
+; CHECK-FOLD-NEXT:    [[I11:%.*]] = fadd fast float [[I8]], [[I10]]
+; CHECK-FOLD-NEXT:    [[I12:%.*]] = tail call fast noundef float @llvm.sqrt.f32(float [[I11]])
+; CHECK-FOLD-NEXT:    [[I13:%.*]] = fdiv fast float 0x3FEFD70A40000000, [[I12]]
+; CHECK-FOLD-NEXT:    [[I14:%.*]] = fmul fast float [[I13]], [[I4]]
+; CHECK-FOLD-NEXT:    [[I15:%.*]] = insertelement <2 x float> poison, float [[I14]], i64 0
+; CHECK-FOLD-NEXT:    [[I16:%.*]] = fmul fast float [[I13]], [[I6]]
+; CHECK-FOLD-NEXT:    [[I17:%.*]] = insertelement <2 x float> [[I15]], float [[I16]], i64 1
+; CHECK-FOLD-NEXT:    [[I18:%.*]] = fmul fast float [[I13]], [[I9]]
+; CHECK-FOLD-NEXT:    [[I19:%.*]] = insertelement <2 x float> [[ARG2]], float [[I18]], i64 0
+; CHECK-FOLD-NEXT:    br label %[[BB20]]
+; CHECK-FOLD:       [[BB20]]:
+; CHECK-FOLD-NEXT:    [[I21:%.*]] = phi nsz <2 x float> [ [[I17]], %[[BB3]] ], [ zeroinitializer, %[[BB]] ]
+; CHECK-FOLD-NEXT:    [[I22:%.*]] = phi nsz <2 x float> [ [[I19]], %[[BB3]] ], [ zeroinitializer, %[[BB]] ]
+; CHECK-FOLD-NEXT:    [[I23:%.*]] = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> [[I21]], 0
+; CHECK-FOLD-NEXT:    [[I24:%.*]] = insertvalue { <2 x float>, <2 x float> } [[I23]], <2 x float> [[I22]], 1
+; CHECK-FOLD-NEXT:    ret { <2 x float>, <2 x float> } [[I24]]
 ;
-entry:
-  %cmp = fcmp fast ogt float %speed, 0x3F747AE140000000
-  br i1 %cmp, label %if.then, label %if.end, !unpredictable !0
+bb:
+  %i = fcmp fast ogt float %arg, 0x3F747AE140000000
+  br i1 %i, label %bb3, label %bb20, !unpredictable !0
 
-if.then:
-  %velocity.sroa.0.0.vec.extract = extractelement <2 x float> %velocity.coerce0, i64 0
-  %mul.i.i.i.i = fmul fast float %velocity.sroa.0.0.vec.extract, %velocity.sroa.0.0.vec.extract
-  %velocity.sroa.0.4.vec.extract = extractelement <2 x float> %velocity.coerce0, i64 1
-  %mul8.i.i.i.i = fmul fast float %velocity.sroa.0.4.vec.extract, %velocity.sroa.0.4.vec.extract
-  %add.i.i.i.i = fadd fast float %mul8.i.i.i.i, %mul.i.i.i.i
-  %velocity.sroa.14.8.vec.extract = extractelement <2 x float> %velocity.coerce1, i64 0
-  %mul13.i.i.i.i = fmul fast float %velocity.sroa.14.8.vec.extract, %velocity.sroa.14.8.vec.extract
-  %add14.i.i.i.i = fadd fast float %add.i.i.i.i, %mul13.i.i.i.i
-  %0 = tail call fast noundef float @llvm.sqrt.f32(float %add14.i.i.i.i)
-  %mul.i.i.i = fdiv fast float 0x3FEFD70A40000000, %0
-  %sub.i = fmul fast float %mul.i.i.i, %velocity.sroa.0.0.vec.extract
-  %1 = insertelement <2 x float> poison, float %sub.i, i64 0
-  %sub8.i = fmul fast float %mul.i.i.i, %velocity.sroa.0.4.vec.extract
-  %velocity.sroa.0.4.vec.insert25 = insertelement <2 x float> %1, float %sub8.i, i64 1
-  %sub13.i = fmul fast float %mul.i.i.i, %velocity.sroa.14.8.vec.extract
-  %velocity.sroa.14.8.vec.insert35 = insertelement <2 x float> %velocity.coerce1, float %sub13.i, i64 0
-  br label %if.end
+bb3:                                              ; preds = %bb
+  %i4 = extractelement <2 x float> %arg1, i64 0
+  %i5 = fmul fast float %i4, %i4
+  %i6 = extractelement <2 x float> %arg1, i64 1
+  %i7 = fmul fast float %i6, %i6
+  %i8 = fadd fast float %i7, %i5
+  %i9 = extractelement <2 x float> %arg2, i64 0
+  %i10 = fmul fast float %i9, %i9
+  %i11 = fadd fast float %i8, %i10
+  %i12 = tail call fast noundef float @llvm.sqrt.f32(float %i11)
+  %i13 = fdiv fast float 0x3FEFD70A40000000, %i12
+  %i14 = fmul fast float %i13, %i4
+  %i15 = insertelement <2 x float> poison, float %i14, i64 0
+  %i16 = fmul fast float %i13, %i6
+  %i17 = insertelement <2 x float> %i15, float %i16, i64 1
+  %i18 = fmul fast float %i13, %i9
+  %i19 = insertelement <2 x float> %arg2, float %i18, i64 0
+  br label %bb20
 
-if.end:
-  %velocity.sroa.0.0 = phi nsz <2 x float> [ %velocity.sroa.0.4.vec.insert25, %if.then ], [ zeroinitializer, %entry ]
-  %velocity.sroa.14.0 = phi nsz <2 x float> [ %velocity.sroa.14.8.vec.insert35, %if.then ], [ zeroinitializer, %entry ]
-  %.fca.0.insert = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> %velocity.sroa.0.0, 0
-  %.fca.1.insert = insertvalue { <2 x float>, <2 x float> } %.fca.0.insert, <2 x float> %velocity.sroa.14.0, 1
-  ret { <2 x float>, <2 x float> } %.fca.1.insert
+bb20:                                             ; preds = %bb3, %bb
+  %i21 = phi nsz <2 x float> [ %i17, %bb3 ], [ zeroinitializer, %bb ]
+  %i22 = phi nsz <2 x float> [ %i19, %bb3 ], [ zeroinitializer, %bb ]
+  %i23 = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> %i21, 0
+  %i24 = insertvalue { <2 x float>, <2 x float> } %i23, <2 x float> %i22, 1
+  ret { <2 x float>, <2 x float> } %i24
 }
 
 declare float @llvm.sqrt.f32(float)



More information about the llvm-commits mailing list