[llvm] [InstCombine] Transform high latency, dependent FSQRT/FDIV into FMUL (PR #87474)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Apr 7 22:07:54 PDT 2024
https://github.com/sushgokh updated https://github.com/llvm/llvm-project/pull/87474
>From e468b2d0f42fc4e5cc0e5903ba086a9a42f9673d Mon Sep 17 00:00:00 2001
From: sgokhale <sgokhale at nvidia.com>
Date: Wed, 3 Apr 2024 15:25:17 +0530
Subject: [PATCH] [InstCombine] Transform high latency, dependent FSQRT/FDIV
 into FMUL

The proposed patch, in general, tries to transform the below code sequence:
x = 1.0 / sqrt (a);
r1 = x * x; // same as 1.0 / a
r2 = a * x; // same as sqrt (a)
TO
(If x, r1 and r2 are all used further in the code)
tmp1 = 1.0 / a
tmp2 = sqrt (a)
tmp3 = tmp1 * tmp2
x = tmp3
r1 = tmp1
r2 = tmp2
The transform tries to make high latency sqrt and div operations independent and also saves on one multiplication.
The patch was tested with SPEC17 suite with cpu=neoverse-v2.
The performance uplift achieved was:
544.nab_r ~4%
No other regressions were observed. Also, no compile time differences were observed with the patch.
Closes #54652
---
.../InstCombine/InstCombineMulDivRem.cpp | 179 ++++++-
.../InstCombine/fsqrtdiv-transform.ll | 463 ++++++++++++++++++
2 files changed, 639 insertions(+), 3 deletions(-)
create mode 100644 llvm/test/Transforms/InstCombine/fsqrtdiv-transform.ll
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index 8c698e52b5a0e6..34f375dc913f28 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -626,6 +626,129 @@ Instruction *InstCombinerImpl::foldPowiReassoc(BinaryOperator &I) {
return nullptr;
}
+static bool isFSqrtDivToFMulLegal(Instruction *X,
+ SmallSetVector<Instruction *, 2> &R1,
+ SmallSetVector<Instruction *, 2> &R2) {
+ // Legality check for rewriting
+ // x = 1/sqrt(a); r1 = x * x; r2 = a * x
+ // into
+ // r1 = 1/a; r2 = sqrt(a); x = r1 * r2
+ // X is 'x', R1 holds the r1 candidates and R2 the r2 candidates. Both sets
+ // must be non-empty because their first elements are read below.
+ BasicBlock *BBx = X->getParent();
+ BasicBlock *BBr1 = R1[0]->getParent();
+ BasicBlock *BBr2 = R2[0]->getParent();
+
+ auto IsStrictFP = [](Instruction *I) {
+ auto *II = dyn_cast<IntrinsicInst>(I);
+ return II && II->isStrictFP();
+ };
+
+ // Check the constraints on instruction X.
+ if (IsStrictFP(X))
+ return false;
+ // X must have at least 4 uses. 3 of them come from
+ // r1 = x * x
+ // r2 = a * x
+ // Post-transform, r1/r2 no longer use 'x', so 'x' needs at least one more
+ // use for its rewritten value to be consumed.
+ if (!X->hasNUsesOrMore(4))
+ return false;
+ // X must carry the allow-reciprocal ('arcp') fast-math flag.
+ if (!X->hasAllowReciprocal())
+ return false;
+
+ // Check the constraints on instructions in R1.
+ auto R1ConstraintsSatisfied = [BBr1, &IsStrictFP](Instruction *I) {
+ if (IsStrictFP(I))
+ return false;
+ // When you have multiple instructions residing in R1 and R2 respectively,
+ // it's difficult to generate combinations of (R1,R2) and then check if we
+ // have the required pattern. So, for now, just be conservative.
+ if (I->getParent() != BBr1)
+ return false;
+ if (I->use_empty())
+ return false;
+ // The optimization tries to convert
+ // R1 = div * div where, div = 1/sqrt(a)
+ // to
+ // R1 = 1/a
+ // This simplification is invalid when a < 0, where sqrt(a) is NaN.
+ if (!I->hasNoNaNs())
+ return false;
+ // sqrt(-0.0) = -0.0, and doing this simplification would change the sign
+ // of the result.
+ return I->hasNoSignedZeros();
+ };
+ if (!llvm::all_of(R1, R1ConstraintsSatisfied))
+ return false;
+
+ // Check the constraints on instructions in R2.
+ auto R2ConstraintsSatisfied = [BBr2, &IsStrictFP](Instruction *I) {
+ if (IsStrictFP(I))
+ return false;
+ // When you have multiple instructions residing in R1 and R2 respectively,
+ // it's difficult to generate combinations of (R1,R2) and then check if we
+ // have the required pattern. So, for now, just be conservative.
+ if (I->getParent() != BBr2)
+ return false;
+ if (I->use_empty())
+ return false;
+ // This simplification changes
+ // R2 = a * 1/sqrt(a)
+ // to
+ // R2 = sqrt(a)
+ // Now, sqrt(-0.0) = -0.0 and doing this simplification would produce -0.0
+ // instead of NaN.
+ return I->hasNoSignedZeros();
+ };
+ if (!llvm::all_of(R2, R2ConstraintsSatisfied))
+ return false;
+
+ // Check the constraints on X, R1 and R2 combined.
+ // fdiv instruction and one of the multiplications must reside in the same
+ // block. If not, the optimized code may execute more ops than before and
+ // this may hamper the performance.
+ return (BBx == BBr1 || BBx == BBr2);
+}
+
+static void getFSqrtDivOptPattern(Value *Div,
+ SmallSetVector<Instruction *, 2> &R1,
+ SmallSetVector<Instruction *, 2> &R2) {
+ // Collects the fmul users of Div = (+/-)1.0 / sqrt(A):
+ // R1 receives every Div * Div,
+ // R2 receives every A * Div, in either operand order.
+ // Both sets are left untouched when Div does not have that form.
+
+ Value *A;
+ if (!match(Div, m_FDiv(m_FPOne(), m_Sqrt(m_Value(A)))) &&
+ !match(Div, m_FDiv(m_SpecificFP(-1.0), m_Sqrt(m_Value(A)))))
+ return;
+
+ for (User *U : Div->users()) {
+ auto *I = dyn_cast<Instruction>(U);
+ if (!I || I->getOpcode() != Instruction::FMul)
+ continue;
+
+ // x * x: both operands are Div.
+ if (match(I, m_FMul(m_Specific(Div), m_Specific(Div)))) {
+ R1.insert(I);
+ continue;
+ }
+
+ // The commutative matcher covers both Div * A and A * Div.
+ if (match(I, m_c_FMul(m_Specific(Div), m_Specific(A))))
+ R2.insert(I);
+ }
+}
+
+static bool delayFMulSqrtTransform(Value *Div) {
+ SmallSetVector<Instruction *, 2> R1, R2; // Div * Div users, a * Div users.
+ getFSqrtDivOptPattern(Div, R1, R2);
+ return !R1.empty() && !R2.empty() && // Non-empty => Div matched an fdiv.
+ isFSqrtDivToFMulLegal(cast<Instruction>(Div), R1, R2);
+}
+
Instruction *InstCombinerImpl::foldFMulReassoc(BinaryOperator &I) {
Value *Op0 = I.getOperand(0);
Value *Op1 = I.getOperand(1);
@@ -705,11 +828,11 @@ Instruction *InstCombinerImpl::foldFMulReassoc(BinaryOperator &I) {
// has the necessary (reassoc) fast-math-flags.
if (I.hasNoSignedZeros() &&
match(Op0, (m_FDiv(m_SpecificFP(1.0), m_Value(Y)))) &&
- match(Y, m_Sqrt(m_Value(X))) && Op1 == X)
+ match(Y, m_Sqrt(m_Value(X))) && Op1 == X && !delayFMulSqrtTransform(Op0))
return BinaryOperator::CreateFDivFMF(X, Y, &I);
if (I.hasNoSignedZeros() &&
match(Op1, (m_FDiv(m_SpecificFP(1.0), m_Value(Y)))) &&
- match(Y, m_Sqrt(m_Value(X))) && Op0 == X)
+ match(Y, m_Sqrt(m_Value(X))) && Op0 == X && !delayFMulSqrtTransform(Op1))
return BinaryOperator::CreateFDivFMF(X, Y, &I);
// Like the similar transform in instsimplify, this requires 'nsz' because
@@ -717,7 +840,8 @@ Instruction *InstCombinerImpl::foldFMulReassoc(BinaryOperator &I) {
if (I.hasNoNaNs() && I.hasNoSignedZeros() && Op0 == Op1 && Op0->hasNUses(2)) {
// Peek through fdiv to find squaring of square root:
// (X / sqrt(Y)) * (X / sqrt(Y)) --> (X * X) / Y
- if (match(Op0, m_FDiv(m_Value(X), m_Sqrt(m_Value(Y))))) {
+ if (match(Op0, m_FDiv(m_Value(X), m_Sqrt(m_Value(Y)))) &&
+ !delayFMulSqrtTransform(Op0)) {
Value *XX = Builder.CreateFMulFMF(X, X, &I);
return BinaryOperator::CreateFDivFMF(XX, Y, &I);
}
@@ -1796,6 +1920,35 @@ static Instruction *foldFDivSqrtDivisor(BinaryOperator &I,
return BinaryOperator::CreateFMulFMF(Op0, NewSqrt, &I);
}
+static Value *convertFSqrtDivIntoFMul(CallInst *CI, Instruction *X,
+ SmallSetVector<Instruction *, 2> &R1,
+ SmallSetVector<Instruction *, 2> &R2,
+ Value *SqrtOp, InstCombiner::BuilderTy &B) {
+ // Rewrites x = (+/-)1/sqrt(a) (X) and its users r1 = x * x (R1) and
+ // r2 = a * x (R2) so that the divide no longer depends on the sqrt call CI.
+ // SqrtOp is 'a'. Returns the value that should replace X; X itself is left
+ // for the caller to replace.
+ const bool IsNegated = match(X, m_FDiv(m_SpecificFP(-1.0), m_Specific(CI)));
+ B.SetInsertPoint(X);
+
+ // 1. r1 = x * x = 1/a for either sign of x. Built with r1's FMF.
+ Value *Tmp1 =
+ B.CreateFDivFMF(ConstantFP::get(R1[0]->getType(), 1.0), SqrtOp, R1[0]);
+ for (Instruction *I : R1)
+ I->replaceAllUsesWith(Tmp1);
+
+ // 2. r2 = a * x is sqrt(a) for x = 1/sqrt(a) but -sqrt(a) for
+ // x = -1/sqrt(a), so the existing sqrt (CI) is negated in the latter case.
+ Value *Tmp2 = IsNegated ? B.CreateFNegFMF(CI, R2[0]) : CI;
+ for (Instruction *I : R2)
+ I->replaceAllUsesWith(Tmp2);
+
+ // 3. x = tmp1 * sqrt(a), negated for the -1/sqrt(a) form. Every new
+ // instruction takes x's FMF.
+ Value *Mul = B.CreateFMulFMF(Tmp1, CI, X);
+ return IsNegated ? B.CreateFNegFMF(Mul, X) : Mul;
+}
+
Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) {
Module *M = I.getModule();
@@ -1820,6 +1973,26 @@ Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) {
return R;
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+ // Convert
+ // x = 1.0/sqrt(a)
+ // r1 = x * x;
+ // r2 = a * x;
+ //
+ // TO
+ //
+ // r1 = 1/a
+ // r2 = sqrt(a)
+ // x = r1 * r2
+ SmallSetVector<Instruction *, 2> R1, R2;
+ getFSqrtDivOptPattern(&I, R1, R2);
+ if (!(R1.empty() || R2.empty()) && isFSqrtDivToFMulLegal(&I, R1, R2)) {
+ CallInst *CI = (CallInst *)((&I)->getOperand(1));
+ Value *SqrtOp = CI->getArgOperand(0);
+ if (Value *D = convertFSqrtDivIntoFMul(CI, &I, R1, R2, SqrtOp, Builder))
+ return replaceInstUsesWith(I, D);
+ }
+
if (isa<Constant>(Op0))
if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
if (Instruction *R = FoldOpIntoSelect(I, SI))
diff --git a/llvm/test/Transforms/InstCombine/fsqrtdiv-transform.ll b/llvm/test/Transforms/InstCombine/fsqrtdiv-transform.ll
new file mode 100644
index 00000000000000..4852337d4b6586
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/fsqrtdiv-transform.ll
@@ -0,0 +1,463 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -passes='instcombine<no-verify-fixpoint>' < %s | FileCheck %s
+
+ at x = global double 0.000000e+00
+ at r1 = global double 0.000000e+00
+ at r2 = global double 0.000000e+00
+ at r3 = global double 0.000000e+00
+
+; div/mul/mul1 all in the same block.
+define void @bb_constraint_case1(double %a) {
+; CHECK-LABEL: define void @bb_constraint_case1(
+; CHECK-SAME: double [[A:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = tail call double @llvm.sqrt.f64(double [[A]])
+; CHECK-NEXT: [[TMP1:%.*]] = fdiv nnan nsz double 1.000000e+00, [[A]]
+; CHECK-NEXT: [[DIV:%.*]] = fmul arcp double [[TMP1]], [[TMP0]]
+; CHECK-NEXT: store double [[DIV]], ptr @x, align 8
+; CHECK-NEXT: store double [[TMP1]], ptr @r1, align 8
+; CHECK-NEXT: store double [[TMP0]], ptr @r2, align 8
+; CHECK-NEXT: ret void
+entry:
+ %sqrt = tail call double @llvm.sqrt.f64(double %a)
+ %div = fdiv arcp double 1.000000e+00, %sqrt
+ store double %div, ptr @x
+ %mul = fmul nnan nsz double %div, %div
+ store double %mul, ptr @r1
+ %mul1 = fmul nsz double %a, %div
+ store double %mul1, ptr @r2
+ ret void
+}
+; div/mul in one block and mul1 in other block with conditional guard.
+define void @bb_constraint_case2(double %a, i32 %d) {
+; CHECK-LABEL: define void @bb_constraint_case2(
+; CHECK-SAME: double [[A:%.*]], i32 [[D:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.sqrt.f64(double [[A]])
+; CHECK-NEXT: [[TMP1:%.*]] = fdiv nnan nsz double 1.000000e+00, [[A]]
+; CHECK-NEXT: [[DIV:%.*]] = fmul arcp double [[TMP1]], [[TMP0]]
+; CHECK-NEXT: store double [[DIV]], ptr @x, align 8
+; CHECK-NEXT: store double [[TMP1]], ptr @r1, align 8
+; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[D]], 0
+; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: store double [[TMP0]], ptr @r2, align 8
+; CHECK-NEXT: br label [[IF_END]]
+; CHECK: if.end:
+; CHECK-NEXT: ret void
+entry:
+ %sqrt = call double @llvm.sqrt.f64(double %a)
+ %div = fdiv arcp double 1.000000e+00, %sqrt
+ store double %div, ptr @x
+ %mul = fmul nnan nsz double %div, %div
+ store double %mul, ptr @r1
+ %tobool.not = icmp eq i32 %d, 0
+ br i1 %tobool.not, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ %mul1 = fmul nsz double %div, %a
+ store double %mul1, ptr @r2
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ ret void
+}
+
+; div in one block. mul/mul1 in other block and conditionally guarded. Don't optimize.
+define void @bb_constraint_case3(double %a, i32 %d) {
+; CHECK-LABEL: define void @bb_constraint_case3(
+; CHECK-SAME: double [[A:%.*]], i32 [[D:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.sqrt.f64(double [[A]])
+; CHECK-NEXT: [[DIV:%.*]] = fdiv arcp double 1.000000e+00, [[TMP0]]
+; CHECK-NEXT: store double [[DIV]], ptr @x, align 8
+; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[D]], 0
+; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: [[MUL:%.*]] = fmul nnan nsz double [[DIV]], [[DIV]]
+; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr @x, align 8
+; CHECK-NEXT: [[MUL1:%.*]] = fmul nsz double [[TMP1]], [[A]]
+; CHECK-NEXT: store double [[MUL1]], ptr @r2, align 8
+; CHECK-NEXT: br label [[IF_END]]
+; CHECK: if.end:
+; CHECK-NEXT: ret void
+entry:
+ %sqrt = call double @llvm.sqrt.f64(double %a)
+ %div = fdiv arcp double 1.000000e+00, %sqrt
+ store double %div, ptr @x
+ %tobool = icmp ne i32 %d, 0
+ br i1 %tobool, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ %mul = fmul nnan nsz double %div, %div
+ store double %mul, ptr @r1
+ %1 = load double, ptr @x
+ %mul1 = fmul nsz double %a, %1
+ store double %mul1, ptr @r2
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ ret void
+}
+
+; div in one block. mul/mul3 each in different block and conditionally guarded. Don't optimize.
+define void @bb_constraint_case4(double %a, i32 %c, i32 %d) {
+; CHECK-LABEL: define void @bb_constraint_case4(
+; CHECK-SAME: double [[A:%.*]], i32 [[C:%.*]], i32 [[D:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.sqrt.f64(double [[A]])
+; CHECK-NEXT: [[DIV:%.*]] = fdiv arcp double 1.000000e+00, [[TMP0]]
+; CHECK-NEXT: store double [[DIV]], ptr @x, align 8
+; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[C]], 0
+; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: [[MUL:%.*]] = fmul nnan nsz double [[DIV]], [[DIV]]
+; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8
+; CHECK-NEXT: br label [[IF_END]]
+; CHECK: if.end:
+; CHECK-NEXT: [[TOBOOL1_NOT:%.*]] = icmp eq i32 [[D]], 0
+; CHECK-NEXT: br i1 [[TOBOOL1_NOT]], label [[IF_END4:%.*]], label [[IF_THEN2:%.*]]
+; CHECK: if.then2:
+; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr @x, align 8
+; CHECK-NEXT: [[MUL3:%.*]] = fmul nsz double [[TMP1]], [[A]]
+; CHECK-NEXT: store double [[MUL3]], ptr @r2, align 8
+; CHECK-NEXT: br label [[IF_END4]]
+; CHECK: if.end4:
+; CHECK-NEXT: ret void
+entry:
+ %sqrt = call double @llvm.sqrt.f64(double %a)
+ %div = fdiv arcp double 1.000000e+00, %sqrt
+ store double %div, ptr @x
+ %tobool = icmp ne i32 %c, 0
+ br i1 %tobool, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ %mul = fmul nnan nsz double %div, %div
+ store double %mul, ptr @r1
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ %tobool1 = icmp ne i32 %d, 0
+ br i1 %tobool1, label %if.then2, label %if.end4
+
+if.then2: ; preds = %if.end
+ %1 = load double, ptr @x
+ %mul3 = fmul nsz double %a, %1
+ store double %mul3, ptr @r2
+ br label %if.end4
+
+if.end4: ; preds = %if.then2, %if.end
+ ret void
+}
+
+; sqrt value comes from different blocks. Don't optimize.
+define void @bb_constraint_case5(double %a, i32 %c) {
+; CHECK-LABEL: define void @bb_constraint_case5(
+; CHECK-SAME: double [[A:%.*]], i32 [[C:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[C]], 0
+; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.sqrt.f64(double [[A]])
+; CHECK-NEXT: br label [[IF_END:%.*]]
+; CHECK: if.else:
+; CHECK-NEXT: [[ADD:%.*]] = fadd double [[A]], 1.000000e+01
+; CHECK-NEXT: [[TMP1:%.*]] = call double @llvm.sqrt.f64(double [[ADD]])
+; CHECK-NEXT: br label [[IF_END]]
+; CHECK: if.end:
+; CHECK-NEXT: [[DOTPN:%.*]] = phi double [ [[TMP0]], [[IF_THEN]] ], [ [[TMP1]], [[IF_ELSE]] ]
+; CHECK-NEXT: [[DIV:%.*]] = fdiv arcp double 1.000000e+00, [[DOTPN]]
+; CHECK-NEXT: [[MUL:%.*]] = fmul nnan nsz double [[DIV]], [[DIV]]
+; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8
+; CHECK-NEXT: [[MUL2:%.*]] = fmul nsz double [[DIV]], [[A]]
+; CHECK-NEXT: store double [[MUL2]], ptr @r2, align 8
+; CHECK-NEXT: ret void
+entry:
+ %tobool = icmp ne i32 %c, 0
+ br i1 %tobool, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ %0 = call double @llvm.sqrt.f64(double %a)
+ br label %if.end
+
+if.else: ; preds = %entry
+ %add = fadd double %a, 1.000000e+01
+ %1 = call double @llvm.sqrt.f64(double %add)
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ %sqrt = phi double[ %0, %if.then], [ %1, %if.else]
+ %div = fdiv arcp double 1.000000e+00, %sqrt
+ %mul = fmul nnan nsz double %div, %div
+ store double %mul, ptr @r1
+ %mul2 = fmul nsz double %a, %div
+ store double %mul2, ptr @r2
+ ret void
+}
+
+; div in one block and conditionally guarded. mul/mul1 in other block. Don't optimize.
+define void @bb_constraint_case6(double %a, i32 %d) {
+; CHECK-LABEL: define void @bb_constraint_case6(
+; CHECK-SAME: double [[A:%.*]], i32 [[D:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[D]], 0
+; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; CHECK: entry.if.end_crit_edge:
+; CHECK-NEXT: [[DOTPRE:%.*]] = load double, ptr @x, align 8
+; CHECK-NEXT: br label [[IF_END1:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: [[TMP0:%.*]] = tail call double @llvm.sqrt.f64(double [[A]])
+; CHECK-NEXT: [[DIV:%.*]] = fdiv arcp double 1.000000e+00, [[TMP0]]
+; CHECK-NEXT: store double [[DIV]], ptr @x, align 8
+; CHECK-NEXT: br label [[IF_END1]]
+; CHECK: if.end:
+; CHECK-NEXT: [[TMP1:%.*]] = phi double [ [[DOTPRE]], [[IF_END]] ], [ [[DIV]], [[IF_THEN]] ]
+; CHECK-NEXT: [[MUL:%.*]] = fmul nnan nsz double [[TMP1]], [[TMP1]]
+; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8
+; CHECK-NEXT: [[MUL1:%.*]] = fmul nsz double [[TMP1]], [[A]]
+; CHECK-NEXT: store double [[MUL1]], ptr @r2, align 8
+; CHECK-NEXT: ret void
+entry:
+ %tobool.not = icmp eq i32 %d, 0
+ br i1 %tobool.not, label %entry.if.end_crit_edge, label %if.then
+
+entry.if.end_crit_edge: ; preds = %entry
+ %.pre = load double, ptr @x
+ br label %if.end
+
+if.then: ; preds = %entry
+ %sqrt = tail call double @llvm.sqrt.f64(double %a)
+ %div = fdiv arcp double 1.000000e+00, %sqrt
+ store double %div, ptr @x
+ br label %if.end
+
+if.end: ; preds = %entry.if.end_crit_edge, %if.then
+ %1 = phi double [ %.pre, %entry.if.end_crit_edge ], [ %div, %if.then ]
+ %mul = fmul nnan nsz double %1, %1
+ store double %mul, ptr @r1
+ %mul1 = fmul nsz double %1, %a
+ store double %mul1, ptr @r2
+ ret void
+}
+
+; value for first mul(i.e. div4.sink) comes from different blocks. Don't optimize.
+define void @bb_constraint_case7(double %a, i32 %c, i32 %d) {
+; CHECK-LABEL: define void @bb_constraint_case7(
+; CHECK-SAME: double [[A:%.*]], i32 [[C:%.*]], i32 [[D:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = tail call double @llvm.sqrt.f64(double [[A]])
+; CHECK-NEXT: [[DIV:%.*]] = fdiv arcp double 1.000000e+00, [[TMP0]]
+; CHECK-NEXT: store double [[DIV]], ptr @x, align 8
+; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[C]], 0
+; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: [[DIV1:%.*]] = fdiv double 3.000000e+00, [[A]]
+; CHECK-NEXT: br label [[IF_END6:%.*]]
+; CHECK: if.else:
+; CHECK-NEXT: [[TOBOOL2_NOT:%.*]] = icmp eq i32 [[D]], 0
+; CHECK-NEXT: br i1 [[TOBOOL2_NOT]], label [[IF_ELSE5:%.*]], label [[IF_THEN3:%.*]]
+; CHECK: if.then3:
+; CHECK-NEXT: [[DIV4:%.*]] = fdiv double 2.000000e+00, [[A]]
+; CHECK-NEXT: br label [[IF_END6]]
+; CHECK: if.else5:
+; CHECK-NEXT: [[MUL:%.*]] = fmul nnan nsz double [[DIV]], [[DIV]]
+; CHECK-NEXT: br label [[IF_END6]]
+; CHECK: if.end6:
+; CHECK-NEXT: [[DIV4_SINK:%.*]] = phi double [ [[DIV4]], [[IF_THEN3]] ], [ [[MUL]], [[IF_ELSE5]] ], [ [[DIV1]], [[IF_THEN]] ]
+; CHECK-NEXT: store double [[DIV4_SINK]], ptr @r1, align 8
+; CHECK-NEXT: [[MUL7:%.*]] = fmul nsz double [[DIV]], [[A]]
+; CHECK-NEXT: store double [[MUL7]], ptr @r2, align 8
+; CHECK-NEXT: ret void
+entry:
+ %sqrt = tail call double @llvm.sqrt.f64(double %a)
+ %div = fdiv arcp double 1.000000e+00, %sqrt
+ store double %div, ptr @x
+ %tobool.not = icmp eq i32 %c, 0
+ br i1 %tobool.not, label %if.else, label %if.then
+
+if.then: ; preds = %entry
+ %div1 = fdiv double 3.000000e+00, %a
+ br label %if.end6
+
+if.else: ; preds = %entry
+ %tobool2.not = icmp eq i32 %d, 0
+ br i1 %tobool2.not, label %if.else5, label %if.then3
+
+if.then3: ; preds = %if.else
+ %div4 = fdiv double 2.000000e+00, %a
+ br label %if.end6
+
+if.else5: ; preds = %if.else
+ %mul = fmul nnan nsz double %div, %div
+ br label %if.end6
+
+if.end6: ; preds = %if.then3, %if.else5, %if.then
+ %div4.sink = phi double [ %div4, %if.then3 ], [ %mul, %if.else5 ], [ %div1, %if.then ]
+ store double %div4.sink, ptr @r1
+ %mul7 = fmul nsz double %a, %div
+ store double %mul7, ptr @r2
+ ret void
+}
+
+; The first mul (%mul) is one arm of a select instruction. It is still optimized.
+define void @bb_constraint_case8(double %a, i32 %c) {
+; CHECK-LABEL: define void @bb_constraint_case8(
+; CHECK-SAME: double [[A:%.*]], i32 [[C:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.sqrt.f64(double [[A]])
+; CHECK-NEXT: [[TMP1:%.*]] = fdiv nnan nsz double 1.000000e+00, [[A]]
+; CHECK-NEXT: [[DIV:%.*]] = fmul arcp double [[TMP1]], [[TMP0]]
+; CHECK-NEXT: store double [[DIV]], ptr @x, align 8
+; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[C]], 0
+; CHECK-NEXT: [[MUL1:%.*]] = fmul double [[A]], [[A]]
+; CHECK-NEXT: [[STOREMERGE:%.*]] = select i1 [[TOBOOL_NOT]], double [[MUL1]], double [[TMP1]]
+; CHECK-NEXT: store double [[STOREMERGE]], ptr @r1, align 8
+; CHECK-NEXT: store double [[TMP0]], ptr @r2, align 8
+; CHECK-NEXT: ret void
+entry:
+ %sqrt = call double @llvm.sqrt.f64(double %a)
+ %div = fdiv arcp double 1.000000e+00, %sqrt
+ store double %div, ptr @x
+ %tobool.not = icmp eq i32 %c, 0
+ %mul1 = fmul double %a, %a
+ %mul = fmul nnan nsz double %div, %div
+ %storemerge = select i1 %tobool.not, double %mul1, double %mul
+ store double %storemerge, ptr @r1
+ %mul2 = fmul nsz double %div, %a
+ store double %mul2, ptr @r2
+ ret void
+}
+
+; Multiple instances of the div * div multiply. All of them are optimized.
+define void @mutiple_multiply_instances(double %a, i32 %c) {
+; CHECK-LABEL: define void @mutiple_multiply_instances(
+; CHECK-SAME: double [[A:%.*]], i32 [[C:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = tail call double @llvm.sqrt.f64(double [[A]])
+; CHECK-NEXT: [[MUL:%.*]] = fdiv nnan nsz double 1.000000e+00, [[A]]
+; CHECK-NEXT: [[DIV:%.*]] = fmul arcp double [[MUL]], [[TMP0]]
+; CHECK-NEXT: store double [[DIV]], ptr @x, align 8
+; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[C]], 0
+; CHECK-NEXT: [[MUL2:%.*]] = fmul double [[A]], [[A]]
+; CHECK-NEXT: [[MUL1:%.*]] = fmul double [[A]], [[A]]
+; CHECK-NEXT: [[MUL_SINK:%.*]] = select i1 [[TOBOOL_NOT]], double [[MUL2]], double [[MUL]]
+; CHECK-NEXT: [[STOREMERGE:%.*]] = select i1 [[TOBOOL_NOT]], double [[MUL]], double [[MUL1]]
+; CHECK-NEXT: store double [[MUL_SINK]], ptr @r1, align 8
+; CHECK-NEXT: store double [[STOREMERGE]], ptr @r3, align 8
+; CHECK-NEXT: store double [[TMP0]], ptr @r2, align 8
+; CHECK-NEXT: ret void
+entry:
+ %sqrt = tail call double @llvm.sqrt.f64(double %a)
+ %div = fdiv arcp double 1.000000e+00, %sqrt
+ store double %div, ptr @x
+ %tobool.not = icmp eq i32 %c, 0
+ %mul2 = fmul double %a, %a
+ %mul3 = fmul nnan nsz double %div, %div
+ %mul = fmul nnan nsz double %div, %div
+ %mul1 = fmul double %a, %a
+ %mul.sink = select i1 %tobool.not, double %mul2, double %mul
+ %storemerge = select i1 %tobool.not, double %mul3, double %mul1
+ store double %mul.sink, ptr @r1
+ store double %storemerge, ptr @r3
+ %mul4 = fmul nsz double %a, %div
+ store double %mul4, ptr @r2
+ ret void
+}
+
+; The fdiv lacks the 'arcp' flag. Don't optimize.
+define void @missing_flags_on_div(double %a) {
+; CHECK-LABEL: define void @missing_flags_on_div(
+; CHECK-SAME: double [[A:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = tail call double @llvm.sqrt.f64(double [[A]])
+; CHECK-NEXT: [[DIV:%.*]] = fdiv double 1.000000e+00, [[TMP0]]
+; CHECK-NEXT: store double [[DIV]], ptr @x, align 8
+; CHECK-NEXT: [[MUL:%.*]] = fmul nnan nsz double [[DIV]], [[DIV]]
+; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8
+; CHECK-NEXT: [[MUL1:%.*]] = fmul nsz double [[DIV]], [[A]]
+; CHECK-NEXT: store double [[MUL1]], ptr @r2, align 8
+; CHECK-NEXT: ret void
+entry:
+ %sqrt = tail call double @llvm.sqrt.f64(double %a)
+ %div = fdiv double 1.000000e+00, %sqrt
+ store double %div, ptr @x
+ %mul = fmul nnan nsz double %div, %div
+ store double %mul, ptr @r1
+ %mul1 = fmul nsz double %a, %div
+ store double %mul1, ptr @r2
+ ret void
+}
+
+; The first mul (div * div) lacks the 'nnan' and 'nsz' flags. Don't optimize.
+define void @missing_flags_on_first_mul(double %a) {
+; CHECK-LABEL: define void @missing_flags_on_first_mul(
+; CHECK-SAME: double [[A:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = tail call double @llvm.sqrt.f64(double [[A]])
+; CHECK-NEXT: [[DIV:%.*]] = fdiv arcp double 1.000000e+00, [[TMP0]]
+; CHECK-NEXT: store double [[DIV]], ptr @x, align 8
+; CHECK-NEXT: [[MUL:%.*]] = fmul double [[DIV]], [[DIV]]
+; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8
+; CHECK-NEXT: [[MUL1:%.*]] = fmul nsz double [[DIV]], [[A]]
+; CHECK-NEXT: store double [[MUL1]], ptr @r2, align 8
+; CHECK-NEXT: ret void
+entry:
+ %sqrt = tail call double @llvm.sqrt.f64(double %a)
+ %div = fdiv arcp double 1.000000e+00, %sqrt
+ store double %div, ptr @x
+ %mul = fmul double %div, %div
+ store double %mul, ptr @r1
+ %mul1 = fmul nsz double %a, %div
+ store double %mul1, ptr @r2
+ ret void
+}
+
+; The second mul (a * div) lacks the 'nsz' flag. Don't optimize.
+define void @missing_flags_on_second_mul(double %a) {
+; CHECK-LABEL: define void @missing_flags_on_second_mul(
+; CHECK-SAME: double [[A:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = tail call double @llvm.sqrt.f64(double [[A]])
+; CHECK-NEXT: [[DIV:%.*]] = fdiv arcp double 1.000000e+00, [[TMP0]]
+; CHECK-NEXT: store double [[DIV]], ptr @x, align 8
+; CHECK-NEXT: [[MUL:%.*]] = fmul nnan nsz double [[DIV]], [[DIV]]
+; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8
+; CHECK-NEXT: [[MUL1:%.*]] = fmul double [[DIV]], [[A]]
+; CHECK-NEXT: store double [[MUL1]], ptr @r2, align 8
+; CHECK-NEXT: ret void
+entry:
+ %sqrt = tail call double @llvm.sqrt.f64(double %a)
+ %div = fdiv arcp double 1.000000e+00, %sqrt
+ store double %div, ptr @x
+ %mul = fmul nnan nsz double %div, %div
+ store double %mul, ptr @r1
+ %mul1 = fmul double %a, %div
+ store double %mul1, ptr @r2
+ ret void
+}
+
+; div = -1/sqrt(a): r1 = 1/a, but r2 = a * div = -sqrt(a), not +sqrt(a) as the CHECK lines below expect.
+define void @negative_fdiv_val(double %a) {
+; CHECK-LABEL: define void @negative_fdiv_val(
+; CHECK-SAME: double [[A:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[SQRT:%.*]] = tail call double @llvm.sqrt.f64(double [[A]])
+; CHECK-NEXT: [[MUL:%.*]] = fdiv nnan nsz double 1.000000e+00, [[A]]
+; CHECK-NEXT: [[TMP1:%.*]] = fneg arcp double [[MUL]]
+; CHECK-NEXT: [[DIV:%.*]] = fmul arcp double [[SQRT]], [[TMP1]]
+; CHECK-NEXT: store double [[DIV]], ptr @x, align 8
+; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8
+; CHECK-NEXT: store double [[SQRT]], ptr @r2, align 8
+; CHECK-NEXT: ret void
+entry:
+ %sqrt = tail call double @llvm.sqrt.f64(double %a)
+ %div = fdiv arcp double -1.000000e+00, %sqrt
+ store double %div, ptr @x
+ %mul = fmul nnan nsz double %div, %div
+ store double %mul, ptr @r1
+ %mul1 = fmul nsz double %a, %div
+ store double %mul1, ptr @r2
+ ret void
+}
+declare double @llvm.sqrt.f64(double)
More information about the llvm-commits
mailing list