[llvm] [InstCombine] Transform high latency, dependent FSQRT/FDIV into FMUL (PR #87474)

Wed Jun 12 22:06:32 PDT 2024

https://github.com/sushgokh updated https://github.com/llvm/llvm-project/pull/87474

>From 6f98efd51e13fdc2bdee75a1401ef821b882136b Mon Sep 17 00:00:00 2001
From: sgokhale <sgokhale at nvidia.com>
Date: Wed, 3 Apr 2024 15:25:17 +0530
Subject: [PATCH] [InstCombine] Transform high latency, dependent FSQRT/FDIV
 into FMUL The proposed patch, in general, tries to transform the below code
 sequence: x = 1.0 / sqrt (a); r1 = x * x;  // same as 1.0 / a r2 = a /
 sqrt(a); // same as sqrt (a)

TO

(If x, r1 and r2 are all used further in the code)
tmp1 = 1.0 / a
tmp2 = sqrt (a)
tmp3 = tmp1 * tmp2
x = tmp3
r1 = tmp1
r2 = tmp2

The transform tries to make high latency sqrt and div operations independent and also saves on one multiplication.

The patch was tested with SPEC17 suite with cpu=neoverse-v2.
The performance uplift achieved was:
544.nab_r   ~4%

No other regressions were observed. Also, no compile time differences were observed with the patch.

Closes #54652
---
 .../InstCombine/InstCombineMulDivRem.cpp      | 160 ++++++
 .../InstCombine/fsqrtdiv-transform.ll         | 514 ++++++++++++++++++
 2 files changed, 674 insertions(+)
 create mode 100644 llvm/test/Transforms/InstCombine/fsqrtdiv-transform.ll

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index 8c698e52b5a0e..23a21056ceae2 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -626,6 +626,88 @@ Instruction *InstCombinerImpl::foldPowiReassoc(BinaryOperator &I) {
   return nullptr;
 }
 
+// Check legality for transforming
+// x = 1.0/sqrt(a)
+// r1 = x * x;
+// r2 = a/sqrt(a);
+//
+// TO
+//
+// r1 = 1/a
+// r2 = sqrt(a)
+// x = r1 * r2
+static bool isFSqrtDivToFMulLegal(Instruction *X, ArrayRef<Instruction *> R1,
+                                  ArrayRef<Instruction *> R2) {
+  BasicBlock *BBx = X->getParent();
+  BasicBlock *BBr1 = R1[0]->getParent();
+  BasicBlock *BBr2 = R2[0]->getParent();
+
+  CallInst *FSqrt = cast<CallInst>(X->getOperand(1));
+  if (!FSqrt->hasAllowReassoc() || !FSqrt->hasNoNaNs() ||
+      !FSqrt->hasNoSignedZeros() || !FSqrt->hasNoInfs())
+    return false;
+
+  // We change x = 1/sqrt(a) to x = sqrt(a) * 1/a . This change isn't allowed
+  // by recip fp as it is strictly meant to transform ops of type a/b to
+  // a * 1/b. So, this can be considered as algebraic rewrite and reassoc flag
+  // has been used(rather abused)in the past for algebraic rewrites.
+  if (!X->hasAllowReassoc() || !X->hasAllowReciprocal() || !X->hasNoInfs())
+    return false;
+
+  // Check the constraints on instructions in R1.
+  if (any_of(R1, [BBr1](Instruction *I) {
+        // When you have multiple instructions residing in R1 and R2
+        // respectively, it's difficult to generate combinations of (R1,R2) and
+        // then check if we have the required pattern. So, for now, just be
+        // conservative.
+        return (I->getParent() != BBr1 || !I->hasAllowReassoc());
+      }))
+    return false;
+
+  // Check the constraints on instructions in R2.
+  if (any_of(R2, [BBr2](Instruction *I) {
+        // When you have multiple instructions residing in R1 and R2
+        // respectively, it's difficult to generate combination of (R1,R2) and
+        // then check if we have the required pattern. So, for now, just be
+        // conservative.
+        return (I->getParent() != BBr2 || !I->hasAllowReassoc());
+      }))
+    return false;
+
+  // Check the constraints on X, R1 and R2 combined.
+  // fdiv instruction and one of the multiplications must reside in the same
+  // block. If not, the optimized code may execute more ops than before and
+  // this may hamper the performance.
+  return (BBx == BBr1 || BBx == BBr2);
+}
+
+static void getFSqrtDivOptPattern(Instruction *Div,
+                                  SmallVectorImpl<Instruction *> &R1,
+                                  SmallVectorImpl<Instruction *> &R2) {
+  Value *A;
+  if (match(Div, m_FDiv(m_FPOne(), m_Sqrt(m_Value(A)))) ||
+      match(Div, m_FDiv(m_SpecificFP(-1.0), m_Sqrt(m_Value(A))))) {
+    for (User *U : Div->users()) {
+      Instruction *I = dyn_cast<Instruction>(U);
+      if (!I || I->getOpcode() != Instruction::FMul)
+        continue;
+
+      if (match(I, m_FMul(m_Specific(Div), m_Specific(Div)))) {
+        R1.push_back(I);
+        continue;
+      }
+    }
+    CallInst *CI = cast<CallInst>(Div->getOperand(1));
+    for (User *U : CI->users()) {
+      Instruction *I = cast<Instruction>(U);
+      if (match(I, m_FDiv(m_Specific(A), m_Sqrt(m_Specific(A))))) {
+        R2.push_back(I);
+        continue;
+      }
+    }
+  }
+}
+
 Instruction *InstCombinerImpl::foldFMulReassoc(BinaryOperator &I) {
   Value *Op0 = I.getOperand(0);
   Value *Op1 = I.getOperand(1);
@@ -1796,6 +1878,64 @@ static Instruction *foldFDivSqrtDivisor(BinaryOperator &I,
   return BinaryOperator::CreateFMulFMF(Op0, NewSqrt, &I);
 }
 
+// Change
+// X = 1/sqrt(a)
+// R1 = X * X
+// R2 = a * X
+//
+// TO
+//
+// Tmp1 = 1/a
+// Tmp2 = sqrt(a)
+// Tmp3 = Tmp1 * Tmp2
+// Replace Uses Of R1 With Tmp1
+// Replace Uses Of R2 With Tmp2
+// Replace Uses Of X With Tmp3
+static Value *convertFSqrtDivIntoFMul(CallInst *CI, Instruction *X,
+                                      ArrayRef<Instruction *> R1,
+                                      ArrayRef<Instruction *> R2, Value *SqrtOp,
+                                      InstCombiner::BuilderTy &B) {
+
+  B.SetInsertPoint(X);
+
+  // Every instance of R1 may have different fpmath metadata and fpmath flags.
+  // We try to preserve them by having seperate fdiv instruction per R1
+  // instance.
+  Instruction *Tmp1;
+  for (Instruction *I : R1) {
+    Tmp1 = cast<Instruction>(
+        B.CreateFDiv(ConstantFP::get(R1[0]->getType(), 1.0), SqrtOp));
+    Tmp1->copyMetadata(*I);
+    Tmp1->copyFastMathFlags(I);
+    I->replaceAllUsesWith(Tmp1);
+  }
+
+  // Although, by value, Tmp2 = CI , every instance of R2 may have different
+  // fpmath metadata and fpmath flags. We try to preserve them by cloning the
+  // call instruction per R2 instance.
+  CallInst *Sqrt = B.CreateUnaryIntrinsic(Intrinsic::sqrt, SqrtOp);
+  Instruction *Tmp2;
+  for (Instruction *I : R2) {
+    Tmp2 = Sqrt->clone();
+    Tmp2->insertBefore(CI);
+    Tmp2->setName("sqrt");
+    Tmp2->copyFastMathFlags(I);
+    Tmp2->copyMetadata(*I);
+    I->replaceAllUsesWith(Tmp2);
+  }
+
+  Instruction *Tmp3;
+  // If X = -1/sqrt(a) initially,then Tmp3 = -(Tmp1*tmp2)
+  if (match(X, m_FDiv(m_SpecificFP(-1.0), m_Specific(CI)))) {
+    Value *Mul = B.CreateFMul(Tmp1, Tmp2);
+    Tmp3 = cast<Instruction>(B.CreateFNegFMF(Mul, X));
+  } else
+    Tmp3 = cast<Instruction>(B.CreateFMulFMF(Tmp1, Tmp2, X));
+  Tmp3->copyMetadata(*X);
+
+  return Tmp3;
+}
+
 Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) {
   Module *M = I.getModule();
 
@@ -1820,6 +1960,26 @@ Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) {
     return R;
 
   Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  // Convert
+  // x = 1.0/sqrt(a)
+  // r1 = x * x;
+  // r2 = a/sqrt(a);
+  //
+  // TO
+  //
+  // r1 = 1/a
+  // r2 = sqrt(a)
+  // x = r1 * r2
+  SmallVector<Instruction *, 2> R1, R2;
+  getFSqrtDivOptPattern(&I, R1, R2);
+  if (!R1.empty() && !R2.empty() && isFSqrtDivToFMulLegal(&I, R1, R2)) {
+    CallInst *CI = cast<CallInst>(I.getOperand(1));
+    Value *SqrtOp = CI->getArgOperand(0);
+    if (Value *D = convertFSqrtDivIntoFMul(CI, &I, R1, R2, SqrtOp, Builder))
+      return replaceInstUsesWith(I, D);
+  }
+
   if (isa<Constant>(Op0))
     if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
       if (Instruction *R = FoldOpIntoSelect(I, SI))
diff --git a/llvm/test/Transforms/InstCombine/fsqrtdiv-transform.ll b/llvm/test/Transforms/InstCombine/fsqrtdiv-transform.ll
new file mode 100644
index 0000000000000..1e59f73763235
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/fsqrtdiv-transform.ll
@@ -0,0 +1,514 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -passes='instcombine<no-verify-fixpoint>' < %s | FileCheck %s
+
+ at x = global double 0.000000e+00
+ at r1 = global double 0.000000e+00
+ at r2 = global double 0.000000e+00
+ at r3 = global double 0.000000e+00
+ at v = global [2 x double] zeroinitializer
+ at v1 = global [2 x double] zeroinitializer
+ at v2 = global [2 x double] zeroinitializer
+
+; div/mul/div1 in the same block.
+define void @bb_constraint_case1(double %a) {
+; CHECK-LABEL: define void @bb_constraint_case1(
+; CHECK-SAME: double [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]])
+; CHECK-NEXT:    [[TMP0:%.*]] = fdiv reassoc double 1.000000e+00, [[A]]
+; CHECK-NEXT:    [[DIV:%.*]] = fdiv reassoc ninf arcp double [[SQRT1]], [[A]]
+; CHECK-NEXT:    store double [[DIV]], ptr @x, align 8
+; CHECK-NEXT:    store double [[TMP0]], ptr @r1, align 8
+; CHECK-NEXT:    store double [[SQRT1]], ptr @r2, align 8
+; CHECK-NEXT:    ret void
+entry:
+  %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a)
+  %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt
+  store double %div, ptr @x
+  %mul = fmul reassoc double %div, %div
+  store double %mul, ptr @r1
+  %div1 = fdiv reassoc double %a, %sqrt
+  store double %div1, ptr @r2
+  ret void
+}
+; div/mul in one block and div1 in other block with conditional guard.
+define void @bb_constraint_case2(double %a, i32 %d) {
+; CHECK-LABEL: define void @bb_constraint_case2(
+; CHECK-SAME: double [[A:%.*]], i32 [[D:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]])
+; CHECK-NEXT:    [[TMP0:%.*]] = fdiv reassoc double 1.000000e+00, [[A]]
+; CHECK-NEXT:    [[DIV:%.*]] = fdiv reassoc ninf arcp double [[SQRT1]], [[A]]
+; CHECK-NEXT:    store double [[DIV]], ptr @x, align 8
+; CHECK-NEXT:    store double [[TMP0]], ptr @r1, align 8
+; CHECK-NEXT:    [[D_NOT:%.*]] = icmp eq i32 [[D]], 0
+; CHECK-NEXT:    br i1 [[D_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    store double [[SQRT1]], ptr @r2, align 8
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    ret void
+entry:
+  %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a)
+  %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt
+  store double %div, ptr @x
+  %mul = fmul reassoc double %div, %div
+  store double %mul, ptr @r1
+  %d.not = icmp eq i32 %d, 0
+  br i1 %d.not, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  %div1 = fdiv reassoc double %a, %sqrt
+  store double %div1, ptr @r2
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+}
+
+; div in one block. mul/div1 in other block and conditionally guarded. Don't optimize.
+define void @bb_constraint_case3(double %a, i32 %d) {
+; CHECK-LABEL: define void @bb_constraint_case3(
+; CHECK-SAME: double [[A:%.*]], i32 [[D:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SQRT:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[A]])
+; CHECK-NEXT:    [[DIV:%.*]] = fdiv reassoc ninf arcp double 1.000000e+00, [[SQRT]]
+; CHECK-NEXT:    store double [[DIV]], ptr @x, align 8
+; CHECK-NEXT:    [[D_NOT:%.*]] = icmp eq i32 [[D]], 0
+; CHECK-NEXT:    br i1 [[D_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[MUL:%.*]] = fmul reassoc double [[DIV]], [[DIV]]
+; CHECK-NEXT:    store double [[MUL]], ptr @r1, align 8
+; CHECK-NEXT:    [[DIV1:%.*]] = fdiv reassoc double [[A]], [[SQRT]]
+; CHECK-NEXT:    store double [[DIV1]], ptr @r2, align 8
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    ret void
+entry:
+  %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a)
+  %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt
+  store double %div, ptr @x
+  %d.not = icmp eq i32 %d, 0
+  br i1 %d.not, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  %mul = fmul reassoc double %div, %div
+  store double %mul, ptr @r1
+  %div1 = fdiv reassoc double %a, %sqrt
+  store double %div1, ptr @r2
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+}
+
+; div in one block. mul/div1 each in different block and conditionally guarded. Don't optimize.
+define void @bb_constraint_case4(double %a, i32 %c, i32 %d) {
+; CHECK-LABEL: define void @bb_constraint_case4(
+; CHECK-SAME: double [[A:%.*]], i32 [[C:%.*]], i32 [[D:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SQRT:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[A]])
+; CHECK-NEXT:    [[DIV:%.*]] = fdiv reassoc ninf arcp double 1.000000e+00, [[SQRT]]
+; CHECK-NEXT:    store double [[DIV]], ptr @x, align 8
+; CHECK-NEXT:    [[C_NOT:%.*]] = icmp eq i32 [[C]], 0
+; CHECK-NEXT:    br i1 [[C_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[MUL:%.*]] = fmul reassoc double [[DIV]], [[DIV]]
+; CHECK-NEXT:    store double [[MUL]], ptr @r1, align 8
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[D_NOT:%.*]] = icmp eq i32 [[D]], 0
+; CHECK-NEXT:    br i1 [[D_NOT]], label [[IF_END1:%.*]], label [[IF_THEN1:%.*]]
+; CHECK:       if.then1:
+; CHECK-NEXT:    [[DIV1:%.*]] = fdiv reassoc double [[A]], [[SQRT]]
+; CHECK-NEXT:    store double [[DIV1]], ptr @r2, align 8
+; CHECK-NEXT:    br label [[IF_END1]]
+; CHECK:       if.end1:
+; CHECK-NEXT:    ret void
+entry:
+  %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a)
+  %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt
+  store double %div, ptr @x
+  %c.not = icmp eq i32 %c, 0
+  br i1 %c.not, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  %mul = fmul reassoc double %div, %div
+  store double %mul, ptr @r1
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  %d.not = icmp eq i32 %d, 0
+  br i1 %d.not, label %if.end1, label %if.then1
+
+if.then1:                                         ; preds = %if.end
+  %div1 = fdiv reassoc double %a, %sqrt
+  store double %div1, ptr @r2
+  br label %if.end1
+
+if.end1:                                          ; preds = %if.then1, %if.end
+  ret void
+}
+
+; sqrt value comes from different blocks. Don't optimize.
+define void @bb_constraint_case5(double %a, i32 %c) {
+; CHECK-LABEL: define void @bb_constraint_case5(
+; CHECK-SAME: double [[A:%.*]], i32 [[C:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[C_NOT:%.*]] = icmp eq i32 [[C]], 0
+; CHECK-NEXT:    br i1 [[C_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[TMP0:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[A]])
+; CHECK-NEXT:    br label [[IF_END:%.*]]
+; CHECK:       if.else:
+; CHECK-NEXT:    [[ADD:%.*]] = fadd double [[A]], 1.000000e+01
+; CHECK-NEXT:    [[TMP1:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[ADD]])
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[SQRT:%.*]] = phi double [ [[TMP0]], [[IF_THEN]] ], [ [[TMP1]], [[IF_ELSE]] ]
+; CHECK-NEXT:    [[DIV:%.*]] = fdiv reassoc ninf arcp double 1.000000e+00, [[SQRT]]
+; CHECK-NEXT:    [[MUL:%.*]] = fmul reassoc double [[DIV]], [[DIV]]
+; CHECK-NEXT:    store double [[MUL]], ptr @r1, align 8
+; CHECK-NEXT:    [[DIV1:%.*]] = fdiv reassoc double [[A]], [[SQRT]]
+; CHECK-NEXT:    store double [[DIV1]], ptr @r2, align 8
+; CHECK-NEXT:    ret void
+entry:
+  %c.not = icmp eq i32 %c, 0
+  br i1 %c.not, label %if.else, label %if.then
+
+if.then:                                          ; preds = %entry
+  %0 = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a)
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %add = fadd double %a, 1.000000e+01
+  %1 = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %add)
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  %sqrt = phi double[ %0, %if.then], [ %1, %if.else]
+  %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt
+  %mul = fmul reassoc double %div, %div
+  store double %mul, ptr @r1
+  %div1 = fdiv reassoc double %a, %sqrt
+  store double %div1, ptr @r2
+  ret void
+}
+
+; div in one block and conditionally guarded. mul/div1 in other block. Don't optimize.
+define void @bb_constraint_case6(double %a, i32 %d) {
+; CHECK-LABEL: define void @bb_constraint_case6(
+; CHECK-SAME: double [[A:%.*]], i32 [[D:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SQRT:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[A]])
+; CHECK-NEXT:    [[D_NOT:%.*]] = icmp eq i32 [[D]], 0
+; CHECK-NEXT:    br i1 [[D_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.else:
+; CHECK-NEXT:    [[TMP0:%.*]] = load double, ptr @x, align 8
+; CHECK-NEXT:    br label [[IF_END:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[TMP1:%.*]] = fdiv reassoc ninf arcp double 1.000000e+00, [[SQRT]]
+; CHECK-NEXT:    store double [[TMP1]], ptr @x, align 8
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[DIV:%.*]] = phi double [ [[TMP0]], [[IF_ELSE]] ], [ [[TMP1]], [[IF_THEN]] ]
+; CHECK-NEXT:    [[MUL:%.*]] = fmul reassoc double [[DIV]], [[DIV]]
+; CHECK-NEXT:    store double [[MUL]], ptr @r1, align 8
+; CHECK-NEXT:    [[DIV1:%.*]] = fdiv reassoc double [[A]], [[SQRT]]
+; CHECK-NEXT:    store double [[DIV1]], ptr @r2, align 8
+; CHECK-NEXT:    ret void
+entry:
+  %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a)
+  %d.not = icmp eq i32 %d, 0
+  br i1 %d.not, label %if.else, label %if.then
+
+if.else:                                          ; preds = %entry
+  %1 = load double, ptr @x
+  br label %if.end
+
+if.then:                                          ; preds = %entry
+  %2 = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt
+  store double %2, ptr @x
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  %div = phi double [ %1, %if.else ], [ %2, %if.then ]
+  %mul = fmul reassoc double %div, %div
+  store double %mul, ptr @r1
+  %div1 = fdiv reassoc double %a, %sqrt
+  store double %div1, ptr @r2
+  ret void
+}
+
+; value for mul comes from different blocks. Don't optimize.
+define void @bb_constraint_case7(double %a, i32 %c, i32 %d) {
+; CHECK-LABEL: define void @bb_constraint_case7(
+; CHECK-SAME: double [[A:%.*]], i32 [[C:%.*]], i32 [[D:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SQRT:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[A]])
+; CHECK-NEXT:    [[DIV:%.*]] = fdiv reassoc ninf arcp double 1.000000e+00, [[SQRT]]
+; CHECK-NEXT:    store double [[DIV]], ptr @x, align 8
+; CHECK-NEXT:    [[C_NOT:%.*]] = icmp eq i32 [[C]], 0
+; CHECK-NEXT:    br i1 [[C_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[TMP0:%.*]] = fdiv double 3.000000e+00, [[A]]
+; CHECK-NEXT:    br label [[IF_END:%.*]]
+; CHECK:       if.else:
+; CHECK-NEXT:    [[D_NOT:%.*]] = icmp eq i32 [[D]], 0
+; CHECK-NEXT:    br i1 [[D_NOT]], label [[IF_ELSE1:%.*]], label [[IF_THEN1:%.*]]
+; CHECK:       if.then1:
+; CHECK-NEXT:    [[TMP1:%.*]] = fdiv double 2.000000e+00, [[A]]
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.else1:
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul reassoc double [[DIV]], [[DIV]]
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[MUL:%.*]] = phi double [ [[TMP1]], [[IF_THEN1]] ], [ [[TMP2]], [[IF_ELSE1]] ], [ [[TMP0]], [[IF_THEN]] ]
+; CHECK-NEXT:    store double [[MUL]], ptr @r1, align 8
+; CHECK-NEXT:    [[DIV1:%.*]] = fdiv reassoc double [[A]], [[SQRT]]
+; CHECK-NEXT:    store double [[DIV1]], ptr @r2, align 8
+; CHECK-NEXT:    ret void
+entry:
+  %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a)
+  %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt
+  store double %div, ptr @x
+  %c.not = icmp eq i32 %c, 0
+  br i1 %c.not, label %if.else, label %if.then
+
+if.then:                                          ; preds = %entry
+  %1 = fdiv double 3.000000e+00, %a
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %d.not = icmp eq i32 %d, 0
+  br i1 %d.not, label %if.else1, label %if.then1
+
+if.then1:                                         ; preds = %if.else
+  %2 = fdiv double 2.000000e+00, %a
+  br label %if.end
+
+if.else1:                                         ; preds = %if.else
+  %3 = fmul reassoc double %div, %div
+  br label %if.end
+
+if.end:                                           ; preds = %if.then1, %if.else1, %if.then
+  %mul = phi double [ %2, %if.then1 ], [ %3, %if.else1 ], [ %1, %if.then ]
+  store double %mul, ptr @r1
+  %div1 = fdiv reassoc double %a, %sqrt
+  store double %div1, ptr @r2
+  ret void
+}
+
+; value of mul  comes from two different blocks(as shown by select ins).
+define void @bb_constraint_case8(double %a, i32 %c) {
+; CHECK-LABEL: define void @bb_constraint_case8(
+; CHECK-SAME: double [[A:%.*]], i32 [[C:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]])
+; CHECK-NEXT:    [[TMP0:%.*]] = fdiv reassoc double 1.000000e+00, [[A]]
+; CHECK-NEXT:    [[DIV:%.*]] = fdiv reassoc ninf arcp double [[SQRT1]], [[A]]
+; CHECK-NEXT:    store double [[DIV]], ptr @x, align 8
+; CHECK-NEXT:    [[C_NOT:%.*]] = icmp eq i32 [[C]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul double [[A]], [[A]]
+; CHECK-NEXT:    [[MUL:%.*]] = select i1 [[C_NOT]], double [[TMP1]], double [[TMP0]]
+; CHECK-NEXT:    store double [[MUL]], ptr @r1, align 8
+; CHECK-NEXT:    store double [[SQRT1]], ptr @r2, align 8
+; CHECK-NEXT:    ret void
+entry:
+  %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a)
+  %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt
+  store double %div, ptr @x
+  %c.not = icmp eq i32 %c, 0
+  %1 = fmul double %a, %a
+  %2 = fmul reassoc double %div, %div
+  %mul = select i1 %c.not, double %1, double %2
+  store double %mul, ptr @r1
+  %div1 = fdiv reassoc double %a, %sqrt
+  store double %div1, ptr @r2
+  ret void
+}
+
+; multiple instances of multiply ops to optimize. Optimize all.
+define void @mutiple_multiply_instances(double %a, i32 %c) {
+; CHECK-LABEL: define void @mutiple_multiply_instances(
+; CHECK-SAME: double [[A:%.*]], i32 [[C:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]])
+; CHECK-NEXT:    [[TMP0:%.*]] = fdiv reassoc double 1.000000e+00, [[A]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fdiv reassoc double 1.000000e+00, [[A]]
+; CHECK-NEXT:    [[DIV:%.*]] = fdiv reassoc ninf arcp double [[SQRT1]], [[A]]
+; CHECK-NEXT:    store double [[DIV]], ptr @x, align 8
+; CHECK-NEXT:    [[C_NOT:%.*]] = icmp eq i32 [[C]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul double [[A]], [[A]]
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul double [[A]], [[A]]
+; CHECK-NEXT:    [[MUL1:%.*]] = select i1 [[C_NOT]], double [[TMP2]], double [[TMP1]]
+; CHECK-NEXT:    [[MUL2:%.*]] = select i1 [[C_NOT]], double [[TMP0]], double [[TMP3]]
+; CHECK-NEXT:    store double [[MUL1]], ptr @r1, align 8
+; CHECK-NEXT:    store double [[MUL2]], ptr @r3, align 8
+; CHECK-NEXT:    store double [[SQRT1]], ptr @r2, align 8
+; CHECK-NEXT:    ret void
+entry:
+  %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a)
+  %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt
+  store double %div, ptr @x
+  %c.not = icmp eq i32 %c, 0
+  %1 = fmul double %a, %a
+  %2 = fmul double %a, %a
+  %3 = fmul reassoc double %div, %div
+  %4 = fmul reassoc double %div, %div
+  %mul1 = select i1 %c.not, double %1, double %3
+  %mul2 = select i1 %c.not, double %4, double %2
+  store double %mul1, ptr @r1
+  store double %mul2, ptr @r3
+  %div1 = fdiv reassoc double %a, %sqrt
+  store double %div1, ptr @r2
+  ret void
+}
+
+; missing flags for optimization.
+define void @missing_arcp_flag_on_div(double %a) {
+; CHECK-LABEL: define void @missing_arcp_flag_on_div(
+; CHECK-SAME: double [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SQRT:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[A]])
+; CHECK-NEXT:    [[DIV:%.*]] = fdiv reassoc ninf double 1.000000e+00, [[SQRT]]
+; CHECK-NEXT:    store double [[DIV]], ptr @x, align 8
+; CHECK-NEXT:    [[MUL:%.*]] = fmul reassoc double [[DIV]], [[DIV]]
+; CHECK-NEXT:    store double [[MUL]], ptr @r1, align 8
+; CHECK-NEXT:    [[DIV1:%.*]] = fdiv reassoc double [[A]], [[SQRT]]
+; CHECK-NEXT:    store double [[DIV1]], ptr @r2, align 8
+; CHECK-NEXT:    ret void
+entry:
+  %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a)
+  %div = fdiv reassoc ninf double 1.000000e+00, %sqrt
+  store double %div, ptr @x
+  %mul = fmul reassoc double %div, %div
+  store double %mul, ptr @r1
+  %div1 = fdiv reassoc double %a, %sqrt
+  store double %div1, ptr @r2
+  ret void
+}
+
+; missing flags for optimization.
+define void @missing_reassoc_flag_on_mul(double %a) {
+; CHECK-LABEL: define void @missing_reassoc_flag_on_mul(
+; CHECK-SAME: double [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SQRT:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[A]])
+; CHECK-NEXT:    [[DIV:%.*]] = fdiv reassoc ninf arcp double 1.000000e+00, [[SQRT]]
+; CHECK-NEXT:    store double [[DIV]], ptr @x, align 8
+; CHECK-NEXT:    [[MUL:%.*]] = fmul double [[DIV]], [[DIV]]
+; CHECK-NEXT:    store double [[MUL]], ptr @r1, align 8
+; CHECK-NEXT:    [[DIV1:%.*]] = fdiv reassoc double [[A]], [[SQRT]]
+; CHECK-NEXT:    store double [[DIV1]], ptr @r2, align 8
+; CHECK-NEXT:    ret void
+entry:
+  %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a)
+  %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt
+  store double %div, ptr @x
+  %mul = fmul double %div, %div
+  store double %mul, ptr @r1
+  %div1 = fdiv reassoc double %a, %sqrt
+  store double %div1, ptr @r2
+  ret void
+}
+
+; missing flags for optimization.
+define void @missing_reassoc_flag_on_div1(double %a) {
+; CHECK-LABEL: define void @missing_reassoc_flag_on_div1(
+; CHECK-SAME: double [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SQRT:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[A]])
+; CHECK-NEXT:    [[DIV:%.*]] = fdiv reassoc ninf arcp double 1.000000e+00, [[SQRT]]
+; CHECK-NEXT:    store double [[DIV]], ptr @x, align 8
+; CHECK-NEXT:    [[MUL:%.*]] = fmul reassoc double [[DIV]], [[DIV]]
+; CHECK-NEXT:    store double [[MUL]], ptr @r1, align 8
+; CHECK-NEXT:    [[DIV1:%.*]] = fdiv double [[A]], [[SQRT]]
+; CHECK-NEXT:    store double [[DIV1]], ptr @r2, align 8
+; CHECK-NEXT:    ret void
+entry:
+  %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a)
+  %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt
+  store double %div, ptr @x
+  %mul = fmul reassoc double %div, %div
+  store double %mul, ptr @r1
+  %div1 = fdiv double %a, %sqrt
+  store double %div1, ptr @r2
+  ret void
+}
+
+; div = -1/sqrt(a)
+define void @negative_fdiv_val(double %a) {
+; CHECK-LABEL: define void @negative_fdiv_val(
+; CHECK-SAME: double [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]])
+; CHECK-NEXT:    [[TMP0:%.*]] = fdiv reassoc double 1.000000e+00, [[A]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fneg reassoc ninf arcp double [[SQRT1]]
+; CHECK-NEXT:    [[DIV:%.*]] = fdiv reassoc ninf arcp double [[TMP1]], [[A]]
+; CHECK-NEXT:    store double [[DIV]], ptr @x, align 8
+; CHECK-NEXT:    store double [[TMP0]], ptr @r1, align 8
+; CHECK-NEXT:    store double [[SQRT1]], ptr @r2, align 8
+; CHECK-NEXT:    ret void
+entry:
+  %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a)
+  %div = fdiv reassoc arcp ninf double -1.000000e+00, %sqrt
+  store double %div, ptr @x
+  %mul = fmul reassoc double %div, %div
+  store double %mul, ptr @r1
+  %div1 = fdiv reassoc double %a, %sqrt
+  store double %div1, ptr @r2
+  ret void
+}
+
+define void @preserve_fpmath_metadata(double %a) {
+; CHECK-LABEL: define void @preserve_fpmath_metadata(
+; CHECK-SAME: double [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]]), !fpmath [[META0:![0-9]+]]
+; CHECK-NEXT:    [[TMP0:%.*]] = fdiv reassoc double 1.000000e+00, [[A]], !fpmath [[META1:![0-9]+]]
+; CHECK-NEXT:    [[DIV:%.*]] = fdiv reassoc ninf arcp double [[SQRT1]], [[A]]
+; CHECK-NEXT:    store double [[DIV]], ptr @x, align 8
+; CHECK-NEXT:    store double [[TMP0]], ptr @r1, align 8
+; CHECK-NEXT:    store double [[SQRT1]], ptr @r2, align 8
+; CHECK-NEXT:    ret void
+entry:
+  %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a), !fpmath !0
+  %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt, !fpmath !1
+  store double %div, ptr @x
+  %mul = fmul reassoc double %div, %div, !fpmath !2
+  store double %mul, ptr @r1
+  %div1 = fdiv reassoc double %a, %sqrt, !fpmath !3
+  store double %div1, ptr @r2
+  ret void
+}
+
+define void @bb_constraint_vector_input(<2 x double> %a) {
+; CHECK-LABEL: define void @bb_constraint_vector_input(
+; CHECK-SAME: <2 x double> [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SQRT1:%.*]] = call reassoc <2 x double> @llvm.sqrt.v2f64(<2 x double> [[A]])
+; CHECK-NEXT:    [[TMP0:%.*]] = fdiv reassoc <2 x double> <double 1.000000e+00, double 1.000000e+00>, [[A]]
+; CHECK-NEXT:    [[DIV:%.*]] = fdiv reassoc ninf arcp <2 x double> [[SQRT1]], [[A]]
+; CHECK-NEXT:    store <2 x double> [[DIV]], ptr @v, align 16
+; CHECK-NEXT:    store <2 x double> [[TMP0]], ptr @v1, align 16
+; CHECK-NEXT:    store <2 x double> [[SQRT1]], ptr @v2, align 16
+; CHECK-NEXT:    ret void
+entry:
+  %sqrt = call reassoc nnan nsz ninf <2 x double> @llvm.sqrt.v2f64(<2 x double> %a)
+  %div = fdiv reassoc arcp ninf <2 x double><double 1.000000e+00, double 1.000000e+00>, %sqrt
+  store <2 x double> %div, ptr @v
+  %mul = fmul reassoc <2 x double> %div, %div
+  store <2 x double> %mul, ptr @v1
+  %div1 = fdiv reassoc <2 x double> %a, %sqrt
+  store <2 x double> %div1, ptr @v2
+  ret void
+}
+
+declare double @llvm.sqrt.f64(double)
+declare <2 x double> @llvm.sqrt.v2f64(<2 x double>)
+
+!0 = !{float 2.5}
+!1 = !{float 3.5}
+!2 = !{float 4.5}
+!3 = !{float 5.5}