[llvm] f3e1f44 - [IVDescriptor] Get the exact FP instruction that does not allow reordering

Wed Jan 26 21:34:54 PST 2022

Author: Congzhe Cao
Date: 2022-01-27T00:33:46-05:00
New Revision: f3e1f44340dc26e3810d601edf0e052813b7a11c

URL: https://github.com/llvm/llvm-project/commit/f3e1f44340dc26e3810d601edf0e052813b7a11c
DIFF: https://github.com/llvm/llvm-project/commit/f3e1f44340dc26e3810d601edf0e052813b7a11c.diff

LOG: [IVDescriptor] Get the exact FP instruction that does not allow reordering

This is a bugfix in IVDescriptor.cpp.

The helper function `RecurrenceDescriptor::getExactFPMathInst()`
is supposed to return the 1st FP instruction that does not allow
reordering. However, when constructing the RecurrenceDescriptor,
we trace the use-def chain staring from a PHI node and for each
instruction in the use-def chain, its descriptor overrides the
previous one. Therefore in the final RecurrenceDescriptor we
constructed, we lose previous FP instructions that does not allow
reordering.

Reviewed By: kmclaughlin

Differential Revision: https://reviews.llvm.org/D118073

Added: 
    

Modified: 
    llvm/lib/Analysis/IVDescriptors.cpp
    llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index e40b08b322637..44b1d94ebdc80 100644

--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -309,6 +309,10 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
   // flags from all the reduction operations.
   FastMathFlags FMF = FastMathFlags::getFast();
 
+  // The first instruction in the use-def chain of the Phi node that requires
+  // exact floating point operations.
+  Instruction *ExactFPMathInst = nullptr;
+
   // A value in the reduction can be used:
   //  - By the reduction:
   //      - Reduction operation:
@@ -352,6 +356,9 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
     if (Cur != Start) {
       ReduxDesc =
           isRecurrenceInstr(TheLoop, Phi, Cur, Kind, ReduxDesc, FuncFMF);
+      ExactFPMathInst = ExactFPMathInst == nullptr
+                            ? ReduxDesc.getExactFPMathInst()
+                            : ExactFPMathInst;
       if (!ReduxDesc.isRecurrence())
         return false;
       // FIXME: FMF is allowed on phi, but propagation is not handled correctly.
@@ -480,8 +487,8 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
   if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction)
     return false;
 
-  const bool IsOrdered = checkOrderedReduction(
-      Kind, ReduxDesc.getExactFPMathInst(), ExitInstruction, Phi);
+  const bool IsOrdered =
+      checkOrderedReduction(Kind, ExactFPMathInst, ExitInstruction, Phi);
 
   if (Start != Phi) {
     // If the starting value is not the same as the phi node, we speculatively
@@ -538,9 +545,8 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
   // is saved as part of the RecurrenceDescriptor.
 
   // Save the description of this reduction variable.
-  RecurrenceDescriptor RD(RdxStart, ExitInstruction, Kind, FMF,
-                          ReduxDesc.getExactFPMathInst(), RecurrenceType,
-                          IsSigned, IsOrdered, CastInsts,
+  RecurrenceDescriptor RD(RdxStart, ExitInstruction, Kind, FMF, ExactFPMathInst,
+                          RecurrenceType, IsSigned, IsOrdered, CastInsts,
                           MinWidthCastToRecurrenceType);
   RedDes = RD;
 

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
index 3aa88d7fc18c1..a0dba427ce3a5 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
@@ -585,6 +585,55 @@ for.end:                                         ; preds = %for.body
   ret float %rdx
 }
 
+; Negative test - loop contains two fadds and only one fadd has the fast flag,
+; which we cannot safely reorder.
+define float @fadd_multiple_one_flag(float* noalias nocapture %a, float* noalias nocapture %b, i64 %n) {
+; CHECK-ORDERED-LABEL: @fadd_multiple_one_flag
+; CHECK-ORDERED-NOT: vector.body
+
+; CHECK-UNORDERED-LABEL: @fadd_multiple_one_flag
+; CHECK-UNORDERED: vector.body
+; CHECK-UNORDERED: %[[PHI:.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[VEC_LOAD1:.*]] = load <8 x float>, <8 x float>
+; CHECK-UNORDERED: %[[VEC_FADD1:.*]] = fadd <8 x float> %[[PHI]], %[[VEC_LOAD1]]
+; CHECK-UNORDERED: %[[VEC_LOAD2:.*]] = load <8 x float>, <8 x float>
+; CHECK-UNORDERED: %[[VEC_FADD2]] = fadd fast <8 x float> %[[VEC_FADD1]], %[[VEC_LOAD2]]
+; CHECK-UNORDERED: middle.block
+; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> %[[VEC_FADD2]])
+; CHECK-UNORDERED: for.body
+; CHECK-UNORDERED: %[[SUM:.*]] = phi float [ %bc.merge.rdx, %scalar.ph ], [ %[[FADD2:.*]], %for.body ]
+; CHECK-UNORDERED: %[[LOAD1:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FADD1:.*]] = fadd float %sum, %[[LOAD1]]
+; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FADD2]] = fadd fast float %[[FADD1]], %[[LOAD2]]
+; CHECK-UNORDERED: for.end
+; CHECK-UNORDERED: %[[RET:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK-UNORDERED: ret float %[[RET]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_multiple_one_flag
+; CHECK-NOT-VECTORIZED-NOT: vector.body
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum = phi float [ -0.000000e+00, %entry ], [ %add3, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
+  %0 = load float, float* %arrayidx, align 4
+  %add = fadd float %sum, %0
+  %arrayidx2 = getelementptr inbounds float, float* %b, i64 %iv
+  %1 = load float, float* %arrayidx2, align 4
+  %add3 = fadd fast float %add, %1
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:                                         ; preds = %for.body
+  %rdx = phi float [ %add3, %for.body ]
+  ret float %rdx
+}
+
 ; Tests with both a floating point reduction & induction, e.g.
 ;
 ;float fp_iv_rdx_loop(float *values, float init, float * __restrict__ A, int N) {