[llvm] 8ff47f6 - [LoopVectorize] Enable integer Mul and Add as select reduction patterns

Mon Jan 30 01:42:20 PST 2023

Author: Matt Devereau
Date: 2023-01-30T09:41:40Z
New Revision: 8ff47f6032cbfd49f8fe22d46a48eb602b224661

URL: https://github.com/llvm/llvm-project/commit/8ff47f6032cbfd49f8fe22d46a48eb602b224661
DIFF: https://github.com/llvm/llvm-project/commit/8ff47f6032cbfd49f8fe22d46a48eb602b224661.diff

LOG: [LoopVectorize] Enable integer Mul and Add as select reduction patterns

This patch vectorizes Phi node loop reductions through selects whose condition
comes from a floating-point comparison and whose operands are integers, for
Add, Sub, and Mul reductions.

Example:

int foo(float *x, int n) {
    int sum = 0;
    for (int i=0; i<n; ++i) {
        float elem = x[i];
        if (elem > 0) {
            sum += 2;
        }
    }
    return sum;
}

This would previously fail to vectorize due to the integer reduction.

Added: 
    

Modified: 
    llvm/lib/Analysis/IVDescriptors.cpp
    llvm/test/Transforms/LoopVectorize/if-reduction.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index 8b9e5c6f56f70..ead48e4028152 100644

--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -745,15 +745,21 @@ RecurrenceDescriptor::isConditionalRdxPattern(RecurKind Kind, Instruction *I) {
     return InstDesc(false, I);
 
   Value *Op1, *Op2;
-  if ((m_FAdd(m_Value(Op1), m_Value(Op2)).match(I1)  ||
-       m_FSub(m_Value(Op1), m_Value(Op2)).match(I1)) &&
-      I1->isFast())
-    return InstDesc(Kind == RecurKind::FAdd, SI);
+  if (!(((m_FAdd(m_Value(Op1), m_Value(Op2)).match(I1) ||
+          m_FSub(m_Value(Op1), m_Value(Op2)).match(I1)) &&
+         I1->isFast()) ||
+        (m_FMul(m_Value(Op1), m_Value(Op2)).match(I1) && (I1->isFast())) ||
+        ((m_Add(m_Value(Op1), m_Value(Op2)).match(I1) ||
+          m_Sub(m_Value(Op1), m_Value(Op2)).match(I1))) ||
+        (m_Mul(m_Value(Op1), m_Value(Op2)).match(I1))))
+    return InstDesc(false, I);
 
-  if (m_FMul(m_Value(Op1), m_Value(Op2)).match(I1) && (I1->isFast()))
-    return InstDesc(Kind == RecurKind::FMul, SI);
+  Instruction *IPhi = isa<PHINode>(*Op1) ? dyn_cast<Instruction>(Op1)
+                                         : dyn_cast<Instruction>(Op2);
+  if (!IPhi || IPhi != FalseVal)
+    return InstDesc(false, I);
 
-  return InstDesc(false, I);
+  return InstDesc(true, SI);
 }
 
 RecurrenceDescriptor::InstDesc
@@ -786,7 +792,8 @@ RecurrenceDescriptor::isRecurrenceInstr(Loop *L, PHINode *OrigPhi,
     return InstDesc(Kind == RecurKind::FAdd, I,
                     I->hasAllowReassoc() ? nullptr : I);
   case Instruction::Select:
-    if (Kind == RecurKind::FAdd || Kind == RecurKind::FMul)
+    if (Kind == RecurKind::FAdd || Kind == RecurKind::FMul ||
+        Kind == RecurKind::Add || Kind == RecurKind::Mul)
       return isConditionalRdxPattern(Kind, I);
     [[fallthrough]];
   case Instruction::FCmp:

diff  --git a/llvm/test/Transforms/LoopVectorize/if-reduction.ll b/llvm/test/Transforms/LoopVectorize/if-reduction.ll
index b82c5bb473c54..6ef5d62b65051 100644
--- a/llvm/test/Transforms/LoopVectorize/if-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-reduction.ll
@@ -821,5 +821,142 @@ for.end:                                          ; preds = %for.body, %entry
   ret float %sum.0.lcssa
 }
 
+; CHECK-LABEL: @fcmp_0_add_select2(
+; CHECK: %[[V1:.*]] = fcmp ogt <4 x float> %[[V0:.*]], zeroinitializer
+; CHECK: %[[V3:.*]] = add <4 x i64> %[[V2:.*]], <i64 2, i64 2, i64 2, i64 2>
+; CHECK: select <4 x i1> %[[V1]], <4 x i64> %[[V3]], <4 x i64> %[[V2]]
+define i64 @fcmp_0_add_select2(ptr noalias %x, i64 %N) nounwind readonly {
+entry:
+  %cmp.1 = icmp sgt i64 %N, 0
+  br i1 %cmp.1, label %for.header, label %for.end
+
+for.header:                                       ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %header, %for.body
+  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+  %sum.1 = phi i64 [ 0, %for.header ], [ %sum.2, %for.body ]
+  %arrayidx = getelementptr inbounds float, ptr %x, i64 %indvars.iv
+  %0 = load float, ptr %arrayidx, align 4
+  %cmp.2 = fcmp ogt float %0, 0.000000e+00
+  %add = add nsw i64 %sum.1, 2
+  %sum.2 = select i1 %cmp.2, i64 %add, i64 %sum.1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %N
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %1 = phi i64 [ 0, %entry ], [ %sum.2, %for.body ]
+  ret i64 %1
+}
+
+; CHECK-LABEL: @fcmp_0_sub_select1(
+; CHECK: %[[V1:.*]] = fcmp ogt <4 x float> %[[V0:.*]], zeroinitializer
+; CHECK: %[[V3:.*]] = sub <4 x i32> %[[V2:.*]], <i32 2, i32 2, i32 2, i32 2>
+; CHECK: select <4 x i1> %[[V1]], <4 x i32> %[[V3]], <4 x i32> %[[V2]]
+define i32 @fcmp_0_sub_select1(ptr noalias %x, i32 %N) nounwind readonly {
+entry:
+  %cmp.1 = icmp sgt i32 %N, 0
+  br i1 %cmp.1, label %for.header, label %for.end
+
+for.header:                                       ; preds = %entry
+  %zext = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %header, %for.body
+  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+  %sum.1 = phi i32 [ 0, %for.header ], [ %sum.2, %for.body ]
+  %arrayidx = getelementptr inbounds float, ptr %x, i64 %indvars.iv
+  %0 = load float, ptr %arrayidx, align 4
+  %cmp.2 = fcmp ogt float %0, 0.000000e+00
+  %sub = sub nsw i32 %sum.1, 2
+  %sum.2 = select i1 %cmp.2, i32 %sub, i32 %sum.1
+  %indvars.iv.next = sub nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %zext
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %1 = phi i32 [ 0, %entry ], [ %sum.2, %for.body ]
+  ret i32 %1
+}
+
+; CHECK-LABEL: @fcmp_0_mult_select1(
+; CHECK: %[[V1:.*]] = fcmp ogt <4 x float> %[[V0:.*]], zeroinitializer
+; CHECK: %[[V3:.*]] = mul <4 x i32> %[[V2:.*]], <i32 2, i32 2, i32 2, i32 2>
+; CHECK: select <4 x i1> %[[V1]], <4 x i32> %[[V3]], <4 x i32> %[[V2]]
+define i32 @fcmp_0_mult_select1(ptr noalias %x, i32 %N) nounwind readonly {
+entry:
+  %cmp.1 = icmp sgt i32 %N, 0
+  br i1 %cmp.1, label %for.header, label %for.end
+
+for.header:                                       ; preds = %entry
+  %zext = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.header
+  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+  %sum.1 = phi i32 [ 0, %for.header ], [ %sum.2, %for.body ]
+  %arrayidx = getelementptr inbounds float, ptr %x, i64 %indvars.iv
+  %0 = load float, ptr %arrayidx, align 4
+  %cmp.2 = fcmp ogt float %0, 0.000000e+00
+  %mult = mul nsw i32 %sum.1, 2
+  %sum.2 = select i1 %cmp.2, i32 %mult, i32 %sum.1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %zext
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %1 = phi i32 [ 0, %entry ], [ %sum.2, %for.body ]
+  ret i32 %1
+}
+
+@table = constant [13 x i16] [i16 10, i16 35, i16 69, i16 147, i16 280, i16 472, i16 682, i16 1013, i16 1559, i16 2544, i16 4553, i16 6494, i16 10000], align 1 
+
+; CHECK-LABEL: @non_reduction_index(
+; CHECK-NOT:     <4 x i16>
+define i16 @non_reduction_index(i16 noundef %val) {
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  %spec.select.lcssa = phi i16 [ %spec.select, %for.body ]
+  ret i16 %spec.select.lcssa
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.05 = phi i16 [ 12, %entry ], [ %sub, %for.body ]
+  %k.04 = phi i16 [ 0, %entry ], [ %spec.select, %for.body ]
+  %arrayidx = getelementptr inbounds [13 x i16], ptr @table, i16 0, i16 %i.05
+  %0 = load i16, ptr %arrayidx, align 1
+  %cmp1 = icmp ugt i16 %0, %val
+  %sub = add nsw i16 %i.05, -1
+  %spec.select = select i1 %cmp1, i16 %sub, i16 %k.04
+  %cmp.not = icmp eq i16 %sub, 0
+  br i1 %cmp.not, label %for.cond.cleanup, label %for.body
+}
+
+@tablef = constant [13 x half] [half 10.0, half 35.0, half 69.0, half 147.0, half 280.0, half 472.0, half 682.0, half 1013.0, half 1559.0, half 2544.0, half 4556.0, half 6496.0, half 10000.0], align 1 
+
+; CHECK-LABEL: @non_reduction_index_half(
+; CHECK-NOT:     <4 x half>
+define i16 @non_reduction_index_half(half noundef %val) {
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  %spec.select.lcssa = phi i16 [ %spec.select, %for.body ]
+  ret i16 %spec.select.lcssa
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.05 = phi i16 [ 12, %entry ], [ %sub, %for.body ]
+  %k.04 = phi i16 [ 0, %entry ], [ %spec.select, %for.body ]
+  %arrayidx = getelementptr inbounds [13 x i16], ptr @table, i16 0, i16 %i.05
+  %0 = load half, ptr %arrayidx, align 1
+  %fcmp1 = fcmp ugt half %0, %val
+  %sub = add nsw i16 %i.05, -1
+  %spec.select = select i1 %fcmp1, i16 %sub, i16 %k.04
+  %cmp.not = icmp eq i16 %sub, 0
+  br i1 %cmp.not, label %for.cond.cleanup, label %for.body
+}
+
 ; Make sure any check-not directives are not triggered by function declarations.
 ; CHECK: declare