[llvm] 494b5ba - [VPlan] Support to widen call instructions in VPlan native path

Florian Hahn via llvm-commits llvm-commits at lists.llvm.org
Sat Mar 6 14:03:53 PST 2021


Author: Mauri Mustonen
Date: 2021-03-06T21:59:52Z
New Revision: 494b5ba364a9c02da1b7b67fe5528d9f66a285e6

URL: https://github.com/llvm/llvm-project/commit/494b5ba364a9c02da1b7b67fe5528d9f66a285e6
DIFF: https://github.com/llvm/llvm-project/commit/494b5ba364a9c02da1b7b67fe5528d9f66a285e6.diff

LOG: [VPlan] Support to widen call instructions in VPlan native path

Add support for widening call instructions in the VPlan native path by using the correct recipe, VPWidenCallRecipe, when such instructions are encountered. This recipe is already used by the inner loop vectorizer.

Previously, call instructions were handled by the wrong recipes, which resulted in unreachable-instruction errors like the one in https://bugs.llvm.org/show_bug.cgi?id=48139.
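In essence, the change adds one more case to the instruction-to-recipe dispatch in VPlanTransforms::VPInstructionsToVPRecipes. A condensed sketch of the resulting logic, mirroring the diff below (the enclosing loop over instructions and surrounding context are elided):

    VPRecipeBase *NewRecipe = nullptr;
    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
      // GEPs already had a dedicated widening recipe.
      NewRecipe = new VPWidenGEPRecipe(
          GEP, Plan->mapToVPValues(GEP->operands()), OrigLoop);
    } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
      // New: widen calls with VPWidenCallRecipe, as the inner loop
      // vectorizer does, instead of falling through to VPWidenRecipe.
      NewRecipe = new VPWidenCallRecipe(
          *CI, Plan->mapToVPValues(CI->arg_operands()));
    } else {
      // All remaining instructions keep the generic widening recipe.
      NewRecipe = new VPWidenRecipe(
          *Inst, Plan->mapToVPValues(Inst->operands()));
    }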

Patch by Mauri Mustonen <mauri.mustonen at tuni.fi>

Reviewed By: fhahn

Differential Revision: https://reviews.llvm.org/D97278

Added: 
    llvm/test/Transforms/LoopVectorize/vplan-widen-call-instruction.ll

Modified: 
    llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Removed: 
    


################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 0c78871dfacb..b568da7bfc63 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -71,6 +71,9 @@ void VPlanTransforms::VPInstructionsToVPRecipes(
         } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
           NewRecipe = new VPWidenGEPRecipe(
               GEP, Plan->mapToVPValues(GEP->operands()), OrigLoop);
+        } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
+          NewRecipe = new VPWidenCallRecipe(
+              *CI, Plan->mapToVPValues(CI->arg_operands()));
         } else {
           NewRecipe =
               new VPWidenRecipe(*Inst, Plan->mapToVPValues(Inst->operands()));

diff --git a/llvm/test/Transforms/LoopVectorize/vplan-widen-call-instruction.ll b/llvm/test/Transforms/LoopVectorize/vplan-widen-call-instruction.ll
new file mode 100644
index 000000000000..81e947633316
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/vplan-widen-call-instruction.ll
@@ -0,0 +1,73 @@
+; RUN: opt -loop-vectorize -force-vector-width=4 -enable-vplan-native-path -S %s | FileCheck %s
+
+; Test that the VPlan native path is able to widen call instructions like
+; llvm.sqrt.* intrinsic calls.
+
+declare double @llvm.sqrt.f64(double %0)
+define void @widen_call_instruction(double* noalias nocapture readonly %a.in, double* noalias nocapture readonly %b.in, double* noalias nocapture %c.out) {
+; CHECK-LABEL: @widen_call_instruction(
+
+; CHECK: vector.body:
+; CHECK-NEXT: %[[FOR1_INDEX:.*]] = phi i64 [ 0, %[[LABEL_PR:.*]] ], [ %{{.*}}, %[[LABEL_FOR1_LATCH:.*]] ]
+; CHECK: %[[VEC_INDEX:.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[LABEL_PR]] ], [ %{{.*}}, %[[LABEL_FOR1_LATCH]] ]
+; CHECK-NEXT: %[[A_PTR:.*]] = getelementptr inbounds double, double* %a.in, <4 x i64> %[[VEC_INDEX]]
+; CHECK-NEXT: %[[MASKED_GATHER1:.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> %[[A_PTR]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x double> undef)
+; CHECK-NEXT: %[[B_PTR:.*]] = getelementptr inbounds double, double* %b.in, <4 x i64> %[[VEC_INDEX]]
+; CHECK-NEXT: %[[MASKED_GATHER2:.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> %[[B_PTR]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x double> undef)
+; CHECK-NEXT: %[[B_SQRT:.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %[[MASKED_GATHER2]])
+; CHECK-NEXT: br label %[[FOR2_HEADER:.*]]
+
+; CHECK: [[FOR2_HEADER]]:
+; CHECK-NEXT: %[[FOR2_INDEX:.*]] = phi <4 x i32> [ zeroinitializer, %vector.body ], [ %[[FOR2_INDEX_NEXT:.*]], %[[FOR2_HEADER]] ]
+; CHECK-NEXT: %[[REDUCTION:.*]] = phi <4 x double> [ %[[MASKED_GATHER1]], %vector.body ], [ %[[REDUCTION_NEXT:.*]], %[[FOR2_HEADER]] ]
+; CHECK-NEXT: %[[REDUCTION_NEXT]] = fadd <4 x double> %[[B_SQRT]], %[[REDUCTION]]
+; CHECK-NEXT: %[[FOR2_INDEX_NEXT]] = add nuw nsw <4 x i32> %[[FOR2_INDEX]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: %[[VEC_PTR:.*]] = icmp eq <4 x i32> %[[FOR2_INDEX_NEXT]], <i32 10000, i32 10000, i32 10000, i32 10000>
+; CHECK-NEXT: %[[EXIT_COND:.*]] = extractelement <4 x i1> %[[VEC_PTR]], i32 0
+; CHECK-NEXT: br i1 %[[EXIT_COND]], label %[[FOR1_LATCH:.*]], label %{{.*}}
+
+; CHECK: [[FOR1_LATCH]]:
+; CHECK-NEXT: %[[REDUCTION:.*]] = phi <4 x double> [ %[[REDUCTION_NEXT]], %[[FOR2_HEADER]] ]
+; CHECK-NEXT: %[[C_PTR:.*]] = getelementptr inbounds double, double* %c.out, <4 x i64> %[[VEC_INDEX]]
+; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> %[[REDUCTION]], <4 x double*> %[[C_PTR]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT: %[[VEC_INDEX_NEXT:.*]] = add nuw nsw <4 x i64> %[[VEC_INDEX]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT: %[[VEC_PTR:.*]] = icmp eq <4 x i64> %[[VEC_INDEX_NEXT]], <i64 1000, i64 1000, i64 1000, i64 1000>
+; CHECK-NEXT: %{{.*}} = extractelement <4 x i1> %[[VEC_PTR]], i32 0
+; CHECK-NEXT: %[[FOR1_INDEX_NEXT:.*]] = add i64 %[[FOR1_INDEX]], 4
+; CHECK-NEXT: %{{.*}} = add <4 x i64> %[[VEC_INDEX]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT: %[[EXIT_COND:.*]] = icmp eq i64 %[[FOR1_INDEX_NEXT]], 1000
+; CHECK-NEXT: br i1 %[[EXIT_COND]], label %{{.*}}, label %vector.body
+
+entry:
+  br label %for1.header
+
+for1.header:
+  %indvar1 = phi i64 [ 0, %entry ], [ %indvar11, %for1.latch ]
+  %a.ptr = getelementptr inbounds double, double* %a.in, i64 %indvar1
+  %a = load double, double* %a.ptr, align 8
+  %b.ptr = getelementptr inbounds double, double* %b.in, i64 %indvar1
+  %b = load double, double* %b.ptr, align 8
+  %b.sqrt = call double @llvm.sqrt.f64(double %b)
+  br label %for2.header
+
+for2.header:
+  %indvar2 = phi i32 [ 0, %for1.header ], [ %indvar21, %for2.header ]
+  %a.reduction = phi double [ %a, %for1.header ], [ %a.reduction1, %for2.header ]
+  %a.reduction1 = fadd double %b.sqrt, %a.reduction
+  %indvar21 = add nuw nsw i32 %indvar2, 1
+  %for2.cond = icmp eq i32 %indvar21, 10000
+  br i1 %for2.cond, label %for1.latch, label %for2.header
+
+for1.latch:
+  %c.ptr = getelementptr inbounds double, double* %c.out, i64 %indvar1
+  store double %a.reduction1, double* %c.ptr, align 8
+  %indvar11 = add nuw nsw i64 %indvar1, 1
+  %for1.cond = icmp eq i64 %indvar11, 1000
+  br i1 %for1.cond, label %exit, label %for1.header, !llvm.loop !0
+
+exit:
+  ret void
+}
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.enable", i1 true}