[llvm] 981e9dc - [LV] Don't assume isScalarAfterVectorization if one of the uses needs widening.

Sander de Smalen via llvm-commits llvm-commits at lists.llvm.org
Mon Jul 26 08:02:28 PDT 2021


Author: Sander de Smalen
Date: 2021-07-26T16:01:55+01:00
New Revision: 981e9dce548277eaf3f6725bf5ffe84e03f658b1

URL: https://github.com/llvm/llvm-project/commit/981e9dce548277eaf3f6725bf5ffe84e03f658b1
DIFF: https://github.com/llvm/llvm-project/commit/981e9dce548277eaf3f6725bf5ffe84e03f658b1.diff

LOG: [LV] Don't assume isScalarAfterVectorization if one of the uses needs widening.

This fixes an issue that was found in D105199, where a GEP instruction
is used both as the address of a store and as the value of a store. In
the former case the GEP is scalar after vectorization, but in the
latter case (as the stored value) it requires widening.

Other code in that function seems to prevent similar cases from
happening, but this particular case was missed.
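
For reference, here is a minimal sketch of the kind of pattern involved
(the function and value names are illustrative, not taken from the
original reproducer): the pointer-induction update GEP feeds the pointer
PHI with the next iteration's address, but is also itself stored as a
value, so it cannot simply be treated as scalar after vectorization:

  define void @gep_used_as_address_and_value(i8** %dst, i8* %src, i64 %n) {
  entry:
    br label %loop

  loop:
    %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
    %p = phi i8* [ %src, %entry ], [ %p.next, %loop ]
    ; The store address %p comes from the pointer induction...
    store i8 0, i8* %p, align 1
    ; ...whose update GEP %p.next is the next iteration's address...
    %p.next = getelementptr inbounds i8, i8* %p, i64 1
    %slot = getelementptr inbounds i8*, i8** %dst, i64 %i
    ; ...but %p.next is also the value being stored here, which
    ; requires widening.
    store i8* %p.next, i8** %slot, align 8
    %i.next = add nuw i64 %i, 1
    %cmp = icmp ne i64 %i.next, %n
    br i1 %cmp, label %loop, label %exit

  exit:
    ret void
  }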

Reviewed By: david-arm

Differential Revision: https://reviews.llvm.org/D106164

Added: 
    llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll

Modified: 
    llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
    llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
    llvm/test/Transforms/LoopVectorize/pointer-induction.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 27d6161adfe2..f1d83d9e81f2 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5123,13 +5123,12 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
     if (isScalarPtrInduction(MemAccess, Ptr)) {
       Worklist.insert(cast<Instruction>(Ptr));
-      Instruction *Update = cast<Instruction>(
-          cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
-      Worklist.insert(Update);
       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
                         << "\n");
-      LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
-                        << "\n");
+
+      Instruction *Update = cast<Instruction>(
+          cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
+      ScalarPtrs.insert(Update);
       return;
     }
     // We only care about bitcast and getelementptr instructions contained in
@@ -5143,11 +5142,12 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
     if (Worklist.count(I))
       return;
 
-    // If the use of the pointer will be a scalar use, and all users of the
-    // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
-    // place the pointer in PossibleNonScalarPtrs.
-    if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
-          return isa<LoadInst>(U) || isa<StoreInst>(U);
+    // If all users of the pointer will be memory accesses and scalar, place the
+    // pointer in ScalarPtrs. Otherwise, place the pointer in
+    // PossibleNonScalarPtrs.
+    if (llvm::all_of(I->users(), [&](User *U) {
+          return (isa<LoadInst>(U) || isa<StoreInst>(U)) &&
+                 isScalarUse(cast<Instruction>(U), Ptr);
         }))
       ScalarPtrs.insert(I);
     else

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
index 858b28ddd321..450af8c48819 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
@@ -91,7 +91,7 @@ for.end:
 ; Same as predicate_store except we use a pointer PHI to maintain the address
 ;
 ; CHECK: Found new scalar instruction:   %addr = phi i32* [ %a, %entry ], [ %addr.next, %for.inc ]
-; CHECK: Found new scalar instruction:   %addr.next = getelementptr inbounds i32, i32* %addr, i64 1
+; CHECK: Found scalar instruction:   %addr.next = getelementptr inbounds i32, i32* %addr, i64 1
 ; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %addr, align 4
 ; CHECK: Found an estimated cost of 0 for VF 2 For instruction:   %addr = phi i32* [ %a, %entry ], [ %addr.next, %for.inc ]
 ; CHECK: Found an estimated cost of 3 for VF 2 For instruction: store i32 %tmp2, i32* %addr, align 4

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
new file mode 100644
index 000000000000..0020a0951fa4
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
@@ -0,0 +1,81 @@
+; REQUIRES: asserts
+; RUN: opt -loop-vectorize -scalable-vectorization=on -S -mtriple=aarch64 -mattr=+sve -debug-only=loop-vectorize < %s 2>&1 | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; In the test below, the PHI instruction:
+;  %0 = phi i8* [ %incdec.ptr190, %loop.body ], [ %src, %entry ]
+; has multiple uses:
+;  1. As a uniform address for the load, and
+;  2. A non-uniform use by the getelementptr + store, which leads to replication.
+
+; CHECK-LABEL:  LV: Checking a loop in "phi_multiple_use"
+; CHECK-NOT:    LV: Found new scalar instruction:   %incdec.ptr190 = getelementptr inbounds i8, i8* %0, i64 1
+;
+; CHECK:        VPlan 'Initial VPlan for VF={vscale x 2},UF>=1' {
+; CHECK-NEXT:   loop.body:
+; CHECK-NEXT:     WIDEN-INDUCTION %index = phi 0, %index.next
+; CHECK-NEXT:     WIDEN-PHI %curchar = phi %curchar.next, %curptr
+; CHECK-NEXT:     WIDEN-PHI %0 = phi %incdec.ptr190, %src
+; CHECK-NEXT:     WIDEN-GEP Var[Inv] ir<%incdec.ptr190> = getelementptr ir<%0>, ir<1>
+; CHECK-NEXT:     WIDEN store ir<%curchar>, ir<%incdec.ptr190>
+; CHECK-NEXT:     WIDEN ir<%1> = load ir<%0>
+; CHECK-NEXT:     WIDEN ir<%2> = add ir<%1>, ir<1>
+; CHECK-NEXT:     WIDEN store ir<%0>, ir<%2>
+; CHECK-NEXT:   No successors
+; CHECK-NEXT:   }
+
+define void @phi_multiple_use(i8** noalias %curptr, i8* noalias %src, i64 %N) #0 {
+; CHECK-LABEL: @phi_multiple_use(
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, %vector.ph ], [ {{.*}}, %vector.body ]
+; CHECK-NEXT:    {{.*}} = add i64 [[INDEX1]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8*, i8** %curptr, i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[INDEX1]], i32 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = add <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 0, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer), [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT]], [[TMP3]]
+; CHECK-NEXT:    [[NEXT_GEP6:%.*]] = getelementptr i8, i8* %src, <vscale x 2 x i64> [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, <vscale x 2 x i8*> [[NEXT_GEP6]], i64 1
+; CHECK:         store <vscale x 2 x i8*> [[TMP5]], <vscale x 2 x i8*>*
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <vscale x 2 x i8*> [[NEXT_GEP6]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, i8* [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <vscale x 2 x i8>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, <vscale x 2 x i8>* [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = add <vscale x 2 x i8> [[WIDE_LOAD]],
+; CHECK:         store <vscale x 2 x i8> [[TMP9]], <vscale x 2 x i8>*
+
+entry:
+  br label %loop.body
+
+loop.body:                                    ; preds = %loop.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %loop.body ]
+  %curchar = phi i8** [ %curchar.next, %loop.body ], [ %curptr, %entry ]
+  %0 = phi i8* [ %incdec.ptr190, %loop.body ], [ %src, %entry ]
+  %incdec.ptr190 = getelementptr inbounds i8, i8* %0, i64 1
+  %curchar.next = getelementptr inbounds i8*, i8** %curchar, i64 1
+  store i8* %incdec.ptr190, i8** %curchar, align 8
+  %1 = load i8, i8* %0, align 1
+  %2 = add i8 %1, 1
+  store i8 %2, i8* %0, align 1
+  %index.next = add nuw i64 %index, 1
+  %3 = icmp ne i64 %index.next, %N
+  br i1 %3, label %loop.body, label %exit, !llvm.loop !0
+
+exit:                            ; preds = %loop.body
+  ret void
+}
+
+
+attributes #0 = {"target-features"="+sve"}
+
+!0 = distinct !{!0, !1, !2, !3}
+!1 = !{!"llvm.loop.interleave.count", i32 1}
+!2 = !{!"llvm.loop.vectorize.width", i32 2}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!4 = !{ !5 }
+!5 = distinct !{ !5, !6 }
+!6 = distinct !{ !7 }
+!7 = distinct !{ !7, !6 }

diff --git a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll
index 828e72d3fb28..d77ce32e753e 100644
--- a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll
@@ -146,26 +146,23 @@ define void @pointer_induction_used_as_vector(i8** noalias %start.1, i8* noalias
 ; CHECK-NEXT:    [[NEXT_GEP6:%.*]] = getelementptr i8, i8* [[START_2]], i64 [[TMP4]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 3
 ; CHECK-NEXT:    [[NEXT_GEP7:%.*]] = getelementptr i8, i8* [[START_2]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP4]], i64 1
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP5]], i64 1
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP6]], i64 1
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP7]], i64 1
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i8*> poison, i8* [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i8*> [[TMP10]], i8* [[TMP7]], i32 1
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i8*> [[TMP11]], i8* [[TMP8]], i32 2
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i8*> [[TMP12]], i8* [[TMP9]], i32 3
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8*, i8** [[NEXT_GEP]], i32 0
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8** [[TMP14]] to <4 x i8*>*
-; CHECK-NEXT:    store <4 x i8*> [[TMP13]], <4 x i8*>* [[TMP15]], align 8
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, i8* [[NEXT_GEP4]], i32 0
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8* [[TMP16]] to <4 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP17]], align 1
-; CHECK-NEXT:    [[TMP18:%.*]] = add <4 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1>
-; CHECK-NEXT:    [[TMP19:%.*]] = bitcast i8* [[TMP16]] to <4 x i8>*
-; CHECK-NEXT:    store <4 x i8> [[TMP18]], <4 x i8>* [[TMP19]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i8*> poison, i8* [[NEXT_GEP4]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i8*> [[TMP6]], i8* [[NEXT_GEP5]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i8*> [[TMP7]], i8* [[NEXT_GEP6]], i32 2
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i8*> [[TMP8]], i8* [[NEXT_GEP7]], i32 3
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, <4 x i8*> [[TMP9]], i64 1
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i8*, i8** [[NEXT_GEP]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8** [[TMP11]] to <4 x i8*>*
+; CHECK-NEXT:    store <4 x i8*> [[TMP10]], <4 x i8*>* [[TMP12]], align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i8, i8* [[NEXT_GEP4]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8* [[TMP13]] to <4 x i8>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP14]], align 1
+; CHECK-NEXT:    [[TMP15:%.*]] = add <4 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8* [[TMP13]] to <4 x i8>*
+; CHECK-NEXT:    store <4 x i8> [[TMP15]], <4 x i8>* [[TMP16]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]


        

