[llvm] 63d8058 - LoopVectorize: guard appending InstsToScalarize; fix bug (#88720)

Thu Apr 18 02:03:11 PDT 2024

Author: Ramkumar Ramachandra
Date: 2024-04-18T10:03:07+01:00
New Revision: 63d8058ef50a3186b6b6a5db254f44673fea3d19

URL: https://github.com/llvm/llvm-project/commit/63d8058ef50a3186b6b6a5db254f44673fea3d19
DIFF: https://github.com/llvm/llvm-project/commit/63d8058ef50a3186b6b6a5db254f44673fea3d19.diff

LOG: LoopVectorize: guard appending InstsToScalarize; fix bug (#88720)

In the process of collecting instructions to scalarize, LoopVectorize
uses faulty reasoning whereby it also adds instructions that will be
scalar after vectorization. If an instruction satisfies
isScalarAfterVectorization() for the given VF, it should not be appended
to InstsToScalarize. Add this extra guard, fixing a crash.

Fixes #55096.

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
    llvm/test/Transforms/LoopVectorize/X86/pr55096-scalarize-add.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index a8272f45025358..b3b54cc36877f7 100644

--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5814,7 +5814,8 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
         // invalid scalarization costs.
         // Do not apply discount logic if hacked cost is needed
         // for emulated masked memrefs.
-        if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
+        if (!isScalarAfterVectorization(&I, VF) && !VF.isScalable() &&
+            !useEmulatedMaskMemRefHack(&I, VF) &&
             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
         // Remember that BB will remain after vectorization.

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/pr55096-scalarize-add.ll b/llvm/test/Transforms/LoopVectorize/X86/pr55096-scalarize-add.ll
index d75ab660461ecc..cc820f2a8f2a05 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr55096-scalarize-add.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr55096-scalarize-add.ll
@@ -1,13 +1,48 @@
-; RUN: opt -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S %s | FileCheck %s
-
-; REQUIRES: asserts
-; XFAIL: *
-
-target triple = "x86_64-apple-macosx"
-
-; CHECK: vector.body
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -mtriple=x86_64-apple-macosx -passes=loop-vectorize,simplifycfg,dce -force-vector-width=2 -force-vector-interleave=1 -S %s | FileCheck %s
 
 define void @test_pr55096(i64 %c, ptr %p) {
+; CHECK-LABEL: define void @test_pr55096(
+; CHECK-SAME: i64 [[C:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[C]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 122, i64 123>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE3]] ]
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[INDEX]] to i16
+; CHECK-NEXT:    [[TMP0:%.*]] = mul i16 [[DOTCAST]], 2008
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i16 6229, [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP2:%.*]] = xor <2 x i1> [[TMP1]], <i1 true, i1 true>
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
+; CHECK-NEXT:    br i1 [[TMP3]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK:       pred.store.if:
+; CHECK-NEXT:    [[TMP4:%.*]] = add i16 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = add i16 [[TMP4]], 2008
+; CHECK-NEXT:    [[TMP6:%.*]] = udiv i16 4943, [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[P]], i16 [[TMP6]]
+; CHECK-NEXT:    store i16 0, ptr [[TMP7]], align 2
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; CHECK:       pred.store.continue:
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
+; CHECK-NEXT:    br i1 [[TMP9]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]]
+; CHECK:       pred.store.if2:
+; CHECK-NEXT:    [[TMP10:%.*]] = add i16 [[OFFSET_IDX]], 2008
+; CHECK-NEXT:    [[TMP11:%.*]] = add i16 [[TMP10]], 2008
+; CHECK-NEXT:    [[TMP12:%.*]] = udiv i16 4943, [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i16, ptr [[P]], i16 [[TMP12]]
+; CHECK-NEXT:    store i16 0, ptr [[TMP13]], align 2
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE3]]
+; CHECK:       pred.store.continue3:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 340
+; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %loop.header
 
@@ -32,3 +67,8 @@ loop.latch:
 exit:
   ret void
 }
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+;.