[llvm] LoopVectorize: guard appending InstsToScalarize; fix bug (PR #88720)

Ramkumar Ramachandra via llvm-commits llvm-commits at lists.llvm.org
Mon Apr 15 05:07:30 PDT 2024


https://github.com/artagnon created https://github.com/llvm/llvm-project/pull/88720

When collecting instructions to scalarize, LoopVectorize erroneously also adds instructions that will be scalar after vectorization. If an instruction satisfies isScalarAfterVectorization() for the given VF, it should not be appended to InstsToScalarize. Add this extra guard, fixing a crash.

Fixes #55096.
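
For illustration, here is a minimal, self-contained sketch of the decision the patch introduces. The predicates below are stubs standing in for the LoopVectorizationCostModel queries of the same name; this is not the actual LLVM code, it only models the added guard:

#include <cstdio>

// Stand-ins for the cost-model queries; in LLVM these are methods on
// LoopVectorizationCostModel taking an Instruction* and an ElementCount.
struct Inst {
  bool ScalarWithPredication;
  bool ScalarAfterVectorization;
};

static bool isScalarWithPredication(const Inst &I) {
  return I.ScalarWithPredication;
}
static bool isScalarAfterVectorization(const Inst &I) {
  return I.ScalarAfterVectorization;
}

// Model of the fixed collection step: only instructions that need
// predication *and* are not already scalar after vectorization become
// candidates for the scalarization-discount computation.
static bool shouldConsiderForScalarization(const Inst &I) {
  return isScalarWithPredication(I) && !isScalarAfterVectorization(I);
}

int main() {
  // An instruction for which both predicates hold: with the guard in
  // place it is no longer appended to InstsToScalarize.
  Inst I{/*ScalarWithPredication=*/true, /*ScalarAfterVectorization=*/true};
  std::printf("consider for InstsToScalarize: %s\n",
              shouldConsiderForScalarization(I) ? "yes" : "no");
}

Before the fix, instructions for which both predicates hold were appended anyway, which is what led to the reported crash.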

From 19feeefb1e84d5ce9f2dbdf07e443688d999771d Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ram.ramachandra at arm.com>
Date: Mon, 15 Apr 2024 12:55:08 +0100
Subject: [PATCH] LoopVectorize: guard appending InstsToScalarize; fix bug

When collecting instructions to scalarize, LoopVectorize erroneously
also adds instructions that will be scalar after vectorization. If an
instruction satisfies isScalarAfterVectorization() for the given VF, it
should not be appended to InstsToScalarize. Add this extra guard,
fixing a crash.

Fixes #55096.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  3 +-
 .../X86/pr55096-scalarize-add.ll              | 56 ++++++++++++++++---
 2 files changed, 50 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 5535cc55e93216..edc7b89fd96f1b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5815,7 +5815,8 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
     if (!blockNeedsPredicationForAnyReason(BB))
       continue;
     for (Instruction &I : *BB)
-      if (isScalarWithPredication(&I, VF)) {
+      if (isScalarWithPredication(&I, VF) &&
+          !isScalarAfterVectorization(&I, VF)) {
         ScalarCostsTy ScalarCosts;
         // Do not apply discount if scalable, because that would lead to
         // invalid scalarization costs.
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr55096-scalarize-add.ll b/llvm/test/Transforms/LoopVectorize/X86/pr55096-scalarize-add.ll
index d75ab660461ecc..cc820f2a8f2a05 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr55096-scalarize-add.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr55096-scalarize-add.ll
@@ -1,13 +1,48 @@
-; RUN: opt -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S %s | FileCheck %s
-
-; REQUIRES: asserts
-; XFAIL: *
-
-target triple = "x86_64-apple-macosx"
-
-; CHECK: vector.body
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -mtriple=x86_64-apple-macosx -passes=loop-vectorize,simplifycfg,dce -force-vector-width=2 -force-vector-interleave=1 -S %s | FileCheck %s
 
 define void @test_pr55096(i64 %c, ptr %p) {
+; CHECK-LABEL: define void @test_pr55096(
+; CHECK-SAME: i64 [[C:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[C]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 122, i64 123>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE3]] ]
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[INDEX]] to i16
+; CHECK-NEXT:    [[TMP0:%.*]] = mul i16 [[DOTCAST]], 2008
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i16 6229, [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP2:%.*]] = xor <2 x i1> [[TMP1]], <i1 true, i1 true>
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
+; CHECK-NEXT:    br i1 [[TMP3]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK:       pred.store.if:
+; CHECK-NEXT:    [[TMP4:%.*]] = add i16 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = add i16 [[TMP4]], 2008
+; CHECK-NEXT:    [[TMP6:%.*]] = udiv i16 4943, [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[P]], i16 [[TMP6]]
+; CHECK-NEXT:    store i16 0, ptr [[TMP7]], align 2
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; CHECK:       pred.store.continue:
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
+; CHECK-NEXT:    br i1 [[TMP9]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]]
+; CHECK:       pred.store.if2:
+; CHECK-NEXT:    [[TMP10:%.*]] = add i16 [[OFFSET_IDX]], 2008
+; CHECK-NEXT:    [[TMP11:%.*]] = add i16 [[TMP10]], 2008
+; CHECK-NEXT:    [[TMP12:%.*]] = udiv i16 4943, [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i16, ptr [[P]], i16 [[TMP12]]
+; CHECK-NEXT:    store i16 0, ptr [[TMP13]], align 2
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE3]]
+; CHECK:       pred.store.continue3:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 340
+; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %loop.header
 
@@ -32,3 +67,8 @@ loop.latch:
 exit:
   ret void
 }
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+;.


