[llvm] LoopVectorize: fix logical error in trunc cost (PR #91136)

Sun May 5 10:53:40 PDT 2024

https://github.com/artagnon created https://github.com/llvm/llvm-project/pull/91136

In LoopVectorizationCostModel::getInstructionCost(), when the condition canTruncateToMinimalBitwidth() is satisfied, for a trunc, the source type is computed as the smallest type of the source vector and the destination vector, and the destination type is computed as the largest type of the instruction and destination type. This is clearly a logical error, as the original source vector type could be smaller than the original destination vector type, and the trunc semantics are broken because we're attempting to widen. Even if this were not the case, there is clearly no benefit of computing the largest vector type for the final destination type, as the cost of a trunc only depends on the vectorization factor.

>From 7a4f8bcf1211c4289e1850c7927e2f3f967bb2f8 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <r at artagnon.com>
Date: Sun, 5 May 2024 18:25:08 +0100
Subject: [PATCH 1/2] LoopVectorize: add test for crash #47665

---
 .../LoopVectorize/SystemZ/pr47665.ll          | 24 +++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll

diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll
new file mode 100644
index 00000000000000..586f4bdc65ef00
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll
@@ -0,0 +1,24 @@
+; REQUIRES: asserts
+; RUN: not --crash opt -passes=loop-vectorize -mtriple=s390x -mcpu=z14 -disable-output %s
+; RUN: not --crash opt -passes=loop-vectorize -force-vector-width=2 -mtriple=s390x -mcpu=z14 -disable-output %s
+
+define void @test(ptr %p) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ]
+  %trunc = trunc i40 0 to i32
+  %icmp.eq = icmp eq i32 %trunc, 0
+  %zext = zext i1 %icmp.eq to i32
+  %icmp.ult = icmp ult i32 0, %zext
+  %or = or i1 %icmp.ult, true
+  %icmp.sgt = icmp sgt i1 %or, false
+  store i1 %icmp.sgt, ptr %p, align 1
+  %iv.next = add i32 %iv, 1
+  %cond = icmp ult i32 %iv.next, 10
+  br i1 %cond, label %for.body, label %exit
+
+exit:                                             ; preds = %for.body
+  ret void
+}

>From 2074a8beded2475295a5ca93efba3e7a808e1b86 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <r at artagnon.com>
Date: Sun, 5 May 2024 18:26:48 +0100
Subject: [PATCH 2/2] LoopVectorize: fix logical error in trunc cost

In LoopVectorizationCostModel::getInstructionCost(), when the condition
canTruncateToMinimalBitwidth() is satisfied, for a trunc, the source
type is computed as the smallest type of the source vector and the
destination vector, and the destination type is computed as the largest
type of the instruction and destination type. This is clearly a logical
error, as the original source vector type could be smaller than the
original destination vector type, and the trunc semantics are broken
because we're attempting to widen. Even if this were not the case, there
is clearly no benefit of computing the largest vector type for the final
destination type, as the cost of a trunc only depends on the
vectorization factor.

Fixes #47765.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  13 +-
 .../LoopVectorize/SystemZ/pr47665.ll          | 130 +++++++++++++++++-
 2 files changed, 130 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3be0102bea3e34..96debb8e33b436 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3390,12 +3390,6 @@ static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
 }
 
-static Type *largestIntegerVectorType(Type *T1, Type *T2) {
-  auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
-  auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
-  return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
-}
-
 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
                                             VPlan &Plan) {
   // Fix widened non-induction PHIs by setting up the PHI operands.
@@ -7123,16 +7117,15 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
       //
       // Calculate the modified src and dest types.
-      Type *MinVecTy = VectorTy;
       if (Opcode == Instruction::Trunc) {
-        SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
+        SrcVecTy = smallestIntegerVectorType(SrcVecTy, VectorTy);
         VectorTy =
-            largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
+            smallestIntegerVectorType(ToVectorTy(I->getType(), VF), VectorTy);
       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
         // Leave SrcVecTy unchanged - we only shrink the destination element
         // type.
         VectorTy =
-            smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
+            smallestIntegerVectorType(ToVectorTy(I->getType(), VF), VectorTy);
       }
     }
 
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll
index 586f4bdc65ef00..1b793464340804 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll
@@ -1,8 +1,127 @@
-; REQUIRES: asserts
-; RUN: not --crash opt -passes=loop-vectorize -mtriple=s390x -mcpu=z14 -disable-output %s
-; RUN: not --crash opt -passes=loop-vectorize -force-vector-width=2 -mtriple=s390x -mcpu=z14 -disable-output %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=loop-vectorize,dce,simplifycfg -mtriple=s390x -mcpu=z14 -S %s | FileCheck %s --check-prefix=NOVEC
+; RUN: opt -passes=loop-vectorize,dce,simplifycfg -force-vector-width=2 -mtriple=s390x -mcpu=z14 -S %s | FileCheck %s
 
 define void @test(ptr %p) {
+; NOVEC-LABEL: define void @test(
+; NOVEC-SAME: ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] {
+; NOVEC-NEXT:  entry:
+; NOVEC-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 0, i64 0
+; NOVEC-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
+; NOVEC-NEXT:    [[VEC_IV:%.*]] = add <16 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; NOVEC-NEXT:    [[TMP0:%.*]] = icmp ule <16 x i32> [[VEC_IV]], <i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9>
+; NOVEC-NEXT:    [[TMP1:%.*]] = extractelement <16 x i1> [[TMP0]], i32 0
+; NOVEC-NEXT:    [[TMP2:%.*]] = extractelement <16 x i1> [[TMP0]], i32 1
+; NOVEC-NEXT:    [[TMP3:%.*]] = xor i1 [[TMP1]], true
+; NOVEC-NEXT:    [[TMP4:%.*]] = xor i1 [[TMP2]], true
+; NOVEC-NEXT:    [[TMP5:%.*]] = xor i1 [[TMP3]], true
+; NOVEC-NEXT:    [[TMP6:%.*]] = xor i1 [[TMP4]], true
+; NOVEC-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
+; NOVEC-NEXT:    [[TMP8:%.*]] = extractelement <16 x i1> [[TMP0]], i32 2
+; NOVEC-NEXT:    [[TMP9:%.*]] = xor i1 [[TMP7]], true
+; NOVEC-NEXT:    [[TMP10:%.*]] = xor i1 [[TMP8]], true
+; NOVEC-NEXT:    [[TMP11:%.*]] = xor i1 [[TMP9]], true
+; NOVEC-NEXT:    [[TMP12:%.*]] = xor i1 [[TMP10]], true
+; NOVEC-NEXT:    [[TMP13:%.*]] = or i1 [[TMP11]], [[TMP12]]
+; NOVEC-NEXT:    [[TMP14:%.*]] = extractelement <16 x i1> [[TMP0]], i32 3
+; NOVEC-NEXT:    [[TMP15:%.*]] = xor i1 [[TMP13]], true
+; NOVEC-NEXT:    [[TMP16:%.*]] = xor i1 [[TMP14]], true
+; NOVEC-NEXT:    [[TMP17:%.*]] = xor i1 [[TMP15]], true
+; NOVEC-NEXT:    [[TMP18:%.*]] = xor i1 [[TMP16]], true
+; NOVEC-NEXT:    [[TMP19:%.*]] = or i1 [[TMP17]], [[TMP18]]
+; NOVEC-NEXT:    [[TMP20:%.*]] = extractelement <16 x i1> [[TMP0]], i32 4
+; NOVEC-NEXT:    [[TMP21:%.*]] = xor i1 [[TMP19]], true
+; NOVEC-NEXT:    [[TMP22:%.*]] = xor i1 [[TMP20]], true
+; NOVEC-NEXT:    [[TMP23:%.*]] = xor i1 [[TMP21]], true
+; NOVEC-NEXT:    [[TMP24:%.*]] = xor i1 [[TMP22]], true
+; NOVEC-NEXT:    [[TMP25:%.*]] = or i1 [[TMP23]], [[TMP24]]
+; NOVEC-NEXT:    [[TMP26:%.*]] = extractelement <16 x i1> [[TMP0]], i32 5
+; NOVEC-NEXT:    [[TMP27:%.*]] = xor i1 [[TMP25]], true
+; NOVEC-NEXT:    [[TMP28:%.*]] = xor i1 [[TMP26]], true
+; NOVEC-NEXT:    [[TMP29:%.*]] = xor i1 [[TMP27]], true
+; NOVEC-NEXT:    [[TMP30:%.*]] = xor i1 [[TMP28]], true
+; NOVEC-NEXT:    [[TMP31:%.*]] = or i1 [[TMP29]], [[TMP30]]
+; NOVEC-NEXT:    [[TMP32:%.*]] = extractelement <16 x i1> [[TMP0]], i32 6
+; NOVEC-NEXT:    [[TMP33:%.*]] = xor i1 [[TMP31]], true
+; NOVEC-NEXT:    [[TMP34:%.*]] = xor i1 [[TMP32]], true
+; NOVEC-NEXT:    [[TMP35:%.*]] = xor i1 [[TMP33]], true
+; NOVEC-NEXT:    [[TMP36:%.*]] = xor i1 [[TMP34]], true
+; NOVEC-NEXT:    [[TMP37:%.*]] = or i1 [[TMP35]], [[TMP36]]
+; NOVEC-NEXT:    [[TMP38:%.*]] = extractelement <16 x i1> [[TMP0]], i32 7
+; NOVEC-NEXT:    [[TMP39:%.*]] = xor i1 [[TMP37]], true
+; NOVEC-NEXT:    [[TMP40:%.*]] = xor i1 [[TMP38]], true
+; NOVEC-NEXT:    [[TMP41:%.*]] = xor i1 [[TMP39]], true
+; NOVEC-NEXT:    [[TMP42:%.*]] = xor i1 [[TMP40]], true
+; NOVEC-NEXT:    [[TMP43:%.*]] = or i1 [[TMP41]], [[TMP42]]
+; NOVEC-NEXT:    [[TMP44:%.*]] = extractelement <16 x i1> [[TMP0]], i32 8
+; NOVEC-NEXT:    [[TMP45:%.*]] = xor i1 [[TMP43]], true
+; NOVEC-NEXT:    [[TMP46:%.*]] = xor i1 [[TMP44]], true
+; NOVEC-NEXT:    [[TMP47:%.*]] = xor i1 [[TMP45]], true
+; NOVEC-NEXT:    [[TMP48:%.*]] = xor i1 [[TMP46]], true
+; NOVEC-NEXT:    [[TMP49:%.*]] = or i1 [[TMP47]], [[TMP48]]
+; NOVEC-NEXT:    [[TMP50:%.*]] = extractelement <16 x i1> [[TMP0]], i32 9
+; NOVEC-NEXT:    [[TMP51:%.*]] = xor i1 [[TMP49]], true
+; NOVEC-NEXT:    [[TMP52:%.*]] = xor i1 [[TMP50]], true
+; NOVEC-NEXT:    [[TMP53:%.*]] = xor i1 [[TMP51]], true
+; NOVEC-NEXT:    [[TMP54:%.*]] = xor i1 [[TMP52]], true
+; NOVEC-NEXT:    [[TMP55:%.*]] = or i1 [[TMP53]], [[TMP54]]
+; NOVEC-NEXT:    [[TMP56:%.*]] = extractelement <16 x i1> [[TMP0]], i32 10
+; NOVEC-NEXT:    [[TMP57:%.*]] = xor i1 [[TMP55]], true
+; NOVEC-NEXT:    [[TMP58:%.*]] = xor i1 [[TMP56]], true
+; NOVEC-NEXT:    [[TMP59:%.*]] = xor i1 [[TMP57]], true
+; NOVEC-NEXT:    [[TMP60:%.*]] = xor i1 [[TMP58]], true
+; NOVEC-NEXT:    [[TMP61:%.*]] = or i1 [[TMP59]], [[TMP60]]
+; NOVEC-NEXT:    [[TMP62:%.*]] = extractelement <16 x i1> [[TMP0]], i32 11
+; NOVEC-NEXT:    [[TMP63:%.*]] = xor i1 [[TMP61]], true
+; NOVEC-NEXT:    [[TMP64:%.*]] = xor i1 [[TMP62]], true
+; NOVEC-NEXT:    [[TMP65:%.*]] = xor i1 [[TMP63]], true
+; NOVEC-NEXT:    [[TMP66:%.*]] = xor i1 [[TMP64]], true
+; NOVEC-NEXT:    [[TMP67:%.*]] = or i1 [[TMP65]], [[TMP66]]
+; NOVEC-NEXT:    [[TMP68:%.*]] = extractelement <16 x i1> [[TMP0]], i32 12
+; NOVEC-NEXT:    [[TMP69:%.*]] = xor i1 [[TMP67]], true
+; NOVEC-NEXT:    [[TMP70:%.*]] = xor i1 [[TMP68]], true
+; NOVEC-NEXT:    [[TMP71:%.*]] = xor i1 [[TMP69]], true
+; NOVEC-NEXT:    [[TMP72:%.*]] = xor i1 [[TMP70]], true
+; NOVEC-NEXT:    [[TMP73:%.*]] = or i1 [[TMP71]], [[TMP72]]
+; NOVEC-NEXT:    [[TMP74:%.*]] = extractelement <16 x i1> [[TMP0]], i32 13
+; NOVEC-NEXT:    [[TMP75:%.*]] = xor i1 [[TMP73]], true
+; NOVEC-NEXT:    [[TMP76:%.*]] = xor i1 [[TMP74]], true
+; NOVEC-NEXT:    [[TMP77:%.*]] = xor i1 [[TMP75]], true
+; NOVEC-NEXT:    [[TMP78:%.*]] = xor i1 [[TMP76]], true
+; NOVEC-NEXT:    [[TMP79:%.*]] = or i1 [[TMP77]], [[TMP78]]
+; NOVEC-NEXT:    [[TMP80:%.*]] = extractelement <16 x i1> [[TMP0]], i32 14
+; NOVEC-NEXT:    [[TMP81:%.*]] = xor i1 [[TMP79]], true
+; NOVEC-NEXT:    [[TMP82:%.*]] = xor i1 [[TMP80]], true
+; NOVEC-NEXT:    [[TMP83:%.*]] = xor i1 [[TMP81]], true
+; NOVEC-NEXT:    [[TMP84:%.*]] = xor i1 [[TMP82]], true
+; NOVEC-NEXT:    [[TMP85:%.*]] = or i1 [[TMP83]], [[TMP84]]
+; NOVEC-NEXT:    [[TMP86:%.*]] = extractelement <16 x i1> [[TMP0]], i32 15
+; NOVEC-NEXT:    [[TMP87:%.*]] = xor i1 [[TMP85]], true
+; NOVEC-NEXT:    [[TMP88:%.*]] = xor i1 [[TMP86]], true
+; NOVEC-NEXT:    [[TMP89:%.*]] = xor i1 [[TMP87]], true
+; NOVEC-NEXT:    [[TMP90:%.*]] = xor i1 [[TMP88]], true
+; NOVEC-NEXT:    [[TMP91:%.*]] = or i1 [[TMP89]], [[TMP90]]
+; NOVEC-NEXT:    br i1 [[TMP91]], label [[TMP92:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; NOVEC:       92:
+; NOVEC-NEXT:    store i1 false, ptr [[P]], align 1
+; NOVEC-NEXT:    br label [[MIDDLE_BLOCK]]
+; NOVEC:       middle.block:
+; NOVEC-NEXT:    [[INDEX_NEXT:%.*]] = add i32 0, 16
+; NOVEC-NEXT:    ret void
+;
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE30:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE30]] ]
+; CHECK-NEXT:    store i1 false, ptr [[P]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 10
+; CHECK-NEXT:    br i1 [[TMP0]], label [[EXIT:%.*]], label [[PRED_STORE_CONTINUE30]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %for.body
 
@@ -22,3 +141,8 @@ for.body:                                         ; preds = %for.body, %entry
 exit:                                             ; preds = %for.body
   ret void
 }
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+;.