[llvm] LoopVectorize: fix logical error in trunc cost (PR #91136)

via llvm-commits llvm-commits at lists.llvm.org
Sun May 5 10:54:10 PDT 2024


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-llvm-transforms

Author: Ramkumar Ramachandra (artagnon)

<details>
<summary>Changes</summary>

In LoopVectorizationCostModel::getInstructionCost(), when the condition canTruncateToMinimalBitwidth() is satisfied, for a trunc, the source type is computed as the smallest type of the source vector and the destination vector, and the destination type is computed as the largest type of the instruction and destination type. This is clearly a logical error, as the original source vector type could be smaller than the original destination vector type, and the trunc semantics are broken because we're attempting to widen. Even if this were not the case, there is clearly no benefit of computing the largest vector type for the final destination type, as the cost of a trunc only depends on the vectorization factor.

---
Full diff: https://github.com/llvm/llvm-project/pull/91136.diff


2 Files Affected:

- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+3-10) 
- (added) llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll (+148) 


``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3be0102bea3e34..96debb8e33b436 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3390,12 +3390,6 @@ static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
 }
 
-static Type *largestIntegerVectorType(Type *T1, Type *T2) {
-  auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
-  auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
-  return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
-}
-
 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
                                             VPlan &Plan) {
   // Fix widened non-induction PHIs by setting up the PHI operands.
@@ -7123,16 +7117,15 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
       //
       // Calculate the modified src and dest types.
-      Type *MinVecTy = VectorTy;
       if (Opcode == Instruction::Trunc) {
-        SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
+        SrcVecTy = smallestIntegerVectorType(SrcVecTy, VectorTy);
         VectorTy =
-            largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
+            smallestIntegerVectorType(ToVectorTy(I->getType(), VF), VectorTy);
       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
         // Leave SrcVecTy unchanged - we only shrink the destination element
         // type.
         VectorTy =
-            smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
+            smallestIntegerVectorType(ToVectorTy(I->getType(), VF), VectorTy);
       }
     }
 
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll
new file mode 100644
index 00000000000000..1b793464340804
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll
@@ -0,0 +1,148 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=loop-vectorize,dce,simplifycfg -mtriple=s390x -mcpu=z14 -S %s | FileCheck %s --check-prefix=NOVEC
+; RUN: opt -passes=loop-vectorize,dce,simplifycfg -force-vector-width=2 -mtriple=s390x -mcpu=z14 -S %s | FileCheck %s
+
+define void @test(ptr %p) {
+; NOVEC-LABEL: define void @test(
+; NOVEC-SAME: ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] {
+; NOVEC-NEXT:  entry:
+; NOVEC-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 0, i64 0
+; NOVEC-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
+; NOVEC-NEXT:    [[VEC_IV:%.*]] = add <16 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; NOVEC-NEXT:    [[TMP0:%.*]] = icmp ule <16 x i32> [[VEC_IV]], <i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9>
+; NOVEC-NEXT:    [[TMP1:%.*]] = extractelement <16 x i1> [[TMP0]], i32 0
+; NOVEC-NEXT:    [[TMP2:%.*]] = extractelement <16 x i1> [[TMP0]], i32 1
+; NOVEC-NEXT:    [[TMP3:%.*]] = xor i1 [[TMP1]], true
+; NOVEC-NEXT:    [[TMP4:%.*]] = xor i1 [[TMP2]], true
+; NOVEC-NEXT:    [[TMP5:%.*]] = xor i1 [[TMP3]], true
+; NOVEC-NEXT:    [[TMP6:%.*]] = xor i1 [[TMP4]], true
+; NOVEC-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
+; NOVEC-NEXT:    [[TMP8:%.*]] = extractelement <16 x i1> [[TMP0]], i32 2
+; NOVEC-NEXT:    [[TMP9:%.*]] = xor i1 [[TMP7]], true
+; NOVEC-NEXT:    [[TMP10:%.*]] = xor i1 [[TMP8]], true
+; NOVEC-NEXT:    [[TMP11:%.*]] = xor i1 [[TMP9]], true
+; NOVEC-NEXT:    [[TMP12:%.*]] = xor i1 [[TMP10]], true
+; NOVEC-NEXT:    [[TMP13:%.*]] = or i1 [[TMP11]], [[TMP12]]
+; NOVEC-NEXT:    [[TMP14:%.*]] = extractelement <16 x i1> [[TMP0]], i32 3
+; NOVEC-NEXT:    [[TMP15:%.*]] = xor i1 [[TMP13]], true
+; NOVEC-NEXT:    [[TMP16:%.*]] = xor i1 [[TMP14]], true
+; NOVEC-NEXT:    [[TMP17:%.*]] = xor i1 [[TMP15]], true
+; NOVEC-NEXT:    [[TMP18:%.*]] = xor i1 [[TMP16]], true
+; NOVEC-NEXT:    [[TMP19:%.*]] = or i1 [[TMP17]], [[TMP18]]
+; NOVEC-NEXT:    [[TMP20:%.*]] = extractelement <16 x i1> [[TMP0]], i32 4
+; NOVEC-NEXT:    [[TMP21:%.*]] = xor i1 [[TMP19]], true
+; NOVEC-NEXT:    [[TMP22:%.*]] = xor i1 [[TMP20]], true
+; NOVEC-NEXT:    [[TMP23:%.*]] = xor i1 [[TMP21]], true
+; NOVEC-NEXT:    [[TMP24:%.*]] = xor i1 [[TMP22]], true
+; NOVEC-NEXT:    [[TMP25:%.*]] = or i1 [[TMP23]], [[TMP24]]
+; NOVEC-NEXT:    [[TMP26:%.*]] = extractelement <16 x i1> [[TMP0]], i32 5
+; NOVEC-NEXT:    [[TMP27:%.*]] = xor i1 [[TMP25]], true
+; NOVEC-NEXT:    [[TMP28:%.*]] = xor i1 [[TMP26]], true
+; NOVEC-NEXT:    [[TMP29:%.*]] = xor i1 [[TMP27]], true
+; NOVEC-NEXT:    [[TMP30:%.*]] = xor i1 [[TMP28]], true
+; NOVEC-NEXT:    [[TMP31:%.*]] = or i1 [[TMP29]], [[TMP30]]
+; NOVEC-NEXT:    [[TMP32:%.*]] = extractelement <16 x i1> [[TMP0]], i32 6
+; NOVEC-NEXT:    [[TMP33:%.*]] = xor i1 [[TMP31]], true
+; NOVEC-NEXT:    [[TMP34:%.*]] = xor i1 [[TMP32]], true
+; NOVEC-NEXT:    [[TMP35:%.*]] = xor i1 [[TMP33]], true
+; NOVEC-NEXT:    [[TMP36:%.*]] = xor i1 [[TMP34]], true
+; NOVEC-NEXT:    [[TMP37:%.*]] = or i1 [[TMP35]], [[TMP36]]
+; NOVEC-NEXT:    [[TMP38:%.*]] = extractelement <16 x i1> [[TMP0]], i32 7
+; NOVEC-NEXT:    [[TMP39:%.*]] = xor i1 [[TMP37]], true
+; NOVEC-NEXT:    [[TMP40:%.*]] = xor i1 [[TMP38]], true
+; NOVEC-NEXT:    [[TMP41:%.*]] = xor i1 [[TMP39]], true
+; NOVEC-NEXT:    [[TMP42:%.*]] = xor i1 [[TMP40]], true
+; NOVEC-NEXT:    [[TMP43:%.*]] = or i1 [[TMP41]], [[TMP42]]
+; NOVEC-NEXT:    [[TMP44:%.*]] = extractelement <16 x i1> [[TMP0]], i32 8
+; NOVEC-NEXT:    [[TMP45:%.*]] = xor i1 [[TMP43]], true
+; NOVEC-NEXT:    [[TMP46:%.*]] = xor i1 [[TMP44]], true
+; NOVEC-NEXT:    [[TMP47:%.*]] = xor i1 [[TMP45]], true
+; NOVEC-NEXT:    [[TMP48:%.*]] = xor i1 [[TMP46]], true
+; NOVEC-NEXT:    [[TMP49:%.*]] = or i1 [[TMP47]], [[TMP48]]
+; NOVEC-NEXT:    [[TMP50:%.*]] = extractelement <16 x i1> [[TMP0]], i32 9
+; NOVEC-NEXT:    [[TMP51:%.*]] = xor i1 [[TMP49]], true
+; NOVEC-NEXT:    [[TMP52:%.*]] = xor i1 [[TMP50]], true
+; NOVEC-NEXT:    [[TMP53:%.*]] = xor i1 [[TMP51]], true
+; NOVEC-NEXT:    [[TMP54:%.*]] = xor i1 [[TMP52]], true
+; NOVEC-NEXT:    [[TMP55:%.*]] = or i1 [[TMP53]], [[TMP54]]
+; NOVEC-NEXT:    [[TMP56:%.*]] = extractelement <16 x i1> [[TMP0]], i32 10
+; NOVEC-NEXT:    [[TMP57:%.*]] = xor i1 [[TMP55]], true
+; NOVEC-NEXT:    [[TMP58:%.*]] = xor i1 [[TMP56]], true
+; NOVEC-NEXT:    [[TMP59:%.*]] = xor i1 [[TMP57]], true
+; NOVEC-NEXT:    [[TMP60:%.*]] = xor i1 [[TMP58]], true
+; NOVEC-NEXT:    [[TMP61:%.*]] = or i1 [[TMP59]], [[TMP60]]
+; NOVEC-NEXT:    [[TMP62:%.*]] = extractelement <16 x i1> [[TMP0]], i32 11
+; NOVEC-NEXT:    [[TMP63:%.*]] = xor i1 [[TMP61]], true
+; NOVEC-NEXT:    [[TMP64:%.*]] = xor i1 [[TMP62]], true
+; NOVEC-NEXT:    [[TMP65:%.*]] = xor i1 [[TMP63]], true
+; NOVEC-NEXT:    [[TMP66:%.*]] = xor i1 [[TMP64]], true
+; NOVEC-NEXT:    [[TMP67:%.*]] = or i1 [[TMP65]], [[TMP66]]
+; NOVEC-NEXT:    [[TMP68:%.*]] = extractelement <16 x i1> [[TMP0]], i32 12
+; NOVEC-NEXT:    [[TMP69:%.*]] = xor i1 [[TMP67]], true
+; NOVEC-NEXT:    [[TMP70:%.*]] = xor i1 [[TMP68]], true
+; NOVEC-NEXT:    [[TMP71:%.*]] = xor i1 [[TMP69]], true
+; NOVEC-NEXT:    [[TMP72:%.*]] = xor i1 [[TMP70]], true
+; NOVEC-NEXT:    [[TMP73:%.*]] = or i1 [[TMP71]], [[TMP72]]
+; NOVEC-NEXT:    [[TMP74:%.*]] = extractelement <16 x i1> [[TMP0]], i32 13
+; NOVEC-NEXT:    [[TMP75:%.*]] = xor i1 [[TMP73]], true
+; NOVEC-NEXT:    [[TMP76:%.*]] = xor i1 [[TMP74]], true
+; NOVEC-NEXT:    [[TMP77:%.*]] = xor i1 [[TMP75]], true
+; NOVEC-NEXT:    [[TMP78:%.*]] = xor i1 [[TMP76]], true
+; NOVEC-NEXT:    [[TMP79:%.*]] = or i1 [[TMP77]], [[TMP78]]
+; NOVEC-NEXT:    [[TMP80:%.*]] = extractelement <16 x i1> [[TMP0]], i32 14
+; NOVEC-NEXT:    [[TMP81:%.*]] = xor i1 [[TMP79]], true
+; NOVEC-NEXT:    [[TMP82:%.*]] = xor i1 [[TMP80]], true
+; NOVEC-NEXT:    [[TMP83:%.*]] = xor i1 [[TMP81]], true
+; NOVEC-NEXT:    [[TMP84:%.*]] = xor i1 [[TMP82]], true
+; NOVEC-NEXT:    [[TMP85:%.*]] = or i1 [[TMP83]], [[TMP84]]
+; NOVEC-NEXT:    [[TMP86:%.*]] = extractelement <16 x i1> [[TMP0]], i32 15
+; NOVEC-NEXT:    [[TMP87:%.*]] = xor i1 [[TMP85]], true
+; NOVEC-NEXT:    [[TMP88:%.*]] = xor i1 [[TMP86]], true
+; NOVEC-NEXT:    [[TMP89:%.*]] = xor i1 [[TMP87]], true
+; NOVEC-NEXT:    [[TMP90:%.*]] = xor i1 [[TMP88]], true
+; NOVEC-NEXT:    [[TMP91:%.*]] = or i1 [[TMP89]], [[TMP90]]
+; NOVEC-NEXT:    br i1 [[TMP91]], label [[TMP92:%.*]], label [[MIDDLE_BLOCK:%.*]]
+; NOVEC:       92:
+; NOVEC-NEXT:    store i1 false, ptr [[P]], align 1
+; NOVEC-NEXT:    br label [[MIDDLE_BLOCK]]
+; NOVEC:       middle.block:
+; NOVEC-NEXT:    [[INDEX_NEXT:%.*]] = add i32 0, 16
+; NOVEC-NEXT:    ret void
+;
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE30:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE30]] ]
+; CHECK-NEXT:    store i1 false, ptr [[P]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 10
+; CHECK-NEXT:    br i1 [[TMP0]], label [[EXIT:%.*]], label [[PRED_STORE_CONTINUE30]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ]
+  %trunc = trunc i40 0 to i32
+  %icmp.eq = icmp eq i32 %trunc, 0
+  %zext = zext i1 %icmp.eq to i32
+  %icmp.ult = icmp ult i32 0, %zext
+  %or = or i1 %icmp.ult, true
+  %icmp.sgt = icmp sgt i1 %or, false
+  store i1 %icmp.sgt, ptr %p, align 1
+  %iv.next = add i32 %iv, 1
+  %cond = icmp ult i32 %iv.next, 10
+  br i1 %cond, label %for.body, label %exit
+
+exit:                                             ; preds = %for.body
+  ret void
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+;.

``````````

</details>


https://github.com/llvm/llvm-project/pull/91136


More information about the llvm-commits mailing list