[llvm] [LV] Avoid querying the cost of invalid operations (PR #89161)

Wed Apr 17 18:33:14 PDT 2024

https://github.com/arcbbb created https://github.com/llvm/llvm-project/pull/89161

In issue #88802, the LV cost model would query the cost of the TRUNC for source type 2xi1 and destination type 2xi32 after minimal bitwidth truncation.

This patch adds a check and returns a cost of 0 for invalid cast operations.

>From 96a5d3d8865ce9e535f8a040975ff982959ae04b Mon Sep 17 00:00:00 2001
From: ShihPo Hung <shihpo.hung at sifive.com>
Date: Tue, 16 Apr 2024 00:59:45 -0700
Subject: [PATCH] [LV] Avoid querying the cost of invalid operations

In issue #88802, the LV cost model would query the cost of the TRUNC
for source type 2xi1 and destination type 2xi32 after minimal bitwidth
truncation.

This patch adds a check and returns a cost of 0 for invalid cast operations.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  4 +
 .../RISCV/cost-on-invalid-cast.ll             | 93 +++++++++++++++++++
 2 files changed, 97 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/cost-on-invalid-cast.ll

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0834865173b2f1..de02dc9171fec6 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7067,6 +7067,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
         VectorTy =
             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
+        // If the source is no wider than the destination, there is nothing to
+        // truncate, so report a cost of 0.
+        if (SrcVecTy->getScalarSizeInBits() <= VectorTy->getScalarSizeInBits())
+          return 0;
       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
         // Leave SrcVecTy unchanged - we only shrink the destination element
         // type.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/cost-on-invalid-cast.ll b/llvm/test/Transforms/LoopVectorize/RISCV/cost-on-invalid-cast.ll
new file mode 100644
index 00000000000000..a502a219940635
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/cost-on-invalid-cast.ll
@@ -0,0 +1,93 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=loop-vectorize -mtriple=riscv64 -mattr=+v -S 2>&1 | FileCheck %s
+
+; The TTI cost model might trigger an assertion when LV tries to query the cost of an invalid cast operation.
+define void @c() {
+; CHECK-LABEL: define void @c(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[TMP0]], 2
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i32 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 1
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 2, [[TMP4]]
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], 2
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 [[TMP7]], i32 2)
+; CHECK-NEXT:    [[TMP8:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> zeroinitializer, <vscale x 2 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = or <vscale x 2 x i1> [[TMP9]], [[TMP8]]
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP9]], <vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = trunc <vscale x 2 x i32> [[PREDPHI]] to <vscale x 2 x i8>
+; CHECK-NEXT:    [[TMP12:%.*]] = and <vscale x 2 x i8> [[TMP11]], zeroinitializer
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv2i8.nxv2p0(<vscale x 2 x i8> [[TMP12]], <vscale x 2 x ptr> zeroinitializer, i32 1, <vscale x 2 x i1> [[TMP10]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP6]]
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_COND:%.*]]
+; CHECK:       for.cond:
+; CHECK-NEXT:    [[F_0:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[COND_END:%.*]] ]
+; CHECK-NEXT:    [[ADD]] = add i32 [[F_0]], 1
+; CHECK-NEXT:    br i1 false, label [[COND_FALSE:%.*]], label [[COND_TRUE:%.*]]
+; CHECK:       cond.true:
+; CHECK-NEXT:    [[CONV10:%.*]] = trunc i64 0 to i32
+; CHECK-NEXT:    br label [[COND_END]]
+; CHECK:       cond.false:
+; CHECK-NEXT:    [[TOBOOL15:%.*]] = zext i8 0 to i32
+; CHECK-NEXT:    br label [[COND_END]]
+; CHECK:       cond.end:
+; CHECK-NEXT:    [[COND:%.*]] = phi i32 [ [[CONV10]], [[COND_TRUE]] ], [ 0, [[COND_FALSE]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = trunc i32 [[COND]] to i8
+; CHECK-NEXT:    [[CONV17:%.*]] = and i8 [[TMP14]], 0
+; CHECK-NEXT:    store i8 [[CONV17]], ptr null, align 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[F_0]], 1
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_COND]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.cond
+
+for.cond:
+  %f.0 = phi i32 [ 0, %entry ], [ %add, %cond.end ]
+  %add = add i32 %f.0, 1
+  br i1 false, label %cond.false, label %cond.true
+
+cond.true:
+  %conv10 = trunc i64 0 to i32
+  br label %cond.end
+
+cond.false:
+  %tobool15 = zext i8 0 to i32
+  br label %cond.end
+
+cond.end:
+  %cond = phi i32 [ %conv10, %cond.true ], [ 0, %cond.false ]
+  %0 = trunc i32 %cond to i8
+  %conv17 = and i8 %0, 0
+  store i8 %conv17, ptr null, align 1
+  %cmp = icmp slt i32 %f.0, 1
+  br i1 %cmp, label %for.cond, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.