[llvm] [LV] Disable MinBW when the entire chain are cast/load instructions. (PR #117330)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 22 06:23:14 PST 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
Author: Elvis Wang (ElvisWang123)
<details>
<summary>Changes</summary>
If all instructions in the MinBW chain are cast/load instructions, we cannot get any benefit from narrowing the type, since no other users use it.
This also causes the legacy cost model to compute the wrong cost when the MinBW analysis indicates that the type can be narrowed but the narrowing is not actually performed by the subsequent VPlan optimizations.
This patch also fixes #<!-- -->115744, in which MinBW found that `%3 = trunc i64 %2 to i32` can be further narrowed to `i8`.
However, the user of the `trunc` is a `PHI`, which remains `i32`, so the VPlan transform cannot narrow the type.
The legacy cost model nevertheless uses the information from MinBW and computes a cost that disagrees with the VPlan-based cost model.
---
Full diff: https://github.com/llvm/llvm-project/pull/117330.diff
2 Files Affected:
- (modified) llvm/lib/Analysis/VectorUtils.cpp (+7)
- (added) llvm/test/Transforms/LoopVectorize/trunc-cast.ll (+162)
``````````diff
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 1789671276ffaf..6102e54586b4c7 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -742,6 +742,13 @@ llvm::computeMinimumValueSizes(ArrayRef<BasicBlock *> Blocks, DemandedBits &DB,
Abort = true;
break;
}
+
+ // If all of instructions in the chain are load and cast instructions, we
+ // cannot get any benefit from MinBW.
+ if (all_of(llvm::make_range(ECs.member_begin(I), ECs.member_end()),
+ [](Value *M) { return isa<CastInst, LoadInst>(M); }))
+ Abort = true;
+
if (Abort)
continue;
diff --git a/llvm/test/Transforms/LoopVectorize/trunc-cast.ll b/llvm/test/Transforms/LoopVectorize/trunc-cast.ll
new file mode 100644
index 00000000000000..b76561d5584eb3
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/trunc-cast.ll
@@ -0,0 +1,162 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt --passes=loop-vectorize -S %s -mtriple riscv64 -mattr=+v | FileCheck %s
+target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
+
+define void @h(ptr %i, ptr %k, i64 %idxprom.us) #0 {
+; CHECK-LABEL: define void @h(
+; CHECK-SAME: ptr [[I:%.*]], ptr [[K:%.*]], i64 [[IDXPROM_US:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 2
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i32 1073741824, [[TMP1]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 2
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 1073741824, [[TMP3]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[N_MOD_VF]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP3]], i32 [[N_MOD_VF]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 1073741824, [[TMP5]]
+; CHECK-NEXT: [[IND_END:%.*]] = mul i32 [[N_VEC]], 4
+; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 2
+; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
+; CHECK-NEXT: [[TMP9:%.*]] = add <vscale x 2 x i32> [[TMP8]], zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = mul <vscale x 2 x i32> [[TMP9]], splat (i32 4)
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i32> zeroinitializer, [[TMP10]]
+; CHECK-NEXT: [[TMP11:%.*]] = mul i32 4, [[TMP7]]
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP11]], i64 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[DOTSPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP13:%.*]] = mul i32 [[TMP12]], 2
+; CHECK-NEXT: [[TMP14:%.*]] = sub i32 [[TMP13]], 1
+; CHECK-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 2 x i8> poison, i8 0, i32 [[TMP14]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[I]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 2 x i8> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[TMP25:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP15:%.*]] = zext <vscale x 2 x i32> [[VEC_IND]] to <vscale x 2 x i64>
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr [2 x i16], ptr [[K]], <vscale x 2 x i64> [[TMP15]], i64 [[IDXPROM_US]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16.nxv2p0(<vscale x 2 x ptr> [[TMP16]], i32 2, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i16> poison)
+; CHECK-NEXT: [[TMP17:%.*]] = icmp eq <vscale x 2 x i16> [[WIDE_MASKED_GATHER]], zeroinitializer
+; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT]], i32 8, <vscale x 2 x i1> [[TMP17]], <vscale x 2 x i64> poison)
+; CHECK-NEXT: [[TMP18:%.*]] = shl <vscale x 2 x i64> zeroinitializer, [[WIDE_MASKED_GATHER1]]
+; CHECK-NEXT: [[TMP19:%.*]] = icmp eq <vscale x 2 x i64> [[TMP18]], zeroinitializer
+; CHECK-NEXT: [[TMP20:%.*]] = xor <vscale x 2 x i1> [[TMP19]], splat (i1 true)
+; CHECK-NEXT: [[TMP21:%.*]] = select <vscale x 2 x i1> [[TMP17]], <vscale x 2 x i1> [[TMP20]], <vscale x 2 x i1> zeroinitializer
+; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[BROADCAST_SPLAT]], i32 8, <vscale x 2 x i1> [[TMP21]], <vscale x 2 x i64> poison)
+; CHECK-NEXT: [[TMP22:%.*]] = trunc <vscale x 2 x i64> [[WIDE_MASKED_GATHER2]] to <vscale x 2 x i32>
+; CHECK-NEXT: [[TMP23:%.*]] = xor <vscale x 2 x i1> [[TMP17]], splat (i1 true)
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP21]], <vscale x 2 x i32> [[TMP22]], <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[PREDPHI3:%.*]] = select <vscale x 2 x i1> [[TMP23]], <vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> [[PREDPHI]]
+; CHECK-NEXT: [[TMP24:%.*]] = trunc <vscale x 2 x i32> [[PREDPHI3]] to <vscale x 2 x i8>
+; CHECK-NEXT: [[TMP25]] = mul <vscale x 2 x i8> zeroinitializer, [[TMP24]]
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr [2 x i64], ptr [[I]], i64 0, <vscale x 2 x i64> [[TMP15]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP26]], i32 8, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> poison)
+; CHECK-NEXT: [[TMP27:%.*]] = trunc <vscale x 2 x i64> [[WIDE_MASKED_GATHER4]] to <vscale x 2 x i16>
+; CHECK-NEXT: [[TMP28:%.*]] = xor <vscale x 2 x i16> [[TMP27]], zeroinitializer
+; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP30:%.*]] = mul i32 [[TMP29]], 2
+; CHECK-NEXT: [[TMP31:%.*]] = sub i32 [[TMP30]], 1
+; CHECK-NEXT: [[TMP32:%.*]] = extractelement <vscale x 2 x i16> [[TMP28]], i32 [[TMP31]]
+; CHECK-NEXT: store i16 [[TMP32]], ptr null, align 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP7]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; CHECK-NEXT: [[TMP33:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP33]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP34:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP35:%.*]] = mul i32 [[TMP34]], 2
+; CHECK-NEXT: [[TMP36:%.*]] = sub i32 [[TMP35]], 1
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 2 x i8> [[TMP25]], i32 [[TMP36]]
+; CHECK-NEXT: br label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i8 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[FOR_BODY_US:.*]]
+; CHECK: [[FOR_BODY_US]]:
+; CHECK-NEXT: [[L_046_US:%.*]] = phi i32 [ [[ADD_US:%.*]], %[[COND_END23_US:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[CONV284345_US:%.*]] = phi i8 [ [[CONV28_US:%.*]], %[[COND_END23_US]] ], [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[IDXPROM_US1:%.*]] = zext i32 [[L_046_US]] to i64
+; CHECK-NEXT: [[ARRAYIDX3_US:%.*]] = getelementptr [2 x i16], ptr [[K]], i64 [[IDXPROM_US1]], i64 [[IDXPROM_US]]
+; CHECK-NEXT: [[TMP37:%.*]] = load i16, ptr [[ARRAYIDX3_US]], align 2
+; CHECK-NEXT: [[TOBOOL4_NOT_US:%.*]] = icmp eq i16 [[TMP37]], 0
+; CHECK-NEXT: br i1 [[TOBOOL4_NOT_US]], label %[[COND_FALSE7_US:.*]], label %[[COND_END23_US]]
+; CHECK: [[COND_FALSE7_US]]:
+; CHECK-NEXT: [[TMP38:%.*]] = load i64, ptr [[I]], align 8
+; CHECK-NEXT: [[SHL_US:%.*]] = shl i64 0, [[TMP38]]
+; CHECK-NEXT: [[TOBOOL12_NOT_US:%.*]] = icmp eq i64 [[SHL_US]], 0
+; CHECK-NEXT: br i1 [[TOBOOL12_NOT_US]], label %[[COND_END23_US]], label %[[COND_TRUE14_US:.*]]
+; CHECK: [[COND_TRUE14_US]]:
+; CHECK-NEXT: [[TMP39:%.*]] = load i64, ptr [[I]], align 8
+; CHECK-NEXT: [[TMP40:%.*]] = trunc i64 [[TMP39]] to i32
+; CHECK-NEXT: br label %[[COND_END23_US]]
+; CHECK: [[COND_END23_US]]:
+; CHECK-NEXT: [[COND24_US:%.*]] = phi i32 [ [[TMP40]], %[[COND_TRUE14_US]] ], [ 0, %[[FOR_BODY_US]] ], [ 0, %[[COND_FALSE7_US]] ]
+; CHECK-NEXT: [[TMP41:%.*]] = trunc i32 [[COND24_US]] to i8
+; CHECK-NEXT: [[CONV28_US]] = mul i8 0, [[TMP41]]
+; CHECK-NEXT: [[ARRAYIDX31_US:%.*]] = getelementptr [2 x i64], ptr [[I]], i64 0, i64 [[IDXPROM_US1]]
+; CHECK-NEXT: [[TMP42:%.*]] = load i64, ptr [[ARRAYIDX31_US]], align 8
+; CHECK-NEXT: [[TMP43:%.*]] = trunc i64 [[TMP42]] to i16
+; CHECK-NEXT: [[CONV32_US:%.*]] = xor i16 [[TMP43]], 0
+; CHECK-NEXT: store i16 [[CONV32_US]], ptr null, align 2
+; CHECK-NEXT: [[ADD_US]] = add i32 [[L_046_US]], 4
+; CHECK-NEXT: [[TOBOOL_NOT_US:%.*]] = icmp eq i32 [[ADD_US]], 0
+; CHECK-NEXT: br i1 [[TOBOOL_NOT_US]], label %[[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:.*]], label %[[FOR_BODY_US]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE]]:
+; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i8 [ [[TMP41]], %[[COND_END23_US]] ]
+; CHECK-NEXT: store i8 [[DOTLCSSA]], ptr null, align 1
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.body.us
+
+for.body.us: ; preds = %cond.end23.us, %entry
+ %l.046.us = phi i32 [ %add.us, %cond.end23.us ], [ 0, %entry ]
+ %conv284345.us = phi i8 [ %conv28.us, %cond.end23.us ], [ 0, %entry ]
+ %idxprom.us1 = zext i32 %l.046.us to i64
+ %arrayidx3.us = getelementptr [2 x i16], ptr %k, i64 %idxprom.us1, i64 %idxprom.us
+ %0 = load i16, ptr %arrayidx3.us, align 2
+ %tobool4.not.us = icmp eq i16 %0, 0
+ br i1 %tobool4.not.us, label %cond.false7.us, label %cond.end23.us
+
+cond.false7.us: ; preds = %for.body.us
+ %1 = load i64, ptr %i, align 8
+ %shl.us = shl i64 0, %1
+ %tobool12.not.us = icmp eq i64 %shl.us, 0
+ br i1 %tobool12.not.us, label %cond.end23.us, label %cond.true14.us
+
+cond.true14.us: ; preds = %cond.false7.us
+ %2 = load i64, ptr %i, align 8
+ %3 = trunc i64 %2 to i32
+ br label %cond.end23.us
+
+cond.end23.us: ; preds = %cond.true14.us, %cond.false7.us, %for.body.us
+ %cond24.us = phi i32 [ %3, %cond.true14.us ], [ 0, %for.body.us ], [ 0, %cond.false7.us ]
+ %4 = trunc i32 %cond24.us to i8
+ %conv28.us = mul i8 0, %4
+ %arrayidx31.us = getelementptr [2 x i64], ptr %i, i64 0, i64 %idxprom.us1
+ %5 = load i64, ptr %arrayidx31.us, align 8
+ %6 = trunc i64 %5 to i16
+ %conv32.us = xor i16 %6, 0
+ store i16 %conv32.us, ptr null, align 2
+ %add.us = add i32 %l.046.us, 4
+ %tobool.not.us = icmp eq i32 %add.us, 0
+ br i1 %tobool.not.us, label %for.cond.for.cond.cleanup_crit_edge, label %for.body.us
+
+for.cond.for.cond.cleanup_crit_edge: ; preds = %cond.end23.us
+ store i8 %4, ptr null, align 1
+ ret void
+}
+
+attributes #0 = { "target-features"="+64bit,+a,+c,+d,+f,+m,+relax,+v,+zicsr,+zifencei,+zmmul,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b,-b,-e,-experimental-smctr,-experimental-ssctr,-experimental-zalasr,-experimental-zicfilp,-experimental-zicfiss,-experimental-zvbc32e,-experimental-zvkgs,-h,-sha,-shcounterenw,-shgatpa,-shtvala,-shvsatpa,-shvstvala,-shvstvecd,-smaia,-smcdeleg,-smcsrind,-smdbltrp,-smepmp,-smmpm,-smnpm,-smrnmi,-smstateen,-ssaia,-ssccfg,-ssccptr,-sscofpmf,-sscounterenw,-sscsrind,-ssdbltrp,-ssnpm,-sspm,-ssqosid,-ssstateen,-ssstrict,-sstc,-sstvala,-sstvecd,-ssu64xl,-supm,-svade,-svadu,-svbare,-svinval,-svnapot,-svpbmt,-svvptc,-xcvalu,-xcvbi,-xcvbitmanip,-xcvelw,-xcvmac,-xcvmem,-xcvsimd,-xsfcease,-xsfvcp,-xsfvfnrclipxfqf,-xsfvfwmaccqqq,-xsfvqmaccdod,-xsfvqmaccqoq,-xsifivecdiscarddlone,-xsifivecflushdlone,-xtheadba,-xtheadbb,-xtheadbs,-xtheadcmo,-xtheadcondmov,-xtheadfmemidx,-xtheadmac,-xtheadmemidx,-xtheadmempair,-xtheadsync,-xtheadvdot,-xventanacondops,-xwchc,-za128rs,-za64rs,-zaamo,-zabha,-zacas,-zalrsc,-zama16b,-zawrs,-zba,-zbb,-zbc,-zbkb,-zbkc,-zbkx,-zbs,-zca,-zcb,-zcd,-zce,-zcf,-zcmop,-zcmp,-zcmt,-zdinx,-zfa,-zfbfmin,-zfh,-zfhmin,-zfinx,-zhinx,-zhinxmin,-zic64b,-zicbom,-zicbop,-zicboz,-ziccamoa,-ziccif,-zicclsm,-ziccrse,-zicntr,-zicond,-zihintntl,-zihintpause,-zihpm,-zimop,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-zkt,-ztso,-zvbb,-zvbc,-zvfbfmin,-zvfbfwma,-zvfh,-zvfhmin,-zvkb,-zvkg,-zvkn,-zvknc,-zvkned,-zvkng,-zvknha,-zvknhb,-zvks,-zvksc,-zvksed,-zvksg,-zvksh,-zvkt,-zvl1024b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl4096b,-zvl512b,-zvl65536b,-zvl8192b" }
+
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
``````````
</details>
https://github.com/llvm/llvm-project/pull/117330
More information about the llvm-commits
mailing list