[llvm] [VPlan][LoopVectorize] Truncate min/max intrinsic ops (PR #90643)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Apr 30 11:14:45 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-llvm-analysis
@llvm/pr-subscribers-llvm-transforms
Author: Patrick O'Neill (patrick-rivos)
Changes:
This adds truncation support for calls to intrinsics that DemandedBits understands (umax, umin, smax, smin, fshl, fshr, cttz, ctlz, bitreverse, and bswap), so the vectorizer can narrow them to their minimal bitwidth.
Fixes #87407.
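For context, this is the analysis fact the patch builds on: for a chain like `zext i8 -> umax i64 -> trunc i8`, DemandedBits already reports that only the low 8 bits of the intrinsic result are demanded. A minimal sketch of querying that fact (the helper name `minDemandedWidth` and the surrounding setup are illustrative, not part of the patch):

```cpp
#include "llvm/ADT/APInt.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

// Given e.g.
//   %m = call i64 @llvm.umax.i64(i64 %z, i64 0)
//   %t = trunc i64 %m to i8
// DemandedBits reports only the low 8 bits of %m as demanded, which is the
// property computeMinimumValueSizes can now exploit for these intrinsics.
static unsigned minDemandedWidth(Instruction *I, Function &F,
                                 AssumptionCache &AC, DominatorTree &DT) {
  DemandedBits DB(F, AC, DT);
  APInt Demanded = DB.getDemandedBits(I); // e.g. 0x00000000000000ff
  return Demanded.getActiveBits();        // 8 for the chain above
}
```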
---
Patch is 62.99 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/90643.diff
9 Files Affected:
- (modified) llvm/lib/Analysis/VectorUtils.cpp (+32)
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+3-3)
- (modified) llvm/lib/Transforms/Vectorize/VPlan.h (+13-5)
- (modified) llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp (+3-6)
- (modified) llvm/lib/Transforms/Vectorize/VPlanAnalysis.h (-1)
- (modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+3-3)
- (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+12-5)
- (added) llvm/test/Transforms/LoopVectorize/pr87407-truncate-intrinsics.ll (+1017)
- (modified) llvm/unittests/Transforms/Vectorize/VPlanTest.cpp (+6-5)
``````````diff
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 917094267d05ae..7d49fc719c8bac 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -611,6 +611,14 @@ llvm::computeMinimumValueSizes(ArrayRef<BasicBlock *> Blocks, DemandedBits &DB,
!InstructionSet.count(I))
continue;
+ // Byte swaps require at least 16 bits, so keep the low 16 bits demanded.
+ if (const auto *II = dyn_cast<IntrinsicInst>(I)) {
+ if (II->getIntrinsicID() == Intrinsic::bswap) {
+ DBits[Leader] |= 0xFFFF;
+ DBits[I] |= 0xFFFF;
+ }
+ }
+
// Unsafe casts terminate a chain unsuccessfully. We can't do anything
// useful with bitcasts, ptrtoints or inttoptrs and it'd be unsafe to
// transform anything that relies on them.
@@ -687,6 +695,30 @@ llvm::computeMinimumValueSizes(ArrayRef<BasicBlock *> Blocks, DemandedBits &DB,
isa<ShlOperator, LShrOperator, AShrOperator>(U.getUser()) &&
U.getOperandNo() == 1)
return CI->uge(MinBW);
+ // Ignore the called-function pointer operand when considering intrinsics
+ // that DemandedBits understands.
+ if (U->getType()->isPointerTy() && isa<CallInst>(U.getUser()) &&
+ cast<CallInst>(U.getUser())->getCalledFunction() ==
+ dyn_cast<Function>(U)) {
+ if (const auto *II = dyn_cast<IntrinsicInst>(U.getUser())) {
+ // Only ignore cases that DemandedBits understands.
+ switch (II->getIntrinsicID()) {
+ default:
+ break;
+ case Intrinsic::umax:
+ case Intrinsic::umin:
+ case Intrinsic::smax:
+ case Intrinsic::smin:
+ case Intrinsic::fshl:
+ case Intrinsic::fshr:
+ case Intrinsic::cttz:
+ case Intrinsic::ctlz:
+ case Intrinsic::bitreverse:
+ case Intrinsic::bswap:
+ return false;
+ }
+ }
+ }
uint64_t BW = bit_width(DB.getDemandedBits(&U).getZExtValue());
return bit_ceil(BW) > MinBW;
}))
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 33c4decd58a6c2..27f94974b70425 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8276,7 +8276,7 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
Range);
if (ShouldUseVectorIntrinsic)
return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID,
- CI->getDebugLoc());
+ CI->getType(), CI->getDebugLoc());
Function *Variant = nullptr;
std::optional<unsigned> MaskPos;
@@ -8329,8 +8329,8 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
}
return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()),
- Intrinsic::not_intrinsic, CI->getDebugLoc(),
- Variant);
+ Intrinsic::not_intrinsic, CI->getType(),
+ CI->getDebugLoc(), Variant);
}
return nullptr;
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index c74329a0bcc4ac..936942c5883e2b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1455,21 +1455,24 @@ class VPWidenCallRecipe : public VPSingleDefRecipe {
/// chosen vectorized variant, so there will be a different vplan for each
/// VF with a valid variant.
Function *Variant;
+ /// Scalar result type of the widened call.
+ Type *ResultTy;
public:
template <typename IterT>
VPWidenCallRecipe(CallInst &I, iterator_range<IterT> CallArguments,
- Intrinsic::ID VectorIntrinsicID, DebugLoc DL = {},
- Function *Variant = nullptr)
+ Intrinsic::ID VectorIntrinsicID, Type *ResultTy,
+ DebugLoc DL = {}, Function *Variant = nullptr)
: VPSingleDefRecipe(VPDef::VPWidenCallSC, CallArguments, &I, DL),
- VectorIntrinsicID(VectorIntrinsicID), Variant(Variant) {}
+ VectorIntrinsicID(VectorIntrinsicID), Variant(Variant),
+ ResultTy(ResultTy) {}
~VPWidenCallRecipe() override = default;
VPWidenCallRecipe *clone() override {
return new VPWidenCallRecipe(*cast<CallInst>(getUnderlyingInstr()),
- operands(), VectorIntrinsicID, getDebugLoc(),
- Variant);
+ operands(), VectorIntrinsicID, ResultTy,
+ getDebugLoc(), Variant);
}
VP_CLASSOF_IMPL(VPDef::VPWidenCallSC)
@@ -1482,6 +1485,11 @@ class VPWidenCallRecipe : public VPSingleDefRecipe {
void print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const override;
#endif
+
+ /// Returns the scalar result type of the widened call.
+ Type *getResultType() const { return ResultTy; }
+ /// Sets the scalar result type of the widened call.
+ void setResultType(Type *NewResTy) { ResultTy = NewResTy; }
};
/// A recipe for widening select instructions.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 5f93339083f0c2..e96191fb58e786 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -110,11 +110,6 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenRecipe *R) {
llvm_unreachable("Unhandled opcode!");
}
-Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenCallRecipe *R) {
- auto &CI = *cast<CallInst>(R->getUnderlyingInstr());
- return CI.getType();
-}
-
Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R) {
assert((isa<VPWidenLoadRecipe>(R) || isa<VPWidenLoadEVLRecipe>(R)) &&
"Store recipes should not define any values");
@@ -238,7 +233,7 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
return inferScalarType(R->getOperand(0));
})
.Case<VPBlendRecipe, VPInstruction, VPWidenRecipe, VPReplicateRecipe,
- VPWidenCallRecipe, VPWidenMemoryRecipe, VPWidenSelectRecipe>(
+ VPWidenMemoryRecipe, VPWidenSelectRecipe>(
[this](const auto *R) { return inferScalarTypeForRecipe(R); })
.Case<VPInterleaveRecipe>([V](const VPInterleaveRecipe *R) {
// TODO: Use info from interleave group.
@@ -248,6 +243,8 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
[](const VPWidenCastRecipe *R) { return R->getResultType(); })
.Case<VPScalarCastRecipe>(
[](const VPScalarCastRecipe *R) { return R->getResultType(); })
+ .Case<VPWidenCallRecipe>(
+ [](const VPWidenCallRecipe *R) { return R->getResultType(); })
.Case<VPExpandSCEVRecipe>([](const VPExpandSCEVRecipe *R) {
return R->getSCEV()->getType();
});
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
index 7d310b1b31b6fe..c41beb79ba9a5e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
@@ -43,7 +43,6 @@ class VPTypeAnalysis {
Type *inferScalarTypeForRecipe(const VPBlendRecipe *R);
Type *inferScalarTypeForRecipe(const VPInstruction *R);
- Type *inferScalarTypeForRecipe(const VPWidenCallRecipe *R);
Type *inferScalarTypeForRecipe(const VPWidenRecipe *R);
Type *inferScalarTypeForRecipe(const VPWidenIntOrFpInductionRecipe *R);
Type *inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 9ec422ec002c82..392db4a1f12652 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -717,7 +717,7 @@ void VPWidenCallRecipe::execute(VPTransformState &State) {
if (UseIntrinsic &&
isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1))
TysForDecl.push_back(
- VectorType::get(CI.getType()->getScalarType(), State.VF));
+ VectorType::get(getResultType()->getScalarType(), State.VF));
SmallVector<Value *, 4> Args;
for (const auto &I : enumerate(operands())) {
// Some intrinsics have a scalar argument - don't replace it with a
@@ -770,14 +770,14 @@ void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "WIDEN-CALL ";
- auto *CI = cast<CallInst>(getUnderlyingInstr());
- if (CI->getType()->isVoidTy())
+ if (getResultType()->isVoidTy())
O << "void ";
else {
printAsOperand(O, SlotTracker);
O << " = ";
}
+ auto *CI = cast<CallInst>(getUnderlyingInstr());
O << "call @" << CI->getCalledFunction()->getName() << "(";
printOperands(O, SlotTracker);
O << ")";
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index d7bc128dcfe634..cfc1db03008ba5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -74,9 +74,10 @@ void VPlanTransforms::VPInstructionsToVPRecipes(
} else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());
} else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
- NewRecipe = new VPWidenCallRecipe(
- *CI, drop_end(Ingredient.operands()),
- getVectorIntrinsicIDForCall(CI, &TLI), CI->getDebugLoc());
+ NewRecipe =
+ new VPWidenCallRecipe(*CI, drop_end(Ingredient.operands()),
+ getVectorIntrinsicIDForCall(CI, &TLI),
+ CI->getType(), CI->getDebugLoc());
} else if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands());
} else if (auto *CI = dyn_cast<CastInst>(Inst)) {
@@ -975,8 +976,8 @@ void VPlanTransforms::truncateToMinimalBitwidths(
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_deep(Plan.getVectorLoopRegion()))) {
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
- if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPReplicateRecipe,
- VPWidenSelectRecipe, VPWidenLoadRecipe>(&R))
+ if (!isa<VPWidenRecipe, VPWidenCallRecipe, VPWidenCastRecipe,
+ VPReplicateRecipe, VPWidenSelectRecipe, VPWidenLoadRecipe>(&R))
continue;
VPValue *ResultVPV = R.getVPSingleValue();
@@ -1082,6 +1083,10 @@ void VPlanTransforms::truncateToMinimalBitwidths(
}
}
+ // If this was a WIDEN-CALL (intrinsic), update its result type so it
+ // stays consistent with the newly narrowed operands.
+ if (auto *WidenCallR = dyn_cast<VPWidenCallRecipe>(&R))
+ WidenCallR->setResultType(NewResTy);
}
}
diff --git a/llvm/test/Transforms/LoopVectorize/pr87407-truncate-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/pr87407-truncate-intrinsics.ll
new file mode 100644
index 00000000000000..36670b8e8fef71
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/pr87407-truncate-intrinsics.ll
@@ -0,0 +1,1017 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; REQUIRES: asserts
+; RUN: opt -S -passes=loop-vectorize < %s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
+
+define i32 @truncate_umax() #1 {
+; CHECK-LABEL: define i32 @truncate_umax(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i1> @llvm.umax.v4i1(<4 x i1> zeroinitializer, <4 x i1> zeroinitializer)
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i1> [[TMP0]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i1> [[TMP1]] to <4 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = shl <4 x i32> [[TMP2]], <i32 8, i32 8, i32 8, i32 8>
+; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i32> [[TMP3]] to <4 x i8>
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i8> [[TMP4]], i32 3
+; CHECK-NEXT: br i1 false, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[PHI_0:%.*]] = phi i64 [ [[INCREMENTOR:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[INCREMENTOR]] = add i64 [[PHI_0]], 1
+; CHECK-NEXT: [[ZEXT_0:%.*]] = zext i8 0 to i64
+; CHECK-NEXT: [[INTRINSIC_0:%.*]] = tail call i64 @llvm.umax.i64(i64 [[ZEXT_0]], i64 0)
+; CHECK-NEXT: [[CMP_0:%.*]] = icmp ne i64 [[INTRINSIC_0]], 0
+; CHECK-NEXT: [[ZEXT_1:%.*]] = zext i1 [[CMP_0]] to i64
+; CHECK-NEXT: [[TRUNC_0:%.*]] = trunc i64 [[ZEXT_1]] to i32
+; CHECK-NEXT: [[SHL_0:%.*]] = shl i32 [[TRUNC_0]], 8
+; CHECK-NEXT: [[TRUNC_1:%.*]] = trunc i32 [[SHL_0]] to i8
+; CHECK-NEXT: [[EXITCOND6:%.*]] = icmp ne i64 [[PHI_0]], 16
+; CHECK-NEXT: br i1 [[EXITCOND6]], label [[LOOP]], label [[LOOP_EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: loop.exit:
+; CHECK-NEXT: [[TRUNC_1_LCSSA:%.*]] = phi i8 [ [[TRUNC_1]], [[LOOP]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: store i8 [[TRUNC_1_LCSSA]], ptr null, align 1
+; CHECK-NEXT: ret i32 0
+;
+entry:
+ br label %loop
+
+loop: ; preds = %loop, %entry
+ %phi.0 = phi i64 [ %incrementor, %loop ], [ 0, %entry ]
+ %incrementor = add i64 %phi.0, 1
+
+ %zext.0 = zext i8 0 to i64
+ %intrinsic.0 = tail call i64 @llvm.umax.i64(i64 %zext.0, i64 0)
+ %cmp.0 = icmp ne i64 %intrinsic.0, 0
+ %zext.1 = zext i1 %cmp.0 to i64
+ %trunc.0 = trunc i64 %zext.1 to i32
+ %shl.0 = shl i32 %trunc.0, 8 ; Shift and truncate to remove all live bits
+ %trunc.1 = trunc i32 %shl.0 to i8
+
+ %exitcond6 = icmp ne i64 %phi.0, 16
+ br i1 %exitcond6, label %loop, label %loop.exit
+
+loop.exit: ; preds = %loop
+ store i8 %trunc.1, ptr null, align 1
+ ret i32 0
+}
+
+define i32 @truncate_umin() #1 {
+; CHECK-LABEL: define i32 @truncate_umin(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i1> @llvm.umin.v4i1(<4 x i1> zeroinitializer, <4 x i1> zeroinitializer)
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i1> [[TMP0]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i1> [[TMP1]] to <4 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = shl <4 x i32> [[TMP2]], <i32 8, i32 8, i32 8, i32 8>
+; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i32> [[TMP3]] to <4 x i8>
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i8> [[TMP4]], i32 3
+; CHECK-NEXT: br i1 false, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[PHI_0:%.*]] = phi i64 [ [[INCREMENTOR:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[INCREMENTOR]] = add i64 [[PHI_0]], 1
+; CHECK-NEXT: [[ZEXT_0:%.*]] = zext i8 0 to i64
+; CHECK-NEXT: [[INTRINSIC_0:%.*]] = tail call i64 @llvm.umin.i64(i64 [[ZEXT_0]], i64 0)
+; CHECK-NEXT: [[CMP_0:%.*]] = icmp ne i64 [[INTRINSIC_0]], 0
+; CHECK-NEXT: [[ZEXT_1:%.*]] = zext i1 [[CMP_0]] to i64
+; CHECK-NEXT: [[TRUNC_0:%.*]] = trunc i64 [[ZEXT_1]] to i32
+; CHECK-NEXT: [[SHL_0:%.*]] = shl i32 [[TRUNC_0]], 8
+; CHECK-NEXT: [[TRUNC_1:%.*]] = trunc i32 [[SHL_0]] to i8
+; CHECK-NEXT: [[EXITCOND6:%.*]] = icmp ne i64 [[PHI_0]], 16
+; CHECK-NEXT: br i1 [[EXITCOND6]], label [[LOOP]], label [[LOOP_EXIT]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK: loop.exit:
+; CHECK-NEXT: [[TRUNC_1_LCSSA:%.*]] = phi i8 [ [[TRUNC_1]], [[LOOP]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: store i8 [[TRUNC_1_LCSSA]], ptr null, align 1
+; CHECK-NEXT: ret i32 0
+;
+entry:
+ br label %loop
+
+loop: ; preds = %loop, %entry
+ %phi.0 = phi i64 [ %incrementor, %loop ], [ 0, %entry ]
+ %incrementor = add i64 %phi.0, 1
+
+ %zext.0 = zext i8 0 to i64
+ %intrinsic.0 = tail call i64 @llvm.umin.i64(i64 %zext.0, i64 0)
+ %cmp.0 = icmp ne i64 %intrinsic.0, 0
+ %zext.1 = zext i1 %cmp.0 to i64
+ %trunc.0 = trunc i64 %zext.1 to i32
+ %shl.0 = shl i32 %trunc.0, 8 ; Shift and truncate to remove all live bits
+ %trunc.1 = trunc i32 %shl.0 to i8
+
+ %exitcond6 = icmp ne i64 %phi.0, 16
+ br i1 %exitcond6, label %loop, label %loop.exit
+
+loop.exit: ; preds = %loop
+ store i8 %trunc.1, ptr null, align 1
+ ret i32 0
+}
+
+define i32 @truncate_smax() #1 {
+; CHECK-LABEL: define i32 @truncate_smax(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i1> @llvm.smax.v4i1(<4 x i1> zeroinitializer, <4 x i1> zeroinitializer)
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i1> [[TMP0]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i1> [[TMP1]] to <4 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = shl <4 x i32> [[TMP2]], <i32 8, i32 8, i32 8, i32 8>
+; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i32> [[TMP3]] to <4 x i8>
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i8> [[TMP4]], i32 3
+; CHECK-NEXT: br i1 false, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[PHI_0:%.*]] = phi i64 [ [[INCREMENTOR:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[INCREMENTOR]] = add i64 [[PHI_0]], 1
+; CHECK-NEXT: [[ZEXT_0:%.*]] = zext i8 0 to i64
+; CHECK-NEXT: [[INTRINSIC_0:%.*]] = tail call i64 @llvm.smax.i64(i64 [[ZEXT_0]], i64 0)
+; CHECK-NEXT: [[CMP_0:%.*]] = icmp ne i64 [[INTRINSIC_0]], 0
+; CHECK-NEXT: [[ZEXT_1:%.*]] = zext i1 [[CMP_0]] to i64
+; CHECK-NEXT: [[TRUNC_0:%.*]] = trunc i64 [[ZEXT_1]] to i32
+; CHECK-NEXT: [[SHL_0:%.*]] = shl i32 [[TRUNC_0]], 8
+; CHECK-NEXT: [[TRUNC_1:%.*]] = trunc i32 [[SHL_0]] to i8
+; CHECK-NEXT: [[EXITCOND6:%.*]] = icmp ne i64 [[PHI_0]], 16
+; CHECK-NEXT: br i1 [[EXITCOND6]], label [[...
[truncated]
``````````
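To make the VectorUtils.cpp hunk easier to review: the new per-use check amounts to ignoring the callee operand of the listed intrinsics, since that operand's pointer type would otherwise force the minimum bitwidth up to the pointer width. A condensed standalone sketch (the helper name is illustrative, and it uses CallBase::isCallee where the patch spells the check via getCalledFunction()):

```cpp
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Use.h"

using namespace llvm;

// A use can be skipped by the minimum-bitwidth walk when it is the callee
// operand of an intrinsic call that DemandedBits understands.
static bool isIgnorableIntrinsicCalleeUse(const Use &U) {
  const auto *CI = dyn_cast<CallInst>(U.getUser());
  if (!CI || !CI->isCallee(&U))
    return false;
  switch (CI->getIntrinsicID()) {
  case Intrinsic::umax:
  case Intrinsic::umin:
  case Intrinsic::smax:
  case Intrinsic::smin:
  case Intrinsic::fshl:
  case Intrinsic::fshr:
  case Intrinsic::cttz:
  case Intrinsic::ctlz:
  case Intrinsic::bitreverse:
  case Intrinsic::bswap:
    return true;
  default:
    return false;
  }
}
```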
https://github.com/llvm/llvm-project/pull/90643