[llvm] [VPlan][LoopVectorize] Truncate min/max intrinsic ops (PR #90643)
Patrick O'Neill via llvm-commits
llvm-commits at lists.llvm.org
Wed May 1 15:56:19 PDT 2024
https://github.com/patrick-rivos updated https://github.com/llvm/llvm-project/pull/90643
>From 9b24ba53d33309b8428354a1a27f57c42f604006 Mon Sep 17 00:00:00 2001
From: Patrick O'Neill <patrick at rivosinc.com>
Date: Tue, 30 Apr 2024 10:37:54 -0700
Subject: [PATCH 1/3] [VPlan][LoopVectorize] Truncate min/max intrinsic ops
This adds support for intrinsics that are understood by DemandedBits.
Fixes #87407.
---
llvm/lib/Analysis/VectorUtils.cpp | 32 +
.../Transforms/Vectorize/LoopVectorize.cpp | 6 +-
llvm/lib/Transforms/Vectorize/VPlan.h | 16 +-
.../Transforms/Vectorize/VPlanAnalysis.cpp | 9 +-
llvm/lib/Transforms/Vectorize/VPlanAnalysis.h | 1 -
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 8 +-
.../Transforms/Vectorize/VPlanTransforms.cpp | 17 +-
.../pr87407-truncate-intrinsics.ll | 1017 +++++++++++++++++
.../Transforms/Vectorize/VPlanTest.cpp | 7 +-
9 files changed, 1087 insertions(+), 26 deletions(-)
create mode 100644 llvm/test/Transforms/LoopVectorize/pr87407-truncate-intrinsics.ll
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 917094267d05ae..7d49fc719c8bac 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -611,6 +611,14 @@ llvm::computeMinimumValueSizes(ArrayRef<BasicBlock *> Blocks, DemandedBits &DB,
!InstructionSet.count(I))
continue;
+ // Byteswaps require at least 16 bits, so demand the low 16 bits.
+ if (const auto *II = dyn_cast<IntrinsicInst>(I)) {
+ if (II->getIntrinsicID() == Intrinsic::bswap) {
+ DBits[Leader] |= 0xFFFF;
+ DBits[I] |= 0xFFFF;
+ }
+ }
+
// Unsafe casts terminate a chain unsuccessfully. We can't do anything
// useful with bitcasts, ptrtoints or inttoptrs and it'd be unsafe to
// transform anything that relies on them.
@@ -687,6 +695,30 @@ llvm::computeMinimumValueSizes(ArrayRef<BasicBlock *> Blocks, DemandedBits &DB,
isa<ShlOperator, LShrOperator, AShrOperator>(U.getUser()) &&
U.getOperandNo() == 1)
return CI->uge(MinBW);
+ // Ignore the callee operand of calls to intrinsics that
+ // DemandedBits understands.
+ if (U->getType()->isPointerTy() && isa<CallInst>(U.getUser()) &&
+ dyn_cast<CallInst>(U.getUser())->getCalledFunction() ==
+ dyn_cast<Function>(U)) {
+ if (const auto *II = dyn_cast<IntrinsicInst>(U.getUser())) {
+ // Only ignore cases that DemandedBits understands.
+ switch (II->getIntrinsicID()) {
+ default:
+ break;
+ case Intrinsic::umax:
+ case Intrinsic::umin:
+ case Intrinsic::smax:
+ case Intrinsic::smin:
+ case Intrinsic::fshl:
+ case Intrinsic::fshr:
+ case Intrinsic::cttz:
+ case Intrinsic::ctlz:
+ case Intrinsic::bitreverse:
+ case Intrinsic::bswap:
+ return false;
+ }
+ }
+ }
uint64_t BW = bit_width(DB.getDemandedBits(&U).getZExtValue());
return bit_ceil(BW) > MinBW;
}))
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index d1c54b928f9fa2..6bd4158f0f503c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8284,7 +8284,7 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
Range);
if (ShouldUseVectorIntrinsic)
return new VPWidenCallRecipe(CI, make_range(Ops.begin(), Ops.end()), ID,
- CI->getDebugLoc());
+ CI->getType(), CI->getDebugLoc());
Function *Variant = nullptr;
std::optional<unsigned> MaskPos;
@@ -8337,8 +8337,8 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
}
return new VPWidenCallRecipe(CI, make_range(Ops.begin(), Ops.end()),
- Intrinsic::not_intrinsic, CI->getDebugLoc(),
- Variant);
+ Intrinsic::not_intrinsic, CI->getType(),
+ CI->getDebugLoc(), Variant);
}
return nullptr;
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 71594be2b965aa..72122780924a63 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1455,14 +1455,17 @@ class VPWidenCallRecipe : public VPSingleDefRecipe {
/// chosen vectorized variant, so there will be a different vplan for each
/// VF with a valid variant.
Function *Variant;
+ /// Result type of the call.
+ Type *ResultTy;
public:
template <typename IterT>
VPWidenCallRecipe(Value *UV, iterator_range<IterT> CallArguments,
- Intrinsic::ID VectorIntrinsicID, DebugLoc DL = {},
- Function *Variant = nullptr)
+ Intrinsic::ID VectorIntrinsicID, Type *ResultTy,
+ DebugLoc DL = {}, Function *Variant = nullptr)
: VPSingleDefRecipe(VPDef::VPWidenCallSC, CallArguments, UV, DL),
- VectorIntrinsicID(VectorIntrinsicID), Variant(Variant) {
+ VectorIntrinsicID(VectorIntrinsicID), Variant(Variant),
+ ResultTy(ResultTy) {
assert(
isa<Function>(getOperand(getNumOperands() - 1)->getLiveInIRValue()) &&
"last operand must be the called function");
@@ -1472,7 +1475,7 @@ class VPWidenCallRecipe : public VPSingleDefRecipe {
VPWidenCallRecipe *clone() override {
return new VPWidenCallRecipe(getUnderlyingValue(), operands(),
- VectorIntrinsicID, getDebugLoc(), Variant);
+ VectorIntrinsicID, ResultTy, getDebugLoc(), Variant);
}
VP_CLASSOF_IMPL(VPDef::VPWidenCallSC)
@@ -1496,6 +1499,11 @@ class VPWidenCallRecipe : public VPSingleDefRecipe {
void print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const override;
#endif
+
+ /// Returns the result type of the call.
+ Type *getResultType() const { return ResultTy; }
+
+ void setResultType(Type *newResTy) { ResultTy = newResTy; }
};
/// A recipe for widening select instructions.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 5f93339083f0c2..e96191fb58e786 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -110,11 +110,6 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenRecipe *R) {
llvm_unreachable("Unhandled opcode!");
}
-Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenCallRecipe *R) {
- auto &CI = *cast<CallInst>(R->getUnderlyingInstr());
- return CI.getType();
-}
-
Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R) {
assert((isa<VPWidenLoadRecipe>(R) || isa<VPWidenLoadEVLRecipe>(R)) &&
"Store recipes should not define any values");
@@ -238,7 +233,7 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
return inferScalarType(R->getOperand(0));
})
.Case<VPBlendRecipe, VPInstruction, VPWidenRecipe, VPReplicateRecipe,
- VPWidenCallRecipe, VPWidenMemoryRecipe, VPWidenSelectRecipe>(
+ VPWidenMemoryRecipe, VPWidenSelectRecipe>(
[this](const auto *R) { return inferScalarTypeForRecipe(R); })
.Case<VPInterleaveRecipe>([V](const VPInterleaveRecipe *R) {
// TODO: Use info from interleave group.
@@ -248,6 +243,8 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
[](const VPWidenCastRecipe *R) { return R->getResultType(); })
.Case<VPScalarCastRecipe>(
[](const VPScalarCastRecipe *R) { return R->getResultType(); })
+ .Case<VPWidenCallRecipe>(
+ [](const VPWidenCallRecipe *R) { return R->getResultType(); })
.Case<VPExpandSCEVRecipe>([](const VPExpandSCEVRecipe *R) {
return R->getSCEV()->getType();
});
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
index 7d310b1b31b6fe..c41beb79ba9a5e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
@@ -43,7 +43,6 @@ class VPTypeAnalysis {
Type *inferScalarTypeForRecipe(const VPBlendRecipe *R);
Type *inferScalarTypeForRecipe(const VPInstruction *R);
- Type *inferScalarTypeForRecipe(const VPWidenCallRecipe *R);
Type *inferScalarTypeForRecipe(const VPWidenRecipe *R);
Type *inferScalarTypeForRecipe(const VPWidenIntOrFpInductionRecipe *R);
Type *inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 29ed001ccd2c77..19fcea4711b490 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -723,8 +723,8 @@ void VPWidenCallRecipe::execute(VPTransformState &State) {
// Add return type if intrinsic is overloaded on it.
if (UseIntrinsic &&
isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1))
- TysForDecl.push_back(VectorType::get(
- CalledScalarFn->getReturnType()->getScalarType(), State.VF));
+ TysForDecl.push_back(
+ VectorType::get(getResultType()->getScalarType(), State.VF));
SmallVector<Value *, 4> Args;
for (const auto &I : enumerate(arg_operands())) {
// Some intrinsics have a scalar argument - don't replace it with a
@@ -780,14 +780,14 @@ void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "WIDEN-CALL ";
- Function *CalledFn = getCalledScalarFunction();
- if (CalledFn->getReturnType()->isVoidTy())
+ if (getResultType()->isVoidTy())
O << "void ";
else {
printAsOperand(O, SlotTracker);
O << " = ";
}
+ Function *CalledFn = getCalledScalarFunction();
O << "call @" << CalledFn->getName() << "(";
interleaveComma(arg_operands(), O, [&O, &SlotTracker](VPValue *Op) {
Op->printAsOperand(O, SlotTracker);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 017b00c042f4a4..981f31befc0c8c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -74,9 +74,10 @@ void VPlanTransforms::VPInstructionsToVPRecipes(
} else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());
} else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
- NewRecipe = new VPWidenCallRecipe(
- CI, Ingredient.operands(), getVectorIntrinsicIDForCall(CI, &TLI),
- CI->getDebugLoc());
+ NewRecipe =
+ new VPWidenCallRecipe(CI, Ingredient.operands(),
+ getVectorIntrinsicIDForCall(CI, &TLI),
+ CI->getType(), CI->getDebugLoc());
} else if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands());
} else if (auto *CI = dyn_cast<CastInst>(Inst)) {
@@ -971,8 +972,8 @@ void VPlanTransforms::truncateToMinimalBitwidths(
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_deep(Plan.getVectorLoopRegion()))) {
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
- if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPReplicateRecipe,
- VPWidenSelectRecipe, VPWidenLoadRecipe>(&R))
+ if (!isa<VPWidenRecipe, VPWidenCallRecipe, VPWidenCastRecipe,
+ VPReplicateRecipe, VPWidenSelectRecipe, VPWidenLoadRecipe>(&R))
continue;
VPValue *ResultVPV = R.getVPSingleValue();
@@ -1078,6 +1079,12 @@ void VPlanTransforms::truncateToMinimalBitwidths(
}
}
+ // If this was a WIDEN-CALL (intrinsic) then we need to update the return
+ // type so it's compatible with the new args.
+ if (isa<VPWidenCallRecipe>(&R)) {
+ auto *callInsn = dyn_cast<VPWidenCallRecipe>(&R);
+ callInsn->setResultType(NewResTy);
+ }
}
}
diff --git a/llvm/test/Transforms/LoopVectorize/pr87407-truncate-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/pr87407-truncate-intrinsics.ll
new file mode 100644
index 00000000000000..36670b8e8fef71
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/pr87407-truncate-intrinsics.ll
@@ -0,0 +1,1017 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; REQUIRES: asserts
+; RUN: opt -S -passes=loop-vectorize < %s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
+
+define i32 @truncate_umax() #1 {
+; CHECK-LABEL: define i32 @truncate_umax(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i1> @llvm.umax.v4i1(<4 x i1> zeroinitializer, <4 x i1> zeroinitializer)
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i1> [[TMP0]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i1> [[TMP1]] to <4 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = shl <4 x i32> [[TMP2]], <i32 8, i32 8, i32 8, i32 8>
+; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i32> [[TMP3]] to <4 x i8>
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i8> [[TMP4]], i32 3
+; CHECK-NEXT: br i1 false, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[PHI_0:%.*]] = phi i64 [ [[INCREMENTOR:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[INCREMENTOR]] = add i64 [[PHI_0]], 1
+; CHECK-NEXT: [[ZEXT_0:%.*]] = zext i8 0 to i64
+; CHECK-NEXT: [[INTRINSIC_0:%.*]] = tail call i64 @llvm.umax.i64(i64 [[ZEXT_0]], i64 0)
+; CHECK-NEXT: [[CMP_0:%.*]] = icmp ne i64 [[INTRINSIC_0]], 0
+; CHECK-NEXT: [[ZEXT_1:%.*]] = zext i1 [[CMP_0]] to i64
+; CHECK-NEXT: [[TRUNC_0:%.*]] = trunc i64 [[ZEXT_1]] to i32
+; CHECK-NEXT: [[SHL_0:%.*]] = shl i32 [[TRUNC_0]], 8
+; CHECK-NEXT: [[TRUNC_1:%.*]] = trunc i32 [[SHL_0]] to i8
+; CHECK-NEXT: [[EXITCOND6:%.*]] = icmp ne i64 [[PHI_0]], 16
+; CHECK-NEXT: br i1 [[EXITCOND6]], label [[LOOP]], label [[LOOP_EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: loop.exit:
+; CHECK-NEXT: [[TRUNC_1_LCSSA:%.*]] = phi i8 [ [[TRUNC_1]], [[LOOP]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: store i8 [[TRUNC_1_LCSSA]], ptr null, align 1
+; CHECK-NEXT: ret i32 0
+;
+entry:
+ br label %loop
+
+loop: ; preds = %loop, %entry
+ %phi.0 = phi i64 [ %incrementor, %loop ], [ 0, %entry ]
+ %incrementor = add i64 %phi.0, 1
+
+ %zext.0 = zext i8 0 to i64
+ %intrinsic.0 = tail call i64 @llvm.umax.i64(i64 %zext.0, i64 0)
+ %cmp.0 = icmp ne i64 %intrinsic.0, 0
+ %zext.1 = zext i1 %cmp.0 to i64
+ %trunc.0 = trunc i64 %zext.1 to i32
+ %shl.0 = shl i32 %trunc.0, 8 ; Shift and truncate to remove any alive bits
+ %trunc.1 = trunc i32 %shl.0 to i8
+
+ %exitcond6 = icmp ne i64 %phi.0, 16
+ br i1 %exitcond6, label %loop, label %loop.exit
+
+loop.exit: ; preds = %loop
+ store i8 %trunc.1, ptr null, align 1
+ ret i32 0
+}
+
+define i32 @truncate_umin() #1 {
+; CHECK-LABEL: define i32 @truncate_umin(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i1> @llvm.umin.v4i1(<4 x i1> zeroinitializer, <4 x i1> zeroinitializer)
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i1> [[TMP0]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i1> [[TMP1]] to <4 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = shl <4 x i32> [[TMP2]], <i32 8, i32 8, i32 8, i32 8>
+; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i32> [[TMP3]] to <4 x i8>
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i8> [[TMP4]], i32 3
+; CHECK-NEXT: br i1 false, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[PHI_0:%.*]] = phi i64 [ [[INCREMENTOR:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[INCREMENTOR]] = add i64 [[PHI_0]], 1
+; CHECK-NEXT: [[ZEXT_0:%.*]] = zext i8 0 to i64
+; CHECK-NEXT: [[INTRINSIC_0:%.*]] = tail call i64 @llvm.umin.i64(i64 [[ZEXT_0]], i64 0)
+; CHECK-NEXT: [[CMP_0:%.*]] = icmp ne i64 [[INTRINSIC_0]], 0
+; CHECK-NEXT: [[ZEXT_1:%.*]] = zext i1 [[CMP_0]] to i64
+; CHECK-NEXT: [[TRUNC_0:%.*]] = trunc i64 [[ZEXT_1]] to i32
+; CHECK-NEXT: [[SHL_0:%.*]] = shl i32 [[TRUNC_0]], 8
+; CHECK-NEXT: [[TRUNC_1:%.*]] = trunc i32 [[SHL_0]] to i8
+; CHECK-NEXT: [[EXITCOND6:%.*]] = icmp ne i64 [[PHI_0]], 16
+; CHECK-NEXT: br i1 [[EXITCOND6]], label [[LOOP]], label [[LOOP_EXIT]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK: loop.exit:
+; CHECK-NEXT: [[TRUNC_1_LCSSA:%.*]] = phi i8 [ [[TRUNC_1]], [[LOOP]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: store i8 [[TRUNC_1_LCSSA]], ptr null, align 1
+; CHECK-NEXT: ret i32 0
+;
+entry:
+ br label %loop
+
+loop: ; preds = %loop, %entry
+ %phi.0 = phi i64 [ %incrementor, %loop ], [ 0, %entry ]
+ %incrementor = add i64 %phi.0, 1
+
+ %zext.0 = zext i8 0 to i64
+ %intrinsic.0 = tail call i64 @llvm.umin.i64(i64 %zext.0, i64 0)
+ %cmp.0 = icmp ne i64 %intrinsic.0, 0
+ %zext.1 = zext i1 %cmp.0 to i64
+ %trunc.0 = trunc i64 %zext.1 to i32
+ %shl.0 = shl i32 %trunc.0, 8 ; Shift and truncate to remove any alive bits
+ %trunc.1 = trunc i32 %shl.0 to i8
+
+ %exitcond6 = icmp ne i64 %phi.0, 16
+ br i1 %exitcond6, label %loop, label %loop.exit
+
+loop.exit: ; preds = %loop
+ store i8 %trunc.1, ptr null, align 1
+ ret i32 0
+}
+
+define i32 @truncate_smax() #1 {
+; CHECK-LABEL: define i32 @truncate_smax(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i1> @llvm.smax.v4i1(<4 x i1> zeroinitializer, <4 x i1> zeroinitializer)
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i1> [[TMP0]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i1> [[TMP1]] to <4 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = shl <4 x i32> [[TMP2]], <i32 8, i32 8, i32 8, i32 8>
+; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i32> [[TMP3]] to <4 x i8>
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i8> [[TMP4]], i32 3
+; CHECK-NEXT: br i1 false, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[PHI_0:%.*]] = phi i64 [ [[INCREMENTOR:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[INCREMENTOR]] = add i64 [[PHI_0]], 1
+; CHECK-NEXT: [[ZEXT_0:%.*]] = zext i8 0 to i64
+; CHECK-NEXT: [[INTRINSIC_0:%.*]] = tail call i64 @llvm.smax.i64(i64 [[ZEXT_0]], i64 0)
+; CHECK-NEXT: [[CMP_0:%.*]] = icmp ne i64 [[INTRINSIC_0]], 0
+; CHECK-NEXT: [[ZEXT_1:%.*]] = zext i1 [[CMP_0]] to i64
+; CHECK-NEXT: [[TRUNC_0:%.*]] = trunc i64 [[ZEXT_1]] to i32
+; CHECK-NEXT: [[SHL_0:%.*]] = shl i32 [[TRUNC_0]], 8
+; CHECK-NEXT: [[TRUNC_1:%.*]] = trunc i32 [[SHL_0]] to i8
+; CHECK-NEXT: [[EXITCOND6:%.*]] = icmp ne i64 [[PHI_0]], 16
+; CHECK-NEXT: br i1 [[EXITCOND6]], label [[LOOP]], label [[LOOP_EXIT]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK: loop.exit:
+; CHECK-NEXT: [[TRUNC_1_LCSSA:%.*]] = phi i8 [ [[TRUNC_1]], [[LOOP]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: store i8 [[TRUNC_1_LCSSA]], ptr null, align 1
+; CHECK-NEXT: ret i32 0
+;
+entry:
+ br label %loop
+
+loop: ; preds = %loop, %entry
+ %phi.0 = phi i64 [ %incrementor, %loop ], [ 0, %entry ]
+ %incrementor = add i64 %phi.0, 1
+
+ %zext.0 = zext i8 0 to i64
+ %intrinsic.0 = tail call i64 @llvm.smax.i64(i64 %zext.0, i64 0)
+ %cmp.0 = icmp ne i64 %intrinsic.0, 0
+ %zext.1 = zext i1 %cmp.0 to i64
+ %trunc.0 = trunc i64 %zext.1 to i32
+ %shl.0 = shl i32 %trunc.0, 8 ; Shift and truncate to remove any alive bits
+ %trunc.1 = trunc i32 %shl.0 to i8
+
+ %exitcond6 = icmp ne i64 %phi.0, 16
+ br i1 %exitcond6, label %loop, label %loop.exit
+
+loop.exit: ; preds = %loop
+ store i8 %trunc.1, ptr null, align 1
+ ret i32 0
+}
+
+define i32 @truncate_smin() #1 {
+; CHECK-LABEL: define i32 @truncate_smin(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i1> @llvm.smin.v4i1(<4 x i1> zeroinitializer, <4 x i1> zeroinitializer)
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i1> [[TMP0]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i1> [[TMP1]] to <4 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = shl <4 x i32> [[TMP2]], <i32 8, i32 8, i32 8, i32 8>
+; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i32> [[TMP3]] to <4 x i8>
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i8> [[TMP4]], i32 3
+; CHECK-NEXT: br i1 false, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[PHI_0:%.*]] = phi i64 [ [[INCREMENTOR:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[INCREMENTOR]] = add i64 [[PHI_0]], 1
+; CHECK-NEXT: [[ZEXT_0:%.*]] = zext i8 0 to i64
+; CHECK-NEXT: [[INTRINSIC_0:%.*]] = tail call i64 @llvm.smin.i64(i64 [[ZEXT_0]], i64 0)
+; CHECK-NEXT: [[CMP_0:%.*]] = icmp ne i64 [[INTRINSIC_0]], 0
+; CHECK-NEXT: [[ZEXT_1:%.*]] = zext i1 [[CMP_0]] to i64
+; CHECK-NEXT: [[TRUNC_0:%.*]] = trunc i64 [[ZEXT_1]] to i32
+; CHECK-NEXT: [[SHL_0:%.*]] = shl i32 [[TRUNC_0]], 8
+; CHECK-NEXT: [[TRUNC_1:%.*]] = trunc i32 [[SHL_0]] to i8
+; CHECK-NEXT: [[EXITCOND6:%.*]] = icmp ne i64 [[PHI_0]], 16
+; CHECK-NEXT: br i1 [[EXITCOND6]], label [[LOOP]], label [[LOOP_EXIT]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK: loop.exit:
+; CHECK-NEXT: [[TRUNC_1_LCSSA:%.*]] = phi i8 [ [[TRUNC_1]], [[LOOP]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: store i8 [[TRUNC_1_LCSSA]], ptr null, align 1
+; CHECK-NEXT: ret i32 0
+;
+entry:
+ br label %loop
+
+loop: ; preds = %loop, %entry
+ %phi.0 = phi i64 [ %incrementor, %loop ], [ 0, %entry ]
+ %incrementor = add i64 %phi.0, 1
+
+ %zext.0 = zext i8 0 to i64
+ %intrinsic.0 = tail call i64 @llvm.smin.i64(i64 %zext.0, i64 0)
+ %cmp.0 = icmp ne i64 %intrinsic.0, 0
+ %zext.1 = zext i1 %cmp.0 to i64
+ %trunc.0 = trunc i64 %zext.1 to i32
+ %shl.0 = shl i32 %trunc.0, 8 ; Shift and truncate to remove any alive bits
+ %trunc.1 = trunc i32 %shl.0 to i8
+
+ %exitcond6 = icmp ne i64 %phi.0, 16
+ br i1 %exitcond6, label %loop, label %loop.exit
+
+loop.exit: ; preds = %loop
+ store i8 %trunc.1, ptr null, align 1
+ ret i32 0
+}
+
+define i32 @truncate_fshl() #1 {
+; CHECK-LABEL: define i32 @truncate_fshl(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i1> @llvm.fshl.v4i1(<4 x i1> zeroinitializer, <4 x i1> zeroinitializer, <4 x i1> zeroinitializer)
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i1> [[TMP0]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i1> [[TMP1]] to <4 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = shl <4 x i32> [[TMP2]], <i32 8, i32 8, i32 8, i32 8>
+; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i32> [[TMP3]] to <4 x i8>
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i8> [[TMP4]], i32 3
+; CHECK-NEXT: br i1 false, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[PHI_0:%.*]] = phi i64 [ [[INCREMENTOR:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[INCREMENTOR]] = add i64 [[PHI_0]], 1
+; CHECK-NEXT: [[ZEXT_0:%.*]] = zext i8 0 to i64
+; CHECK-NEXT: [[INTRINSIC_0:%.*]] = tail call i64 @llvm.fshl.i64(i64 [[ZEXT_0]], i64 0, i64 0)
+; CHECK-NEXT: [[CMP_0:%.*]] = icmp ne i64 [[INTRINSIC_0]], 0
+; CHECK-NEXT: [[ZEXT_1:%.*]] = zext i1 [[CMP_0]] to i64
+; CHECK-NEXT: [[TRUNC_0:%.*]] = trunc i64 [[ZEXT_1]] to i32
+; CHECK-NEXT: [[SHL_0:%.*]] = shl i32 [[TRUNC_0]], 8
+; CHECK-NEXT: [[TRUNC_1:%.*]] = trunc i32 [[SHL_0]] to i8
+; CHECK-NEXT: [[EXITCOND6:%.*]] = icmp ne i64 [[PHI_0]], 16
+; CHECK-NEXT: br i1 [[EXITCOND6]], label [[LOOP]], label [[LOOP_EXIT]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK: loop.exit:
+; CHECK-NEXT: [[TRUNC_1_LCSSA:%.*]] = phi i8 [ [[TRUNC_1]], [[LOOP]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: store i8 [[TRUNC_1_LCSSA]], ptr null, align 1
+; CHECK-NEXT: ret i32 0
+;
+entry:
+ br label %loop
+
+loop: ; preds = %loop, %entry
+ %phi.0 = phi i64 [ %incrementor, %loop ], [ 0, %entry ]
+ %incrementor = add i64 %phi.0, 1
+
+ %zext.0 = zext i8 0 to i64
+ %intrinsic.0 = tail call i64 @llvm.fshl.i64(i64 %zext.0, i64 0, i64 0)
+ %cmp.0 = icmp ne i64 %intrinsic.0, 0
+ %zext.1 = zext i1 %cmp.0 to i64
+ %trunc.0 = trunc i64 %zext.1 to i32
+ %shl.0 = shl i32 %trunc.0, 8 ; Shift and truncate to remove any alive bits
+ %trunc.1 = trunc i32 %shl.0 to i8
+
+ %exitcond6 = icmp ne i64 %phi.0, 16
+ br i1 %exitcond6, label %loop, label %loop.exit
+
+loop.exit: ; preds = %loop
+ store i8 %trunc.1, ptr null, align 1
+ ret i32 0
+}
+
+define i32 @truncate_fshr() #1 {
+; CHECK-LABEL: define i32 @truncate_fshr(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i1> @llvm.fshr.v4i1(<4 x i1> zeroinitializer, <4 x i1> zeroinitializer, <4 x i1> zeroinitializer)
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i1> [[TMP0]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i1> [[TMP1]] to <4 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = shl <4 x i32> [[TMP2]], <i32 8, i32 8, i32 8, i32 8>
+; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i32> [[TMP3]] to <4 x i8>
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i8> [[TMP4]], i32 3
+; CHECK-NEXT: br i1 false, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[PHI_0:%.*]] = phi i64 [ [[INCREMENTOR:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[INCREMENTOR]] = add i64 [[PHI_0]], 1
+; CHECK-NEXT: [[ZEXT_0:%.*]] = zext i8 0 to i64
+; CHECK-NEXT: [[INTRINSIC_0:%.*]] = tail call i64 @llvm.fshr.i64(i64 [[ZEXT_0]], i64 0, i64 0)
+; CHECK-NEXT: [[CMP_0:%.*]] = icmp ne i64 [[INTRINSIC_0]], 0
+; CHECK-NEXT: [[ZEXT_1:%.*]] = zext i1 [[CMP_0]] to i64
+; CHECK-NEXT: [[TRUNC_0:%.*]] = trunc i64 [[ZEXT_1]] to i32
+; CHECK-NEXT: [[SHL_0:%.*]] = shl i32 [[TRUNC_0]], 8
+; CHECK-NEXT: [[TRUNC_1:%.*]] = trunc i32 [[SHL_0]] to i8
+; CHECK-NEXT: [[EXITCOND6:%.*]] = icmp ne i64 [[PHI_0]], 16
+; CHECK-NEXT: br i1 [[EXITCOND6]], label [[LOOP]], label [[LOOP_EXIT]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK: loop.exit:
+; CHECK-NEXT: [[TRUNC_1_LCSSA:%.*]] = phi i8 [ [[TRUNC_1]], [[LOOP]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: store i8 [[TRUNC_1_LCSSA]], ptr null, align 1
+; CHECK-NEXT: ret i32 0
+;
+entry:
+ br label %loop
+
+loop: ; preds = %loop, %entry
+ %phi.0 = phi i64 [ %incrementor, %loop ], [ 0, %entry ]
+ %incrementor = add i64 %phi.0, 1
+
+ %zext.0 = zext i8 0 to i64
+ %intrinsic.0 = tail call i64 @llvm.fshr.i64(i64 %zext.0, i64 0, i64 0)
+ %cmp.0 = icmp ne i64 %intrinsic.0, 0
+ %zext.1 = zext i1 %cmp.0 to i64
+ %trunc.0 = trunc i64 %zext.1 to i32
+ %shl.0 = shl i32 %trunc.0, 8 ; Shift and truncate to remove any alive bits
+ %trunc.1 = trunc i32 %shl.0 to i8
+
+ %exitcond6 = icmp ne i64 %phi.0, 16
+ br i1 %exitcond6, label %loop, label %loop.exit
+
+loop.exit: ; preds = %loop
+ store i8 %trunc.1, ptr null, align 1
+ ret i32 0
+}
+
+define i32 @truncate_cttz() #1 {
+; CHECK-LABEL: define i32 @truncate_cttz(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i1> @llvm.cttz.v4i1(<4 x i1> zeroinitializer, i1 false)
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i1> [[TMP0]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i1> [[TMP1]] to <4 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = shl <4 x i32> [[TMP2]], <i32 8, i32 8, i32 8, i32 8>
+; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i32> [[TMP3]] to <4 x i8>
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i8> [[TMP4]], i32 3
+; CHECK-NEXT: br i1 false, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[PHI_0:%.*]] = phi i64 [ [[INCREMENTOR:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[INCREMENTOR]] = add i64 [[PHI_0]], 1
+; CHECK-NEXT: [[ZEXT_0:%.*]] = zext i8 0 to i64
+; CHECK-NEXT: [[INTRINSIC_0:%.*]] = tail call i64 @llvm.cttz.i64(i64 [[ZEXT_0]], i1 false)
+; CHECK-NEXT: [[CMP_0:%.*]] = icmp ne i64 [[INTRINSIC_0]], 0
+; CHECK-NEXT: [[ZEXT_1:%.*]] = zext i1 [[CMP_0]] to i64
+; CHECK-NEXT: [[TRUNC_0:%.*]] = trunc i64 [[ZEXT_1]] to i32
+; CHECK-NEXT: [[SHL_0:%.*]] = shl i32 [[TRUNC_0]], 8
+; CHECK-NEXT: [[TRUNC_1:%.*]] = trunc i32 [[SHL_0]] to i8
+; CHECK-NEXT: [[EXITCOND6:%.*]] = icmp ne i64 [[PHI_0]], 16
+; CHECK-NEXT: br i1 [[EXITCOND6]], label [[LOOP]], label [[LOOP_EXIT]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK: loop.exit:
+; CHECK-NEXT: [[TRUNC_1_LCSSA:%.*]] = phi i8 [ [[TRUNC_1]], [[LOOP]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: store i8 [[TRUNC_1_LCSSA]], ptr null, align 1
+; CHECK-NEXT: ret i32 0
+;
+entry:
+ br label %loop
+
+loop: ; preds = %loop, %entry
+ %phi.0 = phi i64 [ %incrementor, %loop ], [ 0, %entry ]
+ %incrementor = add i64 %phi.0, 1
+
+ %zext.0 = zext i8 0 to i64
+ %intrinsic.0 = tail call i64 @llvm.cttz.i64(i64 %zext.0, i1 0)
+ %cmp.0 = icmp ne i64 %intrinsic.0, 0
+ %zext.1 = zext i1 %cmp.0 to i64
+ %trunc.0 = trunc i64 %zext.1 to i32
+ %shl.0 = shl i32 %trunc.0, 8 ; Shift and truncate to remove any alive bits
+ %trunc.1 = trunc i32 %shl.0 to i8
+
+ %exitcond6 = icmp ne i64 %phi.0, 16
+ br i1 %exitcond6, label %loop, label %loop.exit
+
+loop.exit: ; preds = %loop
+ store i8 %trunc.1, ptr null, align 1
+ ret i32 0
+}
+
+define i32 @truncate_ctlz() #1 {
+; CHECK-LABEL: define i32 @truncate_ctlz(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i1> @llvm.ctlz.v4i1(<4 x i1> zeroinitializer, i1 false)
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i1> [[TMP0]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i1> [[TMP1]] to <4 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = shl <4 x i32> [[TMP2]], <i32 8, i32 8, i32 8, i32 8>
+; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i32> [[TMP3]] to <4 x i8>
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i8> [[TMP4]], i32 3
+; CHECK-NEXT: br i1 false, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[PHI_0:%.*]] = phi i64 [ [[INCREMENTOR:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[INCREMENTOR]] = add i64 [[PHI_0]], 1
+; CHECK-NEXT: [[ZEXT_0:%.*]] = zext i8 0 to i64
+; CHECK-NEXT: [[INTRINSIC_0:%.*]] = tail call i64 @llvm.ctlz.i64(i64 [[ZEXT_0]], i1 false)
+; CHECK-NEXT: [[CMP_0:%.*]] = icmp ne i64 [[INTRINSIC_0]], 0
+; CHECK-NEXT: [[ZEXT_1:%.*]] = zext i1 [[CMP_0]] to i64
+; CHECK-NEXT: [[TRUNC_0:%.*]] = trunc i64 [[ZEXT_1]] to i32
+; CHECK-NEXT: [[SHL_0:%.*]] = shl i32 [[TRUNC_0]], 8
+; CHECK-NEXT: [[TRUNC_1:%.*]] = trunc i32 [[SHL_0]] to i8
+; CHECK-NEXT: [[EXITCOND6:%.*]] = icmp ne i64 [[PHI_0]], 16
+; CHECK-NEXT: br i1 [[EXITCOND6]], label [[LOOP]], label [[LOOP_EXIT]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK: loop.exit:
+; CHECK-NEXT: [[TRUNC_1_LCSSA:%.*]] = phi i8 [ [[TRUNC_1]], [[LOOP]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: store i8 [[TRUNC_1_LCSSA]], ptr null, align 1
+; CHECK-NEXT: ret i32 0
+;
+entry:
+ br label %loop
+
+loop: ; preds = %loop, %entry
+ %phi.0 = phi i64 [ %incrementor, %loop ], [ 0, %entry ]
+ %incrementor = add i64 %phi.0, 1
+ ; Chain under test: the CHECK lines above expect the ctlz call on <4 x i1> in vector.body. NOTE(review): ctlz's result depends on operand width, so this narrowing presumably relies on the result bits being dead below - confirm.
+ %zext.0 = zext i8 0 to i64
+ %intrinsic.0 = tail call i64 @llvm.ctlz.i64(i64 %zext.0, i1 0)
+ %cmp.0 = icmp ne i64 %intrinsic.0, 0
+ %zext.1 = zext i1 %cmp.0 to i64
+ %trunc.0 = trunc i64 %zext.1 to i32
+ %shl.0 = shl i32 %trunc.0, 8 ; shl by 8 followed by the trunc to i8 drops every bit of %trunc.0, so no bits of the chain stay alive
+ %trunc.1 = trunc i32 %shl.0 to i8
+
+ %exitcond6 = icmp ne i64 %phi.0, 16
+ br i1 %exitcond6, label %loop, label %loop.exit
+
+loop.exit: ; preds = %loop
+ store i8 %trunc.1, ptr null, align 1
+ ret i32 0
+}
+
+define i32 @truncate_bitreverse() #1 {
+; CHECK-LABEL: define i32 @truncate_bitreverse(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i1> @llvm.bitreverse.v4i1(<4 x i1> zeroinitializer)
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i1> [[TMP0]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i1> [[TMP1]] to <4 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = shl <4 x i32> [[TMP2]], <i32 8, i32 8, i32 8, i32 8>
+; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i32> [[TMP3]] to <4 x i8>
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i8> [[TMP4]], i32 3
+; CHECK-NEXT: br i1 false, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[PHI_0:%.*]] = phi i64 [ [[INCREMENTOR:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[INCREMENTOR]] = add i64 [[PHI_0]], 1
+; CHECK-NEXT: [[ZEXT_0:%.*]] = zext i8 0 to i64
+; CHECK-NEXT: [[INTRINSIC_0:%.*]] = tail call i64 @llvm.bitreverse.i64(i64 [[ZEXT_0]])
+; CHECK-NEXT: [[CMP_0:%.*]] = icmp ne i64 [[INTRINSIC_0]], 0
+; CHECK-NEXT: [[ZEXT_1:%.*]] = zext i1 [[CMP_0]] to i64
+; CHECK-NEXT: [[TRUNC_0:%.*]] = trunc i64 [[ZEXT_1]] to i32
+; CHECK-NEXT: [[SHL_0:%.*]] = shl i32 [[TRUNC_0]], 8
+; CHECK-NEXT: [[TRUNC_1:%.*]] = trunc i32 [[SHL_0]] to i8
+; CHECK-NEXT: [[EXITCOND6:%.*]] = icmp ne i64 [[PHI_0]], 16
+; CHECK-NEXT: br i1 [[EXITCOND6]], label [[LOOP]], label [[LOOP_EXIT]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK: loop.exit:
+; CHECK-NEXT: [[TRUNC_1_LCSSA:%.*]] = phi i8 [ [[TRUNC_1]], [[LOOP]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: store i8 [[TRUNC_1_LCSSA]], ptr null, align 1
+; CHECK-NEXT: ret i32 0
+;
+entry:
+ br label %loop
+
+loop: ; preds = %loop, %entry
+ %phi.0 = phi i64 [ %incrementor, %loop ], [ 0, %entry ]
+ %incrementor = add i64 %phi.0, 1
+ ; Chain under test: the CHECK lines above expect the bitreverse call to be emitted on <4 x i1> in vector.body.
+ %zext.0 = zext i8 0 to i64
+ %intrinsic.0 = tail call i64 @llvm.bitreverse.i64(i64 %zext.0)
+ %cmp.0 = icmp ne i64 %intrinsic.0, 0
+ %zext.1 = zext i1 %cmp.0 to i64
+ %trunc.0 = trunc i64 %zext.1 to i32
+ %shl.0 = shl i32 %trunc.0, 8 ; shl by 8 followed by the trunc to i8 drops every bit of %trunc.0, so no bits of the chain stay alive
+ %trunc.1 = trunc i32 %shl.0 to i8
+
+ %exitcond6 = icmp ne i64 %phi.0, 16
+ br i1 %exitcond6, label %loop, label %loop.exit
+
+loop.exit: ; preds = %loop
+ store i8 %trunc.1, ptr null, align 1
+ ret i32 0
+}
+
+define i32 @truncate_bswap() #1 {
+; CHECK-LABEL: define i32 @truncate_bswap(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> zeroinitializer)
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i16> [[TMP0]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i1> [[TMP1]] to <4 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = shl <4 x i32> [[TMP2]], <i32 8, i32 8, i32 8, i32 8>
+; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i32> [[TMP3]] to <4 x i8>
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i8> [[TMP4]], i32 3
+; CHECK-NEXT: br i1 false, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[PHI_0:%.*]] = phi i64 [ [[INCREMENTOR:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[INCREMENTOR]] = add i64 [[PHI_0]], 1
+; CHECK-NEXT: [[ZEXT_0:%.*]] = zext i8 0 to i64
+; CHECK-NEXT: [[INTRINSIC_0:%.*]] = tail call i64 @llvm.bswap.i64(i64 [[ZEXT_0]])
+; CHECK-NEXT: [[CMP_0:%.*]] = icmp ne i64 [[INTRINSIC_0]], 0
+; CHECK-NEXT: [[ZEXT_1:%.*]] = zext i1 [[CMP_0]] to i64
+; CHECK-NEXT: [[TRUNC_0:%.*]] = trunc i64 [[ZEXT_1]] to i32
+; CHECK-NEXT: [[SHL_0:%.*]] = shl i32 [[TRUNC_0]], 8
+; CHECK-NEXT: [[TRUNC_1:%.*]] = trunc i32 [[SHL_0]] to i8
+; CHECK-NEXT: [[EXITCOND6:%.*]] = icmp ne i64 [[PHI_0]], 16
+; CHECK-NEXT: br i1 [[EXITCOND6]], label [[LOOP]], label [[LOOP_EXIT]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK: loop.exit:
+; CHECK-NEXT: [[TRUNC_1_LCSSA:%.*]] = phi i8 [ [[TRUNC_1]], [[LOOP]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: store i8 [[TRUNC_1_LCSSA]], ptr null, align 1
+; CHECK-NEXT: ret i32 0
+;
+entry:
+ br label %loop
+
+loop: ; preds = %loop, %entry
+ %phi.0 = phi i64 [ %incrementor, %loop ], [ 0, %entry ]
+ %incrementor = add i64 %phi.0, 1
+ ; Chain under test: the CHECK lines above expect the bswap call on <4 x i16> (not <4 x i1>) in vector.body, since a byteswap needs at least 16 bits.
+ %zext.0 = zext i8 0 to i64
+ %intrinsic.0 = tail call i64 @llvm.bswap.i64(i64 %zext.0)
+ %cmp.0 = icmp ne i64 %intrinsic.0, 0
+ %zext.1 = zext i1 %cmp.0 to i64
+ %trunc.0 = trunc i64 %zext.1 to i32
+ %shl.0 = shl i32 %trunc.0, 8 ; shl by 8 followed by the trunc to i8 drops every bit of %trunc.0, so no bits of the chain stay alive
+ %trunc.1 = trunc i32 %shl.0 to i8
+
+ %exitcond6 = icmp ne i64 %phi.0, 16
+ br i1 %exitcond6, label %loop, label %loop.exit
+
+loop.exit: ; preds = %loop
+ store i8 %trunc.1, ptr null, align 1
+ ret i32 0
+}
+
+; Unsupported intrinsics
+
+define i32 @truncate_sadd_sat() #1 {
+; CHECK-LABEL: define i32 @truncate_sadd_sat(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> zeroinitializer, <4 x i64> zeroinitializer)
+; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i64> [[TMP0]] to <4 x i1>
+; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <4 x i1> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i1> [[TMP2]] to <4 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], <i32 8, i32 8, i32 8, i32 8>
+; CHECK-NEXT: [[TMP5:%.*]] = trunc <4 x i32> [[TMP4]] to <4 x i8>
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i8> [[TMP5]], i32 3
+; CHECK-NEXT: br i1 false, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[PHI_0:%.*]] = phi i64 [ [[INCREMENTOR:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[INCREMENTOR]] = add i64 [[PHI_0]], 1
+; CHECK-NEXT: [[ZEXT_0:%.*]] = zext i8 0 to i64
+; CHECK-NEXT: [[INTRINSIC_0:%.*]] = tail call i64 @llvm.sadd.sat.i64(i64 [[ZEXT_0]], i64 0)
+; CHECK-NEXT: [[CMP_0:%.*]] = icmp ne i64 [[INTRINSIC_0]], 0
+; CHECK-NEXT: [[ZEXT_1:%.*]] = zext i1 [[CMP_0]] to i64
+; CHECK-NEXT: [[TRUNC_0:%.*]] = trunc i64 [[ZEXT_1]] to i32
+; CHECK-NEXT: [[SHL_0:%.*]] = shl i32 [[TRUNC_0]], 8
+; CHECK-NEXT: [[TRUNC_1:%.*]] = trunc i32 [[SHL_0]] to i8
+; CHECK-NEXT: [[EXITCOND6:%.*]] = icmp ne i64 [[PHI_0]], 16
+; CHECK-NEXT: br i1 [[EXITCOND6]], label [[LOOP]], label [[LOOP_EXIT]], !llvm.loop [[LOOP23:![0-9]+]]
+; CHECK: loop.exit:
+; CHECK-NEXT: [[TRUNC_1_LCSSA:%.*]] = phi i8 [ [[TRUNC_1]], [[LOOP]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: store i8 [[TRUNC_1_LCSSA]], ptr null, align 1
+; CHECK-NEXT: ret i32 0
+;
+entry:
+ br label %loop
+
+loop: ; preds = %loop, %entry
+ %phi.0 = phi i64 [ %incrementor, %loop ], [ 0, %entry ]
+ %incrementor = add i64 %phi.0, 1
+ ; Chain under test: the CHECK lines above expect the sadd.sat call to stay on <4 x i64>, with a trunc to <4 x i1> inserted after it.
+ %zext.0 = zext i8 0 to i64
+ %intrinsic.0 = tail call i64 @llvm.sadd.sat.i64(i64 %zext.0, i64 0)
+ %cmp.0 = icmp ne i64 %intrinsic.0, 0
+ %zext.1 = zext i1 %cmp.0 to i64
+ %trunc.0 = trunc i64 %zext.1 to i32
+ %shl.0 = shl i32 %trunc.0, 8 ; shl by 8 followed by the trunc to i8 drops every bit of %trunc.0, so no bits of the chain stay alive
+ %trunc.1 = trunc i32 %shl.0 to i8
+
+ %exitcond6 = icmp ne i64 %phi.0, 16
+ br i1 %exitcond6, label %loop, label %loop.exit
+
+loop.exit: ; preds = %loop
+ store i8 %trunc.1, ptr null, align 1
+ ret i32 0
+}
+
+define i32 @truncate_uadd_sat() #1 {
+; CHECK-LABEL: define i32 @truncate_uadd_sat(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> zeroinitializer, <4 x i64> zeroinitializer)
+; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i64> [[TMP0]] to <4 x i1>
+; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <4 x i1> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i1> [[TMP2]] to <4 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], <i32 8, i32 8, i32 8, i32 8>
+; CHECK-NEXT: [[TMP5:%.*]] = trunc <4 x i32> [[TMP4]] to <4 x i8>
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i8> [[TMP5]], i32 3
+; CHECK-NEXT: br i1 false, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[PHI_0:%.*]] = phi i64 [ [[INCREMENTOR:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[INCREMENTOR]] = add i64 [[PHI_0]], 1
+; CHECK-NEXT: [[ZEXT_0:%.*]] = zext i8 0 to i64
+; CHECK-NEXT: [[INTRINSIC_0:%.*]] = tail call i64 @llvm.uadd.sat.i64(i64 [[ZEXT_0]], i64 0)
+; CHECK-NEXT: [[CMP_0:%.*]] = icmp ne i64 [[INTRINSIC_0]], 0
+; CHECK-NEXT: [[ZEXT_1:%.*]] = zext i1 [[CMP_0]] to i64
+; CHECK-NEXT: [[TRUNC_0:%.*]] = trunc i64 [[ZEXT_1]] to i32
+; CHECK-NEXT: [[SHL_0:%.*]] = shl i32 [[TRUNC_0]], 8
+; CHECK-NEXT: [[TRUNC_1:%.*]] = trunc i32 [[SHL_0]] to i8
+; CHECK-NEXT: [[EXITCOND6:%.*]] = icmp ne i64 [[PHI_0]], 16
+; CHECK-NEXT: br i1 [[EXITCOND6]], label [[LOOP]], label [[LOOP_EXIT]], !llvm.loop [[LOOP25:![0-9]+]]
+; CHECK: loop.exit:
+; CHECK-NEXT: [[TRUNC_1_LCSSA:%.*]] = phi i8 [ [[TRUNC_1]], [[LOOP]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: store i8 [[TRUNC_1_LCSSA]], ptr null, align 1
+; CHECK-NEXT: ret i32 0
+;
+entry:
+ br label %loop
+
+loop: ; preds = %loop, %entry
+ %phi.0 = phi i64 [ %incrementor, %loop ], [ 0, %entry ]
+ %incrementor = add i64 %phi.0, 1
+ ; Chain under test: the CHECK lines above expect the uadd.sat call to stay on <4 x i64>, with a trunc to <4 x i1> inserted after it.
+ %zext.0 = zext i8 0 to i64
+ %intrinsic.0 = tail call i64 @llvm.uadd.sat.i64(i64 %zext.0, i64 0)
+ %cmp.0 = icmp ne i64 %intrinsic.0, 0
+ %zext.1 = zext i1 %cmp.0 to i64
+ %trunc.0 = trunc i64 %zext.1 to i32
+ %shl.0 = shl i32 %trunc.0, 8 ; shl by 8 followed by the trunc to i8 drops every bit of %trunc.0, so no bits of the chain stay alive
+ %trunc.1 = trunc i32 %shl.0 to i8
+
+ %exitcond6 = icmp ne i64 %phi.0, 16
+ br i1 %exitcond6, label %loop, label %loop.exit
+
+loop.exit: ; preds = %loop
+ store i8 %trunc.1, ptr null, align 1
+ ret i32 0
+}
+
+define i32 @truncate_ssub_sat() #1 {
+; CHECK-LABEL: define i32 @truncate_ssub_sat(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> zeroinitializer, <4 x i64> zeroinitializer)
+; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i64> [[TMP0]] to <4 x i1>
+; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <4 x i1> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i1> [[TMP2]] to <4 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], <i32 8, i32 8, i32 8, i32 8>
+; CHECK-NEXT: [[TMP5:%.*]] = trunc <4 x i32> [[TMP4]] to <4 x i8>
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i8> [[TMP5]], i32 3
+; CHECK-NEXT: br i1 false, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[PHI_0:%.*]] = phi i64 [ [[INCREMENTOR:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[INCREMENTOR]] = add i64 [[PHI_0]], 1
+; CHECK-NEXT: [[ZEXT_0:%.*]] = zext i8 0 to i64
+; CHECK-NEXT: [[INTRINSIC_0:%.*]] = tail call i64 @llvm.ssub.sat.i64(i64 [[ZEXT_0]], i64 0)
+; CHECK-NEXT: [[CMP_0:%.*]] = icmp ne i64 [[INTRINSIC_0]], 0
+; CHECK-NEXT: [[ZEXT_1:%.*]] = zext i1 [[CMP_0]] to i64
+; CHECK-NEXT: [[TRUNC_0:%.*]] = trunc i64 [[ZEXT_1]] to i32
+; CHECK-NEXT: [[SHL_0:%.*]] = shl i32 [[TRUNC_0]], 8
+; CHECK-NEXT: [[TRUNC_1:%.*]] = trunc i32 [[SHL_0]] to i8
+; CHECK-NEXT: [[EXITCOND6:%.*]] = icmp ne i64 [[PHI_0]], 16
+; CHECK-NEXT: br i1 [[EXITCOND6]], label [[LOOP]], label [[LOOP_EXIT]], !llvm.loop [[LOOP27:![0-9]+]]
+; CHECK: loop.exit:
+; CHECK-NEXT: [[TRUNC_1_LCSSA:%.*]] = phi i8 [ [[TRUNC_1]], [[LOOP]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: store i8 [[TRUNC_1_LCSSA]], ptr null, align 1
+; CHECK-NEXT: ret i32 0
+;
+entry:
+ br label %loop
+
+loop: ; preds = %loop, %entry
+ %phi.0 = phi i64 [ %incrementor, %loop ], [ 0, %entry ]
+ %incrementor = add i64 %phi.0, 1
+ ; Chain under test: ssub.sat (this test previously called uadd.sat by mistake). The CHECK lines above expect the call to stay on <4 x i64>, with a trunc to <4 x i1> inserted after it; re-run update_test_checks.py to confirm.
+ %zext.0 = zext i8 0 to i64
+ %intrinsic.0 = tail call i64 @llvm.ssub.sat.i64(i64 %zext.0, i64 0)
+ %cmp.0 = icmp ne i64 %intrinsic.0, 0
+ %zext.1 = zext i1 %cmp.0 to i64
+ %trunc.0 = trunc i64 %zext.1 to i32
+ %shl.0 = shl i32 %trunc.0, 8 ; shl by 8 followed by the trunc to i8 drops every bit of %trunc.0, so no bits of the chain stay alive
+ %trunc.1 = trunc i32 %shl.0 to i8
+
+ %exitcond6 = icmp ne i64 %phi.0, 16
+ br i1 %exitcond6, label %loop, label %loop.exit
+
+loop.exit: ; preds = %loop
+ store i8 %trunc.1, ptr null, align 1
+ ret i32 0
+}
+
+define i32 @truncate_usub_sat() #1 {
+; CHECK-LABEL: define i32 @truncate_usub_sat(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> zeroinitializer, <4 x i64> zeroinitializer)
+; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i64> [[TMP0]] to <4 x i1>
+; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <4 x i1> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i1> [[TMP2]] to <4 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], <i32 8, i32 8, i32 8, i32 8>
+; CHECK-NEXT: [[TMP5:%.*]] = trunc <4 x i32> [[TMP4]] to <4 x i8>
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i8> [[TMP5]], i32 3
+; CHECK-NEXT: br i1 false, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[PHI_0:%.*]] = phi i64 [ [[INCREMENTOR:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[INCREMENTOR]] = add i64 [[PHI_0]], 1
+; CHECK-NEXT: [[ZEXT_0:%.*]] = zext i8 0 to i64
+; CHECK-NEXT: [[INTRINSIC_0:%.*]] = tail call i64 @llvm.usub.sat.i64(i64 [[ZEXT_0]], i64 0)
+; CHECK-NEXT: [[CMP_0:%.*]] = icmp ne i64 [[INTRINSIC_0]], 0
+; CHECK-NEXT: [[ZEXT_1:%.*]] = zext i1 [[CMP_0]] to i64
+; CHECK-NEXT: [[TRUNC_0:%.*]] = trunc i64 [[ZEXT_1]] to i32
+; CHECK-NEXT: [[SHL_0:%.*]] = shl i32 [[TRUNC_0]], 8
+; CHECK-NEXT: [[TRUNC_1:%.*]] = trunc i32 [[SHL_0]] to i8
+; CHECK-NEXT: [[EXITCOND6:%.*]] = icmp ne i64 [[PHI_0]], 16
+; CHECK-NEXT: br i1 [[EXITCOND6]], label [[LOOP]], label [[LOOP_EXIT]], !llvm.loop [[LOOP29:![0-9]+]]
+; CHECK: loop.exit:
+; CHECK-NEXT: [[TRUNC_1_LCSSA:%.*]] = phi i8 [ [[TRUNC_1]], [[LOOP]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: store i8 [[TRUNC_1_LCSSA]], ptr null, align 1
+; CHECK-NEXT: ret i32 0
+;
+entry:
+ br label %loop
+
+loop: ; preds = %loop, %entry
+ %phi.0 = phi i64 [ %incrementor, %loop ], [ 0, %entry ]
+ %incrementor = add i64 %phi.0, 1
+ ; Chain under test: the CHECK lines above expect the usub.sat call to stay on <4 x i64>, with a trunc to <4 x i1> inserted after it.
+ %zext.0 = zext i8 0 to i64
+ %intrinsic.0 = tail call i64 @llvm.usub.sat.i64(i64 %zext.0, i64 0)
+ %cmp.0 = icmp ne i64 %intrinsic.0, 0
+ %zext.1 = zext i1 %cmp.0 to i64
+ %trunc.0 = trunc i64 %zext.1 to i32
+ %shl.0 = shl i32 %trunc.0, 8 ; shl by 8 followed by the trunc to i8 drops every bit of %trunc.0, so no bits of the chain stay alive
+ %trunc.1 = trunc i32 %shl.0 to i8
+
+ %exitcond6 = icmp ne i64 %phi.0, 16
+ br i1 %exitcond6, label %loop, label %loop.exit
+
+loop.exit: ; preds = %loop
+ store i8 %trunc.1, ptr null, align 1
+ ret i32 0
+}
+
+define i32 @truncate_sshl_sat() #1 {
+; CHECK-LABEL: define i32 @truncate_sshl_sat(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[PHI_0:%.*]] = phi i64 [ [[INCREMENTOR:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[INCREMENTOR]] = add i64 [[PHI_0]], 1
+; CHECK-NEXT: [[ZEXT_0:%.*]] = zext i8 0 to i64
+; CHECK-NEXT: [[INTRINSIC_0:%.*]] = tail call i64 @llvm.sshl.sat.i64(i64 [[ZEXT_0]], i64 0)
+; CHECK-NEXT: [[CMP_0:%.*]] = icmp ne i64 [[INTRINSIC_0]], 0
+; CHECK-NEXT: [[ZEXT_1:%.*]] = zext i1 [[CMP_0]] to i64
+; CHECK-NEXT: [[TRUNC_0:%.*]] = trunc i64 [[ZEXT_1]] to i32
+; CHECK-NEXT: [[SHL_0:%.*]] = shl i32 [[TRUNC_0]], 8
+; CHECK-NEXT: [[TRUNC_1:%.*]] = trunc i32 [[SHL_0]] to i8
+; CHECK-NEXT: [[EXITCOND6:%.*]] = icmp ne i64 [[PHI_0]], 16
+; CHECK-NEXT: br i1 [[EXITCOND6]], label [[LOOP]], label [[LOOP_EXIT:%.*]]
+; CHECK: loop.exit:
+; CHECK-NEXT: [[TRUNC_1_LCSSA:%.*]] = phi i8 [ [[TRUNC_1]], [[LOOP]] ]
+; CHECK-NEXT: store i8 [[TRUNC_1_LCSSA]], ptr null, align 1
+; CHECK-NEXT: ret i32 0
+;
+entry:
+ br label %loop
+
+loop: ; preds = %loop, %entry
+ %phi.0 = phi i64 [ %incrementor, %loop ], [ 0, %entry ]
+ %incrementor = add i64 %phi.0, 1
+ ; Chain under test: the CHECK lines above contain no vector.body, i.e. the loop with the sshl.sat call is expected to be left scalar.
+ %zext.0 = zext i8 0 to i64
+ %intrinsic.0 = tail call i64 @llvm.sshl.sat.i64(i64 %zext.0, i64 0)
+ %cmp.0 = icmp ne i64 %intrinsic.0, 0
+ %zext.1 = zext i1 %cmp.0 to i64
+ %trunc.0 = trunc i64 %zext.1 to i32
+ %shl.0 = shl i32 %trunc.0, 8 ; shl by 8 followed by the trunc to i8 drops every bit of %trunc.0, so no bits of the chain stay alive
+ %trunc.1 = trunc i32 %shl.0 to i8
+
+ %exitcond6 = icmp ne i64 %phi.0, 16
+ br i1 %exitcond6, label %loop, label %loop.exit
+
+loop.exit: ; preds = %loop
+ store i8 %trunc.1, ptr null, align 1
+ ret i32 0
+}
+
+define i32 @truncate_ushl_sat() #1 {
+; CHECK-LABEL: define i32 @truncate_ushl_sat(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[PHI_0:%.*]] = phi i64 [ [[INCREMENTOR:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[INCREMENTOR]] = add i64 [[PHI_0]], 1
+; CHECK-NEXT: [[ZEXT_0:%.*]] = zext i8 0 to i64
+; CHECK-NEXT: [[INTRINSIC_0:%.*]] = tail call i64 @llvm.ushl.sat.i64(i64 [[ZEXT_0]], i64 0)
+; CHECK-NEXT: [[CMP_0:%.*]] = icmp ne i64 [[INTRINSIC_0]], 0
+; CHECK-NEXT: [[ZEXT_1:%.*]] = zext i1 [[CMP_0]] to i64
+; CHECK-NEXT: [[TRUNC_0:%.*]] = trunc i64 [[ZEXT_1]] to i32
+; CHECK-NEXT: [[SHL_0:%.*]] = shl i32 [[TRUNC_0]], 8
+; CHECK-NEXT: [[TRUNC_1:%.*]] = trunc i32 [[SHL_0]] to i8
+; CHECK-NEXT: [[EXITCOND6:%.*]] = icmp ne i64 [[PHI_0]], 16
+; CHECK-NEXT: br i1 [[EXITCOND6]], label [[LOOP]], label [[LOOP_EXIT:%.*]]
+; CHECK: loop.exit:
+; CHECK-NEXT: [[TRUNC_1_LCSSA:%.*]] = phi i8 [ [[TRUNC_1]], [[LOOP]] ]
+; CHECK-NEXT: store i8 [[TRUNC_1_LCSSA]], ptr null, align 1
+; CHECK-NEXT: ret i32 0
+;
+entry:
+ br label %loop
+
+loop: ; preds = %loop, %entry
+ %phi.0 = phi i64 [ %incrementor, %loop ], [ 0, %entry ]
+ %incrementor = add i64 %phi.0, 1
+ ; Chain under test: the CHECK lines above contain no vector.body, i.e. the loop with the ushl.sat call is expected to be left scalar.
+ %zext.0 = zext i8 0 to i64
+ %intrinsic.0 = tail call i64 @llvm.ushl.sat.i64(i64 %zext.0, i64 0)
+ %cmp.0 = icmp ne i64 %intrinsic.0, 0
+ %zext.1 = zext i1 %cmp.0 to i64
+ %trunc.0 = trunc i64 %zext.1 to i32
+ %shl.0 = shl i32 %trunc.0, 8 ; shl by 8 followed by the trunc to i8 drops every bit of %trunc.0, so no bits of the chain stay alive
+ %trunc.1 = trunc i32 %shl.0 to i8
+
+ %exitcond6 = icmp ne i64 %phi.0, 16
+ br i1 %exitcond6, label %loop, label %loop.exit
+
+loop.exit: ; preds = %loop
+ store i8 %trunc.1, ptr null, align 1
+ ret i32 0
+}
+
+attributes #1 = { "target-features"="+v" }
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
+; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
+; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
+; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
+; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]}
+; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]}
+; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META2]], [[META1]]}
+; CHECK: [[LOOP18]] = distinct !{[[LOOP18]], [[META1]], [[META2]]}
+; CHECK: [[LOOP19]] = distinct !{[[LOOP19]], [[META2]], [[META1]]}
+; CHECK: [[LOOP20]] = distinct !{[[LOOP20]], [[META1]], [[META2]]}
+; CHECK: [[LOOP21]] = distinct !{[[LOOP21]], [[META2]], [[META1]]}
+; CHECK: [[LOOP22]] = distinct !{[[LOOP22]], [[META1]], [[META2]]}
+; CHECK: [[LOOP23]] = distinct !{[[LOOP23]], [[META2]], [[META1]]}
+; CHECK: [[LOOP24]] = distinct !{[[LOOP24]], [[META1]], [[META2]]}
+; CHECK: [[LOOP25]] = distinct !{[[LOOP25]], [[META2]], [[META1]]}
+; CHECK: [[LOOP26]] = distinct !{[[LOOP26]], [[META1]], [[META2]]}
+; CHECK: [[LOOP27]] = distinct !{[[LOOP27]], [[META2]], [[META1]]}
+; CHECK: [[LOOP28]] = distinct !{[[LOOP28]], [[META1]], [[META2]]}
+; CHECK: [[LOOP29]] = distinct !{[[LOOP29]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
index eda4723f67b23d..cf8c1b1e6e618d 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
@@ -904,7 +904,8 @@ TEST(VPRecipeTest, CastVPWidenCallRecipeToVPUserAndVPDef) {
Args.push_back(&Op1);
Args.push_back(&Op2);
Args.push_back(&CalledFn);
- VPWidenCallRecipe Recipe(Call, make_range(Args.begin(), Args.end()), false);
+ VPWidenCallRecipe Recipe(Call, make_range(Args.begin(), Args.end()), false,
+ Call->getType());
EXPECT_TRUE(isa<VPUser>(&Recipe));
VPRecipeBase *BaseR = &Recipe;
EXPECT_TRUE(isa<VPUser>(BaseR));
@@ -1170,7 +1171,7 @@ TEST(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) {
Args.push_back(&Op1);
Args.push_back(&Op2);
Args.push_back(&CalledFn);
- VPWidenCallRecipe Recipe(Call, make_range(Args.begin(), Args.end()), false);
+ VPWidenCallRecipe Recipe(Call, make_range(Args.begin(), Args.end()), false, Call->getType());
EXPECT_TRUE(Recipe.mayHaveSideEffects());
EXPECT_TRUE(Recipe.mayReadFromMemory());
EXPECT_TRUE(Recipe.mayWriteToMemory());
@@ -1193,7 +1194,7 @@ TEST(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) {
Args.push_back(&Op1);
Args.push_back(&Op2);
Args.push_back(&CalledFn);
- VPWidenCallRecipe Recipe(Call, make_range(Args.begin(), Args.end()), false);
+ VPWidenCallRecipe Recipe(Call, make_range(Args.begin(), Args.end()), false, Call->getType());
EXPECT_FALSE(Recipe.mayHaveSideEffects());
EXPECT_FALSE(Recipe.mayReadFromMemory());
EXPECT_FALSE(Recipe.mayWriteToMemory());
>From 7ee7768ab3c8798aa2b33f62ec2121cdf03bd92b Mon Sep 17 00:00:00 2001
From: Patrick O'Neill <patrick at rivosinc.com>
Date: Wed, 1 May 2024 15:33:09 -0700
Subject: [PATCH 2/3] fixup! [VPlan][LoopVectorize] Truncate min/max intrinsic
ops
---
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 981f31befc0c8c..4d897168112924 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -74,10 +74,9 @@ void VPlanTransforms::VPInstructionsToVPRecipes(
} else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands());
} else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
- NewRecipe =
- new VPWidenCallRecipe(CI, Ingredient.operands(),
- getVectorIntrinsicIDForCall(CI, &TLI),
- CI->getType(), CI->getDebugLoc());
+ NewRecipe = new VPWidenCallRecipe(
+ CI, Ingredient.operands(), getVectorIntrinsicIDForCall(CI, &TLI),
+ CI->getType(), CI->getDebugLoc());
} else if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands());
} else if (auto *CI = dyn_cast<CastInst>(Inst)) {
@@ -1050,7 +1049,9 @@ void VPlanTransforms::truncateToMinimalBitwidths(
// Shrink operands by introducing truncates as needed.
unsigned StartIdx = isa<VPWidenSelectRecipe>(&R) ? 1 : 0;
- for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
+ unsigned EndIdx =
+ R.getNumOperands() - (isa<VPWidenCallRecipe>(&R) ? 1 : 0);
+ for (unsigned Idx = StartIdx; Idx != EndIdx; ++Idx) {
auto *Op = R.getOperand(Idx);
unsigned OpSizeInBits =
TypeInfo.inferScalarType(Op)->getScalarSizeInBits();
>From e864a67ae6b68ea1fc0795ffe6c1a094b53472b8 Mon Sep 17 00:00:00 2001
From: Patrick O'Neill <patrick at rivosinc.com>
Date: Wed, 1 May 2024 15:55:53 -0700
Subject: [PATCH 3/3] fixup! [VPlan][LoopVectorize] Truncate min/max intrinsic
ops
---
llvm/lib/Transforms/Vectorize/VPlan.h | 3 ++-
llvm/unittests/Transforms/Vectorize/VPlanTest.cpp | 6 ++++--
2 files changed, 6 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 72122780924a63..2efe54db38d54d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1475,7 +1475,8 @@ class VPWidenCallRecipe : public VPSingleDefRecipe {
VPWidenCallRecipe *clone() override {
return new VPWidenCallRecipe(getUnderlyingValue(), operands(),
- VectorIntrinsicID, ResultTy, getDebugLoc(), Variant);
+ VectorIntrinsicID, ResultTy, getDebugLoc(),
+ Variant);
}
VP_CLASSOF_IMPL(VPDef::VPWidenCallSC)
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
index cf8c1b1e6e618d..cc1d36ce12c263 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
@@ -1171,7 +1171,8 @@ TEST(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) {
Args.push_back(&Op1);
Args.push_back(&Op2);
Args.push_back(&CalledFn);
- VPWidenCallRecipe Recipe(Call, make_range(Args.begin(), Args.end()), false, Call->getType());
+ VPWidenCallRecipe Recipe(Call, make_range(Args.begin(), Args.end()), false,
+ Call->getType());
EXPECT_TRUE(Recipe.mayHaveSideEffects());
EXPECT_TRUE(Recipe.mayReadFromMemory());
EXPECT_TRUE(Recipe.mayWriteToMemory());
@@ -1194,7 +1195,8 @@ TEST(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) {
Args.push_back(&Op1);
Args.push_back(&Op2);
Args.push_back(&CalledFn);
- VPWidenCallRecipe Recipe(Call, make_range(Args.begin(), Args.end()), false, Call->getType());
+ VPWidenCallRecipe Recipe(Call, make_range(Args.begin(), Args.end()), false,
+ Call->getType());
EXPECT_FALSE(Recipe.mayHaveSideEffects());
EXPECT_FALSE(Recipe.mayReadFromMemory());
EXPECT_FALSE(Recipe.mayWriteToMemory());
More information about the llvm-commits
mailing list