[llvm-branch-commits] [llvm] bd0c8fd - Revert "[SLP] Vectorize struct-returning intrinsics"
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Fri May 15 13:56:50 PDT 2026
Author: Kewen Meng
Date: 2026-05-15T13:56:45-07:00
New Revision: bd0c8fd2642cdd5b604dda333d6b545dcc6a780a
URL: https://github.com/llvm/llvm-project/commit/bd0c8fd2642cdd5b604dda333d6b545dcc6a780a
DIFF: https://github.com/llvm/llvm-project/commit/bd0c8fd2642cdd5b604dda333d6b545dcc6a780a.diff
LOG: Revert "[SLP] Vectorize struct-returning intrinsics"
This reverts commit 1c5e395e234b5c4c6048a51842480c0c074f6ccf.
Added:
Modified:
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
llvm/test/Transforms/SLPVectorizer/X86/arith-add-saddo.ll
llvm/test/Transforms/SLPVectorizer/X86/arith-add-uaddo.ll
llvm/test/Transforms/SLPVectorizer/X86/arith-mul-smulo.ll
llvm/test/Transforms/SLPVectorizer/X86/arith-mul-umulo.ll
llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssubo.ll
llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usubo.ll
llvm/test/Transforms/SLPVectorizer/X86/revec-non-power-2-to-power-2-large-vect.ll
llvm/test/Transforms/SLPVectorizer/sincos.ll
llvm/test/Transforms/SLPVectorizer/struct-return-revec.ll
Removed:
llvm/test/Transforms/SLPVectorizer/RISCV/complex-nonvect-struct-returned.ll
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 9cbbb85cf002c..40623dfe922ce 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -28,7 +28,6 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/SmallVectorExtras.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator.h"
#include "llvm/ADT/iterator_range.h"
@@ -72,7 +71,6 @@
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
-#include "llvm/IR/VectorTypeUtils.h"
#ifdef EXPENSIVE_CHECKS
#include "llvm/IR/Verifier.h"
#endif
@@ -302,10 +300,10 @@ static const unsigned MaxPHINumOperands = 128;
/// be inevitably scalarized.
static bool isValidElementType(Type *Ty) {
// TODO: Support ScalableVectorType.
- if (SLPReVec && isVectorizedTy(Ty))
- Ty = toScalarizedTy(Ty);
- return canVectorizeTy(Ty) && !Ty->isX86_FP80Ty() && !Ty->isPPC_FP128Ty() &&
- !Ty->isVoidTy();
+ if (SLPReVec && isa<FixedVectorType>(Ty))
+ Ty = Ty->getScalarType();
+ return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
+ !Ty->isPPC_FP128Ty();
}
/// Returns the "element type" of the given value/instruction \p V.
@@ -330,33 +328,15 @@ static Type *getValueType(Value *V, bool LookThroughCmp = false) {
static unsigned getNumElements(Type *Ty) {
assert(!isa<ScalableVectorType>(Ty) &&
"ScalableVectorType is not supported.");
- if (isVectorizedTy(Ty))
- return getVectorizedTypeVF(Ty).getFixedValue();
+ if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
+ return VecTy->getNumElements();
return 1;
}
/// \returns the vector type of ScalarTy based on vectorization factor.
-static Type *getWidenedType(Type *ScalarTy, unsigned VF) {
- if (VF == 1 && !isVectorizedTy(ScalarTy)) {
- // Workaround for 1 x vector types: toVectorizedTy returns the type
- // unchanged when EC is scalar, but BoUpSLP relies on widening to
- // <1 x ScalarTy> (or struct of <1 x ElTy>) to keep the rest of the
- // pipeline operating on vector types.
- if (auto *StructTy = dyn_cast<StructType>(ScalarTy)) {
- assert(isUnpackedStructLiteral(StructTy) &&
- "expected unpacked struct literal");
- assert(all_of(StructTy->elements(), VectorType::isValidElementType) &&
- "expected all element types to be valid vector element types");
- return StructType::get(
- StructTy->getContext(),
- map_to_vector(StructTy->elements(), [&](Type *ElTy) -> Type * {
- return FixedVectorType::get(ElTy, 1);
- }));
- }
- return FixedVectorType::get(ScalarTy, 1);
- }
- return toVectorizedTy(toScalarizedTy(ScalarTy),
- ElementCount::getFixed(VF * getNumElements(ScalarTy)));
+static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
+ return FixedVectorType::get(ScalarTy->getScalarType(),
+ VF * getNumElements(ScalarTy));
}
/// Returns the number of elements of the given type \p Ty, not less than \p Sz,
@@ -364,7 +344,7 @@ static Type *getWidenedType(Type *ScalarTy, unsigned VF) {
/// legalization.
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
Type *Ty, unsigned Sz) {
- if (!isValidElementType(Ty) || isa<StructType>(Ty))
+ if (!isValidElementType(Ty))
return bit_ceil(Sz);
// Find the number of elements, which forms full vectors.
const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
@@ -379,7 +359,7 @@ static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
static unsigned
getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
unsigned Sz) {
- if (!isValidElementType(Ty) || isa<StructType>(Ty))
+ if (!isValidElementType(Ty))
return bit_floor(Sz);
// Find the number of elements, which forms full vectors.
unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
@@ -2059,8 +2039,6 @@ static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
return false;
if (has_single_bit(Sz))
return true;
- if (isa<StructType>(Ty))
- return false;
const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
Sz % NumParts == 0;
@@ -2070,20 +2048,19 @@ static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
/// phase. If the type is going to be scalarized or does not uses whole
/// registers, returns 1.
static unsigned
-getNumberOfParts(const TargetTransformInfo &TTI, Type *VecTy, Type *ScalarTy,
+getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy,
+ Type *ScalarTy,
const unsigned Limit = std::numeric_limits<unsigned>::max()) {
- if (isa<StructType>(VecTy))
- return 1;
unsigned NumParts = TTI.getNumberOfParts(VecTy);
if (NumParts == 0 || NumParts >= Limit)
return 1;
unsigned Sz = getNumElements(VecTy);
unsigned ScalarSz = getNumElements(ScalarTy);
- Type *ElementTy = toScalarizedTy(VecTy);
- unsigned PWSz = getFullVectorNumberOfElements(TTI, ElementTy, Sz);
+ unsigned PWSz =
+ getFullVectorNumberOfElements(TTI, VecTy->getElementType(), Sz);
if (NumParts >= Sz || PWSz % NumParts != 0 ||
(PWSz / NumParts) % ScalarSz != 0 ||
- !hasFullVectorsOrPowerOf2(TTI, ElementTy, PWSz / NumParts))
+ !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), PWSz / NumParts))
return 1;
const unsigned NumElts = PWSz / NumParts;
if (divideCeil(Sz, NumElts) != NumParts)
@@ -2232,14 +2209,14 @@ class slpvectorizer::BoUpSLP {
ReductionBitWidth >=
DL->getTypeSizeInBits(
VectorizableTree.front()->Scalars.front()->getType()))
- return cast<FixedVectorType>(
- getWidenedType(VectorizableTree.front()->Scalars.front()->getType(),
- VectorizableTree.front()->getVectorFactor()));
- return cast<FixedVectorType>(getWidenedType(
+ return getWidenedType(
+ VectorizableTree.front()->Scalars.front()->getType(),
+ VectorizableTree.front()->getVectorFactor());
+ return getWidenedType(
IntegerType::get(
VectorizableTree.front()->Scalars.front()->getContext(),
ReductionBitWidth),
- VectorizableTree.front()->getVectorFactor()));
+ VectorizableTree.front()->getVectorFactor());
}
/// Returns true if the tree results in one of the reduced bitcasts variants.
@@ -4012,7 +3989,8 @@ class slpvectorizer::BoUpSLP {
/// scalar/slot type used to widen into \p VecTy/\p FinalVecTy and may itself
/// be a FixedVectorType in ReVec mode or an adjusted type due to MinBWs.
InstructionCost getVectorSpillReloadCost(const TreeEntry *E, Type *ScalarTy,
- Type *VecTy, Type *FinalVecTy,
+ VectorType *VecTy,
+ VectorType *FinalVecTy,
TTI::TargetCostKind CostKind) const;
/// This is the recursive part of buildTree.
@@ -7129,12 +7107,12 @@ static InstructionCost getExtractWithExtendCost(
const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst,
VectorType *VecTy, unsigned Index,
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) {
- if (isVectorizedTy(Dst)) {
+ if (auto *ScalarTy = dyn_cast<FixedVectorType>(Dst)) {
assert(SLPReVec && "Only supported by REVEC.");
- auto *SubTp = cast<FixedVectorType>(
- getWidenedType(toScalarizedTy(VecTy), getNumElements(Dst)));
+ auto *SubTp =
+ getWidenedType(VecTy->getElementType(), ScalarTy->getNumElements());
return getShuffleCost(TTI, TTI::SK_ExtractSubvector, VecTy, {}, CostKind,
- Index * getNumElements(Dst), SubTp) +
+ Index * ScalarTy->getNumElements(), SubTp) +
TTI.getCastInstrCost(Opcode, Dst, SubTp, TTI::CastContextHint::None,
CostKind);
}
@@ -7227,7 +7205,7 @@ static bool isMaskedLoadCompress(
InterleaveFactor = 0;
Type *ScalarTy = VL.front()->getType();
const size_t Sz = VL.size();
- auto *VecTy = cast<VectorType>(getWidenedType(ScalarTy, Sz));
+ auto *VecTy = getWidenedType(ScalarTy, Sz);
constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
SmallVector<int> Mask;
if (!Order.empty())
@@ -7263,7 +7241,7 @@ static bool isMaskedLoadCompress(
// Check for very large distances between elements.
if (*Diff / Sz >= MaxRegSize / 8)
return false;
- LoadVecTy = cast<FixedVectorType>(getWidenedType(ScalarTy, *Diff + 1));
+ LoadVecTy = getWidenedType(ScalarTy, *Diff + 1);
auto *LI = cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]);
Align CommonAlignment = LI->getAlign();
IsMasked = !isSafeToLoadUnconditionally(
@@ -7312,8 +7290,8 @@ static bool isMaskedLoadCompress(
}
if (IsStrided && !IsMasked && Order.empty()) {
// Check for potential segmented(interleaved) loads.
- VectorType *AlignedLoadVecTy = cast<VectorType>(getWidenedType(
- ScalarTy, getFullVectorNumberOfElements(TTI, ScalarTy, *Diff + 1)));
+ VectorType *AlignedLoadVecTy = getWidenedType(
+ ScalarTy, getFullVectorNumberOfElements(TTI, ScalarTy, *Diff + 1));
if (!isSafeToLoadUnconditionally(Ptr0, AlignedLoadVecTy, CommonAlignment,
DL, cast<LoadInst>(VL.back()), &AC, &DT,
&TLI))
@@ -7504,7 +7482,7 @@ bool BoUpSLP::analyzeConstantStrideCandidate(
Type *StrideTy = DL->getIndexType(Ptr0->getType());
SPtrInfo.StrideVal = ConstantInt::getSigned(StrideTy, StrideIntVal);
- SPtrInfo.Ty = cast<FixedVectorType>(getWidenedType(NewScalarTy, VecSz));
+ SPtrInfo.Ty = getWidenedType(NewScalarTy, VecSz);
return true;
}
@@ -7559,8 +7537,7 @@ bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
NewScalarTy = Type::getIntNTy(
SE->getContext(),
DL->getTypeSizeInBits(BaseTy).getFixedValue() * NumOffsets);
- auto *StridedLoadTy =
- cast<FixedVectorType>(getWidenedType(NewScalarTy, VecSz));
+ FixedVectorType *StridedLoadTy = getWidenedType(NewScalarTy, VecSz);
unsigned MinProfitableStridedOps =
IsLoad ? MinProfitableStridedLoads : MinProfitableStridedStores;
const unsigned BaseTyNumElts = getNumElements(BaseTy);
@@ -7759,7 +7736,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
// Check the order of pointer operands or that all pointers are the same.
bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
- auto *VecTy = cast<VectorType>(getWidenedType(ScalarTy, Sz));
+ auto *VecTy = getWidenedType(ScalarTy, Sz);
Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
// Cache masked gather legality - both the !IsSorted path below and the
// post-branch check use the same VecTy/CommonAlignment, and the underlying
@@ -7840,7 +7817,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
// estimate as a buildvector, otherwise estimate as splat.
APInt DemandedElts = APInt::getAllOnes(Sz);
Type *PtrScalarTy = PointerOps.front()->getType()->getScalarType();
- auto *PtrVecTy = cast<VectorType>(getWidenedType(PtrScalarTy, Sz));
+ VectorType *PtrVecTy = getWidenedType(PtrScalarTy, Sz);
// Cache the underlying object of PointerOps.front() - it is invariant
// across the per-V comparisons below and getUnderlyingObject walks
// GEP/cast chains.
@@ -7937,7 +7914,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
}
for (const auto &[SliceStart, LS] : States) {
const unsigned SliceVF = std::min<unsigned>(VF, VL.size() - SliceStart);
- auto *SubVecTy = cast<VectorType>(getWidenedType(ScalarTy, SliceVF));
+ auto *SubVecTy = getWidenedType(ScalarTy, SliceVF);
auto *LI0 = cast<LoadInst>(VL[SliceStart]);
InstructionCost VectorGEPCost =
(LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
@@ -8542,8 +8519,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
const auto *It = find_if_not(TE.Scalars, isConstant);
if (It == TE.Scalars.begin())
return OrdersType();
- auto *Ty =
- cast<VectorType>(getWidenedType(TE.Scalars.front()->getType(), Sz));
+ auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz);
if (It != TE.Scalars.end()) {
OrdersType Order(Sz, Sz);
unsigned Idx = std::distance(TE.Scalars.begin(), It);
@@ -8801,12 +8777,6 @@ void BoUpSLP::reorderTopToBottom() {
// Maps a TreeEntry to the reorder indices of external users.
DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
ExternalUserReorderMap;
- // TODO: Reordering of struct types is not supported.
- if (any_of(VectorizableTree, [](const std::unique_ptr<TreeEntry> &TE) {
- return TE->State == TreeEntry::Vectorize &&
- isa<StructType>(getValueType(TE->Scalars.front()));
- }))
- return;
// Compute IgnoreReorder once - it depends only on UserIgnoreList and
// VectorizableTree.front(), which do not change during this loop.
const bool IgnoreReorder =
@@ -8833,8 +8803,7 @@ void BoUpSLP::reorderTopToBottom() {
if (TE->hasState() && TE->isAltShuffle() &&
TE->State != TreeEntry::SplitVectorize) {
Type *ScalarTy = TE->Scalars[0]->getType();
- auto *VecTy =
- cast<VectorType>(getWidenedType(ScalarTy, TE->Scalars.size()));
+ VectorType *VecTy = getWidenedType(ScalarTy, TE->Scalars.size());
unsigned Opcode0 = TE->getOpcode();
unsigned Opcode1 = TE->getAltOpcode();
SmallBitVector OpcodeMask(
@@ -9203,10 +9172,6 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
}
if (Users.first) {
auto &Data = Users;
- // TODO: Reordering of struct types is not supported.
- if (Data.first->State == TreeEntry::Vectorize &&
- isa<StructType>(getValueType(Data.first->Scalars.front())))
- continue;
if (Data.first->State == TreeEntry::SplitVectorize) {
assert(
Data.second.size() <= 2 &&
@@ -10007,8 +9972,7 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()),
Loads.size());
Align Alignment = computeCommonAlignment<LoadInst>(Values);
- auto *Ty = cast<VectorType>(
- getWidenedType(Loads.front()->getType(), Loads.size()));
+ auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
return TTI->isLegalMaskedGather(Ty, Alignment) &&
!TTI->forceScalarizeMaskedGather(Ty, Alignment);
};
@@ -10306,8 +10270,7 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
// Segmented load detected - vectorize at maximum vector factor.
if (InterleaveFactor <= Slice.size() &&
TTI.isLegalInterleavedAccessType(
- cast<VectorType>(
- getWidenedType(Slice.front()->getType(), VF)),
+ getWidenedType(Slice.front()->getType(), VF),
InterleaveFactor,
cast<LoadInst>(Slice.front())->getAlign(),
cast<LoadInst>(Slice.front())
@@ -10567,10 +10530,11 @@ buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
/// function (if possible) calls. Returns invalid cost for the corresponding
/// calls, if they cannot be vectorized/will be scalarized.
static std::pair<InstructionCost, InstructionCost>
-getVectorCallCosts(CallInst *CI, Type *VecTy, TargetTransformInfo *TTI,
- TargetLibraryInfo *TLI, ArrayRef<Type *> ArgTys) {
+getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
+ TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
+ ArrayRef<Type *> ArgTys) {
auto Shape = VFShape::get(CI->getFunctionType(),
- ElementCount::getFixed(getNumElements(VecTy)),
+ ElementCount::getFixed(VecTy->getNumElements()),
false /*HasGlobalPred*/);
Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
auto LibCost = InstructionCost::getInvalid();
@@ -10631,70 +10595,6 @@ ArrayRef<const Loop *> BoUpSLP::getLoopNest(const Loop *L) {
return Res;
}
-/// Detects an extractvalue bundle that can be widened by vectorizing the
-/// underlying struct-returning calls.
-///
-/// \p VL is a bundle whose state \p S is Instruction::ExtractValue. The
-/// bundle is acceptable for widening into one struct-of-vectors call only
-/// when:
-/// - every element of \p VL is an ExtractValueInst,
-/// - every ExtractValueInst extracts the same struct field (its
-/// getIndices() matches the main op's indices),
-/// - the aggregate operands form a uniform set of CallInsts (per
-/// getSameOpcode) that is not an alt-shuffle and whose return type is
-/// a literal struct, and
-/// - every user of every such call is itself an ExtractValueInst, so the
-/// external-use extraction code can rebuild scalars via extractvalue +
-/// extractelement without needing an insertvalue chain.
-///
-/// On success returns true and fills \p Indices with the common field
-/// index path and \p Calls with the per-lane aggregate calls (in VL order),
-/// for the caller to feed as the operand of the new tree entry. Otherwise
-/// returns false and leaves the output parameters untouched.
-static bool checkEVsForVecCalls(ArrayRef<Value *> VL,
- const InstructionsState &S,
- const TargetLibraryInfo &TLI,
- SmallVectorImpl<unsigned> &Indices,
- SmallVectorImpl<Value *> &Calls) {
- assert(S && S.getOpcode() == Instruction::ExtractValue &&
- "Expected extractvalue instruction state.");
- if (!all_of(VL, IsaPred<ExtractValueInst>))
- return false;
- auto *VL0 = cast<ExtractValueInst>(S.getMainOp());
- ArrayRef<unsigned> VL0Indices = VL0->getIndices();
- SmallVector<Value *> Aggregates;
- for (Value *V : VL) {
- if (V == VL0) {
- Aggregates.push_back(VL0->getAggregateOperand());
- continue;
- }
- auto *IV = cast<ExtractValueInst>(V);
- if (IV->getIndices() != VL0Indices ||
- isa<ScalableVectorType>(IV->getType()))
- return false;
- Value *Agg = IV->getAggregateOperand();
- Aggregates.push_back(Agg);
- }
- const InstructionsState AggState = getSameOpcode(Aggregates, TLI);
- if (AggState && AggState.getOpcode() == Instruction::Call &&
- !AggState.isAltShuffle() &&
- isa<StructType>(AggState.getMainOp()->getType()) &&
- allSameBlock(Aggregates)) {
- // The struct-returning call may have non-bundle users too. The external
- // extraction code rebuilds scalars by extractvalue + extractelement,
- // which only works when every user of the call is an ExtractValueInst.
- // Bail out if any aggregate has a different kind of user.
- for (Value *Agg : Aggregates) {
- if (!all_of(Agg->users(), IsaPred<ExtractValueInst>))
- return false;
- }
- Indices.assign(VL0Indices.begin(), VL0Indices.end());
- Calls.swap(Aggregates);
- return true;
- }
- return false;
-}
-
BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
const InstructionsState &S, ArrayRef<Value *> VL,
bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
@@ -10740,11 +10640,6 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
bool Reuse = canReuseExtract(VL, CurrentOrder);
if (Reuse || !CurrentOrder.empty())
return TreeEntry::Vectorize;
- SmallVector<unsigned> Indices;
- SmallVector<Value *> Calls;
- if (ShuffleOrOp == Instruction::ExtractValue &&
- checkEVsForVecCalls(VL, S, *TLI, Indices, Calls))
- return TreeEntry::Vectorize;
LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
return TreeEntry::NeedToGather;
}
@@ -11266,12 +11161,6 @@ static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL,
const InstructionsState &S,
const BoUpSLP::EdgeInfo &UserTreeIdx,
const BoUpSLP &R, bool BuildGatherOnly = true) {
- // TODO: Reordering of struct types is not supported.
- if (isa<StructType>(getValueType(VL.front()))) {
- LLVM_DEBUG(dbgs() << "SLP: struct type in bundle.\n");
- ReuseShuffleIndices.clear();
- return true;
- }
// Check that every instruction appears once in this bundle.
SmallVector<Value *> UniqueValues;
SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
@@ -11383,9 +11272,8 @@ static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL,
if (Val != PoisonMaskElem && UniquePositions.contains(UniqueValues[Val]))
DemandedElts.setBit(Idx);
Type *ScalarTy = ::getValueType(UniqueValues.front());
- auto *VecTy = cast<VectorType>(getWidenedType(ScalarTy, VL.size()));
- auto *UniquesVecTy =
- cast<VectorType>(getWidenedType(ScalarTy, NumUniqueScalarValues));
+ auto *VecTy = getWidenedType(ScalarTy, VL.size());
+ auto *UniquesVecTy = getWidenedType(ScalarTy, NumUniqueScalarValues);
const unsigned NumParts = ::getNumberOfParts(TTI, VecTy, ScalarTy);
const unsigned UniquesNumParts =
::getNumberOfParts(TTI, UniquesVecTy, ScalarTy);
@@ -11541,7 +11429,7 @@ bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL,
Op2.push_back(V);
}
Type *ScalarTy = getValueType(VL.front());
- auto *VecTy = cast<VectorType>(getWidenedType(ScalarTy, VL.size()));
+ VectorType *VecTy = getWidenedType(ScalarTy, VL.size());
unsigned Opcode0 = LocalState.getOpcode();
unsigned Opcode1 = LocalState.getAltOpcode();
SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
@@ -11576,8 +11464,8 @@ bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL,
if (!ReorderIndices.empty())
inversePermutation(ReorderIndices, Mask);
unsigned NumParts = TTI->getNumberOfParts(VecTy);
- auto *Op1VecTy = cast<VectorType>(getWidenedType(ScalarTy, Op1.size()));
- auto *Op2VecTy = cast<VectorType>(getWidenedType(ScalarTy, Op2.size()));
+ VectorType *Op1VecTy = getWidenedType(ScalarTy, Op1.size());
+ VectorType *Op2VecTy = getWidenedType(ScalarTy, Op2.size());
// Check non-profitable single register ops, which better to be represented
// as alternate ops.
if (NumParts >= VL.size())
@@ -11585,8 +11473,8 @@ bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL,
constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
InstructionCost InsertCost = ::getShuffleCost(
*TTI, TTI::SK_InsertSubvector, VecTy, {}, Kind, Op1.size(), Op2VecTy);
- auto *SubVecTy = cast<VectorType>(
- getWidenedType(ScalarTy, std::max(Op1.size(), Op2.size())));
+ FixedVectorType *SubVecTy =
+ getWidenedType(ScalarTy, std::max(Op1.size(), Op2.size()));
InstructionCost NewShuffleCost =
::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, SubVecTy, Mask, Kind);
if (!LocalState.isCmpOp() && NumParts <= 1 &&
@@ -11786,16 +11674,7 @@ class InstructionsCompatibilityAnalysis {
Handler.getOperands(I).end());
return;
}
- case Instruction::ExtractValue: {
- SmallVector<unsigned> Indices;
- SmallVector<Value *> Calls;
- if (checkEVsForVecCalls(VL, S, TLI, Indices, Calls)) {
- Operands.assign(1, {});
- Operands[0].swap(Calls);
- return;
- }
- [[fallthrough]];
- }
+ case Instruction::ExtractValue:
case Instruction::ExtractElement:
// This is a special case, as it does not gather, but at the same time
// we are not extending buildTree_rec() towards the operands.
@@ -12183,7 +12062,7 @@ class InstructionsCompatibilityAnalysis {
}
if (S && S.isAltShuffle()) {
Type *ScalarTy = S.getMainOp()->getType();
- auto *VecTy = cast<VectorType>(getWidenedType(ScalarTy, VL.size()));
+ VectorType *VecTy = getWidenedType(ScalarTy, VL.size());
unsigned Opcode0 = S.getOpcode();
unsigned Opcode1 = S.getAltOpcode();
SmallBitVector OpcodeMask(
@@ -12248,7 +12127,8 @@ class InstructionsCompatibilityAnalysis {
constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
InstructionCost ScalarCost = TTI.getInstructionCost(S.getMainOp(), Kind);
InstructionCost VectorCost;
- auto *VecTy = getWidenedType(S.getMainOp()->getType(), VL.size());
+ FixedVectorType *VecTy =
+ getWidenedType(S.getMainOp()->getType(), VL.size());
switch (MainOpcode) {
case Instruction::Add:
case Instruction::Sub:
@@ -12650,7 +12530,7 @@ BoUpSLP::getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
// Rough cost estimation, if the vector code (+ potential extracts) is
// more profitable than the scalar + buildvector.
Type *ScalarTy = VL.front()->getType();
- auto *VecTy = cast<VectorType>(getWidenedType(ScalarTy, VL.size()));
+ auto *VecTy = getWidenedType(ScalarTy, VL.size());
InstructionCost VectorizeCostEstimate =
::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, {}, Kind) +
::getScalarizationOverhead(*TTI, ScalarTy, VecTy, Extracted,
@@ -12969,12 +12849,6 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
// This is a special case, as it does not gather, but at the same time
// we are not extending buildTreeRec() towards the operands.
TE->setOperands(Operands);
- if (ShuffleOrOp == Instruction::ExtractValue) {
- SmallVector<unsigned> Indices;
- SmallVector<Value *> Calls;
- if (checkEVsForVecCalls(VL, S, *TLI, Indices, Calls))
- buildTreeRec(Operands.front(), Depth + 1, {TE, 0});
- }
return;
}
case Instruction::InsertElement: {
@@ -14278,11 +14152,10 @@ void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
InstructionCost Cost = 0;
auto *ScalarTy = TE.Scalars.front()->getType();
- auto *VecTy = cast<VectorType>(getWidenedType(ScalarTy, TE.Scalars.size()));
+ auto *VecTy = getWidenedType(ScalarTy, TE.Scalars.size());
for (auto [Idx, Sz] : SubVectors) {
- Cost +=
- ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {}, CostKind,
- Idx, cast<VectorType>(getWidenedType(ScalarTy, Sz)));
+ Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {}, CostKind,
+ Idx, getWidenedType(ScalarTy, Sz));
}
Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElts,
/*Insert=*/true,
@@ -14495,10 +14368,8 @@ bool BoUpSLP::matchesShlZExt(const TreeEntry &TE, OrdersType &Order,
Stride * LhsTE->getVectorFactor());
FastMathFlags FMF;
SmallPtrSet<Value *, 4> CheckedExtracts;
- auto *VecTy =
- cast<VectorType>(getWidenedType(ScalarTy, TE.getVectorFactor()));
- auto *SrcVecTy =
- cast<VectorType>(getWidenedType(SrcScalarTy, LhsTE->getVectorFactor()));
+ auto *VecTy = getWidenedType(ScalarTy, TE.getVectorFactor());
+ auto *SrcVecTy = getWidenedType(SrcScalarTy, LhsTE->getVectorFactor());
TTI::CastContextHint CastCtx =
getCastContextHint(*getOperandEntry(LhsTE, /*Idx=*/0));
InstructionCost VecCost =
@@ -14629,7 +14500,7 @@ bool BoUpSLP::matchesInversedZExtSelect(
if (InversedCmpsIndices.empty())
return false;
- Type *VecTy =
+ VectorType *VecTy =
getWidenedType(Cmp->getOperand(0)->getType(), CmpTE->getVectorFactor());
Type *CmpTy = CmpInst::makeCmpResultType(VecTy);
@@ -14690,16 +14561,14 @@ bool BoUpSLP::matchesSelectOfBits(const TreeEntry &SelectTE) const {
// Check if bitcast is cheaper than select.
auto *DstTy = IntegerType::getIntNTy(ScalarTy->getContext(),
SelectTE.getVectorFactor());
- Type *OpTy = getWidenedType(DstTy, SelectTE.getVectorFactor());
+ VectorType *OpTy = getWidenedType(DstTy, SelectTE.getVectorFactor());
Type *CmpTy = CmpInst::makeCmpResultType(OpTy);
- auto *VecTy =
- cast<VectorType>(getWidenedType(ScalarTy, SelectTE.getVectorFactor()));
+ VectorType *VecTy = getWidenedType(ScalarTy, SelectTE.getVectorFactor());
auto It = MinBWs.find(&SelectTE);
if (It != MinBWs.end()) {
auto *EffectiveScalarTy =
IntegerType::get(F->getContext(), It->second.first);
- VecTy = cast<VectorType>(
- getWidenedType(EffectiveScalarTy, SelectTE.getVectorFactor()));
+ VecTy = getWidenedType(EffectiveScalarTy, SelectTE.getVectorFactor());
}
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
InstructionCost BitcastCost = TTI->getCastInstrCost(
@@ -15013,8 +14882,7 @@ void BoUpSLP::transformNodes() {
if (E.State != TreeEntry::Vectorize)
break;
Type *ScalarTy = E.getMainOp()->getType();
- auto *VecTy =
- cast<FixedVectorType>(getWidenedType(ScalarTy, E.Scalars.size()));
+ auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
// Check if profitable to represent consecutive load + reverse as strided
// load with stride -1.
@@ -15052,8 +14920,7 @@ void BoUpSLP::transformNodes() {
case Instruction::Store: {
Type *ScalarTy =
cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
- auto *VecTy =
- cast<FixedVectorType>(getWidenedType(ScalarTy, E.Scalars.size()));
+ auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
// Check if profitable to represent consecutive load + reverse as strided
// load with stride -1.
@@ -15380,7 +15247,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
return TTI::TCC_Free;
- auto *VecTy = cast<VectorType>(getWidenedType(ScalarTy, VL.size()));
+ auto *VecTy = getWidenedType(ScalarTy, VL.size());
InstructionCost GatherCost = 0;
SmallVector<Value *> Gathers(VL);
if (!Root && isSplat(VL)) {
@@ -15520,34 +15387,32 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
!ShuffleVectorInst::isIdentityMask(
MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
- Cost += ::getShuffleCost(
- TTI, *ShuffleKinds[Part],
- cast<VectorType>(getWidenedType(ScalarTy, NumElts)), MaskSlice);
+ Cost +=
+ ::getShuffleCost(TTI, *ShuffleKinds[Part],
+ getWidenedType(ScalarTy, NumElts), MaskSlice);
continue;
}
if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
!ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
- Cost += ::getShuffleCost(
- TTI, *RegShuffleKind,
- cast<VectorType>(getWidenedType(ScalarTy, EltsPerVector)), SubMask);
+ Cost +=
+ ::getShuffleCost(TTI, *RegShuffleKind,
+ getWidenedType(ScalarTy, EltsPerVector), SubMask);
}
const unsigned BaseVF = getFullVectorNumberOfElements(
*R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
for (const auto [Idx, SubVecSize] : zip(Indices, SubVecSizes)) {
assert((Idx + SubVecSize) <= BaseVF &&
"SK_ExtractSubvector index out of range");
- Cost += ::getShuffleCost(
- TTI, TTI::SK_ExtractSubvector,
- cast<VectorType>(getWidenedType(ScalarTy, BaseVF)), {}, CostKind,
- Idx, cast<VectorType>(getWidenedType(ScalarTy, SubVecSize)));
+ Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector,
+ getWidenedType(ScalarTy, BaseVF), {}, CostKind,
+ Idx, getWidenedType(ScalarTy, SubVecSize));
}
// Second attempt to check, if just a permute is better estimated than
// subvector extract.
SubMask.assign(NumElts, PoisonMaskElem);
copy(MaskSlice, SubMask.begin());
InstructionCost OriginalCost = ::getShuffleCost(
- TTI, *ShuffleKinds[Part],
- cast<VectorType>(getWidenedType(ScalarTy, NumElts)), SubMask);
+ TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask);
if (OriginalCost < Cost)
Cost = OriginalCost;
}
@@ -16222,10 +16087,9 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
I1 = I2 + CommonMask.size();
}
}
- Cost += ::getShuffleCost(
- TTI, TTI::SK_PermuteTwoSrc,
- cast<VectorType>(getWidenedType(ScalarTy, CommonMask.size())),
- SVMask, CostKind);
+ Cost += ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
+ getWidenedType(ScalarTy, CommonMask.size()),
+ SVMask, CostKind);
}
for (auto [E, Idx] : SubVectors) {
Type *EScalarTy = E->Scalars.front()->getType();
@@ -16248,9 +16112,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
}
Cost += ::getShuffleCost(
TTI, TTI::SK_InsertSubvector,
- cast<VectorType>(getWidenedType(ScalarTy, CommonMask.size())), {},
- CostKind, Idx,
- cast<VectorType>(getWidenedType(ScalarTy, E->getVectorFactor())));
+ getWidenedType(ScalarTy, CommonMask.size()), {}, CostKind, Idx,
+ getWidenedType(ScalarTy, E->getVectorFactor()));
if (!CommonMask.empty()) {
std::iota(std::next(CommonMask.begin(), Idx),
std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
@@ -16449,7 +16312,7 @@ uint64_t BoUpSLP::getGatherNodeEffectiveScale(const TreeEntry &TE) {
InstructionCost
BoUpSLP::getVectorSpillReloadCost(const TreeEntry *E, Type *ScalarTy,
- Type *VecTy, Type *FinalVecTy,
+ VectorType *VecTy, VectorType *FinalVecTy,
TTI::TargetCostKind CostKind) const {
InstructionCost SpillsReloads = 0;
@@ -16475,7 +16338,8 @@ BoUpSLP::getVectorSpillReloadCost(const TreeEntry *E, Type *ScalarTy,
PressureByClass[RegClass] += Parts;
};
- auto GetEntryVecTy = [&](const TreeEntry *TE) -> std::pair<Type *, Type *> {
+ auto GetEntryVecTy =
+ [&](const TreeEntry *TE) -> std::pair<Type *, VectorType *> {
Type *ScalarTy = getValueType(TE->Scalars.front());
auto BWIt = MinBWs.find(TE);
if (BWIt != MinBWs.end()) {
@@ -16627,22 +16491,21 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
InstructionCost VectorCost = 0;
if (E->ReorderIndices.empty()) {
VectorCost = ::getShuffleCost(
- *TTI, TTI::SK_InsertSubvector, cast<VectorType>(FinalVecTy), {},
- CostKind, E->CombinedEntriesWithIndices.back().second,
- cast<VectorType>(getWidenedType(
+ *TTI, TTI::SK_InsertSubvector, FinalVecTy, {}, CostKind,
+ E->CombinedEntriesWithIndices.back().second,
+ getWidenedType(
ScalarTy,
VectorizableTree[E->CombinedEntriesWithIndices.back().first]
- ->getVectorFactor())));
+ ->getVectorFactor()));
} else {
unsigned CommonVF =
std::max(VectorizableTree[E->CombinedEntriesWithIndices.front().first]
->getVectorFactor(),
VectorizableTree[E->CombinedEntriesWithIndices.back().first]
->getVectorFactor());
- VectorCost =
- ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
- cast<VectorType>(getWidenedType(ScalarTy, CommonVF)),
- E->getSplitMask(), CostKind);
+ VectorCost = ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
+ getWidenedType(ScalarTy, CommonVF),
+ E->getSplitMask(), CostKind);
}
VectorCost += SpillsReloads;
LLVM_DEBUG(dumpTreeCosts(E, 0, VectorCost, 0, "Calculated costs for Tree"));
@@ -16666,9 +16529,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
if (!E->ReuseShuffleIndices.empty())
::addMask(Mask, E->ReuseShuffleIndices);
if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
- CommonCost = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
- cast<VectorType>(FinalVecTy), Mask, CostKind,
- /*Index=*/0, cast<VectorType>(VecTy));
+ CommonCost = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy,
+ Mask, CostKind, /*Index=*/0, VecTy);
assert((E->State == TreeEntry::Vectorize ||
E->State == TreeEntry::ScatterVectorize ||
E->State == TreeEntry::StridedVectorize ||
@@ -16776,9 +16638,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
"MaskedLoadCompressVectorize here.");
InstructionCost ScalarCost = 0;
InstructionCost VecCost = 0;
- std::tie(ScalarCost, VecCost) =
- getGEPCosts(*TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy,
- cast<VectorType>(VecTy));
+ std::tie(ScalarCost, VecCost) = getGEPCosts(
+ *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
"Calculated GEPs cost for Tree"));
@@ -16863,7 +16724,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
NumElts = ATy->getNumElements();
else
NumElts = AggregateTy->getStructNumElements();
- SrcVecTy = cast<VectorType>(getWidenedType(OrigScalarTy, NumElts));
+ SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
}
}
if (I->hasOneUse()) {
@@ -16968,7 +16829,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
// need to shift the vector.
// Do not calculate the cost if the actual size is the register size and
// we can merge this shuffle with the following SK_Select.
- auto *InsertVecTy = cast<VectorType>(getWidenedType(ScalarTy, InsertVecSz));
+ auto *InsertVecTy = getWidenedType(ScalarTy, InsertVecSz);
if (!IsIdentity)
Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc,
InsertVecTy, Mask);
@@ -16984,7 +16845,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
if (InsertVecSz != VecSz) {
- auto *ActualVecTy = cast<VectorType>(getWidenedType(ScalarTy, VecSz));
+ auto *ActualVecTy = getWidenedType(ScalarTy, VecSz);
Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, ActualVecTy, {},
CostKind, OffsetBeg - Offset, InsertVecTy);
} else {
@@ -17167,10 +17028,10 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
// (e.g. condition is <N x i1> while result is <N x i32>). For
// compares, the result type IS the mask (i1/vNi1). Construct the
// right type so getCmpSelInstrCost sees the actual mask/result width.
- auto *MaskTy = cast<VectorType>(getWidenedType(
- ShuffleOrOp == Instruction::Select ? VL0->getOperand(0)->getType()
- : VL0->getType(),
- VL.size()));
+ auto *MaskTy = getWidenedType(ShuffleOrOp == Instruction::Select
+ ? VL0->getOperand(0)->getType()
+ : VL0->getType(),
+ VL.size());
InstructionCost VecCost = InstructionCost::getInvalid();
if (ShuffleOrOp == Instruction::Select) {
@@ -17622,7 +17483,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
auto *CI = cast<CallInst>(VL0);
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
- CI, ID, getNumElements(VecTy),
+ CI, ID, VecTy->getNumElements(),
It != MinBWs.end() ? It->second.first : 0, TTI);
auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
@@ -17735,7 +17596,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
},
Mask);
VecCost += ::getShuffleCost(TTIRef, TargetTransformInfo::SK_PermuteTwoSrc,
- cast<VectorType>(FinalVecTy), Mask, CostKind);
+ FinalVecTy, Mask, CostKind);
// Patterns like [fadd,fsub] can be combined into a single instruction
// in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
// need to take into account their order when looking for the most used
@@ -17746,10 +17607,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
getAltInstrMask(E->Scalars, ScalarTy, Opcode0, Opcode1));
// If this pattern is supported by the target then we consider the
// order.
- if (TTIRef.isLegalAltInstr(cast<VectorType>(VecTy), Opcode0, Opcode1,
- OpcodeMask)) {
+ if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
InstructionCost AltVecCost = TTIRef.getAltInstrCost(
- cast<VectorType>(VecTy), Opcode0, Opcode1, OpcodeMask, CostKind);
+ VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
return AltVecCost < VecCost ? AltVecCost : VecCost;
}
// TODO: Check the reverse order too.
@@ -17785,8 +17645,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
return true;
}))
return ::getShuffleCost(
- *TTI, TargetTransformInfo::SK_PermuteSingleSrc,
- cast<VectorType>(VecTy),
+ *TTI, TargetTransformInfo::SK_PermuteSingleSrc, VecTy,
calculateShufflevectorMask(E->Scalars));
}
return TTI::TCC_Free;
@@ -17865,18 +17724,6 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
return true;
}
- // FIXME: support buildvector of the gather nodes with struct types.
- if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
- return TE->isGather() && TE->hasState() &&
- TE->getOpcode() == Instruction::Call &&
- isa<StructType>(TE->getMainOp()->getType());
- })) {
- LLVM_DEBUG(
- dbgs() << "SLP: rejecting tree with buildvector struct values of size "
- << VectorizableTree.size() << ".\n");
- return true;
- }
-
// Cache values from the root node and the cost-threshold options to avoid
// re-querying them inside hot predicates below.
const unsigned TreeSize = VectorizableTree.size();
@@ -18183,8 +18030,7 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
if (BackVF > 2 && allSameBlock(Back.Scalars) &&
!Back.Scalars.front()->getType()->isVectorTy() &&
TTI->getScalarizationOverhead(
- cast<VectorType>(
- getWidenedType(Back.Scalars.front()->getType(), BackVF)),
+ getWidenedType(Back.Scalars.front()->getType(), BackVF),
APInt::getAllOnes(BackVF),
/*Insert=*/true, /*Extract=*/false,
TTI::TCK_RecipThroughput) > -SLPCostThreshold)
@@ -18906,8 +18752,7 @@ InstructionCost BoUpSLP::calculateTreeCostAndTrimNonProfitable(
GatheredLoadsNodes.insert(&TE);
if (!TE.isGather() && TE.State != TreeEntry::SplitVectorize &&
!(TE.Idx == 0 && (TE.getOpcode() == Instruction::InsertElement ||
- TE.getOpcode() == Instruction::Store)) &&
- !isa<StructType>(getValueType(TE.Scalars.front()))) {
+ TE.getOpcode() == Instruction::Store))) {
// Calculate costs of external uses.
APInt DemandedElts = APInt::getZero(TE.getVectorFactor());
for (Value *V : TE.Scalars) {
@@ -18920,10 +18765,9 @@ InstructionCost BoUpSLP::calculateTreeCostAndTrimNonProfitable(
if (It != MinBWs.end())
ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
auto *VecTy = getWidenedType(ScalarTy, TE.getVectorFactor());
- InstructionCost ExtCost =
- ::getScalarizationOverhead(*TTI, ScalarTy, cast<VectorType>(VecTy),
- DemandedElts, /*Insert=*/false,
- /*Extract=*/true, CostKind);
+ InstructionCost ExtCost = ::getScalarizationOverhead(
+ *TTI, ScalarTy, VecTy, DemandedElts, /*Insert=*/false,
+ /*Extract=*/true, CostKind);
if (ExtCost.isValid() && ExtCost != 0) {
if (!Scale)
Scale = getScaleToLoopIterations(TE);
@@ -19021,7 +18865,6 @@ InstructionCost BoUpSLP::calculateTreeCostAndTrimNonProfitable(
while (!Worklist.empty() && std::get<0>(Worklist.top().second) > 0) {
TreeEntry *TE = Worklist.top().first;
if (TE->isGather() || TE->Idx == 0 || DeletedNodes.contains(TE) ||
- isa<StructType>(getValueType(TE->Scalars.front())) ||
// Exit early if the parent node is split node and any of scalars is
// used in other split nodes.
(TE->UserTreeIndex &&
@@ -19083,7 +18926,7 @@ InstructionCost BoUpSLP::calculateTreeCostAndTrimNonProfitable(
ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
auto *VecTy = getWidenedType(ScalarTy, EntryVF);
InstructionCost GatherCost = ::getScalarizationOverhead(
- *TTI, ScalarTy, cast<VectorType>(VecTy), DemandedElts,
+ *TTI, ScalarTy, VecTy, DemandedElts,
/*Insert=*/true, /*Extract=*/false, CostKind);
SmallVector<int> Mask;
if (!TE->ReorderIndices.empty() &&
@@ -19103,8 +18946,8 @@ InstructionCost BoUpSLP::calculateTreeCostAndTrimNonProfitable(
if (!TE->ReuseShuffleIndices.empty())
::addMask(Mask, TE->ReuseShuffleIndices);
if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, EntryVF))
- GatherCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
- cast<VectorType>(VecTy), Mask);
+ GatherCost +=
+ ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, VecTy, Mask);
// If all scalars are reused in gather node(s) or other vector nodes, there
// might be extra cost for inserting them.
if ((!TE->hasState() || !TE->isAltShuffle()) &&
@@ -19210,7 +19053,7 @@ InstructionCost BoUpSLP::calculateTreeCostAndTrimNonProfitable(
ScalarTy = IntegerType::get(ScalarTy->getContext(), It->second.first);
auto *VecTy = getWidenedType(ScalarTy, TE->getVectorFactor());
InstructionCost ExtractsCost = ::getScalarizationOverhead(
- *TTI, ScalarTy, cast<VectorType>(VecTy), DemandedElts,
+ *TTI, ScalarTy, VecTy, DemandedElts,
/*Insert=*/false, /*Extract=*/true, CostKind);
InstructionCost BVCost = 0;
for (const auto &[BVE, Values] : ValuesToInsert) {
@@ -19224,7 +19067,7 @@ InstructionCost BoUpSLP::calculateTreeCostAndTrimNonProfitable(
}
auto *BVVecTy = getWidenedType(ScalarTy, BVE->getVectorFactor());
BVCost += ::getScalarizationOverhead(
- *TTI, ScalarTy, cast<VectorType>(BVVecTy), BVDemandedElts,
+ *TTI, ScalarTy, BVVecTy, BVDemandedElts,
/*Insert=*/true, /*Extract=*/false, CostKind,
BVDemandedElts.isAllOnes(), BVValues);
}
@@ -19554,20 +19397,14 @@ InstructionCost BoUpSLP::getTreeCost(InstructionCost TreeCost,
? Instruction::ZExt
: Instruction::SExt;
VecTy = getWidenedType(MinTy, BundleWidth);
- ExtraCost = getExtractWithExtendCost(*TTI, Extend, ScalarTy,
- cast<VectorType>(VecTy), EU.Lane);
+ ExtraCost =
+ getExtractWithExtendCost(*TTI, Extend, ScalarTy, VecTy, EU.Lane);
LLVM_DEBUG(dbgs() << " ExtractExtend or ExtractSubvec cost: "
<< ExtraCost << "\n");
} else {
- Type *ExtractTy = VecTy;
- if (auto *ST = dyn_cast<StructType>(VecTy)) {
- assert(EU.User && "Expected user for struct extract");
- const auto *EV = cast<ExtractValueInst>(EU.User);
- ExtractTy = ExtractValueInst::getIndexedType(ST, EV->getIndices());
- }
- ExtraCost = getVectorInstrCost(
- *TTI, ScalarTy, Instruction::ExtractElement, ExtractTy, CostKind,
- EU.Lane, EU.Scalar, ScalarUserAndIdx);
+ ExtraCost =
+ getVectorInstrCost(*TTI, ScalarTy, Instruction::ExtractElement, VecTy,
+ CostKind, EU.Lane, EU.Scalar, ScalarUserAndIdx);
LLVM_DEBUG(dbgs() << " ExtractElement cost for " << *ScalarTy << " from "
<< *VecTy << ": " << ExtraCost << "\n");
}
@@ -19671,11 +19508,6 @@ InstructionCost BoUpSLP::getTreeCost(InstructionCost TreeCost,
if (KeepScalar) {
ExternalUsesAsOriginalScalar.insert(EU.Scalar);
for (Value *V : Inst->operands()) {
- // Struct operands cannot be rebuilt by the !User extraction
- // path (it has no insertvalue chain), so leave their existing
- // ExtractValueInst user in place.
- if (isa<StructType>(V->getType()))
- continue;
auto It = ValueToExtUses->find(V);
if (It != ValueToExtUses->end()) {
// Replace all uses to avoid compiler crash.
@@ -19691,8 +19523,6 @@ InstructionCost BoUpSLP::getTreeCost(InstructionCost TreeCost,
// compiler crash.
if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
for (Value *V : IOp->operands()) {
- if (isa<StructType>(V->getType()))
- continue;
auto It = ValueToExtUses->find(V);
if (It != ValueToExtUses->end()) {
// Replace all uses to avoid compiler crash.
@@ -19815,10 +19645,9 @@ InstructionCost BoUpSLP::getTreeCost(InstructionCost TreeCost,
SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
OrigMask.begin());
- C = ::getShuffleCost(
- *TTI, TTI::SK_PermuteSingleSrc,
- cast<VectorType>(getWidenedType(TE->getMainOp()->getType(), VecVF)),
- OrigMask);
+ C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
+ getWidenedType(TE->getMainOp()->getType(), VecVF),
+ OrigMask);
LLVM_DEBUG(
dbgs() << "SLP: Adding cost " << C
<< " for final shuffle of insertelement external users.\n";
@@ -19834,10 +19663,9 @@ InstructionCost BoUpSLP::getTreeCost(InstructionCost TreeCost,
ResizeMask[Mask[I]] = Mask[I];
}
if (!ShuffleVectorInst::isIdentityMask(ResizeMask, VF))
- C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
- cast<VectorType>(getWidenedType(
- TE->getMainOp()->getType(), VecVF)),
- ResizeMask);
+ C = ::getShuffleCost(
+ *TTI, TTI::SK_PermuteSingleSrc,
+ getWidenedType(TE->getMainOp()->getType(), VecVF), ResizeMask);
LLVM_DEBUG(
dbgs() << "SLP: Adding cost " << C
<< " for final shuffle of insertelement external users.\n";
@@ -19867,8 +19695,8 @@ InstructionCost BoUpSLP::getTreeCost(InstructionCost TreeCost,
(Data.index() < VF &&
static_cast<int>(Data.index()) == Data.value());
})) {
- InstructionCost C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
- cast<VectorType>(FTy), Mask);
+ InstructionCost C =
+ ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FTy, Mask);
C = ScaleCost(C, *TEs.front());
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
<< " for final shuffle of insertelement "
@@ -19886,8 +19714,8 @@ InstructionCost BoUpSLP::getTreeCost(InstructionCost TreeCost,
VF = Mask.size();
}
auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
- InstructionCost C = ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
- cast<VectorType>(FTy), Mask);
+ InstructionCost C =
+ ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask);
C = ScaleCost(C, *TEs.back());
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
<< " for final shuffle of vector node and external "
@@ -20779,10 +20607,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
}
constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
- auto *VecTy =
- cast<VectorType>(getWidenedType(VL.front()->getType(), NewVF));
- auto *MaskVecTy =
- cast<VectorType>(getWidenedType(VL.front()->getType(), SubMask.size()));
+ auto *VecTy = getWidenedType(VL.front()->getType(), NewVF);
+ auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
auto GetShuffleCost = [&,
&TTI = *TTI](ArrayRef<int> Mask,
ArrayRef<const TreeEntry *> Entries,
@@ -20990,16 +20816,16 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
any_of(VL, [](Value *V) { return !isa<UndefValue>(V) && isConstant(V); });
// 1. Shuffle input source vector and constant vector.
if (!ForPoisonSrc && IsAnyNonUndefConst) {
- Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteTwoSrc,
- cast<VectorType>(VecTy), ConstantShuffleMask);
+ Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteTwoSrc, VecTy,
+ ConstantShuffleMask);
}
// 2. Insert unique non-constants.
if (!DemandedElements.isZero())
- Cost += getScalarizationOverhead(
- *TTI, ScalarTy, cast<VectorType>(VecTy), DemandedElements,
- /*Insert=*/true,
- /*Extract=*/false, CostKind, ForPoisonSrc && !IsAnyNonUndefConst, VL);
+ Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElements,
+ /*Insert=*/true,
+ /*Extract=*/false, CostKind,
+ ForPoisonSrc && !IsAnyNonUndefConst, VL);
return Cost;
}
@@ -22522,9 +22348,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
for (auto [Idx, I] : enumerate(BVMask))
if (I != PoisonMaskElem)
NewMask[Idx] = Mask.size();
- SplatCost +=
- ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc,
- cast<VectorType>(VecTy), NewMask, CostKind);
+ SplatCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy,
+ NewMask, CostKind);
InstructionCost BVCost = TTI->getVectorInstrCost(
Instruction::InsertElement, VecTy, CostKind,
*find_if(Mask, not_equal_to(PoisonMaskElem)), Vec, V);
@@ -22536,8 +22361,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
if (I != PoisonMaskElem)
NewMask[Idx] = I;
BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
- cast<VectorType>(VecTy), NewMask,
- CostKind);
+ VecTy, NewMask, CostKind);
}
return SplatCost <= BVCost;
};
@@ -22734,14 +22558,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
bool IsReverseOrder =
!E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
- if (isa<StructType>(ScalarTy)) {
- // TODO: Reordering of struct types is not supported.
- assert(E->ReorderIndices.empty() &&
- "Expected no reordering for struct types.");
- assert(E->ReuseShuffleIndices.empty() &&
- "Expected no reuse shuffle indices for struct types.");
- return V;
- }
ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
if (E->getOpcode() == Instruction::Store &&
E->State == TreeEntry::Vectorize) {
@@ -22873,19 +22689,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
return V;
}
case Instruction::ExtractValue: {
- SmallVector<unsigned> Indices;
- SmallVector<Value *> Calls;
- if (checkEVsForVecCalls(E->Scalars, E->getOperations(), *TLI, Indices,
- Calls)) {
- setInsertPointAfterBundle(E);
- Value *V = vectorizeOperand(E, 0);
- V = Builder.CreateExtractValue(V, Indices);
- if (auto *I = dyn_cast<Instruction>(V))
- V = ::propagateMetadata(I, E->Scalars);
- V = FinalShuffle(V, E);
- E->VectorizedValue = V;
- return V;
- }
auto *LI = cast<LoadInst>(E->getSingleOperand(0));
Builder.SetInsertPoint(LI);
Value *Ptr = LI->getPointerOperand();
@@ -23419,7 +23222,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
return Builder.getInt64(I % ScalarTyNumElements);
});
VecPtr = Builder.CreateGEP(
- toScalarizedTy(VecTy),
+ VecTy->getElementType(),
Builder.CreateShuffleVector(
VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
ConstantVector::get(Indices));
@@ -23478,8 +23281,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Intrinsic::experimental_vp_strided_store,
{VecTy, Ptr->getType(), StrideTy},
{VecValue, Ptr, StrideVal,
- Builder.getAllOnesMask(
- ElementCount::getFixed(getNumElements(VecTy))),
+ Builder.getAllOnesMask(VecTy->getElementCount()),
Builder.getInt32(E->Scalars.size())});
Inst->addParamAttr(
/*ArgNo=*/1,
@@ -23529,7 +23331,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
- CI, ID, getNumElements(VecTy),
+ CI, ID, VecTy->getNumElements(),
It != MinBWs.end() ? It->second.first : 0, TTI);
auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
@@ -23539,13 +23341,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
SmallVector<Value *> OpVecs;
SmallVector<Type *, 2> TysForDecl;
// Add return type if intrinsic is overloaded on it.
- if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI)) {
- ArrayRef<Type *> ContainedTys = getContainedTypes(VecTy);
- for (auto [Idx, Ty] : enumerate(ContainedTys)) {
- if (isVectorIntrinsicWithStructReturnOverloadAtField(ID, Idx, TTI))
- TysForDecl.push_back(Ty);
- }
- }
+ if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI))
+ TysForDecl.push_back(VecTy);
auto *CEI = cast<CallInst>(VL0);
for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
// Some intrinsics have scalar arguments. This argument should not be
@@ -23569,7 +23366,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
ScalarArg->getType()->getScalarType() &&
It == MinBWs.end()) {
auto *CastTy =
- getWidenedType(ScalarArg->getType(), getNumElements(VecTy));
+ getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
} else if (It != MinBWs.end()) {
OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
@@ -23584,7 +23381,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if (!UseIntrinsic) {
VFShape Shape =
VFShape::get(CI->getFunctionType(),
- ElementCount::getFixed(getNumElements(VecTy)),
+ ElementCount::getFixed(VecTy->getNumElements()),
false /*HasGlobalPred*/);
CF = VFDatabase(*CI).getVectorizedFunction(Shape);
} else {
@@ -24051,21 +23848,10 @@ Value *BoUpSLP::vectorizeTree(
SmallVector<ShuffledInsertData<Value *>> ShuffledInserts;
// Maps vector instruction to original insertelement instruction
DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
- // Maps extract Scalar (plus the struct-field index path, when extracting
- // from a struct of vectors) to the corresponding extractelement instruction
- // in the basic block. Only one extractelement per block should be emitted.
- // The index path is stored in an owning SmallVector so the key remains
- // valid after the per-lane ExtractValueInst (whose Indices buffer it was
- // copied from) is erased later in this loop.
- SmallDenseMap<std::pair<Value *, SmallVector<unsigned, 1>>,
- DenseMap<BasicBlock *, std::pair<Value *, Value *>>>
+ // Maps extract Scalar to the corresponding extractelement instruction in the
+ // basic block. Only one extractelement per block should be emitted.
+ DenseMap<Value *, DenseMap<BasicBlock *, std::pair<Value *, Value *>>>
ScalarToEEs;
- // Maps (struct-of-vectors Vec, field-index path) to the corresponding
- // per-block extractvalue, so different external lanes that need the same
- // struct field of the same vectorized call share a single extractvalue.
- SmallDenseMap<std::pair<Value *, SmallVector<unsigned, 1>>,
- DenseMap<BasicBlock *, Value *>>
- StructFieldExtracts;
SmallDenseSet<Value *, 4> UsedInserts;
DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
@@ -24097,18 +23883,7 @@ Value *BoUpSLP::vectorizeTree(
Value *ExV = nullptr;
auto *Inst = dyn_cast<Instruction>(Scalar);
bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
- // For struct-typed scalars, the User must be an ExtractValueInst that
- // describes which struct field is being extracted. Copy its indices
- // into an owning SmallVector so the cache key survives erasure of the
- // ExtractValueInst.
- SmallVector<unsigned, 1> Indices;
- if (isa<StructType>(Scalar->getType())) {
- assert(User && "User expected for StructType extract.");
- const auto *EV = cast<ExtractValueInst>(User);
- Indices.assign(EV->getIndices());
- }
- auto Key = std::make_pair(Scalar, Indices);
- auto It = ScalarToEEs.find(Key);
+ auto It = ScalarToEEs.find(Scalar);
if (It != ScalarToEEs.end()) {
// No need to emit many extracts, just move the only one in the
// current block.
@@ -24166,53 +23941,19 @@ Value *BoUpSLP::vectorizeTree(
Ex = createExtractVector(Builder, Vec, VecTyNumElements,
ExternalUse.Lane * VecTyNumElements);
} else {
- if (isa<StructType>(Vec->getType())) {
- assert(isa<StructType>(Scalar->getType()) &&
- "Vec is struct of vectors only when Scalar is struct.");
- auto FieldKey = std::make_pair(Vec, Indices);
- BasicBlock *EVBB = Builder.GetInsertBlock();
- Value *FieldVec = nullptr;
- auto FieldIt = StructFieldExtracts.find(FieldKey);
- if (FieldIt != StructFieldExtracts.end()) {
- auto BBIt = FieldIt->second.find(EVBB);
- if (BBIt != FieldIt->second.end())
- FieldVec = BBIt->second;
- }
- if (!FieldVec) {
- FieldVec = Builder.CreateExtractValue(Vec, Indices);
- StructFieldExtracts[FieldKey][EVBB] = FieldVec;
- } else if (auto *FieldI = dyn_cast<Instruction>(FieldVec);
- FieldI && Builder.GetInsertPoint() != EVBB->end() &&
- Builder.GetInsertPoint()->comesBefore(FieldI)) {
- // Cached extractvalue is below the current insertion point;
- // move it up so the extractelement we are about to emit can
- // use it.
- FieldI->moveBefore(*EVBB, Builder.GetInsertPoint());
- }
- Vec = FieldVec;
- }
- if (SLPReVec && isVectorizedTy(Scalar->getType())) {
- unsigned VecTyNumElements = getNumElements(Scalar->getType());
- // When REVEC is enabled, we need to extract a vector.
- // Note: The element size of Scalar may be different from the
- // element size of Vec.
- Ex = createExtractVector(Builder, Vec, VecTyNumElements,
- ExternalUse.Lane * VecTyNumElements);
- } else {
- Ex = Builder.CreateExtractElement(Vec, Lane);
- }
+ Ex = Builder.CreateExtractElement(Vec, Lane);
}
// If necessary, sign-extend or zero-extend ScalarRoot
// to the larger type.
ExV = Ex;
- if (!isa<StructType>(Scalar->getType()) &&
- Scalar->getType() != Ex->getType())
+ if (Scalar->getType() != Ex->getType())
ExV = Builder.CreateIntCast(
Ex, Scalar->getType(),
!isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
auto *I = dyn_cast<Instruction>(Ex);
- ScalarToEEs[Key].try_emplace(I ? I->getParent() : &F->getEntryBlock(),
- std::make_pair(Ex, ExV));
+ ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
+ : &F->getEntryBlock(),
+ std::make_pair(Ex, ExV));
}
// The then branch of the previous if may produce constants, since 0
// operand might be a constant.
@@ -24379,13 +24120,7 @@ Value *BoUpSLP::vectorizeTree(
} else {
Builder.SetInsertPoint(cast<Instruction>(User));
Value *NewInst = ExtractAndExtendIfNeeded(Vec);
- if (isa<StructType>(Scalar->getType()) &&
- isa_and_nonnull<ExtractValueInst>(User)) {
- User->replaceAllUsesWith(NewInst);
- eraseInstruction(cast<Instruction>(User));
- } else {
- User->replaceUsesOfWith(Scalar, NewInst);
- }
+ User->replaceUsesOfWith(Scalar, NewInst);
}
} else {
Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
@@ -27951,7 +27686,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
// provided vectorization factor (i.e. the scalar type is used for vector
// code during codegen).
auto *VecTy = getWidenedType(ScalarTy, VF);
- if (getNumberOfParts(*TTI, VecTy, ScalarTy) == VF)
+ if (TTI->getNumberOfParts(VecTy) == VF)
continue;
for (unsigned I = NextInst; I < MaxInst; ++I) {
unsigned ActualVF = std::min(MaxInst - I, VF);
@@ -29030,15 +28765,14 @@ class HorizontalReduction {
Type *ScalarTy = Candidates.front()->getType();
ReduxWidth =
getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth);
- VectorType *Tp = cast<VectorType>(getWidenedType(ScalarTy, ReduxWidth));
+ VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
NumParts = ::getNumberOfParts(TTI, Tp, ScalarTy);
NumRegs =
TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
while (NumParts > NumRegs) {
assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
ReduxWidth = bit_floor(ReduxWidth - 1);
- VectorType *Tp =
- cast<VectorType>(getWidenedType(ScalarTy, ReduxWidth));
+ VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
NumParts = ::getNumberOfParts(TTI, Tp, ScalarTy);
NumRegs =
TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
@@ -29646,16 +29380,14 @@ class HorizontalReduction {
} else {
VectorCost = TTI->getExtendedReductionCost(
RdxOpcode, !IsSigned, RedTy,
- cast<VectorType>(getWidenedType(RType, ReduxWidth)), FMF,
- CostKind);
+ getWidenedType(RType, ReduxWidth), FMF, CostKind);
}
}
} else {
Type *RedTy = VectorTy->getElementType();
auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
std::make_pair(RedTy, true));
- VectorType *RVecTy =
- cast<VectorType>(getWidenedType(RType, ReduxWidth));
+ VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
InstructionCost FMACost = InstructionCost::getInvalid();
if (RdxKind == RecurKind::FAdd) {
// Check if the reduction operands can be converted to FMA.
@@ -29727,8 +29459,7 @@ class HorizontalReduction {
Type *RedTy = VectorTy->getElementType();
auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
std::make_pair(RedTy, true));
- VectorType *RVecTy =
- cast<VectorType>(getWidenedType(RType, ReduxWidth));
+ VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
IntrinsicCostAttributes ICA(Id, RVecTy, {RVecTy, RVecTy}, FMF);
VectorCost += TTI->getIntrinsicInstrCost(ICA, CostKind);
if (RType != RedTy) {
@@ -30498,7 +30229,7 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
if (!HorRdx.matchReductionForOperands())
return false;
// Check the cost of operations.
- auto *VecTy = cast<VectorType>(getWidenedType(Ty, Ops.size()));
+ VectorType *VecTy = getWidenedType(Ty, Ops.size());
constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
InstructionCost ScalarCost =
TTI.getScalarizationOverhead(
@@ -30626,7 +30357,7 @@ static bool tryToVectorizeSequence(
// Look for the next elements with the same type, parent and operand
// kinds.
auto *I = dyn_cast<Instruction>(*IncIt);
- if (!I || R.isDeleted(I) || !isValidElementType(getValueType(I))) {
+ if (!I || R.isDeleted(I)) {
++IncIt;
continue;
}
@@ -30898,17 +30629,6 @@ static bool isNonVectorizableInst(const Instruction *I,
}
if (isa<AtomicRMWInst, AtomicCmpXchgInst>(I))
return true;
- if (const auto *EV = dyn_cast<ExtractValueInst>(I)) {
- const auto *Arg = EV->getAggregateOperand();
- if (const auto *CI = dyn_cast<CallInst>(Arg)) {
- Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
- if (isTriviallyVectorizable(ID))
- return true;
- if (!VFDatabase::getMappings(*CI).empty())
- return true;
- return false;
- }
- }
if (const auto *RI = dyn_cast<ReturnInst>(I))
return RI->getNumOperands() > 0 &&
(SLPReVec || !I->getOperand(0)->getType()->isVectorTy()) &&
@@ -30937,10 +30657,6 @@ static void forEachOperandChainCandidate(Instruction *I, Func F,
F(AI->getNewValOperand(), 1);
return;
}
- if (auto *EV = dyn_cast<ExtractValueInst>(I)) {
- F(EV->getAggregateOperand(), 0);
- return;
- }
if (ForReduction && !NonVectReductions)
return;
if (auto *SI = dyn_cast<StoreInst>(I)) {
@@ -31097,7 +30813,7 @@ bool SLPVectorizerPass::vectorizeNonVectorizableInsts(
auto *OpI = dyn_cast<Instruction>(Op);
if (!OpI || OpI->getParent() != BB || R.isDeleted(OpI) ||
isa<ShuffleVectorInst>(OpI) ||
- (!isValidElementType(OpI->getType()) && !isa<IntrinsicInst>(OpI)))
+ !isValidElementType(OpI->getType()))
return;
if (!Seen.insert(OpI).second)
return;
@@ -31564,8 +31280,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
if (auto *ValTy = dyn_cast<VectorType>(
PostProcessStores.front()->getValueOperand()->getType()))
ScalarTy = ::getWidenedType(ScalarTy, getNumElements(ValTy));
- auto *VecTy = cast<VectorType>(
- ::getWidenedType(ScalarTy, PostProcessStores.size()));
+ auto *VecTy = ::getWidenedType(ScalarTy, PostProcessStores.size());
InstructionCost ExtractsCost = ::getScalarizationOverhead(
*TTI, ScalarTy, VecTy, APInt::getAllOnes(PostProcessStores.size()),
/*Insert=*/false, /*Extract=*/true, TTI::TCK_RecipThroughput,
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-nonvect-struct-returned.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-nonvect-struct-returned.ll
deleted file mode 100644
index 35b8fa9ca0955..0000000000000
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-nonvect-struct-returned.ll
+++ /dev/null
@@ -1,22 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
-; RUN: opt -S --passes=slp-vectorizer -mtriple=riscv64-unknown-unknown-elf -mattr=+v < %s | FileCheck %s
-
-define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test() {
-; CHECK-LABEL: define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test(
-; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[BB:.*:]]
-; CHECK-NEXT: [[CALL:%.*]] = tail call <vscale x 4 x i64> @llvm.riscv.vand.nxv4i64.i64.i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i64 0, i64 0)
-; CHECK-NEXT: [[CALL1:%.*]] = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv32i8_2t.nxv4i64(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) poison, <vscale x 4 x i64> [[CALL]], i32 0)
-; CHECK-NEXT: [[CALL2:%.*]] = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv32i8_2t.nxv4i64(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) [[CALL1]], <vscale x 4 x i64> zeroinitializer, i32 0)
-; CHECK-NEXT: ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) [[CALL2]]
-;
-bb:
- %call = tail call <vscale x 4 x i64> @llvm.riscv.vand.nxv4i64.i64.i64(<vscale x 4 x i64> zeroinitializer, <vscale x 4 x i64> zeroinitializer, i64 0, i64 0)
- %call1 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv32i8_2t.nxv4i64(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) poison, <vscale x 4 x i64> %call, i32 0)
- %call2 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv32i8_2t.nxv4i64(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %call1, <vscale x 4 x i64> zeroinitializer, i32 0)
- ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %call2
-}
-
-declare <vscale x 4 x i64> @llvm.riscv.vand.nxv4i64.i64.i64(<vscale x 4 x i64>, <vscale x 4 x i64>, i64, i64)
-
-declare target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv32i8_2t.nxv4i64(target("riscv.vector.tuple", <vscale x 32 x i8>, 2), <vscale x 4 x i64>, i32 immarg)
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-saddo.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-saddo.ll
index 5d3dd1661fb8f..8d7dd9b9621c8 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-saddo.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-saddo.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,KNL
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512_256
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s
@a64 = common global [8 x i64] zeroinitializer, align 64
@b64 = common global [8 x i64] zeroinitializer, align 64
@@ -26,146 +26,48 @@ declare {i16, i1} @llvm.sadd.with.overflow.i16(i16, i16)
declare {i8 , i1} @llvm.sadd.with.overflow.i8 (i8 , i8 )
define void @add_v8i64() {
-; SSE-LABEL: @add_v8i64(
-; SSE-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8
-; SSE-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
-; SSE-NEXT: [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
-; SSE-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8
-; SSE-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
-; SSE-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8
-; SSE-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
-; SSE-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8
-; SSE-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8
-; SSE-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8
-; SSE-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
-; SSE-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8
-; SSE-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
-; SSE-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8
-; SSE-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
-; SSE-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8
-; SSE-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A0]], i64 [[B0]])
-; SSE-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A1]], i64 [[B1]])
-; SSE-NEXT: [[C2:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A2]], i64 [[B2]])
-; SSE-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A3]], i64 [[B3]])
-; SSE-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A4]], i64 [[B4]])
-; SSE-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A5]], i64 [[B5]])
-; SSE-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A6]], i64 [[B6]])
-; SSE-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A7]], i64 [[B7]])
-; SSE-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0
-; SSE-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0
-; SSE-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0
-; SSE-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0
-; SSE-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0
-; SSE-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0
-; SSE-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0
-; SSE-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0
-; SSE-NEXT: store i64 [[R0]], ptr @c64, align 8
-; SSE-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8
-; SSE-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
-; SSE-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8
-; SSE-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
-; SSE-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8
-; SSE-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
-; SSE-NEXT: store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8
-; SSE-NEXT: ret void
-;
-; SLM-LABEL: @add_v8i64(
-; SLM-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8
-; SLM-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
-; SLM-NEXT: [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
-; SLM-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8
-; SLM-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
-; SLM-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8
-; SLM-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
-; SLM-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8
-; SLM-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8
-; SLM-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8
-; SLM-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
-; SLM-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8
-; SLM-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
-; SLM-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8
-; SLM-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
-; SLM-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8
-; SLM-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A0]], i64 [[B0]])
-; SLM-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A1]], i64 [[B1]])
-; SLM-NEXT: [[C2:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A2]], i64 [[B2]])
-; SLM-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A3]], i64 [[B3]])
-; SLM-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A4]], i64 [[B4]])
-; SLM-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A5]], i64 [[B5]])
-; SLM-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A6]], i64 [[B6]])
-; SLM-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A7]], i64 [[B7]])
-; SLM-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0
-; SLM-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0
-; SLM-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0
-; SLM-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0
-; SLM-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0
-; SLM-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0
-; SLM-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0
-; SLM-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0
-; SLM-NEXT: store i64 [[R0]], ptr @c64, align 8
-; SLM-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8
-; SLM-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
-; SLM-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8
-; SLM-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
-; SLM-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8
-; SLM-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
-; SLM-NEXT: store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8
-; SLM-NEXT: ret void
-;
-; AVX-LABEL: @add_v8i64(
-; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8
-; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8
-; AVX-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]])
-; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0
-; AVX-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8
-; AVX-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
-; AVX-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
-; AVX-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]])
-; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0
-; AVX-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
-; AVX-NEXT: ret void
-;
-; AVX2-LABEL: @add_v8i64(
-; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8
-; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8
-; AVX2-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]])
-; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0
-; AVX2-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8
-; AVX2-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
-; AVX2-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
-; AVX2-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]])
-; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0
-; AVX2-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
-; AVX2-NEXT: ret void
-;
-; KNL-LABEL: @add_v8i64(
-; KNL-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8
-; KNL-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8
-; KNL-NEXT: [[TMP3:%.*]] = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]])
-; KNL-NEXT: [[TMP18:%.*]] = extractvalue { <8 x i64>, <8 x i1> } [[TMP3]], 0
-; KNL-NEXT: store <8 x i64> [[TMP18]], ptr @c64, align 8
-; KNL-NEXT: ret void
-;
-; AVX512-LABEL: @add_v8i64(
-; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8
-; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8
-; AVX512-NEXT: [[TMP3:%.*]] = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]])
-; AVX512-NEXT: [[TMP18:%.*]] = extractvalue { <8 x i64>, <8 x i1> } [[TMP3]], 0
-; AVX512-NEXT: store <8 x i64> [[TMP18]], ptr @c64, align 8
-; AVX512-NEXT: ret void
-;
-; AVX512_256-LABEL: @add_v8i64(
-; AVX512_256-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8
-; AVX512_256-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8
-; AVX512_256-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]])
-; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0
-; AVX512_256-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8
-; AVX512_256-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
-; AVX512_256-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
-; AVX512_256-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]])
-; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0
-; AVX512_256-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
-; AVX512_256-NEXT: ret void
+; CHECK-LABEL: @add_v8i64(
+; CHECK-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8
+; CHECK-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
+; CHECK-NEXT: [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
+; CHECK-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8
+; CHECK-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
+; CHECK-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8
+; CHECK-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
+; CHECK-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8
+; CHECK-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8
+; CHECK-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8
+; CHECK-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
+; CHECK-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8
+; CHECK-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
+; CHECK-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8
+; CHECK-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
+; CHECK-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8
+; CHECK-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A0]], i64 [[B0]])
+; CHECK-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A1]], i64 [[B1]])
+; CHECK-NEXT: [[C2:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A2]], i64 [[B2]])
+; CHECK-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A3]], i64 [[B3]])
+; CHECK-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A4]], i64 [[B4]])
+; CHECK-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A5]], i64 [[B5]])
+; CHECK-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A6]], i64 [[B6]])
+; CHECK-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A7]], i64 [[B7]])
+; CHECK-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0
+; CHECK-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0
+; CHECK-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0
+; CHECK-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0
+; CHECK-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0
+; CHECK-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0
+; CHECK-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0
+; CHECK-NEXT: store i64 [[R0]], ptr @c64, align 8
+; CHECK-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8
+; CHECK-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
+; CHECK-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8
+; CHECK-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
+; CHECK-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8
+; CHECK-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
+; CHECK-NEXT: store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8
+; CHECK-NEXT: ret void
;
%a0 = load i64, ptr @a64, align 8
%a1 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
@@ -211,106 +113,88 @@ define void @add_v8i64() {
}
define void @add_v16i32() {
-; SSE-LABEL: @add_v16i32(
-; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @a32, align 4
-; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @b32, align 4
-; SSE-NEXT: [[TMP3:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
-; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP3]], 0
-; SSE-NEXT: store <4 x i32> [[TMP4]], ptr @c32, align 4
-; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4
-; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4
-; SSE-NEXT: [[TMP7:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> [[TMP5]], <4 x i32> [[TMP6]])
-; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP7]], 0
-; SSE-NEXT: store <4 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4
-; SSE-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
-; SSE-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
-; SSE-NEXT: [[TMP11:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]])
-; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP11]], 0
-; SSE-NEXT: store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4
-; SSE-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4
-; SSE-NEXT: [[TMP14:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4
-; SSE-NEXT: [[TMP15:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> [[TMP13]], <4 x i32> [[TMP14]])
-; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP15]], 0
-; SSE-NEXT: store <4 x i32> [[TMP16]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4
-; SSE-NEXT: ret void
-;
-; SLM-LABEL: @add_v16i32(
-; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @a32, align 4
-; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @b32, align 4
-; SLM-NEXT: [[TMP3:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
-; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP3]], 0
-; SLM-NEXT: store <4 x i32> [[TMP4]], ptr @c32, align 4
-; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4
-; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4
-; SLM-NEXT: [[TMP7:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> [[TMP5]], <4 x i32> [[TMP6]])
-; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP7]], 0
-; SLM-NEXT: store <4 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4
-; SLM-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
-; SLM-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
-; SLM-NEXT: [[TMP11:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]])
-; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP11]], 0
-; SLM-NEXT: store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4
-; SLM-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4
-; SLM-NEXT: [[TMP14:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4
-; SLM-NEXT: [[TMP15:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> [[TMP13]], <4 x i32> [[TMP14]])
-; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP15]], 0
-; SLM-NEXT: store <4 x i32> [[TMP16]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4
-; SLM-NEXT: ret void
-;
-; AVX-LABEL: @add_v16i32(
-; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4
-; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4
-; AVX-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]])
-; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0
-; AVX-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4
-; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
-; AVX-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
-; AVX-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]])
-; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0
-; AVX-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4
-; AVX-NEXT: ret void
-;
-; AVX2-LABEL: @add_v16i32(
-; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4
-; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4
-; AVX2-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]])
-; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0
-; AVX2-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4
-; AVX2-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
-; AVX2-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
-; AVX2-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]])
-; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0
-; AVX2-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4
-; AVX2-NEXT: ret void
-;
-; KNL-LABEL: @add_v16i32(
-; KNL-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4
-; KNL-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4
-; KNL-NEXT: [[TMP3:%.*]] = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]])
-; KNL-NEXT: [[TMP34:%.*]] = extractvalue { <16 x i32>, <16 x i1> } [[TMP3]], 0
-; KNL-NEXT: store <16 x i32> [[TMP34]], ptr @c32, align 4
-; KNL-NEXT: ret void
-;
-; AVX512-LABEL: @add_v16i32(
-; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4
-; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4
-; AVX512-NEXT: [[TMP3:%.*]] = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]])
-; AVX512-NEXT: [[TMP34:%.*]] = extractvalue { <16 x i32>, <16 x i1> } [[TMP3]], 0
-; AVX512-NEXT: store <16 x i32> [[TMP34]], ptr @c32, align 4
-; AVX512-NEXT: ret void
-;
-; AVX512_256-LABEL: @add_v16i32(
-; AVX512_256-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4
-; AVX512_256-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4
-; AVX512_256-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]])
-; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0
-; AVX512_256-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4
-; AVX512_256-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
-; AVX512_256-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
-; AVX512_256-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]])
-; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0
-; AVX512_256-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4
-; AVX512_256-NEXT: ret void
+; CHECK-LABEL: @add_v16i32(
+; CHECK-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4
+; CHECK-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4
+; CHECK-NEXT: [[A2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 2), align 4
+; CHECK-NEXT: [[A3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 3), align 4
+; CHECK-NEXT: [[A4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4
+; CHECK-NEXT: [[A5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 5), align 4
+; CHECK-NEXT: [[A6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 6), align 4
+; CHECK-NEXT: [[A7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 7), align 4
+; CHECK-NEXT: [[A8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
+; CHECK-NEXT: [[A9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 9), align 4
+; CHECK-NEXT: [[A10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 10), align 4
+; CHECK-NEXT: [[A11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 11), align 4
+; CHECK-NEXT: [[A12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4
+; CHECK-NEXT: [[A13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 13), align 4
+; CHECK-NEXT: [[A14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 14), align 4
+; CHECK-NEXT: [[A15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 15), align 4
+; CHECK-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4
+; CHECK-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4
+; CHECK-NEXT: [[B2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 2), align 4
+; CHECK-NEXT: [[B3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 3), align 4
+; CHECK-NEXT: [[B4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4
+; CHECK-NEXT: [[B5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 5), align 4
+; CHECK-NEXT: [[B6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 6), align 4
+; CHECK-NEXT: [[B7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 7), align 4
+; CHECK-NEXT: [[B8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
+; CHECK-NEXT: [[B9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 9), align 4
+; CHECK-NEXT: [[B10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 10), align 4
+; CHECK-NEXT: [[B11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 11), align 4
+; CHECK-NEXT: [[B12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4
+; CHECK-NEXT: [[B13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 13), align 4
+; CHECK-NEXT: [[B14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 14), align 4
+; CHECK-NEXT: [[B15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 15), align 4
+; CHECK-NEXT: [[C0:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A0]], i32 [[B0]])
+; CHECK-NEXT: [[C1:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A1]], i32 [[B1]])
+; CHECK-NEXT: [[C2:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A2]], i32 [[B2]])
+; CHECK-NEXT: [[C3:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A3]], i32 [[B3]])
+; CHECK-NEXT: [[C4:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A4]], i32 [[B4]])
+; CHECK-NEXT: [[C5:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A5]], i32 [[B5]])
+; CHECK-NEXT: [[C6:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A6]], i32 [[B6]])
+; CHECK-NEXT: [[C7:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A7]], i32 [[B7]])
+; CHECK-NEXT: [[C8:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A8]], i32 [[B8]])
+; CHECK-NEXT: [[C9:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A9]], i32 [[B9]])
+; CHECK-NEXT: [[C10:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A10]], i32 [[B10]])
+; CHECK-NEXT: [[C11:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A11]], i32 [[B11]])
+; CHECK-NEXT: [[C12:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A12]], i32 [[B12]])
+; CHECK-NEXT: [[C13:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A13]], i32 [[B13]])
+; CHECK-NEXT: [[C14:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A14]], i32 [[B14]])
+; CHECK-NEXT: [[C15:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A15]], i32 [[B15]])
+; CHECK-NEXT: [[R0:%.*]] = extractvalue { i32, i1 } [[C0]], 0
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[C1]], 0
+; CHECK-NEXT: [[R2:%.*]] = extractvalue { i32, i1 } [[C2]], 0
+; CHECK-NEXT: [[R3:%.*]] = extractvalue { i32, i1 } [[C3]], 0
+; CHECK-NEXT: [[R4:%.*]] = extractvalue { i32, i1 } [[C4]], 0
+; CHECK-NEXT: [[R5:%.*]] = extractvalue { i32, i1 } [[C5]], 0
+; CHECK-NEXT: [[R6:%.*]] = extractvalue { i32, i1 } [[C6]], 0
+; CHECK-NEXT: [[R7:%.*]] = extractvalue { i32, i1 } [[C7]], 0
+; CHECK-NEXT: [[R8:%.*]] = extractvalue { i32, i1 } [[C8]], 0
+; CHECK-NEXT: [[R9:%.*]] = extractvalue { i32, i1 } [[C9]], 0
+; CHECK-NEXT: [[R10:%.*]] = extractvalue { i32, i1 } [[C10]], 0
+; CHECK-NEXT: [[R11:%.*]] = extractvalue { i32, i1 } [[C11]], 0
+; CHECK-NEXT: [[R12:%.*]] = extractvalue { i32, i1 } [[C12]], 0
+; CHECK-NEXT: [[R13:%.*]] = extractvalue { i32, i1 } [[C13]], 0
+; CHECK-NEXT: [[R14:%.*]] = extractvalue { i32, i1 } [[C14]], 0
+; CHECK-NEXT: [[R15:%.*]] = extractvalue { i32, i1 } [[C15]], 0
+; CHECK-NEXT: store i32 [[R0]], ptr @c32, align 4
+; CHECK-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 1), align 4
+; CHECK-NEXT: store i32 [[R2]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 2), align 4
+; CHECK-NEXT: store i32 [[R3]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 3), align 4
+; CHECK-NEXT: store i32 [[R4]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4
+; CHECK-NEXT: store i32 [[R5]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 5), align 4
+; CHECK-NEXT: store i32 [[R6]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 6), align 4
+; CHECK-NEXT: store i32 [[R7]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 7), align 4
+; CHECK-NEXT: store i32 [[R8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4
+; CHECK-NEXT: store i32 [[R9]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 9), align 4
+; CHECK-NEXT: store i32 [[R10]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 10), align 4
+; CHECK-NEXT: store i32 [[R11]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 11), align 4
+; CHECK-NEXT: store i32 [[R12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4
+; CHECK-NEXT: store i32 [[R13]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 13), align 4
+; CHECK-NEXT: store i32 [[R14]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 14), align 4
+; CHECK-NEXT: store i32 [[R15]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 15), align 4
+; CHECK-NEXT: ret void
;
%a0 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 0 ), align 4
%a1 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1 ), align 4
@@ -396,106 +280,168 @@ define void @add_v16i32() {
}
define void @add_v32i16() {
-; SSE-LABEL: @add_v32i16(
-; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2
-; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2
-; SSE-NEXT: [[TMP3:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
-; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP3]], 0
-; SSE-NEXT: store <8 x i16> [[TMP4]], ptr @c16, align 2
-; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2
-; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2
-; SSE-NEXT: [[TMP7:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> [[TMP5]], <8 x i16> [[TMP6]])
-; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP7]], 0
-; SSE-NEXT: store <8 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2
-; SSE-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
-; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
-; SSE-NEXT: [[TMP11:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]])
-; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP11]], 0
-; SSE-NEXT: store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2
-; SSE-NEXT: [[TMP13:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2
-; SSE-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2
-; SSE-NEXT: [[TMP15:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> [[TMP13]], <8 x i16> [[TMP14]])
-; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP15]], 0
-; SSE-NEXT: store <8 x i16> [[TMP16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2
-; SSE-NEXT: ret void
-;
-; SLM-LABEL: @add_v32i16(
-; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2
-; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2
-; SLM-NEXT: [[TMP3:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
-; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP3]], 0
-; SLM-NEXT: store <8 x i16> [[TMP4]], ptr @c16, align 2
-; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2
-; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2
-; SLM-NEXT: [[TMP7:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> [[TMP5]], <8 x i16> [[TMP6]])
-; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP7]], 0
-; SLM-NEXT: store <8 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2
-; SLM-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
-; SLM-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
-; SLM-NEXT: [[TMP11:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]])
-; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP11]], 0
-; SLM-NEXT: store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2
-; SLM-NEXT: [[TMP13:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2
-; SLM-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2
-; SLM-NEXT: [[TMP15:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> [[TMP13]], <8 x i16> [[TMP14]])
-; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP15]], 0
-; SLM-NEXT: store <8 x i16> [[TMP16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2
-; SLM-NEXT: ret void
-;
-; AVX-LABEL: @add_v32i16(
-; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2
-; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2
-; AVX-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]])
-; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0
-; AVX-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2
-; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
-; AVX-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
-; AVX-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]])
-; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0
-; AVX-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2
-; AVX-NEXT: ret void
-;
-; AVX2-LABEL: @add_v32i16(
-; AVX2-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2
-; AVX2-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2
-; AVX2-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]])
-; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0
-; AVX2-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2
-; AVX2-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
-; AVX2-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
-; AVX2-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]])
-; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0
-; AVX2-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2
-; AVX2-NEXT: ret void
-;
-; KNL-LABEL: @add_v32i16(
-; KNL-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2
-; KNL-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2
-; KNL-NEXT: [[TMP3:%.*]] = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]])
-; KNL-NEXT: [[TMP66:%.*]] = extractvalue { <32 x i16>, <32 x i1> } [[TMP3]], 0
-; KNL-NEXT: store <32 x i16> [[TMP66]], ptr @c16, align 2
-; KNL-NEXT: ret void
-;
-; AVX512-LABEL: @add_v32i16(
-; AVX512-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2
-; AVX512-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2
-; AVX512-NEXT: [[TMP3:%.*]] = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]])
-; AVX512-NEXT: [[TMP66:%.*]] = extractvalue { <32 x i16>, <32 x i1> } [[TMP3]], 0
-; AVX512-NEXT: store <32 x i16> [[TMP66]], ptr @c16, align 2
-; AVX512-NEXT: ret void
-;
-; AVX512_256-LABEL: @add_v32i16(
-; AVX512_256-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2
-; AVX512_256-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2
-; AVX512_256-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]])
-; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0
-; AVX512_256-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2
-; AVX512_256-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
-; AVX512_256-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
-; AVX512_256-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]])
-; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0
-; AVX512_256-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2
-; AVX512_256-NEXT: ret void
+; CHECK-LABEL: @add_v32i16(
+; CHECK-NEXT: [[A0:%.*]] = load i16, ptr @a16, align 2
+; CHECK-NEXT: [[A1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1), align 2
+; CHECK-NEXT: [[A2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 2), align 2
+; CHECK-NEXT: [[A3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 3), align 2
+; CHECK-NEXT: [[A4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 4), align 2
+; CHECK-NEXT: [[A5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 5), align 2
+; CHECK-NEXT: [[A6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 6), align 2
+; CHECK-NEXT: [[A7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 7), align 2
+; CHECK-NEXT: [[A8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2
+; CHECK-NEXT: [[A9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 9), align 2
+; CHECK-NEXT: [[A10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 10), align 2
+; CHECK-NEXT: [[A11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 11), align 2
+; CHECK-NEXT: [[A12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 12), align 2
+; CHECK-NEXT: [[A13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 13), align 2
+; CHECK-NEXT: [[A14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 14), align 2
+; CHECK-NEXT: [[A15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 15), align 2
+; CHECK-NEXT: [[A16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
+; CHECK-NEXT: [[A17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 17), align 2
+; CHECK-NEXT: [[A18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 18), align 2
+; CHECK-NEXT: [[A19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 19), align 2
+; CHECK-NEXT: [[A20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 20), align 2
+; CHECK-NEXT: [[A21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 21), align 2
+; CHECK-NEXT: [[A22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 22), align 2
+; CHECK-NEXT: [[A23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 23), align 2
+; CHECK-NEXT: [[A24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2
+; CHECK-NEXT: [[A25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 25), align 2
+; CHECK-NEXT: [[A26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 26), align 2
+; CHECK-NEXT: [[A27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 27), align 2
+; CHECK-NEXT: [[A28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 28), align 2
+; CHECK-NEXT: [[A29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 29), align 2
+; CHECK-NEXT: [[A30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 30), align 2
+; CHECK-NEXT: [[A31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 31), align 2
+; CHECK-NEXT: [[B0:%.*]] = load i16, ptr @b16, align 2
+; CHECK-NEXT: [[B1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 1), align 2
+; CHECK-NEXT: [[B2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 2), align 2
+; CHECK-NEXT: [[B3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 3), align 2
+; CHECK-NEXT: [[B4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 4), align 2
+; CHECK-NEXT: [[B5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 5), align 2
+; CHECK-NEXT: [[B6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 6), align 2
+; CHECK-NEXT: [[B7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 7), align 2
+; CHECK-NEXT: [[B8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2
+; CHECK-NEXT: [[B9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 9), align 2
+; CHECK-NEXT: [[B10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 10), align 2
+; CHECK-NEXT: [[B11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 11), align 2
+; CHECK-NEXT: [[B12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 12), align 2
+; CHECK-NEXT: [[B13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 13), align 2
+; CHECK-NEXT: [[B14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 14), align 2
+; CHECK-NEXT: [[B15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 15), align 2
+; CHECK-NEXT: [[B16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
+; CHECK-NEXT: [[B17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 17), align 2
+; CHECK-NEXT: [[B18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 18), align 2
+; CHECK-NEXT: [[B19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 19), align 2
+; CHECK-NEXT: [[B20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 20), align 2
+; CHECK-NEXT: [[B21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 21), align 2
+; CHECK-NEXT: [[B22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 22), align 2
+; CHECK-NEXT: [[B23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 23), align 2
+; CHECK-NEXT: [[B24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2
+; CHECK-NEXT: [[B25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 25), align 2
+; CHECK-NEXT: [[B26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 26), align 2
+; CHECK-NEXT: [[B27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 27), align 2
+; CHECK-NEXT: [[B28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 28), align 2
+; CHECK-NEXT: [[B29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 29), align 2
+; CHECK-NEXT: [[B30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 30), align 2
+; CHECK-NEXT: [[B31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 31), align 2
+; CHECK-NEXT: [[C0:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A0]], i16 [[B0]])
+; CHECK-NEXT: [[C1:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A1]], i16 [[B1]])
+; CHECK-NEXT: [[C2:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A2]], i16 [[B2]])
+; CHECK-NEXT: [[C3:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A3]], i16 [[B3]])
+; CHECK-NEXT: [[C4:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A4]], i16 [[B4]])
+; CHECK-NEXT: [[C5:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A5]], i16 [[B5]])
+; CHECK-NEXT: [[C6:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A6]], i16 [[B6]])
+; CHECK-NEXT: [[C7:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A7]], i16 [[B7]])
+; CHECK-NEXT: [[C8:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A8]], i16 [[B8]])
+; CHECK-NEXT: [[C9:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A9]], i16 [[B9]])
+; CHECK-NEXT: [[C10:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A10]], i16 [[B10]])
+; CHECK-NEXT: [[C11:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A11]], i16 [[B11]])
+; CHECK-NEXT: [[C12:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A12]], i16 [[B12]])
+; CHECK-NEXT: [[C13:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A13]], i16 [[B13]])
+; CHECK-NEXT: [[C14:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A14]], i16 [[B14]])
+; CHECK-NEXT: [[C15:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A15]], i16 [[B15]])
+; CHECK-NEXT: [[C16:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A16]], i16 [[B16]])
+; CHECK-NEXT: [[C17:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A17]], i16 [[B17]])
+; CHECK-NEXT: [[C18:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A18]], i16 [[B18]])
+; CHECK-NEXT: [[C19:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A19]], i16 [[B19]])
+; CHECK-NEXT: [[C20:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A20]], i16 [[B20]])
+; CHECK-NEXT: [[C21:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A21]], i16 [[B21]])
+; CHECK-NEXT: [[C22:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A22]], i16 [[B22]])
+; CHECK-NEXT: [[C23:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A23]], i16 [[B23]])
+; CHECK-NEXT: [[C24:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A24]], i16 [[B24]])
+; CHECK-NEXT: [[C25:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A25]], i16 [[B25]])
+; CHECK-NEXT: [[C26:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A26]], i16 [[B26]])
+; CHECK-NEXT: [[C27:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A27]], i16 [[B27]])
+; CHECK-NEXT: [[C28:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A28]], i16 [[B28]])
+; CHECK-NEXT: [[C29:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A29]], i16 [[B29]])
+; CHECK-NEXT: [[C30:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A30]], i16 [[B30]])
+; CHECK-NEXT: [[C31:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A31]], i16 [[B31]])
+; CHECK-NEXT: [[R0:%.*]] = extractvalue { i16, i1 } [[C0]], 0
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i16, i1 } [[C1]], 0
+; CHECK-NEXT: [[R2:%.*]] = extractvalue { i16, i1 } [[C2]], 0
+; CHECK-NEXT: [[R3:%.*]] = extractvalue { i16, i1 } [[C3]], 0
+; CHECK-NEXT: [[R4:%.*]] = extractvalue { i16, i1 } [[C4]], 0
+; CHECK-NEXT: [[R5:%.*]] = extractvalue { i16, i1 } [[C5]], 0
+; CHECK-NEXT: [[R6:%.*]] = extractvalue { i16, i1 } [[C6]], 0
+; CHECK-NEXT: [[R7:%.*]] = extractvalue { i16, i1 } [[C7]], 0
+; CHECK-NEXT: [[R8:%.*]] = extractvalue { i16, i1 } [[C8]], 0
+; CHECK-NEXT: [[R9:%.*]] = extractvalue { i16, i1 } [[C9]], 0
+; CHECK-NEXT: [[R10:%.*]] = extractvalue { i16, i1 } [[C10]], 0
+; CHECK-NEXT: [[R11:%.*]] = extractvalue { i16, i1 } [[C11]], 0
+; CHECK-NEXT: [[R12:%.*]] = extractvalue { i16, i1 } [[C12]], 0
+; CHECK-NEXT: [[R13:%.*]] = extractvalue { i16, i1 } [[C13]], 0
+; CHECK-NEXT: [[R14:%.*]] = extractvalue { i16, i1 } [[C14]], 0
+; CHECK-NEXT: [[R15:%.*]] = extractvalue { i16, i1 } [[C15]], 0
+; CHECK-NEXT: [[R16:%.*]] = extractvalue { i16, i1 } [[C16]], 0
+; CHECK-NEXT: [[R17:%.*]] = extractvalue { i16, i1 } [[C17]], 0
+; CHECK-NEXT: [[R18:%.*]] = extractvalue { i16, i1 } [[C18]], 0
+; CHECK-NEXT: [[R19:%.*]] = extractvalue { i16, i1 } [[C19]], 0
+; CHECK-NEXT: [[R20:%.*]] = extractvalue { i16, i1 } [[C20]], 0
+; CHECK-NEXT: [[R21:%.*]] = extractvalue { i16, i1 } [[C21]], 0
+; CHECK-NEXT: [[R22:%.*]] = extractvalue { i16, i1 } [[C22]], 0
+; CHECK-NEXT: [[R23:%.*]] = extractvalue { i16, i1 } [[C23]], 0
+; CHECK-NEXT: [[R24:%.*]] = extractvalue { i16, i1 } [[C24]], 0
+; CHECK-NEXT: [[R25:%.*]] = extractvalue { i16, i1 } [[C25]], 0
+; CHECK-NEXT: [[R26:%.*]] = extractvalue { i16, i1 } [[C26]], 0
+; CHECK-NEXT: [[R27:%.*]] = extractvalue { i16, i1 } [[C27]], 0
+; CHECK-NEXT: [[R28:%.*]] = extractvalue { i16, i1 } [[C28]], 0
+; CHECK-NEXT: [[R29:%.*]] = extractvalue { i16, i1 } [[C29]], 0
+; CHECK-NEXT: [[R30:%.*]] = extractvalue { i16, i1 } [[C30]], 0
+; CHECK-NEXT: [[R31:%.*]] = extractvalue { i16, i1 } [[C31]], 0
+; CHECK-NEXT: store i16 [[R0]], ptr @c16, align 2
+; CHECK-NEXT: store i16 [[R1]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 1), align 2
+; CHECK-NEXT: store i16 [[R2]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 2), align 2
+; CHECK-NEXT: store i16 [[R3]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 3), align 2
+; CHECK-NEXT: store i16 [[R4]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 4), align 2
+; CHECK-NEXT: store i16 [[R5]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 5), align 2
+; CHECK-NEXT: store i16 [[R6]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 6), align 2
+; CHECK-NEXT: store i16 [[R7]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 7), align 2
+; CHECK-NEXT: store i16 [[R8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2
+; CHECK-NEXT: store i16 [[R9]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 9), align 2
+; CHECK-NEXT: store i16 [[R10]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 10), align 2
+; CHECK-NEXT: store i16 [[R11]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 11), align 2
+; CHECK-NEXT: store i16 [[R12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 12), align 2
+; CHECK-NEXT: store i16 [[R13]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 13), align 2
+; CHECK-NEXT: store i16 [[R14]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 14), align 2
+; CHECK-NEXT: store i16 [[R15]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 15), align 2
+; CHECK-NEXT: store i16 [[R16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2
+; CHECK-NEXT: store i16 [[R17]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 17), align 2
+; CHECK-NEXT: store i16 [[R18]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 18), align 2
+; CHECK-NEXT: store i16 [[R19]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 19), align 2
+; CHECK-NEXT: store i16 [[R20]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 20), align 2
+; CHECK-NEXT: store i16 [[R21]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 21), align 2
+; CHECK-NEXT: store i16 [[R22]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 22), align 2
+; CHECK-NEXT: store i16 [[R23]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 23), align 2
+; CHECK-NEXT: store i16 [[R24]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2
+; CHECK-NEXT: store i16 [[R25]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 25), align 2
+; CHECK-NEXT: store i16 [[R26]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 26), align 2
+; CHECK-NEXT: store i16 [[R27]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 27), align 2
+; CHECK-NEXT: store i16 [[R28]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 28), align 2
+; CHECK-NEXT: store i16 [[R29]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 29), align 2
+; CHECK-NEXT: store i16 [[R30]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 30), align 2
+; CHECK-NEXT: store i16 [[R31]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 31), align 2
+; CHECK-NEXT: ret void
;
%a0 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 0 ), align 2
%a1 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1 ), align 2
@@ -661,106 +607,328 @@ define void @add_v32i16() {
}
define void @add_v64i8() {
-; SSE-LABEL: @add_v64i8(
-; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @a8, align 1
-; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1
-; SSE-NEXT: [[TMP3:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
-; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP3]], 0
-; SSE-NEXT: store <16 x i8> [[TMP4]], ptr @c8, align 1
-; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1
-; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1
-; SSE-NEXT: [[TMP7:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> [[TMP5]], <16 x i8> [[TMP6]])
-; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP7]], 0
-; SSE-NEXT: store <16 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1
-; SSE-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
-; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
-; SSE-NEXT: [[TMP11:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> [[TMP9]], <16 x i8> [[TMP10]])
-; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP11]], 0
-; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
-; SSE-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
-; SSE-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
-; SSE-NEXT: [[TMP15:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> [[TMP13]], <16 x i8> [[TMP14]])
-; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP15]], 0
-; SSE-NEXT: store <16 x i8> [[TMP16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
-; SSE-NEXT: ret void
-;
-; SLM-LABEL: @add_v64i8(
-; SLM-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @a8, align 1
-; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1
-; SLM-NEXT: [[TMP3:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
-; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP3]], 0
-; SLM-NEXT: store <16 x i8> [[TMP4]], ptr @c8, align 1
-; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1
-; SLM-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1
-; SLM-NEXT: [[TMP7:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> [[TMP5]], <16 x i8> [[TMP6]])
-; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP7]], 0
-; SLM-NEXT: store <16 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1
-; SLM-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
-; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
-; SLM-NEXT: [[TMP11:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> [[TMP9]], <16 x i8> [[TMP10]])
-; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP11]], 0
-; SLM-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
-; SLM-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
-; SLM-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
-; SLM-NEXT: [[TMP15:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> [[TMP13]], <16 x i8> [[TMP14]])
-; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP15]], 0
-; SLM-NEXT: store <16 x i8> [[TMP16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
-; SLM-NEXT: ret void
-;
-; AVX-LABEL: @add_v64i8(
-; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1
-; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1
-; AVX-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]])
-; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0
-; AVX-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1
-; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
-; AVX-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
-; AVX-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]])
-; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0
-; AVX-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
-; AVX-NEXT: ret void
-;
-; AVX2-LABEL: @add_v64i8(
-; AVX2-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1
-; AVX2-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1
-; AVX2-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]])
-; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0
-; AVX2-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1
-; AVX2-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
-; AVX2-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
-; AVX2-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]])
-; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0
-; AVX2-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
-; AVX2-NEXT: ret void
-;
-; KNL-LABEL: @add_v64i8(
-; KNL-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @a8, align 1
-; KNL-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @b8, align 1
-; KNL-NEXT: [[TMP3:%.*]] = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]])
-; KNL-NEXT: [[TMP130:%.*]] = extractvalue { <64 x i8>, <64 x i1> } [[TMP3]], 0
-; KNL-NEXT: store <64 x i8> [[TMP130]], ptr @c8, align 1
-; KNL-NEXT: ret void
-;
-; AVX512-LABEL: @add_v64i8(
-; AVX512-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @a8, align 1
-; AVX512-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @b8, align 1
-; AVX512-NEXT: [[TMP3:%.*]] = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]])
-; AVX512-NEXT: [[TMP130:%.*]] = extractvalue { <64 x i8>, <64 x i1> } [[TMP3]], 0
-; AVX512-NEXT: store <64 x i8> [[TMP130]], ptr @c8, align 1
-; AVX512-NEXT: ret void
-;
-; AVX512_256-LABEL: @add_v64i8(
-; AVX512_256-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1
-; AVX512_256-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1
-; AVX512_256-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]])
-; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0
-; AVX512_256-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1
-; AVX512_256-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
-; AVX512_256-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
-; AVX512_256-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]])
-; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0
-; AVX512_256-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
-; AVX512_256-NEXT: ret void
+; CHECK-LABEL: @add_v64i8(
+; CHECK-NEXT: [[A0:%.*]] = load i8, ptr @a8, align 1
+; CHECK-NEXT: [[A1:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 1), align 1
+; CHECK-NEXT: [[A2:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 2), align 1
+; CHECK-NEXT: [[A3:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 3), align 1
+; CHECK-NEXT: [[A4:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 4), align 1
+; CHECK-NEXT: [[A5:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 5), align 1
+; CHECK-NEXT: [[A6:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 6), align 1
+; CHECK-NEXT: [[A7:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 7), align 1
+; CHECK-NEXT: [[A8:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 8), align 1
+; CHECK-NEXT: [[A9:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 9), align 1
+; CHECK-NEXT: [[A10:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 10), align 1
+; CHECK-NEXT: [[A11:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 11), align 1
+; CHECK-NEXT: [[A12:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 12), align 1
+; CHECK-NEXT: [[A13:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 13), align 1
+; CHECK-NEXT: [[A14:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 14), align 1
+; CHECK-NEXT: [[A15:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 15), align 1
+; CHECK-NEXT: [[A16:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1
+; CHECK-NEXT: [[A17:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 17), align 1
+; CHECK-NEXT: [[A18:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 18), align 1
+; CHECK-NEXT: [[A19:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 19), align 1
+; CHECK-NEXT: [[A20:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 20), align 1
+; CHECK-NEXT: [[A21:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 21), align 1
+; CHECK-NEXT: [[A22:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 22), align 1
+; CHECK-NEXT: [[A23:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 23), align 1
+; CHECK-NEXT: [[A24:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 24), align 1
+; CHECK-NEXT: [[A25:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 25), align 1
+; CHECK-NEXT: [[A26:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 26), align 1
+; CHECK-NEXT: [[A27:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 27), align 1
+; CHECK-NEXT: [[A28:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 28), align 1
+; CHECK-NEXT: [[A29:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 29), align 1
+; CHECK-NEXT: [[A30:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 30), align 1
+; CHECK-NEXT: [[A31:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 31), align 1
+; CHECK-NEXT: [[A32:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
+; CHECK-NEXT: [[A33:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 33), align 1
+; CHECK-NEXT: [[A34:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 34), align 1
+; CHECK-NEXT: [[A35:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 35), align 1
+; CHECK-NEXT: [[A36:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 36), align 1
+; CHECK-NEXT: [[A37:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 37), align 1
+; CHECK-NEXT: [[A38:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 38), align 1
+; CHECK-NEXT: [[A39:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 39), align 1
+; CHECK-NEXT: [[A40:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 40), align 1
+; CHECK-NEXT: [[A41:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 41), align 1
+; CHECK-NEXT: [[A42:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 42), align 1
+; CHECK-NEXT: [[A43:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 43), align 1
+; CHECK-NEXT: [[A44:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 44), align 1
+; CHECK-NEXT: [[A45:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 45), align 1
+; CHECK-NEXT: [[A46:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 46), align 1
+; CHECK-NEXT: [[A47:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 47), align 1
+; CHECK-NEXT: [[A48:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
+; CHECK-NEXT: [[A49:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 49), align 1
+; CHECK-NEXT: [[A50:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 50), align 1
+; CHECK-NEXT: [[A51:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 51), align 1
+; CHECK-NEXT: [[A52:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 52), align 1
+; CHECK-NEXT: [[A53:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 53), align 1
+; CHECK-NEXT: [[A54:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 54), align 1
+; CHECK-NEXT: [[A55:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 55), align 1
+; CHECK-NEXT: [[A56:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 56), align 1
+; CHECK-NEXT: [[A57:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 57), align 1
+; CHECK-NEXT: [[A58:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 58), align 1
+; CHECK-NEXT: [[A59:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 59), align 1
+; CHECK-NEXT: [[A60:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 60), align 1
+; CHECK-NEXT: [[A61:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 61), align 1
+; CHECK-NEXT: [[A62:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 62), align 1
+; CHECK-NEXT: [[A63:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 63), align 1
+; CHECK-NEXT: [[B0:%.*]] = load i8, ptr @b8, align 1
+; CHECK-NEXT: [[B1:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 1), align 1
+; CHECK-NEXT: [[B2:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 2), align 1
+; CHECK-NEXT: [[B3:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 3), align 1
+; CHECK-NEXT: [[B4:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 4), align 1
+; CHECK-NEXT: [[B5:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 5), align 1
+; CHECK-NEXT: [[B6:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 6), align 1
+; CHECK-NEXT: [[B7:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 7), align 1
+; CHECK-NEXT: [[B8:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 8), align 1
+; CHECK-NEXT: [[B9:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 9), align 1
+; CHECK-NEXT: [[B10:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 10), align 1
+; CHECK-NEXT: [[B11:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 11), align 1
+; CHECK-NEXT: [[B12:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 12), align 1
+; CHECK-NEXT: [[B13:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 13), align 1
+; CHECK-NEXT: [[B14:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 14), align 1
+; CHECK-NEXT: [[B15:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 15), align 1
+; CHECK-NEXT: [[B16:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1
+; CHECK-NEXT: [[B17:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 17), align 1
+; CHECK-NEXT: [[B18:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 18), align 1
+; CHECK-NEXT: [[B19:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 19), align 1
+; CHECK-NEXT: [[B20:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 20), align 1
+; CHECK-NEXT: [[B21:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 21), align 1
+; CHECK-NEXT: [[B22:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 22), align 1
+; CHECK-NEXT: [[B23:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 23), align 1
+; CHECK-NEXT: [[B24:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 24), align 1
+; CHECK-NEXT: [[B25:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 25), align 1
+; CHECK-NEXT: [[B26:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 26), align 1
+; CHECK-NEXT: [[B27:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 27), align 1
+; CHECK-NEXT: [[B28:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 28), align 1
+; CHECK-NEXT: [[B29:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 29), align 1
+; CHECK-NEXT: [[B30:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 30), align 1
+; CHECK-NEXT: [[B31:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 31), align 1
+; CHECK-NEXT: [[B32:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
+; CHECK-NEXT: [[B33:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 33), align 1
+; CHECK-NEXT: [[B34:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 34), align 1
+; CHECK-NEXT: [[B35:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 35), align 1
+; CHECK-NEXT: [[B36:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 36), align 1
+; CHECK-NEXT: [[B37:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 37), align 1
+; CHECK-NEXT: [[B38:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 38), align 1
+; CHECK-NEXT: [[B39:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 39), align 1
+; CHECK-NEXT: [[B40:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 40), align 1
+; CHECK-NEXT: [[B41:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 41), align 1
+; CHECK-NEXT: [[B42:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 42), align 1
+; CHECK-NEXT: [[B43:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 43), align 1
+; CHECK-NEXT: [[B44:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 44), align 1
+; CHECK-NEXT: [[B45:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 45), align 1
+; CHECK-NEXT: [[B46:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 46), align 1
+; CHECK-NEXT: [[B47:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 47), align 1
+; CHECK-NEXT: [[B48:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
+; CHECK-NEXT: [[B49:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 49), align 1
+; CHECK-NEXT: [[B50:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 50), align 1
+; CHECK-NEXT: [[B51:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 51), align 1
+; CHECK-NEXT: [[B52:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 52), align 1
+; CHECK-NEXT: [[B53:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 53), align 1
+; CHECK-NEXT: [[B54:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 54), align 1
+; CHECK-NEXT: [[B55:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 55), align 1
+; CHECK-NEXT: [[B56:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 56), align 1
+; CHECK-NEXT: [[B57:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 57), align 1
+; CHECK-NEXT: [[B58:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 58), align 1
+; CHECK-NEXT: [[B59:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 59), align 1
+; CHECK-NEXT: [[B60:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 60), align 1
+; CHECK-NEXT: [[B61:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 61), align 1
+; CHECK-NEXT: [[B62:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 62), align 1
+; CHECK-NEXT: [[B63:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 63), align 1
+; CHECK-NEXT: [[C0:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A0]], i8 [[B0]])
+; CHECK-NEXT: [[C1:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A1]], i8 [[B1]])
+; CHECK-NEXT: [[C2:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A2]], i8 [[B2]])
+; CHECK-NEXT: [[C3:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A3]], i8 [[B3]])
+; CHECK-NEXT: [[C4:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A4]], i8 [[B4]])
+; CHECK-NEXT: [[C5:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A5]], i8 [[B5]])
+; CHECK-NEXT: [[C6:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A6]], i8 [[B6]])
+; CHECK-NEXT: [[C7:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A7]], i8 [[B7]])
+; CHECK-NEXT: [[C8:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A8]], i8 [[B8]])
+; CHECK-NEXT: [[C9:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A9]], i8 [[B9]])
+; CHECK-NEXT: [[C10:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A10]], i8 [[B10]])
+; CHECK-NEXT: [[C11:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A11]], i8 [[B11]])
+; CHECK-NEXT: [[C12:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A12]], i8 [[B12]])
+; CHECK-NEXT: [[C13:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A13]], i8 [[B13]])
+; CHECK-NEXT: [[C14:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A14]], i8 [[B14]])
+; CHECK-NEXT: [[C15:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A15]], i8 [[B15]])
+; CHECK-NEXT: [[C16:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A16]], i8 [[B16]])
+; CHECK-NEXT: [[C17:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A17]], i8 [[B17]])
+; CHECK-NEXT: [[C18:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A18]], i8 [[B18]])
+; CHECK-NEXT: [[C19:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A19]], i8 [[B19]])
+; CHECK-NEXT: [[C20:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A20]], i8 [[B20]])
+; CHECK-NEXT: [[C21:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A21]], i8 [[B21]])
+; CHECK-NEXT: [[C22:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A22]], i8 [[B22]])
+; CHECK-NEXT: [[C23:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A23]], i8 [[B23]])
+; CHECK-NEXT: [[C24:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A24]], i8 [[B24]])
+; CHECK-NEXT: [[C25:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A25]], i8 [[B25]])
+; CHECK-NEXT: [[C26:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A26]], i8 [[B26]])
+; CHECK-NEXT: [[C27:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A27]], i8 [[B27]])
+; CHECK-NEXT: [[C28:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A28]], i8 [[B28]])
+; CHECK-NEXT: [[C29:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A29]], i8 [[B29]])
+; CHECK-NEXT: [[C30:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A30]], i8 [[B30]])
+; CHECK-NEXT: [[C31:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A31]], i8 [[B31]])
+; CHECK-NEXT: [[C32:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A32]], i8 [[B32]])
+; CHECK-NEXT: [[C33:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A33]], i8 [[B33]])
+; CHECK-NEXT: [[C34:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A34]], i8 [[B34]])
+; CHECK-NEXT: [[C35:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A35]], i8 [[B35]])
+; CHECK-NEXT: [[C36:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A36]], i8 [[B36]])
+; CHECK-NEXT: [[C37:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A37]], i8 [[B37]])
+; CHECK-NEXT: [[C38:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A38]], i8 [[B38]])
+; CHECK-NEXT: [[C39:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A39]], i8 [[B39]])
+; CHECK-NEXT: [[C40:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A40]], i8 [[B40]])
+; CHECK-NEXT: [[C41:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A41]], i8 [[B41]])
+; CHECK-NEXT: [[C42:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A42]], i8 [[B42]])
+; CHECK-NEXT: [[C43:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A43]], i8 [[B43]])
+; CHECK-NEXT: [[C44:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A44]], i8 [[B44]])
+; CHECK-NEXT: [[C45:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A45]], i8 [[B45]])
+; CHECK-NEXT: [[C46:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A46]], i8 [[B46]])
+; CHECK-NEXT: [[C47:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A47]], i8 [[B47]])
+; CHECK-NEXT: [[C48:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A48]], i8 [[B48]])
+; CHECK-NEXT: [[C49:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A49]], i8 [[B49]])
+; CHECK-NEXT: [[C50:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A50]], i8 [[B50]])
+; CHECK-NEXT: [[C51:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A51]], i8 [[B51]])
+; CHECK-NEXT: [[C52:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A52]], i8 [[B52]])
+; CHECK-NEXT: [[C53:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A53]], i8 [[B53]])
+; CHECK-NEXT: [[C54:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A54]], i8 [[B54]])
+; CHECK-NEXT: [[C55:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A55]], i8 [[B55]])
+; CHECK-NEXT: [[C56:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A56]], i8 [[B56]])
+; CHECK-NEXT: [[C57:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A57]], i8 [[B57]])
+; CHECK-NEXT: [[C58:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A58]], i8 [[B58]])
+; CHECK-NEXT: [[C59:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A59]], i8 [[B59]])
+; CHECK-NEXT: [[C60:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A60]], i8 [[B60]])
+; CHECK-NEXT: [[C61:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A61]], i8 [[B61]])
+; CHECK-NEXT: [[C62:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A62]], i8 [[B62]])
+; CHECK-NEXT: [[C63:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A63]], i8 [[B63]])
+; CHECK-NEXT: [[R0:%.*]] = extractvalue { i8, i1 } [[C0]], 0
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i8, i1 } [[C1]], 0
+; CHECK-NEXT: [[R2:%.*]] = extractvalue { i8, i1 } [[C2]], 0
+; CHECK-NEXT: [[R3:%.*]] = extractvalue { i8, i1 } [[C3]], 0
+; CHECK-NEXT: [[R4:%.*]] = extractvalue { i8, i1 } [[C4]], 0
+; CHECK-NEXT: [[R5:%.*]] = extractvalue { i8, i1 } [[C5]], 0
+; CHECK-NEXT: [[R6:%.*]] = extractvalue { i8, i1 } [[C6]], 0
+; CHECK-NEXT: [[R7:%.*]] = extractvalue { i8, i1 } [[C7]], 0
+; CHECK-NEXT: [[R8:%.*]] = extractvalue { i8, i1 } [[C8]], 0
+; CHECK-NEXT: [[R9:%.*]] = extractvalue { i8, i1 } [[C9]], 0
+; CHECK-NEXT: [[R10:%.*]] = extractvalue { i8, i1 } [[C10]], 0
+; CHECK-NEXT: [[R11:%.*]] = extractvalue { i8, i1 } [[C11]], 0
+; CHECK-NEXT: [[R12:%.*]] = extractvalue { i8, i1 } [[C12]], 0
+; CHECK-NEXT: [[R13:%.*]] = extractvalue { i8, i1 } [[C13]], 0
+; CHECK-NEXT: [[R14:%.*]] = extractvalue { i8, i1 } [[C14]], 0
+; CHECK-NEXT: [[R15:%.*]] = extractvalue { i8, i1 } [[C15]], 0
+; CHECK-NEXT: [[R16:%.*]] = extractvalue { i8, i1 } [[C16]], 0
+; CHECK-NEXT: [[R17:%.*]] = extractvalue { i8, i1 } [[C17]], 0
+; CHECK-NEXT: [[R18:%.*]] = extractvalue { i8, i1 } [[C18]], 0
+; CHECK-NEXT: [[R19:%.*]] = extractvalue { i8, i1 } [[C19]], 0
+; CHECK-NEXT: [[R20:%.*]] = extractvalue { i8, i1 } [[C20]], 0
+; CHECK-NEXT: [[R21:%.*]] = extractvalue { i8, i1 } [[C21]], 0
+; CHECK-NEXT: [[R22:%.*]] = extractvalue { i8, i1 } [[C22]], 0
+; CHECK-NEXT: [[R23:%.*]] = extractvalue { i8, i1 } [[C23]], 0
+; CHECK-NEXT: [[R24:%.*]] = extractvalue { i8, i1 } [[C24]], 0
+; CHECK-NEXT: [[R25:%.*]] = extractvalue { i8, i1 } [[C25]], 0
+; CHECK-NEXT: [[R26:%.*]] = extractvalue { i8, i1 } [[C26]], 0
+; CHECK-NEXT: [[R27:%.*]] = extractvalue { i8, i1 } [[C27]], 0
+; CHECK-NEXT: [[R28:%.*]] = extractvalue { i8, i1 } [[C28]], 0
+; CHECK-NEXT: [[R29:%.*]] = extractvalue { i8, i1 } [[C29]], 0
+; CHECK-NEXT: [[R30:%.*]] = extractvalue { i8, i1 } [[C30]], 0
+; CHECK-NEXT: [[R31:%.*]] = extractvalue { i8, i1 } [[C31]], 0
+; CHECK-NEXT: [[R32:%.*]] = extractvalue { i8, i1 } [[C32]], 0
+; CHECK-NEXT: [[R33:%.*]] = extractvalue { i8, i1 } [[C33]], 0
+; CHECK-NEXT: [[R34:%.*]] = extractvalue { i8, i1 } [[C34]], 0
+; CHECK-NEXT: [[R35:%.*]] = extractvalue { i8, i1 } [[C35]], 0
+; CHECK-NEXT: [[R36:%.*]] = extractvalue { i8, i1 } [[C36]], 0
+; CHECK-NEXT: [[R37:%.*]] = extractvalue { i8, i1 } [[C37]], 0
+; CHECK-NEXT: [[R38:%.*]] = extractvalue { i8, i1 } [[C38]], 0
+; CHECK-NEXT: [[R39:%.*]] = extractvalue { i8, i1 } [[C39]], 0
+; CHECK-NEXT: [[R40:%.*]] = extractvalue { i8, i1 } [[C40]], 0
+; CHECK-NEXT: [[R41:%.*]] = extractvalue { i8, i1 } [[C41]], 0
+; CHECK-NEXT: [[R42:%.*]] = extractvalue { i8, i1 } [[C42]], 0
+; CHECK-NEXT: [[R43:%.*]] = extractvalue { i8, i1 } [[C43]], 0
+; CHECK-NEXT: [[R44:%.*]] = extractvalue { i8, i1 } [[C44]], 0
+; CHECK-NEXT: [[R45:%.*]] = extractvalue { i8, i1 } [[C45]], 0
+; CHECK-NEXT: [[R46:%.*]] = extractvalue { i8, i1 } [[C46]], 0
+; CHECK-NEXT: [[R47:%.*]] = extractvalue { i8, i1 } [[C47]], 0
+; CHECK-NEXT: [[R48:%.*]] = extractvalue { i8, i1 } [[C48]], 0
+; CHECK-NEXT: [[R49:%.*]] = extractvalue { i8, i1 } [[C49]], 0
+; CHECK-NEXT: [[R50:%.*]] = extractvalue { i8, i1 } [[C50]], 0
+; CHECK-NEXT: [[R51:%.*]] = extractvalue { i8, i1 } [[C51]], 0
+; CHECK-NEXT: [[R52:%.*]] = extractvalue { i8, i1 } [[C52]], 0
+; CHECK-NEXT: [[R53:%.*]] = extractvalue { i8, i1 } [[C53]], 0
+; CHECK-NEXT: [[R54:%.*]] = extractvalue { i8, i1 } [[C54]], 0
+; CHECK-NEXT: [[R55:%.*]] = extractvalue { i8, i1 } [[C55]], 0
+; CHECK-NEXT: [[R56:%.*]] = extractvalue { i8, i1 } [[C56]], 0
+; CHECK-NEXT: [[R57:%.*]] = extractvalue { i8, i1 } [[C57]], 0
+; CHECK-NEXT: [[R58:%.*]] = extractvalue { i8, i1 } [[C58]], 0
+; CHECK-NEXT: [[R59:%.*]] = extractvalue { i8, i1 } [[C59]], 0
+; CHECK-NEXT: [[R60:%.*]] = extractvalue { i8, i1 } [[C60]], 0
+; CHECK-NEXT: [[R61:%.*]] = extractvalue { i8, i1 } [[C61]], 0
+; CHECK-NEXT: [[R62:%.*]] = extractvalue { i8, i1 } [[C62]], 0
+; CHECK-NEXT: [[R63:%.*]] = extractvalue { i8, i1 } [[C63]], 0
+; CHECK-NEXT: store i8 [[R0]], ptr @c8, align 1
+; CHECK-NEXT: store i8 [[R1]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 1), align 1
+; CHECK-NEXT: store i8 [[R2]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 2), align 1
+; CHECK-NEXT: store i8 [[R3]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 3), align 1
+; CHECK-NEXT: store i8 [[R4]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 4), align 1
+; CHECK-NEXT: store i8 [[R5]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 5), align 1
+; CHECK-NEXT: store i8 [[R6]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 6), align 1
+; CHECK-NEXT: store i8 [[R7]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 7), align 1
+; CHECK-NEXT: store i8 [[R8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 8), align 1
+; CHECK-NEXT: store i8 [[R9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 9), align 1
+; CHECK-NEXT: store i8 [[R10]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 10), align 1
+; CHECK-NEXT: store i8 [[R11]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 11), align 1
+; CHECK-NEXT: store i8 [[R12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 12), align 1
+; CHECK-NEXT: store i8 [[R13]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 13), align 1
+; CHECK-NEXT: store i8 [[R14]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 14), align 1
+; CHECK-NEXT: store i8 [[R15]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 15), align 1
+; CHECK-NEXT: store i8 [[R16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1
+; CHECK-NEXT: store i8 [[R17]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 17), align 1
+; CHECK-NEXT: store i8 [[R18]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 18), align 1
+; CHECK-NEXT: store i8 [[R19]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 19), align 1
+; CHECK-NEXT: store i8 [[R20]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 20), align 1
+; CHECK-NEXT: store i8 [[R21]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 21), align 1
+; CHECK-NEXT: store i8 [[R22]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 22), align 1
+; CHECK-NEXT: store i8 [[R23]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 23), align 1
+; CHECK-NEXT: store i8 [[R24]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 24), align 1
+; CHECK-NEXT: store i8 [[R25]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 25), align 1
+; CHECK-NEXT: store i8 [[R26]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 26), align 1
+; CHECK-NEXT: store i8 [[R27]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 27), align 1
+; CHECK-NEXT: store i8 [[R28]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 28), align 1
+; CHECK-NEXT: store i8 [[R29]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 29), align 1
+; CHECK-NEXT: store i8 [[R30]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 30), align 1
+; CHECK-NEXT: store i8 [[R31]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 31), align 1
+; CHECK-NEXT: store i8 [[R32]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
+; CHECK-NEXT: store i8 [[R33]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 33), align 1
+; CHECK-NEXT: store i8 [[R34]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 34), align 1
+; CHECK-NEXT: store i8 [[R35]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 35), align 1
+; CHECK-NEXT: store i8 [[R36]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 36), align 1
+; CHECK-NEXT: store i8 [[R37]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 37), align 1
+; CHECK-NEXT: store i8 [[R38]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 38), align 1
+; CHECK-NEXT: store i8 [[R39]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 39), align 1
+; CHECK-NEXT: store i8 [[R40]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 40), align 1
+; CHECK-NEXT: store i8 [[R41]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 41), align 1
+; CHECK-NEXT: store i8 [[R42]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 42), align 1
+; CHECK-NEXT: store i8 [[R43]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 43), align 1
+; CHECK-NEXT: store i8 [[R44]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 44), align 1
+; CHECK-NEXT: store i8 [[R45]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 45), align 1
+; CHECK-NEXT: store i8 [[R46]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 46), align 1
+; CHECK-NEXT: store i8 [[R47]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 47), align 1
+; CHECK-NEXT: store i8 [[R48]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
+; CHECK-NEXT: store i8 [[R49]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 49), align 1
+; CHECK-NEXT: store i8 [[R50]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 50), align 1
+; CHECK-NEXT: store i8 [[R51]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 51), align 1
+; CHECK-NEXT: store i8 [[R52]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 52), align 1
+; CHECK-NEXT: store i8 [[R53]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 53), align 1
+; CHECK-NEXT: store i8 [[R54]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 54), align 1
+; CHECK-NEXT: store i8 [[R55]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 55), align 1
+; CHECK-NEXT: store i8 [[R56]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 56), align 1
+; CHECK-NEXT: store i8 [[R57]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 57), align 1
+; CHECK-NEXT: store i8 [[R58]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 58), align 1
+; CHECK-NEXT: store i8 [[R59]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 59), align 1
+; CHECK-NEXT: store i8 [[R60]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 60), align 1
+; CHECK-NEXT: store i8 [[R61]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 61), align 1
+; CHECK-NEXT: store i8 [[R62]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 62), align 1
+; CHECK-NEXT: store i8 [[R63]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 63), align 1
+; CHECK-NEXT: ret void
;
%a0 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 0 ), align 1
%a1 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 1 ), align 1
@@ -1084,5 +1252,3 @@ define void @add_v64i8() {
store i8 %r63, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 63), align 1
ret void
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK: {{.*}}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-uaddo.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-uaddo.ll
index f5d2212cbe584..fc67cec60f177 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-uaddo.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-uaddo.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,KNL
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512_256
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s
@a64 = common global [8 x i64] zeroinitializer, align 64
@b64 = common global [8 x i64] zeroinitializer, align 64
@@ -26,146 +26,48 @@ declare {i16, i1} @llvm.uadd.with.overflow.i16(i16, i16)
declare {i8 , i1} @llvm.uadd.with.overflow.i8 (i8 , i8 )
define void @add_v8i64() {
-; SSE-LABEL: @add_v8i64(
-; SSE-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8
-; SSE-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
-; SSE-NEXT: [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
-; SSE-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8
-; SSE-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
-; SSE-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8
-; SSE-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
-; SSE-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8
-; SSE-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8
-; SSE-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8
-; SSE-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
-; SSE-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8
-; SSE-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
-; SSE-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8
-; SSE-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
-; SSE-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8
-; SSE-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A0]], i64 [[B0]])
-; SSE-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A1]], i64 [[B1]])
-; SSE-NEXT: [[C2:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A2]], i64 [[B2]])
-; SSE-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A3]], i64 [[B3]])
-; SSE-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A4]], i64 [[B4]])
-; SSE-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A5]], i64 [[B5]])
-; SSE-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A6]], i64 [[B6]])
-; SSE-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A7]], i64 [[B7]])
-; SSE-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0
-; SSE-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0
-; SSE-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0
-; SSE-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0
-; SSE-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0
-; SSE-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0
-; SSE-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0
-; SSE-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0
-; SSE-NEXT: store i64 [[R0]], ptr @c64, align 8
-; SSE-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8
-; SSE-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
-; SSE-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8
-; SSE-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
-; SSE-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8
-; SSE-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
-; SSE-NEXT: store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8
-; SSE-NEXT: ret void
-;
-; SLM-LABEL: @add_v8i64(
-; SLM-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8
-; SLM-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
-; SLM-NEXT: [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
-; SLM-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8
-; SLM-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
-; SLM-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8
-; SLM-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
-; SLM-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8
-; SLM-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8
-; SLM-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8
-; SLM-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
-; SLM-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8
-; SLM-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
-; SLM-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8
-; SLM-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
-; SLM-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8
-; SLM-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A0]], i64 [[B0]])
-; SLM-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A1]], i64 [[B1]])
-; SLM-NEXT: [[C2:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A2]], i64 [[B2]])
-; SLM-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A3]], i64 [[B3]])
-; SLM-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A4]], i64 [[B4]])
-; SLM-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A5]], i64 [[B5]])
-; SLM-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A6]], i64 [[B6]])
-; SLM-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A7]], i64 [[B7]])
-; SLM-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0
-; SLM-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0
-; SLM-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0
-; SLM-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0
-; SLM-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0
-; SLM-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0
-; SLM-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0
-; SLM-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0
-; SLM-NEXT: store i64 [[R0]], ptr @c64, align 8
-; SLM-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8
-; SLM-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
-; SLM-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8
-; SLM-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
-; SLM-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8
-; SLM-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
-; SLM-NEXT: store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8
-; SLM-NEXT: ret void
-;
-; AVX-LABEL: @add_v8i64(
-; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8
-; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8
-; AVX-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]])
-; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0
-; AVX-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8
-; AVX-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
-; AVX-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
-; AVX-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]])
-; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0
-; AVX-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
-; AVX-NEXT: ret void
-;
-; AVX2-LABEL: @add_v8i64(
-; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8
-; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8
-; AVX2-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]])
-; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0
-; AVX2-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8
-; AVX2-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
-; AVX2-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
-; AVX2-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]])
-; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0
-; AVX2-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
-; AVX2-NEXT: ret void
-;
-; KNL-LABEL: @add_v8i64(
-; KNL-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8
-; KNL-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8
-; KNL-NEXT: [[TMP3:%.*]] = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]])
-; KNL-NEXT: [[TMP18:%.*]] = extractvalue { <8 x i64>, <8 x i1> } [[TMP3]], 0
-; KNL-NEXT: store <8 x i64> [[TMP18]], ptr @c64, align 8
-; KNL-NEXT: ret void
-;
-; AVX512-LABEL: @add_v8i64(
-; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8
-; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8
-; AVX512-NEXT: [[TMP3:%.*]] = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]])
-; AVX512-NEXT: [[TMP18:%.*]] = extractvalue { <8 x i64>, <8 x i1> } [[TMP3]], 0
-; AVX512-NEXT: store <8 x i64> [[TMP18]], ptr @c64, align 8
-; AVX512-NEXT: ret void
-;
-; AVX512_256-LABEL: @add_v8i64(
-; AVX512_256-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8
-; AVX512_256-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8
-; AVX512_256-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]])
-; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0
-; AVX512_256-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8
-; AVX512_256-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
-; AVX512_256-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
-; AVX512_256-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]])
-; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0
-; AVX512_256-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
-; AVX512_256-NEXT: ret void
+; CHECK-LABEL: @add_v8i64(
+; CHECK-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8
+; CHECK-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
+; CHECK-NEXT: [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
+; CHECK-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8
+; CHECK-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
+; CHECK-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8
+; CHECK-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
+; CHECK-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8
+; CHECK-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8
+; CHECK-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8
+; CHECK-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
+; CHECK-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8
+; CHECK-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
+; CHECK-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8
+; CHECK-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
+; CHECK-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8
+; CHECK-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A0]], i64 [[B0]])
+; CHECK-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A1]], i64 [[B1]])
+; CHECK-NEXT: [[C2:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A2]], i64 [[B2]])
+; CHECK-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A3]], i64 [[B3]])
+; CHECK-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A4]], i64 [[B4]])
+; CHECK-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A5]], i64 [[B5]])
+; CHECK-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A6]], i64 [[B6]])
+; CHECK-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A7]], i64 [[B7]])
+; CHECK-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0
+; CHECK-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0
+; CHECK-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0
+; CHECK-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0
+; CHECK-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0
+; CHECK-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0
+; CHECK-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0
+; CHECK-NEXT: store i64 [[R0]], ptr @c64, align 8
+; CHECK-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8
+; CHECK-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
+; CHECK-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8
+; CHECK-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
+; CHECK-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8
+; CHECK-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
+; CHECK-NEXT: store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8
+; CHECK-NEXT: ret void
;
%a0 = load i64, ptr @a64, align 8
%a1 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
@@ -211,106 +113,88 @@ define void @add_v8i64() {
}
define void @add_v16i32() {
-; SSE-LABEL: @add_v16i32(
-; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @a32, align 4
-; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @b32, align 4
-; SSE-NEXT: [[TMP3:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
-; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP3]], 0
-; SSE-NEXT: store <4 x i32> [[TMP4]], ptr @c32, align 4
-; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4
-; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4
-; SSE-NEXT: [[TMP7:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> [[TMP5]], <4 x i32> [[TMP6]])
-; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP7]], 0
-; SSE-NEXT: store <4 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4
-; SSE-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
-; SSE-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
-; SSE-NEXT: [[TMP11:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]])
-; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP11]], 0
-; SSE-NEXT: store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4
-; SSE-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4
-; SSE-NEXT: [[TMP14:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4
-; SSE-NEXT: [[TMP15:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> [[TMP13]], <4 x i32> [[TMP14]])
-; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP15]], 0
-; SSE-NEXT: store <4 x i32> [[TMP16]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4
-; SSE-NEXT: ret void
-;
-; SLM-LABEL: @add_v16i32(
-; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @a32, align 4
-; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @b32, align 4
-; SLM-NEXT: [[TMP3:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
-; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP3]], 0
-; SLM-NEXT: store <4 x i32> [[TMP4]], ptr @c32, align 4
-; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4
-; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4
-; SLM-NEXT: [[TMP7:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> [[TMP5]], <4 x i32> [[TMP6]])
-; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP7]], 0
-; SLM-NEXT: store <4 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4
-; SLM-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
-; SLM-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
-; SLM-NEXT: [[TMP11:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]])
-; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP11]], 0
-; SLM-NEXT: store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4
-; SLM-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4
-; SLM-NEXT: [[TMP14:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4
-; SLM-NEXT: [[TMP15:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> [[TMP13]], <4 x i32> [[TMP14]])
-; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP15]], 0
-; SLM-NEXT: store <4 x i32> [[TMP16]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4
-; SLM-NEXT: ret void
-;
-; AVX-LABEL: @add_v16i32(
-; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4
-; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4
-; AVX-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]])
-; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0
-; AVX-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4
-; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
-; AVX-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
-; AVX-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]])
-; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0
-; AVX-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4
-; AVX-NEXT: ret void
-;
-; AVX2-LABEL: @add_v16i32(
-; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4
-; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4
-; AVX2-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]])
-; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0
-; AVX2-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4
-; AVX2-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
-; AVX2-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
-; AVX2-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]])
-; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0
-; AVX2-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4
-; AVX2-NEXT: ret void
-;
-; KNL-LABEL: @add_v16i32(
-; KNL-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4
-; KNL-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4
-; KNL-NEXT: [[TMP3:%.*]] = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]])
-; KNL-NEXT: [[TMP34:%.*]] = extractvalue { <16 x i32>, <16 x i1> } [[TMP3]], 0
-; KNL-NEXT: store <16 x i32> [[TMP34]], ptr @c32, align 4
-; KNL-NEXT: ret void
-;
-; AVX512-LABEL: @add_v16i32(
-; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4
-; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4
-; AVX512-NEXT: [[TMP3:%.*]] = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]])
-; AVX512-NEXT: [[TMP34:%.*]] = extractvalue { <16 x i32>, <16 x i1> } [[TMP3]], 0
-; AVX512-NEXT: store <16 x i32> [[TMP34]], ptr @c32, align 4
-; AVX512-NEXT: ret void
-;
-; AVX512_256-LABEL: @add_v16i32(
-; AVX512_256-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4
-; AVX512_256-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4
-; AVX512_256-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]])
-; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0
-; AVX512_256-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4
-; AVX512_256-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
-; AVX512_256-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
-; AVX512_256-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]])
-; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0
-; AVX512_256-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4
-; AVX512_256-NEXT: ret void
+; CHECK-LABEL: @add_v16i32(
+; CHECK-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4
+; CHECK-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4
+; CHECK-NEXT: [[A2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 2), align 4
+; CHECK-NEXT: [[A3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 3), align 4
+; CHECK-NEXT: [[A4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4
+; CHECK-NEXT: [[A5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 5), align 4
+; CHECK-NEXT: [[A6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 6), align 4
+; CHECK-NEXT: [[A7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 7), align 4
+; CHECK-NEXT: [[A8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
+; CHECK-NEXT: [[A9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 9), align 4
+; CHECK-NEXT: [[A10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 10), align 4
+; CHECK-NEXT: [[A11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 11), align 4
+; CHECK-NEXT: [[A12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4
+; CHECK-NEXT: [[A13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 13), align 4
+; CHECK-NEXT: [[A14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 14), align 4
+; CHECK-NEXT: [[A15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 15), align 4
+; CHECK-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4
+; CHECK-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4
+; CHECK-NEXT: [[B2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 2), align 4
+; CHECK-NEXT: [[B3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 3), align 4
+; CHECK-NEXT: [[B4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4
+; CHECK-NEXT: [[B5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 5), align 4
+; CHECK-NEXT: [[B6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 6), align 4
+; CHECK-NEXT: [[B7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 7), align 4
+; CHECK-NEXT: [[B8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
+; CHECK-NEXT: [[B9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 9), align 4
+; CHECK-NEXT: [[B10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 10), align 4
+; CHECK-NEXT: [[B11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 11), align 4
+; CHECK-NEXT: [[B12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4
+; CHECK-NEXT: [[B13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 13), align 4
+; CHECK-NEXT: [[B14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 14), align 4
+; CHECK-NEXT: [[B15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 15), align 4
+; CHECK-NEXT: [[C0:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A0]], i32 [[B0]])
+; CHECK-NEXT: [[C1:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A1]], i32 [[B1]])
+; CHECK-NEXT: [[C2:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A2]], i32 [[B2]])
+; CHECK-NEXT: [[C3:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A3]], i32 [[B3]])
+; CHECK-NEXT: [[C4:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A4]], i32 [[B4]])
+; CHECK-NEXT: [[C5:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A5]], i32 [[B5]])
+; CHECK-NEXT: [[C6:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A6]], i32 [[B6]])
+; CHECK-NEXT: [[C7:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A7]], i32 [[B7]])
+; CHECK-NEXT: [[C8:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A8]], i32 [[B8]])
+; CHECK-NEXT: [[C9:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A9]], i32 [[B9]])
+; CHECK-NEXT: [[C10:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A10]], i32 [[B10]])
+; CHECK-NEXT: [[C11:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A11]], i32 [[B11]])
+; CHECK-NEXT: [[C12:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A12]], i32 [[B12]])
+; CHECK-NEXT: [[C13:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A13]], i32 [[B13]])
+; CHECK-NEXT: [[C14:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A14]], i32 [[B14]])
+; CHECK-NEXT: [[C15:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A15]], i32 [[B15]])
+; CHECK-NEXT: [[R0:%.*]] = extractvalue { i32, i1 } [[C0]], 0
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[C1]], 0
+; CHECK-NEXT: [[R2:%.*]] = extractvalue { i32, i1 } [[C2]], 0
+; CHECK-NEXT: [[R3:%.*]] = extractvalue { i32, i1 } [[C3]], 0
+; CHECK-NEXT: [[R4:%.*]] = extractvalue { i32, i1 } [[C4]], 0
+; CHECK-NEXT: [[R5:%.*]] = extractvalue { i32, i1 } [[C5]], 0
+; CHECK-NEXT: [[R6:%.*]] = extractvalue { i32, i1 } [[C6]], 0
+; CHECK-NEXT: [[R7:%.*]] = extractvalue { i32, i1 } [[C7]], 0
+; CHECK-NEXT: [[R8:%.*]] = extractvalue { i32, i1 } [[C8]], 0
+; CHECK-NEXT: [[R9:%.*]] = extractvalue { i32, i1 } [[C9]], 0
+; CHECK-NEXT: [[R10:%.*]] = extractvalue { i32, i1 } [[C10]], 0
+; CHECK-NEXT: [[R11:%.*]] = extractvalue { i32, i1 } [[C11]], 0
+; CHECK-NEXT: [[R12:%.*]] = extractvalue { i32, i1 } [[C12]], 0
+; CHECK-NEXT: [[R13:%.*]] = extractvalue { i32, i1 } [[C13]], 0
+; CHECK-NEXT: [[R14:%.*]] = extractvalue { i32, i1 } [[C14]], 0
+; CHECK-NEXT: [[R15:%.*]] = extractvalue { i32, i1 } [[C15]], 0
+; CHECK-NEXT: store i32 [[R0]], ptr @c32, align 4
+; CHECK-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 1), align 4
+; CHECK-NEXT: store i32 [[R2]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 2), align 4
+; CHECK-NEXT: store i32 [[R3]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 3), align 4
+; CHECK-NEXT: store i32 [[R4]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4
+; CHECK-NEXT: store i32 [[R5]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 5), align 4
+; CHECK-NEXT: store i32 [[R6]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 6), align 4
+; CHECK-NEXT: store i32 [[R7]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 7), align 4
+; CHECK-NEXT: store i32 [[R8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4
+; CHECK-NEXT: store i32 [[R9]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 9), align 4
+; CHECK-NEXT: store i32 [[R10]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 10), align 4
+; CHECK-NEXT: store i32 [[R11]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 11), align 4
+; CHECK-NEXT: store i32 [[R12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4
+; CHECK-NEXT: store i32 [[R13]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 13), align 4
+; CHECK-NEXT: store i32 [[R14]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 14), align 4
+; CHECK-NEXT: store i32 [[R15]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 15), align 4
+; CHECK-NEXT: ret void
;
%a0 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 0 ), align 4
%a1 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1 ), align 4
@@ -396,106 +280,168 @@ define void @add_v16i32() {
}
define void @add_v32i16() {
-; SSE-LABEL: @add_v32i16(
-; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2
-; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2
-; SSE-NEXT: [[TMP3:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
-; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP3]], 0
-; SSE-NEXT: store <8 x i16> [[TMP4]], ptr @c16, align 2
-; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2
-; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2
-; SSE-NEXT: [[TMP7:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> [[TMP5]], <8 x i16> [[TMP6]])
-; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP7]], 0
-; SSE-NEXT: store <8 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2
-; SSE-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
-; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
-; SSE-NEXT: [[TMP11:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]])
-; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP11]], 0
-; SSE-NEXT: store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2
-; SSE-NEXT: [[TMP13:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2
-; SSE-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2
-; SSE-NEXT: [[TMP15:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> [[TMP13]], <8 x i16> [[TMP14]])
-; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP15]], 0
-; SSE-NEXT: store <8 x i16> [[TMP16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2
-; SSE-NEXT: ret void
-;
-; SLM-LABEL: @add_v32i16(
-; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2
-; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2
-; SLM-NEXT: [[TMP3:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
-; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP3]], 0
-; SLM-NEXT: store <8 x i16> [[TMP4]], ptr @c16, align 2
-; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2
-; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2
-; SLM-NEXT: [[TMP7:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> [[TMP5]], <8 x i16> [[TMP6]])
-; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP7]], 0
-; SLM-NEXT: store <8 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2
-; SLM-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
-; SLM-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
-; SLM-NEXT: [[TMP11:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]])
-; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP11]], 0
-; SLM-NEXT: store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2
-; SLM-NEXT: [[TMP13:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2
-; SLM-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2
-; SLM-NEXT: [[TMP15:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> [[TMP13]], <8 x i16> [[TMP14]])
-; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP15]], 0
-; SLM-NEXT: store <8 x i16> [[TMP16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2
-; SLM-NEXT: ret void
-;
-; AVX-LABEL: @add_v32i16(
-; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2
-; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2
-; AVX-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]])
-; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0
-; AVX-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2
-; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
-; AVX-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
-; AVX-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]])
-; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0
-; AVX-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2
-; AVX-NEXT: ret void
-;
-; AVX2-LABEL: @add_v32i16(
-; AVX2-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2
-; AVX2-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2
-; AVX2-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]])
-; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0
-; AVX2-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2
-; AVX2-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
-; AVX2-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
-; AVX2-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]])
-; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0
-; AVX2-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2
-; AVX2-NEXT: ret void
-;
-; KNL-LABEL: @add_v32i16(
-; KNL-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2
-; KNL-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2
-; KNL-NEXT: [[TMP3:%.*]] = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]])
-; KNL-NEXT: [[TMP66:%.*]] = extractvalue { <32 x i16>, <32 x i1> } [[TMP3]], 0
-; KNL-NEXT: store <32 x i16> [[TMP66]], ptr @c16, align 2
-; KNL-NEXT: ret void
-;
-; AVX512-LABEL: @add_v32i16(
-; AVX512-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2
-; AVX512-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2
-; AVX512-NEXT: [[TMP3:%.*]] = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]])
-; AVX512-NEXT: [[TMP66:%.*]] = extractvalue { <32 x i16>, <32 x i1> } [[TMP3]], 0
-; AVX512-NEXT: store <32 x i16> [[TMP66]], ptr @c16, align 2
-; AVX512-NEXT: ret void
-;
-; AVX512_256-LABEL: @add_v32i16(
-; AVX512_256-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2
-; AVX512_256-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2
-; AVX512_256-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]])
-; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0
-; AVX512_256-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2
-; AVX512_256-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
-; AVX512_256-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
-; AVX512_256-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]])
-; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0
-; AVX512_256-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2
-; AVX512_256-NEXT: ret void
+; CHECK-LABEL: @add_v32i16(
+; CHECK-NEXT: [[A0:%.*]] = load i16, ptr @a16, align 2
+; CHECK-NEXT: [[A1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1), align 2
+; CHECK-NEXT: [[A2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 2), align 2
+; CHECK-NEXT: [[A3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 3), align 2
+; CHECK-NEXT: [[A4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 4), align 2
+; CHECK-NEXT: [[A5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 5), align 2
+; CHECK-NEXT: [[A6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 6), align 2
+; CHECK-NEXT: [[A7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 7), align 2
+; CHECK-NEXT: [[A8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2
+; CHECK-NEXT: [[A9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 9), align 2
+; CHECK-NEXT: [[A10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 10), align 2
+; CHECK-NEXT: [[A11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 11), align 2
+; CHECK-NEXT: [[A12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 12), align 2
+; CHECK-NEXT: [[A13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 13), align 2
+; CHECK-NEXT: [[A14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 14), align 2
+; CHECK-NEXT: [[A15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 15), align 2
+; CHECK-NEXT: [[A16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
+; CHECK-NEXT: [[A17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 17), align 2
+; CHECK-NEXT: [[A18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 18), align 2
+; CHECK-NEXT: [[A19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 19), align 2
+; CHECK-NEXT: [[A20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 20), align 2
+; CHECK-NEXT: [[A21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 21), align 2
+; CHECK-NEXT: [[A22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 22), align 2
+; CHECK-NEXT: [[A23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 23), align 2
+; CHECK-NEXT: [[A24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2
+; CHECK-NEXT: [[A25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 25), align 2
+; CHECK-NEXT: [[A26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 26), align 2
+; CHECK-NEXT: [[A27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 27), align 2
+; CHECK-NEXT: [[A28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 28), align 2
+; CHECK-NEXT: [[A29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 29), align 2
+; CHECK-NEXT: [[A30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 30), align 2
+; CHECK-NEXT: [[A31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 31), align 2
+; CHECK-NEXT: [[B0:%.*]] = load i16, ptr @b16, align 2
+; CHECK-NEXT: [[B1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 1), align 2
+; CHECK-NEXT: [[B2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 2), align 2
+; CHECK-NEXT: [[B3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 3), align 2
+; CHECK-NEXT: [[B4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 4), align 2
+; CHECK-NEXT: [[B5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 5), align 2
+; CHECK-NEXT: [[B6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 6), align 2
+; CHECK-NEXT: [[B7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 7), align 2
+; CHECK-NEXT: [[B8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2
+; CHECK-NEXT: [[B9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 9), align 2
+; CHECK-NEXT: [[B10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 10), align 2
+; CHECK-NEXT: [[B11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 11), align 2
+; CHECK-NEXT: [[B12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 12), align 2
+; CHECK-NEXT: [[B13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 13), align 2
+; CHECK-NEXT: [[B14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 14), align 2
+; CHECK-NEXT: [[B15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 15), align 2
+; CHECK-NEXT: [[B16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
+; CHECK-NEXT: [[B17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 17), align 2
+; CHECK-NEXT: [[B18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 18), align 2
+; CHECK-NEXT: [[B19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 19), align 2
+; CHECK-NEXT: [[B20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 20), align 2
+; CHECK-NEXT: [[B21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 21), align 2
+; CHECK-NEXT: [[B22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 22), align 2
+; CHECK-NEXT: [[B23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 23), align 2
+; CHECK-NEXT: [[B24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2
+; CHECK-NEXT: [[B25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 25), align 2
+; CHECK-NEXT: [[B26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 26), align 2
+; CHECK-NEXT: [[B27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 27), align 2
+; CHECK-NEXT: [[B28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 28), align 2
+; CHECK-NEXT: [[B29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 29), align 2
+; CHECK-NEXT: [[B30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 30), align 2
+; CHECK-NEXT: [[B31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 31), align 2
+; CHECK-NEXT: [[C0:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A0]], i16 [[B0]])
+; CHECK-NEXT: [[C1:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A1]], i16 [[B1]])
+; CHECK-NEXT: [[C2:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A2]], i16 [[B2]])
+; CHECK-NEXT: [[C3:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A3]], i16 [[B3]])
+; CHECK-NEXT: [[C4:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A4]], i16 [[B4]])
+; CHECK-NEXT: [[C5:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A5]], i16 [[B5]])
+; CHECK-NEXT: [[C6:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A6]], i16 [[B6]])
+; CHECK-NEXT: [[C7:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A7]], i16 [[B7]])
+; CHECK-NEXT: [[C8:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A8]], i16 [[B8]])
+; CHECK-NEXT: [[C9:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A9]], i16 [[B9]])
+; CHECK-NEXT: [[C10:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A10]], i16 [[B10]])
+; CHECK-NEXT: [[C11:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A11]], i16 [[B11]])
+; CHECK-NEXT: [[C12:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A12]], i16 [[B12]])
+; CHECK-NEXT: [[C13:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A13]], i16 [[B13]])
+; CHECK-NEXT: [[C14:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A14]], i16 [[B14]])
+; CHECK-NEXT: [[C15:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A15]], i16 [[B15]])
+; CHECK-NEXT: [[C16:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A16]], i16 [[B16]])
+; CHECK-NEXT: [[C17:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A17]], i16 [[B17]])
+; CHECK-NEXT: [[C18:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A18]], i16 [[B18]])
+; CHECK-NEXT: [[C19:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A19]], i16 [[B19]])
+; CHECK-NEXT: [[C20:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A20]], i16 [[B20]])
+; CHECK-NEXT: [[C21:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A21]], i16 [[B21]])
+; CHECK-NEXT: [[C22:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A22]], i16 [[B22]])
+; CHECK-NEXT: [[C23:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A23]], i16 [[B23]])
+; CHECK-NEXT: [[C24:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A24]], i16 [[B24]])
+; CHECK-NEXT: [[C25:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A25]], i16 [[B25]])
+; CHECK-NEXT: [[C26:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A26]], i16 [[B26]])
+; CHECK-NEXT: [[C27:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A27]], i16 [[B27]])
+; CHECK-NEXT: [[C28:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A28]], i16 [[B28]])
+; CHECK-NEXT: [[C29:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A29]], i16 [[B29]])
+; CHECK-NEXT: [[C30:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A30]], i16 [[B30]])
+; CHECK-NEXT: [[C31:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A31]], i16 [[B31]])
+; CHECK-NEXT: [[R0:%.*]] = extractvalue { i16, i1 } [[C0]], 0
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i16, i1 } [[C1]], 0
+; CHECK-NEXT: [[R2:%.*]] = extractvalue { i16, i1 } [[C2]], 0
+; CHECK-NEXT: [[R3:%.*]] = extractvalue { i16, i1 } [[C3]], 0
+; CHECK-NEXT: [[R4:%.*]] = extractvalue { i16, i1 } [[C4]], 0
+; CHECK-NEXT: [[R5:%.*]] = extractvalue { i16, i1 } [[C5]], 0
+; CHECK-NEXT: [[R6:%.*]] = extractvalue { i16, i1 } [[C6]], 0
+; CHECK-NEXT: [[R7:%.*]] = extractvalue { i16, i1 } [[C7]], 0
+; CHECK-NEXT: [[R8:%.*]] = extractvalue { i16, i1 } [[C8]], 0
+; CHECK-NEXT: [[R9:%.*]] = extractvalue { i16, i1 } [[C9]], 0
+; CHECK-NEXT: [[R10:%.*]] = extractvalue { i16, i1 } [[C10]], 0
+; CHECK-NEXT: [[R11:%.*]] = extractvalue { i16, i1 } [[C11]], 0
+; CHECK-NEXT: [[R12:%.*]] = extractvalue { i16, i1 } [[C12]], 0
+; CHECK-NEXT: [[R13:%.*]] = extractvalue { i16, i1 } [[C13]], 0
+; CHECK-NEXT: [[R14:%.*]] = extractvalue { i16, i1 } [[C14]], 0
+; CHECK-NEXT: [[R15:%.*]] = extractvalue { i16, i1 } [[C15]], 0
+; CHECK-NEXT: [[R16:%.*]] = extractvalue { i16, i1 } [[C16]], 0
+; CHECK-NEXT: [[R17:%.*]] = extractvalue { i16, i1 } [[C17]], 0
+; CHECK-NEXT: [[R18:%.*]] = extractvalue { i16, i1 } [[C18]], 0
+; CHECK-NEXT: [[R19:%.*]] = extractvalue { i16, i1 } [[C19]], 0
+; CHECK-NEXT: [[R20:%.*]] = extractvalue { i16, i1 } [[C20]], 0
+; CHECK-NEXT: [[R21:%.*]] = extractvalue { i16, i1 } [[C21]], 0
+; CHECK-NEXT: [[R22:%.*]] = extractvalue { i16, i1 } [[C22]], 0
+; CHECK-NEXT: [[R23:%.*]] = extractvalue { i16, i1 } [[C23]], 0
+; CHECK-NEXT: [[R24:%.*]] = extractvalue { i16, i1 } [[C24]], 0
+; CHECK-NEXT: [[R25:%.*]] = extractvalue { i16, i1 } [[C25]], 0
+; CHECK-NEXT: [[R26:%.*]] = extractvalue { i16, i1 } [[C26]], 0
+; CHECK-NEXT: [[R27:%.*]] = extractvalue { i16, i1 } [[C27]], 0
+; CHECK-NEXT: [[R28:%.*]] = extractvalue { i16, i1 } [[C28]], 0
+; CHECK-NEXT: [[R29:%.*]] = extractvalue { i16, i1 } [[C29]], 0
+; CHECK-NEXT: [[R30:%.*]] = extractvalue { i16, i1 } [[C30]], 0
+; CHECK-NEXT: [[R31:%.*]] = extractvalue { i16, i1 } [[C31]], 0
+; CHECK-NEXT: store i16 [[R0]], ptr @c16, align 2
+; CHECK-NEXT: store i16 [[R1]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 1), align 2
+; CHECK-NEXT: store i16 [[R2]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 2), align 2
+; CHECK-NEXT: store i16 [[R3]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 3), align 2
+; CHECK-NEXT: store i16 [[R4]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 4), align 2
+; CHECK-NEXT: store i16 [[R5]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 5), align 2
+; CHECK-NEXT: store i16 [[R6]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 6), align 2
+; CHECK-NEXT: store i16 [[R7]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 7), align 2
+; CHECK-NEXT: store i16 [[R8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2
+; CHECK-NEXT: store i16 [[R9]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 9), align 2
+; CHECK-NEXT: store i16 [[R10]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 10), align 2
+; CHECK-NEXT: store i16 [[R11]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 11), align 2
+; CHECK-NEXT: store i16 [[R12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 12), align 2
+; CHECK-NEXT: store i16 [[R13]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 13), align 2
+; CHECK-NEXT: store i16 [[R14]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 14), align 2
+; CHECK-NEXT: store i16 [[R15]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 15), align 2
+; CHECK-NEXT: store i16 [[R16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2
+; CHECK-NEXT: store i16 [[R17]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 17), align 2
+; CHECK-NEXT: store i16 [[R18]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 18), align 2
+; CHECK-NEXT: store i16 [[R19]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 19), align 2
+; CHECK-NEXT: store i16 [[R20]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 20), align 2
+; CHECK-NEXT: store i16 [[R21]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 21), align 2
+; CHECK-NEXT: store i16 [[R22]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 22), align 2
+; CHECK-NEXT: store i16 [[R23]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 23), align 2
+; CHECK-NEXT: store i16 [[R24]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2
+; CHECK-NEXT: store i16 [[R25]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 25), align 2
+; CHECK-NEXT: store i16 [[R26]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 26), align 2
+; CHECK-NEXT: store i16 [[R27]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 27), align 2
+; CHECK-NEXT: store i16 [[R28]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 28), align 2
+; CHECK-NEXT: store i16 [[R29]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 29), align 2
+; CHECK-NEXT: store i16 [[R30]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 30), align 2
+; CHECK-NEXT: store i16 [[R31]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 31), align 2
+; CHECK-NEXT: ret void
;
%a0 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 0 ), align 2
%a1 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1 ), align 2
@@ -661,106 +607,328 @@ define void @add_v32i16() {
}
define void @add_v64i8() {
-; SSE-LABEL: @add_v64i8(
-; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @a8, align 1
-; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1
-; SSE-NEXT: [[TMP3:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
-; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP3]], 0
-; SSE-NEXT: store <16 x i8> [[TMP4]], ptr @c8, align 1
-; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1
-; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1
-; SSE-NEXT: [[TMP7:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> [[TMP5]], <16 x i8> [[TMP6]])
-; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP7]], 0
-; SSE-NEXT: store <16 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1
-; SSE-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
-; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
-; SSE-NEXT: [[TMP11:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> [[TMP9]], <16 x i8> [[TMP10]])
-; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP11]], 0
-; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
-; SSE-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
-; SSE-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
-; SSE-NEXT: [[TMP15:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> [[TMP13]], <16 x i8> [[TMP14]])
-; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP15]], 0
-; SSE-NEXT: store <16 x i8> [[TMP16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
-; SSE-NEXT: ret void
-;
-; SLM-LABEL: @add_v64i8(
-; SLM-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @a8, align 1
-; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1
-; SLM-NEXT: [[TMP3:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
-; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP3]], 0
-; SLM-NEXT: store <16 x i8> [[TMP4]], ptr @c8, align 1
-; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1
-; SLM-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1
-; SLM-NEXT: [[TMP7:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> [[TMP5]], <16 x i8> [[TMP6]])
-; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP7]], 0
-; SLM-NEXT: store <16 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1
-; SLM-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
-; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
-; SLM-NEXT: [[TMP11:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> [[TMP9]], <16 x i8> [[TMP10]])
-; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP11]], 0
-; SLM-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
-; SLM-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
-; SLM-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
-; SLM-NEXT: [[TMP15:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> [[TMP13]], <16 x i8> [[TMP14]])
-; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP15]], 0
-; SLM-NEXT: store <16 x i8> [[TMP16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
-; SLM-NEXT: ret void
-;
-; AVX-LABEL: @add_v64i8(
-; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1
-; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1
-; AVX-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]])
-; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0
-; AVX-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1
-; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
-; AVX-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
-; AVX-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]])
-; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0
-; AVX-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
-; AVX-NEXT: ret void
-;
-; AVX2-LABEL: @add_v64i8(
-; AVX2-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1
-; AVX2-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1
-; AVX2-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]])
-; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0
-; AVX2-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1
-; AVX2-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
-; AVX2-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
-; AVX2-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]])
-; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0
-; AVX2-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
-; AVX2-NEXT: ret void
-;
-; KNL-LABEL: @add_v64i8(
-; KNL-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @a8, align 1
-; KNL-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @b8, align 1
-; KNL-NEXT: [[TMP3:%.*]] = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]])
-; KNL-NEXT: [[TMP130:%.*]] = extractvalue { <64 x i8>, <64 x i1> } [[TMP3]], 0
-; KNL-NEXT: store <64 x i8> [[TMP130]], ptr @c8, align 1
-; KNL-NEXT: ret void
-;
-; AVX512-LABEL: @add_v64i8(
-; AVX512-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @a8, align 1
-; AVX512-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @b8, align 1
-; AVX512-NEXT: [[TMP3:%.*]] = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]])
-; AVX512-NEXT: [[TMP130:%.*]] = extractvalue { <64 x i8>, <64 x i1> } [[TMP3]], 0
-; AVX512-NEXT: store <64 x i8> [[TMP130]], ptr @c8, align 1
-; AVX512-NEXT: ret void
-;
-; AVX512_256-LABEL: @add_v64i8(
-; AVX512_256-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1
-; AVX512_256-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1
-; AVX512_256-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]])
-; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0
-; AVX512_256-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1
-; AVX512_256-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
-; AVX512_256-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
-; AVX512_256-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]])
-; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0
-; AVX512_256-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
-; AVX512_256-NEXT: ret void
+; CHECK-LABEL: @add_v64i8(
+; CHECK-NEXT: [[A0:%.*]] = load i8, ptr @a8, align 1
+; CHECK-NEXT: [[A1:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 1), align 1
+; CHECK-NEXT: [[A2:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 2), align 1
+; CHECK-NEXT: [[A3:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 3), align 1
+; CHECK-NEXT: [[A4:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 4), align 1
+; CHECK-NEXT: [[A5:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 5), align 1
+; CHECK-NEXT: [[A6:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 6), align 1
+; CHECK-NEXT: [[A7:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 7), align 1
+; CHECK-NEXT: [[A8:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 8), align 1
+; CHECK-NEXT: [[A9:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 9), align 1
+; CHECK-NEXT: [[A10:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 10), align 1
+; CHECK-NEXT: [[A11:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 11), align 1
+; CHECK-NEXT: [[A12:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 12), align 1
+; CHECK-NEXT: [[A13:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 13), align 1
+; CHECK-NEXT: [[A14:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 14), align 1
+; CHECK-NEXT: [[A15:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 15), align 1
+; CHECK-NEXT: [[A16:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1
+; CHECK-NEXT: [[A17:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 17), align 1
+; CHECK-NEXT: [[A18:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 18), align 1
+; CHECK-NEXT: [[A19:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 19), align 1
+; CHECK-NEXT: [[A20:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 20), align 1
+; CHECK-NEXT: [[A21:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 21), align 1
+; CHECK-NEXT: [[A22:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 22), align 1
+; CHECK-NEXT: [[A23:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 23), align 1
+; CHECK-NEXT: [[A24:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 24), align 1
+; CHECK-NEXT: [[A25:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 25), align 1
+; CHECK-NEXT: [[A26:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 26), align 1
+; CHECK-NEXT: [[A27:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 27), align 1
+; CHECK-NEXT: [[A28:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 28), align 1
+; CHECK-NEXT: [[A29:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 29), align 1
+; CHECK-NEXT: [[A30:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 30), align 1
+; CHECK-NEXT: [[A31:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 31), align 1
+; CHECK-NEXT: [[A32:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
+; CHECK-NEXT: [[A33:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 33), align 1
+; CHECK-NEXT: [[A34:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 34), align 1
+; CHECK-NEXT: [[A35:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 35), align 1
+; CHECK-NEXT: [[A36:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 36), align 1
+; CHECK-NEXT: [[A37:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 37), align 1
+; CHECK-NEXT: [[A38:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 38), align 1
+; CHECK-NEXT: [[A39:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 39), align 1
+; CHECK-NEXT: [[A40:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 40), align 1
+; CHECK-NEXT: [[A41:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 41), align 1
+; CHECK-NEXT: [[A42:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 42), align 1
+; CHECK-NEXT: [[A43:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 43), align 1
+; CHECK-NEXT: [[A44:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 44), align 1
+; CHECK-NEXT: [[A45:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 45), align 1
+; CHECK-NEXT: [[A46:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 46), align 1
+; CHECK-NEXT: [[A47:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 47), align 1
+; CHECK-NEXT: [[A48:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
+; CHECK-NEXT: [[A49:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 49), align 1
+; CHECK-NEXT: [[A50:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 50), align 1
+; CHECK-NEXT: [[A51:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 51), align 1
+; CHECK-NEXT: [[A52:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 52), align 1
+; CHECK-NEXT: [[A53:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 53), align 1
+; CHECK-NEXT: [[A54:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 54), align 1
+; CHECK-NEXT: [[A55:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 55), align 1
+; CHECK-NEXT: [[A56:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 56), align 1
+; CHECK-NEXT: [[A57:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 57), align 1
+; CHECK-NEXT: [[A58:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 58), align 1
+; CHECK-NEXT: [[A59:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 59), align 1
+; CHECK-NEXT: [[A60:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 60), align 1
+; CHECK-NEXT: [[A61:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 61), align 1
+; CHECK-NEXT: [[A62:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 62), align 1
+; CHECK-NEXT: [[A63:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 63), align 1
+; CHECK-NEXT: [[B0:%.*]] = load i8, ptr @b8, align 1
+; CHECK-NEXT: [[B1:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 1), align 1
+; CHECK-NEXT: [[B2:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 2), align 1
+; CHECK-NEXT: [[B3:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 3), align 1
+; CHECK-NEXT: [[B4:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 4), align 1
+; CHECK-NEXT: [[B5:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 5), align 1
+; CHECK-NEXT: [[B6:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 6), align 1
+; CHECK-NEXT: [[B7:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 7), align 1
+; CHECK-NEXT: [[B8:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 8), align 1
+; CHECK-NEXT: [[B9:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 9), align 1
+; CHECK-NEXT: [[B10:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 10), align 1
+; CHECK-NEXT: [[B11:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 11), align 1
+; CHECK-NEXT: [[B12:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 12), align 1
+; CHECK-NEXT: [[B13:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 13), align 1
+; CHECK-NEXT: [[B14:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 14), align 1
+; CHECK-NEXT: [[B15:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 15), align 1
+; CHECK-NEXT: [[B16:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1
+; CHECK-NEXT: [[B17:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 17), align 1
+; CHECK-NEXT: [[B18:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 18), align 1
+; CHECK-NEXT: [[B19:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 19), align 1
+; CHECK-NEXT: [[B20:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 20), align 1
+; CHECK-NEXT: [[B21:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 21), align 1
+; CHECK-NEXT: [[B22:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 22), align 1
+; CHECK-NEXT: [[B23:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 23), align 1
+; CHECK-NEXT: [[B24:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 24), align 1
+; CHECK-NEXT: [[B25:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 25), align 1
+; CHECK-NEXT: [[B26:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 26), align 1
+; CHECK-NEXT: [[B27:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 27), align 1
+; CHECK-NEXT: [[B28:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 28), align 1
+; CHECK-NEXT: [[B29:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 29), align 1
+; CHECK-NEXT: [[B30:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 30), align 1
+; CHECK-NEXT: [[B31:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 31), align 1
+; CHECK-NEXT: [[B32:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
+; CHECK-NEXT: [[B33:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 33), align 1
+; CHECK-NEXT: [[B34:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 34), align 1
+; CHECK-NEXT: [[B35:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 35), align 1
+; CHECK-NEXT: [[B36:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 36), align 1
+; CHECK-NEXT: [[B37:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 37), align 1
+; CHECK-NEXT: [[B38:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 38), align 1
+; CHECK-NEXT: [[B39:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 39), align 1
+; CHECK-NEXT: [[B40:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 40), align 1
+; CHECK-NEXT: [[B41:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 41), align 1
+; CHECK-NEXT: [[B42:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 42), align 1
+; CHECK-NEXT: [[B43:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 43), align 1
+; CHECK-NEXT: [[B44:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 44), align 1
+; CHECK-NEXT: [[B45:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 45), align 1
+; CHECK-NEXT: [[B46:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 46), align 1
+; CHECK-NEXT: [[B47:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 47), align 1
+; CHECK-NEXT: [[B48:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
+; CHECK-NEXT: [[B49:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 49), align 1
+; CHECK-NEXT: [[B50:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 50), align 1
+; CHECK-NEXT: [[B51:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 51), align 1
+; CHECK-NEXT: [[B52:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 52), align 1
+; CHECK-NEXT: [[B53:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 53), align 1
+; CHECK-NEXT: [[B54:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 54), align 1
+; CHECK-NEXT: [[B55:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 55), align 1
+; CHECK-NEXT: [[B56:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 56), align 1
+; CHECK-NEXT: [[B57:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 57), align 1
+; CHECK-NEXT: [[B58:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 58), align 1
+; CHECK-NEXT: [[B59:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 59), align 1
+; CHECK-NEXT: [[B60:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 60), align 1
+; CHECK-NEXT: [[B61:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 61), align 1
+; CHECK-NEXT: [[B62:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 62), align 1
+; CHECK-NEXT: [[B63:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 63), align 1
+; CHECK-NEXT: [[C0:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A0]], i8 [[B0]])
+; CHECK-NEXT: [[C1:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A1]], i8 [[B1]])
+; CHECK-NEXT: [[C2:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A2]], i8 [[B2]])
+; CHECK-NEXT: [[C3:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A3]], i8 [[B3]])
+; CHECK-NEXT: [[C4:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A4]], i8 [[B4]])
+; CHECK-NEXT: [[C5:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A5]], i8 [[B5]])
+; CHECK-NEXT: [[C6:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A6]], i8 [[B6]])
+; CHECK-NEXT: [[C7:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A7]], i8 [[B7]])
+; CHECK-NEXT: [[C8:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A8]], i8 [[B8]])
+; CHECK-NEXT: [[C9:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A9]], i8 [[B9]])
+; CHECK-NEXT: [[C10:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A10]], i8 [[B10]])
+; CHECK-NEXT: [[C11:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A11]], i8 [[B11]])
+; CHECK-NEXT: [[C12:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A12]], i8 [[B12]])
+; CHECK-NEXT: [[C13:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A13]], i8 [[B13]])
+; CHECK-NEXT: [[C14:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A14]], i8 [[B14]])
+; CHECK-NEXT: [[C15:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A15]], i8 [[B15]])
+; CHECK-NEXT: [[C16:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A16]], i8 [[B16]])
+; CHECK-NEXT: [[C17:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A17]], i8 [[B17]])
+; CHECK-NEXT: [[C18:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A18]], i8 [[B18]])
+; CHECK-NEXT: [[C19:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A19]], i8 [[B19]])
+; CHECK-NEXT: [[C20:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A20]], i8 [[B20]])
+; CHECK-NEXT: [[C21:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A21]], i8 [[B21]])
+; CHECK-NEXT: [[C22:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A22]], i8 [[B22]])
+; CHECK-NEXT: [[C23:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A23]], i8 [[B23]])
+; CHECK-NEXT: [[C24:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A24]], i8 [[B24]])
+; CHECK-NEXT: [[C25:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A25]], i8 [[B25]])
+; CHECK-NEXT: [[C26:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A26]], i8 [[B26]])
+; CHECK-NEXT: [[C27:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A27]], i8 [[B27]])
+; CHECK-NEXT: [[C28:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A28]], i8 [[B28]])
+; CHECK-NEXT: [[C29:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A29]], i8 [[B29]])
+; CHECK-NEXT: [[C30:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A30]], i8 [[B30]])
+; CHECK-NEXT: [[C31:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A31]], i8 [[B31]])
+; CHECK-NEXT: [[C32:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A32]], i8 [[B32]])
+; CHECK-NEXT: [[C33:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A33]], i8 [[B33]])
+; CHECK-NEXT: [[C34:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A34]], i8 [[B34]])
+; CHECK-NEXT: [[C35:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A35]], i8 [[B35]])
+; CHECK-NEXT: [[C36:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A36]], i8 [[B36]])
+; CHECK-NEXT: [[C37:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A37]], i8 [[B37]])
+; CHECK-NEXT: [[C38:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A38]], i8 [[B38]])
+; CHECK-NEXT: [[C39:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A39]], i8 [[B39]])
+; CHECK-NEXT: [[C40:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A40]], i8 [[B40]])
+; CHECK-NEXT: [[C41:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A41]], i8 [[B41]])
+; CHECK-NEXT: [[C42:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A42]], i8 [[B42]])
+; CHECK-NEXT: [[C43:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A43]], i8 [[B43]])
+; CHECK-NEXT: [[C44:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A44]], i8 [[B44]])
+; CHECK-NEXT: [[C45:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A45]], i8 [[B45]])
+; CHECK-NEXT: [[C46:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A46]], i8 [[B46]])
+; CHECK-NEXT: [[C47:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A47]], i8 [[B47]])
+; CHECK-NEXT: [[C48:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A48]], i8 [[B48]])
+; CHECK-NEXT: [[C49:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A49]], i8 [[B49]])
+; CHECK-NEXT: [[C50:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A50]], i8 [[B50]])
+; CHECK-NEXT: [[C51:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A51]], i8 [[B51]])
+; CHECK-NEXT: [[C52:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A52]], i8 [[B52]])
+; CHECK-NEXT: [[C53:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A53]], i8 [[B53]])
+; CHECK-NEXT: [[C54:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A54]], i8 [[B54]])
+; CHECK-NEXT: [[C55:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A55]], i8 [[B55]])
+; CHECK-NEXT: [[C56:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A56]], i8 [[B56]])
+; CHECK-NEXT: [[C57:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A57]], i8 [[B57]])
+; CHECK-NEXT: [[C58:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A58]], i8 [[B58]])
+; CHECK-NEXT: [[C59:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A59]], i8 [[B59]])
+; CHECK-NEXT: [[C60:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A60]], i8 [[B60]])
+; CHECK-NEXT: [[C61:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A61]], i8 [[B61]])
+; CHECK-NEXT: [[C62:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A62]], i8 [[B62]])
+; CHECK-NEXT: [[C63:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A63]], i8 [[B63]])
+; CHECK-NEXT: [[R0:%.*]] = extractvalue { i8, i1 } [[C0]], 0
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i8, i1 } [[C1]], 0
+; CHECK-NEXT: [[R2:%.*]] = extractvalue { i8, i1 } [[C2]], 0
+; CHECK-NEXT: [[R3:%.*]] = extractvalue { i8, i1 } [[C3]], 0
+; CHECK-NEXT: [[R4:%.*]] = extractvalue { i8, i1 } [[C4]], 0
+; CHECK-NEXT: [[R5:%.*]] = extractvalue { i8, i1 } [[C5]], 0
+; CHECK-NEXT: [[R6:%.*]] = extractvalue { i8, i1 } [[C6]], 0
+; CHECK-NEXT: [[R7:%.*]] = extractvalue { i8, i1 } [[C7]], 0
+; CHECK-NEXT: [[R8:%.*]] = extractvalue { i8, i1 } [[C8]], 0
+; CHECK-NEXT: [[R9:%.*]] = extractvalue { i8, i1 } [[C9]], 0
+; CHECK-NEXT: [[R10:%.*]] = extractvalue { i8, i1 } [[C10]], 0
+; CHECK-NEXT: [[R11:%.*]] = extractvalue { i8, i1 } [[C11]], 0
+; CHECK-NEXT: [[R12:%.*]] = extractvalue { i8, i1 } [[C12]], 0
+; CHECK-NEXT: [[R13:%.*]] = extractvalue { i8, i1 } [[C13]], 0
+; CHECK-NEXT: [[R14:%.*]] = extractvalue { i8, i1 } [[C14]], 0
+; CHECK-NEXT: [[R15:%.*]] = extractvalue { i8, i1 } [[C15]], 0
+; CHECK-NEXT: [[R16:%.*]] = extractvalue { i8, i1 } [[C16]], 0
+; CHECK-NEXT: [[R17:%.*]] = extractvalue { i8, i1 } [[C17]], 0
+; CHECK-NEXT: [[R18:%.*]] = extractvalue { i8, i1 } [[C18]], 0
+; CHECK-NEXT: [[R19:%.*]] = extractvalue { i8, i1 } [[C19]], 0
+; CHECK-NEXT: [[R20:%.*]] = extractvalue { i8, i1 } [[C20]], 0
+; CHECK-NEXT: [[R21:%.*]] = extractvalue { i8, i1 } [[C21]], 0
+; CHECK-NEXT: [[R22:%.*]] = extractvalue { i8, i1 } [[C22]], 0
+; CHECK-NEXT: [[R23:%.*]] = extractvalue { i8, i1 } [[C23]], 0
+; CHECK-NEXT: [[R24:%.*]] = extractvalue { i8, i1 } [[C24]], 0
+; CHECK-NEXT: [[R25:%.*]] = extractvalue { i8, i1 } [[C25]], 0
+; CHECK-NEXT: [[R26:%.*]] = extractvalue { i8, i1 } [[C26]], 0
+; CHECK-NEXT: [[R27:%.*]] = extractvalue { i8, i1 } [[C27]], 0
+; CHECK-NEXT: [[R28:%.*]] = extractvalue { i8, i1 } [[C28]], 0
+; CHECK-NEXT: [[R29:%.*]] = extractvalue { i8, i1 } [[C29]], 0
+; CHECK-NEXT: [[R30:%.*]] = extractvalue { i8, i1 } [[C30]], 0
+; CHECK-NEXT: [[R31:%.*]] = extractvalue { i8, i1 } [[C31]], 0
+; CHECK-NEXT: [[R32:%.*]] = extractvalue { i8, i1 } [[C32]], 0
+; CHECK-NEXT: [[R33:%.*]] = extractvalue { i8, i1 } [[C33]], 0
+; CHECK-NEXT: [[R34:%.*]] = extractvalue { i8, i1 } [[C34]], 0
+; CHECK-NEXT: [[R35:%.*]] = extractvalue { i8, i1 } [[C35]], 0
+; CHECK-NEXT: [[R36:%.*]] = extractvalue { i8, i1 } [[C36]], 0
+; CHECK-NEXT: [[R37:%.*]] = extractvalue { i8, i1 } [[C37]], 0
+; CHECK-NEXT: [[R38:%.*]] = extractvalue { i8, i1 } [[C38]], 0
+; CHECK-NEXT: [[R39:%.*]] = extractvalue { i8, i1 } [[C39]], 0
+; CHECK-NEXT: [[R40:%.*]] = extractvalue { i8, i1 } [[C40]], 0
+; CHECK-NEXT: [[R41:%.*]] = extractvalue { i8, i1 } [[C41]], 0
+; CHECK-NEXT: [[R42:%.*]] = extractvalue { i8, i1 } [[C42]], 0
+; CHECK-NEXT: [[R43:%.*]] = extractvalue { i8, i1 } [[C43]], 0
+; CHECK-NEXT: [[R44:%.*]] = extractvalue { i8, i1 } [[C44]], 0
+; CHECK-NEXT: [[R45:%.*]] = extractvalue { i8, i1 } [[C45]], 0
+; CHECK-NEXT: [[R46:%.*]] = extractvalue { i8, i1 } [[C46]], 0
+; CHECK-NEXT: [[R47:%.*]] = extractvalue { i8, i1 } [[C47]], 0
+; CHECK-NEXT: [[R48:%.*]] = extractvalue { i8, i1 } [[C48]], 0
+; CHECK-NEXT: [[R49:%.*]] = extractvalue { i8, i1 } [[C49]], 0
+; CHECK-NEXT: [[R50:%.*]] = extractvalue { i8, i1 } [[C50]], 0
+; CHECK-NEXT: [[R51:%.*]] = extractvalue { i8, i1 } [[C51]], 0
+; CHECK-NEXT: [[R52:%.*]] = extractvalue { i8, i1 } [[C52]], 0
+; CHECK-NEXT: [[R53:%.*]] = extractvalue { i8, i1 } [[C53]], 0
+; CHECK-NEXT: [[R54:%.*]] = extractvalue { i8, i1 } [[C54]], 0
+; CHECK-NEXT: [[R55:%.*]] = extractvalue { i8, i1 } [[C55]], 0
+; CHECK-NEXT: [[R56:%.*]] = extractvalue { i8, i1 } [[C56]], 0
+; CHECK-NEXT: [[R57:%.*]] = extractvalue { i8, i1 } [[C57]], 0
+; CHECK-NEXT: [[R58:%.*]] = extractvalue { i8, i1 } [[C58]], 0
+; CHECK-NEXT: [[R59:%.*]] = extractvalue { i8, i1 } [[C59]], 0
+; CHECK-NEXT: [[R60:%.*]] = extractvalue { i8, i1 } [[C60]], 0
+; CHECK-NEXT: [[R61:%.*]] = extractvalue { i8, i1 } [[C61]], 0
+; CHECK-NEXT: [[R62:%.*]] = extractvalue { i8, i1 } [[C62]], 0
+; CHECK-NEXT: [[R63:%.*]] = extractvalue { i8, i1 } [[C63]], 0
+; CHECK-NEXT: store i8 [[R0]], ptr @c8, align 1
+; CHECK-NEXT: store i8 [[R1]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 1), align 1
+; CHECK-NEXT: store i8 [[R2]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 2), align 1
+; CHECK-NEXT: store i8 [[R3]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 3), align 1
+; CHECK-NEXT: store i8 [[R4]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 4), align 1
+; CHECK-NEXT: store i8 [[R5]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 5), align 1
+; CHECK-NEXT: store i8 [[R6]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 6), align 1
+; CHECK-NEXT: store i8 [[R7]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 7), align 1
+; CHECK-NEXT: store i8 [[R8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 8), align 1
+; CHECK-NEXT: store i8 [[R9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 9), align 1
+; CHECK-NEXT: store i8 [[R10]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 10), align 1
+; CHECK-NEXT: store i8 [[R11]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 11), align 1
+; CHECK-NEXT: store i8 [[R12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 12), align 1
+; CHECK-NEXT: store i8 [[R13]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 13), align 1
+; CHECK-NEXT: store i8 [[R14]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 14), align 1
+; CHECK-NEXT: store i8 [[R15]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 15), align 1
+; CHECK-NEXT: store i8 [[R16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1
+; CHECK-NEXT: store i8 [[R17]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 17), align 1
+; CHECK-NEXT: store i8 [[R18]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 18), align 1
+; CHECK-NEXT: store i8 [[R19]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 19), align 1
+; CHECK-NEXT: store i8 [[R20]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 20), align 1
+; CHECK-NEXT: store i8 [[R21]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 21), align 1
+; CHECK-NEXT: store i8 [[R22]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 22), align 1
+; CHECK-NEXT: store i8 [[R23]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 23), align 1
+; CHECK-NEXT: store i8 [[R24]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 24), align 1
+; CHECK-NEXT: store i8 [[R25]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 25), align 1
+; CHECK-NEXT: store i8 [[R26]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 26), align 1
+; CHECK-NEXT: store i8 [[R27]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 27), align 1
+; CHECK-NEXT: store i8 [[R28]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 28), align 1
+; CHECK-NEXT: store i8 [[R29]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 29), align 1
+; CHECK-NEXT: store i8 [[R30]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 30), align 1
+; CHECK-NEXT: store i8 [[R31]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 31), align 1
+; CHECK-NEXT: store i8 [[R32]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
+; CHECK-NEXT: store i8 [[R33]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 33), align 1
+; CHECK-NEXT: store i8 [[R34]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 34), align 1
+; CHECK-NEXT: store i8 [[R35]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 35), align 1
+; CHECK-NEXT: store i8 [[R36]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 36), align 1
+; CHECK-NEXT: store i8 [[R37]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 37), align 1
+; CHECK-NEXT: store i8 [[R38]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 38), align 1
+; CHECK-NEXT: store i8 [[R39]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 39), align 1
+; CHECK-NEXT: store i8 [[R40]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 40), align 1
+; CHECK-NEXT: store i8 [[R41]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 41), align 1
+; CHECK-NEXT: store i8 [[R42]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 42), align 1
+; CHECK-NEXT: store i8 [[R43]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 43), align 1
+; CHECK-NEXT: store i8 [[R44]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 44), align 1
+; CHECK-NEXT: store i8 [[R45]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 45), align 1
+; CHECK-NEXT: store i8 [[R46]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 46), align 1
+; CHECK-NEXT: store i8 [[R47]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 47), align 1
+; CHECK-NEXT: store i8 [[R48]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
+; CHECK-NEXT: store i8 [[R49]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 49), align 1
+; CHECK-NEXT: store i8 [[R50]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 50), align 1
+; CHECK-NEXT: store i8 [[R51]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 51), align 1
+; CHECK-NEXT: store i8 [[R52]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 52), align 1
+; CHECK-NEXT: store i8 [[R53]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 53), align 1
+; CHECK-NEXT: store i8 [[R54]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 54), align 1
+; CHECK-NEXT: store i8 [[R55]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 55), align 1
+; CHECK-NEXT: store i8 [[R56]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 56), align 1
+; CHECK-NEXT: store i8 [[R57]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 57), align 1
+; CHECK-NEXT: store i8 [[R58]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 58), align 1
+; CHECK-NEXT: store i8 [[R59]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 59), align 1
+; CHECK-NEXT: store i8 [[R60]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 60), align 1
+; CHECK-NEXT: store i8 [[R61]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 61), align 1
+; CHECK-NEXT: store i8 [[R62]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 62), align 1
+; CHECK-NEXT: store i8 [[R63]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 63), align 1
+; CHECK-NEXT: ret void
;
%a0 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 0 ), align 1
%a1 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 1 ), align 1
@@ -1084,5 +1252,3 @@ define void @add_v64i8() {
store i8 %r63, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 63), align 1
ret void
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK: {{.*}}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-mul-smulo.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-mul-smulo.ll
index c7470f28d1c7b..72a3ddd0bb747 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-mul-smulo.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-mul-smulo.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,KNL
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512_256
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s
@a64 = common global [8 x i64] zeroinitializer, align 64
@b64 = common global [8 x i64] zeroinitializer, align 64
@@ -26,126 +26,48 @@ declare {i16, i1} @llvm.smul.with.overflow.i16(i16, i16)
declare {i8 , i1} @llvm.smul.with.overflow.i8 (i8 , i8 )
define void @mul_v8i64() {
-; SSE-LABEL: @mul_v8i64(
-; SSE-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8
-; SSE-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
-; SSE-NEXT: [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
-; SSE-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8
-; SSE-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
-; SSE-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8
-; SSE-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
-; SSE-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8
-; SSE-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8
-; SSE-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8
-; SSE-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
-; SSE-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8
-; SSE-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
-; SSE-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8
-; SSE-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
-; SSE-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8
-; SSE-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A0]], i64 [[B0]])
-; SSE-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A1]], i64 [[B1]])
-; SSE-NEXT: [[C2:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A2]], i64 [[B2]])
-; SSE-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A3]], i64 [[B3]])
-; SSE-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A4]], i64 [[B4]])
-; SSE-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A5]], i64 [[B5]])
-; SSE-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A6]], i64 [[B6]])
-; SSE-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A7]], i64 [[B7]])
-; SSE-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0
-; SSE-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0
-; SSE-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0
-; SSE-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0
-; SSE-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0
-; SSE-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0
-; SSE-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0
-; SSE-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0
-; SSE-NEXT: store i64 [[R0]], ptr @c64, align 8
-; SSE-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8
-; SSE-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
-; SSE-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8
-; SSE-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
-; SSE-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8
-; SSE-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
-; SSE-NEXT: store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8
-; SSE-NEXT: ret void
-;
-; SLM-LABEL: @mul_v8i64(
-; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @a64, align 8
-; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @b64, align 8
-; SLM-NEXT: [[TMP3:%.*]] = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]])
-; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <2 x i64>, <2 x i1> } [[TMP3]], 0
-; SLM-NEXT: store <2 x i64> [[TMP4]], ptr @c64, align 8
-; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
-; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
-; SLM-NEXT: [[TMP7:%.*]] = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> [[TMP5]], <2 x i64> [[TMP6]])
-; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <2 x i64>, <2 x i1> } [[TMP7]], 0
-; SLM-NEXT: store <2 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
-; SLM-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
-; SLM-NEXT: [[TMP10:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
-; SLM-NEXT: [[TMP11:%.*]] = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]])
-; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <2 x i64>, <2 x i1> } [[TMP11]], 0
-; SLM-NEXT: store <2 x i64> [[TMP12]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
-; SLM-NEXT: [[TMP13:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
-; SLM-NEXT: [[TMP14:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
-; SLM-NEXT: [[TMP15:%.*]] = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> [[TMP13]], <2 x i64> [[TMP14]])
-; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <2 x i64>, <2 x i1> } [[TMP15]], 0
-; SLM-NEXT: store <2 x i64> [[TMP16]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
-; SLM-NEXT: ret void
-;
-; AVX-LABEL: @mul_v8i64(
-; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8
-; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8
-; AVX-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]])
-; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0
-; AVX-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8
-; AVX-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
-; AVX-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
-; AVX-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]])
-; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0
-; AVX-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
-; AVX-NEXT: ret void
-;
-; AVX2-LABEL: @mul_v8i64(
-; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8
-; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8
-; AVX2-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]])
-; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0
-; AVX2-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8
-; AVX2-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
-; AVX2-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
-; AVX2-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]])
-; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0
-; AVX2-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
-; AVX2-NEXT: ret void
-;
-; KNL-LABEL: @mul_v8i64(
-; KNL-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8
-; KNL-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8
-; KNL-NEXT: [[TMP3:%.*]] = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]])
-; KNL-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i64>, <8 x i1> } [[TMP3]], 0
-; KNL-NEXT: store <8 x i64> [[TMP4]], ptr @c64, align 8
-; KNL-NEXT: ret void
-;
-; AVX512-LABEL: @mul_v8i64(
-; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8
-; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8
-; AVX512-NEXT: [[TMP3:%.*]] = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]])
-; AVX512-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i64>, <8 x i1> } [[TMP3]], 0
-; AVX512-NEXT: store <8 x i64> [[TMP4]], ptr @c64, align 8
-; AVX512-NEXT: ret void
-;
-; AVX512_256-LABEL: @mul_v8i64(
-; AVX512_256-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8
-; AVX512_256-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8
-; AVX512_256-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]])
-; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0
-; AVX512_256-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8
-; AVX512_256-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
-; AVX512_256-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
-; AVX512_256-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]])
-; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0
-; AVX512_256-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
-; AVX512_256-NEXT: ret void
+; CHECK-LABEL: @mul_v8i64(
+; CHECK-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8
+; CHECK-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
+; CHECK-NEXT: [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
+; CHECK-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8
+; CHECK-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
+; CHECK-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8
+; CHECK-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
+; CHECK-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8
+; CHECK-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8
+; CHECK-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8
+; CHECK-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
+; CHECK-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8
+; CHECK-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
+; CHECK-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8
+; CHECK-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
+; CHECK-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8
+; CHECK-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A0]], i64 [[B0]])
+; CHECK-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A1]], i64 [[B1]])
+; CHECK-NEXT: [[C2:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A2]], i64 [[B2]])
+; CHECK-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A3]], i64 [[B3]])
+; CHECK-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A4]], i64 [[B4]])
+; CHECK-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A5]], i64 [[B5]])
+; CHECK-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A6]], i64 [[B6]])
+; CHECK-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A7]], i64 [[B7]])
+; CHECK-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0
+; CHECK-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0
+; CHECK-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0
+; CHECK-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0
+; CHECK-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0
+; CHECK-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0
+; CHECK-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0
+; CHECK-NEXT: store i64 [[R0]], ptr @c64, align 8
+; CHECK-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8
+; CHECK-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
+; CHECK-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8
+; CHECK-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
+; CHECK-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8
+; CHECK-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
+; CHECK-NEXT: store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8
+; CHECK-NEXT: ret void
;
%a0 = load i64, ptr @a64, align 8
%a1 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
@@ -191,226 +113,88 @@ define void @mul_v8i64() {
}
define void @mul_v16i32() {
-; SSE-LABEL: @mul_v16i32(
-; SSE-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4
-; SSE-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4
-; SSE-NEXT: [[A2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 2), align 4
-; SSE-NEXT: [[A3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 3), align 4
-; SSE-NEXT: [[A4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4
-; SSE-NEXT: [[A5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 5), align 4
-; SSE-NEXT: [[A6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 6), align 4
-; SSE-NEXT: [[A7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 7), align 4
-; SSE-NEXT: [[A8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
-; SSE-NEXT: [[A9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 9), align 4
-; SSE-NEXT: [[A10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 10), align 4
-; SSE-NEXT: [[A11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 11), align 4
-; SSE-NEXT: [[A12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4
-; SSE-NEXT: [[A13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 13), align 4
-; SSE-NEXT: [[A14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 14), align 4
-; SSE-NEXT: [[A15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 15), align 4
-; SSE-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4
-; SSE-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4
-; SSE-NEXT: [[B2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 2), align 4
-; SSE-NEXT: [[B3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 3), align 4
-; SSE-NEXT: [[B4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4
-; SSE-NEXT: [[B5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 5), align 4
-; SSE-NEXT: [[B6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 6), align 4
-; SSE-NEXT: [[B7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 7), align 4
-; SSE-NEXT: [[B8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
-; SSE-NEXT: [[B9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 9), align 4
-; SSE-NEXT: [[B10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 10), align 4
-; SSE-NEXT: [[B11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 11), align 4
-; SSE-NEXT: [[B12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4
-; SSE-NEXT: [[B13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 13), align 4
-; SSE-NEXT: [[B14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 14), align 4
-; SSE-NEXT: [[B15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 15), align 4
-; SSE-NEXT: [[C0:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A0]], i32 [[B0]])
-; SSE-NEXT: [[C1:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A1]], i32 [[B1]])
-; SSE-NEXT: [[C2:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A2]], i32 [[B2]])
-; SSE-NEXT: [[C3:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A3]], i32 [[B3]])
-; SSE-NEXT: [[C4:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A4]], i32 [[B4]])
-; SSE-NEXT: [[C5:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A5]], i32 [[B5]])
-; SSE-NEXT: [[C6:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A6]], i32 [[B6]])
-; SSE-NEXT: [[C7:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A7]], i32 [[B7]])
-; SSE-NEXT: [[C8:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A8]], i32 [[B8]])
-; SSE-NEXT: [[C9:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A9]], i32 [[B9]])
-; SSE-NEXT: [[C10:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A10]], i32 [[B10]])
-; SSE-NEXT: [[C11:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A11]], i32 [[B11]])
-; SSE-NEXT: [[C12:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A12]], i32 [[B12]])
-; SSE-NEXT: [[C13:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A13]], i32 [[B13]])
-; SSE-NEXT: [[C14:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A14]], i32 [[B14]])
-; SSE-NEXT: [[C15:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A15]], i32 [[B15]])
-; SSE-NEXT: [[R0:%.*]] = extractvalue { i32, i1 } [[C0]], 0
-; SSE-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[C1]], 0
-; SSE-NEXT: [[R2:%.*]] = extractvalue { i32, i1 } [[C2]], 0
-; SSE-NEXT: [[R3:%.*]] = extractvalue { i32, i1 } [[C3]], 0
-; SSE-NEXT: [[R4:%.*]] = extractvalue { i32, i1 } [[C4]], 0
-; SSE-NEXT: [[R5:%.*]] = extractvalue { i32, i1 } [[C5]], 0
-; SSE-NEXT: [[R6:%.*]] = extractvalue { i32, i1 } [[C6]], 0
-; SSE-NEXT: [[R7:%.*]] = extractvalue { i32, i1 } [[C7]], 0
-; SSE-NEXT: [[R8:%.*]] = extractvalue { i32, i1 } [[C8]], 0
-; SSE-NEXT: [[R9:%.*]] = extractvalue { i32, i1 } [[C9]], 0
-; SSE-NEXT: [[R10:%.*]] = extractvalue { i32, i1 } [[C10]], 0
-; SSE-NEXT: [[R11:%.*]] = extractvalue { i32, i1 } [[C11]], 0
-; SSE-NEXT: [[R12:%.*]] = extractvalue { i32, i1 } [[C12]], 0
-; SSE-NEXT: [[R13:%.*]] = extractvalue { i32, i1 } [[C13]], 0
-; SSE-NEXT: [[R14:%.*]] = extractvalue { i32, i1 } [[C14]], 0
-; SSE-NEXT: [[R15:%.*]] = extractvalue { i32, i1 } [[C15]], 0
-; SSE-NEXT: store i32 [[R0]], ptr @c32, align 4
-; SSE-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 1), align 4
-; SSE-NEXT: store i32 [[R2]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 2), align 4
-; SSE-NEXT: store i32 [[R3]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 3), align 4
-; SSE-NEXT: store i32 [[R4]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4
-; SSE-NEXT: store i32 [[R5]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 5), align 4
-; SSE-NEXT: store i32 [[R6]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 6), align 4
-; SSE-NEXT: store i32 [[R7]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 7), align 4
-; SSE-NEXT: store i32 [[R8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4
-; SSE-NEXT: store i32 [[R9]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 9), align 4
-; SSE-NEXT: store i32 [[R10]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 10), align 4
-; SSE-NEXT: store i32 [[R11]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 11), align 4
-; SSE-NEXT: store i32 [[R12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4
-; SSE-NEXT: store i32 [[R13]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 13), align 4
-; SSE-NEXT: store i32 [[R14]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 14), align 4
-; SSE-NEXT: store i32 [[R15]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 15), align 4
-; SSE-NEXT: ret void
-;
-; SLM-LABEL: @mul_v16i32(
-; SLM-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4
-; SLM-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4
-; SLM-NEXT: [[A2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 2), align 4
-; SLM-NEXT: [[A3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 3), align 4
-; SLM-NEXT: [[A4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4
-; SLM-NEXT: [[A5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 5), align 4
-; SLM-NEXT: [[A6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 6), align 4
-; SLM-NEXT: [[A7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 7), align 4
-; SLM-NEXT: [[A8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
-; SLM-NEXT: [[A9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 9), align 4
-; SLM-NEXT: [[A10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 10), align 4
-; SLM-NEXT: [[A11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 11), align 4
-; SLM-NEXT: [[A12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4
-; SLM-NEXT: [[A13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 13), align 4
-; SLM-NEXT: [[A14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 14), align 4
-; SLM-NEXT: [[A15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 15), align 4
-; SLM-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4
-; SLM-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4
-; SLM-NEXT: [[B2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 2), align 4
-; SLM-NEXT: [[B3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 3), align 4
-; SLM-NEXT: [[B4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4
-; SLM-NEXT: [[B5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 5), align 4
-; SLM-NEXT: [[B6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 6), align 4
-; SLM-NEXT: [[B7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 7), align 4
-; SLM-NEXT: [[B8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
-; SLM-NEXT: [[B9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 9), align 4
-; SLM-NEXT: [[B10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 10), align 4
-; SLM-NEXT: [[B11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 11), align 4
-; SLM-NEXT: [[B12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4
-; SLM-NEXT: [[B13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 13), align 4
-; SLM-NEXT: [[B14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 14), align 4
-; SLM-NEXT: [[B15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 15), align 4
-; SLM-NEXT: [[C0:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A0]], i32 [[B0]])
-; SLM-NEXT: [[C1:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A1]], i32 [[B1]])
-; SLM-NEXT: [[C2:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A2]], i32 [[B2]])
-; SLM-NEXT: [[C3:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A3]], i32 [[B3]])
-; SLM-NEXT: [[C4:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A4]], i32 [[B4]])
-; SLM-NEXT: [[C5:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A5]], i32 [[B5]])
-; SLM-NEXT: [[C6:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A6]], i32 [[B6]])
-; SLM-NEXT: [[C7:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A7]], i32 [[B7]])
-; SLM-NEXT: [[C8:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A8]], i32 [[B8]])
-; SLM-NEXT: [[C9:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A9]], i32 [[B9]])
-; SLM-NEXT: [[C10:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A10]], i32 [[B10]])
-; SLM-NEXT: [[C11:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A11]], i32 [[B11]])
-; SLM-NEXT: [[C12:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A12]], i32 [[B12]])
-; SLM-NEXT: [[C13:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A13]], i32 [[B13]])
-; SLM-NEXT: [[C14:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A14]], i32 [[B14]])
-; SLM-NEXT: [[C15:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A15]], i32 [[B15]])
-; SLM-NEXT: [[R0:%.*]] = extractvalue { i32, i1 } [[C0]], 0
-; SLM-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[C1]], 0
-; SLM-NEXT: [[R2:%.*]] = extractvalue { i32, i1 } [[C2]], 0
-; SLM-NEXT: [[R3:%.*]] = extractvalue { i32, i1 } [[C3]], 0
-; SLM-NEXT: [[R4:%.*]] = extractvalue { i32, i1 } [[C4]], 0
-; SLM-NEXT: [[R5:%.*]] = extractvalue { i32, i1 } [[C5]], 0
-; SLM-NEXT: [[R6:%.*]] = extractvalue { i32, i1 } [[C6]], 0
-; SLM-NEXT: [[R7:%.*]] = extractvalue { i32, i1 } [[C7]], 0
-; SLM-NEXT: [[R8:%.*]] = extractvalue { i32, i1 } [[C8]], 0
-; SLM-NEXT: [[R9:%.*]] = extractvalue { i32, i1 } [[C9]], 0
-; SLM-NEXT: [[R10:%.*]] = extractvalue { i32, i1 } [[C10]], 0
-; SLM-NEXT: [[R11:%.*]] = extractvalue { i32, i1 } [[C11]], 0
-; SLM-NEXT: [[R12:%.*]] = extractvalue { i32, i1 } [[C12]], 0
-; SLM-NEXT: [[R13:%.*]] = extractvalue { i32, i1 } [[C13]], 0
-; SLM-NEXT: [[R14:%.*]] = extractvalue { i32, i1 } [[C14]], 0
-; SLM-NEXT: [[R15:%.*]] = extractvalue { i32, i1 } [[C15]], 0
-; SLM-NEXT: store i32 [[R0]], ptr @c32, align 4
-; SLM-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 1), align 4
-; SLM-NEXT: store i32 [[R2]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 2), align 4
-; SLM-NEXT: store i32 [[R3]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 3), align 4
-; SLM-NEXT: store i32 [[R4]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4
-; SLM-NEXT: store i32 [[R5]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 5), align 4
-; SLM-NEXT: store i32 [[R6]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 6), align 4
-; SLM-NEXT: store i32 [[R7]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 7), align 4
-; SLM-NEXT: store i32 [[R8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4
-; SLM-NEXT: store i32 [[R9]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 9), align 4
-; SLM-NEXT: store i32 [[R10]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 10), align 4
-; SLM-NEXT: store i32 [[R11]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 11), align 4
-; SLM-NEXT: store i32 [[R12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4
-; SLM-NEXT: store i32 [[R13]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 13), align 4
-; SLM-NEXT: store i32 [[R14]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 14), align 4
-; SLM-NEXT: store i32 [[R15]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 15), align 4
-; SLM-NEXT: ret void
-;
-; AVX-LABEL: @mul_v16i32(
-; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4
-; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4
-; AVX-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]])
-; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0
-; AVX-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4
-; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
-; AVX-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
-; AVX-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]])
-; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0
-; AVX-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4
-; AVX-NEXT: ret void
-;
-; AVX2-LABEL: @mul_v16i32(
-; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4
-; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4
-; AVX2-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]])
-; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0
-; AVX2-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4
-; AVX2-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
-; AVX2-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
-; AVX2-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]])
-; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0
-; AVX2-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4
-; AVX2-NEXT: ret void
-;
-; KNL-LABEL: @mul_v16i32(
-; KNL-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4
-; KNL-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4
-; KNL-NEXT: [[TMP3:%.*]] = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]])
-; KNL-NEXT: [[TMP34:%.*]] = extractvalue { <16 x i32>, <16 x i1> } [[TMP3]], 0
-; KNL-NEXT: store <16 x i32> [[TMP34]], ptr @c32, align 4
-; KNL-NEXT: ret void
-;
-; AVX512-LABEL: @mul_v16i32(
-; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4
-; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4
-; AVX512-NEXT: [[TMP3:%.*]] = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]])
-; AVX512-NEXT: [[TMP34:%.*]] = extractvalue { <16 x i32>, <16 x i1> } [[TMP3]], 0
-; AVX512-NEXT: store <16 x i32> [[TMP34]], ptr @c32, align 4
-; AVX512-NEXT: ret void
-;
-; AVX512_256-LABEL: @mul_v16i32(
-; AVX512_256-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4
-; AVX512_256-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4
-; AVX512_256-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]])
-; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0
-; AVX512_256-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4
-; AVX512_256-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
-; AVX512_256-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
-; AVX512_256-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]])
-; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0
-; AVX512_256-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4
-; AVX512_256-NEXT: ret void
+; CHECK-LABEL: @mul_v16i32(
+; CHECK-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4
+; CHECK-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4
+; CHECK-NEXT: [[A2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 2), align 4
+; CHECK-NEXT: [[A3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 3), align 4
+; CHECK-NEXT: [[A4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4
+; CHECK-NEXT: [[A5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 5), align 4
+; CHECK-NEXT: [[A6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 6), align 4
+; CHECK-NEXT: [[A7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 7), align 4
+; CHECK-NEXT: [[A8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
+; CHECK-NEXT: [[A9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 9), align 4
+; CHECK-NEXT: [[A10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 10), align 4
+; CHECK-NEXT: [[A11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 11), align 4
+; CHECK-NEXT: [[A12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4
+; CHECK-NEXT: [[A13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 13), align 4
+; CHECK-NEXT: [[A14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 14), align 4
+; CHECK-NEXT: [[A15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 15), align 4
+; CHECK-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4
+; CHECK-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4
+; CHECK-NEXT: [[B2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 2), align 4
+; CHECK-NEXT: [[B3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 3), align 4
+; CHECK-NEXT: [[B4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4
+; CHECK-NEXT: [[B5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 5), align 4
+; CHECK-NEXT: [[B6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 6), align 4
+; CHECK-NEXT: [[B7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 7), align 4
+; CHECK-NEXT: [[B8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
+; CHECK-NEXT: [[B9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 9), align 4
+; CHECK-NEXT: [[B10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 10), align 4
+; CHECK-NEXT: [[B11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 11), align 4
+; CHECK-NEXT: [[B12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4
+; CHECK-NEXT: [[B13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 13), align 4
+; CHECK-NEXT: [[B14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 14), align 4
+; CHECK-NEXT: [[B15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 15), align 4
+; CHECK-NEXT: [[C0:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A0]], i32 [[B0]])
+; CHECK-NEXT: [[C1:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A1]], i32 [[B1]])
+; CHECK-NEXT: [[C2:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A2]], i32 [[B2]])
+; CHECK-NEXT: [[C3:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A3]], i32 [[B3]])
+; CHECK-NEXT: [[C4:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A4]], i32 [[B4]])
+; CHECK-NEXT: [[C5:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A5]], i32 [[B5]])
+; CHECK-NEXT: [[C6:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A6]], i32 [[B6]])
+; CHECK-NEXT: [[C7:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A7]], i32 [[B7]])
+; CHECK-NEXT: [[C8:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A8]], i32 [[B8]])
+; CHECK-NEXT: [[C9:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A9]], i32 [[B9]])
+; CHECK-NEXT: [[C10:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A10]], i32 [[B10]])
+; CHECK-NEXT: [[C11:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A11]], i32 [[B11]])
+; CHECK-NEXT: [[C12:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A12]], i32 [[B12]])
+; CHECK-NEXT: [[C13:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A13]], i32 [[B13]])
+; CHECK-NEXT: [[C14:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A14]], i32 [[B14]])
+; CHECK-NEXT: [[C15:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A15]], i32 [[B15]])
+; CHECK-NEXT: [[R0:%.*]] = extractvalue { i32, i1 } [[C0]], 0
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[C1]], 0
+; CHECK-NEXT: [[R2:%.*]] = extractvalue { i32, i1 } [[C2]], 0
+; CHECK-NEXT: [[R3:%.*]] = extractvalue { i32, i1 } [[C3]], 0
+; CHECK-NEXT: [[R4:%.*]] = extractvalue { i32, i1 } [[C4]], 0
+; CHECK-NEXT: [[R5:%.*]] = extractvalue { i32, i1 } [[C5]], 0
+; CHECK-NEXT: [[R6:%.*]] = extractvalue { i32, i1 } [[C6]], 0
+; CHECK-NEXT: [[R7:%.*]] = extractvalue { i32, i1 } [[C7]], 0
+; CHECK-NEXT: [[R8:%.*]] = extractvalue { i32, i1 } [[C8]], 0
+; CHECK-NEXT: [[R9:%.*]] = extractvalue { i32, i1 } [[C9]], 0
+; CHECK-NEXT: [[R10:%.*]] = extractvalue { i32, i1 } [[C10]], 0
+; CHECK-NEXT: [[R11:%.*]] = extractvalue { i32, i1 } [[C11]], 0
+; CHECK-NEXT: [[R12:%.*]] = extractvalue { i32, i1 } [[C12]], 0
+; CHECK-NEXT: [[R13:%.*]] = extractvalue { i32, i1 } [[C13]], 0
+; CHECK-NEXT: [[R14:%.*]] = extractvalue { i32, i1 } [[C14]], 0
+; CHECK-NEXT: [[R15:%.*]] = extractvalue { i32, i1 } [[C15]], 0
+; CHECK-NEXT: store i32 [[R0]], ptr @c32, align 4
+; CHECK-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 1), align 4
+; CHECK-NEXT: store i32 [[R2]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 2), align 4
+; CHECK-NEXT: store i32 [[R3]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 3), align 4
+; CHECK-NEXT: store i32 [[R4]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4
+; CHECK-NEXT: store i32 [[R5]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 5), align 4
+; CHECK-NEXT: store i32 [[R6]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 6), align 4
+; CHECK-NEXT: store i32 [[R7]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 7), align 4
+; CHECK-NEXT: store i32 [[R8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4
+; CHECK-NEXT: store i32 [[R9]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 9), align 4
+; CHECK-NEXT: store i32 [[R10]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 10), align 4
+; CHECK-NEXT: store i32 [[R11]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 11), align 4
+; CHECK-NEXT: store i32 [[R12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4
+; CHECK-NEXT: store i32 [[R13]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 13), align 4
+; CHECK-NEXT: store i32 [[R14]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 14), align 4
+; CHECK-NEXT: store i32 [[R15]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 15), align 4
+; CHECK-NEXT: ret void
;
%a0 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 0 ), align 4
%a1 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1 ), align 4
@@ -496,106 +280,168 @@ define void @mul_v16i32() {
}
define void @mul_v32i16() {
-; SSE-LABEL: @mul_v32i16(
-; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2
-; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2
-; SSE-NEXT: [[TMP3:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
-; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP3]], 0
-; SSE-NEXT: store <8 x i16> [[TMP4]], ptr @c16, align 2
-; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2
-; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2
-; SSE-NEXT: [[TMP7:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> [[TMP5]], <8 x i16> [[TMP6]])
-; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP7]], 0
-; SSE-NEXT: store <8 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2
-; SSE-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
-; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
-; SSE-NEXT: [[TMP11:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]])
-; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP11]], 0
-; SSE-NEXT: store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2
-; SSE-NEXT: [[TMP13:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2
-; SSE-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2
-; SSE-NEXT: [[TMP15:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> [[TMP13]], <8 x i16> [[TMP14]])
-; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP15]], 0
-; SSE-NEXT: store <8 x i16> [[TMP16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2
-; SSE-NEXT: ret void
-;
-; SLM-LABEL: @mul_v32i16(
-; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2
-; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2
-; SLM-NEXT: [[TMP3:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
-; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP3]], 0
-; SLM-NEXT: store <8 x i16> [[TMP4]], ptr @c16, align 2
-; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2
-; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2
-; SLM-NEXT: [[TMP7:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> [[TMP5]], <8 x i16> [[TMP6]])
-; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP7]], 0
-; SLM-NEXT: store <8 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2
-; SLM-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
-; SLM-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
-; SLM-NEXT: [[TMP11:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]])
-; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP11]], 0
-; SLM-NEXT: store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2
-; SLM-NEXT: [[TMP13:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2
-; SLM-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2
-; SLM-NEXT: [[TMP15:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> [[TMP13]], <8 x i16> [[TMP14]])
-; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP15]], 0
-; SLM-NEXT: store <8 x i16> [[TMP16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2
-; SLM-NEXT: ret void
-;
-; AVX-LABEL: @mul_v32i16(
-; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2
-; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2
-; AVX-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]])
-; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0
-; AVX-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2
-; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
-; AVX-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
-; AVX-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]])
-; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0
-; AVX-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2
-; AVX-NEXT: ret void
-;
-; AVX2-LABEL: @mul_v32i16(
-; AVX2-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2
-; AVX2-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2
-; AVX2-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]])
-; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0
-; AVX2-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2
-; AVX2-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
-; AVX2-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
-; AVX2-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]])
-; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0
-; AVX2-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2
-; AVX2-NEXT: ret void
-;
-; KNL-LABEL: @mul_v32i16(
-; KNL-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2
-; KNL-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2
-; KNL-NEXT: [[TMP3:%.*]] = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]])
-; KNL-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i16>, <32 x i1> } [[TMP3]], 0
-; KNL-NEXT: store <32 x i16> [[TMP4]], ptr @c16, align 2
-; KNL-NEXT: ret void
-;
-; AVX512-LABEL: @mul_v32i16(
-; AVX512-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2
-; AVX512-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2
-; AVX512-NEXT: [[TMP3:%.*]] = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]])
-; AVX512-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i16>, <32 x i1> } [[TMP3]], 0
-; AVX512-NEXT: store <32 x i16> [[TMP4]], ptr @c16, align 2
-; AVX512-NEXT: ret void
-;
-; AVX512_256-LABEL: @mul_v32i16(
-; AVX512_256-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2
-; AVX512_256-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2
-; AVX512_256-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]])
-; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0
-; AVX512_256-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2
-; AVX512_256-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
-; AVX512_256-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
-; AVX512_256-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]])
-; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0
-; AVX512_256-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2
-; AVX512_256-NEXT: ret void
+; CHECK-LABEL: @mul_v32i16(
+; CHECK-NEXT: [[A0:%.*]] = load i16, ptr @a16, align 2
+; CHECK-NEXT: [[A1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1), align 2
+; CHECK-NEXT: [[A2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 2), align 2
+; CHECK-NEXT: [[A3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 3), align 2
+; CHECK-NEXT: [[A4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 4), align 2
+; CHECK-NEXT: [[A5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 5), align 2
+; CHECK-NEXT: [[A6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 6), align 2
+; CHECK-NEXT: [[A7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 7), align 2
+; CHECK-NEXT: [[A8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2
+; CHECK-NEXT: [[A9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 9), align 2
+; CHECK-NEXT: [[A10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 10), align 2
+; CHECK-NEXT: [[A11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 11), align 2
+; CHECK-NEXT: [[A12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 12), align 2
+; CHECK-NEXT: [[A13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 13), align 2
+; CHECK-NEXT: [[A14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 14), align 2
+; CHECK-NEXT: [[A15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 15), align 2
+; CHECK-NEXT: [[A16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
+; CHECK-NEXT: [[A17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 17), align 2
+; CHECK-NEXT: [[A18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 18), align 2
+; CHECK-NEXT: [[A19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 19), align 2
+; CHECK-NEXT: [[A20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 20), align 2
+; CHECK-NEXT: [[A21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 21), align 2
+; CHECK-NEXT: [[A22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 22), align 2
+; CHECK-NEXT: [[A23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 23), align 2
+; CHECK-NEXT: [[A24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2
+; CHECK-NEXT: [[A25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 25), align 2
+; CHECK-NEXT: [[A26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 26), align 2
+; CHECK-NEXT: [[A27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 27), align 2
+; CHECK-NEXT: [[A28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 28), align 2
+; CHECK-NEXT: [[A29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 29), align 2
+; CHECK-NEXT: [[A30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 30), align 2
+; CHECK-NEXT: [[A31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 31), align 2
+; CHECK-NEXT: [[B0:%.*]] = load i16, ptr @b16, align 2
+; CHECK-NEXT: [[B1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 1), align 2
+; CHECK-NEXT: [[B2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 2), align 2
+; CHECK-NEXT: [[B3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 3), align 2
+; CHECK-NEXT: [[B4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 4), align 2
+; CHECK-NEXT: [[B5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 5), align 2
+; CHECK-NEXT: [[B6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 6), align 2
+; CHECK-NEXT: [[B7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 7), align 2
+; CHECK-NEXT: [[B8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2
+; CHECK-NEXT: [[B9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 9), align 2
+; CHECK-NEXT: [[B10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 10), align 2
+; CHECK-NEXT: [[B11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 11), align 2
+; CHECK-NEXT: [[B12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 12), align 2
+; CHECK-NEXT: [[B13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 13), align 2
+; CHECK-NEXT: [[B14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 14), align 2
+; CHECK-NEXT: [[B15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 15), align 2
+; CHECK-NEXT: [[B16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
+; CHECK-NEXT: [[B17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 17), align 2
+; CHECK-NEXT: [[B18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 18), align 2
+; CHECK-NEXT: [[B19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 19), align 2
+; CHECK-NEXT: [[B20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 20), align 2
+; CHECK-NEXT: [[B21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 21), align 2
+; CHECK-NEXT: [[B22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 22), align 2
+; CHECK-NEXT: [[B23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 23), align 2
+; CHECK-NEXT: [[B24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2
+; CHECK-NEXT: [[B25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 25), align 2
+; CHECK-NEXT: [[B26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 26), align 2
+; CHECK-NEXT: [[B27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 27), align 2
+; CHECK-NEXT: [[B28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 28), align 2
+; CHECK-NEXT: [[B29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 29), align 2
+; CHECK-NEXT: [[B30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 30), align 2
+; CHECK-NEXT: [[B31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 31), align 2
+; CHECK-NEXT: [[C0:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A0]], i16 [[B0]])
+; CHECK-NEXT: [[C1:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A1]], i16 [[B1]])
+; CHECK-NEXT: [[C2:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A2]], i16 [[B2]])
+; CHECK-NEXT: [[C3:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A3]], i16 [[B3]])
+; CHECK-NEXT: [[C4:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A4]], i16 [[B4]])
+; CHECK-NEXT: [[C5:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A5]], i16 [[B5]])
+; CHECK-NEXT: [[C6:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A6]], i16 [[B6]])
+; CHECK-NEXT: [[C7:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A7]], i16 [[B7]])
+; CHECK-NEXT: [[C8:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A8]], i16 [[B8]])
+; CHECK-NEXT: [[C9:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A9]], i16 [[B9]])
+; CHECK-NEXT: [[C10:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A10]], i16 [[B10]])
+; CHECK-NEXT: [[C11:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A11]], i16 [[B11]])
+; CHECK-NEXT: [[C12:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A12]], i16 [[B12]])
+; CHECK-NEXT: [[C13:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A13]], i16 [[B13]])
+; CHECK-NEXT: [[C14:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A14]], i16 [[B14]])
+; CHECK-NEXT: [[C15:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A15]], i16 [[B15]])
+; CHECK-NEXT: [[C16:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A16]], i16 [[B16]])
+; CHECK-NEXT: [[C17:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A17]], i16 [[B17]])
+; CHECK-NEXT: [[C18:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A18]], i16 [[B18]])
+; CHECK-NEXT: [[C19:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A19]], i16 [[B19]])
+; CHECK-NEXT: [[C20:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A20]], i16 [[B20]])
+; CHECK-NEXT: [[C21:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A21]], i16 [[B21]])
+; CHECK-NEXT: [[C22:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A22]], i16 [[B22]])
+; CHECK-NEXT: [[C23:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A23]], i16 [[B23]])
+; CHECK-NEXT: [[C24:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A24]], i16 [[B24]])
+; CHECK-NEXT: [[C25:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A25]], i16 [[B25]])
+; CHECK-NEXT: [[C26:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A26]], i16 [[B26]])
+; CHECK-NEXT: [[C27:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A27]], i16 [[B27]])
+; CHECK-NEXT: [[C28:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A28]], i16 [[B28]])
+; CHECK-NEXT: [[C29:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A29]], i16 [[B29]])
+; CHECK-NEXT: [[C30:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A30]], i16 [[B30]])
+; CHECK-NEXT: [[C31:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A31]], i16 [[B31]])
+; CHECK-NEXT: [[R0:%.*]] = extractvalue { i16, i1 } [[C0]], 0
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i16, i1 } [[C1]], 0
+; CHECK-NEXT: [[R2:%.*]] = extractvalue { i16, i1 } [[C2]], 0
+; CHECK-NEXT: [[R3:%.*]] = extractvalue { i16, i1 } [[C3]], 0
+; CHECK-NEXT: [[R4:%.*]] = extractvalue { i16, i1 } [[C4]], 0
+; CHECK-NEXT: [[R5:%.*]] = extractvalue { i16, i1 } [[C5]], 0
+; CHECK-NEXT: [[R6:%.*]] = extractvalue { i16, i1 } [[C6]], 0
+; CHECK-NEXT: [[R7:%.*]] = extractvalue { i16, i1 } [[C7]], 0
+; CHECK-NEXT: [[R8:%.*]] = extractvalue { i16, i1 } [[C8]], 0
+; CHECK-NEXT: [[R9:%.*]] = extractvalue { i16, i1 } [[C9]], 0
+; CHECK-NEXT: [[R10:%.*]] = extractvalue { i16, i1 } [[C10]], 0
+; CHECK-NEXT: [[R11:%.*]] = extractvalue { i16, i1 } [[C11]], 0
+; CHECK-NEXT: [[R12:%.*]] = extractvalue { i16, i1 } [[C12]], 0
+; CHECK-NEXT: [[R13:%.*]] = extractvalue { i16, i1 } [[C13]], 0
+; CHECK-NEXT: [[R14:%.*]] = extractvalue { i16, i1 } [[C14]], 0
+; CHECK-NEXT: [[R15:%.*]] = extractvalue { i16, i1 } [[C15]], 0
+; CHECK-NEXT: [[R16:%.*]] = extractvalue { i16, i1 } [[C16]], 0
+; CHECK-NEXT: [[R17:%.*]] = extractvalue { i16, i1 } [[C17]], 0
+; CHECK-NEXT: [[R18:%.*]] = extractvalue { i16, i1 } [[C18]], 0
+; CHECK-NEXT: [[R19:%.*]] = extractvalue { i16, i1 } [[C19]], 0
+; CHECK-NEXT: [[R20:%.*]] = extractvalue { i16, i1 } [[C20]], 0
+; CHECK-NEXT: [[R21:%.*]] = extractvalue { i16, i1 } [[C21]], 0
+; CHECK-NEXT: [[R22:%.*]] = extractvalue { i16, i1 } [[C22]], 0
+; CHECK-NEXT: [[R23:%.*]] = extractvalue { i16, i1 } [[C23]], 0
+; CHECK-NEXT: [[R24:%.*]] = extractvalue { i16, i1 } [[C24]], 0
+; CHECK-NEXT: [[R25:%.*]] = extractvalue { i16, i1 } [[C25]], 0
+; CHECK-NEXT: [[R26:%.*]] = extractvalue { i16, i1 } [[C26]], 0
+; CHECK-NEXT: [[R27:%.*]] = extractvalue { i16, i1 } [[C27]], 0
+; CHECK-NEXT: [[R28:%.*]] = extractvalue { i16, i1 } [[C28]], 0
+; CHECK-NEXT: [[R29:%.*]] = extractvalue { i16, i1 } [[C29]], 0
+; CHECK-NEXT: [[R30:%.*]] = extractvalue { i16, i1 } [[C30]], 0
+; CHECK-NEXT: [[R31:%.*]] = extractvalue { i16, i1 } [[C31]], 0
+; CHECK-NEXT: store i16 [[R0]], ptr @c16, align 2
+; CHECK-NEXT: store i16 [[R1]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 1), align 2
+; CHECK-NEXT: store i16 [[R2]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 2), align 2
+; CHECK-NEXT: store i16 [[R3]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 3), align 2
+; CHECK-NEXT: store i16 [[R4]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 4), align 2
+; CHECK-NEXT: store i16 [[R5]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 5), align 2
+; CHECK-NEXT: store i16 [[R6]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 6), align 2
+; CHECK-NEXT: store i16 [[R7]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 7), align 2
+; CHECK-NEXT: store i16 [[R8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2
+; CHECK-NEXT: store i16 [[R9]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 9), align 2
+; CHECK-NEXT: store i16 [[R10]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 10), align 2
+; CHECK-NEXT: store i16 [[R11]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 11), align 2
+; CHECK-NEXT: store i16 [[R12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 12), align 2
+; CHECK-NEXT: store i16 [[R13]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 13), align 2
+; CHECK-NEXT: store i16 [[R14]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 14), align 2
+; CHECK-NEXT: store i16 [[R15]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 15), align 2
+; CHECK-NEXT: store i16 [[R16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2
+; CHECK-NEXT: store i16 [[R17]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 17), align 2
+; CHECK-NEXT: store i16 [[R18]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 18), align 2
+; CHECK-NEXT: store i16 [[R19]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 19), align 2
+; CHECK-NEXT: store i16 [[R20]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 20), align 2
+; CHECK-NEXT: store i16 [[R21]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 21), align 2
+; CHECK-NEXT: store i16 [[R22]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 22), align 2
+; CHECK-NEXT: store i16 [[R23]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 23), align 2
+; CHECK-NEXT: store i16 [[R24]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2
+; CHECK-NEXT: store i16 [[R25]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 25), align 2
+; CHECK-NEXT: store i16 [[R26]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 26), align 2
+; CHECK-NEXT: store i16 [[R27]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 27), align 2
+; CHECK-NEXT: store i16 [[R28]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 28), align 2
+; CHECK-NEXT: store i16 [[R29]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 29), align 2
+; CHECK-NEXT: store i16 [[R30]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 30), align 2
+; CHECK-NEXT: store i16 [[R31]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 31), align 2
+; CHECK-NEXT: ret void
;
%a0 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 0 ), align 2
%a1 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1 ), align 2
@@ -761,106 +607,328 @@ define void @mul_v32i16() {
}
define void @mul_v64i8() {
-; SSE-LABEL: @mul_v64i8(
-; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @a8, align 1
-; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1
-; SSE-NEXT: [[TMP3:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
-; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP3]], 0
-; SSE-NEXT: store <16 x i8> [[TMP4]], ptr @c8, align 1
-; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1
-; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1
-; SSE-NEXT: [[TMP7:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> [[TMP5]], <16 x i8> [[TMP6]])
-; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP7]], 0
-; SSE-NEXT: store <16 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1
-; SSE-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
-; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
-; SSE-NEXT: [[TMP11:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> [[TMP9]], <16 x i8> [[TMP10]])
-; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP11]], 0
-; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
-; SSE-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
-; SSE-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
-; SSE-NEXT: [[TMP15:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> [[TMP13]], <16 x i8> [[TMP14]])
-; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP15]], 0
-; SSE-NEXT: store <16 x i8> [[TMP16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
-; SSE-NEXT: ret void
-;
-; SLM-LABEL: @mul_v64i8(
-; SLM-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @a8, align 1
-; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1
-; SLM-NEXT: [[TMP3:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
-; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP3]], 0
-; SLM-NEXT: store <16 x i8> [[TMP4]], ptr @c8, align 1
-; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1
-; SLM-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1
-; SLM-NEXT: [[TMP7:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> [[TMP5]], <16 x i8> [[TMP6]])
-; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP7]], 0
-; SLM-NEXT: store <16 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1
-; SLM-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
-; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
-; SLM-NEXT: [[TMP11:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> [[TMP9]], <16 x i8> [[TMP10]])
-; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP11]], 0
-; SLM-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
-; SLM-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
-; SLM-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
-; SLM-NEXT: [[TMP15:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> [[TMP13]], <16 x i8> [[TMP14]])
-; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP15]], 0
-; SLM-NEXT: store <16 x i8> [[TMP16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
-; SLM-NEXT: ret void
-;
-; AVX-LABEL: @mul_v64i8(
-; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1
-; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1
-; AVX-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]])
-; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0
-; AVX-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1
-; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
-; AVX-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
-; AVX-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]])
-; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0
-; AVX-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
-; AVX-NEXT: ret void
-;
-; AVX2-LABEL: @mul_v64i8(
-; AVX2-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1
-; AVX2-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1
-; AVX2-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]])
-; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0
-; AVX2-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1
-; AVX2-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
-; AVX2-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
-; AVX2-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]])
-; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0
-; AVX2-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
-; AVX2-NEXT: ret void
-;
-; KNL-LABEL: @mul_v64i8(
-; KNL-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @a8, align 1
-; KNL-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @b8, align 1
-; KNL-NEXT: [[TMP3:%.*]] = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]])
-; KNL-NEXT: [[TMP4:%.*]] = extractvalue { <64 x i8>, <64 x i1> } [[TMP3]], 0
-; KNL-NEXT: store <64 x i8> [[TMP4]], ptr @c8, align 1
-; KNL-NEXT: ret void
-;
-; AVX512-LABEL: @mul_v64i8(
-; AVX512-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @a8, align 1
-; AVX512-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @b8, align 1
-; AVX512-NEXT: [[TMP3:%.*]] = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]])
-; AVX512-NEXT: [[TMP4:%.*]] = extractvalue { <64 x i8>, <64 x i1> } [[TMP3]], 0
-; AVX512-NEXT: store <64 x i8> [[TMP4]], ptr @c8, align 1
-; AVX512-NEXT: ret void
-;
-; AVX512_256-LABEL: @mul_v64i8(
-; AVX512_256-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1
-; AVX512_256-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1
-; AVX512_256-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]])
-; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0
-; AVX512_256-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1
-; AVX512_256-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
-; AVX512_256-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
-; AVX512_256-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]])
-; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0
-; AVX512_256-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
-; AVX512_256-NEXT: ret void
+; CHECK-LABEL: @mul_v64i8(
+; CHECK-NEXT: [[A0:%.*]] = load i8, ptr @a8, align 1
+; CHECK-NEXT: [[A1:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 1), align 1
+; CHECK-NEXT: [[A2:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 2), align 1
+; CHECK-NEXT: [[A3:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 3), align 1
+; CHECK-NEXT: [[A4:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 4), align 1
+; CHECK-NEXT: [[A5:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 5), align 1
+; CHECK-NEXT: [[A6:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 6), align 1
+; CHECK-NEXT: [[A7:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 7), align 1
+; CHECK-NEXT: [[A8:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 8), align 1
+; CHECK-NEXT: [[A9:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 9), align 1
+; CHECK-NEXT: [[A10:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 10), align 1
+; CHECK-NEXT: [[A11:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 11), align 1
+; CHECK-NEXT: [[A12:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 12), align 1
+; CHECK-NEXT: [[A13:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 13), align 1
+; CHECK-NEXT: [[A14:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 14), align 1
+; CHECK-NEXT: [[A15:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 15), align 1
+; CHECK-NEXT: [[A16:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1
+; CHECK-NEXT: [[A17:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 17), align 1
+; CHECK-NEXT: [[A18:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 18), align 1
+; CHECK-NEXT: [[A19:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 19), align 1
+; CHECK-NEXT: [[A20:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 20), align 1
+; CHECK-NEXT: [[A21:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 21), align 1
+; CHECK-NEXT: [[A22:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 22), align 1
+; CHECK-NEXT: [[A23:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 23), align 1
+; CHECK-NEXT: [[A24:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 24), align 1
+; CHECK-NEXT: [[A25:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 25), align 1
+; CHECK-NEXT: [[A26:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 26), align 1
+; CHECK-NEXT: [[A27:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 27), align 1
+; CHECK-NEXT: [[A28:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 28), align 1
+; CHECK-NEXT: [[A29:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 29), align 1
+; CHECK-NEXT: [[A30:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 30), align 1
+; CHECK-NEXT: [[A31:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 31), align 1
+; CHECK-NEXT: [[A32:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
+; CHECK-NEXT: [[A33:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 33), align 1
+; CHECK-NEXT: [[A34:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 34), align 1
+; CHECK-NEXT: [[A35:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 35), align 1
+; CHECK-NEXT: [[A36:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 36), align 1
+; CHECK-NEXT: [[A37:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 37), align 1
+; CHECK-NEXT: [[A38:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 38), align 1
+; CHECK-NEXT: [[A39:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 39), align 1
+; CHECK-NEXT: [[A40:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 40), align 1
+; CHECK-NEXT: [[A41:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 41), align 1
+; CHECK-NEXT: [[A42:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 42), align 1
+; CHECK-NEXT: [[A43:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 43), align 1
+; CHECK-NEXT: [[A44:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 44), align 1
+; CHECK-NEXT: [[A45:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 45), align 1
+; CHECK-NEXT: [[A46:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 46), align 1
+; CHECK-NEXT: [[A47:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 47), align 1
+; CHECK-NEXT: [[A48:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
+; CHECK-NEXT: [[A49:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 49), align 1
+; CHECK-NEXT: [[A50:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 50), align 1
+; CHECK-NEXT: [[A51:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 51), align 1
+; CHECK-NEXT: [[A52:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 52), align 1
+; CHECK-NEXT: [[A53:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 53), align 1
+; CHECK-NEXT: [[A54:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 54), align 1
+; CHECK-NEXT: [[A55:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 55), align 1
+; CHECK-NEXT: [[A56:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 56), align 1
+; CHECK-NEXT: [[A57:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 57), align 1
+; CHECK-NEXT: [[A58:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 58), align 1
+; CHECK-NEXT: [[A59:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 59), align 1
+; CHECK-NEXT: [[A60:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 60), align 1
+; CHECK-NEXT: [[A61:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 61), align 1
+; CHECK-NEXT: [[A62:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 62), align 1
+; CHECK-NEXT: [[A63:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 63), align 1
+; CHECK-NEXT: [[B0:%.*]] = load i8, ptr @b8, align 1
+; CHECK-NEXT: [[B1:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 1), align 1
+; CHECK-NEXT: [[B2:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 2), align 1
+; CHECK-NEXT: [[B3:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 3), align 1
+; CHECK-NEXT: [[B4:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 4), align 1
+; CHECK-NEXT: [[B5:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 5), align 1
+; CHECK-NEXT: [[B6:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 6), align 1
+; CHECK-NEXT: [[B7:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 7), align 1
+; CHECK-NEXT: [[B8:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 8), align 1
+; CHECK-NEXT: [[B9:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 9), align 1
+; CHECK-NEXT: [[B10:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 10), align 1
+; CHECK-NEXT: [[B11:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 11), align 1
+; CHECK-NEXT: [[B12:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 12), align 1
+; CHECK-NEXT: [[B13:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 13), align 1
+; CHECK-NEXT: [[B14:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 14), align 1
+; CHECK-NEXT: [[B15:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 15), align 1
+; CHECK-NEXT: [[B16:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1
+; CHECK-NEXT: [[B17:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 17), align 1
+; CHECK-NEXT: [[B18:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 18), align 1
+; CHECK-NEXT: [[B19:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 19), align 1
+; CHECK-NEXT: [[B20:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 20), align 1
+; CHECK-NEXT: [[B21:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 21), align 1
+; CHECK-NEXT: [[B22:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 22), align 1
+; CHECK-NEXT: [[B23:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 23), align 1
+; CHECK-NEXT: [[B24:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 24), align 1
+; CHECK-NEXT: [[B25:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 25), align 1
+; CHECK-NEXT: [[B26:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 26), align 1
+; CHECK-NEXT: [[B27:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 27), align 1
+; CHECK-NEXT: [[B28:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 28), align 1
+; CHECK-NEXT: [[B29:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 29), align 1
+; CHECK-NEXT: [[B30:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 30), align 1
+; CHECK-NEXT: [[B31:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 31), align 1
+; CHECK-NEXT: [[B32:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
+; CHECK-NEXT: [[B33:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 33), align 1
+; CHECK-NEXT: [[B34:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 34), align 1
+; CHECK-NEXT: [[B35:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 35), align 1
+; CHECK-NEXT: [[B36:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 36), align 1
+; CHECK-NEXT: [[B37:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 37), align 1
+; CHECK-NEXT: [[B38:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 38), align 1
+; CHECK-NEXT: [[B39:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 39), align 1
+; CHECK-NEXT: [[B40:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 40), align 1
+; CHECK-NEXT: [[B41:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 41), align 1
+; CHECK-NEXT: [[B42:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 42), align 1
+; CHECK-NEXT: [[B43:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 43), align 1
+; CHECK-NEXT: [[B44:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 44), align 1
+; CHECK-NEXT: [[B45:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 45), align 1
+; CHECK-NEXT: [[B46:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 46), align 1
+; CHECK-NEXT: [[B47:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 47), align 1
+; CHECK-NEXT: [[B48:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
+; CHECK-NEXT: [[B49:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 49), align 1
+; CHECK-NEXT: [[B50:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 50), align 1
+; CHECK-NEXT: [[B51:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 51), align 1
+; CHECK-NEXT: [[B52:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 52), align 1
+; CHECK-NEXT: [[B53:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 53), align 1
+; CHECK-NEXT: [[B54:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 54), align 1
+; CHECK-NEXT: [[B55:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 55), align 1
+; CHECK-NEXT: [[B56:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 56), align 1
+; CHECK-NEXT: [[B57:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 57), align 1
+; CHECK-NEXT: [[B58:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 58), align 1
+; CHECK-NEXT: [[B59:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 59), align 1
+; CHECK-NEXT: [[B60:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 60), align 1
+; CHECK-NEXT: [[B61:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 61), align 1
+; CHECK-NEXT: [[B62:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 62), align 1
+; CHECK-NEXT: [[B63:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 63), align 1
+; CHECK-NEXT: [[C0:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A0]], i8 [[B0]])
+; CHECK-NEXT: [[C1:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A1]], i8 [[B1]])
+; CHECK-NEXT: [[C2:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A2]], i8 [[B2]])
+; CHECK-NEXT: [[C3:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A3]], i8 [[B3]])
+; CHECK-NEXT: [[C4:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A4]], i8 [[B4]])
+; CHECK-NEXT: [[C5:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A5]], i8 [[B5]])
+; CHECK-NEXT: [[C6:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A6]], i8 [[B6]])
+; CHECK-NEXT: [[C7:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A7]], i8 [[B7]])
+; CHECK-NEXT: [[C8:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A8]], i8 [[B8]])
+; CHECK-NEXT: [[C9:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A9]], i8 [[B9]])
+; CHECK-NEXT: [[C10:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A10]], i8 [[B10]])
+; CHECK-NEXT: [[C11:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A11]], i8 [[B11]])
+; CHECK-NEXT: [[C12:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A12]], i8 [[B12]])
+; CHECK-NEXT: [[C13:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A13]], i8 [[B13]])
+; CHECK-NEXT: [[C14:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A14]], i8 [[B14]])
+; CHECK-NEXT: [[C15:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A15]], i8 [[B15]])
+; CHECK-NEXT: [[C16:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A16]], i8 [[B16]])
+; CHECK-NEXT: [[C17:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A17]], i8 [[B17]])
+; CHECK-NEXT: [[C18:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A18]], i8 [[B18]])
+; CHECK-NEXT: [[C19:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A19]], i8 [[B19]])
+; CHECK-NEXT: [[C20:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A20]], i8 [[B20]])
+; CHECK-NEXT: [[C21:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A21]], i8 [[B21]])
+; CHECK-NEXT: [[C22:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A22]], i8 [[B22]])
+; CHECK-NEXT: [[C23:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A23]], i8 [[B23]])
+; CHECK-NEXT: [[C24:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A24]], i8 [[B24]])
+; CHECK-NEXT: [[C25:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A25]], i8 [[B25]])
+; CHECK-NEXT: [[C26:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A26]], i8 [[B26]])
+; CHECK-NEXT: [[C27:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A27]], i8 [[B27]])
+; CHECK-NEXT: [[C28:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A28]], i8 [[B28]])
+; CHECK-NEXT: [[C29:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A29]], i8 [[B29]])
+; CHECK-NEXT: [[C30:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A30]], i8 [[B30]])
+; CHECK-NEXT: [[C31:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A31]], i8 [[B31]])
+; CHECK-NEXT: [[C32:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A32]], i8 [[B32]])
+; CHECK-NEXT: [[C33:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A33]], i8 [[B33]])
+; CHECK-NEXT: [[C34:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A34]], i8 [[B34]])
+; CHECK-NEXT: [[C35:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A35]], i8 [[B35]])
+; CHECK-NEXT: [[C36:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A36]], i8 [[B36]])
+; CHECK-NEXT: [[C37:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A37]], i8 [[B37]])
+; CHECK-NEXT: [[C38:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A38]], i8 [[B38]])
+; CHECK-NEXT: [[C39:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A39]], i8 [[B39]])
+; CHECK-NEXT: [[C40:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A40]], i8 [[B40]])
+; CHECK-NEXT: [[C41:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A41]], i8 [[B41]])
+; CHECK-NEXT: [[C42:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A42]], i8 [[B42]])
+; CHECK-NEXT: [[C43:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A43]], i8 [[B43]])
+; CHECK-NEXT: [[C44:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A44]], i8 [[B44]])
+; CHECK-NEXT: [[C45:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A45]], i8 [[B45]])
+; CHECK-NEXT: [[C46:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A46]], i8 [[B46]])
+; CHECK-NEXT: [[C47:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A47]], i8 [[B47]])
+; CHECK-NEXT: [[C48:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A48]], i8 [[B48]])
+; CHECK-NEXT: [[C49:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A49]], i8 [[B49]])
+; CHECK-NEXT: [[C50:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A50]], i8 [[B50]])
+; CHECK-NEXT: [[C51:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A51]], i8 [[B51]])
+; CHECK-NEXT: [[C52:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A52]], i8 [[B52]])
+; CHECK-NEXT: [[C53:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A53]], i8 [[B53]])
+; CHECK-NEXT: [[C54:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A54]], i8 [[B54]])
+; CHECK-NEXT: [[C55:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A55]], i8 [[B55]])
+; CHECK-NEXT: [[C56:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A56]], i8 [[B56]])
+; CHECK-NEXT: [[C57:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A57]], i8 [[B57]])
+; CHECK-NEXT: [[C58:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A58]], i8 [[B58]])
+; CHECK-NEXT: [[C59:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A59]], i8 [[B59]])
+; CHECK-NEXT: [[C60:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A60]], i8 [[B60]])
+; CHECK-NEXT: [[C61:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A61]], i8 [[B61]])
+; CHECK-NEXT: [[C62:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A62]], i8 [[B62]])
+; CHECK-NEXT: [[C63:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A63]], i8 [[B63]])
+; CHECK-NEXT: [[R0:%.*]] = extractvalue { i8, i1 } [[C0]], 0
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i8, i1 } [[C1]], 0
+; CHECK-NEXT: [[R2:%.*]] = extractvalue { i8, i1 } [[C2]], 0
+; CHECK-NEXT: [[R3:%.*]] = extractvalue { i8, i1 } [[C3]], 0
+; CHECK-NEXT: [[R4:%.*]] = extractvalue { i8, i1 } [[C4]], 0
+; CHECK-NEXT: [[R5:%.*]] = extractvalue { i8, i1 } [[C5]], 0
+; CHECK-NEXT: [[R6:%.*]] = extractvalue { i8, i1 } [[C6]], 0
+; CHECK-NEXT: [[R7:%.*]] = extractvalue { i8, i1 } [[C7]], 0
+; CHECK-NEXT: [[R8:%.*]] = extractvalue { i8, i1 } [[C8]], 0
+; CHECK-NEXT: [[R9:%.*]] = extractvalue { i8, i1 } [[C9]], 0
+; CHECK-NEXT: [[R10:%.*]] = extractvalue { i8, i1 } [[C10]], 0
+; CHECK-NEXT: [[R11:%.*]] = extractvalue { i8, i1 } [[C11]], 0
+; CHECK-NEXT: [[R12:%.*]] = extractvalue { i8, i1 } [[C12]], 0
+; CHECK-NEXT: [[R13:%.*]] = extractvalue { i8, i1 } [[C13]], 0
+; CHECK-NEXT: [[R14:%.*]] = extractvalue { i8, i1 } [[C14]], 0
+; CHECK-NEXT: [[R15:%.*]] = extractvalue { i8, i1 } [[C15]], 0
+; CHECK-NEXT: [[R16:%.*]] = extractvalue { i8, i1 } [[C16]], 0
+; CHECK-NEXT: [[R17:%.*]] = extractvalue { i8, i1 } [[C17]], 0
+; CHECK-NEXT: [[R18:%.*]] = extractvalue { i8, i1 } [[C18]], 0
+; CHECK-NEXT: [[R19:%.*]] = extractvalue { i8, i1 } [[C19]], 0
+; CHECK-NEXT: [[R20:%.*]] = extractvalue { i8, i1 } [[C20]], 0
+; CHECK-NEXT: [[R21:%.*]] = extractvalue { i8, i1 } [[C21]], 0
+; CHECK-NEXT: [[R22:%.*]] = extractvalue { i8, i1 } [[C22]], 0
+; CHECK-NEXT: [[R23:%.*]] = extractvalue { i8, i1 } [[C23]], 0
+; CHECK-NEXT: [[R24:%.*]] = extractvalue { i8, i1 } [[C24]], 0
+; CHECK-NEXT: [[R25:%.*]] = extractvalue { i8, i1 } [[C25]], 0
+; CHECK-NEXT: [[R26:%.*]] = extractvalue { i8, i1 } [[C26]], 0
+; CHECK-NEXT: [[R27:%.*]] = extractvalue { i8, i1 } [[C27]], 0
+; CHECK-NEXT: [[R28:%.*]] = extractvalue { i8, i1 } [[C28]], 0
+; CHECK-NEXT: [[R29:%.*]] = extractvalue { i8, i1 } [[C29]], 0
+; CHECK-NEXT: [[R30:%.*]] = extractvalue { i8, i1 } [[C30]], 0
+; CHECK-NEXT: [[R31:%.*]] = extractvalue { i8, i1 } [[C31]], 0
+; CHECK-NEXT: [[R32:%.*]] = extractvalue { i8, i1 } [[C32]], 0
+; CHECK-NEXT: [[R33:%.*]] = extractvalue { i8, i1 } [[C33]], 0
+; CHECK-NEXT: [[R34:%.*]] = extractvalue { i8, i1 } [[C34]], 0
+; CHECK-NEXT: [[R35:%.*]] = extractvalue { i8, i1 } [[C35]], 0
+; CHECK-NEXT: [[R36:%.*]] = extractvalue { i8, i1 } [[C36]], 0
+; CHECK-NEXT: [[R37:%.*]] = extractvalue { i8, i1 } [[C37]], 0
+; CHECK-NEXT: [[R38:%.*]] = extractvalue { i8, i1 } [[C38]], 0
+; CHECK-NEXT: [[R39:%.*]] = extractvalue { i8, i1 } [[C39]], 0
+; CHECK-NEXT: [[R40:%.*]] = extractvalue { i8, i1 } [[C40]], 0
+; CHECK-NEXT: [[R41:%.*]] = extractvalue { i8, i1 } [[C41]], 0
+; CHECK-NEXT: [[R42:%.*]] = extractvalue { i8, i1 } [[C42]], 0
+; CHECK-NEXT: [[R43:%.*]] = extractvalue { i8, i1 } [[C43]], 0
+; CHECK-NEXT: [[R44:%.*]] = extractvalue { i8, i1 } [[C44]], 0
+; CHECK-NEXT: [[R45:%.*]] = extractvalue { i8, i1 } [[C45]], 0
+; CHECK-NEXT: [[R46:%.*]] = extractvalue { i8, i1 } [[C46]], 0
+; CHECK-NEXT: [[R47:%.*]] = extractvalue { i8, i1 } [[C47]], 0
+; CHECK-NEXT: [[R48:%.*]] = extractvalue { i8, i1 } [[C48]], 0
+; CHECK-NEXT: [[R49:%.*]] = extractvalue { i8, i1 } [[C49]], 0
+; CHECK-NEXT: [[R50:%.*]] = extractvalue { i8, i1 } [[C50]], 0
+; CHECK-NEXT: [[R51:%.*]] = extractvalue { i8, i1 } [[C51]], 0
+; CHECK-NEXT: [[R52:%.*]] = extractvalue { i8, i1 } [[C52]], 0
+; CHECK-NEXT: [[R53:%.*]] = extractvalue { i8, i1 } [[C53]], 0
+; CHECK-NEXT: [[R54:%.*]] = extractvalue { i8, i1 } [[C54]], 0
+; CHECK-NEXT: [[R55:%.*]] = extractvalue { i8, i1 } [[C55]], 0
+; CHECK-NEXT: [[R56:%.*]] = extractvalue { i8, i1 } [[C56]], 0
+; CHECK-NEXT: [[R57:%.*]] = extractvalue { i8, i1 } [[C57]], 0
+; CHECK-NEXT: [[R58:%.*]] = extractvalue { i8, i1 } [[C58]], 0
+; CHECK-NEXT: [[R59:%.*]] = extractvalue { i8, i1 } [[C59]], 0
+; CHECK-NEXT: [[R60:%.*]] = extractvalue { i8, i1 } [[C60]], 0
+; CHECK-NEXT: [[R61:%.*]] = extractvalue { i8, i1 } [[C61]], 0
+; CHECK-NEXT: [[R62:%.*]] = extractvalue { i8, i1 } [[C62]], 0
+; CHECK-NEXT: [[R63:%.*]] = extractvalue { i8, i1 } [[C63]], 0
+; CHECK-NEXT: store i8 [[R0]], ptr @c8, align 1
+; CHECK-NEXT: store i8 [[R1]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 1), align 1
+; CHECK-NEXT: store i8 [[R2]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 2), align 1
+; CHECK-NEXT: store i8 [[R3]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 3), align 1
+; CHECK-NEXT: store i8 [[R4]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 4), align 1
+; CHECK-NEXT: store i8 [[R5]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 5), align 1
+; CHECK-NEXT: store i8 [[R6]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 6), align 1
+; CHECK-NEXT: store i8 [[R7]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 7), align 1
+; CHECK-NEXT: store i8 [[R8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 8), align 1
+; CHECK-NEXT: store i8 [[R9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 9), align 1
+; CHECK-NEXT: store i8 [[R10]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 10), align 1
+; CHECK-NEXT: store i8 [[R11]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 11), align 1
+; CHECK-NEXT: store i8 [[R12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 12), align 1
+; CHECK-NEXT: store i8 [[R13]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 13), align 1
+; CHECK-NEXT: store i8 [[R14]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 14), align 1
+; CHECK-NEXT: store i8 [[R15]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 15), align 1
+; CHECK-NEXT: store i8 [[R16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1
+; CHECK-NEXT: store i8 [[R17]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 17), align 1
+; CHECK-NEXT: store i8 [[R18]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 18), align 1
+; CHECK-NEXT: store i8 [[R19]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 19), align 1
+; CHECK-NEXT: store i8 [[R20]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 20), align 1
+; CHECK-NEXT: store i8 [[R21]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 21), align 1
+; CHECK-NEXT: store i8 [[R22]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 22), align 1
+; CHECK-NEXT: store i8 [[R23]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 23), align 1
+; CHECK-NEXT: store i8 [[R24]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 24), align 1
+; CHECK-NEXT: store i8 [[R25]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 25), align 1
+; CHECK-NEXT: store i8 [[R26]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 26), align 1
+; CHECK-NEXT: store i8 [[R27]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 27), align 1
+; CHECK-NEXT: store i8 [[R28]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 28), align 1
+; CHECK-NEXT: store i8 [[R29]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 29), align 1
+; CHECK-NEXT: store i8 [[R30]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 30), align 1
+; CHECK-NEXT: store i8 [[R31]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 31), align 1
+; CHECK-NEXT: store i8 [[R32]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
+; CHECK-NEXT: store i8 [[R33]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 33), align 1
+; CHECK-NEXT: store i8 [[R34]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 34), align 1
+; CHECK-NEXT: store i8 [[R35]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 35), align 1
+; CHECK-NEXT: store i8 [[R36]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 36), align 1
+; CHECK-NEXT: store i8 [[R37]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 37), align 1
+; CHECK-NEXT: store i8 [[R38]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 38), align 1
+; CHECK-NEXT: store i8 [[R39]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 39), align 1
+; CHECK-NEXT: store i8 [[R40]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 40), align 1
+; CHECK-NEXT: store i8 [[R41]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 41), align 1
+; CHECK-NEXT: store i8 [[R42]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 42), align 1
+; CHECK-NEXT: store i8 [[R43]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 43), align 1
+; CHECK-NEXT: store i8 [[R44]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 44), align 1
+; CHECK-NEXT: store i8 [[R45]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 45), align 1
+; CHECK-NEXT: store i8 [[R46]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 46), align 1
+; CHECK-NEXT: store i8 [[R47]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 47), align 1
+; CHECK-NEXT: store i8 [[R48]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
+; CHECK-NEXT: store i8 [[R49]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 49), align 1
+; CHECK-NEXT: store i8 [[R50]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 50), align 1
+; CHECK-NEXT: store i8 [[R51]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 51), align 1
+; CHECK-NEXT: store i8 [[R52]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 52), align 1
+; CHECK-NEXT: store i8 [[R53]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 53), align 1
+; CHECK-NEXT: store i8 [[R54]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 54), align 1
+; CHECK-NEXT: store i8 [[R55]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 55), align 1
+; CHECK-NEXT: store i8 [[R56]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 56), align 1
+; CHECK-NEXT: store i8 [[R57]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 57), align 1
+; CHECK-NEXT: store i8 [[R58]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 58), align 1
+; CHECK-NEXT: store i8 [[R59]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 59), align 1
+; CHECK-NEXT: store i8 [[R60]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 60), align 1
+; CHECK-NEXT: store i8 [[R61]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 61), align 1
+; CHECK-NEXT: store i8 [[R62]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 62), align 1
+; CHECK-NEXT: store i8 [[R63]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 63), align 1
+; CHECK-NEXT: ret void
;
%a0 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 0 ), align 1
%a1 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 1 ), align 1
@@ -1184,5 +1252,3 @@ define void @mul_v64i8() {
store i8 %r63, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 63), align 1
ret void
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK: {{.*}}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-mul-umulo.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-mul-umulo.ll
index 4c1d070a569e7..4126f06e8ca81 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-mul-umulo.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-mul-umulo.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,KNL
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512_256
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s
@a64 = common global [8 x i64] zeroinitializer, align 64
@b64 = common global [8 x i64] zeroinitializer, align 64
@@ -26,126 +26,48 @@ declare {i16, i1} @llvm.umul.with.overflow.i16(i16, i16)
declare {i8 , i1} @llvm.umul.with.overflow.i8 (i8 , i8 )
define void @mul_v8i64() {
-; SSE-LABEL: @mul_v8i64(
-; SSE-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8
-; SSE-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
-; SSE-NEXT: [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
-; SSE-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8
-; SSE-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
-; SSE-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8
-; SSE-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
-; SSE-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8
-; SSE-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8
-; SSE-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8
-; SSE-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
-; SSE-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8
-; SSE-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
-; SSE-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8
-; SSE-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
-; SSE-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8
-; SSE-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A0]], i64 [[B0]])
-; SSE-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A1]], i64 [[B1]])
-; SSE-NEXT: [[C2:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A2]], i64 [[B2]])
-; SSE-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A3]], i64 [[B3]])
-; SSE-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A4]], i64 [[B4]])
-; SSE-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A5]], i64 [[B5]])
-; SSE-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A6]], i64 [[B6]])
-; SSE-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A7]], i64 [[B7]])
-; SSE-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0
-; SSE-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0
-; SSE-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0
-; SSE-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0
-; SSE-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0
-; SSE-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0
-; SSE-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0
-; SSE-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0
-; SSE-NEXT: store i64 [[R0]], ptr @c64, align 8
-; SSE-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8
-; SSE-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
-; SSE-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8
-; SSE-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
-; SSE-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8
-; SSE-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
-; SSE-NEXT: store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8
-; SSE-NEXT: ret void
-;
-; SLM-LABEL: @mul_v8i64(
-; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @a64, align 8
-; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @b64, align 8
-; SLM-NEXT: [[TMP3:%.*]] = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]])
-; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <2 x i64>, <2 x i1> } [[TMP3]], 0
-; SLM-NEXT: store <2 x i64> [[TMP4]], ptr @c64, align 8
-; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
-; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
-; SLM-NEXT: [[TMP7:%.*]] = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> [[TMP5]], <2 x i64> [[TMP6]])
-; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <2 x i64>, <2 x i1> } [[TMP7]], 0
-; SLM-NEXT: store <2 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
-; SLM-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
-; SLM-NEXT: [[TMP10:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
-; SLM-NEXT: [[TMP11:%.*]] = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]])
-; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <2 x i64>, <2 x i1> } [[TMP11]], 0
-; SLM-NEXT: store <2 x i64> [[TMP12]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
-; SLM-NEXT: [[TMP13:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
-; SLM-NEXT: [[TMP14:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
-; SLM-NEXT: [[TMP15:%.*]] = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> [[TMP13]], <2 x i64> [[TMP14]])
-; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <2 x i64>, <2 x i1> } [[TMP15]], 0
-; SLM-NEXT: store <2 x i64> [[TMP16]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
-; SLM-NEXT: ret void
-;
-; AVX-LABEL: @mul_v8i64(
-; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8
-; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8
-; AVX-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]])
-; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0
-; AVX-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8
-; AVX-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
-; AVX-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
-; AVX-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]])
-; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0
-; AVX-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
-; AVX-NEXT: ret void
-;
-; AVX2-LABEL: @mul_v8i64(
-; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8
-; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8
-; AVX2-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]])
-; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0
-; AVX2-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8
-; AVX2-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
-; AVX2-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
-; AVX2-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]])
-; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0
-; AVX2-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
-; AVX2-NEXT: ret void
-;
-; KNL-LABEL: @mul_v8i64(
-; KNL-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8
-; KNL-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8
-; KNL-NEXT: [[TMP3:%.*]] = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]])
-; KNL-NEXT: [[TMP18:%.*]] = extractvalue { <8 x i64>, <8 x i1> } [[TMP3]], 0
-; KNL-NEXT: store <8 x i64> [[TMP18]], ptr @c64, align 8
-; KNL-NEXT: ret void
-;
-; AVX512-LABEL: @mul_v8i64(
-; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8
-; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8
-; AVX512-NEXT: [[TMP3:%.*]] = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]])
-; AVX512-NEXT: [[TMP18:%.*]] = extractvalue { <8 x i64>, <8 x i1> } [[TMP3]], 0
-; AVX512-NEXT: store <8 x i64> [[TMP18]], ptr @c64, align 8
-; AVX512-NEXT: ret void
-;
-; AVX512_256-LABEL: @mul_v8i64(
-; AVX512_256-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8
-; AVX512_256-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8
-; AVX512_256-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]])
-; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0
-; AVX512_256-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8
-; AVX512_256-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
-; AVX512_256-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
-; AVX512_256-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]])
-; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0
-; AVX512_256-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
-; AVX512_256-NEXT: ret void
+; CHECK-LABEL: @mul_v8i64(
+; CHECK-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8
+; CHECK-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
+; CHECK-NEXT: [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
+; CHECK-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8
+; CHECK-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
+; CHECK-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8
+; CHECK-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
+; CHECK-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8
+; CHECK-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8
+; CHECK-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8
+; CHECK-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
+; CHECK-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8
+; CHECK-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
+; CHECK-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8
+; CHECK-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
+; CHECK-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8
+; CHECK-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A0]], i64 [[B0]])
+; CHECK-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A1]], i64 [[B1]])
+; CHECK-NEXT: [[C2:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A2]], i64 [[B2]])
+; CHECK-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A3]], i64 [[B3]])
+; CHECK-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A4]], i64 [[B4]])
+; CHECK-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A5]], i64 [[B5]])
+; CHECK-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A6]], i64 [[B6]])
+; CHECK-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A7]], i64 [[B7]])
+; CHECK-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0
+; CHECK-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0
+; CHECK-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0
+; CHECK-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0
+; CHECK-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0
+; CHECK-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0
+; CHECK-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0
+; CHECK-NEXT: store i64 [[R0]], ptr @c64, align 8
+; CHECK-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8
+; CHECK-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
+; CHECK-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8
+; CHECK-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
+; CHECK-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8
+; CHECK-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
+; CHECK-NEXT: store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8
+; CHECK-NEXT: ret void
;
%a0 = load i64, ptr @a64, align 8
%a1 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
@@ -191,106 +113,88 @@ define void @mul_v8i64() {
}
define void @mul_v16i32() {
-; SSE-LABEL: @mul_v16i32(
-; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @a32, align 4
-; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @b32, align 4
-; SSE-NEXT: [[TMP3:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
-; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP3]], 0
-; SSE-NEXT: store <4 x i32> [[TMP4]], ptr @c32, align 4
-; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4
-; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4
-; SSE-NEXT: [[TMP7:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> [[TMP5]], <4 x i32> [[TMP6]])
-; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP7]], 0
-; SSE-NEXT: store <4 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4
-; SSE-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
-; SSE-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
-; SSE-NEXT: [[TMP11:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]])
-; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP11]], 0
-; SSE-NEXT: store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4
-; SSE-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4
-; SSE-NEXT: [[TMP14:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4
-; SSE-NEXT: [[TMP15:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> [[TMP13]], <4 x i32> [[TMP14]])
-; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP15]], 0
-; SSE-NEXT: store <4 x i32> [[TMP16]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4
-; SSE-NEXT: ret void
-;
-; SLM-LABEL: @mul_v16i32(
-; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @a32, align 4
-; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @b32, align 4
-; SLM-NEXT: [[TMP3:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
-; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP3]], 0
-; SLM-NEXT: store <4 x i32> [[TMP4]], ptr @c32, align 4
-; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4
-; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4
-; SLM-NEXT: [[TMP7:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> [[TMP5]], <4 x i32> [[TMP6]])
-; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP7]], 0
-; SLM-NEXT: store <4 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4
-; SLM-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
-; SLM-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
-; SLM-NEXT: [[TMP11:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]])
-; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP11]], 0
-; SLM-NEXT: store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4
-; SLM-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4
-; SLM-NEXT: [[TMP14:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4
-; SLM-NEXT: [[TMP15:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> [[TMP13]], <4 x i32> [[TMP14]])
-; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP15]], 0
-; SLM-NEXT: store <4 x i32> [[TMP16]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4
-; SLM-NEXT: ret void
-;
-; AVX-LABEL: @mul_v16i32(
-; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4
-; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4
-; AVX-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]])
-; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0
-; AVX-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4
-; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
-; AVX-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
-; AVX-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]])
-; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0
-; AVX-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4
-; AVX-NEXT: ret void
-;
-; AVX2-LABEL: @mul_v16i32(
-; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4
-; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4
-; AVX2-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]])
-; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0
-; AVX2-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4
-; AVX2-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
-; AVX2-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
-; AVX2-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]])
-; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0
-; AVX2-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4
-; AVX2-NEXT: ret void
-;
-; KNL-LABEL: @mul_v16i32(
-; KNL-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4
-; KNL-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4
-; KNL-NEXT: [[TMP3:%.*]] = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]])
-; KNL-NEXT: [[TMP34:%.*]] = extractvalue { <16 x i32>, <16 x i1> } [[TMP3]], 0
-; KNL-NEXT: store <16 x i32> [[TMP34]], ptr @c32, align 4
-; KNL-NEXT: ret void
-;
-; AVX512-LABEL: @mul_v16i32(
-; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4
-; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4
-; AVX512-NEXT: [[TMP3:%.*]] = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]])
-; AVX512-NEXT: [[TMP34:%.*]] = extractvalue { <16 x i32>, <16 x i1> } [[TMP3]], 0
-; AVX512-NEXT: store <16 x i32> [[TMP34]], ptr @c32, align 4
-; AVX512-NEXT: ret void
-;
-; AVX512_256-LABEL: @mul_v16i32(
-; AVX512_256-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4
-; AVX512_256-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4
-; AVX512_256-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]])
-; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0
-; AVX512_256-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4
-; AVX512_256-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
-; AVX512_256-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
-; AVX512_256-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]])
-; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0
-; AVX512_256-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4
-; AVX512_256-NEXT: ret void
+; CHECK-LABEL: @mul_v16i32(
+; CHECK-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4
+; CHECK-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4
+; CHECK-NEXT: [[A2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 2), align 4
+; CHECK-NEXT: [[A3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 3), align 4
+; CHECK-NEXT: [[A4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4
+; CHECK-NEXT: [[A5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 5), align 4
+; CHECK-NEXT: [[A6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 6), align 4
+; CHECK-NEXT: [[A7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 7), align 4
+; CHECK-NEXT: [[A8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
+; CHECK-NEXT: [[A9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 9), align 4
+; CHECK-NEXT: [[A10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 10), align 4
+; CHECK-NEXT: [[A11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 11), align 4
+; CHECK-NEXT: [[A12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4
+; CHECK-NEXT: [[A13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 13), align 4
+; CHECK-NEXT: [[A14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 14), align 4
+; CHECK-NEXT: [[A15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 15), align 4
+; CHECK-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4
+; CHECK-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4
+; CHECK-NEXT: [[B2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 2), align 4
+; CHECK-NEXT: [[B3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 3), align 4
+; CHECK-NEXT: [[B4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4
+; CHECK-NEXT: [[B5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 5), align 4
+; CHECK-NEXT: [[B6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 6), align 4
+; CHECK-NEXT: [[B7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 7), align 4
+; CHECK-NEXT: [[B8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
+; CHECK-NEXT: [[B9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 9), align 4
+; CHECK-NEXT: [[B10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 10), align 4
+; CHECK-NEXT: [[B11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 11), align 4
+; CHECK-NEXT: [[B12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4
+; CHECK-NEXT: [[B13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 13), align 4
+; CHECK-NEXT: [[B14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 14), align 4
+; CHECK-NEXT: [[B15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 15), align 4
+; CHECK-NEXT: [[C0:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A0]], i32 [[B0]])
+; CHECK-NEXT: [[C1:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A1]], i32 [[B1]])
+; CHECK-NEXT: [[C2:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A2]], i32 [[B2]])
+; CHECK-NEXT: [[C3:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A3]], i32 [[B3]])
+; CHECK-NEXT: [[C4:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A4]], i32 [[B4]])
+; CHECK-NEXT: [[C5:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A5]], i32 [[B5]])
+; CHECK-NEXT: [[C6:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A6]], i32 [[B6]])
+; CHECK-NEXT: [[C7:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A7]], i32 [[B7]])
+; CHECK-NEXT: [[C8:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A8]], i32 [[B8]])
+; CHECK-NEXT: [[C9:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A9]], i32 [[B9]])
+; CHECK-NEXT: [[C10:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A10]], i32 [[B10]])
+; CHECK-NEXT: [[C11:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A11]], i32 [[B11]])
+; CHECK-NEXT: [[C12:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A12]], i32 [[B12]])
+; CHECK-NEXT: [[C13:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A13]], i32 [[B13]])
+; CHECK-NEXT: [[C14:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A14]], i32 [[B14]])
+; CHECK-NEXT: [[C15:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A15]], i32 [[B15]])
+; CHECK-NEXT: [[R0:%.*]] = extractvalue { i32, i1 } [[C0]], 0
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[C1]], 0
+; CHECK-NEXT: [[R2:%.*]] = extractvalue { i32, i1 } [[C2]], 0
+; CHECK-NEXT: [[R3:%.*]] = extractvalue { i32, i1 } [[C3]], 0
+; CHECK-NEXT: [[R4:%.*]] = extractvalue { i32, i1 } [[C4]], 0
+; CHECK-NEXT: [[R5:%.*]] = extractvalue { i32, i1 } [[C5]], 0
+; CHECK-NEXT: [[R6:%.*]] = extractvalue { i32, i1 } [[C6]], 0
+; CHECK-NEXT: [[R7:%.*]] = extractvalue { i32, i1 } [[C7]], 0
+; CHECK-NEXT: [[R8:%.*]] = extractvalue { i32, i1 } [[C8]], 0
+; CHECK-NEXT: [[R9:%.*]] = extractvalue { i32, i1 } [[C9]], 0
+; CHECK-NEXT: [[R10:%.*]] = extractvalue { i32, i1 } [[C10]], 0
+; CHECK-NEXT: [[R11:%.*]] = extractvalue { i32, i1 } [[C11]], 0
+; CHECK-NEXT: [[R12:%.*]] = extractvalue { i32, i1 } [[C12]], 0
+; CHECK-NEXT: [[R13:%.*]] = extractvalue { i32, i1 } [[C13]], 0
+; CHECK-NEXT: [[R14:%.*]] = extractvalue { i32, i1 } [[C14]], 0
+; CHECK-NEXT: [[R15:%.*]] = extractvalue { i32, i1 } [[C15]], 0
+; CHECK-NEXT: store i32 [[R0]], ptr @c32, align 4
+; CHECK-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 1), align 4
+; CHECK-NEXT: store i32 [[R2]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 2), align 4
+; CHECK-NEXT: store i32 [[R3]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 3), align 4
+; CHECK-NEXT: store i32 [[R4]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4
+; CHECK-NEXT: store i32 [[R5]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 5), align 4
+; CHECK-NEXT: store i32 [[R6]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 6), align 4
+; CHECK-NEXT: store i32 [[R7]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 7), align 4
+; CHECK-NEXT: store i32 [[R8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4
+; CHECK-NEXT: store i32 [[R9]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 9), align 4
+; CHECK-NEXT: store i32 [[R10]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 10), align 4
+; CHECK-NEXT: store i32 [[R11]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 11), align 4
+; CHECK-NEXT: store i32 [[R12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4
+; CHECK-NEXT: store i32 [[R13]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 13), align 4
+; CHECK-NEXT: store i32 [[R14]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 14), align 4
+; CHECK-NEXT: store i32 [[R15]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 15), align 4
+; CHECK-NEXT: ret void
;
%a0 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 0 ), align 4
%a1 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1 ), align 4
@@ -376,106 +280,168 @@ define void @mul_v16i32() {
}
define void @mul_v32i16() {
-; SSE-LABEL: @mul_v32i16(
-; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2
-; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2
-; SSE-NEXT: [[TMP3:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
-; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP3]], 0
-; SSE-NEXT: store <8 x i16> [[TMP4]], ptr @c16, align 2
-; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2
-; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2
-; SSE-NEXT: [[TMP7:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> [[TMP5]], <8 x i16> [[TMP6]])
-; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP7]], 0
-; SSE-NEXT: store <8 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2
-; SSE-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
-; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
-; SSE-NEXT: [[TMP11:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]])
-; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP11]], 0
-; SSE-NEXT: store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2
-; SSE-NEXT: [[TMP13:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2
-; SSE-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2
-; SSE-NEXT: [[TMP15:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> [[TMP13]], <8 x i16> [[TMP14]])
-; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP15]], 0
-; SSE-NEXT: store <8 x i16> [[TMP16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2
-; SSE-NEXT: ret void
-;
-; SLM-LABEL: @mul_v32i16(
-; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2
-; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2
-; SLM-NEXT: [[TMP3:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
-; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP3]], 0
-; SLM-NEXT: store <8 x i16> [[TMP4]], ptr @c16, align 2
-; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2
-; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2
-; SLM-NEXT: [[TMP7:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> [[TMP5]], <8 x i16> [[TMP6]])
-; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP7]], 0
-; SLM-NEXT: store <8 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2
-; SLM-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
-; SLM-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
-; SLM-NEXT: [[TMP11:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]])
-; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP11]], 0
-; SLM-NEXT: store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2
-; SLM-NEXT: [[TMP13:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2
-; SLM-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2
-; SLM-NEXT: [[TMP15:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> [[TMP13]], <8 x i16> [[TMP14]])
-; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP15]], 0
-; SLM-NEXT: store <8 x i16> [[TMP16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2
-; SLM-NEXT: ret void
-;
-; AVX-LABEL: @mul_v32i16(
-; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2
-; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2
-; AVX-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]])
-; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0
-; AVX-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2
-; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
-; AVX-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
-; AVX-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]])
-; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0
-; AVX-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2
-; AVX-NEXT: ret void
-;
-; AVX2-LABEL: @mul_v32i16(
-; AVX2-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2
-; AVX2-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2
-; AVX2-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]])
-; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0
-; AVX2-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2
-; AVX2-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
-; AVX2-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
-; AVX2-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]])
-; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0
-; AVX2-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2
-; AVX2-NEXT: ret void
-;
-; KNL-LABEL: @mul_v32i16(
-; KNL-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2
-; KNL-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2
-; KNL-NEXT: [[TMP3:%.*]] = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]])
-; KNL-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i16>, <32 x i1> } [[TMP3]], 0
-; KNL-NEXT: store <32 x i16> [[TMP4]], ptr @c16, align 2
-; KNL-NEXT: ret void
-;
-; AVX512-LABEL: @mul_v32i16(
-; AVX512-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2
-; AVX512-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2
-; AVX512-NEXT: [[TMP3:%.*]] = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]])
-; AVX512-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i16>, <32 x i1> } [[TMP3]], 0
-; AVX512-NEXT: store <32 x i16> [[TMP4]], ptr @c16, align 2
-; AVX512-NEXT: ret void
-;
-; AVX512_256-LABEL: @mul_v32i16(
-; AVX512_256-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2
-; AVX512_256-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2
-; AVX512_256-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]])
-; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0
-; AVX512_256-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2
-; AVX512_256-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
-; AVX512_256-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
-; AVX512_256-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]])
-; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0
-; AVX512_256-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2
-; AVX512_256-NEXT: ret void
+; CHECK-LABEL: @mul_v32i16(
+; CHECK-NEXT: [[A0:%.*]] = load i16, ptr @a16, align 2
+; CHECK-NEXT: [[A1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1), align 2
+; CHECK-NEXT: [[A2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 2), align 2
+; CHECK-NEXT: [[A3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 3), align 2
+; CHECK-NEXT: [[A4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 4), align 2
+; CHECK-NEXT: [[A5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 5), align 2
+; CHECK-NEXT: [[A6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 6), align 2
+; CHECK-NEXT: [[A7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 7), align 2
+; CHECK-NEXT: [[A8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2
+; CHECK-NEXT: [[A9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 9), align 2
+; CHECK-NEXT: [[A10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 10), align 2
+; CHECK-NEXT: [[A11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 11), align 2
+; CHECK-NEXT: [[A12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 12), align 2
+; CHECK-NEXT: [[A13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 13), align 2
+; CHECK-NEXT: [[A14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 14), align 2
+; CHECK-NEXT: [[A15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 15), align 2
+; CHECK-NEXT: [[A16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
+; CHECK-NEXT: [[A17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 17), align 2
+; CHECK-NEXT: [[A18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 18), align 2
+; CHECK-NEXT: [[A19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 19), align 2
+; CHECK-NEXT: [[A20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 20), align 2
+; CHECK-NEXT: [[A21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 21), align 2
+; CHECK-NEXT: [[A22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 22), align 2
+; CHECK-NEXT: [[A23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 23), align 2
+; CHECK-NEXT: [[A24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2
+; CHECK-NEXT: [[A25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 25), align 2
+; CHECK-NEXT: [[A26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 26), align 2
+; CHECK-NEXT: [[A27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 27), align 2
+; CHECK-NEXT: [[A28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 28), align 2
+; CHECK-NEXT: [[A29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 29), align 2
+; CHECK-NEXT: [[A30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 30), align 2
+; CHECK-NEXT: [[A31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 31), align 2
+; CHECK-NEXT: [[B0:%.*]] = load i16, ptr @b16, align 2
+; CHECK-NEXT: [[B1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 1), align 2
+; CHECK-NEXT: [[B2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 2), align 2
+; CHECK-NEXT: [[B3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 3), align 2
+; CHECK-NEXT: [[B4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 4), align 2
+; CHECK-NEXT: [[B5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 5), align 2
+; CHECK-NEXT: [[B6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 6), align 2
+; CHECK-NEXT: [[B7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 7), align 2
+; CHECK-NEXT: [[B8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2
+; CHECK-NEXT: [[B9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 9), align 2
+; CHECK-NEXT: [[B10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 10), align 2
+; CHECK-NEXT: [[B11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 11), align 2
+; CHECK-NEXT: [[B12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 12), align 2
+; CHECK-NEXT: [[B13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 13), align 2
+; CHECK-NEXT: [[B14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 14), align 2
+; CHECK-NEXT: [[B15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 15), align 2
+; CHECK-NEXT: [[B16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
+; CHECK-NEXT: [[B17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 17), align 2
+; CHECK-NEXT: [[B18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 18), align 2
+; CHECK-NEXT: [[B19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 19), align 2
+; CHECK-NEXT: [[B20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 20), align 2
+; CHECK-NEXT: [[B21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 21), align 2
+; CHECK-NEXT: [[B22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 22), align 2
+; CHECK-NEXT: [[B23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 23), align 2
+; CHECK-NEXT: [[B24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2
+; CHECK-NEXT: [[B25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 25), align 2
+; CHECK-NEXT: [[B26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 26), align 2
+; CHECK-NEXT: [[B27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 27), align 2
+; CHECK-NEXT: [[B28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 28), align 2
+; CHECK-NEXT: [[B29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 29), align 2
+; CHECK-NEXT: [[B30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 30), align 2
+; CHECK-NEXT: [[B31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 31), align 2
+; CHECK-NEXT: [[C0:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A0]], i16 [[B0]])
+; CHECK-NEXT: [[C1:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A1]], i16 [[B1]])
+; CHECK-NEXT: [[C2:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A2]], i16 [[B2]])
+; CHECK-NEXT: [[C3:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A3]], i16 [[B3]])
+; CHECK-NEXT: [[C4:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A4]], i16 [[B4]])
+; CHECK-NEXT: [[C5:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A5]], i16 [[B5]])
+; CHECK-NEXT: [[C6:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A6]], i16 [[B6]])
+; CHECK-NEXT: [[C7:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A7]], i16 [[B7]])
+; CHECK-NEXT: [[C8:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A8]], i16 [[B8]])
+; CHECK-NEXT: [[C9:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A9]], i16 [[B9]])
+; CHECK-NEXT: [[C10:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A10]], i16 [[B10]])
+; CHECK-NEXT: [[C11:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A11]], i16 [[B11]])
+; CHECK-NEXT: [[C12:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A12]], i16 [[B12]])
+; CHECK-NEXT: [[C13:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A13]], i16 [[B13]])
+; CHECK-NEXT: [[C14:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A14]], i16 [[B14]])
+; CHECK-NEXT: [[C15:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A15]], i16 [[B15]])
+; CHECK-NEXT: [[C16:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A16]], i16 [[B16]])
+; CHECK-NEXT: [[C17:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A17]], i16 [[B17]])
+; CHECK-NEXT: [[C18:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A18]], i16 [[B18]])
+; CHECK-NEXT: [[C19:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A19]], i16 [[B19]])
+; CHECK-NEXT: [[C20:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A20]], i16 [[B20]])
+; CHECK-NEXT: [[C21:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A21]], i16 [[B21]])
+; CHECK-NEXT: [[C22:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A22]], i16 [[B22]])
+; CHECK-NEXT: [[C23:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A23]], i16 [[B23]])
+; CHECK-NEXT: [[C24:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A24]], i16 [[B24]])
+; CHECK-NEXT: [[C25:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A25]], i16 [[B25]])
+; CHECK-NEXT: [[C26:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A26]], i16 [[B26]])
+; CHECK-NEXT: [[C27:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A27]], i16 [[B27]])
+; CHECK-NEXT: [[C28:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A28]], i16 [[B28]])
+; CHECK-NEXT: [[C29:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A29]], i16 [[B29]])
+; CHECK-NEXT: [[C30:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A30]], i16 [[B30]])
+; CHECK-NEXT: [[C31:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A31]], i16 [[B31]])
+; CHECK-NEXT: [[R0:%.*]] = extractvalue { i16, i1 } [[C0]], 0
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i16, i1 } [[C1]], 0
+; CHECK-NEXT: [[R2:%.*]] = extractvalue { i16, i1 } [[C2]], 0
+; CHECK-NEXT: [[R3:%.*]] = extractvalue { i16, i1 } [[C3]], 0
+; CHECK-NEXT: [[R4:%.*]] = extractvalue { i16, i1 } [[C4]], 0
+; CHECK-NEXT: [[R5:%.*]] = extractvalue { i16, i1 } [[C5]], 0
+; CHECK-NEXT: [[R6:%.*]] = extractvalue { i16, i1 } [[C6]], 0
+; CHECK-NEXT: [[R7:%.*]] = extractvalue { i16, i1 } [[C7]], 0
+; CHECK-NEXT: [[R8:%.*]] = extractvalue { i16, i1 } [[C8]], 0
+; CHECK-NEXT: [[R9:%.*]] = extractvalue { i16, i1 } [[C9]], 0
+; CHECK-NEXT: [[R10:%.*]] = extractvalue { i16, i1 } [[C10]], 0
+; CHECK-NEXT: [[R11:%.*]] = extractvalue { i16, i1 } [[C11]], 0
+; CHECK-NEXT: [[R12:%.*]] = extractvalue { i16, i1 } [[C12]], 0
+; CHECK-NEXT: [[R13:%.*]] = extractvalue { i16, i1 } [[C13]], 0
+; CHECK-NEXT: [[R14:%.*]] = extractvalue { i16, i1 } [[C14]], 0
+; CHECK-NEXT: [[R15:%.*]] = extractvalue { i16, i1 } [[C15]], 0
+; CHECK-NEXT: [[R16:%.*]] = extractvalue { i16, i1 } [[C16]], 0
+; CHECK-NEXT: [[R17:%.*]] = extractvalue { i16, i1 } [[C17]], 0
+; CHECK-NEXT: [[R18:%.*]] = extractvalue { i16, i1 } [[C18]], 0
+; CHECK-NEXT: [[R19:%.*]] = extractvalue { i16, i1 } [[C19]], 0
+; CHECK-NEXT: [[R20:%.*]] = extractvalue { i16, i1 } [[C20]], 0
+; CHECK-NEXT: [[R21:%.*]] = extractvalue { i16, i1 } [[C21]], 0
+; CHECK-NEXT: [[R22:%.*]] = extractvalue { i16, i1 } [[C22]], 0
+; CHECK-NEXT: [[R23:%.*]] = extractvalue { i16, i1 } [[C23]], 0
+; CHECK-NEXT: [[R24:%.*]] = extractvalue { i16, i1 } [[C24]], 0
+; CHECK-NEXT: [[R25:%.*]] = extractvalue { i16, i1 } [[C25]], 0
+; CHECK-NEXT: [[R26:%.*]] = extractvalue { i16, i1 } [[C26]], 0
+; CHECK-NEXT: [[R27:%.*]] = extractvalue { i16, i1 } [[C27]], 0
+; CHECK-NEXT: [[R28:%.*]] = extractvalue { i16, i1 } [[C28]], 0
+; CHECK-NEXT: [[R29:%.*]] = extractvalue { i16, i1 } [[C29]], 0
+; CHECK-NEXT: [[R30:%.*]] = extractvalue { i16, i1 } [[C30]], 0
+; CHECK-NEXT: [[R31:%.*]] = extractvalue { i16, i1 } [[C31]], 0
+; CHECK-NEXT: store i16 [[R0]], ptr @c16, align 2
+; CHECK-NEXT: store i16 [[R1]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 1), align 2
+; CHECK-NEXT: store i16 [[R2]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 2), align 2
+; CHECK-NEXT: store i16 [[R3]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 3), align 2
+; CHECK-NEXT: store i16 [[R4]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 4), align 2
+; CHECK-NEXT: store i16 [[R5]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 5), align 2
+; CHECK-NEXT: store i16 [[R6]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 6), align 2
+; CHECK-NEXT: store i16 [[R7]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 7), align 2
+; CHECK-NEXT: store i16 [[R8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2
+; CHECK-NEXT: store i16 [[R9]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 9), align 2
+; CHECK-NEXT: store i16 [[R10]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 10), align 2
+; CHECK-NEXT: store i16 [[R11]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 11), align 2
+; CHECK-NEXT: store i16 [[R12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 12), align 2
+; CHECK-NEXT: store i16 [[R13]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 13), align 2
+; CHECK-NEXT: store i16 [[R14]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 14), align 2
+; CHECK-NEXT: store i16 [[R15]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 15), align 2
+; CHECK-NEXT: store i16 [[R16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2
+; CHECK-NEXT: store i16 [[R17]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 17), align 2
+; CHECK-NEXT: store i16 [[R18]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 18), align 2
+; CHECK-NEXT: store i16 [[R19]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 19), align 2
+; CHECK-NEXT: store i16 [[R20]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 20), align 2
+; CHECK-NEXT: store i16 [[R21]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 21), align 2
+; CHECK-NEXT: store i16 [[R22]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 22), align 2
+; CHECK-NEXT: store i16 [[R23]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 23), align 2
+; CHECK-NEXT: store i16 [[R24]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2
+; CHECK-NEXT: store i16 [[R25]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 25), align 2
+; CHECK-NEXT: store i16 [[R26]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 26), align 2
+; CHECK-NEXT: store i16 [[R27]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 27), align 2
+; CHECK-NEXT: store i16 [[R28]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 28), align 2
+; CHECK-NEXT: store i16 [[R29]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 29), align 2
+; CHECK-NEXT: store i16 [[R30]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 30), align 2
+; CHECK-NEXT: store i16 [[R31]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 31), align 2
+; CHECK-NEXT: ret void
;
%a0 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 0 ), align 2
%a1 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1 ), align 2
@@ -641,106 +607,328 @@ define void @mul_v32i16() {
}
define void @mul_v64i8() {
-; SSE-LABEL: @mul_v64i8(
-; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @a8, align 1
-; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1
-; SSE-NEXT: [[TMP3:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
-; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP3]], 0
-; SSE-NEXT: store <16 x i8> [[TMP4]], ptr @c8, align 1
-; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1
-; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1
-; SSE-NEXT: [[TMP7:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> [[TMP5]], <16 x i8> [[TMP6]])
-; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP7]], 0
-; SSE-NEXT: store <16 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1
-; SSE-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
-; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
-; SSE-NEXT: [[TMP11:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> [[TMP9]], <16 x i8> [[TMP10]])
-; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP11]], 0
-; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
-; SSE-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
-; SSE-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
-; SSE-NEXT: [[TMP15:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> [[TMP13]], <16 x i8> [[TMP14]])
-; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP15]], 0
-; SSE-NEXT: store <16 x i8> [[TMP16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
-; SSE-NEXT: ret void
-;
-; SLM-LABEL: @mul_v64i8(
-; SLM-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @a8, align 1
-; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1
-; SLM-NEXT: [[TMP3:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
-; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP3]], 0
-; SLM-NEXT: store <16 x i8> [[TMP4]], ptr @c8, align 1
-; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1
-; SLM-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1
-; SLM-NEXT: [[TMP7:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> [[TMP5]], <16 x i8> [[TMP6]])
-; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP7]], 0
-; SLM-NEXT: store <16 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1
-; SLM-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
-; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
-; SLM-NEXT: [[TMP11:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> [[TMP9]], <16 x i8> [[TMP10]])
-; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP11]], 0
-; SLM-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
-; SLM-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
-; SLM-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
-; SLM-NEXT: [[TMP15:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> [[TMP13]], <16 x i8> [[TMP14]])
-; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP15]], 0
-; SLM-NEXT: store <16 x i8> [[TMP16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
-; SLM-NEXT: ret void
-;
-; AVX-LABEL: @mul_v64i8(
-; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1
-; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1
-; AVX-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]])
-; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0
-; AVX-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1
-; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
-; AVX-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
-; AVX-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]])
-; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0
-; AVX-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
-; AVX-NEXT: ret void
-;
-; AVX2-LABEL: @mul_v64i8(
-; AVX2-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1
-; AVX2-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1
-; AVX2-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]])
-; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0
-; AVX2-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1
-; AVX2-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
-; AVX2-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
-; AVX2-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]])
-; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0
-; AVX2-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
-; AVX2-NEXT: ret void
-;
-; KNL-LABEL: @mul_v64i8(
-; KNL-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @a8, align 1
-; KNL-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @b8, align 1
-; KNL-NEXT: [[TMP3:%.*]] = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]])
-; KNL-NEXT: [[TMP4:%.*]] = extractvalue { <64 x i8>, <64 x i1> } [[TMP3]], 0
-; KNL-NEXT: store <64 x i8> [[TMP4]], ptr @c8, align 1
-; KNL-NEXT: ret void
-;
-; AVX512-LABEL: @mul_v64i8(
-; AVX512-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @a8, align 1
-; AVX512-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @b8, align 1
-; AVX512-NEXT: [[TMP3:%.*]] = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]])
-; AVX512-NEXT: [[TMP4:%.*]] = extractvalue { <64 x i8>, <64 x i1> } [[TMP3]], 0
-; AVX512-NEXT: store <64 x i8> [[TMP4]], ptr @c8, align 1
-; AVX512-NEXT: ret void
-;
-; AVX512_256-LABEL: @mul_v64i8(
-; AVX512_256-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1
-; AVX512_256-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1
-; AVX512_256-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]])
-; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0
-; AVX512_256-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1
-; AVX512_256-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
-; AVX512_256-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
-; AVX512_256-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]])
-; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0
-; AVX512_256-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
-; AVX512_256-NEXT: ret void
+; CHECK-LABEL: @mul_v64i8(
+; CHECK-NEXT: [[A0:%.*]] = load i8, ptr @a8, align 1
+; CHECK-NEXT: [[A1:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 1), align 1
+; CHECK-NEXT: [[A2:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 2), align 1
+; CHECK-NEXT: [[A3:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 3), align 1
+; CHECK-NEXT: [[A4:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 4), align 1
+; CHECK-NEXT: [[A5:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 5), align 1
+; CHECK-NEXT: [[A6:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 6), align 1
+; CHECK-NEXT: [[A7:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 7), align 1
+; CHECK-NEXT: [[A8:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 8), align 1
+; CHECK-NEXT: [[A9:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 9), align 1
+; CHECK-NEXT: [[A10:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 10), align 1
+; CHECK-NEXT: [[A11:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 11), align 1
+; CHECK-NEXT: [[A12:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 12), align 1
+; CHECK-NEXT: [[A13:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 13), align 1
+; CHECK-NEXT: [[A14:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 14), align 1
+; CHECK-NEXT: [[A15:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 15), align 1
+; CHECK-NEXT: [[A16:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1
+; CHECK-NEXT: [[A17:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 17), align 1
+; CHECK-NEXT: [[A18:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 18), align 1
+; CHECK-NEXT: [[A19:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 19), align 1
+; CHECK-NEXT: [[A20:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 20), align 1
+; CHECK-NEXT: [[A21:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 21), align 1
+; CHECK-NEXT: [[A22:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 22), align 1
+; CHECK-NEXT: [[A23:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 23), align 1
+; CHECK-NEXT: [[A24:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 24), align 1
+; CHECK-NEXT: [[A25:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 25), align 1
+; CHECK-NEXT: [[A26:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 26), align 1
+; CHECK-NEXT: [[A27:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 27), align 1
+; CHECK-NEXT: [[A28:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 28), align 1
+; CHECK-NEXT: [[A29:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 29), align 1
+; CHECK-NEXT: [[A30:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 30), align 1
+; CHECK-NEXT: [[A31:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 31), align 1
+; CHECK-NEXT: [[A32:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
+; CHECK-NEXT: [[A33:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 33), align 1
+; CHECK-NEXT: [[A34:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 34), align 1
+; CHECK-NEXT: [[A35:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 35), align 1
+; CHECK-NEXT: [[A36:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 36), align 1
+; CHECK-NEXT: [[A37:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 37), align 1
+; CHECK-NEXT: [[A38:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 38), align 1
+; CHECK-NEXT: [[A39:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 39), align 1
+; CHECK-NEXT: [[A40:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 40), align 1
+; CHECK-NEXT: [[A41:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 41), align 1
+; CHECK-NEXT: [[A42:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 42), align 1
+; CHECK-NEXT: [[A43:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 43), align 1
+; CHECK-NEXT: [[A44:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 44), align 1
+; CHECK-NEXT: [[A45:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 45), align 1
+; CHECK-NEXT: [[A46:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 46), align 1
+; CHECK-NEXT: [[A47:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 47), align 1
+; CHECK-NEXT: [[A48:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
+; CHECK-NEXT: [[A49:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 49), align 1
+; CHECK-NEXT: [[A50:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 50), align 1
+; CHECK-NEXT: [[A51:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 51), align 1
+; CHECK-NEXT: [[A52:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 52), align 1
+; CHECK-NEXT: [[A53:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 53), align 1
+; CHECK-NEXT: [[A54:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 54), align 1
+; CHECK-NEXT: [[A55:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 55), align 1
+; CHECK-NEXT: [[A56:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 56), align 1
+; CHECK-NEXT: [[A57:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 57), align 1
+; CHECK-NEXT: [[A58:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 58), align 1
+; CHECK-NEXT: [[A59:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 59), align 1
+; CHECK-NEXT: [[A60:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 60), align 1
+; CHECK-NEXT: [[A61:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 61), align 1
+; CHECK-NEXT: [[A62:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 62), align 1
+; CHECK-NEXT: [[A63:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 63), align 1
+; CHECK-NEXT: [[B0:%.*]] = load i8, ptr @b8, align 1
+; CHECK-NEXT: [[B1:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 1), align 1
+; CHECK-NEXT: [[B2:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 2), align 1
+; CHECK-NEXT: [[B3:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 3), align 1
+; CHECK-NEXT: [[B4:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 4), align 1
+; CHECK-NEXT: [[B5:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 5), align 1
+; CHECK-NEXT: [[B6:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 6), align 1
+; CHECK-NEXT: [[B7:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 7), align 1
+; CHECK-NEXT: [[B8:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 8), align 1
+; CHECK-NEXT: [[B9:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 9), align 1
+; CHECK-NEXT: [[B10:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 10), align 1
+; CHECK-NEXT: [[B11:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 11), align 1
+; CHECK-NEXT: [[B12:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 12), align 1
+; CHECK-NEXT: [[B13:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 13), align 1
+; CHECK-NEXT: [[B14:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 14), align 1
+; CHECK-NEXT: [[B15:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 15), align 1
+; CHECK-NEXT: [[B16:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1
+; CHECK-NEXT: [[B17:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 17), align 1
+; CHECK-NEXT: [[B18:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 18), align 1
+; CHECK-NEXT: [[B19:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 19), align 1
+; CHECK-NEXT: [[B20:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 20), align 1
+; CHECK-NEXT: [[B21:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 21), align 1
+; CHECK-NEXT: [[B22:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 22), align 1
+; CHECK-NEXT: [[B23:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 23), align 1
+; CHECK-NEXT: [[B24:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 24), align 1
+; CHECK-NEXT: [[B25:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 25), align 1
+; CHECK-NEXT: [[B26:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 26), align 1
+; CHECK-NEXT: [[B27:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 27), align 1
+; CHECK-NEXT: [[B28:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 28), align 1
+; CHECK-NEXT: [[B29:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 29), align 1
+; CHECK-NEXT: [[B30:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 30), align 1
+; CHECK-NEXT: [[B31:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 31), align 1
+; CHECK-NEXT: [[B32:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
+; CHECK-NEXT: [[B33:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 33), align 1
+; CHECK-NEXT: [[B34:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 34), align 1
+; CHECK-NEXT: [[B35:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 35), align 1
+; CHECK-NEXT: [[B36:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 36), align 1
+; CHECK-NEXT: [[B37:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 37), align 1
+; CHECK-NEXT: [[B38:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 38), align 1
+; CHECK-NEXT: [[B39:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 39), align 1
+; CHECK-NEXT: [[B40:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 40), align 1
+; CHECK-NEXT: [[B41:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 41), align 1
+; CHECK-NEXT: [[B42:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 42), align 1
+; CHECK-NEXT: [[B43:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 43), align 1
+; CHECK-NEXT: [[B44:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 44), align 1
+; CHECK-NEXT: [[B45:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 45), align 1
+; CHECK-NEXT: [[B46:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 46), align 1
+; CHECK-NEXT: [[B47:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 47), align 1
+; CHECK-NEXT: [[B48:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
+; CHECK-NEXT: [[B49:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 49), align 1
+; CHECK-NEXT: [[B50:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 50), align 1
+; CHECK-NEXT: [[B51:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 51), align 1
+; CHECK-NEXT: [[B52:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 52), align 1
+; CHECK-NEXT: [[B53:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 53), align 1
+; CHECK-NEXT: [[B54:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 54), align 1
+; CHECK-NEXT: [[B55:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 55), align 1
+; CHECK-NEXT: [[B56:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 56), align 1
+; CHECK-NEXT: [[B57:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 57), align 1
+; CHECK-NEXT: [[B58:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 58), align 1
+; CHECK-NEXT: [[B59:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 59), align 1
+; CHECK-NEXT: [[B60:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 60), align 1
+; CHECK-NEXT: [[B61:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 61), align 1
+; CHECK-NEXT: [[B62:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 62), align 1
+; CHECK-NEXT: [[B63:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 63), align 1
+; CHECK-NEXT: [[C0:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A0]], i8 [[B0]])
+; CHECK-NEXT: [[C1:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A1]], i8 [[B1]])
+; CHECK-NEXT: [[C2:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A2]], i8 [[B2]])
+; CHECK-NEXT: [[C3:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A3]], i8 [[B3]])
+; CHECK-NEXT: [[C4:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A4]], i8 [[B4]])
+; CHECK-NEXT: [[C5:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A5]], i8 [[B5]])
+; CHECK-NEXT: [[C6:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A6]], i8 [[B6]])
+; CHECK-NEXT: [[C7:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A7]], i8 [[B7]])
+; CHECK-NEXT: [[C8:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A8]], i8 [[B8]])
+; CHECK-NEXT: [[C9:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A9]], i8 [[B9]])
+; CHECK-NEXT: [[C10:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A10]], i8 [[B10]])
+; CHECK-NEXT: [[C11:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A11]], i8 [[B11]])
+; CHECK-NEXT: [[C12:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A12]], i8 [[B12]])
+; CHECK-NEXT: [[C13:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A13]], i8 [[B13]])
+; CHECK-NEXT: [[C14:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A14]], i8 [[B14]])
+; CHECK-NEXT: [[C15:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A15]], i8 [[B15]])
+; CHECK-NEXT: [[C16:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A16]], i8 [[B16]])
+; CHECK-NEXT: [[C17:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A17]], i8 [[B17]])
+; CHECK-NEXT: [[C18:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A18]], i8 [[B18]])
+; CHECK-NEXT: [[C19:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A19]], i8 [[B19]])
+; CHECK-NEXT: [[C20:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A20]], i8 [[B20]])
+; CHECK-NEXT: [[C21:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A21]], i8 [[B21]])
+; CHECK-NEXT: [[C22:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A22]], i8 [[B22]])
+; CHECK-NEXT: [[C23:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A23]], i8 [[B23]])
+; CHECK-NEXT: [[C24:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A24]], i8 [[B24]])
+; CHECK-NEXT: [[C25:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A25]], i8 [[B25]])
+; CHECK-NEXT: [[C26:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A26]], i8 [[B26]])
+; CHECK-NEXT: [[C27:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A27]], i8 [[B27]])
+; CHECK-NEXT: [[C28:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A28]], i8 [[B28]])
+; CHECK-NEXT: [[C29:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A29]], i8 [[B29]])
+; CHECK-NEXT: [[C30:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A30]], i8 [[B30]])
+; CHECK-NEXT: [[C31:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A31]], i8 [[B31]])
+; CHECK-NEXT: [[C32:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A32]], i8 [[B32]])
+; CHECK-NEXT: [[C33:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A33]], i8 [[B33]])
+; CHECK-NEXT: [[C34:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A34]], i8 [[B34]])
+; CHECK-NEXT: [[C35:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A35]], i8 [[B35]])
+; CHECK-NEXT: [[C36:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A36]], i8 [[B36]])
+; CHECK-NEXT: [[C37:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A37]], i8 [[B37]])
+; CHECK-NEXT: [[C38:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A38]], i8 [[B38]])
+; CHECK-NEXT: [[C39:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A39]], i8 [[B39]])
+; CHECK-NEXT: [[C40:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A40]], i8 [[B40]])
+; CHECK-NEXT: [[C41:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A41]], i8 [[B41]])
+; CHECK-NEXT: [[C42:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A42]], i8 [[B42]])
+; CHECK-NEXT: [[C43:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A43]], i8 [[B43]])
+; CHECK-NEXT: [[C44:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A44]], i8 [[B44]])
+; CHECK-NEXT: [[C45:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A45]], i8 [[B45]])
+; CHECK-NEXT: [[C46:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A46]], i8 [[B46]])
+; CHECK-NEXT: [[C47:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A47]], i8 [[B47]])
+; CHECK-NEXT: [[C48:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A48]], i8 [[B48]])
+; CHECK-NEXT: [[C49:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A49]], i8 [[B49]])
+; CHECK-NEXT: [[C50:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A50]], i8 [[B50]])
+; CHECK-NEXT: [[C51:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A51]], i8 [[B51]])
+; CHECK-NEXT: [[C52:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A52]], i8 [[B52]])
+; CHECK-NEXT: [[C53:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A53]], i8 [[B53]])
+; CHECK-NEXT: [[C54:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A54]], i8 [[B54]])
+; CHECK-NEXT: [[C55:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A55]], i8 [[B55]])
+; CHECK-NEXT: [[C56:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A56]], i8 [[B56]])
+; CHECK-NEXT: [[C57:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A57]], i8 [[B57]])
+; CHECK-NEXT: [[C58:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A58]], i8 [[B58]])
+; CHECK-NEXT: [[C59:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A59]], i8 [[B59]])
+; CHECK-NEXT: [[C60:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A60]], i8 [[B60]])
+; CHECK-NEXT: [[C61:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A61]], i8 [[B61]])
+; CHECK-NEXT: [[C62:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A62]], i8 [[B62]])
+; CHECK-NEXT: [[C63:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A63]], i8 [[B63]])
+; CHECK-NEXT: [[R0:%.*]] = extractvalue { i8, i1 } [[C0]], 0
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i8, i1 } [[C1]], 0
+; CHECK-NEXT: [[R2:%.*]] = extractvalue { i8, i1 } [[C2]], 0
+; CHECK-NEXT: [[R3:%.*]] = extractvalue { i8, i1 } [[C3]], 0
+; CHECK-NEXT: [[R4:%.*]] = extractvalue { i8, i1 } [[C4]], 0
+; CHECK-NEXT: [[R5:%.*]] = extractvalue { i8, i1 } [[C5]], 0
+; CHECK-NEXT: [[R6:%.*]] = extractvalue { i8, i1 } [[C6]], 0
+; CHECK-NEXT: [[R7:%.*]] = extractvalue { i8, i1 } [[C7]], 0
+; CHECK-NEXT: [[R8:%.*]] = extractvalue { i8, i1 } [[C8]], 0
+; CHECK-NEXT: [[R9:%.*]] = extractvalue { i8, i1 } [[C9]], 0
+; CHECK-NEXT: [[R10:%.*]] = extractvalue { i8, i1 } [[C10]], 0
+; CHECK-NEXT: [[R11:%.*]] = extractvalue { i8, i1 } [[C11]], 0
+; CHECK-NEXT: [[R12:%.*]] = extractvalue { i8, i1 } [[C12]], 0
+; CHECK-NEXT: [[R13:%.*]] = extractvalue { i8, i1 } [[C13]], 0
+; CHECK-NEXT: [[R14:%.*]] = extractvalue { i8, i1 } [[C14]], 0
+; CHECK-NEXT: [[R15:%.*]] = extractvalue { i8, i1 } [[C15]], 0
+; CHECK-NEXT: [[R16:%.*]] = extractvalue { i8, i1 } [[C16]], 0
+; CHECK-NEXT: [[R17:%.*]] = extractvalue { i8, i1 } [[C17]], 0
+; CHECK-NEXT: [[R18:%.*]] = extractvalue { i8, i1 } [[C18]], 0
+; CHECK-NEXT: [[R19:%.*]] = extractvalue { i8, i1 } [[C19]], 0
+; CHECK-NEXT: [[R20:%.*]] = extractvalue { i8, i1 } [[C20]], 0
+; CHECK-NEXT: [[R21:%.*]] = extractvalue { i8, i1 } [[C21]], 0
+; CHECK-NEXT: [[R22:%.*]] = extractvalue { i8, i1 } [[C22]], 0
+; CHECK-NEXT: [[R23:%.*]] = extractvalue { i8, i1 } [[C23]], 0
+; CHECK-NEXT: [[R24:%.*]] = extractvalue { i8, i1 } [[C24]], 0
+; CHECK-NEXT: [[R25:%.*]] = extractvalue { i8, i1 } [[C25]], 0
+; CHECK-NEXT: [[R26:%.*]] = extractvalue { i8, i1 } [[C26]], 0
+; CHECK-NEXT: [[R27:%.*]] = extractvalue { i8, i1 } [[C27]], 0
+; CHECK-NEXT: [[R28:%.*]] = extractvalue { i8, i1 } [[C28]], 0
+; CHECK-NEXT: [[R29:%.*]] = extractvalue { i8, i1 } [[C29]], 0
+; CHECK-NEXT: [[R30:%.*]] = extractvalue { i8, i1 } [[C30]], 0
+; CHECK-NEXT: [[R31:%.*]] = extractvalue { i8, i1 } [[C31]], 0
+; CHECK-NEXT: [[R32:%.*]] = extractvalue { i8, i1 } [[C32]], 0
+; CHECK-NEXT: [[R33:%.*]] = extractvalue { i8, i1 } [[C33]], 0
+; CHECK-NEXT: [[R34:%.*]] = extractvalue { i8, i1 } [[C34]], 0
+; CHECK-NEXT: [[R35:%.*]] = extractvalue { i8, i1 } [[C35]], 0
+; CHECK-NEXT: [[R36:%.*]] = extractvalue { i8, i1 } [[C36]], 0
+; CHECK-NEXT: [[R37:%.*]] = extractvalue { i8, i1 } [[C37]], 0
+; CHECK-NEXT: [[R38:%.*]] = extractvalue { i8, i1 } [[C38]], 0
+; CHECK-NEXT: [[R39:%.*]] = extractvalue { i8, i1 } [[C39]], 0
+; CHECK-NEXT: [[R40:%.*]] = extractvalue { i8, i1 } [[C40]], 0
+; CHECK-NEXT: [[R41:%.*]] = extractvalue { i8, i1 } [[C41]], 0
+; CHECK-NEXT: [[R42:%.*]] = extractvalue { i8, i1 } [[C42]], 0
+; CHECK-NEXT: [[R43:%.*]] = extractvalue { i8, i1 } [[C43]], 0
+; CHECK-NEXT: [[R44:%.*]] = extractvalue { i8, i1 } [[C44]], 0
+; CHECK-NEXT: [[R45:%.*]] = extractvalue { i8, i1 } [[C45]], 0
+; CHECK-NEXT: [[R46:%.*]] = extractvalue { i8, i1 } [[C46]], 0
+; CHECK-NEXT: [[R47:%.*]] = extractvalue { i8, i1 } [[C47]], 0
+; CHECK-NEXT: [[R48:%.*]] = extractvalue { i8, i1 } [[C48]], 0
+; CHECK-NEXT: [[R49:%.*]] = extractvalue { i8, i1 } [[C49]], 0
+; CHECK-NEXT: [[R50:%.*]] = extractvalue { i8, i1 } [[C50]], 0
+; CHECK-NEXT: [[R51:%.*]] = extractvalue { i8, i1 } [[C51]], 0
+; CHECK-NEXT: [[R52:%.*]] = extractvalue { i8, i1 } [[C52]], 0
+; CHECK-NEXT: [[R53:%.*]] = extractvalue { i8, i1 } [[C53]], 0
+; CHECK-NEXT: [[R54:%.*]] = extractvalue { i8, i1 } [[C54]], 0
+; CHECK-NEXT: [[R55:%.*]] = extractvalue { i8, i1 } [[C55]], 0
+; CHECK-NEXT: [[R56:%.*]] = extractvalue { i8, i1 } [[C56]], 0
+; CHECK-NEXT: [[R57:%.*]] = extractvalue { i8, i1 } [[C57]], 0
+; CHECK-NEXT: [[R58:%.*]] = extractvalue { i8, i1 } [[C58]], 0
+; CHECK-NEXT: [[R59:%.*]] = extractvalue { i8, i1 } [[C59]], 0
+; CHECK-NEXT: [[R60:%.*]] = extractvalue { i8, i1 } [[C60]], 0
+; CHECK-NEXT: [[R61:%.*]] = extractvalue { i8, i1 } [[C61]], 0
+; CHECK-NEXT: [[R62:%.*]] = extractvalue { i8, i1 } [[C62]], 0
+; CHECK-NEXT: [[R63:%.*]] = extractvalue { i8, i1 } [[C63]], 0
+; CHECK-NEXT: store i8 [[R0]], ptr @c8, align 1
+; CHECK-NEXT: store i8 [[R1]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 1), align 1
+; CHECK-NEXT: store i8 [[R2]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 2), align 1
+; CHECK-NEXT: store i8 [[R3]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 3), align 1
+; CHECK-NEXT: store i8 [[R4]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 4), align 1
+; CHECK-NEXT: store i8 [[R5]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 5), align 1
+; CHECK-NEXT: store i8 [[R6]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 6), align 1
+; CHECK-NEXT: store i8 [[R7]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 7), align 1
+; CHECK-NEXT: store i8 [[R8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 8), align 1
+; CHECK-NEXT: store i8 [[R9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 9), align 1
+; CHECK-NEXT: store i8 [[R10]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 10), align 1
+; CHECK-NEXT: store i8 [[R11]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 11), align 1
+; CHECK-NEXT: store i8 [[R12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 12), align 1
+; CHECK-NEXT: store i8 [[R13]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 13), align 1
+; CHECK-NEXT: store i8 [[R14]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 14), align 1
+; CHECK-NEXT: store i8 [[R15]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 15), align 1
+; CHECK-NEXT: store i8 [[R16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1
+; CHECK-NEXT: store i8 [[R17]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 17), align 1
+; CHECK-NEXT: store i8 [[R18]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 18), align 1
+; CHECK-NEXT: store i8 [[R19]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 19), align 1
+; CHECK-NEXT: store i8 [[R20]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 20), align 1
+; CHECK-NEXT: store i8 [[R21]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 21), align 1
+; CHECK-NEXT: store i8 [[R22]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 22), align 1
+; CHECK-NEXT: store i8 [[R23]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 23), align 1
+; CHECK-NEXT: store i8 [[R24]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 24), align 1
+; CHECK-NEXT: store i8 [[R25]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 25), align 1
+; CHECK-NEXT: store i8 [[R26]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 26), align 1
+; CHECK-NEXT: store i8 [[R27]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 27), align 1
+; CHECK-NEXT: store i8 [[R28]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 28), align 1
+; CHECK-NEXT: store i8 [[R29]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 29), align 1
+; CHECK-NEXT: store i8 [[R30]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 30), align 1
+; CHECK-NEXT: store i8 [[R31]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 31), align 1
+; CHECK-NEXT: store i8 [[R32]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
+; CHECK-NEXT: store i8 [[R33]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 33), align 1
+; CHECK-NEXT: store i8 [[R34]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 34), align 1
+; CHECK-NEXT: store i8 [[R35]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 35), align 1
+; CHECK-NEXT: store i8 [[R36]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 36), align 1
+; CHECK-NEXT: store i8 [[R37]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 37), align 1
+; CHECK-NEXT: store i8 [[R38]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 38), align 1
+; CHECK-NEXT: store i8 [[R39]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 39), align 1
+; CHECK-NEXT: store i8 [[R40]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 40), align 1
+; CHECK-NEXT: store i8 [[R41]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 41), align 1
+; CHECK-NEXT: store i8 [[R42]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 42), align 1
+; CHECK-NEXT: store i8 [[R43]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 43), align 1
+; CHECK-NEXT: store i8 [[R44]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 44), align 1
+; CHECK-NEXT: store i8 [[R45]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 45), align 1
+; CHECK-NEXT: store i8 [[R46]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 46), align 1
+; CHECK-NEXT: store i8 [[R47]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 47), align 1
+; CHECK-NEXT: store i8 [[R48]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
+; CHECK-NEXT: store i8 [[R49]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 49), align 1
+; CHECK-NEXT: store i8 [[R50]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 50), align 1
+; CHECK-NEXT: store i8 [[R51]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 51), align 1
+; CHECK-NEXT: store i8 [[R52]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 52), align 1
+; CHECK-NEXT: store i8 [[R53]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 53), align 1
+; CHECK-NEXT: store i8 [[R54]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 54), align 1
+; CHECK-NEXT: store i8 [[R55]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 55), align 1
+; CHECK-NEXT: store i8 [[R56]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 56), align 1
+; CHECK-NEXT: store i8 [[R57]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 57), align 1
+; CHECK-NEXT: store i8 [[R58]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 58), align 1
+; CHECK-NEXT: store i8 [[R59]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 59), align 1
+; CHECK-NEXT: store i8 [[R60]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 60), align 1
+; CHECK-NEXT: store i8 [[R61]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 61), align 1
+; CHECK-NEXT: store i8 [[R62]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 62), align 1
+; CHECK-NEXT: store i8 [[R63]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 63), align 1
+; CHECK-NEXT: ret void
;
%a0 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 0 ), align 1
%a1 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 1 ), align 1
@@ -1064,5 +1252,3 @@ define void @mul_v64i8() {
store i8 %r63, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 63), align 1
ret void
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK: {{.*}}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssubo.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssubo.ll
index fa1ed4dd49c8d..d628dddd16cb1 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssubo.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssubo.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,KNL
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512_256
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s
@a64 = common global [8 x i64] zeroinitializer, align 64
@b64 = common global [8 x i64] zeroinitializer, align 64
@@ -26,146 +26,48 @@ declare {i16, i1} @llvm.ssub.with.overflow.i16(i16, i16)
declare {i8 , i1} @llvm.ssub.with.overflow.i8 (i8 , i8 )
define void @sub_v8i64() {
-; SSE-LABEL: @sub_v8i64(
-; SSE-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8
-; SSE-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
-; SSE-NEXT: [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
-; SSE-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8
-; SSE-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
-; SSE-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8
-; SSE-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
-; SSE-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8
-; SSE-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8
-; SSE-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8
-; SSE-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
-; SSE-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8
-; SSE-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
-; SSE-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8
-; SSE-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
-; SSE-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8
-; SSE-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A0]], i64 [[B0]])
-; SSE-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A1]], i64 [[B1]])
-; SSE-NEXT: [[C2:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A2]], i64 [[B2]])
-; SSE-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A3]], i64 [[B3]])
-; SSE-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A4]], i64 [[B4]])
-; SSE-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A5]], i64 [[B5]])
-; SSE-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A6]], i64 [[B6]])
-; SSE-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A7]], i64 [[B7]])
-; SSE-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0
-; SSE-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0
-; SSE-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0
-; SSE-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0
-; SSE-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0
-; SSE-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0
-; SSE-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0
-; SSE-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0
-; SSE-NEXT: store i64 [[R0]], ptr @c64, align 8
-; SSE-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8
-; SSE-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
-; SSE-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8
-; SSE-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
-; SSE-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8
-; SSE-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
-; SSE-NEXT: store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8
-; SSE-NEXT: ret void
-;
-; SLM-LABEL: @sub_v8i64(
-; SLM-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8
-; SLM-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
-; SLM-NEXT: [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
-; SLM-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8
-; SLM-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
-; SLM-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8
-; SLM-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
-; SLM-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8
-; SLM-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8
-; SLM-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8
-; SLM-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
-; SLM-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8
-; SLM-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
-; SLM-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8
-; SLM-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
-; SLM-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8
-; SLM-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A0]], i64 [[B0]])
-; SLM-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A1]], i64 [[B1]])
-; SLM-NEXT: [[C2:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A2]], i64 [[B2]])
-; SLM-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A3]], i64 [[B3]])
-; SLM-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A4]], i64 [[B4]])
-; SLM-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A5]], i64 [[B5]])
-; SLM-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A6]], i64 [[B6]])
-; SLM-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A7]], i64 [[B7]])
-; SLM-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0
-; SLM-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0
-; SLM-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0
-; SLM-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0
-; SLM-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0
-; SLM-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0
-; SLM-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0
-; SLM-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0
-; SLM-NEXT: store i64 [[R0]], ptr @c64, align 8
-; SLM-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8
-; SLM-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
-; SLM-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8
-; SLM-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
-; SLM-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8
-; SLM-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
-; SLM-NEXT: store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8
-; SLM-NEXT: ret void
-;
-; AVX-LABEL: @sub_v8i64(
-; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8
-; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8
-; AVX-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]])
-; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0
-; AVX-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8
-; AVX-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
-; AVX-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
-; AVX-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]])
-; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0
-; AVX-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
-; AVX-NEXT: ret void
-;
-; AVX2-LABEL: @sub_v8i64(
-; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8
-; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8
-; AVX2-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]])
-; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0
-; AVX2-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8
-; AVX2-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
-; AVX2-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
-; AVX2-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]])
-; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0
-; AVX2-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
-; AVX2-NEXT: ret void
-;
-; KNL-LABEL: @sub_v8i64(
-; KNL-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8
-; KNL-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8
-; KNL-NEXT: [[TMP3:%.*]] = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]])
-; KNL-NEXT: [[TMP18:%.*]] = extractvalue { <8 x i64>, <8 x i1> } [[TMP3]], 0
-; KNL-NEXT: store <8 x i64> [[TMP18]], ptr @c64, align 8
-; KNL-NEXT: ret void
-;
-; AVX512-LABEL: @sub_v8i64(
-; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8
-; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8
-; AVX512-NEXT: [[TMP3:%.*]] = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]])
-; AVX512-NEXT: [[TMP18:%.*]] = extractvalue { <8 x i64>, <8 x i1> } [[TMP3]], 0
-; AVX512-NEXT: store <8 x i64> [[TMP18]], ptr @c64, align 8
-; AVX512-NEXT: ret void
-;
-; AVX512_256-LABEL: @sub_v8i64(
-; AVX512_256-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8
-; AVX512_256-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8
-; AVX512_256-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]])
-; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0
-; AVX512_256-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8
-; AVX512_256-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
-; AVX512_256-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
-; AVX512_256-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]])
-; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0
-; AVX512_256-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
-; AVX512_256-NEXT: ret void
+; CHECK-LABEL: @sub_v8i64(
+; CHECK-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8
+; CHECK-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
+; CHECK-NEXT: [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
+; CHECK-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8
+; CHECK-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
+; CHECK-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8
+; CHECK-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
+; CHECK-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8
+; CHECK-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8
+; CHECK-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8
+; CHECK-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
+; CHECK-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8
+; CHECK-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
+; CHECK-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8
+; CHECK-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
+; CHECK-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8
+; CHECK-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A0]], i64 [[B0]])
+; CHECK-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A1]], i64 [[B1]])
+; CHECK-NEXT: [[C2:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A2]], i64 [[B2]])
+; CHECK-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A3]], i64 [[B3]])
+; CHECK-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A4]], i64 [[B4]])
+; CHECK-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A5]], i64 [[B5]])
+; CHECK-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A6]], i64 [[B6]])
+; CHECK-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A7]], i64 [[B7]])
+; CHECK-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0
+; CHECK-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0
+; CHECK-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0
+; CHECK-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0
+; CHECK-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0
+; CHECK-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0
+; CHECK-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0
+; CHECK-NEXT: store i64 [[R0]], ptr @c64, align 8
+; CHECK-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8
+; CHECK-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
+; CHECK-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8
+; CHECK-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
+; CHECK-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8
+; CHECK-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
+; CHECK-NEXT: store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8
+; CHECK-NEXT: ret void
;
%a0 = load i64, ptr @a64, align 8
%a1 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
@@ -211,106 +113,88 @@ define void @sub_v8i64() {
}
define void @sub_v16i32() {
-; SSE-LABEL: @sub_v16i32(
-; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @a32, align 4
-; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @b32, align 4
-; SSE-NEXT: [[TMP3:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
-; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP3]], 0
-; SSE-NEXT: store <4 x i32> [[TMP4]], ptr @c32, align 4
-; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4
-; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4
-; SSE-NEXT: [[TMP7:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> [[TMP5]], <4 x i32> [[TMP6]])
-; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP7]], 0
-; SSE-NEXT: store <4 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4
-; SSE-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
-; SSE-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
-; SSE-NEXT: [[TMP11:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]])
-; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP11]], 0
-; SSE-NEXT: store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4
-; SSE-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4
-; SSE-NEXT: [[TMP14:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4
-; SSE-NEXT: [[TMP15:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> [[TMP13]], <4 x i32> [[TMP14]])
-; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP15]], 0
-; SSE-NEXT: store <4 x i32> [[TMP16]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4
-; SSE-NEXT: ret void
-;
-; SLM-LABEL: @sub_v16i32(
-; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @a32, align 4
-; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @b32, align 4
-; SLM-NEXT: [[TMP3:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
-; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP3]], 0
-; SLM-NEXT: store <4 x i32> [[TMP4]], ptr @c32, align 4
-; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4
-; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4
-; SLM-NEXT: [[TMP7:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> [[TMP5]], <4 x i32> [[TMP6]])
-; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP7]], 0
-; SLM-NEXT: store <4 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4
-; SLM-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
-; SLM-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
-; SLM-NEXT: [[TMP11:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]])
-; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP11]], 0
-; SLM-NEXT: store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4
-; SLM-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4
-; SLM-NEXT: [[TMP14:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4
-; SLM-NEXT: [[TMP15:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> [[TMP13]], <4 x i32> [[TMP14]])
-; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP15]], 0
-; SLM-NEXT: store <4 x i32> [[TMP16]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4
-; SLM-NEXT: ret void
-;
-; AVX-LABEL: @sub_v16i32(
-; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4
-; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4
-; AVX-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]])
-; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0
-; AVX-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4
-; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
-; AVX-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
-; AVX-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]])
-; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0
-; AVX-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4
-; AVX-NEXT: ret void
-;
-; AVX2-LABEL: @sub_v16i32(
-; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4
-; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4
-; AVX2-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]])
-; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0
-; AVX2-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4
-; AVX2-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
-; AVX2-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
-; AVX2-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]])
-; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0
-; AVX2-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4
-; AVX2-NEXT: ret void
-;
-; KNL-LABEL: @sub_v16i32(
-; KNL-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4
-; KNL-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4
-; KNL-NEXT: [[TMP3:%.*]] = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]])
-; KNL-NEXT: [[TMP34:%.*]] = extractvalue { <16 x i32>, <16 x i1> } [[TMP3]], 0
-; KNL-NEXT: store <16 x i32> [[TMP34]], ptr @c32, align 4
-; KNL-NEXT: ret void
-;
-; AVX512-LABEL: @sub_v16i32(
-; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4
-; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4
-; AVX512-NEXT: [[TMP3:%.*]] = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]])
-; AVX512-NEXT: [[TMP34:%.*]] = extractvalue { <16 x i32>, <16 x i1> } [[TMP3]], 0
-; AVX512-NEXT: store <16 x i32> [[TMP34]], ptr @c32, align 4
-; AVX512-NEXT: ret void
-;
-; AVX512_256-LABEL: @sub_v16i32(
-; AVX512_256-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4
-; AVX512_256-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4
-; AVX512_256-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]])
-; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0
-; AVX512_256-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4
-; AVX512_256-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
-; AVX512_256-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
-; AVX512_256-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]])
-; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0
-; AVX512_256-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4
-; AVX512_256-NEXT: ret void
+; CHECK-LABEL: @sub_v16i32(
+; CHECK-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4
+; CHECK-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4
+; CHECK-NEXT: [[A2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 2), align 4
+; CHECK-NEXT: [[A3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 3), align 4
+; CHECK-NEXT: [[A4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4
+; CHECK-NEXT: [[A5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 5), align 4
+; CHECK-NEXT: [[A6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 6), align 4
+; CHECK-NEXT: [[A7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 7), align 4
+; CHECK-NEXT: [[A8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
+; CHECK-NEXT: [[A9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 9), align 4
+; CHECK-NEXT: [[A10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 10), align 4
+; CHECK-NEXT: [[A11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 11), align 4
+; CHECK-NEXT: [[A12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4
+; CHECK-NEXT: [[A13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 13), align 4
+; CHECK-NEXT: [[A14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 14), align 4
+; CHECK-NEXT: [[A15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 15), align 4
+; CHECK-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4
+; CHECK-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4
+; CHECK-NEXT: [[B2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 2), align 4
+; CHECK-NEXT: [[B3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 3), align 4
+; CHECK-NEXT: [[B4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4
+; CHECK-NEXT: [[B5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 5), align 4
+; CHECK-NEXT: [[B6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 6), align 4
+; CHECK-NEXT: [[B7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 7), align 4
+; CHECK-NEXT: [[B8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
+; CHECK-NEXT: [[B9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 9), align 4
+; CHECK-NEXT: [[B10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 10), align 4
+; CHECK-NEXT: [[B11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 11), align 4
+; CHECK-NEXT: [[B12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4
+; CHECK-NEXT: [[B13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 13), align 4
+; CHECK-NEXT: [[B14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 14), align 4
+; CHECK-NEXT: [[B15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 15), align 4
+; CHECK-NEXT: [[C0:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A0]], i32 [[B0]])
+; CHECK-NEXT: [[C1:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A1]], i32 [[B1]])
+; CHECK-NEXT: [[C2:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A2]], i32 [[B2]])
+; CHECK-NEXT: [[C3:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A3]], i32 [[B3]])
+; CHECK-NEXT: [[C4:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A4]], i32 [[B4]])
+; CHECK-NEXT: [[C5:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A5]], i32 [[B5]])
+; CHECK-NEXT: [[C6:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A6]], i32 [[B6]])
+; CHECK-NEXT: [[C7:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A7]], i32 [[B7]])
+; CHECK-NEXT: [[C8:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A8]], i32 [[B8]])
+; CHECK-NEXT: [[C9:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A9]], i32 [[B9]])
+; CHECK-NEXT: [[C10:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A10]], i32 [[B10]])
+; CHECK-NEXT: [[C11:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A11]], i32 [[B11]])
+; CHECK-NEXT: [[C12:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A12]], i32 [[B12]])
+; CHECK-NEXT: [[C13:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A13]], i32 [[B13]])
+; CHECK-NEXT: [[C14:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A14]], i32 [[B14]])
+; CHECK-NEXT: [[C15:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A15]], i32 [[B15]])
+; CHECK-NEXT: [[R0:%.*]] = extractvalue { i32, i1 } [[C0]], 0
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[C1]], 0
+; CHECK-NEXT: [[R2:%.*]] = extractvalue { i32, i1 } [[C2]], 0
+; CHECK-NEXT: [[R3:%.*]] = extractvalue { i32, i1 } [[C3]], 0
+; CHECK-NEXT: [[R4:%.*]] = extractvalue { i32, i1 } [[C4]], 0
+; CHECK-NEXT: [[R5:%.*]] = extractvalue { i32, i1 } [[C5]], 0
+; CHECK-NEXT: [[R6:%.*]] = extractvalue { i32, i1 } [[C6]], 0
+; CHECK-NEXT: [[R7:%.*]] = extractvalue { i32, i1 } [[C7]], 0
+; CHECK-NEXT: [[R8:%.*]] = extractvalue { i32, i1 } [[C8]], 0
+; CHECK-NEXT: [[R9:%.*]] = extractvalue { i32, i1 } [[C9]], 0
+; CHECK-NEXT: [[R10:%.*]] = extractvalue { i32, i1 } [[C10]], 0
+; CHECK-NEXT: [[R11:%.*]] = extractvalue { i32, i1 } [[C11]], 0
+; CHECK-NEXT: [[R12:%.*]] = extractvalue { i32, i1 } [[C12]], 0
+; CHECK-NEXT: [[R13:%.*]] = extractvalue { i32, i1 } [[C13]], 0
+; CHECK-NEXT: [[R14:%.*]] = extractvalue { i32, i1 } [[C14]], 0
+; CHECK-NEXT: [[R15:%.*]] = extractvalue { i32, i1 } [[C15]], 0
+; CHECK-NEXT: store i32 [[R0]], ptr @c32, align 4
+; CHECK-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 1), align 4
+; CHECK-NEXT: store i32 [[R2]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 2), align 4
+; CHECK-NEXT: store i32 [[R3]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 3), align 4
+; CHECK-NEXT: store i32 [[R4]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4
+; CHECK-NEXT: store i32 [[R5]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 5), align 4
+; CHECK-NEXT: store i32 [[R6]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 6), align 4
+; CHECK-NEXT: store i32 [[R7]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 7), align 4
+; CHECK-NEXT: store i32 [[R8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4
+; CHECK-NEXT: store i32 [[R9]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 9), align 4
+; CHECK-NEXT: store i32 [[R10]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 10), align 4
+; CHECK-NEXT: store i32 [[R11]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 11), align 4
+; CHECK-NEXT: store i32 [[R12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4
+; CHECK-NEXT: store i32 [[R13]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 13), align 4
+; CHECK-NEXT: store i32 [[R14]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 14), align 4
+; CHECK-NEXT: store i32 [[R15]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 15), align 4
+; CHECK-NEXT: ret void
;
%a0 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 0 ), align 4
%a1 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1 ), align 4
@@ -396,106 +280,168 @@ define void @sub_v16i32() {
}
define void @sub_v32i16() {
-; SSE-LABEL: @sub_v32i16(
-; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2
-; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2
-; SSE-NEXT: [[TMP3:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
-; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP3]], 0
-; SSE-NEXT: store <8 x i16> [[TMP4]], ptr @c16, align 2
-; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2
-; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2
-; SSE-NEXT: [[TMP7:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> [[TMP5]], <8 x i16> [[TMP6]])
-; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP7]], 0
-; SSE-NEXT: store <8 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2
-; SSE-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
-; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
-; SSE-NEXT: [[TMP11:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]])
-; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP11]], 0
-; SSE-NEXT: store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2
-; SSE-NEXT: [[TMP13:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2
-; SSE-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2
-; SSE-NEXT: [[TMP15:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> [[TMP13]], <8 x i16> [[TMP14]])
-; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP15]], 0
-; SSE-NEXT: store <8 x i16> [[TMP16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2
-; SSE-NEXT: ret void
-;
-; SLM-LABEL: @sub_v32i16(
-; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2
-; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2
-; SLM-NEXT: [[TMP3:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
-; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP3]], 0
-; SLM-NEXT: store <8 x i16> [[TMP4]], ptr @c16, align 2
-; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2
-; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2
-; SLM-NEXT: [[TMP7:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> [[TMP5]], <8 x i16> [[TMP6]])
-; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP7]], 0
-; SLM-NEXT: store <8 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2
-; SLM-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
-; SLM-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
-; SLM-NEXT: [[TMP11:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]])
-; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP11]], 0
-; SLM-NEXT: store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2
-; SLM-NEXT: [[TMP13:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2
-; SLM-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2
-; SLM-NEXT: [[TMP15:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> [[TMP13]], <8 x i16> [[TMP14]])
-; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP15]], 0
-; SLM-NEXT: store <8 x i16> [[TMP16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2
-; SLM-NEXT: ret void
-;
-; AVX-LABEL: @sub_v32i16(
-; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2
-; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2
-; AVX-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]])
-; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0
-; AVX-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2
-; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
-; AVX-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
-; AVX-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]])
-; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0
-; AVX-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2
-; AVX-NEXT: ret void
-;
-; AVX2-LABEL: @sub_v32i16(
-; AVX2-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2
-; AVX2-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2
-; AVX2-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]])
-; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0
-; AVX2-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2
-; AVX2-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
-; AVX2-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
-; AVX2-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]])
-; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0
-; AVX2-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2
-; AVX2-NEXT: ret void
-;
-; KNL-LABEL: @sub_v32i16(
-; KNL-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2
-; KNL-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2
-; KNL-NEXT: [[TMP3:%.*]] = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]])
-; KNL-NEXT: [[TMP66:%.*]] = extractvalue { <32 x i16>, <32 x i1> } [[TMP3]], 0
-; KNL-NEXT: store <32 x i16> [[TMP66]], ptr @c16, align 2
-; KNL-NEXT: ret void
-;
-; AVX512-LABEL: @sub_v32i16(
-; AVX512-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2
-; AVX512-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2
-; AVX512-NEXT: [[TMP3:%.*]] = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]])
-; AVX512-NEXT: [[TMP66:%.*]] = extractvalue { <32 x i16>, <32 x i1> } [[TMP3]], 0
-; AVX512-NEXT: store <32 x i16> [[TMP66]], ptr @c16, align 2
-; AVX512-NEXT: ret void
-;
-; AVX512_256-LABEL: @sub_v32i16(
-; AVX512_256-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2
-; AVX512_256-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2
-; AVX512_256-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]])
-; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0
-; AVX512_256-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2
-; AVX512_256-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
-; AVX512_256-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
-; AVX512_256-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]])
-; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0
-; AVX512_256-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2
-; AVX512_256-NEXT: ret void
+; CHECK-LABEL: @sub_v32i16(
+; CHECK-NEXT: [[A0:%.*]] = load i16, ptr @a16, align 2
+; CHECK-NEXT: [[A1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1), align 2
+; CHECK-NEXT: [[A2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 2), align 2
+; CHECK-NEXT: [[A3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 3), align 2
+; CHECK-NEXT: [[A4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 4), align 2
+; CHECK-NEXT: [[A5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 5), align 2
+; CHECK-NEXT: [[A6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 6), align 2
+; CHECK-NEXT: [[A7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 7), align 2
+; CHECK-NEXT: [[A8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2
+; CHECK-NEXT: [[A9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 9), align 2
+; CHECK-NEXT: [[A10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 10), align 2
+; CHECK-NEXT: [[A11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 11), align 2
+; CHECK-NEXT: [[A12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 12), align 2
+; CHECK-NEXT: [[A13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 13), align 2
+; CHECK-NEXT: [[A14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 14), align 2
+; CHECK-NEXT: [[A15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 15), align 2
+; CHECK-NEXT: [[A16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
+; CHECK-NEXT: [[A17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 17), align 2
+; CHECK-NEXT: [[A18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 18), align 2
+; CHECK-NEXT: [[A19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 19), align 2
+; CHECK-NEXT: [[A20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 20), align 2
+; CHECK-NEXT: [[A21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 21), align 2
+; CHECK-NEXT: [[A22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 22), align 2
+; CHECK-NEXT: [[A23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 23), align 2
+; CHECK-NEXT: [[A24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2
+; CHECK-NEXT: [[A25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 25), align 2
+; CHECK-NEXT: [[A26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 26), align 2
+; CHECK-NEXT: [[A27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 27), align 2
+; CHECK-NEXT: [[A28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 28), align 2
+; CHECK-NEXT: [[A29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 29), align 2
+; CHECK-NEXT: [[A30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 30), align 2
+; CHECK-NEXT: [[A31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 31), align 2
+; CHECK-NEXT: [[B0:%.*]] = load i16, ptr @b16, align 2
+; CHECK-NEXT: [[B1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 1), align 2
+; CHECK-NEXT: [[B2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 2), align 2
+; CHECK-NEXT: [[B3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 3), align 2
+; CHECK-NEXT: [[B4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 4), align 2
+; CHECK-NEXT: [[B5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 5), align 2
+; CHECK-NEXT: [[B6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 6), align 2
+; CHECK-NEXT: [[B7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 7), align 2
+; CHECK-NEXT: [[B8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2
+; CHECK-NEXT: [[B9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 9), align 2
+; CHECK-NEXT: [[B10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 10), align 2
+; CHECK-NEXT: [[B11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 11), align 2
+; CHECK-NEXT: [[B12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 12), align 2
+; CHECK-NEXT: [[B13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 13), align 2
+; CHECK-NEXT: [[B14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 14), align 2
+; CHECK-NEXT: [[B15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 15), align 2
+; CHECK-NEXT: [[B16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
+; CHECK-NEXT: [[B17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 17), align 2
+; CHECK-NEXT: [[B18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 18), align 2
+; CHECK-NEXT: [[B19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 19), align 2
+; CHECK-NEXT: [[B20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 20), align 2
+; CHECK-NEXT: [[B21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 21), align 2
+; CHECK-NEXT: [[B22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 22), align 2
+; CHECK-NEXT: [[B23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 23), align 2
+; CHECK-NEXT: [[B24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2
+; CHECK-NEXT: [[B25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 25), align 2
+; CHECK-NEXT: [[B26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 26), align 2
+; CHECK-NEXT: [[B27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 27), align 2
+; CHECK-NEXT: [[B28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 28), align 2
+; CHECK-NEXT: [[B29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 29), align 2
+; CHECK-NEXT: [[B30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 30), align 2
+; CHECK-NEXT: [[B31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 31), align 2
+; CHECK-NEXT: [[C0:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A0]], i16 [[B0]])
+; CHECK-NEXT: [[C1:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A1]], i16 [[B1]])
+; CHECK-NEXT: [[C2:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A2]], i16 [[B2]])
+; CHECK-NEXT: [[C3:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A3]], i16 [[B3]])
+; CHECK-NEXT: [[C4:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A4]], i16 [[B4]])
+; CHECK-NEXT: [[C5:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A5]], i16 [[B5]])
+; CHECK-NEXT: [[C6:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A6]], i16 [[B6]])
+; CHECK-NEXT: [[C7:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A7]], i16 [[B7]])
+; CHECK-NEXT: [[C8:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A8]], i16 [[B8]])
+; CHECK-NEXT: [[C9:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A9]], i16 [[B9]])
+; CHECK-NEXT: [[C10:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A10]], i16 [[B10]])
+; CHECK-NEXT: [[C11:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A11]], i16 [[B11]])
+; CHECK-NEXT: [[C12:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A12]], i16 [[B12]])
+; CHECK-NEXT: [[C13:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A13]], i16 [[B13]])
+; CHECK-NEXT: [[C14:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A14]], i16 [[B14]])
+; CHECK-NEXT: [[C15:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A15]], i16 [[B15]])
+; CHECK-NEXT: [[C16:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A16]], i16 [[B16]])
+; CHECK-NEXT: [[C17:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A17]], i16 [[B17]])
+; CHECK-NEXT: [[C18:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A18]], i16 [[B18]])
+; CHECK-NEXT: [[C19:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A19]], i16 [[B19]])
+; CHECK-NEXT: [[C20:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A20]], i16 [[B20]])
+; CHECK-NEXT: [[C21:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A21]], i16 [[B21]])
+; CHECK-NEXT: [[C22:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A22]], i16 [[B22]])
+; CHECK-NEXT: [[C23:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A23]], i16 [[B23]])
+; CHECK-NEXT: [[C24:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A24]], i16 [[B24]])
+; CHECK-NEXT: [[C25:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A25]], i16 [[B25]])
+; CHECK-NEXT: [[C26:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A26]], i16 [[B26]])
+; CHECK-NEXT: [[C27:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A27]], i16 [[B27]])
+; CHECK-NEXT: [[C28:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A28]], i16 [[B28]])
+; CHECK-NEXT: [[C29:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A29]], i16 [[B29]])
+; CHECK-NEXT: [[C30:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A30]], i16 [[B30]])
+; CHECK-NEXT: [[C31:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A31]], i16 [[B31]])
+; CHECK-NEXT: [[R0:%.*]] = extractvalue { i16, i1 } [[C0]], 0
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i16, i1 } [[C1]], 0
+; CHECK-NEXT: [[R2:%.*]] = extractvalue { i16, i1 } [[C2]], 0
+; CHECK-NEXT: [[R3:%.*]] = extractvalue { i16, i1 } [[C3]], 0
+; CHECK-NEXT: [[R4:%.*]] = extractvalue { i16, i1 } [[C4]], 0
+; CHECK-NEXT: [[R5:%.*]] = extractvalue { i16, i1 } [[C5]], 0
+; CHECK-NEXT: [[R6:%.*]] = extractvalue { i16, i1 } [[C6]], 0
+; CHECK-NEXT: [[R7:%.*]] = extractvalue { i16, i1 } [[C7]], 0
+; CHECK-NEXT: [[R8:%.*]] = extractvalue { i16, i1 } [[C8]], 0
+; CHECK-NEXT: [[R9:%.*]] = extractvalue { i16, i1 } [[C9]], 0
+; CHECK-NEXT: [[R10:%.*]] = extractvalue { i16, i1 } [[C10]], 0
+; CHECK-NEXT: [[R11:%.*]] = extractvalue { i16, i1 } [[C11]], 0
+; CHECK-NEXT: [[R12:%.*]] = extractvalue { i16, i1 } [[C12]], 0
+; CHECK-NEXT: [[R13:%.*]] = extractvalue { i16, i1 } [[C13]], 0
+; CHECK-NEXT: [[R14:%.*]] = extractvalue { i16, i1 } [[C14]], 0
+; CHECK-NEXT: [[R15:%.*]] = extractvalue { i16, i1 } [[C15]], 0
+; CHECK-NEXT: [[R16:%.*]] = extractvalue { i16, i1 } [[C16]], 0
+; CHECK-NEXT: [[R17:%.*]] = extractvalue { i16, i1 } [[C17]], 0
+; CHECK-NEXT: [[R18:%.*]] = extractvalue { i16, i1 } [[C18]], 0
+; CHECK-NEXT: [[R19:%.*]] = extractvalue { i16, i1 } [[C19]], 0
+; CHECK-NEXT: [[R20:%.*]] = extractvalue { i16, i1 } [[C20]], 0
+; CHECK-NEXT: [[R21:%.*]] = extractvalue { i16, i1 } [[C21]], 0
+; CHECK-NEXT: [[R22:%.*]] = extractvalue { i16, i1 } [[C22]], 0
+; CHECK-NEXT: [[R23:%.*]] = extractvalue { i16, i1 } [[C23]], 0
+; CHECK-NEXT: [[R24:%.*]] = extractvalue { i16, i1 } [[C24]], 0
+; CHECK-NEXT: [[R25:%.*]] = extractvalue { i16, i1 } [[C25]], 0
+; CHECK-NEXT: [[R26:%.*]] = extractvalue { i16, i1 } [[C26]], 0
+; CHECK-NEXT: [[R27:%.*]] = extractvalue { i16, i1 } [[C27]], 0
+; CHECK-NEXT: [[R28:%.*]] = extractvalue { i16, i1 } [[C28]], 0
+; CHECK-NEXT: [[R29:%.*]] = extractvalue { i16, i1 } [[C29]], 0
+; CHECK-NEXT: [[R30:%.*]] = extractvalue { i16, i1 } [[C30]], 0
+; CHECK-NEXT: [[R31:%.*]] = extractvalue { i16, i1 } [[C31]], 0
+; CHECK-NEXT: store i16 [[R0]], ptr @c16, align 2
+; CHECK-NEXT: store i16 [[R1]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 1), align 2
+; CHECK-NEXT: store i16 [[R2]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 2), align 2
+; CHECK-NEXT: store i16 [[R3]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 3), align 2
+; CHECK-NEXT: store i16 [[R4]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 4), align 2
+; CHECK-NEXT: store i16 [[R5]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 5), align 2
+; CHECK-NEXT: store i16 [[R6]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 6), align 2
+; CHECK-NEXT: store i16 [[R7]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 7), align 2
+; CHECK-NEXT: store i16 [[R8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2
+; CHECK-NEXT: store i16 [[R9]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 9), align 2
+; CHECK-NEXT: store i16 [[R10]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 10), align 2
+; CHECK-NEXT: store i16 [[R11]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 11), align 2
+; CHECK-NEXT: store i16 [[R12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 12), align 2
+; CHECK-NEXT: store i16 [[R13]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 13), align 2
+; CHECK-NEXT: store i16 [[R14]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 14), align 2
+; CHECK-NEXT: store i16 [[R15]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 15), align 2
+; CHECK-NEXT: store i16 [[R16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2
+; CHECK-NEXT: store i16 [[R17]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 17), align 2
+; CHECK-NEXT: store i16 [[R18]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 18), align 2
+; CHECK-NEXT: store i16 [[R19]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 19), align 2
+; CHECK-NEXT: store i16 [[R20]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 20), align 2
+; CHECK-NEXT: store i16 [[R21]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 21), align 2
+; CHECK-NEXT: store i16 [[R22]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 22), align 2
+; CHECK-NEXT: store i16 [[R23]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 23), align 2
+; CHECK-NEXT: store i16 [[R24]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2
+; CHECK-NEXT: store i16 [[R25]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 25), align 2
+; CHECK-NEXT: store i16 [[R26]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 26), align 2
+; CHECK-NEXT: store i16 [[R27]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 27), align 2
+; CHECK-NEXT: store i16 [[R28]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 28), align 2
+; CHECK-NEXT: store i16 [[R29]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 29), align 2
+; CHECK-NEXT: store i16 [[R30]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 30), align 2
+; CHECK-NEXT: store i16 [[R31]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 31), align 2
+; CHECK-NEXT: ret void
;
%a0 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 0 ), align 2
%a1 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1 ), align 2
@@ -661,106 +607,328 @@ define void @sub_v32i16() {
}
define void @sub_v64i8() {
-; SSE-LABEL: @sub_v64i8(
-; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @a8, align 1
-; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1
-; SSE-NEXT: [[TMP3:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
-; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP3]], 0
-; SSE-NEXT: store <16 x i8> [[TMP4]], ptr @c8, align 1
-; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1
-; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1
-; SSE-NEXT: [[TMP7:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> [[TMP5]], <16 x i8> [[TMP6]])
-; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP7]], 0
-; SSE-NEXT: store <16 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1
-; SSE-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
-; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
-; SSE-NEXT: [[TMP11:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> [[TMP9]], <16 x i8> [[TMP10]])
-; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP11]], 0
-; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
-; SSE-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
-; SSE-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
-; SSE-NEXT: [[TMP15:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> [[TMP13]], <16 x i8> [[TMP14]])
-; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP15]], 0
-; SSE-NEXT: store <16 x i8> [[TMP16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
-; SSE-NEXT: ret void
-;
-; SLM-LABEL: @sub_v64i8(
-; SLM-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @a8, align 1
-; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1
-; SLM-NEXT: [[TMP3:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
-; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP3]], 0
-; SLM-NEXT: store <16 x i8> [[TMP4]], ptr @c8, align 1
-; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1
-; SLM-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1
-; SLM-NEXT: [[TMP7:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> [[TMP5]], <16 x i8> [[TMP6]])
-; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP7]], 0
-; SLM-NEXT: store <16 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1
-; SLM-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
-; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
-; SLM-NEXT: [[TMP11:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> [[TMP9]], <16 x i8> [[TMP10]])
-; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP11]], 0
-; SLM-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
-; SLM-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
-; SLM-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
-; SLM-NEXT: [[TMP15:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> [[TMP13]], <16 x i8> [[TMP14]])
-; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP15]], 0
-; SLM-NEXT: store <16 x i8> [[TMP16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
-; SLM-NEXT: ret void
-;
-; AVX-LABEL: @sub_v64i8(
-; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1
-; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1
-; AVX-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]])
-; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0
-; AVX-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1
-; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
-; AVX-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
-; AVX-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]])
-; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0
-; AVX-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
-; AVX-NEXT: ret void
-;
-; AVX2-LABEL: @sub_v64i8(
-; AVX2-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1
-; AVX2-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1
-; AVX2-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]])
-; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0
-; AVX2-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1
-; AVX2-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
-; AVX2-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
-; AVX2-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]])
-; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0
-; AVX2-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
-; AVX2-NEXT: ret void
-;
-; KNL-LABEL: @sub_v64i8(
-; KNL-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @a8, align 1
-; KNL-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @b8, align 1
-; KNL-NEXT: [[TMP3:%.*]] = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]])
-; KNL-NEXT: [[TMP130:%.*]] = extractvalue { <64 x i8>, <64 x i1> } [[TMP3]], 0
-; KNL-NEXT: store <64 x i8> [[TMP130]], ptr @c8, align 1
-; KNL-NEXT: ret void
-;
-; AVX512-LABEL: @sub_v64i8(
-; AVX512-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @a8, align 1
-; AVX512-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @b8, align 1
-; AVX512-NEXT: [[TMP3:%.*]] = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]])
-; AVX512-NEXT: [[TMP130:%.*]] = extractvalue { <64 x i8>, <64 x i1> } [[TMP3]], 0
-; AVX512-NEXT: store <64 x i8> [[TMP130]], ptr @c8, align 1
-; AVX512-NEXT: ret void
-;
-; AVX512_256-LABEL: @sub_v64i8(
-; AVX512_256-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1
-; AVX512_256-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1
-; AVX512_256-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]])
-; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0
-; AVX512_256-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1
-; AVX512_256-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
-; AVX512_256-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
-; AVX512_256-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]])
-; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0
-; AVX512_256-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
-; AVX512_256-NEXT: ret void
+; CHECK-LABEL: @sub_v64i8(
+; CHECK-NEXT: [[A0:%.*]] = load i8, ptr @a8, align 1
+; CHECK-NEXT: [[A1:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 1), align 1
+; CHECK-NEXT: [[A2:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 2), align 1
+; CHECK-NEXT: [[A3:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 3), align 1
+; CHECK-NEXT: [[A4:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 4), align 1
+; CHECK-NEXT: [[A5:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 5), align 1
+; CHECK-NEXT: [[A6:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 6), align 1
+; CHECK-NEXT: [[A7:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 7), align 1
+; CHECK-NEXT: [[A8:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 8), align 1
+; CHECK-NEXT: [[A9:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 9), align 1
+; CHECK-NEXT: [[A10:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 10), align 1
+; CHECK-NEXT: [[A11:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 11), align 1
+; CHECK-NEXT: [[A12:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 12), align 1
+; CHECK-NEXT: [[A13:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 13), align 1
+; CHECK-NEXT: [[A14:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 14), align 1
+; CHECK-NEXT: [[A15:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 15), align 1
+; CHECK-NEXT: [[A16:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1
+; CHECK-NEXT: [[A17:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 17), align 1
+; CHECK-NEXT: [[A18:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 18), align 1
+; CHECK-NEXT: [[A19:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 19), align 1
+; CHECK-NEXT: [[A20:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 20), align 1
+; CHECK-NEXT: [[A21:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 21), align 1
+; CHECK-NEXT: [[A22:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 22), align 1
+; CHECK-NEXT: [[A23:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 23), align 1
+; CHECK-NEXT: [[A24:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 24), align 1
+; CHECK-NEXT: [[A25:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 25), align 1
+; CHECK-NEXT: [[A26:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 26), align 1
+; CHECK-NEXT: [[A27:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 27), align 1
+; CHECK-NEXT: [[A28:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 28), align 1
+; CHECK-NEXT: [[A29:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 29), align 1
+; CHECK-NEXT: [[A30:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 30), align 1
+; CHECK-NEXT: [[A31:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 31), align 1
+; CHECK-NEXT: [[A32:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
+; CHECK-NEXT: [[A33:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 33), align 1
+; CHECK-NEXT: [[A34:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 34), align 1
+; CHECK-NEXT: [[A35:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 35), align 1
+; CHECK-NEXT: [[A36:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 36), align 1
+; CHECK-NEXT: [[A37:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 37), align 1
+; CHECK-NEXT: [[A38:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 38), align 1
+; CHECK-NEXT: [[A39:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 39), align 1
+; CHECK-NEXT: [[A40:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 40), align 1
+; CHECK-NEXT: [[A41:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 41), align 1
+; CHECK-NEXT: [[A42:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 42), align 1
+; CHECK-NEXT: [[A43:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 43), align 1
+; CHECK-NEXT: [[A44:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 44), align 1
+; CHECK-NEXT: [[A45:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 45), align 1
+; CHECK-NEXT: [[A46:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 46), align 1
+; CHECK-NEXT: [[A47:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 47), align 1
+; CHECK-NEXT: [[A48:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
+; CHECK-NEXT: [[A49:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 49), align 1
+; CHECK-NEXT: [[A50:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 50), align 1
+; CHECK-NEXT: [[A51:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 51), align 1
+; CHECK-NEXT: [[A52:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 52), align 1
+; CHECK-NEXT: [[A53:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 53), align 1
+; CHECK-NEXT: [[A54:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 54), align 1
+; CHECK-NEXT: [[A55:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 55), align 1
+; CHECK-NEXT: [[A56:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 56), align 1
+; CHECK-NEXT: [[A57:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 57), align 1
+; CHECK-NEXT: [[A58:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 58), align 1
+; CHECK-NEXT: [[A59:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 59), align 1
+; CHECK-NEXT: [[A60:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 60), align 1
+; CHECK-NEXT: [[A61:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 61), align 1
+; CHECK-NEXT: [[A62:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 62), align 1
+; CHECK-NEXT: [[A63:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 63), align 1
+; CHECK-NEXT: [[B0:%.*]] = load i8, ptr @b8, align 1
+; CHECK-NEXT: [[B1:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 1), align 1
+; CHECK-NEXT: [[B2:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 2), align 1
+; CHECK-NEXT: [[B3:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 3), align 1
+; CHECK-NEXT: [[B4:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 4), align 1
+; CHECK-NEXT: [[B5:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 5), align 1
+; CHECK-NEXT: [[B6:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 6), align 1
+; CHECK-NEXT: [[B7:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 7), align 1
+; CHECK-NEXT: [[B8:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 8), align 1
+; CHECK-NEXT: [[B9:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 9), align 1
+; CHECK-NEXT: [[B10:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 10), align 1
+; CHECK-NEXT: [[B11:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 11), align 1
+; CHECK-NEXT: [[B12:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 12), align 1
+; CHECK-NEXT: [[B13:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 13), align 1
+; CHECK-NEXT: [[B14:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 14), align 1
+; CHECK-NEXT: [[B15:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 15), align 1
+; CHECK-NEXT: [[B16:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1
+; CHECK-NEXT: [[B17:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 17), align 1
+; CHECK-NEXT: [[B18:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 18), align 1
+; CHECK-NEXT: [[B19:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 19), align 1
+; CHECK-NEXT: [[B20:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 20), align 1
+; CHECK-NEXT: [[B21:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 21), align 1
+; CHECK-NEXT: [[B22:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 22), align 1
+; CHECK-NEXT: [[B23:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 23), align 1
+; CHECK-NEXT: [[B24:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 24), align 1
+; CHECK-NEXT: [[B25:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 25), align 1
+; CHECK-NEXT: [[B26:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 26), align 1
+; CHECK-NEXT: [[B27:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 27), align 1
+; CHECK-NEXT: [[B28:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 28), align 1
+; CHECK-NEXT: [[B29:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 29), align 1
+; CHECK-NEXT: [[B30:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 30), align 1
+; CHECK-NEXT: [[B31:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 31), align 1
+; CHECK-NEXT: [[B32:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
+; CHECK-NEXT: [[B33:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 33), align 1
+; CHECK-NEXT: [[B34:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 34), align 1
+; CHECK-NEXT: [[B35:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 35), align 1
+; CHECK-NEXT: [[B36:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 36), align 1
+; CHECK-NEXT: [[B37:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 37), align 1
+; CHECK-NEXT: [[B38:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 38), align 1
+; CHECK-NEXT: [[B39:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 39), align 1
+; CHECK-NEXT: [[B40:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 40), align 1
+; CHECK-NEXT: [[B41:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 41), align 1
+; CHECK-NEXT: [[B42:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 42), align 1
+; CHECK-NEXT: [[B43:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 43), align 1
+; CHECK-NEXT: [[B44:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 44), align 1
+; CHECK-NEXT: [[B45:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 45), align 1
+; CHECK-NEXT: [[B46:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 46), align 1
+; CHECK-NEXT: [[B47:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 47), align 1
+; CHECK-NEXT: [[B48:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
+; CHECK-NEXT: [[B49:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 49), align 1
+; CHECK-NEXT: [[B50:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 50), align 1
+; CHECK-NEXT: [[B51:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 51), align 1
+; CHECK-NEXT: [[B52:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 52), align 1
+; CHECK-NEXT: [[B53:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 53), align 1
+; CHECK-NEXT: [[B54:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 54), align 1
+; CHECK-NEXT: [[B55:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 55), align 1
+; CHECK-NEXT: [[B56:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 56), align 1
+; CHECK-NEXT: [[B57:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 57), align 1
+; CHECK-NEXT: [[B58:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 58), align 1
+; CHECK-NEXT: [[B59:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 59), align 1
+; CHECK-NEXT: [[B60:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 60), align 1
+; CHECK-NEXT: [[B61:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 61), align 1
+; CHECK-NEXT: [[B62:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 62), align 1
+; CHECK-NEXT: [[B63:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 63), align 1
+; CHECK-NEXT: [[C0:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A0]], i8 [[B0]])
+; CHECK-NEXT: [[C1:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A1]], i8 [[B1]])
+; CHECK-NEXT: [[C2:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A2]], i8 [[B2]])
+; CHECK-NEXT: [[C3:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A3]], i8 [[B3]])
+; CHECK-NEXT: [[C4:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A4]], i8 [[B4]])
+; CHECK-NEXT: [[C5:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A5]], i8 [[B5]])
+; CHECK-NEXT: [[C6:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A6]], i8 [[B6]])
+; CHECK-NEXT: [[C7:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A7]], i8 [[B7]])
+; CHECK-NEXT: [[C8:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A8]], i8 [[B8]])
+; CHECK-NEXT: [[C9:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A9]], i8 [[B9]])
+; CHECK-NEXT: [[C10:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A10]], i8 [[B10]])
+; CHECK-NEXT: [[C11:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A11]], i8 [[B11]])
+; CHECK-NEXT: [[C12:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A12]], i8 [[B12]])
+; CHECK-NEXT: [[C13:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A13]], i8 [[B13]])
+; CHECK-NEXT: [[C14:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A14]], i8 [[B14]])
+; CHECK-NEXT: [[C15:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A15]], i8 [[B15]])
+; CHECK-NEXT: [[C16:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A16]], i8 [[B16]])
+; CHECK-NEXT: [[C17:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A17]], i8 [[B17]])
+; CHECK-NEXT: [[C18:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A18]], i8 [[B18]])
+; CHECK-NEXT: [[C19:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A19]], i8 [[B19]])
+; CHECK-NEXT: [[C20:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A20]], i8 [[B20]])
+; CHECK-NEXT: [[C21:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A21]], i8 [[B21]])
+; CHECK-NEXT: [[C22:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A22]], i8 [[B22]])
+; CHECK-NEXT: [[C23:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A23]], i8 [[B23]])
+; CHECK-NEXT: [[C24:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A24]], i8 [[B24]])
+; CHECK-NEXT: [[C25:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A25]], i8 [[B25]])
+; CHECK-NEXT: [[C26:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A26]], i8 [[B26]])
+; CHECK-NEXT: [[C27:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A27]], i8 [[B27]])
+; CHECK-NEXT: [[C28:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A28]], i8 [[B28]])
+; CHECK-NEXT: [[C29:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A29]], i8 [[B29]])
+; CHECK-NEXT: [[C30:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A30]], i8 [[B30]])
+; CHECK-NEXT: [[C31:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A31]], i8 [[B31]])
+; CHECK-NEXT: [[C32:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A32]], i8 [[B32]])
+; CHECK-NEXT: [[C33:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A33]], i8 [[B33]])
+; CHECK-NEXT: [[C34:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A34]], i8 [[B34]])
+; CHECK-NEXT: [[C35:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A35]], i8 [[B35]])
+; CHECK-NEXT: [[C36:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A36]], i8 [[B36]])
+; CHECK-NEXT: [[C37:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A37]], i8 [[B37]])
+; CHECK-NEXT: [[C38:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A38]], i8 [[B38]])
+; CHECK-NEXT: [[C39:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A39]], i8 [[B39]])
+; CHECK-NEXT: [[C40:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A40]], i8 [[B40]])
+; CHECK-NEXT: [[C41:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A41]], i8 [[B41]])
+; CHECK-NEXT: [[C42:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A42]], i8 [[B42]])
+; CHECK-NEXT: [[C43:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A43]], i8 [[B43]])
+; CHECK-NEXT: [[C44:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A44]], i8 [[B44]])
+; CHECK-NEXT: [[C45:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A45]], i8 [[B45]])
+; CHECK-NEXT: [[C46:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A46]], i8 [[B46]])
+; CHECK-NEXT: [[C47:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A47]], i8 [[B47]])
+; CHECK-NEXT: [[C48:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A48]], i8 [[B48]])
+; CHECK-NEXT: [[C49:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A49]], i8 [[B49]])
+; CHECK-NEXT: [[C50:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A50]], i8 [[B50]])
+; CHECK-NEXT: [[C51:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A51]], i8 [[B51]])
+; CHECK-NEXT: [[C52:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A52]], i8 [[B52]])
+; CHECK-NEXT: [[C53:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A53]], i8 [[B53]])
+; CHECK-NEXT: [[C54:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A54]], i8 [[B54]])
+; CHECK-NEXT: [[C55:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A55]], i8 [[B55]])
+; CHECK-NEXT: [[C56:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A56]], i8 [[B56]])
+; CHECK-NEXT: [[C57:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A57]], i8 [[B57]])
+; CHECK-NEXT: [[C58:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A58]], i8 [[B58]])
+; CHECK-NEXT: [[C59:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A59]], i8 [[B59]])
+; CHECK-NEXT: [[C60:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A60]], i8 [[B60]])
+; CHECK-NEXT: [[C61:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A61]], i8 [[B61]])
+; CHECK-NEXT: [[C62:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A62]], i8 [[B62]])
+; CHECK-NEXT: [[C63:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A63]], i8 [[B63]])
+; CHECK-NEXT: [[R0:%.*]] = extractvalue { i8, i1 } [[C0]], 0
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i8, i1 } [[C1]], 0
+; CHECK-NEXT: [[R2:%.*]] = extractvalue { i8, i1 } [[C2]], 0
+; CHECK-NEXT: [[R3:%.*]] = extractvalue { i8, i1 } [[C3]], 0
+; CHECK-NEXT: [[R4:%.*]] = extractvalue { i8, i1 } [[C4]], 0
+; CHECK-NEXT: [[R5:%.*]] = extractvalue { i8, i1 } [[C5]], 0
+; CHECK-NEXT: [[R6:%.*]] = extractvalue { i8, i1 } [[C6]], 0
+; CHECK-NEXT: [[R7:%.*]] = extractvalue { i8, i1 } [[C7]], 0
+; CHECK-NEXT: [[R8:%.*]] = extractvalue { i8, i1 } [[C8]], 0
+; CHECK-NEXT: [[R9:%.*]] = extractvalue { i8, i1 } [[C9]], 0
+; CHECK-NEXT: [[R10:%.*]] = extractvalue { i8, i1 } [[C10]], 0
+; CHECK-NEXT: [[R11:%.*]] = extractvalue { i8, i1 } [[C11]], 0
+; CHECK-NEXT: [[R12:%.*]] = extractvalue { i8, i1 } [[C12]], 0
+; CHECK-NEXT: [[R13:%.*]] = extractvalue { i8, i1 } [[C13]], 0
+; CHECK-NEXT: [[R14:%.*]] = extractvalue { i8, i1 } [[C14]], 0
+; CHECK-NEXT: [[R15:%.*]] = extractvalue { i8, i1 } [[C15]], 0
+; CHECK-NEXT: [[R16:%.*]] = extractvalue { i8, i1 } [[C16]], 0
+; CHECK-NEXT: [[R17:%.*]] = extractvalue { i8, i1 } [[C17]], 0
+; CHECK-NEXT: [[R18:%.*]] = extractvalue { i8, i1 } [[C18]], 0
+; CHECK-NEXT: [[R19:%.*]] = extractvalue { i8, i1 } [[C19]], 0
+; CHECK-NEXT: [[R20:%.*]] = extractvalue { i8, i1 } [[C20]], 0
+; CHECK-NEXT: [[R21:%.*]] = extractvalue { i8, i1 } [[C21]], 0
+; CHECK-NEXT: [[R22:%.*]] = extractvalue { i8, i1 } [[C22]], 0
+; CHECK-NEXT: [[R23:%.*]] = extractvalue { i8, i1 } [[C23]], 0
+; CHECK-NEXT: [[R24:%.*]] = extractvalue { i8, i1 } [[C24]], 0
+; CHECK-NEXT: [[R25:%.*]] = extractvalue { i8, i1 } [[C25]], 0
+; CHECK-NEXT: [[R26:%.*]] = extractvalue { i8, i1 } [[C26]], 0
+; CHECK-NEXT: [[R27:%.*]] = extractvalue { i8, i1 } [[C27]], 0
+; CHECK-NEXT: [[R28:%.*]] = extractvalue { i8, i1 } [[C28]], 0
+; CHECK-NEXT: [[R29:%.*]] = extractvalue { i8, i1 } [[C29]], 0
+; CHECK-NEXT: [[R30:%.*]] = extractvalue { i8, i1 } [[C30]], 0
+; CHECK-NEXT: [[R31:%.*]] = extractvalue { i8, i1 } [[C31]], 0
+; CHECK-NEXT: [[R32:%.*]] = extractvalue { i8, i1 } [[C32]], 0
+; CHECK-NEXT: [[R33:%.*]] = extractvalue { i8, i1 } [[C33]], 0
+; CHECK-NEXT: [[R34:%.*]] = extractvalue { i8, i1 } [[C34]], 0
+; CHECK-NEXT: [[R35:%.*]] = extractvalue { i8, i1 } [[C35]], 0
+; CHECK-NEXT: [[R36:%.*]] = extractvalue { i8, i1 } [[C36]], 0
+; CHECK-NEXT: [[R37:%.*]] = extractvalue { i8, i1 } [[C37]], 0
+; CHECK-NEXT: [[R38:%.*]] = extractvalue { i8, i1 } [[C38]], 0
+; CHECK-NEXT: [[R39:%.*]] = extractvalue { i8, i1 } [[C39]], 0
+; CHECK-NEXT: [[R40:%.*]] = extractvalue { i8, i1 } [[C40]], 0
+; CHECK-NEXT: [[R41:%.*]] = extractvalue { i8, i1 } [[C41]], 0
+; CHECK-NEXT: [[R42:%.*]] = extractvalue { i8, i1 } [[C42]], 0
+; CHECK-NEXT: [[R43:%.*]] = extractvalue { i8, i1 } [[C43]], 0
+; CHECK-NEXT: [[R44:%.*]] = extractvalue { i8, i1 } [[C44]], 0
+; CHECK-NEXT: [[R45:%.*]] = extractvalue { i8, i1 } [[C45]], 0
+; CHECK-NEXT: [[R46:%.*]] = extractvalue { i8, i1 } [[C46]], 0
+; CHECK-NEXT: [[R47:%.*]] = extractvalue { i8, i1 } [[C47]], 0
+; CHECK-NEXT: [[R48:%.*]] = extractvalue { i8, i1 } [[C48]], 0
+; CHECK-NEXT: [[R49:%.*]] = extractvalue { i8, i1 } [[C49]], 0
+; CHECK-NEXT: [[R50:%.*]] = extractvalue { i8, i1 } [[C50]], 0
+; CHECK-NEXT: [[R51:%.*]] = extractvalue { i8, i1 } [[C51]], 0
+; CHECK-NEXT: [[R52:%.*]] = extractvalue { i8, i1 } [[C52]], 0
+; CHECK-NEXT: [[R53:%.*]] = extractvalue { i8, i1 } [[C53]], 0
+; CHECK-NEXT: [[R54:%.*]] = extractvalue { i8, i1 } [[C54]], 0
+; CHECK-NEXT: [[R55:%.*]] = extractvalue { i8, i1 } [[C55]], 0
+; CHECK-NEXT: [[R56:%.*]] = extractvalue { i8, i1 } [[C56]], 0
+; CHECK-NEXT: [[R57:%.*]] = extractvalue { i8, i1 } [[C57]], 0
+; CHECK-NEXT: [[R58:%.*]] = extractvalue { i8, i1 } [[C58]], 0
+; CHECK-NEXT: [[R59:%.*]] = extractvalue { i8, i1 } [[C59]], 0
+; CHECK-NEXT: [[R60:%.*]] = extractvalue { i8, i1 } [[C60]], 0
+; CHECK-NEXT: [[R61:%.*]] = extractvalue { i8, i1 } [[C61]], 0
+; CHECK-NEXT: [[R62:%.*]] = extractvalue { i8, i1 } [[C62]], 0
+; CHECK-NEXT: [[R63:%.*]] = extractvalue { i8, i1 } [[C63]], 0
+; CHECK-NEXT: store i8 [[R0]], ptr @c8, align 1
+; CHECK-NEXT: store i8 [[R1]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 1), align 1
+; CHECK-NEXT: store i8 [[R2]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 2), align 1
+; CHECK-NEXT: store i8 [[R3]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 3), align 1
+; CHECK-NEXT: store i8 [[R4]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 4), align 1
+; CHECK-NEXT: store i8 [[R5]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 5), align 1
+; CHECK-NEXT: store i8 [[R6]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 6), align 1
+; CHECK-NEXT: store i8 [[R7]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 7), align 1
+; CHECK-NEXT: store i8 [[R8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 8), align 1
+; CHECK-NEXT: store i8 [[R9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 9), align 1
+; CHECK-NEXT: store i8 [[R10]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 10), align 1
+; CHECK-NEXT: store i8 [[R11]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 11), align 1
+; CHECK-NEXT: store i8 [[R12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 12), align 1
+; CHECK-NEXT: store i8 [[R13]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 13), align 1
+; CHECK-NEXT: store i8 [[R14]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 14), align 1
+; CHECK-NEXT: store i8 [[R15]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 15), align 1
+; CHECK-NEXT: store i8 [[R16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1
+; CHECK-NEXT: store i8 [[R17]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 17), align 1
+; CHECK-NEXT: store i8 [[R18]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 18), align 1
+; CHECK-NEXT: store i8 [[R19]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 19), align 1
+; CHECK-NEXT: store i8 [[R20]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 20), align 1
+; CHECK-NEXT: store i8 [[R21]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 21), align 1
+; CHECK-NEXT: store i8 [[R22]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 22), align 1
+; CHECK-NEXT: store i8 [[R23]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 23), align 1
+; CHECK-NEXT: store i8 [[R24]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 24), align 1
+; CHECK-NEXT: store i8 [[R25]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 25), align 1
+; CHECK-NEXT: store i8 [[R26]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 26), align 1
+; CHECK-NEXT: store i8 [[R27]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 27), align 1
+; CHECK-NEXT: store i8 [[R28]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 28), align 1
+; CHECK-NEXT: store i8 [[R29]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 29), align 1
+; CHECK-NEXT: store i8 [[R30]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 30), align 1
+; CHECK-NEXT: store i8 [[R31]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 31), align 1
+; CHECK-NEXT: store i8 [[R32]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
+; CHECK-NEXT: store i8 [[R33]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 33), align 1
+; CHECK-NEXT: store i8 [[R34]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 34), align 1
+; CHECK-NEXT: store i8 [[R35]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 35), align 1
+; CHECK-NEXT: store i8 [[R36]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 36), align 1
+; CHECK-NEXT: store i8 [[R37]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 37), align 1
+; CHECK-NEXT: store i8 [[R38]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 38), align 1
+; CHECK-NEXT: store i8 [[R39]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 39), align 1
+; CHECK-NEXT: store i8 [[R40]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 40), align 1
+; CHECK-NEXT: store i8 [[R41]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 41), align 1
+; CHECK-NEXT: store i8 [[R42]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 42), align 1
+; CHECK-NEXT: store i8 [[R43]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 43), align 1
+; CHECK-NEXT: store i8 [[R44]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 44), align 1
+; CHECK-NEXT: store i8 [[R45]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 45), align 1
+; CHECK-NEXT: store i8 [[R46]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 46), align 1
+; CHECK-NEXT: store i8 [[R47]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 47), align 1
+; CHECK-NEXT: store i8 [[R48]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
+; CHECK-NEXT: store i8 [[R49]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 49), align 1
+; CHECK-NEXT: store i8 [[R50]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 50), align 1
+; CHECK-NEXT: store i8 [[R51]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 51), align 1
+; CHECK-NEXT: store i8 [[R52]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 52), align 1
+; CHECK-NEXT: store i8 [[R53]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 53), align 1
+; CHECK-NEXT: store i8 [[R54]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 54), align 1
+; CHECK-NEXT: store i8 [[R55]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 55), align 1
+; CHECK-NEXT: store i8 [[R56]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 56), align 1
+; CHECK-NEXT: store i8 [[R57]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 57), align 1
+; CHECK-NEXT: store i8 [[R58]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 58), align 1
+; CHECK-NEXT: store i8 [[R59]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 59), align 1
+; CHECK-NEXT: store i8 [[R60]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 60), align 1
+; CHECK-NEXT: store i8 [[R61]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 61), align 1
+; CHECK-NEXT: store i8 [[R62]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 62), align 1
+; CHECK-NEXT: store i8 [[R63]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 63), align 1
+; CHECK-NEXT: ret void
;
%a0 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 0 ), align 1
%a1 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 1 ), align 1
@@ -1084,5 +1252,3 @@ define void @sub_v64i8() {
store i8 %r63, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 63), align 1
ret void
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK: {{.*}}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usubo.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usubo.ll
index 9c683eacc7062..11a68a5dfbcca 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usubo.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usubo.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,KNL
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512_256
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s
@a64 = common global [8 x i64] zeroinitializer, align 64
@b64 = common global [8 x i64] zeroinitializer, align 64
@@ -26,146 +26,48 @@ declare {i16, i1} @llvm.usub.with.overflow.i16(i16, i16)
declare {i8 , i1} @llvm.usub.with.overflow.i8 (i8 , i8 )
define void @sub_v8i64() {
-; SSE-LABEL: @sub_v8i64(
-; SSE-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8
-; SSE-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
-; SSE-NEXT: [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
-; SSE-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8
-; SSE-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
-; SSE-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8
-; SSE-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
-; SSE-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8
-; SSE-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8
-; SSE-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8
-; SSE-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
-; SSE-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8
-; SSE-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
-; SSE-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8
-; SSE-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
-; SSE-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8
-; SSE-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A0]], i64 [[B0]])
-; SSE-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A1]], i64 [[B1]])
-; SSE-NEXT: [[C2:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A2]], i64 [[B2]])
-; SSE-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A3]], i64 [[B3]])
-; SSE-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A4]], i64 [[B4]])
-; SSE-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A5]], i64 [[B5]])
-; SSE-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A6]], i64 [[B6]])
-; SSE-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A7]], i64 [[B7]])
-; SSE-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0
-; SSE-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0
-; SSE-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0
-; SSE-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0
-; SSE-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0
-; SSE-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0
-; SSE-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0
-; SSE-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0
-; SSE-NEXT: store i64 [[R0]], ptr @c64, align 8
-; SSE-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8
-; SSE-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
-; SSE-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8
-; SSE-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
-; SSE-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8
-; SSE-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
-; SSE-NEXT: store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8
-; SSE-NEXT: ret void
-;
-; SLM-LABEL: @sub_v8i64(
-; SLM-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8
-; SLM-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
-; SLM-NEXT: [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
-; SLM-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8
-; SLM-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
-; SLM-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8
-; SLM-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
-; SLM-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8
-; SLM-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8
-; SLM-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8
-; SLM-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
-; SLM-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8
-; SLM-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
-; SLM-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8
-; SLM-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
-; SLM-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8
-; SLM-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A0]], i64 [[B0]])
-; SLM-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A1]], i64 [[B1]])
-; SLM-NEXT: [[C2:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A2]], i64 [[B2]])
-; SLM-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A3]], i64 [[B3]])
-; SLM-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A4]], i64 [[B4]])
-; SLM-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A5]], i64 [[B5]])
-; SLM-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A6]], i64 [[B6]])
-; SLM-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A7]], i64 [[B7]])
-; SLM-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0
-; SLM-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0
-; SLM-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0
-; SLM-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0
-; SLM-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0
-; SLM-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0
-; SLM-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0
-; SLM-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0
-; SLM-NEXT: store i64 [[R0]], ptr @c64, align 8
-; SLM-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8
-; SLM-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
-; SLM-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8
-; SLM-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
-; SLM-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8
-; SLM-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
-; SLM-NEXT: store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8
-; SLM-NEXT: ret void
-;
-; AVX-LABEL: @sub_v8i64(
-; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8
-; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8
-; AVX-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]])
-; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0
-; AVX-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8
-; AVX-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
-; AVX-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
-; AVX-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]])
-; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0
-; AVX-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
-; AVX-NEXT: ret void
-;
-; AVX2-LABEL: @sub_v8i64(
-; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8
-; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8
-; AVX2-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]])
-; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0
-; AVX2-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8
-; AVX2-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
-; AVX2-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
-; AVX2-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]])
-; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0
-; AVX2-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
-; AVX2-NEXT: ret void
-;
-; KNL-LABEL: @sub_v8i64(
-; KNL-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8
-; KNL-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8
-; KNL-NEXT: [[TMP3:%.*]] = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]])
-; KNL-NEXT: [[TMP18:%.*]] = extractvalue { <8 x i64>, <8 x i1> } [[TMP3]], 0
-; KNL-NEXT: store <8 x i64> [[TMP18]], ptr @c64, align 8
-; KNL-NEXT: ret void
-;
-; AVX512-LABEL: @sub_v8i64(
-; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8
-; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8
-; AVX512-NEXT: [[TMP3:%.*]] = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]])
-; AVX512-NEXT: [[TMP18:%.*]] = extractvalue { <8 x i64>, <8 x i1> } [[TMP3]], 0
-; AVX512-NEXT: store <8 x i64> [[TMP18]], ptr @c64, align 8
-; AVX512-NEXT: ret void
-;
-; AVX512_256-LABEL: @sub_v8i64(
-; AVX512_256-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @a64, align 8
-; AVX512_256-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @b64, align 8
-; AVX512_256-NEXT: [[TMP3:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]])
-; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP3]], 0
-; AVX512_256-NEXT: store <4 x i64> [[TMP4]], ptr @c64, align 8
-; AVX512_256-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
-; AVX512_256-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
-; AVX512_256-NEXT: [[TMP7:%.*]] = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> [[TMP5]], <4 x i64> [[TMP6]])
-; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i64>, <4 x i1> } [[TMP7]], 0
-; AVX512_256-NEXT: store <4 x i64> [[TMP8]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
-; AVX512_256-NEXT: ret void
+; CHECK-LABEL: @sub_v8i64(
+; CHECK-NEXT: [[A0:%.*]] = load i64, ptr @a64, align 8
+; CHECK-NEXT: [[A1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
+; CHECK-NEXT: [[A2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 2), align 8
+; CHECK-NEXT: [[A3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 3), align 8
+; CHECK-NEXT: [[A4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 4), align 8
+; CHECK-NEXT: [[A5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 5), align 8
+; CHECK-NEXT: [[A6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 6), align 8
+; CHECK-NEXT: [[A7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 7), align 8
+; CHECK-NEXT: [[B0:%.*]] = load i64, ptr @b64, align 8
+; CHECK-NEXT: [[B1:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 1), align 8
+; CHECK-NEXT: [[B2:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 2), align 8
+; CHECK-NEXT: [[B3:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 3), align 8
+; CHECK-NEXT: [[B4:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 4), align 8
+; CHECK-NEXT: [[B5:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 5), align 8
+; CHECK-NEXT: [[B6:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 6), align 8
+; CHECK-NEXT: [[B7:%.*]] = load i64, ptr getelementptr inbounds ([8 x i64], ptr @b64, i32 0, i64 7), align 8
+; CHECK-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A0]], i64 [[B0]])
+; CHECK-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A1]], i64 [[B1]])
+; CHECK-NEXT: [[C2:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A2]], i64 [[B2]])
+; CHECK-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A3]], i64 [[B3]])
+; CHECK-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A4]], i64 [[B4]])
+; CHECK-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A5]], i64 [[B5]])
+; CHECK-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A6]], i64 [[B6]])
+; CHECK-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A7]], i64 [[B7]])
+; CHECK-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0
+; CHECK-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0
+; CHECK-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0
+; CHECK-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0
+; CHECK-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0
+; CHECK-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0
+; CHECK-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0
+; CHECK-NEXT: store i64 [[R0]], ptr @c64, align 8
+; CHECK-NEXT: store i64 [[R1]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 1), align 8
+; CHECK-NEXT: store i64 [[R2]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 2), align 8
+; CHECK-NEXT: store i64 [[R3]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 3), align 8
+; CHECK-NEXT: store i64 [[R4]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 4), align 8
+; CHECK-NEXT: store i64 [[R5]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 5), align 8
+; CHECK-NEXT: store i64 [[R6]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 6), align 8
+; CHECK-NEXT: store i64 [[R7]], ptr getelementptr inbounds ([8 x i64], ptr @c64, i32 0, i64 7), align 8
+; CHECK-NEXT: ret void
;
%a0 = load i64, ptr @a64, align 8
%a1 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
@@ -211,106 +113,88 @@ define void @sub_v8i64() {
}
define void @sub_v16i32() {
-; SSE-LABEL: @sub_v16i32(
-; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @a32, align 4
-; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @b32, align 4
-; SSE-NEXT: [[TMP3:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
-; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP3]], 0
-; SSE-NEXT: store <4 x i32> [[TMP4]], ptr @c32, align 4
-; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4
-; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4
-; SSE-NEXT: [[TMP7:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> [[TMP5]], <4 x i32> [[TMP6]])
-; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP7]], 0
-; SSE-NEXT: store <4 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4
-; SSE-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
-; SSE-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
-; SSE-NEXT: [[TMP11:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]])
-; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP11]], 0
-; SSE-NEXT: store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4
-; SSE-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4
-; SSE-NEXT: [[TMP14:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4
-; SSE-NEXT: [[TMP15:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> [[TMP13]], <4 x i32> [[TMP14]])
-; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP15]], 0
-; SSE-NEXT: store <4 x i32> [[TMP16]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4
-; SSE-NEXT: ret void
-;
-; SLM-LABEL: @sub_v16i32(
-; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @a32, align 4
-; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @b32, align 4
-; SLM-NEXT: [[TMP3:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
-; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP3]], 0
-; SLM-NEXT: store <4 x i32> [[TMP4]], ptr @c32, align 4
-; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4
-; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4
-; SLM-NEXT: [[TMP7:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> [[TMP5]], <4 x i32> [[TMP6]])
-; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP7]], 0
-; SLM-NEXT: store <4 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4
-; SLM-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
-; SLM-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
-; SLM-NEXT: [[TMP11:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]])
-; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP11]], 0
-; SLM-NEXT: store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4
-; SLM-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4
-; SLM-NEXT: [[TMP14:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4
-; SLM-NEXT: [[TMP15:%.*]] = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> [[TMP13]], <4 x i32> [[TMP14]])
-; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[TMP15]], 0
-; SLM-NEXT: store <4 x i32> [[TMP16]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4
-; SLM-NEXT: ret void
-;
-; AVX-LABEL: @sub_v16i32(
-; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4
-; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4
-; AVX-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]])
-; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0
-; AVX-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4
-; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
-; AVX-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
-; AVX-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]])
-; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0
-; AVX-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4
-; AVX-NEXT: ret void
-;
-; AVX2-LABEL: @sub_v16i32(
-; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4
-; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4
-; AVX2-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]])
-; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0
-; AVX2-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4
-; AVX2-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
-; AVX2-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
-; AVX2-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]])
-; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0
-; AVX2-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4
-; AVX2-NEXT: ret void
-;
-; KNL-LABEL: @sub_v16i32(
-; KNL-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4
-; KNL-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4
-; KNL-NEXT: [[TMP3:%.*]] = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]])
-; KNL-NEXT: [[TMP34:%.*]] = extractvalue { <16 x i32>, <16 x i1> } [[TMP3]], 0
-; KNL-NEXT: store <16 x i32> [[TMP34]], ptr @c32, align 4
-; KNL-NEXT: ret void
-;
-; AVX512-LABEL: @sub_v16i32(
-; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4
-; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4
-; AVX512-NEXT: [[TMP3:%.*]] = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]])
-; AVX512-NEXT: [[TMP34:%.*]] = extractvalue { <16 x i32>, <16 x i1> } [[TMP3]], 0
-; AVX512-NEXT: store <16 x i32> [[TMP34]], ptr @c32, align 4
-; AVX512-NEXT: ret void
-;
-; AVX512_256-LABEL: @sub_v16i32(
-; AVX512_256-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4
-; AVX512_256-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4
-; AVX512_256-NEXT: [[TMP3:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]])
-; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP3]], 0
-; AVX512_256-NEXT: store <8 x i32> [[TMP4]], ptr @c32, align 4
-; AVX512_256-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
-; AVX512_256-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
-; AVX512_256-NEXT: [[TMP7:%.*]] = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> [[TMP5]], <8 x i32> [[TMP6]])
-; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i32>, <8 x i1> } [[TMP7]], 0
-; AVX512_256-NEXT: store <8 x i32> [[TMP8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4
-; AVX512_256-NEXT: ret void
+; CHECK-LABEL: @sub_v16i32(
+; CHECK-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4
+; CHECK-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4
+; CHECK-NEXT: [[A2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 2), align 4
+; CHECK-NEXT: [[A3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 3), align 4
+; CHECK-NEXT: [[A4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4
+; CHECK-NEXT: [[A5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 5), align 4
+; CHECK-NEXT: [[A6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 6), align 4
+; CHECK-NEXT: [[A7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 7), align 4
+; CHECK-NEXT: [[A8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4
+; CHECK-NEXT: [[A9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 9), align 4
+; CHECK-NEXT: [[A10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 10), align 4
+; CHECK-NEXT: [[A11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 11), align 4
+; CHECK-NEXT: [[A12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4
+; CHECK-NEXT: [[A13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 13), align 4
+; CHECK-NEXT: [[A14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 14), align 4
+; CHECK-NEXT: [[A15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 15), align 4
+; CHECK-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4
+; CHECK-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4
+; CHECK-NEXT: [[B2:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 2), align 4
+; CHECK-NEXT: [[B3:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 3), align 4
+; CHECK-NEXT: [[B4:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4
+; CHECK-NEXT: [[B5:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 5), align 4
+; CHECK-NEXT: [[B6:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 6), align 4
+; CHECK-NEXT: [[B7:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 7), align 4
+; CHECK-NEXT: [[B8:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4
+; CHECK-NEXT: [[B9:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 9), align 4
+; CHECK-NEXT: [[B10:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 10), align 4
+; CHECK-NEXT: [[B11:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 11), align 4
+; CHECK-NEXT: [[B12:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4
+; CHECK-NEXT: [[B13:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 13), align 4
+; CHECK-NEXT: [[B14:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 14), align 4
+; CHECK-NEXT: [[B15:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 15), align 4
+; CHECK-NEXT: [[C0:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A0]], i32 [[B0]])
+; CHECK-NEXT: [[C1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A1]], i32 [[B1]])
+; CHECK-NEXT: [[C2:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A2]], i32 [[B2]])
+; CHECK-NEXT: [[C3:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A3]], i32 [[B3]])
+; CHECK-NEXT: [[C4:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A4]], i32 [[B4]])
+; CHECK-NEXT: [[C5:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A5]], i32 [[B5]])
+; CHECK-NEXT: [[C6:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A6]], i32 [[B6]])
+; CHECK-NEXT: [[C7:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A7]], i32 [[B7]])
+; CHECK-NEXT: [[C8:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A8]], i32 [[B8]])
+; CHECK-NEXT: [[C9:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A9]], i32 [[B9]])
+; CHECK-NEXT: [[C10:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A10]], i32 [[B10]])
+; CHECK-NEXT: [[C11:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A11]], i32 [[B11]])
+; CHECK-NEXT: [[C12:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A12]], i32 [[B12]])
+; CHECK-NEXT: [[C13:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A13]], i32 [[B13]])
+; CHECK-NEXT: [[C14:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A14]], i32 [[B14]])
+; CHECK-NEXT: [[C15:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A15]], i32 [[B15]])
+; CHECK-NEXT: [[R0:%.*]] = extractvalue { i32, i1 } [[C0]], 0
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[C1]], 0
+; CHECK-NEXT: [[R2:%.*]] = extractvalue { i32, i1 } [[C2]], 0
+; CHECK-NEXT: [[R3:%.*]] = extractvalue { i32, i1 } [[C3]], 0
+; CHECK-NEXT: [[R4:%.*]] = extractvalue { i32, i1 } [[C4]], 0
+; CHECK-NEXT: [[R5:%.*]] = extractvalue { i32, i1 } [[C5]], 0
+; CHECK-NEXT: [[R6:%.*]] = extractvalue { i32, i1 } [[C6]], 0
+; CHECK-NEXT: [[R7:%.*]] = extractvalue { i32, i1 } [[C7]], 0
+; CHECK-NEXT: [[R8:%.*]] = extractvalue { i32, i1 } [[C8]], 0
+; CHECK-NEXT: [[R9:%.*]] = extractvalue { i32, i1 } [[C9]], 0
+; CHECK-NEXT: [[R10:%.*]] = extractvalue { i32, i1 } [[C10]], 0
+; CHECK-NEXT: [[R11:%.*]] = extractvalue { i32, i1 } [[C11]], 0
+; CHECK-NEXT: [[R12:%.*]] = extractvalue { i32, i1 } [[C12]], 0
+; CHECK-NEXT: [[R13:%.*]] = extractvalue { i32, i1 } [[C13]], 0
+; CHECK-NEXT: [[R14:%.*]] = extractvalue { i32, i1 } [[C14]], 0
+; CHECK-NEXT: [[R15:%.*]] = extractvalue { i32, i1 } [[C15]], 0
+; CHECK-NEXT: store i32 [[R0]], ptr @c32, align 4
+; CHECK-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 1), align 4
+; CHECK-NEXT: store i32 [[R2]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 2), align 4
+; CHECK-NEXT: store i32 [[R3]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 3), align 4
+; CHECK-NEXT: store i32 [[R4]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 4), align 4
+; CHECK-NEXT: store i32 [[R5]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 5), align 4
+; CHECK-NEXT: store i32 [[R6]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 6), align 4
+; CHECK-NEXT: store i32 [[R7]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 7), align 4
+; CHECK-NEXT: store i32 [[R8]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 8), align 4
+; CHECK-NEXT: store i32 [[R9]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 9), align 4
+; CHECK-NEXT: store i32 [[R10]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 10), align 4
+; CHECK-NEXT: store i32 [[R11]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 11), align 4
+; CHECK-NEXT: store i32 [[R12]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 12), align 4
+; CHECK-NEXT: store i32 [[R13]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 13), align 4
+; CHECK-NEXT: store i32 [[R14]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 14), align 4
+; CHECK-NEXT: store i32 [[R15]], ptr getelementptr inbounds ([16 x i32], ptr @c32, i32 0, i64 15), align 4
+; CHECK-NEXT: ret void
;
%a0 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 0 ), align 4
%a1 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1 ), align 4
@@ -396,106 +280,168 @@ define void @sub_v16i32() {
}
define void @sub_v32i16() {
-; SSE-LABEL: @sub_v32i16(
-; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2
-; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2
-; SSE-NEXT: [[TMP3:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
-; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP3]], 0
-; SSE-NEXT: store <8 x i16> [[TMP4]], ptr @c16, align 2
-; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2
-; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2
-; SSE-NEXT: [[TMP7:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> [[TMP5]], <8 x i16> [[TMP6]])
-; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP7]], 0
-; SSE-NEXT: store <8 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2
-; SSE-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
-; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
-; SSE-NEXT: [[TMP11:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]])
-; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP11]], 0
-; SSE-NEXT: store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2
-; SSE-NEXT: [[TMP13:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2
-; SSE-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2
-; SSE-NEXT: [[TMP15:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> [[TMP13]], <8 x i16> [[TMP14]])
-; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP15]], 0
-; SSE-NEXT: store <8 x i16> [[TMP16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2
-; SSE-NEXT: ret void
-;
-; SLM-LABEL: @sub_v32i16(
-; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2
-; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2
-; SLM-NEXT: [[TMP3:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
-; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP3]], 0
-; SLM-NEXT: store <8 x i16> [[TMP4]], ptr @c16, align 2
-; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2
-; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2
-; SLM-NEXT: [[TMP7:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> [[TMP5]], <8 x i16> [[TMP6]])
-; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP7]], 0
-; SLM-NEXT: store <8 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2
-; SLM-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
-; SLM-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
-; SLM-NEXT: [[TMP11:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]])
-; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP11]], 0
-; SLM-NEXT: store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2
-; SLM-NEXT: [[TMP13:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2
-; SLM-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2
-; SLM-NEXT: [[TMP15:%.*]] = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> [[TMP13]], <8 x i16> [[TMP14]])
-; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <8 x i16>, <8 x i1> } [[TMP15]], 0
-; SLM-NEXT: store <8 x i16> [[TMP16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2
-; SLM-NEXT: ret void
-;
-; AVX-LABEL: @sub_v32i16(
-; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2
-; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2
-; AVX-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]])
-; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0
-; AVX-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2
-; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
-; AVX-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
-; AVX-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]])
-; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0
-; AVX-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2
-; AVX-NEXT: ret void
-;
-; AVX2-LABEL: @sub_v32i16(
-; AVX2-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2
-; AVX2-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2
-; AVX2-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]])
-; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0
-; AVX2-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2
-; AVX2-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
-; AVX2-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
-; AVX2-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]])
-; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0
-; AVX2-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2
-; AVX2-NEXT: ret void
-;
-; KNL-LABEL: @sub_v32i16(
-; KNL-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2
-; KNL-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2
-; KNL-NEXT: [[TMP3:%.*]] = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]])
-; KNL-NEXT: [[TMP66:%.*]] = extractvalue { <32 x i16>, <32 x i1> } [[TMP3]], 0
-; KNL-NEXT: store <32 x i16> [[TMP66]], ptr @c16, align 2
-; KNL-NEXT: ret void
-;
-; AVX512-LABEL: @sub_v32i16(
-; AVX512-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2
-; AVX512-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2
-; AVX512-NEXT: [[TMP3:%.*]] = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]])
-; AVX512-NEXT: [[TMP66:%.*]] = extractvalue { <32 x i16>, <32 x i1> } [[TMP3]], 0
-; AVX512-NEXT: store <32 x i16> [[TMP66]], ptr @c16, align 2
-; AVX512-NEXT: ret void
-;
-; AVX512_256-LABEL: @sub_v32i16(
-; AVX512_256-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2
-; AVX512_256-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2
-; AVX512_256-NEXT: [[TMP3:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]])
-; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP3]], 0
-; AVX512_256-NEXT: store <16 x i16> [[TMP4]], ptr @c16, align 2
-; AVX512_256-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
-; AVX512_256-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
-; AVX512_256-NEXT: [[TMP7:%.*]] = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> [[TMP5]], <16 x i16> [[TMP6]])
-; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i16>, <16 x i1> } [[TMP7]], 0
-; AVX512_256-NEXT: store <16 x i16> [[TMP8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2
-; AVX512_256-NEXT: ret void
+; CHECK-LABEL: @sub_v32i16(
+; CHECK-NEXT: [[A0:%.*]] = load i16, ptr @a16, align 2
+; CHECK-NEXT: [[A1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1), align 2
+; CHECK-NEXT: [[A2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 2), align 2
+; CHECK-NEXT: [[A3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 3), align 2
+; CHECK-NEXT: [[A4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 4), align 2
+; CHECK-NEXT: [[A5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 5), align 2
+; CHECK-NEXT: [[A6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 6), align 2
+; CHECK-NEXT: [[A7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 7), align 2
+; CHECK-NEXT: [[A8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2
+; CHECK-NEXT: [[A9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 9), align 2
+; CHECK-NEXT: [[A10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 10), align 2
+; CHECK-NEXT: [[A11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 11), align 2
+; CHECK-NEXT: [[A12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 12), align 2
+; CHECK-NEXT: [[A13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 13), align 2
+; CHECK-NEXT: [[A14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 14), align 2
+; CHECK-NEXT: [[A15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 15), align 2
+; CHECK-NEXT: [[A16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
+; CHECK-NEXT: [[A17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 17), align 2
+; CHECK-NEXT: [[A18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 18), align 2
+; CHECK-NEXT: [[A19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 19), align 2
+; CHECK-NEXT: [[A20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 20), align 2
+; CHECK-NEXT: [[A21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 21), align 2
+; CHECK-NEXT: [[A22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 22), align 2
+; CHECK-NEXT: [[A23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 23), align 2
+; CHECK-NEXT: [[A24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2
+; CHECK-NEXT: [[A25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 25), align 2
+; CHECK-NEXT: [[A26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 26), align 2
+; CHECK-NEXT: [[A27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 27), align 2
+; CHECK-NEXT: [[A28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 28), align 2
+; CHECK-NEXT: [[A29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 29), align 2
+; CHECK-NEXT: [[A30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 30), align 2
+; CHECK-NEXT: [[A31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 31), align 2
+; CHECK-NEXT: [[B0:%.*]] = load i16, ptr @b16, align 2
+; CHECK-NEXT: [[B1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 1), align 2
+; CHECK-NEXT: [[B2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 2), align 2
+; CHECK-NEXT: [[B3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 3), align 2
+; CHECK-NEXT: [[B4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 4), align 2
+; CHECK-NEXT: [[B5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 5), align 2
+; CHECK-NEXT: [[B6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 6), align 2
+; CHECK-NEXT: [[B7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 7), align 2
+; CHECK-NEXT: [[B8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2
+; CHECK-NEXT: [[B9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 9), align 2
+; CHECK-NEXT: [[B10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 10), align 2
+; CHECK-NEXT: [[B11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 11), align 2
+; CHECK-NEXT: [[B12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 12), align 2
+; CHECK-NEXT: [[B13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 13), align 2
+; CHECK-NEXT: [[B14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 14), align 2
+; CHECK-NEXT: [[B15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 15), align 2
+; CHECK-NEXT: [[B16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
+; CHECK-NEXT: [[B17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 17), align 2
+; CHECK-NEXT: [[B18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 18), align 2
+; CHECK-NEXT: [[B19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 19), align 2
+; CHECK-NEXT: [[B20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 20), align 2
+; CHECK-NEXT: [[B21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 21), align 2
+; CHECK-NEXT: [[B22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 22), align 2
+; CHECK-NEXT: [[B23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 23), align 2
+; CHECK-NEXT: [[B24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2
+; CHECK-NEXT: [[B25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 25), align 2
+; CHECK-NEXT: [[B26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 26), align 2
+; CHECK-NEXT: [[B27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 27), align 2
+; CHECK-NEXT: [[B28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 28), align 2
+; CHECK-NEXT: [[B29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 29), align 2
+; CHECK-NEXT: [[B30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 30), align 2
+; CHECK-NEXT: [[B31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 31), align 2
+; CHECK-NEXT: [[C0:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A0]], i16 [[B0]])
+; CHECK-NEXT: [[C1:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A1]], i16 [[B1]])
+; CHECK-NEXT: [[C2:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A2]], i16 [[B2]])
+; CHECK-NEXT: [[C3:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A3]], i16 [[B3]])
+; CHECK-NEXT: [[C4:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A4]], i16 [[B4]])
+; CHECK-NEXT: [[C5:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A5]], i16 [[B5]])
+; CHECK-NEXT: [[C6:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A6]], i16 [[B6]])
+; CHECK-NEXT: [[C7:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A7]], i16 [[B7]])
+; CHECK-NEXT: [[C8:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A8]], i16 [[B8]])
+; CHECK-NEXT: [[C9:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A9]], i16 [[B9]])
+; CHECK-NEXT: [[C10:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A10]], i16 [[B10]])
+; CHECK-NEXT: [[C11:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A11]], i16 [[B11]])
+; CHECK-NEXT: [[C12:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A12]], i16 [[B12]])
+; CHECK-NEXT: [[C13:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A13]], i16 [[B13]])
+; CHECK-NEXT: [[C14:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A14]], i16 [[B14]])
+; CHECK-NEXT: [[C15:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A15]], i16 [[B15]])
+; CHECK-NEXT: [[C16:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A16]], i16 [[B16]])
+; CHECK-NEXT: [[C17:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A17]], i16 [[B17]])
+; CHECK-NEXT: [[C18:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A18]], i16 [[B18]])
+; CHECK-NEXT: [[C19:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A19]], i16 [[B19]])
+; CHECK-NEXT: [[C20:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A20]], i16 [[B20]])
+; CHECK-NEXT: [[C21:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A21]], i16 [[B21]])
+; CHECK-NEXT: [[C22:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A22]], i16 [[B22]])
+; CHECK-NEXT: [[C23:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A23]], i16 [[B23]])
+; CHECK-NEXT: [[C24:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A24]], i16 [[B24]])
+; CHECK-NEXT: [[C25:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A25]], i16 [[B25]])
+; CHECK-NEXT: [[C26:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A26]], i16 [[B26]])
+; CHECK-NEXT: [[C27:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A27]], i16 [[B27]])
+; CHECK-NEXT: [[C28:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A28]], i16 [[B28]])
+; CHECK-NEXT: [[C29:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A29]], i16 [[B29]])
+; CHECK-NEXT: [[C30:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A30]], i16 [[B30]])
+; CHECK-NEXT: [[C31:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A31]], i16 [[B31]])
+; CHECK-NEXT: [[R0:%.*]] = extractvalue { i16, i1 } [[C0]], 0
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i16, i1 } [[C1]], 0
+; CHECK-NEXT: [[R2:%.*]] = extractvalue { i16, i1 } [[C2]], 0
+; CHECK-NEXT: [[R3:%.*]] = extractvalue { i16, i1 } [[C3]], 0
+; CHECK-NEXT: [[R4:%.*]] = extractvalue { i16, i1 } [[C4]], 0
+; CHECK-NEXT: [[R5:%.*]] = extractvalue { i16, i1 } [[C5]], 0
+; CHECK-NEXT: [[R6:%.*]] = extractvalue { i16, i1 } [[C6]], 0
+; CHECK-NEXT: [[R7:%.*]] = extractvalue { i16, i1 } [[C7]], 0
+; CHECK-NEXT: [[R8:%.*]] = extractvalue { i16, i1 } [[C8]], 0
+; CHECK-NEXT: [[R9:%.*]] = extractvalue { i16, i1 } [[C9]], 0
+; CHECK-NEXT: [[R10:%.*]] = extractvalue { i16, i1 } [[C10]], 0
+; CHECK-NEXT: [[R11:%.*]] = extractvalue { i16, i1 } [[C11]], 0
+; CHECK-NEXT: [[R12:%.*]] = extractvalue { i16, i1 } [[C12]], 0
+; CHECK-NEXT: [[R13:%.*]] = extractvalue { i16, i1 } [[C13]], 0
+; CHECK-NEXT: [[R14:%.*]] = extractvalue { i16, i1 } [[C14]], 0
+; CHECK-NEXT: [[R15:%.*]] = extractvalue { i16, i1 } [[C15]], 0
+; CHECK-NEXT: [[R16:%.*]] = extractvalue { i16, i1 } [[C16]], 0
+; CHECK-NEXT: [[R17:%.*]] = extractvalue { i16, i1 } [[C17]], 0
+; CHECK-NEXT: [[R18:%.*]] = extractvalue { i16, i1 } [[C18]], 0
+; CHECK-NEXT: [[R19:%.*]] = extractvalue { i16, i1 } [[C19]], 0
+; CHECK-NEXT: [[R20:%.*]] = extractvalue { i16, i1 } [[C20]], 0
+; CHECK-NEXT: [[R21:%.*]] = extractvalue { i16, i1 } [[C21]], 0
+; CHECK-NEXT: [[R22:%.*]] = extractvalue { i16, i1 } [[C22]], 0
+; CHECK-NEXT: [[R23:%.*]] = extractvalue { i16, i1 } [[C23]], 0
+; CHECK-NEXT: [[R24:%.*]] = extractvalue { i16, i1 } [[C24]], 0
+; CHECK-NEXT: [[R25:%.*]] = extractvalue { i16, i1 } [[C25]], 0
+; CHECK-NEXT: [[R26:%.*]] = extractvalue { i16, i1 } [[C26]], 0
+; CHECK-NEXT: [[R27:%.*]] = extractvalue { i16, i1 } [[C27]], 0
+; CHECK-NEXT: [[R28:%.*]] = extractvalue { i16, i1 } [[C28]], 0
+; CHECK-NEXT: [[R29:%.*]] = extractvalue { i16, i1 } [[C29]], 0
+; CHECK-NEXT: [[R30:%.*]] = extractvalue { i16, i1 } [[C30]], 0
+; CHECK-NEXT: [[R31:%.*]] = extractvalue { i16, i1 } [[C31]], 0
+; CHECK-NEXT: store i16 [[R0]], ptr @c16, align 2
+; CHECK-NEXT: store i16 [[R1]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 1), align 2
+; CHECK-NEXT: store i16 [[R2]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 2), align 2
+; CHECK-NEXT: store i16 [[R3]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 3), align 2
+; CHECK-NEXT: store i16 [[R4]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 4), align 2
+; CHECK-NEXT: store i16 [[R5]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 5), align 2
+; CHECK-NEXT: store i16 [[R6]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 6), align 2
+; CHECK-NEXT: store i16 [[R7]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 7), align 2
+; CHECK-NEXT: store i16 [[R8]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 8), align 2
+; CHECK-NEXT: store i16 [[R9]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 9), align 2
+; CHECK-NEXT: store i16 [[R10]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 10), align 2
+; CHECK-NEXT: store i16 [[R11]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 11), align 2
+; CHECK-NEXT: store i16 [[R12]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 12), align 2
+; CHECK-NEXT: store i16 [[R13]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 13), align 2
+; CHECK-NEXT: store i16 [[R14]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 14), align 2
+; CHECK-NEXT: store i16 [[R15]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 15), align 2
+; CHECK-NEXT: store i16 [[R16]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 16), align 2
+; CHECK-NEXT: store i16 [[R17]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 17), align 2
+; CHECK-NEXT: store i16 [[R18]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 18), align 2
+; CHECK-NEXT: store i16 [[R19]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 19), align 2
+; CHECK-NEXT: store i16 [[R20]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 20), align 2
+; CHECK-NEXT: store i16 [[R21]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 21), align 2
+; CHECK-NEXT: store i16 [[R22]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 22), align 2
+; CHECK-NEXT: store i16 [[R23]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 23), align 2
+; CHECK-NEXT: store i16 [[R24]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 24), align 2
+; CHECK-NEXT: store i16 [[R25]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 25), align 2
+; CHECK-NEXT: store i16 [[R26]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 26), align 2
+; CHECK-NEXT: store i16 [[R27]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 27), align 2
+; CHECK-NEXT: store i16 [[R28]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 28), align 2
+; CHECK-NEXT: store i16 [[R29]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 29), align 2
+; CHECK-NEXT: store i16 [[R30]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 30), align 2
+; CHECK-NEXT: store i16 [[R31]], ptr getelementptr inbounds ([32 x i16], ptr @c16, i32 0, i64 31), align 2
+; CHECK-NEXT: ret void
;
%a0 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 0 ), align 2
%a1 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1 ), align 2
@@ -661,106 +607,328 @@ define void @sub_v32i16() {
}
define void @sub_v64i8() {
-; SSE-LABEL: @sub_v64i8(
-; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @a8, align 1
-; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1
-; SSE-NEXT: [[TMP3:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
-; SSE-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP3]], 0
-; SSE-NEXT: store <16 x i8> [[TMP4]], ptr @c8, align 1
-; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1
-; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1
-; SSE-NEXT: [[TMP7:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> [[TMP5]], <16 x i8> [[TMP6]])
-; SSE-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP7]], 0
-; SSE-NEXT: store <16 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1
-; SSE-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
-; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
-; SSE-NEXT: [[TMP11:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> [[TMP9]], <16 x i8> [[TMP10]])
-; SSE-NEXT: [[TMP12:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP11]], 0
-; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
-; SSE-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
-; SSE-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
-; SSE-NEXT: [[TMP15:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> [[TMP13]], <16 x i8> [[TMP14]])
-; SSE-NEXT: [[TMP16:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP15]], 0
-; SSE-NEXT: store <16 x i8> [[TMP16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
-; SSE-NEXT: ret void
-;
-; SLM-LABEL: @sub_v64i8(
-; SLM-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @a8, align 1
-; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1
-; SLM-NEXT: [[TMP3:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
-; SLM-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP3]], 0
-; SLM-NEXT: store <16 x i8> [[TMP4]], ptr @c8, align 1
-; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1
-; SLM-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1
-; SLM-NEXT: [[TMP7:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> [[TMP5]], <16 x i8> [[TMP6]])
-; SLM-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP7]], 0
-; SLM-NEXT: store <16 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1
-; SLM-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
-; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
-; SLM-NEXT: [[TMP11:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> [[TMP9]], <16 x i8> [[TMP10]])
-; SLM-NEXT: [[TMP12:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP11]], 0
-; SLM-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
-; SLM-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
-; SLM-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
-; SLM-NEXT: [[TMP15:%.*]] = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> [[TMP13]], <16 x i8> [[TMP14]])
-; SLM-NEXT: [[TMP16:%.*]] = extractvalue { <16 x i8>, <16 x i1> } [[TMP15]], 0
-; SLM-NEXT: store <16 x i8> [[TMP16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
-; SLM-NEXT: ret void
-;
-; AVX-LABEL: @sub_v64i8(
-; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1
-; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1
-; AVX-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]])
-; AVX-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0
-; AVX-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1
-; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
-; AVX-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
-; AVX-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]])
-; AVX-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0
-; AVX-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
-; AVX-NEXT: ret void
-;
-; AVX2-LABEL: @sub_v64i8(
-; AVX2-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1
-; AVX2-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1
-; AVX2-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]])
-; AVX2-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0
-; AVX2-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1
-; AVX2-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
-; AVX2-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
-; AVX2-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]])
-; AVX2-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0
-; AVX2-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
-; AVX2-NEXT: ret void
-;
-; KNL-LABEL: @sub_v64i8(
-; KNL-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @a8, align 1
-; KNL-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @b8, align 1
-; KNL-NEXT: [[TMP3:%.*]] = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]])
-; KNL-NEXT: [[TMP130:%.*]] = extractvalue { <64 x i8>, <64 x i1> } [[TMP3]], 0
-; KNL-NEXT: store <64 x i8> [[TMP130]], ptr @c8, align 1
-; KNL-NEXT: ret void
-;
-; AVX512-LABEL: @sub_v64i8(
-; AVX512-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @a8, align 1
-; AVX512-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @b8, align 1
-; AVX512-NEXT: [[TMP3:%.*]] = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]])
-; AVX512-NEXT: [[TMP130:%.*]] = extractvalue { <64 x i8>, <64 x i1> } [[TMP3]], 0
-; AVX512-NEXT: store <64 x i8> [[TMP130]], ptr @c8, align 1
-; AVX512-NEXT: ret void
-;
-; AVX512_256-LABEL: @sub_v64i8(
-; AVX512_256-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @a8, align 1
-; AVX512_256-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1
-; AVX512_256-NEXT: [[TMP3:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]])
-; AVX512_256-NEXT: [[TMP4:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP3]], 0
-; AVX512_256-NEXT: store <32 x i8> [[TMP4]], ptr @c8, align 1
-; AVX512_256-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
-; AVX512_256-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
-; AVX512_256-NEXT: [[TMP7:%.*]] = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]])
-; AVX512_256-NEXT: [[TMP8:%.*]] = extractvalue { <32 x i8>, <32 x i1> } [[TMP7]], 0
-; AVX512_256-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
-; AVX512_256-NEXT: ret void
+; CHECK-LABEL: @sub_v64i8(
+; CHECK-NEXT: [[A0:%.*]] = load i8, ptr @a8, align 1
+; CHECK-NEXT: [[A1:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 1), align 1
+; CHECK-NEXT: [[A2:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 2), align 1
+; CHECK-NEXT: [[A3:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 3), align 1
+; CHECK-NEXT: [[A4:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 4), align 1
+; CHECK-NEXT: [[A5:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 5), align 1
+; CHECK-NEXT: [[A6:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 6), align 1
+; CHECK-NEXT: [[A7:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 7), align 1
+; CHECK-NEXT: [[A8:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 8), align 1
+; CHECK-NEXT: [[A9:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 9), align 1
+; CHECK-NEXT: [[A10:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 10), align 1
+; CHECK-NEXT: [[A11:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 11), align 1
+; CHECK-NEXT: [[A12:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 12), align 1
+; CHECK-NEXT: [[A13:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 13), align 1
+; CHECK-NEXT: [[A14:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 14), align 1
+; CHECK-NEXT: [[A15:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 15), align 1
+; CHECK-NEXT: [[A16:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1
+; CHECK-NEXT: [[A17:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 17), align 1
+; CHECK-NEXT: [[A18:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 18), align 1
+; CHECK-NEXT: [[A19:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 19), align 1
+; CHECK-NEXT: [[A20:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 20), align 1
+; CHECK-NEXT: [[A21:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 21), align 1
+; CHECK-NEXT: [[A22:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 22), align 1
+; CHECK-NEXT: [[A23:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 23), align 1
+; CHECK-NEXT: [[A24:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 24), align 1
+; CHECK-NEXT: [[A25:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 25), align 1
+; CHECK-NEXT: [[A26:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 26), align 1
+; CHECK-NEXT: [[A27:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 27), align 1
+; CHECK-NEXT: [[A28:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 28), align 1
+; CHECK-NEXT: [[A29:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 29), align 1
+; CHECK-NEXT: [[A30:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 30), align 1
+; CHECK-NEXT: [[A31:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 31), align 1
+; CHECK-NEXT: [[A32:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
+; CHECK-NEXT: [[A33:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 33), align 1
+; CHECK-NEXT: [[A34:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 34), align 1
+; CHECK-NEXT: [[A35:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 35), align 1
+; CHECK-NEXT: [[A36:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 36), align 1
+; CHECK-NEXT: [[A37:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 37), align 1
+; CHECK-NEXT: [[A38:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 38), align 1
+; CHECK-NEXT: [[A39:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 39), align 1
+; CHECK-NEXT: [[A40:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 40), align 1
+; CHECK-NEXT: [[A41:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 41), align 1
+; CHECK-NEXT: [[A42:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 42), align 1
+; CHECK-NEXT: [[A43:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 43), align 1
+; CHECK-NEXT: [[A44:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 44), align 1
+; CHECK-NEXT: [[A45:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 45), align 1
+; CHECK-NEXT: [[A46:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 46), align 1
+; CHECK-NEXT: [[A47:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 47), align 1
+; CHECK-NEXT: [[A48:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
+; CHECK-NEXT: [[A49:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 49), align 1
+; CHECK-NEXT: [[A50:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 50), align 1
+; CHECK-NEXT: [[A51:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 51), align 1
+; CHECK-NEXT: [[A52:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 52), align 1
+; CHECK-NEXT: [[A53:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 53), align 1
+; CHECK-NEXT: [[A54:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 54), align 1
+; CHECK-NEXT: [[A55:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 55), align 1
+; CHECK-NEXT: [[A56:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 56), align 1
+; CHECK-NEXT: [[A57:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 57), align 1
+; CHECK-NEXT: [[A58:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 58), align 1
+; CHECK-NEXT: [[A59:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 59), align 1
+; CHECK-NEXT: [[A60:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 60), align 1
+; CHECK-NEXT: [[A61:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 61), align 1
+; CHECK-NEXT: [[A62:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 62), align 1
+; CHECK-NEXT: [[A63:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 63), align 1
+; CHECK-NEXT: [[B0:%.*]] = load i8, ptr @b8, align 1
+; CHECK-NEXT: [[B1:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 1), align 1
+; CHECK-NEXT: [[B2:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 2), align 1
+; CHECK-NEXT: [[B3:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 3), align 1
+; CHECK-NEXT: [[B4:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 4), align 1
+; CHECK-NEXT: [[B5:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 5), align 1
+; CHECK-NEXT: [[B6:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 6), align 1
+; CHECK-NEXT: [[B7:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 7), align 1
+; CHECK-NEXT: [[B8:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 8), align 1
+; CHECK-NEXT: [[B9:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 9), align 1
+; CHECK-NEXT: [[B10:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 10), align 1
+; CHECK-NEXT: [[B11:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 11), align 1
+; CHECK-NEXT: [[B12:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 12), align 1
+; CHECK-NEXT: [[B13:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 13), align 1
+; CHECK-NEXT: [[B14:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 14), align 1
+; CHECK-NEXT: [[B15:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 15), align 1
+; CHECK-NEXT: [[B16:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1
+; CHECK-NEXT: [[B17:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 17), align 1
+; CHECK-NEXT: [[B18:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 18), align 1
+; CHECK-NEXT: [[B19:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 19), align 1
+; CHECK-NEXT: [[B20:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 20), align 1
+; CHECK-NEXT: [[B21:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 21), align 1
+; CHECK-NEXT: [[B22:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 22), align 1
+; CHECK-NEXT: [[B23:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 23), align 1
+; CHECK-NEXT: [[B24:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 24), align 1
+; CHECK-NEXT: [[B25:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 25), align 1
+; CHECK-NEXT: [[B26:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 26), align 1
+; CHECK-NEXT: [[B27:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 27), align 1
+; CHECK-NEXT: [[B28:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 28), align 1
+; CHECK-NEXT: [[B29:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 29), align 1
+; CHECK-NEXT: [[B30:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 30), align 1
+; CHECK-NEXT: [[B31:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 31), align 1
+; CHECK-NEXT: [[B32:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
+; CHECK-NEXT: [[B33:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 33), align 1
+; CHECK-NEXT: [[B34:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 34), align 1
+; CHECK-NEXT: [[B35:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 35), align 1
+; CHECK-NEXT: [[B36:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 36), align 1
+; CHECK-NEXT: [[B37:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 37), align 1
+; CHECK-NEXT: [[B38:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 38), align 1
+; CHECK-NEXT: [[B39:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 39), align 1
+; CHECK-NEXT: [[B40:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 40), align 1
+; CHECK-NEXT: [[B41:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 41), align 1
+; CHECK-NEXT: [[B42:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 42), align 1
+; CHECK-NEXT: [[B43:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 43), align 1
+; CHECK-NEXT: [[B44:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 44), align 1
+; CHECK-NEXT: [[B45:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 45), align 1
+; CHECK-NEXT: [[B46:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 46), align 1
+; CHECK-NEXT: [[B47:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 47), align 1
+; CHECK-NEXT: [[B48:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
+; CHECK-NEXT: [[B49:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 49), align 1
+; CHECK-NEXT: [[B50:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 50), align 1
+; CHECK-NEXT: [[B51:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 51), align 1
+; CHECK-NEXT: [[B52:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 52), align 1
+; CHECK-NEXT: [[B53:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 53), align 1
+; CHECK-NEXT: [[B54:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 54), align 1
+; CHECK-NEXT: [[B55:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 55), align 1
+; CHECK-NEXT: [[B56:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 56), align 1
+; CHECK-NEXT: [[B57:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 57), align 1
+; CHECK-NEXT: [[B58:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 58), align 1
+; CHECK-NEXT: [[B59:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 59), align 1
+; CHECK-NEXT: [[B60:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 60), align 1
+; CHECK-NEXT: [[B61:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 61), align 1
+; CHECK-NEXT: [[B62:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 62), align 1
+; CHECK-NEXT: [[B63:%.*]] = load i8, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 63), align 1
+; CHECK-NEXT: [[C0:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A0]], i8 [[B0]])
+; CHECK-NEXT: [[C1:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A1]], i8 [[B1]])
+; CHECK-NEXT: [[C2:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A2]], i8 [[B2]])
+; CHECK-NEXT: [[C3:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A3]], i8 [[B3]])
+; CHECK-NEXT: [[C4:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A4]], i8 [[B4]])
+; CHECK-NEXT: [[C5:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A5]], i8 [[B5]])
+; CHECK-NEXT: [[C6:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A6]], i8 [[B6]])
+; CHECK-NEXT: [[C7:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A7]], i8 [[B7]])
+; CHECK-NEXT: [[C8:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A8]], i8 [[B8]])
+; CHECK-NEXT: [[C9:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A9]], i8 [[B9]])
+; CHECK-NEXT: [[C10:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A10]], i8 [[B10]])
+; CHECK-NEXT: [[C11:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A11]], i8 [[B11]])
+; CHECK-NEXT: [[C12:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A12]], i8 [[B12]])
+; CHECK-NEXT: [[C13:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A13]], i8 [[B13]])
+; CHECK-NEXT: [[C14:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A14]], i8 [[B14]])
+; CHECK-NEXT: [[C15:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A15]], i8 [[B15]])
+; CHECK-NEXT: [[C16:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A16]], i8 [[B16]])
+; CHECK-NEXT: [[C17:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A17]], i8 [[B17]])
+; CHECK-NEXT: [[C18:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A18]], i8 [[B18]])
+; CHECK-NEXT: [[C19:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A19]], i8 [[B19]])
+; CHECK-NEXT: [[C20:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A20]], i8 [[B20]])
+; CHECK-NEXT: [[C21:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A21]], i8 [[B21]])
+; CHECK-NEXT: [[C22:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A22]], i8 [[B22]])
+; CHECK-NEXT: [[C23:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A23]], i8 [[B23]])
+; CHECK-NEXT: [[C24:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A24]], i8 [[B24]])
+; CHECK-NEXT: [[C25:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A25]], i8 [[B25]])
+; CHECK-NEXT: [[C26:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A26]], i8 [[B26]])
+; CHECK-NEXT: [[C27:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A27]], i8 [[B27]])
+; CHECK-NEXT: [[C28:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A28]], i8 [[B28]])
+; CHECK-NEXT: [[C29:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A29]], i8 [[B29]])
+; CHECK-NEXT: [[C30:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A30]], i8 [[B30]])
+; CHECK-NEXT: [[C31:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A31]], i8 [[B31]])
+; CHECK-NEXT: [[C32:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A32]], i8 [[B32]])
+; CHECK-NEXT: [[C33:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A33]], i8 [[B33]])
+; CHECK-NEXT: [[C34:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A34]], i8 [[B34]])
+; CHECK-NEXT: [[C35:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A35]], i8 [[B35]])
+; CHECK-NEXT: [[C36:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A36]], i8 [[B36]])
+; CHECK-NEXT: [[C37:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A37]], i8 [[B37]])
+; CHECK-NEXT: [[C38:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A38]], i8 [[B38]])
+; CHECK-NEXT: [[C39:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A39]], i8 [[B39]])
+; CHECK-NEXT: [[C40:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A40]], i8 [[B40]])
+; CHECK-NEXT: [[C41:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A41]], i8 [[B41]])
+; CHECK-NEXT: [[C42:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A42]], i8 [[B42]])
+; CHECK-NEXT: [[C43:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A43]], i8 [[B43]])
+; CHECK-NEXT: [[C44:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A44]], i8 [[B44]])
+; CHECK-NEXT: [[C45:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A45]], i8 [[B45]])
+; CHECK-NEXT: [[C46:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A46]], i8 [[B46]])
+; CHECK-NEXT: [[C47:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A47]], i8 [[B47]])
+; CHECK-NEXT: [[C48:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A48]], i8 [[B48]])
+; CHECK-NEXT: [[C49:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A49]], i8 [[B49]])
+; CHECK-NEXT: [[C50:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A50]], i8 [[B50]])
+; CHECK-NEXT: [[C51:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A51]], i8 [[B51]])
+; CHECK-NEXT: [[C52:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A52]], i8 [[B52]])
+; CHECK-NEXT: [[C53:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A53]], i8 [[B53]])
+; CHECK-NEXT: [[C54:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A54]], i8 [[B54]])
+; CHECK-NEXT: [[C55:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A55]], i8 [[B55]])
+; CHECK-NEXT: [[C56:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A56]], i8 [[B56]])
+; CHECK-NEXT: [[C57:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A57]], i8 [[B57]])
+; CHECK-NEXT: [[C58:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A58]], i8 [[B58]])
+; CHECK-NEXT: [[C59:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A59]], i8 [[B59]])
+; CHECK-NEXT: [[C60:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A60]], i8 [[B60]])
+; CHECK-NEXT: [[C61:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A61]], i8 [[B61]])
+; CHECK-NEXT: [[C62:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A62]], i8 [[B62]])
+; CHECK-NEXT: [[C63:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A63]], i8 [[B63]])
+; CHECK-NEXT: [[R0:%.*]] = extractvalue { i8, i1 } [[C0]], 0
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i8, i1 } [[C1]], 0
+; CHECK-NEXT: [[R2:%.*]] = extractvalue { i8, i1 } [[C2]], 0
+; CHECK-NEXT: [[R3:%.*]] = extractvalue { i8, i1 } [[C3]], 0
+; CHECK-NEXT: [[R4:%.*]] = extractvalue { i8, i1 } [[C4]], 0
+; CHECK-NEXT: [[R5:%.*]] = extractvalue { i8, i1 } [[C5]], 0
+; CHECK-NEXT: [[R6:%.*]] = extractvalue { i8, i1 } [[C6]], 0
+; CHECK-NEXT: [[R7:%.*]] = extractvalue { i8, i1 } [[C7]], 0
+; CHECK-NEXT: [[R8:%.*]] = extractvalue { i8, i1 } [[C8]], 0
+; CHECK-NEXT: [[R9:%.*]] = extractvalue { i8, i1 } [[C9]], 0
+; CHECK-NEXT: [[R10:%.*]] = extractvalue { i8, i1 } [[C10]], 0
+; CHECK-NEXT: [[R11:%.*]] = extractvalue { i8, i1 } [[C11]], 0
+; CHECK-NEXT: [[R12:%.*]] = extractvalue { i8, i1 } [[C12]], 0
+; CHECK-NEXT: [[R13:%.*]] = extractvalue { i8, i1 } [[C13]], 0
+; CHECK-NEXT: [[R14:%.*]] = extractvalue { i8, i1 } [[C14]], 0
+; CHECK-NEXT: [[R15:%.*]] = extractvalue { i8, i1 } [[C15]], 0
+; CHECK-NEXT: [[R16:%.*]] = extractvalue { i8, i1 } [[C16]], 0
+; CHECK-NEXT: [[R17:%.*]] = extractvalue { i8, i1 } [[C17]], 0
+; CHECK-NEXT: [[R18:%.*]] = extractvalue { i8, i1 } [[C18]], 0
+; CHECK-NEXT: [[R19:%.*]] = extractvalue { i8, i1 } [[C19]], 0
+; CHECK-NEXT: [[R20:%.*]] = extractvalue { i8, i1 } [[C20]], 0
+; CHECK-NEXT: [[R21:%.*]] = extractvalue { i8, i1 } [[C21]], 0
+; CHECK-NEXT: [[R22:%.*]] = extractvalue { i8, i1 } [[C22]], 0
+; CHECK-NEXT: [[R23:%.*]] = extractvalue { i8, i1 } [[C23]], 0
+; CHECK-NEXT: [[R24:%.*]] = extractvalue { i8, i1 } [[C24]], 0
+; CHECK-NEXT: [[R25:%.*]] = extractvalue { i8, i1 } [[C25]], 0
+; CHECK-NEXT: [[R26:%.*]] = extractvalue { i8, i1 } [[C26]], 0
+; CHECK-NEXT: [[R27:%.*]] = extractvalue { i8, i1 } [[C27]], 0
+; CHECK-NEXT: [[R28:%.*]] = extractvalue { i8, i1 } [[C28]], 0
+; CHECK-NEXT: [[R29:%.*]] = extractvalue { i8, i1 } [[C29]], 0
+; CHECK-NEXT: [[R30:%.*]] = extractvalue { i8, i1 } [[C30]], 0
+; CHECK-NEXT: [[R31:%.*]] = extractvalue { i8, i1 } [[C31]], 0
+; CHECK-NEXT: [[R32:%.*]] = extractvalue { i8, i1 } [[C32]], 0
+; CHECK-NEXT: [[R33:%.*]] = extractvalue { i8, i1 } [[C33]], 0
+; CHECK-NEXT: [[R34:%.*]] = extractvalue { i8, i1 } [[C34]], 0
+; CHECK-NEXT: [[R35:%.*]] = extractvalue { i8, i1 } [[C35]], 0
+; CHECK-NEXT: [[R36:%.*]] = extractvalue { i8, i1 } [[C36]], 0
+; CHECK-NEXT: [[R37:%.*]] = extractvalue { i8, i1 } [[C37]], 0
+; CHECK-NEXT: [[R38:%.*]] = extractvalue { i8, i1 } [[C38]], 0
+; CHECK-NEXT: [[R39:%.*]] = extractvalue { i8, i1 } [[C39]], 0
+; CHECK-NEXT: [[R40:%.*]] = extractvalue { i8, i1 } [[C40]], 0
+; CHECK-NEXT: [[R41:%.*]] = extractvalue { i8, i1 } [[C41]], 0
+; CHECK-NEXT: [[R42:%.*]] = extractvalue { i8, i1 } [[C42]], 0
+; CHECK-NEXT: [[R43:%.*]] = extractvalue { i8, i1 } [[C43]], 0
+; CHECK-NEXT: [[R44:%.*]] = extractvalue { i8, i1 } [[C44]], 0
+; CHECK-NEXT: [[R45:%.*]] = extractvalue { i8, i1 } [[C45]], 0
+; CHECK-NEXT: [[R46:%.*]] = extractvalue { i8, i1 } [[C46]], 0
+; CHECK-NEXT: [[R47:%.*]] = extractvalue { i8, i1 } [[C47]], 0
+; CHECK-NEXT: [[R48:%.*]] = extractvalue { i8, i1 } [[C48]], 0
+; CHECK-NEXT: [[R49:%.*]] = extractvalue { i8, i1 } [[C49]], 0
+; CHECK-NEXT: [[R50:%.*]] = extractvalue { i8, i1 } [[C50]], 0
+; CHECK-NEXT: [[R51:%.*]] = extractvalue { i8, i1 } [[C51]], 0
+; CHECK-NEXT: [[R52:%.*]] = extractvalue { i8, i1 } [[C52]], 0
+; CHECK-NEXT: [[R53:%.*]] = extractvalue { i8, i1 } [[C53]], 0
+; CHECK-NEXT: [[R54:%.*]] = extractvalue { i8, i1 } [[C54]], 0
+; CHECK-NEXT: [[R55:%.*]] = extractvalue { i8, i1 } [[C55]], 0
+; CHECK-NEXT: [[R56:%.*]] = extractvalue { i8, i1 } [[C56]], 0
+; CHECK-NEXT: [[R57:%.*]] = extractvalue { i8, i1 } [[C57]], 0
+; CHECK-NEXT: [[R58:%.*]] = extractvalue { i8, i1 } [[C58]], 0
+; CHECK-NEXT: [[R59:%.*]] = extractvalue { i8, i1 } [[C59]], 0
+; CHECK-NEXT: [[R60:%.*]] = extractvalue { i8, i1 } [[C60]], 0
+; CHECK-NEXT: [[R61:%.*]] = extractvalue { i8, i1 } [[C61]], 0
+; CHECK-NEXT: [[R62:%.*]] = extractvalue { i8, i1 } [[C62]], 0
+; CHECK-NEXT: [[R63:%.*]] = extractvalue { i8, i1 } [[C63]], 0
+; CHECK-NEXT: store i8 [[R0]], ptr @c8, align 1
+; CHECK-NEXT: store i8 [[R1]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 1), align 1
+; CHECK-NEXT: store i8 [[R2]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 2), align 1
+; CHECK-NEXT: store i8 [[R3]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 3), align 1
+; CHECK-NEXT: store i8 [[R4]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 4), align 1
+; CHECK-NEXT: store i8 [[R5]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 5), align 1
+; CHECK-NEXT: store i8 [[R6]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 6), align 1
+; CHECK-NEXT: store i8 [[R7]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 7), align 1
+; CHECK-NEXT: store i8 [[R8]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 8), align 1
+; CHECK-NEXT: store i8 [[R9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 9), align 1
+; CHECK-NEXT: store i8 [[R10]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 10), align 1
+; CHECK-NEXT: store i8 [[R11]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 11), align 1
+; CHECK-NEXT: store i8 [[R12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 12), align 1
+; CHECK-NEXT: store i8 [[R13]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 13), align 1
+; CHECK-NEXT: store i8 [[R14]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 14), align 1
+; CHECK-NEXT: store i8 [[R15]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 15), align 1
+; CHECK-NEXT: store i8 [[R16]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1
+; CHECK-NEXT: store i8 [[R17]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 17), align 1
+; CHECK-NEXT: store i8 [[R18]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 18), align 1
+; CHECK-NEXT: store i8 [[R19]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 19), align 1
+; CHECK-NEXT: store i8 [[R20]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 20), align 1
+; CHECK-NEXT: store i8 [[R21]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 21), align 1
+; CHECK-NEXT: store i8 [[R22]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 22), align 1
+; CHECK-NEXT: store i8 [[R23]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 23), align 1
+; CHECK-NEXT: store i8 [[R24]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 24), align 1
+; CHECK-NEXT: store i8 [[R25]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 25), align 1
+; CHECK-NEXT: store i8 [[R26]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 26), align 1
+; CHECK-NEXT: store i8 [[R27]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 27), align 1
+; CHECK-NEXT: store i8 [[R28]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 28), align 1
+; CHECK-NEXT: store i8 [[R29]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 29), align 1
+; CHECK-NEXT: store i8 [[R30]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 30), align 1
+; CHECK-NEXT: store i8 [[R31]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 31), align 1
+; CHECK-NEXT: store i8 [[R32]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
+; CHECK-NEXT: store i8 [[R33]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 33), align 1
+; CHECK-NEXT: store i8 [[R34]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 34), align 1
+; CHECK-NEXT: store i8 [[R35]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 35), align 1
+; CHECK-NEXT: store i8 [[R36]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 36), align 1
+; CHECK-NEXT: store i8 [[R37]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 37), align 1
+; CHECK-NEXT: store i8 [[R38]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 38), align 1
+; CHECK-NEXT: store i8 [[R39]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 39), align 1
+; CHECK-NEXT: store i8 [[R40]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 40), align 1
+; CHECK-NEXT: store i8 [[R41]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 41), align 1
+; CHECK-NEXT: store i8 [[R42]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 42), align 1
+; CHECK-NEXT: store i8 [[R43]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 43), align 1
+; CHECK-NEXT: store i8 [[R44]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 44), align 1
+; CHECK-NEXT: store i8 [[R45]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 45), align 1
+; CHECK-NEXT: store i8 [[R46]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 46), align 1
+; CHECK-NEXT: store i8 [[R47]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 47), align 1
+; CHECK-NEXT: store i8 [[R48]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
+; CHECK-NEXT: store i8 [[R49]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 49), align 1
+; CHECK-NEXT: store i8 [[R50]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 50), align 1
+; CHECK-NEXT: store i8 [[R51]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 51), align 1
+; CHECK-NEXT: store i8 [[R52]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 52), align 1
+; CHECK-NEXT: store i8 [[R53]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 53), align 1
+; CHECK-NEXT: store i8 [[R54]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 54), align 1
+; CHECK-NEXT: store i8 [[R55]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 55), align 1
+; CHECK-NEXT: store i8 [[R56]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 56), align 1
+; CHECK-NEXT: store i8 [[R57]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 57), align 1
+; CHECK-NEXT: store i8 [[R58]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 58), align 1
+; CHECK-NEXT: store i8 [[R59]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 59), align 1
+; CHECK-NEXT: store i8 [[R60]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 60), align 1
+; CHECK-NEXT: store i8 [[R61]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 61), align 1
+; CHECK-NEXT: store i8 [[R62]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 62), align 1
+; CHECK-NEXT: store i8 [[R63]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 63), align 1
+; CHECK-NEXT: ret void
;
%a0 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 0 ), align 1
%a1 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 1 ), align 1
@@ -1084,5 +1252,3 @@ define void @sub_v64i8() {
store i8 %r63, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 63), align 1
ret void
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK: {{.*}}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/revec-non-power-2-to-power-2-large-vect.ll b/llvm/test/Transforms/SLPVectorizer/X86/revec-non-power-2-to-power-2-large-vect.ll
index 9683f71bd40e0..7740aaa14b805 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/revec-non-power-2-to-power-2-large-vect.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/revec-non-power-2-to-power-2-large-vect.ll
@@ -13,12 +13,10 @@ define float @test(ptr %0, double %1, double %2, double %3) {
; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <3 x double> [[TMP9]], <3 x double> poison, <3 x i32> <i32 1, i32 0, i32 2>
; CHECK-NEXT: [[TMP14:%.*]] = fmul <3 x double> [[TMP13]], zeroinitializer
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1356
-; CHECK-NEXT: [[TMP27:%.*]] = insertelement <2 x double> poison, double [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <2 x double> [[TMP27]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP16:%.*]] = fmul <2 x double> [[TMP28]], <double 0.000000e+00, double 1.000000e+00>
+; CHECK-NEXT: [[TMP16:%.*]] = fmul double [[TMP1]], 0.000000e+00
; CHECK-NEXT: [[TMP17:%.*]] = insertelement <3 x double> <double -0.000000e+00, double poison, double -0.000000e+00>, double [[TMP6]], i32 1
-; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x double> [[TMP16]], <2 x double> poison, <3 x i32> <i32 0, i32 1, i32 poison>
-; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <3 x double> <double poison, double poison, double -0.000000e+00>, <3 x double> [[TMP18]], <3 x i32> <i32 3, i32 4, i32 2>
+; CHECK-NEXT: [[TMP18:%.*]] = insertelement <3 x double> <double poison, double poison, double -0.000000e+00>, double [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <3 x double> [[TMP18]], double [[TMP16]], i32 0
; CHECK-NEXT: [[TMP20:%.*]] = fadd <3 x double> [[TMP17]], [[TMP19]]
; CHECK-NEXT: [[TMP21:%.*]] = insertelement <3 x double> <double 0.000000e+00, double poison, double 0.000000e+00>, double [[TMP5]], i32 1
; CHECK-NEXT: [[TMP22:%.*]] = fadd <3 x double> [[TMP21]], [[TMP20]]
diff --git a/llvm/test/Transforms/SLPVectorizer/sincos.ll b/llvm/test/Transforms/SLPVectorizer/sincos.ll
index 504467d0049d7..76545dedac5f5 100644
--- a/llvm/test/Transforms/SLPVectorizer/sincos.ll
+++ b/llvm/test/Transforms/SLPVectorizer/sincos.ll
@@ -8,40 +8,52 @@
define i32 @test() {
; CHECK-LABEL: define i32 @test() {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <8 x double>, ptr @phase, align 16
-; CHECK-NEXT: [[TMP1:%.*]] = call fast { <8 x double>, <8 x double> } @llvm.sincos.v8f64(<8 x double> [[TMP0]])
-; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <8 x double>, <8 x double> } [[TMP1]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x double> [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <8 x double>, <8 x double> } [[TMP1]], 1
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x double> [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr @phase, align 16
+; CHECK-NEXT: [[TMP1:%.*]] = tail call fast { double, double } @llvm.sincos.f64(double [[TMP0]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { double, double } [[TMP1]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { double, double } [[TMP1]], 1
; CHECK-NEXT: store double [[TMP2]], ptr @sinval, align 16
; CHECK-NEXT: store double [[TMP3]], ptr @cosval, align 16
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x double> [[TMP5]], i32 1
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x double> [[TMP4]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @phase, i64 8), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = tail call fast { double, double } @llvm.sincos.f64(double [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { double, double } [[TMP5]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { double, double } [[TMP5]], 1
; CHECK-NEXT: store double [[TMP6]], ptr getelementptr inbounds nuw (i8, ptr @sinval, i64 8), align 8
; CHECK-NEXT: store double [[TMP7]], ptr getelementptr inbounds nuw (i8, ptr @cosval, i64 8), align 8
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x double> [[TMP5]], i32 2
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x double> [[TMP4]], i32 2
+; CHECK-NEXT: [[TMP8:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @phase, i64 16), align 16
+; CHECK-NEXT: [[TMP9:%.*]] = tail call fast { double, double } @llvm.sincos.f64(double [[TMP8]])
+; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { double, double } [[TMP9]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { double, double } [[TMP9]], 1
; CHECK-NEXT: store double [[TMP10]], ptr getelementptr inbounds nuw (i8, ptr @sinval, i64 16), align 16
; CHECK-NEXT: store double [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @cosval, i64 16), align 16
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x double> [[TMP5]], i32 3
-; CHECK-NEXT: [[TMP15:%.*]] = extractelement <8 x double> [[TMP4]], i32 3
+; CHECK-NEXT: [[TMP12:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @phase, i64 24), align 8
+; CHECK-NEXT: [[TMP13:%.*]] = tail call fast { double, double } @llvm.sincos.f64(double [[TMP12]])
+; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { double, double } [[TMP13]], 0
+; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { double, double } [[TMP13]], 1
; CHECK-NEXT: store double [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @sinval, i64 24), align 8
; CHECK-NEXT: store double [[TMP15]], ptr getelementptr inbounds nuw (i8, ptr @cosval, i64 24), align 8
-; CHECK-NEXT: [[TMP18:%.*]] = extractelement <8 x double> [[TMP5]], i32 4
-; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x double> [[TMP4]], i32 4
+; CHECK-NEXT: [[TMP16:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @phase, i64 32), align 16
+; CHECK-NEXT: [[TMP17:%.*]] = tail call fast { double, double } @llvm.sincos.f64(double [[TMP16]])
+; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { double, double } [[TMP17]], 0
+; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { double, double } [[TMP17]], 1
; CHECK-NEXT: store double [[TMP18]], ptr getelementptr inbounds nuw (i8, ptr @sinval, i64 32), align 16
; CHECK-NEXT: store double [[TMP19]], ptr getelementptr inbounds nuw (i8, ptr @cosval, i64 32), align 16
-; CHECK-NEXT: [[TMP22:%.*]] = extractelement <8 x double> [[TMP5]], i32 5
-; CHECK-NEXT: [[TMP23:%.*]] = extractelement <8 x double> [[TMP4]], i32 5
+; CHECK-NEXT: [[TMP20:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @phase, i64 40), align 8
+; CHECK-NEXT: [[TMP21:%.*]] = tail call fast { double, double } @llvm.sincos.f64(double [[TMP20]])
+; CHECK-NEXT: [[TMP22:%.*]] = extractvalue { double, double } [[TMP21]], 0
+; CHECK-NEXT: [[TMP23:%.*]] = extractvalue { double, double } [[TMP21]], 1
; CHECK-NEXT: store double [[TMP22]], ptr getelementptr inbounds nuw (i8, ptr @sinval, i64 40), align 8
; CHECK-NEXT: store double [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @cosval, i64 40), align 8
-; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x double> [[TMP5]], i32 6
-; CHECK-NEXT: [[TMP27:%.*]] = extractelement <8 x double> [[TMP4]], i32 6
+; CHECK-NEXT: [[TMP24:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @phase, i64 48), align 16
+; CHECK-NEXT: [[TMP25:%.*]] = tail call fast { double, double } @llvm.sincos.f64(double [[TMP24]])
+; CHECK-NEXT: [[TMP26:%.*]] = extractvalue { double, double } [[TMP25]], 0
+; CHECK-NEXT: [[TMP27:%.*]] = extractvalue { double, double } [[TMP25]], 1
; CHECK-NEXT: store double [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @sinval, i64 48), align 16
; CHECK-NEXT: store double [[TMP27]], ptr getelementptr inbounds nuw (i8, ptr @cosval, i64 48), align 16
-; CHECK-NEXT: [[TMP30:%.*]] = extractelement <8 x double> [[TMP5]], i32 7
-; CHECK-NEXT: [[TMP31:%.*]] = extractelement <8 x double> [[TMP4]], i32 7
+; CHECK-NEXT: [[TMP28:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @phase, i64 56), align 8
+; CHECK-NEXT: [[TMP29:%.*]] = tail call fast { double, double } @llvm.sincos.f64(double [[TMP28]])
+; CHECK-NEXT: [[TMP30:%.*]] = extractvalue { double, double } [[TMP29]], 0
+; CHECK-NEXT: [[TMP31:%.*]] = extractvalue { double, double } [[TMP29]], 1
; CHECK-NEXT: store double [[TMP30]], ptr getelementptr inbounds nuw (i8, ptr @sinval, i64 56), align 8
; CHECK-NEXT: store double [[TMP31]], ptr getelementptr inbounds nuw (i8, ptr @cosval, i64 56), align 8
; CHECK-NEXT: ret i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/struct-return-revec.ll b/llvm/test/Transforms/SLPVectorizer/struct-return-revec.ll
index 10bee3262f738..45d6e395b6886 100644
--- a/llvm/test/Transforms/SLPVectorizer/struct-return-revec.ll
+++ b/llvm/test/Transforms/SLPVectorizer/struct-return-revec.ll
@@ -8,24 +8,28 @@
define i32 @test() {
; CHECK-LABEL: define i32 @test() {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <8 x double>, ptr @phase, align 16
-; CHECK-NEXT: [[TMP1:%.*]] = call fast { <8 x double>, <8 x double> } @llvm.sincos.v8f64(<8 x double> [[TMP0]])
-; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <8 x double>, <8 x double> } [[TMP1]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[TMP5]], <8 x double> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <8 x double>, <8 x double> } [[TMP1]], 1
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[TMP4]], <8 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr @phase, align 16
+; CHECK-NEXT: [[TMP1:%.*]] = tail call fast { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> [[TMP0]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP1]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP1]], 1
; CHECK-NEXT: store <2 x double> [[TMP2]], ptr @sinval, align 16
; CHECK-NEXT: store <2 x double> [[TMP3]], ptr @cosval, align 16
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x double> [[TMP5]], <8 x double> poison, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x double> [[TMP4]], <8 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, ptr getelementptr inbounds nuw (i8, ptr @phase, i64 16), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = tail call fast { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP5]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP5]], 1
; CHECK-NEXT: store <2 x double> [[TMP6]], ptr getelementptr inbounds nuw (i8, ptr @sinval, i64 16), align 8
; CHECK-NEXT: store <2 x double> [[TMP7]], ptr getelementptr inbounds nuw (i8, ptr @cosval, i64 16), align 8
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x double> [[TMP5]], <8 x double> poison, <2 x i32> <i32 4, i32 5>
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x double> [[TMP4]], <8 x double> poison, <2 x i32> <i32 4, i32 5>
+; CHECK-NEXT: [[TMP8:%.*]] = load <2 x double>, ptr getelementptr inbounds nuw (i8, ptr @phase, i64 32), align 16
+; CHECK-NEXT: [[TMP9:%.*]] = tail call fast { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> [[TMP8]])
+; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP9]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP9]], 1
; CHECK-NEXT: store <2 x double> [[TMP10]], ptr getelementptr inbounds nuw (i8, ptr @sinval, i64 32), align 16
; CHECK-NEXT: store <2 x double> [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @cosval, i64 32), align 16
-; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <8 x double> [[TMP5]], <8 x double> poison, <2 x i32> <i32 6, i32 7>
-; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <8 x double> [[TMP4]], <8 x double> poison, <2 x i32> <i32 6, i32 7>
+; CHECK-NEXT: [[TMP12:%.*]] = load <2 x double>, ptr getelementptr inbounds nuw (i8, ptr @phase, i64 48), align 8
+; CHECK-NEXT: [[TMP13:%.*]] = tail call fast { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> [[TMP12]])
+; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP13]], 0
+; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP13]], 1
; CHECK-NEXT: store <2 x double> [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @sinval, i64 48), align 8
; CHECK-NEXT: store <2 x double> [[TMP15]], ptr getelementptr inbounds nuw (i8, ptr @cosval, i64 48), align 8
; CHECK-NEXT: ret i32 0
More information about the llvm-branch-commits
mailing list