[clang] [compiler-rt] [flang] [libc] [libcxx] [lldb] [llvm] [RFC][Clang] Add __int256/__uint256 builtin types (PR #182733)
Xavier Roche via cfe-commits
cfe-commits at lists.llvm.org
Mon Mar 2 02:54:25 PST 2026
https://github.com/xroche updated https://github.com/llvm/llvm-project/pull/182733
>From 4ad63284b520eb707abb6219854df22225ed7b52 Mon Sep 17 00:00:00 2001
From: Xavier Roche <xavier.roche at algolia.com>
Date: Tue, 24 Feb 2026 21:37:59 +0100
Subject: [PATCH 01/17] [clang] Add __int256/__uint256 builtin type definitions
Define Int256 and UInt256 as new builtin types alongside __int128/__uint128.
Add type specifiers, token kinds, target info queries (hasInt256Type(),
getInt256Align()), and target-specific overrides (SPIR-based targets disable the type).
Plumbs through BuiltinTypes.def, TypeBase.h, Specifiers.h, DeclID.h,
TypeLoc.h, TokenKinds.def, and TargetInfo.
Assisted-by: Claude (Anthropic)
Co-Authored-By: Claude Opus 4.6 <noreply at anthropic.com>
---
clang/include/clang/AST/BuiltinTypes.def | 6 ++++++
clang/include/clang/AST/DeclID.h | 6 ++++++
clang/include/clang/AST/TypeBase.h | 10 ++++------
clang/include/clang/AST/TypeLoc.h | 2 +-
clang/include/clang/Basic/Specifiers.h | 1 +
clang/include/clang/Basic/TargetInfo.h | 9 +++++++++
clang/include/clang/Basic/TokenKinds.def | 1 +
clang/lib/Basic/TargetInfo.cpp | 1 +
clang/lib/Basic/Targets/SPIR.h | 4 ++++
9 files changed, 33 insertions(+), 7 deletions(-)
diff --git a/clang/include/clang/AST/BuiltinTypes.def b/clang/include/clang/AST/BuiltinTypes.def
index 444be4311a743..5af242566d84f 100644
--- a/clang/include/clang/AST/BuiltinTypes.def
+++ b/clang/include/clang/AST/BuiltinTypes.def
@@ -95,6 +95,9 @@ UNSIGNED_TYPE(ULongLong, UnsignedLongLongTy)
// '__uint128_t'
UNSIGNED_TYPE(UInt128, UnsignedInt128Ty)
+// '__uint256_t'
+UNSIGNED_TYPE(UInt256, UnsignedInt256Ty)
+
//===- Signed Types -------------------------------------------------------===//
// 'char' for targets where it's signed
@@ -121,6 +124,9 @@ SIGNED_TYPE(LongLong, LongLongTy)
// '__int128_t'
SIGNED_TYPE(Int128, Int128Ty)
+// '__int256_t'
+SIGNED_TYPE(Int256, Int256Ty)
+
//===- Fixed point types --------------------------------------------------===//
// 'short _Accum'
diff --git a/clang/include/clang/AST/DeclID.h b/clang/include/clang/AST/DeclID.h
index 47ae05b2747ae..801defab4dfe5 100644
--- a/clang/include/clang/AST/DeclID.h
+++ b/clang/include/clang/AST/DeclID.h
@@ -53,6 +53,12 @@ enum PredefinedDeclIDs {
/// The unsigned 128-bit integer type.
PREDEF_DECL_UNSIGNED_INT_128_ID,
+ /// The signed 256-bit integer type.
+ PREDEF_DECL_INT_256_ID,
+
+ /// The unsigned 256-bit integer type.
+ PREDEF_DECL_UNSIGNED_INT_256_ID,
+
/// The internal 'instancetype' typedef.
PREDEF_DECL_OBJC_INSTANCETYPE_ID,
diff --git a/clang/include/clang/AST/TypeBase.h b/clang/include/clang/AST/TypeBase.h
index 9402469f5e12b..dba08422ca8d2 100644
--- a/clang/include/clang/AST/TypeBase.h
+++ b/clang/include/clang/AST/TypeBase.h
@@ -1935,7 +1935,7 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase {
unsigned : NumTypeBits;
/// The kind (BuiltinType::Kind) of builtin type this is.
- static constexpr unsigned NumOfBuiltinTypeBits = 9;
+ static constexpr unsigned NumOfBuiltinTypeBits = 10;
unsigned Kind : NumOfBuiltinTypeBits;
};
@@ -3230,16 +3230,14 @@ class BuiltinType : public Type {
bool isSugared() const { return false; }
QualType desugar() const { return QualType(this, 0); }
- bool isInteger() const {
- return getKind() >= Bool && getKind() <= Int128;
- }
+ bool isInteger() const { return getKind() >= Bool && getKind() <= Int256; }
bool isSignedInteger() const {
- return getKind() >= Char_S && getKind() <= Int128;
+ return getKind() >= Char_S && getKind() <= Int256;
}
bool isUnsignedInteger() const {
- return getKind() >= Bool && getKind() <= UInt128;
+ return getKind() >= Bool && getKind() <= UInt256;
}
bool isFloatingPoint() const {
diff --git a/clang/include/clang/AST/TypeLoc.h b/clang/include/clang/AST/TypeLoc.h
index 24df18dbaace4..5c972c857d1dc 100644
--- a/clang/include/clang/AST/TypeLoc.h
+++ b/clang/include/clang/AST/TypeLoc.h
@@ -605,7 +605,7 @@ class BuiltinTypeLoc : public ConcreteTypeLoc<UnqualTypeLoc,
bool needsExtraLocalData() const {
BuiltinType::Kind bk = getTypePtr()->getKind();
- return (bk >= BuiltinType::UShort && bk <= BuiltinType::UInt128) ||
+ return (bk >= BuiltinType::UShort && bk <= BuiltinType::UInt256) ||
(bk >= BuiltinType::Short && bk <= BuiltinType::Ibm128) ||
bk == BuiltinType::UChar || bk == BuiltinType::SChar;
}
diff --git a/clang/include/clang/Basic/Specifiers.h b/clang/include/clang/Basic/Specifiers.h
index 118c3b75aed95..ca644b7d01392 100644
--- a/clang/include/clang/Basic/Specifiers.h
+++ b/clang/include/clang/Basic/Specifiers.h
@@ -62,6 +62,7 @@ namespace clang {
TST_char32, // C++11 char32_t
TST_int,
TST_int128,
+ TST_int256,
TST_bitint, // Bit-precise integer types.
TST_half, // OpenCL half, ARM NEON __fp16
TST_Float16, // C11 extension ISO/IEC TS 18661-3
diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h
index ec6cd2be7c3c5..61b5d80f4f102 100644
--- a/clang/include/clang/Basic/TargetInfo.h
+++ b/clang/include/clang/Basic/TargetInfo.h
@@ -100,6 +100,7 @@ struct TransferrableTargetInfo {
unsigned char LongWidth, LongAlign;
unsigned char LongLongWidth, LongLongAlign;
unsigned char Int128Align;
+ unsigned short Int256Align;
// This is an optional parameter for targets that
// don't use 'LongLongAlign' for '_BitInt' max alignment
@@ -543,6 +544,9 @@ class TargetInfo : public TransferrableTargetInfo,
/// getInt128Align() - Returns the alignment of Int128.
unsigned getInt128Align() const { return Int128Align; }
+ /// getInt256Align() - Returns the alignment of Int256.
+ unsigned getInt256Align() const { return Int256Align; }
+
/// getBitIntMaxAlign() - Returns the maximum possible alignment of
/// '_BitInt' and 'unsigned _BitInt'.
unsigned getBitIntMaxAlign() const {
@@ -680,6 +684,11 @@ class TargetInfo : public TransferrableTargetInfo,
getTargetOpts().ForceEnableInt128;
} // FIXME
+ /// Determine whether the __int256 type is supported on this target.
+ virtual bool hasInt256Type() const {
+ return getPointerWidth(LangAS::Default) >= 64;
+ }
+
/// Determine whether the _BitInt type is supported on this target. This
/// limitation is put into place for ABI reasons.
/// FIXME: _BitInt is a required type in C23, so there's not much utility in
diff --git a/clang/include/clang/Basic/TokenKinds.def b/clang/include/clang/Basic/TokenKinds.def
index 8b9f613037718..0ebd1743f8ba0 100644
--- a/clang/include/clang/Basic/TokenKinds.def
+++ b/clang/include/clang/Basic/TokenKinds.def
@@ -488,6 +488,7 @@ KEYWORD(__float128 , KEYALL)
KEYWORD(__ibm128 , KEYALL)
KEYWORD(__imag , KEYALL)
KEYWORD(__int128 , KEYALL)
+KEYWORD(__int256 , KEYALL)
KEYWORD(__label__ , KEYALL)
KEYWORD(__real , KEYALL)
KEYWORD(__thread , KEYALL)
diff --git a/clang/lib/Basic/TargetInfo.cpp b/clang/lib/Basic/TargetInfo.cpp
index 794621c4b3e1f..c04bde68c937d 100644
--- a/clang/lib/Basic/TargetInfo.cpp
+++ b/clang/lib/Basic/TargetInfo.cpp
@@ -82,6 +82,7 @@ TargetInfo::TargetInfo(const llvm::Triple &T) : Triple(T) {
LongWidth = LongAlign = 32;
LongLongWidth = LongLongAlign = 64;
Int128Align = 128;
+ Int256Align = 128;
// Fixed point default bit widths
ShortAccumWidth = ShortAccumAlign = 16;
diff --git a/clang/lib/Basic/Targets/SPIR.h b/clang/lib/Basic/Targets/SPIR.h
index eef9521c7434a..a8d60e7714295 100644
--- a/clang/lib/Basic/Targets/SPIR.h
+++ b/clang/lib/Basic/Targets/SPIR.h
@@ -242,6 +242,8 @@ class LLVM_LIBRARY_VISIBILITY BaseSPIRTargetInfo : public TargetInfo {
bool hasBitIntType() const override { return true; }
bool hasInt128Type() const override { return false; }
+
+ bool hasInt256Type() const override { return false; }
};
class LLVM_LIBRARY_VISIBILITY SPIRTargetInfo : public BaseSPIRTargetInfo {
@@ -478,6 +480,8 @@ class LLVM_LIBRARY_VISIBILITY SPIRV64AMDGCNTargetInfo final
}
bool hasInt128Type() const override { return TargetInfo::hasInt128Type(); }
+
+ bool hasInt256Type() const override { return TargetInfo::hasInt256Type(); }
};
class LLVM_LIBRARY_VISIBILITY SPIRV64IntelTargetInfo final
>From 6d9f71a193f178234c98f97be477220572146a1a Mon Sep 17 00:00:00 2001
From: Xavier Roche <xavier.roche at algolia.com>
Date: Tue, 24 Feb 2026 21:38:16 +0100
Subject: [PATCH 02/17] [clang] Add __int256/__uint256 AST, parser, sema, and
codegen
Wire __int256/__uint256 through all clang subsystems:
- AST: context queries, constant evaluation, mangling (Itanium/MSVC),
printf format strings, record layout, type printing, type locs
- Parser: declaration specifiers, expression parsing, tentative parsing
- Sema: type checking, overload resolution, template variadic support,
atomic/bitfield constraints, tautological comparisons
- CodeGen: LLVM type mapping, debug info (DW_ATE_signed/unsigned_256),
TBAA, Swift calling convention, X86 ABI, CIR
- Serialization: AST reader/writer with new type ID
- Frontend: preprocessor macros (__INT256_MAX__, etc.)
- Index: USR generation
Assisted-by: Claude (Anthropic)
Co-Authored-By: Claude Opus 4.6 <noreply at anthropic.com>
---
clang/include/clang/AST/ASTContext.h | 17 ++++++-
clang/include/clang/Sema/DeclSpec.h | 1 +
.../include/clang/Serialization/ASTBitCodes.h | 8 +++-
clang/lib/AST/ASTContext.cpp | 36 +++++++++++++++
clang/lib/AST/ExprConstant.cpp | 1 +
clang/lib/AST/ItaniumMangle.cpp | 18 ++++++++
clang/lib/AST/MicrosoftMangle.cpp | 6 +++
clang/lib/AST/NSAPI.cpp | 2 +
clang/lib/AST/PrintfFormatString.cpp | 2 +
clang/lib/AST/RecordLayoutBuilder.cpp | 1 +
clang/lib/AST/StmtPrinter.cpp | 4 ++
clang/lib/AST/Type.cpp | 4 ++
clang/lib/AST/TypeLoc.cpp | 2 +
clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp | 2 +
clang/lib/CIR/CodeGen/CIRGenModule.cpp | 2 +
clang/lib/CIR/CodeGen/CIRGenTypeCache.h | 2 +
clang/lib/CIR/CodeGen/CIRGenTypes.cpp | 2 +
clang/lib/CodeGen/CGDebugInfo.cpp | 2 +
clang/lib/CodeGen/CodeGenModule.cpp | 5 +++
clang/lib/CodeGen/CodeGenTBAA.cpp | 2 +
clang/lib/CodeGen/CodeGenTypes.cpp | 5 +++
clang/lib/CodeGen/ItaniumCXXABI.cpp | 44 ++++++++++++-------
clang/lib/CodeGen/SwiftCallingConv.cpp | 3 ++
clang/lib/CodeGen/Targets/X86.cpp | 29 ++++++++++--
clang/lib/Frontend/InitPreprocessor.cpp | 2 +
clang/lib/Index/USRGeneration.cpp | 6 +++
clang/lib/Parse/ParseDecl.cpp | 7 +++
clang/lib/Parse/ParseExpr.cpp | 1 +
clang/lib/Parse/ParseExprCXX.cpp | 3 ++
clang/lib/Parse/ParseTentative.cpp | 2 +
clang/lib/Sema/DeclSpec.cpp | 8 +++-
clang/lib/Sema/Sema.cpp | 14 ++++++
clang/lib/Sema/SemaChecking.cpp | 10 ++---
clang/lib/Sema/SemaOverload.cpp | 8 ++++
clang/lib/Sema/SemaTemplateVariadic.cpp | 1 +
clang/lib/Sema/SemaType.cpp | 28 ++++++++----
clang/lib/Sema/TreeTransform.h | 8 ++--
clang/lib/Serialization/ASTCommon.cpp | 6 +++
clang/lib/Serialization/ASTReader.cpp | 18 ++++++++
clang/lib/Serialization/ASTWriter.cpp | 2 +
40 files changed, 283 insertions(+), 41 deletions(-)
diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index 05302c30d18d1..2a552066afb8f 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -433,6 +433,12 @@ class ASTContext : public RefCountedBase<ASTContext> {
/// The typedef for the __uint128_t type.
mutable TypedefDecl *UInt128Decl = nullptr;
+ /// The typedef for the __int256_t type.
+ mutable TypedefDecl *Int256Decl = nullptr;
+
+ /// The typedef for the __uint256_t type.
+ mutable TypedefDecl *UInt256Decl = nullptr;
+
/// The typedef for the target specific predefined
/// __builtin_va_list type.
mutable TypedefDecl *BuiltinVaListDecl = nullptr;
@@ -1296,9 +1302,10 @@ class ASTContext : public RefCountedBase<ASTContext> {
CanQualType Char8Ty; // [C++20 proposal]
CanQualType Char16Ty; // [C++0x 3.9.1p5], integer type in C99.
CanQualType Char32Ty; // [C++0x 3.9.1p5], integer type in C99.
- CanQualType SignedCharTy, ShortTy, IntTy, LongTy, LongLongTy, Int128Ty;
+ CanQualType SignedCharTy, ShortTy, IntTy, LongTy, LongLongTy, Int128Ty,
+ Int256Ty;
CanQualType UnsignedCharTy, UnsignedShortTy, UnsignedIntTy, UnsignedLongTy;
- CanQualType UnsignedLongLongTy, UnsignedInt128Ty;
+ CanQualType UnsignedLongLongTy, UnsignedInt128Ty, UnsignedInt256Ty;
CanQualType FloatTy, DoubleTy, LongDoubleTy, Float128Ty, Ibm128Ty;
CanQualType ShortAccumTy, AccumTy,
LongAccumTy; // ISO/IEC JTC1 SC22 WG14 N1169 Extension
@@ -1448,6 +1455,12 @@ class ASTContext : public RefCountedBase<ASTContext> {
/// Retrieve the declaration for the 128-bit unsigned integer type.
TypedefDecl *getUInt128Decl() const;
+ /// Retrieve the declaration for the 256-bit signed integer type.
+ TypedefDecl *getInt256Decl() const;
+
+ /// Retrieve the declaration for the 256-bit unsigned integer type.
+ TypedefDecl *getUInt256Decl() const;
+
//===--------------------------------------------------------------------===//
// Type Constructors
//===--------------------------------------------------------------------===//
diff --git a/clang/include/clang/Sema/DeclSpec.h b/clang/include/clang/Sema/DeclSpec.h
index 6e5421c7072c7..40661f25fa65e 100644
--- a/clang/include/clang/Sema/DeclSpec.h
+++ b/clang/include/clang/Sema/DeclSpec.h
@@ -255,6 +255,7 @@ class DeclSpec {
static const TST TST_char32 = clang::TST_char32;
static const TST TST_int = clang::TST_int;
static const TST TST_int128 = clang::TST_int128;
+ static const TST TST_int256 = clang::TST_int256;
static const TST TST_bitint = clang::TST_bitint;
static const TST TST_half = clang::TST_half;
static const TST TST_BFloat16 = clang::TST_BFloat16;
diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h
index d72f1f9db86b2..9916b5cd9369b 100644
--- a/clang/include/clang/Serialization/ASTBitCodes.h
+++ b/clang/include/clang/Serialization/ASTBitCodes.h
@@ -1126,6 +1126,12 @@ enum PredefinedTypeIDs {
/// \brief The '__ibm128' type
PREDEF_TYPE_IBM128_ID = 74,
+ /// The '__uint256_t' type.
+ PREDEF_TYPE_UINT256_ID = 75,
+
+ /// The '__int256_t' type.
+ PREDEF_TYPE_INT256_ID = 76,
+
/// OpenCL image types with auto numeration
#define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
PREDEF_TYPE_##Id##_ID,
@@ -1163,7 +1169,7 @@ enum PredefinedTypeIDs {
///
/// Type IDs for non-predefined types will start at
/// NUM_PREDEF_TYPE_IDs.
-const unsigned NUM_PREDEF_TYPE_IDS = 514;
+const unsigned NUM_PREDEF_TYPE_IDS = 516;
// Ensure we do not overrun the predefined types we reserved
// in the enum PredefinedTypeIDs above.
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index 5fbdff280073f..a9f79015e8483 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -1215,6 +1215,18 @@ TypedefDecl *ASTContext::getUInt128Decl() const {
return UInt128Decl;
}
+TypedefDecl *ASTContext::getInt256Decl() const {
+ if (!Int256Decl)
+ Int256Decl = buildImplicitTypedef(Int256Ty, "__int256_t");
+ return Int256Decl;
+}
+
+TypedefDecl *ASTContext::getUInt256Decl() const {
+ if (!UInt256Decl)
+ UInt256Decl = buildImplicitTypedef(UnsignedInt256Ty, "__uint256_t");
+ return UInt256Decl;
+}
+
void ASTContext::InitBuiltinType(CanQualType &R, BuiltinType::Kind K) {
auto *Ty = new (*this, alignof(BuiltinType)) BuiltinType(K);
R = CanQualType::CreateUnsafe(QualType(Ty, 0));
@@ -1301,6 +1313,10 @@ void ASTContext::InitBuiltinTypes(const TargetInfo &Target,
InitBuiltinType(Int128Ty, BuiltinType::Int128);
InitBuiltinType(UnsignedInt128Ty, BuiltinType::UInt128);
+ // Extension, 256-bit integers.
+ InitBuiltinType(Int256Ty, BuiltinType::Int256);
+ InitBuiltinType(UnsignedInt256Ty, BuiltinType::UInt256);
+
// C++ 3.9.1p5
if (TargetInfo::isTypeSigned(Target.getWCharType()))
InitBuiltinType(WCharTy, BuiltinType::WChar_S);
@@ -2174,6 +2190,11 @@ TypeInfo ASTContext::getTypeInfoImpl(const Type *T) const {
Width = 128;
Align = Target->getInt128Align();
break;
+ case BuiltinType::Int256:
+ case BuiltinType::UInt256:
+ Width = 256;
+ Align = Target->getInt256Align();
+ break;
case BuiltinType::ShortAccum:
case BuiltinType::UShortAccum:
case BuiltinType::SatShortAccum:
@@ -8233,6 +8254,11 @@ unsigned ASTContext::getIntegerRank(const Type *T) const {
case BuiltinType::UInt128:
return 7 + (getIntWidth(Int128Ty) << 3);
+ case BuiltinType::Int256:
+ case BuiltinType::UInt256:
+ // Base rank > 7 is fine: only the total ordering matters, not the low bits.
+ return 8 + (getIntWidth(Int256Ty) << 3);
+
// "The ranks of char8_t, char16_t, char32_t, and wchar_t equal the ranks of
// their underlying types" [c++20 conv.rank]
case BuiltinType::Char8:
@@ -9165,6 +9191,8 @@ static char getObjCEncodingForPrimitiveType(const ASTContext *C,
case BuiltinType::ULong:
return C->getTargetInfo().getLongWidth() == 32 ? 'L' : 'Q';
case BuiltinType::UInt128: return 'T';
+ case BuiltinType::UInt256:
+ return 'W';
case BuiltinType::ULongLong: return 'Q';
case BuiltinType::Char_S:
case BuiltinType::SChar: return 'c';
@@ -9176,6 +9204,8 @@ static char getObjCEncodingForPrimitiveType(const ASTContext *C,
return C->getTargetInfo().getLongWidth() == 32 ? 'l' : 'q';
case BuiltinType::LongLong: return 'q';
case BuiltinType::Int128: return 't';
+ case BuiltinType::Int256:
+ return 'w';
case BuiltinType::Float: return 'f';
case BuiltinType::Double: return 'd';
case BuiltinType::LongDouble: return 'D';
@@ -12388,6 +12418,8 @@ QualType ASTContext::getCorrespondingUnsignedType(QualType T) const {
return UnsignedLongLongTy;
case BuiltinType::Int128:
return UnsignedInt128Ty;
+ case BuiltinType::Int256:
+ return UnsignedInt256Ty;
// wchar_t is special. It is either signed or not, but when it's signed,
// there's no matching "unsigned wchar_t". Therefore we return the unsigned
// version of its underlying type instead.
@@ -12462,6 +12494,8 @@ QualType ASTContext::getCorrespondingSignedType(QualType T) const {
return LongLongTy;
case BuiltinType::UInt128:
return Int128Ty;
+ case BuiltinType::UInt256:
+ return Int256Ty;
// wchar_t is special. It is either unsigned or not, but when it's unsigned,
// there's no matching "signed wchar_t". Therefore we return the signed
// version of its underlying type instead.
@@ -13466,6 +13500,8 @@ QualType ASTContext::getIntTypeForBitwidth(unsigned DestWidth,
CanQualType QualTy = getFromTargetType(Ty);
if (!QualTy && DestWidth == 128)
return Signed ? Int128Ty : UnsignedInt128Ty;
+ if (!QualTy && DestWidth == 256)
+ return Signed ? Int256Ty : UnsignedInt256Ty;
return QualTy;
}
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index feea97cd67534..ea985428d0251 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -15381,6 +15381,7 @@ GCCTypeClass EvaluateBuiltinClassifyType(QualType T,
case BuiltinType::ULong:
case BuiltinType::ULongLong:
case BuiltinType::UInt128:
+ case BuiltinType::UInt256:
return GCCTypeClass::Integer;
case BuiltinType::UShortAccum:
diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp
index 1faf7f1466e39..3e7a5fcc8a492 100644
--- a/clang/lib/AST/ItaniumMangle.cpp
+++ b/clang/lib/AST/ItaniumMangle.cpp
@@ -3177,6 +3177,12 @@ void CXXNameMangler::mangleType(const BuiltinType *T) {
Out << "u4i128";
addSubstitution(BuiltinType::Int128);
break;
+ case 256:
+ if (mangleSubstitution(BuiltinType::Int256))
+ break;
+ Out << "u4i256";
+ addSubstitution(BuiltinType::Int256);
+ break;
default:
llvm_unreachable("Unknown integer size for normalization");
}
@@ -3212,6 +3218,12 @@ void CXXNameMangler::mangleType(const BuiltinType *T) {
Out << "u4u128";
addSubstitution(BuiltinType::UInt128);
break;
+ case 256:
+ if (mangleSubstitution(BuiltinType::UInt256))
+ break;
+ Out << "u4u256";
+ addSubstitution(BuiltinType::UInt256);
+ break;
default:
llvm_unreachable("Unknown integer size for normalization");
}
@@ -3247,6 +3259,9 @@ void CXXNameMangler::mangleType(const BuiltinType *T) {
case BuiltinType::UInt128:
Out << 'o';
break;
+ case BuiltinType::UInt256:
+ Out << "u8__uint256";
+ break;
case BuiltinType::SChar:
Out << 'a';
break;
@@ -3278,6 +3293,9 @@ void CXXNameMangler::mangleType(const BuiltinType *T) {
case BuiltinType::Int128:
Out << 'n';
break;
+ case BuiltinType::Int256:
+ Out << "u7__int256";
+ break;
case BuiltinType::Float16:
Out << "DF16_";
break;
diff --git a/clang/lib/AST/MicrosoftMangle.cpp b/clang/lib/AST/MicrosoftMangle.cpp
index 1f28d281be9fe..e499551da5c83 100644
--- a/clang/lib/AST/MicrosoftMangle.cpp
+++ b/clang/lib/AST/MicrosoftMangle.cpp
@@ -2723,6 +2723,12 @@ void MicrosoftCXXNameMangler::mangleType(const BuiltinType *T, Qualifiers,
case BuiltinType::UInt128:
Out << "_M";
break;
+ case BuiltinType::Int256:
+ Out << "$$_L";
+ break;
+ case BuiltinType::UInt256:
+ Out << "$$_M";
+ break;
case BuiltinType::Bool:
Out << "_N";
break;
diff --git a/clang/lib/AST/NSAPI.cpp b/clang/lib/AST/NSAPI.cpp
index 17f5ee5dee3d1..43d964e2edca6 100644
--- a/clang/lib/AST/NSAPI.cpp
+++ b/clang/lib/AST/NSAPI.cpp
@@ -399,6 +399,7 @@ NSAPI::getNSNumberFactoryMethodKind(QualType T) const {
case BuiltinType::Char16:
case BuiltinType::Char32:
case BuiltinType::Int128:
+ case BuiltinType::Int256:
case BuiltinType::LongDouble:
case BuiltinType::ShortAccum:
case BuiltinType::Accum:
@@ -425,6 +426,7 @@ NSAPI::getNSNumberFactoryMethodKind(QualType T) const {
case BuiltinType::SatUFract:
case BuiltinType::SatULongFract:
case BuiltinType::UInt128:
+ case BuiltinType::UInt256:
case BuiltinType::Float16:
case BuiltinType::Float128:
case BuiltinType::Ibm128:
diff --git a/clang/lib/AST/PrintfFormatString.cpp b/clang/lib/AST/PrintfFormatString.cpp
index 855550475721a..bf821ad4e3049 100644
--- a/clang/lib/AST/PrintfFormatString.cpp
+++ b/clang/lib/AST/PrintfFormatString.cpp
@@ -820,6 +820,8 @@ bool PrintfSpecifier::fixType(QualType QT, const LangOptions &LangOpt,
case BuiltinType::Char32:
case BuiltinType::UInt128:
case BuiltinType::Int128:
+ case BuiltinType::UInt256:
+ case BuiltinType::Int256:
case BuiltinType::Half:
case BuiltinType::BFloat16:
case BuiltinType::Float16:
diff --git a/clang/lib/AST/RecordLayoutBuilder.cpp b/clang/lib/AST/RecordLayoutBuilder.cpp
index 0b1bf813efd10..6f7f09211bd0b 100644
--- a/clang/lib/AST/RecordLayoutBuilder.cpp
+++ b/clang/lib/AST/RecordLayoutBuilder.cpp
@@ -1470,6 +1470,7 @@ void ItaniumRecordLayoutBuilder::LayoutWideBitField(uint64_t FieldSize,
Context.UnsignedCharTy, Context.UnsignedShortTy,
Context.UnsignedIntTy, Context.UnsignedLongTy,
Context.UnsignedLongLongTy, Context.UnsignedInt128Ty,
+ Context.UnsignedInt256Ty,
};
QualType Type;
diff --git a/clang/lib/AST/StmtPrinter.cpp b/clang/lib/AST/StmtPrinter.cpp
index f4ce4a7573aab..78a8e024d7f0d 100644
--- a/clang/lib/AST/StmtPrinter.cpp
+++ b/clang/lib/AST/StmtPrinter.cpp
@@ -1506,6 +1506,10 @@ void StmtPrinter::VisitIntegerLiteral(IntegerLiteral *Node) {
break; // no suffix.
case BuiltinType::UInt128:
break; // no suffix.
+ case BuiltinType::Int256:
+ break; // no suffix.
+ case BuiltinType::UInt256:
+ break; // no suffix.
case BuiltinType::WChar_S:
case BuiltinType::WChar_U:
break; // no suffix
diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
index a85f08753a132..dbe50a7f4f927 100644
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -3446,6 +3446,8 @@ StringRef BuiltinType::getName(const PrintingPolicy &Policy) const {
return "long long";
case Int128:
return "__int128";
+ case Int256:
+ return "__int256";
case UChar:
return "unsigned char";
case UShort:
@@ -3458,6 +3460,8 @@ StringRef BuiltinType::getName(const PrintingPolicy &Policy) const {
return "unsigned long long";
case UInt128:
return "unsigned __int128";
+ case UInt256:
+ return "unsigned __int256";
case Half:
return Policy.Half ? "half" : "__fp16";
case BFloat16:
diff --git a/clang/lib/AST/TypeLoc.cpp b/clang/lib/AST/TypeLoc.cpp
index 53edfdb65a4d5..1766ca37c9a65 100644
--- a/clang/lib/AST/TypeLoc.cpp
+++ b/clang/lib/AST/TypeLoc.cpp
@@ -344,12 +344,14 @@ TypeSpecifierType BuiltinTypeLoc::getWrittenTypeSpec() const {
case BuiltinType::ULong:
case BuiltinType::ULongLong:
case BuiltinType::UInt128:
+ case BuiltinType::UInt256:
case BuiltinType::SChar:
case BuiltinType::Short:
case BuiltinType::Int:
case BuiltinType::Long:
case BuiltinType::LongLong:
case BuiltinType::Int128:
+ case BuiltinType::Int256:
case BuiltinType::Half:
case BuiltinType::Float:
case BuiltinType::Double:
diff --git a/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp b/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp
index a18e2b91b1dd4..3e1ef878f981b 100644
--- a/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp
@@ -669,6 +669,8 @@ static bool typeInfoIsInStandardLibrary(const BuiltinType *ty) {
case BuiltinType::Char32:
case BuiltinType::Int128:
case BuiltinType::UInt128:
+ case BuiltinType::Int256:
+ case BuiltinType::UInt256:
return true;
#define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
index 223b53731359a..a0328e1523eb2 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
@@ -82,6 +82,7 @@ CIRGenModule::CIRGenModule(mlir::MLIRContext &mlirContext,
sInt32Ty = cir::IntType::get(&getMLIRContext(), 32, /*isSigned=*/true);
sInt64Ty = cir::IntType::get(&getMLIRContext(), 64, /*isSigned=*/true);
sInt128Ty = cir::IntType::get(&getMLIRContext(), 128, /*isSigned=*/true);
+ sInt256Ty = cir::IntType::get(&getMLIRContext(), 256, /*isSigned=*/true);
uInt8Ty = cir::IntType::get(&getMLIRContext(), 8, /*isSigned=*/false);
uInt8PtrTy = cir::PointerType::get(uInt8Ty);
cirAllocaAddressSpace = getTargetCIRGenInfo().getCIRAllocaAddressSpace();
@@ -89,6 +90,7 @@ CIRGenModule::CIRGenModule(mlir::MLIRContext &mlirContext,
uInt32Ty = cir::IntType::get(&getMLIRContext(), 32, /*isSigned=*/false);
uInt64Ty = cir::IntType::get(&getMLIRContext(), 64, /*isSigned=*/false);
uInt128Ty = cir::IntType::get(&getMLIRContext(), 128, /*isSigned=*/false);
+ uInt256Ty = cir::IntType::get(&getMLIRContext(), 256, /*isSigned=*/false);
fP16Ty = cir::FP16Type::get(&getMLIRContext());
bFloat16Ty = cir::BF16Type::get(&getMLIRContext());
floatTy = cir::SingleType::get(&getMLIRContext());
diff --git a/clang/lib/CIR/CodeGen/CIRGenTypeCache.h b/clang/lib/CIR/CodeGen/CIRGenTypeCache.h
index 4f3c319816e3a..cd4227501c94c 100644
--- a/clang/lib/CIR/CodeGen/CIRGenTypeCache.h
+++ b/clang/lib/CIR/CodeGen/CIRGenTypeCache.h
@@ -35,6 +35,7 @@ struct CIRGenTypeCache {
cir::IntType sInt32Ty;
cir::IntType sInt64Ty;
cir::IntType sInt128Ty;
+ cir::IntType sInt256Ty;
// ClangIR unsigned integral type of common sizes
cir::IntType uInt8Ty;
@@ -42,6 +43,7 @@ struct CIRGenTypeCache {
cir::IntType uInt32Ty;
cir::IntType uInt64Ty;
cir::IntType uInt128Ty;
+ cir::IntType uInt256Ty;
// ClangIR floating-point types with fixed formats
cir::FP16Type fP16Ty;
diff --git a/clang/lib/CIR/CodeGen/CIRGenTypes.cpp b/clang/lib/CIR/CodeGen/CIRGenTypes.cpp
index d5641441b2384..8c96bcad11459 100644
--- a/clang/lib/CIR/CodeGen/CIRGenTypes.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenTypes.cpp
@@ -315,6 +315,7 @@ mlir::Type CIRGenTypes::convertType(QualType type) {
case BuiltinType::Char_S:
case BuiltinType::Int:
case BuiltinType::Int128:
+ case BuiltinType::Int256:
case BuiltinType::Long:
case BuiltinType::LongLong:
case BuiltinType::SChar:
@@ -387,6 +388,7 @@ mlir::Type CIRGenTypes::convertType(QualType type) {
case BuiltinType::UChar:
case BuiltinType::UInt:
case BuiltinType::UInt128:
+ case BuiltinType::UInt256:
case BuiltinType::ULong:
case BuiltinType::ULongLong:
case BuiltinType::UShort:
diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp
index 1758513a2844b..698cc97b49e8a 100644
--- a/clang/lib/CodeGen/CGDebugInfo.cpp
+++ b/clang/lib/CodeGen/CGDebugInfo.cpp
@@ -1122,6 +1122,7 @@ llvm::DIType *CGDebugInfo::CreateType(const BuiltinType *BT) {
case BuiltinType::UShort:
case BuiltinType::UInt:
case BuiltinType::UInt128:
+ case BuiltinType::UInt256:
case BuiltinType::ULong:
case BuiltinType::WChar_U:
case BuiltinType::ULongLong:
@@ -1130,6 +1131,7 @@ llvm::DIType *CGDebugInfo::CreateType(const BuiltinType *BT) {
case BuiltinType::Short:
case BuiltinType::Int:
case BuiltinType::Int128:
+ case BuiltinType::Int256:
case BuiltinType::Long:
case BuiltinType::WChar_S:
case BuiltinType::LongLong:
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index c31bcabe49016..107ef39106c18 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -375,6 +375,11 @@ static void checkDataLayoutConsistency(const TargetInfo &Target,
Triple.getArch() != llvm::Triple::ve)
Check("__int128", llvm::Type::getIntNTy(Context, 128), Target.Int128Align);
+ if (Target.hasInt256Type() && !Triple.isAMDGPU() && !Triple.isSPIRV() &&
+ Triple.getArch() != llvm::Triple::ve &&
+ Triple.getArch() != llvm::Triple::systemz)
+ Check("__int256", llvm::Type::getIntNTy(Context, 256), Target.Int256Align);
+
if (Target.hasFloat16Type())
Check("half", llvm::Type::getFloatingPointTy(Context, *Target.HalfFormat),
Target.HalfAlign);
diff --git a/clang/lib/CodeGen/CodeGenTBAA.cpp b/clang/lib/CodeGen/CodeGenTBAA.cpp
index cd08f3ec397a0..8a7743019c8e8 100644
--- a/clang/lib/CodeGen/CodeGenTBAA.cpp
+++ b/clang/lib/CodeGen/CodeGenTBAA.cpp
@@ -184,6 +184,8 @@ llvm::MDNode *CodeGenTBAA::getTypeInfoHelper(const Type *Ty) {
return getTypeInfo(Context.LongLongTy);
case BuiltinType::UInt128:
return getTypeInfo(Context.Int128Ty);
+ case BuiltinType::UInt256:
+ return getTypeInfo(Context.Int256Ty);
case BuiltinType::UShortFract:
return getTypeInfo(Context.ShortFractTy);
diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp
index 6bd79056e599a..5014cc9cea6b5 100644
--- a/clang/lib/CodeGen/CodeGenTypes.cpp
+++ b/clang/lib/CodeGen/CodeGenTypes.cpp
@@ -520,6 +520,11 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) {
ResultType = llvm::IntegerType::get(getLLVMContext(), 128);
break;
+ case BuiltinType::UInt256:
+ case BuiltinType::Int256:
+ ResultType = llvm::IntegerType::get(getLLVMContext(), 256);
+ break;
+
#define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
case BuiltinType::Id:
#include "clang/Basic/OpenCLImageTypes.def"
diff --git a/clang/lib/CodeGen/ItaniumCXXABI.cpp b/clang/lib/CodeGen/ItaniumCXXABI.cpp
index 52768a8300a20..5a71293510796 100644
--- a/clang/lib/CodeGen/ItaniumCXXABI.cpp
+++ b/clang/lib/CodeGen/ItaniumCXXABI.cpp
@@ -3726,6 +3726,8 @@ static bool TypeInfoIsInStandardLibrary(const BuiltinType *Ty) {
case BuiltinType::Char32:
case BuiltinType::Int128:
case BuiltinType::UInt128:
+ case BuiltinType::Int256:
+ case BuiltinType::UInt256:
return true;
#define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
@@ -4667,21 +4669,33 @@ llvm::Constant *ItaniumCXXABI::getAddrOfRTTIDescriptor(QualType Ty) {
void ItaniumCXXABI::EmitFundamentalRTTIDescriptors(const CXXRecordDecl *RD) {
// Types added here must also be added to TypeInfoIsInStandardLibrary.
- QualType FundamentalTypes[] = {
- getContext().VoidTy, getContext().NullPtrTy,
- getContext().BoolTy, getContext().WCharTy,
- getContext().CharTy, getContext().UnsignedCharTy,
- getContext().SignedCharTy, getContext().ShortTy,
- getContext().UnsignedShortTy, getContext().IntTy,
- getContext().UnsignedIntTy, getContext().LongTy,
- getContext().UnsignedLongTy, getContext().LongLongTy,
- getContext().UnsignedLongLongTy, getContext().Int128Ty,
- getContext().UnsignedInt128Ty, getContext().HalfTy,
- getContext().FloatTy, getContext().DoubleTy,
- getContext().LongDoubleTy, getContext().Float128Ty,
- getContext().Char8Ty, getContext().Char16Ty,
- getContext().Char32Ty
- };
+ QualType FundamentalTypes[] = {getContext().VoidTy,
+ getContext().NullPtrTy,
+ getContext().BoolTy,
+ getContext().WCharTy,
+ getContext().CharTy,
+ getContext().UnsignedCharTy,
+ getContext().SignedCharTy,
+ getContext().ShortTy,
+ getContext().UnsignedShortTy,
+ getContext().IntTy,
+ getContext().UnsignedIntTy,
+ getContext().LongTy,
+ getContext().UnsignedLongTy,
+ getContext().LongLongTy,
+ getContext().UnsignedLongLongTy,
+ getContext().Int128Ty,
+ getContext().UnsignedInt128Ty,
+ getContext().Int256Ty,
+ getContext().UnsignedInt256Ty,
+ getContext().HalfTy,
+ getContext().FloatTy,
+ getContext().DoubleTy,
+ getContext().LongDoubleTy,
+ getContext().Float128Ty,
+ getContext().Char8Ty,
+ getContext().Char16Ty,
+ getContext().Char32Ty};
llvm::GlobalValue::DLLStorageClassTypes DLLStorageClass =
RD->hasAttr<DLLExportAttr>() || CGM.shouldMapVisibilityToDLLExport(RD)
? llvm::GlobalValue::DLLExportStorageClass
diff --git a/clang/lib/CodeGen/SwiftCallingConv.cpp b/clang/lib/CodeGen/SwiftCallingConv.cpp
index 209654303a82b..718295cdd116d 100644
--- a/clang/lib/CodeGen/SwiftCallingConv.cpp
+++ b/clang/lib/CodeGen/SwiftCallingConv.cpp
@@ -679,6 +679,9 @@ bool swiftcall::isLegalIntegerType(CodeGenModule &CGM,
case 128:
return CGM.getContext().getTargetInfo().hasInt128Type();
+ case 256:
+ return CGM.getContext().getTargetInfo().hasInt256Type();
+
default:
return false;
}
diff --git a/clang/lib/CodeGen/Targets/X86.cpp b/clang/lib/CodeGen/Targets/X86.cpp
index e6203db8bc245..997a03f77baf1 100644
--- a/clang/lib/CodeGen/Targets/X86.cpp
+++ b/clang/lib/CodeGen/Targets/X86.cpp
@@ -1821,6 +1821,10 @@ void X86_64ABIInfo::classify(QualType Ty, uint64_t OffsetBase, Class &Lo,
} else if (k == BuiltinType::Int128 || k == BuiltinType::UInt128) {
Lo = Integer;
Hi = Integer;
+ } else if (k == BuiltinType::Int256 || k == BuiltinType::UInt256) {
+ // Exceeds 2 eightbytes; cannot be classified in registers per SysV ABI.
+ Lo = Memory;
+ Hi = Memory;
} else if (k >= BuiltinType::Bool && k <= BuiltinType::LongLong) {
Current = Integer;
} else if (k == BuiltinType::Float || k == BuiltinType::Double ||
@@ -1926,7 +1930,9 @@ void X86_64ABIInfo::classify(QualType Ty, uint64_t OffsetBase, Class &Lo,
// gcc passes 256 and 512 bit <X x __int128> vectors in memory. :(
if (passInt128VectorsInMem() && Size != 128 &&
(ElementType->isSpecificBuiltinType(BuiltinType::Int128) ||
- ElementType->isSpecificBuiltinType(BuiltinType::UInt128)))
+ ElementType->isSpecificBuiltinType(BuiltinType::UInt128) ||
+ ElementType->isSpecificBuiltinType(BuiltinType::Int256) ||
+ ElementType->isSpecificBuiltinType(BuiltinType::UInt256)))
return;
// Arguments of 256-bits are split into four eightbyte chunks. The
@@ -2186,7 +2192,10 @@ ABIArgInfo X86_64ABIInfo::getIndirectReturnResult(QualType Ty) const {
if (const auto *ED = Ty->getAsEnumDecl())
Ty = ED->getIntegerType();
- if (Ty->isBitIntType())
+ // Types that exceed two eightbytes (128 bits) cannot be returned in
+ // registers per the SysV ABI. Route them through the indirect path
+ // just like _BitInt.
+ if (Ty->isBitIntType() || getContext().getTypeSize(Ty) > 128)
return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace());
return (isPromotableIntegerTypeForABI(Ty) ? ABIArgInfo::getExtend(Ty)
@@ -2205,7 +2214,9 @@ bool X86_64ABIInfo::IsIllegalVectorType(QualType Ty) const {
QualType EltTy = VecTy->getElementType();
if (passInt128VectorsInMem() &&
(EltTy->isSpecificBuiltinType(BuiltinType::Int128) ||
- EltTy->isSpecificBuiltinType(BuiltinType::UInt128)))
+ EltTy->isSpecificBuiltinType(BuiltinType::UInt128) ||
+ EltTy->isSpecificBuiltinType(BuiltinType::Int256) ||
+ EltTy->isSpecificBuiltinType(BuiltinType::UInt256)))
return true;
}
@@ -2222,8 +2233,11 @@ ABIArgInfo X86_64ABIInfo::getIndirectResult(QualType Ty,
// the argument in the free register. This does not seem to happen currently,
// but this code would be much safer if we could mark the argument with
// 'onstack'. See PR12193.
+ // Scalar types that fit in two eightbytes (128 bits) can be passed in
+ // registers naturally. Larger scalar types (e.g. __int256) exceed the
+ // SysV ABI register-passing limit and must go through the indirect path.
if (!isAggregateTypeForABI(Ty) && !IsIllegalVectorType(Ty) &&
- !Ty->isBitIntType()) {
+ !Ty->isBitIntType() && getContext().getTypeSize(Ty) <= 128) {
// Treat an enum type as its underlying type.
if (const auto *ED = Ty->getAsEnumDecl())
Ty = ED->getIntegerType();
@@ -3420,6 +3434,13 @@ ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, unsigned &FreeSSERegs,
return ABIArgInfo::getDirect(llvm::FixedVectorType::get(
llvm::Type::getInt64Ty(getVMContext()), 2));
+ case BuiltinType::Int256:
+ case BuiltinType::UInt256:
+ // > 8 bytes non-float/vector: passed indirectly on Win64.
+ return ABIArgInfo::getIndirect(
+ Align, /*AddrSpace=*/getDataLayout().getAllocaAddrSpace(),
+ /*ByVal=*/false);
+
default:
break;
}
diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp
index 1ccd74314f373..217f78886835f 100644
--- a/clang/lib/Frontend/InitPreprocessor.cpp
+++ b/clang/lib/Frontend/InitPreprocessor.cpp
@@ -1141,6 +1141,8 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
TI.getTypeWidth(TI.getWIntType()), TI, Builder);
if (TI.hasInt128Type())
DefineTypeSizeof("__SIZEOF_INT128__", 128, TI, Builder);
+ if (TI.hasInt256Type())
+ DefineTypeSizeof("__SIZEOF_INT256__", 256, TI, Builder);
DefineType("__INTMAX_TYPE__", TI.getIntMaxType(), Builder);
DefineFmt(LangOpts, "__INTMAX", TI.getIntMaxType(), TI, Builder);
diff --git a/clang/lib/Index/USRGeneration.cpp b/clang/lib/Index/USRGeneration.cpp
index e3649631ac8d3..e810d68830e94 100644
--- a/clang/lib/Index/USRGeneration.cpp
+++ b/clang/lib/Index/USRGeneration.cpp
@@ -714,6 +714,9 @@ void USRGenerator::VisitType(QualType T) {
Out << 'k'; break;
case BuiltinType::UInt128:
Out << 'j'; break;
+ case BuiltinType::UInt256:
+ Out << "@BT@UInt256";
+ break;
case BuiltinType::Char_U:
case BuiltinType::Char_S:
Out << 'C'; break;
@@ -732,6 +735,9 @@ void USRGenerator::VisitType(QualType T) {
Out << 'K'; break;
case BuiltinType::Int128:
Out << 'J'; break;
+ case BuiltinType::Int256:
+ Out << "@BT@Int256";
+ break;
case BuiltinType::Float16:
case BuiltinType::Half:
Out << 'h'; break;
diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp
index d0d006a78274e..f90ca97b54457 100644
--- a/clang/lib/Parse/ParseDecl.cpp
+++ b/clang/lib/Parse/ParseDecl.cpp
@@ -4315,6 +4315,10 @@ void Parser::ParseDeclarationSpecifiers(
isInvalid = DS.SetTypeSpecType(DeclSpec::TST_int128, Loc, PrevSpec,
DiagID, Policy);
break;
+ case tok::kw___int256:
+ isInvalid = DS.SetTypeSpecType(DeclSpec::TST_int256, Loc, PrevSpec,
+ DiagID, Policy);
+ break;
case tok::kw_half:
isInvalid = DS.SetTypeSpecType(DeclSpec::TST_half, Loc, PrevSpec,
DiagID, Policy);
@@ -5534,6 +5538,7 @@ bool Parser::isKnownToBeTypeSpecifier(const Token &Tok) const {
case tok::kw_long:
case tok::kw___int64:
case tok::kw___int128:
+ case tok::kw___int256:
case tok::kw_signed:
case tok::kw_unsigned:
case tok::kw__Complex:
@@ -5618,6 +5623,7 @@ bool Parser::isTypeSpecifierQualifier() {
case tok::kw_long:
case tok::kw___int64:
case tok::kw___int128:
+ case tok::kw___int256:
case tok::kw_signed:
case tok::kw_unsigned:
case tok::kw__Complex:
@@ -5836,6 +5842,7 @@ bool Parser::isDeclarationSpecifier(
case tok::kw_long:
case tok::kw___int64:
case tok::kw___int128:
+ case tok::kw___int256:
case tok::kw_signed:
case tok::kw_unsigned:
case tok::kw__Complex:
diff --git a/clang/lib/Parse/ParseExpr.cpp b/clang/lib/Parse/ParseExpr.cpp
index be6c7824cdbae..17b97d5347794 100644
--- a/clang/lib/Parse/ParseExpr.cpp
+++ b/clang/lib/Parse/ParseExpr.cpp
@@ -1331,6 +1331,7 @@ Parser::ParseCastExpression(CastParseKind ParseKind, bool isAddressOfOperand,
case tok::kw_long:
case tok::kw___int64:
case tok::kw___int128:
+ case tok::kw___int256:
case tok::kw__ExtInt:
case tok::kw__BitInt:
case tok::kw_signed:
diff --git a/clang/lib/Parse/ParseExprCXX.cpp b/clang/lib/Parse/ParseExprCXX.cpp
index b3d50daf66b10..fc0594e2b0638 100644
--- a/clang/lib/Parse/ParseExprCXX.cpp
+++ b/clang/lib/Parse/ParseExprCXX.cpp
@@ -2134,6 +2134,9 @@ void Parser::ParseCXXSimpleTypeSpecifier(DeclSpec &DS) {
case tok::kw___int128:
DS.SetTypeSpecType(DeclSpec::TST_int128, Loc, PrevSpec, DiagID, Policy);
break;
+ case tok::kw___int256:
+ DS.SetTypeSpecType(DeclSpec::TST_int256, Loc, PrevSpec, DiagID, Policy);
+ break;
case tok::kw___bf16:
DS.SetTypeSpecType(DeclSpec::TST_BFloat16, Loc, PrevSpec, DiagID, Policy);
break;
diff --git a/clang/lib/Parse/ParseTentative.cpp b/clang/lib/Parse/ParseTentative.cpp
index 3af20ce66a5d1..41eb17706c614 100644
--- a/clang/lib/Parse/ParseTentative.cpp
+++ b/clang/lib/Parse/ParseTentative.cpp
@@ -1485,6 +1485,7 @@ Parser::isCXXDeclarationSpecifier(ImplicitTypenameContext AllowImplicitTypename,
case tok::kw_long:
case tok::kw___int64:
case tok::kw___int128:
+ case tok::kw___int256:
case tok::kw_signed:
case tok::kw_unsigned:
case tok::kw_half:
@@ -1613,6 +1614,7 @@ bool Parser::isCXXDeclarationSpecifierAType() {
case tok::kw_long:
case tok::kw___int64:
case tok::kw___int128:
+ case tok::kw___int256:
case tok::kw_signed:
case tok::kw_unsigned:
case tok::kw_half:
diff --git a/clang/lib/Sema/DeclSpec.cpp b/clang/lib/Sema/DeclSpec.cpp
index 479a959e0aadc..22badb8739ca6 100644
--- a/clang/lib/Sema/DeclSpec.cpp
+++ b/clang/lib/Sema/DeclSpec.cpp
@@ -336,6 +336,7 @@ bool Declarator::isDeclarationOfFunction() const {
case TST_half:
case TST_int:
case TST_int128:
+ case TST_int256:
case TST_bitint:
case TST_struct:
case TST_interface:
@@ -541,6 +542,8 @@ const char *DeclSpec::getSpecifierName(DeclSpec::TST T,
case DeclSpec::TST_char32: return "char32_t";
case DeclSpec::TST_int: return "int";
case DeclSpec::TST_int128: return "__int128";
+ case DeclSpec::TST_int256:
+ return "__int256";
case DeclSpec::TST_bitint: return "_BitInt";
case DeclSpec::TST_half: return "half";
case DeclSpec::TST_float: return "float";
@@ -1300,8 +1303,9 @@ void DeclSpec::Finish(Sema &S, const PrintingPolicy &Policy) {
if (TypeSpecType == TST_unspecified)
TypeSpecType = TST_int; // unsigned -> unsigned int, signed -> signed int.
else if (TypeSpecType != TST_int && TypeSpecType != TST_int128 &&
- TypeSpecType != TST_char && TypeSpecType != TST_wchar &&
- !IsFixedPointType && TypeSpecType != TST_bitint) {
+ TypeSpecType != TST_int256 && TypeSpecType != TST_char &&
+ TypeSpecType != TST_wchar && !IsFixedPointType &&
+ TypeSpecType != TST_bitint) {
S.Diag(TSSLoc, diag::err_invalid_sign_spec)
<< getSpecifierName((TST)TypeSpecType, Policy);
// signed double -> double.
diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp
index 3065b5e1e66d3..7f69d2f975360 100644
--- a/clang/lib/Sema/Sema.cpp
+++ b/clang/lib/Sema/Sema.cpp
@@ -406,6 +406,18 @@ void Sema::Initialize() {
PushOnScopeChains(Context.getUInt128Decl(), TUScope);
}
+ // Initialize predefined 256-bit integer types, if needed.
+ if (Context.getTargetInfo().hasInt256Type() ||
+ (Context.getAuxTargetInfo() &&
+ Context.getAuxTargetInfo()->hasInt256Type())) {
+ DeclarationName Int256 = &Context.Idents.get("__int256_t");
+ if (IdResolver.begin(Int256) == IdResolver.end())
+ PushOnScopeChains(Context.getInt256Decl(), TUScope);
+
+ DeclarationName UInt256 = &Context.Idents.get("__uint256_t");
+ if (IdResolver.begin(UInt256) == IdResolver.end())
+ PushOnScopeChains(Context.getUInt256Decl(), TUScope);
+ }
// Initialize predefined Objective-C types:
if (getLangOpts().ObjC) {
@@ -2206,6 +2218,8 @@ void Sema::checkTypeSupport(QualType Ty, SourceLocation Loc, ValueDecl *D) {
(Ty->isIbm128Type() && !Context.getTargetInfo().hasIbm128Type()) ||
(Ty->isIntegerType() && Context.getTypeSize(Ty) == 128 &&
!Context.getTargetInfo().hasInt128Type()) ||
+ (Ty->isIntegerType() && Context.getTypeSize(Ty) == 256 &&
+ !Context.getTargetInfo().hasInt256Type()) ||
(Ty->isBFloat16Type() && !Context.getTargetInfo().hasBFloat16Type() &&
!LangOpts.CUDAIsDevice) ||
LongDoubleMismatched) {
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index a49e3883a35a5..3498882d4240f 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -419,11 +419,11 @@ static bool BuiltinOverflow(Sema &S, CallExpr *TheCall, unsigned BuiltinID) {
// bool, a bit-precise type, or an enumeration type.
if (const auto *BT = QT.getCanonicalType()->getAs<BuiltinType>())
return (BT->getKind() >= BuiltinType::Short &&
- BT->getKind() <= BuiltinType::Int128) || (
- BT->getKind() >= BuiltinType::UShort &&
- BT->getKind() <= BuiltinType::UInt128) ||
- BT->getKind() == BuiltinType::UChar ||
- BT->getKind() == BuiltinType::SChar;
+ BT->getKind() <= BuiltinType::Int256) ||
+ (BT->getKind() >= BuiltinType::UShort &&
+ BT->getKind() <= BuiltinType::UInt256) ||
+ BT->getKind() == BuiltinType::UChar ||
+ BT->getKind() == BuiltinType::SChar;
return false;
};
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index e5c4c59e9ffbb..8eeee8e004a19 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -9480,6 +9480,10 @@ class BuiltinOperatorOverloadBuilder {
(S.Context.getAuxTargetInfo() &&
S.Context.getAuxTargetInfo()->hasInt128Type()))
ArithmeticTypes.push_back(S.Context.Int128Ty);
+ if (S.Context.getTargetInfo().hasInt256Type() ||
+ (S.Context.getAuxTargetInfo() &&
+ S.Context.getAuxTargetInfo()->hasInt256Type()))
+ ArithmeticTypes.push_back(S.Context.Int256Ty);
ArithmeticTypes.push_back(S.Context.UnsignedIntTy);
ArithmeticTypes.push_back(S.Context.UnsignedLongTy);
ArithmeticTypes.push_back(S.Context.UnsignedLongLongTy);
@@ -9487,6 +9491,10 @@ class BuiltinOperatorOverloadBuilder {
(S.Context.getAuxTargetInfo() &&
S.Context.getAuxTargetInfo()->hasInt128Type()))
ArithmeticTypes.push_back(S.Context.UnsignedInt128Ty);
+ if (S.Context.getTargetInfo().hasInt256Type() ||
+ (S.Context.getAuxTargetInfo() &&
+ S.Context.getAuxTargetInfo()->hasInt256Type()))
+ ArithmeticTypes.push_back(S.Context.UnsignedInt256Ty);
/// We add candidates for the unique, unqualified _BitInt types present in
/// the candidate type set. The candidate set already handled ensuring the
diff --git a/clang/lib/Sema/SemaTemplateVariadic.cpp b/clang/lib/Sema/SemaTemplateVariadic.cpp
index 5b1aad3fa8470..1fc60f08f6a21 100644
--- a/clang/lib/Sema/SemaTemplateVariadic.cpp
+++ b/clang/lib/Sema/SemaTemplateVariadic.cpp
@@ -1167,6 +1167,7 @@ bool Sema::containsUnexpandedParameterPacks(Declarator &D) {
case TST_char32:
case TST_int:
case TST_int128:
+ case TST_int256:
case TST_half:
case TST_float:
case TST_double:
diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
index c082dd85f345f..036f9151497f5 100644
--- a/clang/lib/Sema/SemaType.cpp
+++ b/clang/lib/Sema/SemaType.cpp
@@ -1148,6 +1148,15 @@ static QualType ConvertDeclSpecToType(TypeProcessingState &state) {
else
Result = Context.Int128Ty;
break;
+ case DeclSpec::TST_int256:
+ if (!S.Context.getTargetInfo().hasInt256Type() &&
+ !(S.getLangOpts().isTargetDevice()))
+ S.Diag(DS.getTypeSpecTypeLoc(), diag::err_type_unsupported) << "__int256";
+ if (DS.getTypeSpecSign() == TypeSpecifierSign::Unsigned)
+ Result = Context.UnsignedInt256Ty;
+ else
+ Result = Context.Int256Ty;
+ break;
case DeclSpec::TST_float16:
// CUDA host and device may have different _Float16 support, therefore
// do not diagnose _Float16 usage to avoid false alarm.
@@ -10201,18 +10210,21 @@ static QualType ChangeIntegralSignedness(Sema &S, QualType BaseType,
}
bool Int128Unsupported = !S.Context.getTargetInfo().hasInt128Type();
- std::array<CanQualType *, 6> AllSignedIntegers = {
+ bool Int256Unsupported = !S.Context.getTargetInfo().hasInt256Type();
+ unsigned IntSkip = Int128Unsupported ? 2 : Int256Unsupported ? 1 : 0;
+ std::array<CanQualType *, 7> AllSignedIntegers = {
&S.Context.SignedCharTy, &S.Context.ShortTy, &S.Context.IntTy,
- &S.Context.LongTy, &S.Context.LongLongTy, &S.Context.Int128Ty};
+ &S.Context.LongTy, &S.Context.LongLongTy, &S.Context.Int128Ty,
+ &S.Context.Int256Ty};
ArrayRef<CanQualType *> AvailableSignedIntegers(
- AllSignedIntegers.data(), AllSignedIntegers.size() - Int128Unsupported);
- std::array<CanQualType *, 6> AllUnsignedIntegers = {
+ AllSignedIntegers.data(), AllSignedIntegers.size() - IntSkip);
+ std::array<CanQualType *, 7> AllUnsignedIntegers = {
&S.Context.UnsignedCharTy, &S.Context.UnsignedShortTy,
&S.Context.UnsignedIntTy, &S.Context.UnsignedLongTy,
- &S.Context.UnsignedLongLongTy, &S.Context.UnsignedInt128Ty};
- ArrayRef<CanQualType *> AvailableUnsignedIntegers(AllUnsignedIntegers.data(),
- AllUnsignedIntegers.size() -
- Int128Unsupported);
+ &S.Context.UnsignedLongLongTy, &S.Context.UnsignedInt128Ty,
+ &S.Context.UnsignedInt256Ty};
+ ArrayRef<CanQualType *> AvailableUnsignedIntegers(
+ AllUnsignedIntegers.data(), AllUnsignedIntegers.size() - IntSkip);
ArrayRef<CanQualType *> *Consider =
IsMakeSigned ? &AvailableSignedIntegers : &AvailableUnsignedIntegers;
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index a416c73c458b2..dbab14eca9b92 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -17510,10 +17510,10 @@ QualType TreeTransform<Derived>::RebuildArrayType(
getDerived().getBaseEntity());
QualType Types[] = {
- SemaRef.Context.UnsignedCharTy, SemaRef.Context.UnsignedShortTy,
- SemaRef.Context.UnsignedIntTy, SemaRef.Context.UnsignedLongTy,
- SemaRef.Context.UnsignedLongLongTy, SemaRef.Context.UnsignedInt128Ty
- };
+ SemaRef.Context.UnsignedCharTy, SemaRef.Context.UnsignedShortTy,
+ SemaRef.Context.UnsignedIntTy, SemaRef.Context.UnsignedLongTy,
+ SemaRef.Context.UnsignedLongLongTy, SemaRef.Context.UnsignedInt128Ty,
+ SemaRef.Context.UnsignedInt256Ty};
QualType SizeType;
for (const auto &T : Types)
if (Size->getBitWidth() == SemaRef.Context.getIntWidth(T)) {
diff --git a/clang/lib/Serialization/ASTCommon.cpp b/clang/lib/Serialization/ASTCommon.cpp
index 69db02f2efc40..b91a8547fa2f8 100644
--- a/clang/lib/Serialization/ASTCommon.cpp
+++ b/clang/lib/Serialization/ASTCommon.cpp
@@ -53,6 +53,9 @@ serialization::TypeIdxFromBuiltin(const BuiltinType *BT) {
case BuiltinType::UInt128:
ID = PREDEF_TYPE_UINT128_ID;
break;
+ case BuiltinType::UInt256:
+ ID = PREDEF_TYPE_UINT256_ID;
+ break;
case BuiltinType::Char_S:
ID = PREDEF_TYPE_CHAR_S_ID;
break;
@@ -78,6 +81,9 @@ serialization::TypeIdxFromBuiltin(const BuiltinType *BT) {
case BuiltinType::Int128:
ID = PREDEF_TYPE_INT128_ID;
break;
+ case BuiltinType::Int256:
+ ID = PREDEF_TYPE_INT256_ID;
+ break;
case BuiltinType::Half:
ID = PREDEF_TYPE_HALF_ID;
break;
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index bde000234a062..c4c173fb34d3a 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -7771,6 +7771,9 @@ QualType ASTReader::GetType(TypeID ID) {
case PREDEF_TYPE_UINT128_ID:
T = Context.UnsignedInt128Ty;
break;
+ case PREDEF_TYPE_UINT256_ID:
+ T = Context.UnsignedInt256Ty;
+ break;
case PREDEF_TYPE_SCHAR_ID:
T = Context.SignedCharTy;
break;
@@ -7792,6 +7795,9 @@ QualType ASTReader::GetType(TypeID ID) {
case PREDEF_TYPE_INT128_ID:
T = Context.Int128Ty;
break;
+ case PREDEF_TYPE_INT256_ID:
+ T = Context.Int256Ty;
+ break;
case PREDEF_TYPE_BFLOAT16_ID:
T = Context.BFloat16Ty;
break;
@@ -8360,6 +8366,18 @@ Decl *ASTReader::getPredefinedDecl(PredefinedDeclIDs ID) {
NewLoaded = Context.getUInt128Decl();
break;
+ case PREDEF_DECL_INT_256_ID:
+ if (Context.Int256Decl)
+ return Context.Int256Decl;
+ NewLoaded = Context.getInt256Decl();
+ break;
+
+ case PREDEF_DECL_UNSIGNED_INT_256_ID:
+ if (Context.UInt256Decl)
+ return Context.UInt256Decl;
+ NewLoaded = Context.getUInt256Decl();
+ break;
+
case PREDEF_DECL_OBJC_INSTANCETYPE_ID:
if (Context.ObjCInstanceTypeDecl)
return Context.ObjCInstanceTypeDecl;
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index af46f84d5aac0..dfba5b62ebdd9 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -5641,6 +5641,8 @@ void ASTWriter::PrepareWritingSpecialDecls(Sema &SemaRef) {
PREDEF_DECL_OBJC_PROTOCOL_ID);
RegisterPredefDecl(Context.Int128Decl, PREDEF_DECL_INT_128_ID);
RegisterPredefDecl(Context.UInt128Decl, PREDEF_DECL_UNSIGNED_INT_128_ID);
+ RegisterPredefDecl(Context.Int256Decl, PREDEF_DECL_INT_256_ID);
+ RegisterPredefDecl(Context.UInt256Decl, PREDEF_DECL_UNSIGNED_INT_256_ID);
RegisterPredefDecl(Context.ObjCInstanceTypeDecl,
PREDEF_DECL_OBJC_INSTANCETYPE_ID);
RegisterPredefDecl(Context.BuiltinVaListDecl, PREDEF_DECL_BUILTIN_VA_LIST_ID);
>From 7503a37c8b58f2527926c01c56816d49ebe67884 Mon Sep 17 00:00:00 2001
From: Xavier Roche <xavier.roche at algolia.com>
Date: Tue, 24 Feb 2026 21:38:36 +0100
Subject: [PATCH 03/17] [clang][test] Add __int256/__uint256 clang tests
Comprehensive test coverage for the __int256 builtin type:
- AST: constant interpreter, JSON dump updates for new predefined type ID
- CodeGen: AArch64/X86 argument passing, data layout, debug info, float
conversions, overflow builtins, varargs, mangling (Itanium/MSVC)
- Sema: type acceptance on 64-bit targets, atomic/bitfield constraints,
constant evaluation, struct layout, tautological comparisons,
templates, type traits, overload resolution
- CUDA: device-side acceptance
- SYCL: spir64 rejection
- Preprocessor: __INT256_MAX__ and related macros
Assisted-by: Claude (Anthropic)
Co-Authored-By: Claude Opus 4.6 <noreply at anthropic.com>
---
clang/test/AST/ByteCode/int256.cpp | 201 +++++++++++++++
clang/test/AST/ast-dump-default-arg-json.cpp | 46 ++++
clang/test/AST/ast-dump-default-init-json.cpp | 46 ++++
clang/test/AST/ast-dump-file-line-json.c | 46 ++++
clang/test/AST/ast-dump-lambda-json.cpp | 46 ++++
.../test/AST/ast-dump-template-decls-json.cpp | 46 ++++
...dump-template-json-win32-mangler-crash.cpp | 46 ++++
clang/test/AST/ast-dump-templates.cpp | 46 ++++
clang/test/CXX/drs/cwg4xx.cpp | 2 +
.../AArch64/aarch64-arguments-int256.c | 39 +++
.../CodeGen/AArch64/aarch64-int256-args.c | 30 +++
clang/test/CodeGen/X86/win64-int256.c | 22 ++
clang/test/CodeGen/X86/x86_64-PR42672.c | 4 +-
.../CodeGen/X86/x86_64-arguments-int256.c | 39 +++
clang/test/CodeGen/X86/x86_64-atomic-i256.c | 32 +++
clang/test/CodeGen/debug-info-int256.c | 12 +
clang/test/CodeGen/float-conv-int256.c | 63 +++++
clang/test/CodeGen/int256-func-ptr.c | 34 +++
clang/test/CodeGen/int256-globals.c | 39 +++
clang/test/CodeGen/overflow-builtins-int256.c | 59 +++++
clang/test/CodeGen/uint256_t.c | 239 ++++++++++++++++++
clang/test/CodeGen/varargs-int256.c | 67 +++++
clang/test/CodeGenCXX/mangle-int256.cpp | 32 +++
clang/test/Modules/decl-params-determinisim.m | 16 +-
clang/test/Preprocessor/init-aarch64.c | 1 +
clang/test/Preprocessor/init.c | 1 +
clang/test/Sema/256bitint.c | 72 ++++++
clang/test/Sema/atomic-builtins-int256.c | 29 +++
clang/test/Sema/atomic-int256.c | 26 ++
clang/test/Sema/bitfield-int256.c | 42 +++
clang/test/Sema/const-eval.c | 5 +
clang/test/Sema/constant-builtins-2.c | 15 ++
clang/test/Sema/enum.c | 4 +-
clang/test/Sema/struct-layout-int256.c | 70 +++++
.../test/Sema/tautological-constant-compare.c | 5 +
clang/test/Sema/types.c | 25 ++
clang/test/SemaCUDA/int256.cu | 30 +++
clang/test/SemaCXX/deleted-operator.cpp | 4 +-
clang/test/SemaCXX/int256-templates.cpp | 219 ++++++++++++++++
clang/test/SemaCXX/int256-type-traits.cpp | 74 ++++++
.../SemaCXX/overloaded-builtin-operators.cpp | 4 +-
clang/test/SemaSYCL/int256.cpp | 74 ++++++
42 files changed, 1938 insertions(+), 14 deletions(-)
create mode 100644 clang/test/AST/ByteCode/int256.cpp
create mode 100644 clang/test/CodeGen/AArch64/aarch64-arguments-int256.c
create mode 100644 clang/test/CodeGen/AArch64/aarch64-int256-args.c
create mode 100644 clang/test/CodeGen/X86/win64-int256.c
create mode 100644 clang/test/CodeGen/X86/x86_64-arguments-int256.c
create mode 100644 clang/test/CodeGen/X86/x86_64-atomic-i256.c
create mode 100644 clang/test/CodeGen/debug-info-int256.c
create mode 100644 clang/test/CodeGen/float-conv-int256.c
create mode 100644 clang/test/CodeGen/int256-func-ptr.c
create mode 100644 clang/test/CodeGen/int256-globals.c
create mode 100644 clang/test/CodeGen/overflow-builtins-int256.c
create mode 100644 clang/test/CodeGen/uint256_t.c
create mode 100644 clang/test/CodeGen/varargs-int256.c
create mode 100644 clang/test/CodeGenCXX/mangle-int256.cpp
create mode 100644 clang/test/Sema/256bitint.c
create mode 100644 clang/test/Sema/atomic-builtins-int256.c
create mode 100644 clang/test/Sema/atomic-int256.c
create mode 100644 clang/test/Sema/bitfield-int256.c
create mode 100644 clang/test/Sema/struct-layout-int256.c
create mode 100644 clang/test/SemaCUDA/int256.cu
create mode 100644 clang/test/SemaCXX/int256-templates.cpp
create mode 100644 clang/test/SemaCXX/int256-type-traits.cpp
create mode 100644 clang/test/SemaSYCL/int256.cpp
diff --git a/clang/test/AST/ByteCode/int256.cpp b/clang/test/AST/ByteCode/int256.cpp
new file mode 100644
index 0000000000000..7ec7901a04d3f
--- /dev/null
+++ b/clang/test/AST/ByteCode/int256.cpp
@@ -0,0 +1,201 @@
+// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -triple x86_64-unknown-linux-gnu -std=c++20 -verify=expected,both %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++20 -verify=ref,both %s
+
+// Constexpr evaluation tests for __int256_t / __uint256_t.
+
+namespace Arithmetic {
+ constexpr __int256_t a = 100;
+ constexpr __int256_t b = 7;
+ static_assert(a + b == 107, "");
+ static_assert(a - b == 93, "");
+ static_assert(a * b == 700, "");
+ static_assert(a / b == 14, "");
+ static_assert(a % b == 2, "");
+
+ constexpr __int256_t product = 12345 * 67890;
+ static_assert(product == 838102050, "");
+}
+
+namespace Bitwise {
+ constexpr __uint256_t x = 0xFF00FF;
+ constexpr __uint256_t y = 0x0F0F0F;
+ static_assert((x & y) == 0x0F000F, "");
+ static_assert((x | y) == 0xFF0FFF, "");
+ static_assert((x ^ y) == 0xF00FF0, "");
+ static_assert(~(__uint256_t)0 != 0, "");
+}
+
+namespace Shifts {
+ constexpr __int256_t one = 1;
+ static_assert((one << 0) == 1, "");
+ static_assert((one << 1) == 2, "");
+ static_assert((one << 64) != 0, "");
+ static_assert((one << 128) != 0, "");
+ static_assert((one << 255) != 0, "");
+ static_assert(((__uint256_t)one << 255) >> 255 == 1, "");
+
+ constexpr __uint256_t large = (__uint256_t)1 << 200;
+ static_assert(large != 0, "");
+ static_assert(large >> 200 == 1, "");
+}
+
+namespace Comparisons {
+ constexpr __int256_t a = 100;
+ constexpr __int256_t b = 7;
+ static_assert(a > b, "");
+ static_assert(b < a, "");
+ static_assert(a >= 100, "");
+ static_assert(b <= 7, "");
+ static_assert(a != b, "");
+ static_assert(a == 100, "");
+}
+
+namespace Conversions {
+ constexpr __int128_t i128 = 42;
+ constexpr __int256_t from128 = i128;
+ static_assert(from128 == 42, "");
+ constexpr __int128_t to128 = (__int128_t)from128;
+ static_assert(to128 == 42, "");
+
+ constexpr long long ll = 99;
+ constexpr __int256_t fromll = ll;
+ static_assert(fromll == 99, "");
+}
+
+namespace UnaryOps {
+ constexpr __int256_t a = 100;
+ constexpr __int256_t neg = -a;
+ static_assert(neg == -100, "");
+ static_assert(-neg == 100, "");
+}
+
+namespace Wrapping {
+ constexpr __uint256_t zero = 0;
+ constexpr __uint256_t wrap = zero - 1;
+ static_assert(wrap + 1 == 0, "");
+}
+
+namespace DivByZero {
+ constexpr __int256_t divzero = __int256_t{1} / __int256_t{0}; // both-error {{must be initialized by a constant expression}} \
+ // both-note {{division by zero}}
+ constexpr __int256_t remzero = __int256_t{1} % __int256_t{0}; // both-error {{must be initialized by a constant expression}} \
+ // both-note {{division by zero}}
+}
+
+namespace BoundaryConstants {
+ // UINT256_MAX = 2^256 - 1 = ((__uint256_t)1 << 255) | (((__uint256_t)1 << 255) - 1)
+ constexpr __uint256_t UINT256_MAX = ~(__uint256_t)0;
+ static_assert(UINT256_MAX != 0, "");
+ static_assert(UINT256_MAX + 1 == 0, ""); // wraps to zero
+ static_assert((UINT256_MAX >> 255) == 1, "");
+
+ // INT256_MAX = 2^255 - 1 (sign bit clear, all other bits set)
+ constexpr __int256_t INT256_MAX = (__int256_t)(UINT256_MAX >> 1);
+ static_assert(INT256_MAX > 0, "");
+ constexpr __uint256_t check_max = (__uint256_t)INT256_MAX;
+ static_assert((check_max >> 254) == 1, ""); // bit 254 set
+
+ // INT256_MIN = -2^255 (sign bit set, all other bits clear)
+ constexpr __int256_t INT256_MIN = -INT256_MAX - 1;
+ static_assert(INT256_MIN < 0, "");
+ static_assert(INT256_MIN + INT256_MAX == -1, "");
+
+ // Full-width values using all 256 bits
+ constexpr __uint256_t all_ones = ~(__uint256_t)0;
+ constexpr __uint256_t alternating = all_ones / 3; // 0x5555...
+ static_assert(alternating != 0, "");
+ static_assert((alternating & (alternating << 1)) == 0, ""); // no adjacent bits
+}
+
+namespace OverflowDetection {
+ // Signed overflow in constexpr is undefined behavior -- not a constant expression
+ constexpr __int256_t INT256_MAX = (__int256_t)(~(__uint256_t)0 >> 1);
+ constexpr __int256_t overflow_add = INT256_MAX + 1; // both-error {{must be initialized by a constant expression}} \
+ // both-note {{value 57896044618658097711785492504343953926634992332820282019728792003956564819968 is outside the range of representable values}}
+}
+
+namespace MoreConversions {
+ // Bool conversions
+ constexpr bool from_zero = (__int256_t)0;
+ static_assert(!from_zero, "");
+ constexpr bool from_one = (__int256_t)1;
+ static_assert(from_one, "");
+ constexpr bool from_neg = (__int256_t)-1;
+ static_assert(from_neg, "");
+
+ // Char conversions
+ constexpr char c = 'A';
+ constexpr __int256_t from_char = c;
+ static_assert(from_char == 65, "");
+ constexpr char to_char = (char)from_char;
+ static_assert(to_char == 'A', "");
+
+ // Int conversions
+ constexpr int i = 42;
+ constexpr __int256_t from_int = i;
+ static_assert(from_int == 42, "");
+ constexpr int to_int = (int)from_int;
+ static_assert(to_int == 42, "");
+
+ // Long conversions
+ constexpr long l = 1000000L;
+ constexpr __int256_t from_long = l;
+ static_assert(from_long == 1000000, "");
+
+ // __int256 <-> __int128 round-trip with negative
+ constexpr __int128_t neg128 = -42;
+ constexpr __int256_t from_neg128 = neg128;
+ static_assert(from_neg128 == -42, "");
+ constexpr __int128_t to_neg128 = (__int128_t)from_neg128;
+ static_assert(to_neg128 == -42, "");
+}
+
+namespace CompoundAssignment {
+ constexpr __int256_t test_compound() {
+ __int256_t x = 100;
+ x += 50; // 150
+ x -= 30; // 120
+ x *= 2; // 240
+ x /= 3; // 80
+ x %= 7; // 3
+ x <<= 4; // 48
+ x >>= 2; // 12
+ x &= 0xFF; // 12
+ x |= 0x100;// 268
+ x ^= 0xF; // 259
+ return x;
+ }
+ static_assert(test_compound() == 259, "");
+}
+
+namespace IncrementDecrement {
+ constexpr __int256_t test_inc_dec() {
+ __int256_t x = 0;
+ ++x; // 1
+ x++; // 2
+ --x; // 1
+ x--; // 0
+ return x;
+ }
+ static_assert(test_inc_dec() == 0, "");
+
+ // Unsigned wrapping with decrement
+ constexpr __uint256_t test_wrap_dec() {
+ __uint256_t x = 0;
+ --x; // wraps to UINT256_MAX
+ ++x; // wraps back to 0
+ return x;
+ }
+ static_assert(test_wrap_dec() == 0, "");
+}
+
+namespace ConstexprFunc {
+ constexpr __int256_t factorial(__int256_t n) {
+ __int256_t result = 1;
+ for (__int256_t i = 2; i <= n; ++i)
+ result *= i;
+ return result;
+ }
+ static_assert(factorial(10) == 3628800, "");
+ static_assert(factorial(20) == 2432902008176640000LL, "");
+}
diff --git a/clang/test/AST/ast-dump-default-arg-json.cpp b/clang/test/AST/ast-dump-default-arg-json.cpp
index b6a138934caf9..b34b1cbafc924 100644
--- a/clang/test/AST/ast-dump-default-arg-json.cpp
+++ b/clang/test/AST/ast-dump-default-arg-json.cpp
@@ -80,6 +80,52 @@ void test() {
// CHECK-NEXT: "end": {}
// CHECK-NEXT: },
// CHECK-NEXT: "isImplicit": true,
+// CHECK-NEXT: "name": "__int256_t",
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "__int256"
+// CHECK-NEXT: },
+// CHECK-NEXT: "inner": [
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "BuiltinType",
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "__int256"
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+// CHECK-NEXT: ]
+// CHECK-NEXT: },
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "TypedefDecl",
+// CHECK-NEXT: "loc": {},
+// CHECK-NEXT: "range": {
+// CHECK-NEXT: "begin": {},
+// CHECK-NEXT: "end": {}
+// CHECK-NEXT: },
+// CHECK-NEXT: "isImplicit": true,
+// CHECK-NEXT: "name": "__uint256_t",
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "unsigned __int256"
+// CHECK-NEXT: },
+// CHECK-NEXT: "inner": [
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "BuiltinType",
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "unsigned __int256"
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+// CHECK-NEXT: ]
+// CHECK-NEXT: },
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "TypedefDecl",
+// CHECK-NEXT: "loc": {},
+// CHECK-NEXT: "range": {
+// CHECK-NEXT: "begin": {},
+// CHECK-NEXT: "end": {}
+// CHECK-NEXT: },
+// CHECK-NEXT: "isImplicit": true,
// CHECK-NEXT: "name": "__NSConstantString",
// CHECK-NEXT: "type": {
// CHECK-NEXT: "qualType": "__NSConstantString_tag"
diff --git a/clang/test/AST/ast-dump-default-init-json.cpp b/clang/test/AST/ast-dump-default-init-json.cpp
index f4949a9c9eedf..50d1100ba11c6 100644
--- a/clang/test/AST/ast-dump-default-init-json.cpp
+++ b/clang/test/AST/ast-dump-default-init-json.cpp
@@ -78,6 +78,52 @@ void test() {
// CHECK-NEXT: "end": {}
// CHECK-NEXT: },
// CHECK-NEXT: "isImplicit": true,
+// CHECK-NEXT: "name": "__int256_t",
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "__int256"
+// CHECK-NEXT: },
+// CHECK-NEXT: "inner": [
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "BuiltinType",
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "__int256"
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+// CHECK-NEXT: ]
+// CHECK-NEXT: },
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "TypedefDecl",
+// CHECK-NEXT: "loc": {},
+// CHECK-NEXT: "range": {
+// CHECK-NEXT: "begin": {},
+// CHECK-NEXT: "end": {}
+// CHECK-NEXT: },
+// CHECK-NEXT: "isImplicit": true,
+// CHECK-NEXT: "name": "__uint256_t",
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "unsigned __int256"
+// CHECK-NEXT: },
+// CHECK-NEXT: "inner": [
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "BuiltinType",
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "unsigned __int256"
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+// CHECK-NEXT: ]
+// CHECK-NEXT: },
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "TypedefDecl",
+// CHECK-NEXT: "loc": {},
+// CHECK-NEXT: "range": {
+// CHECK-NEXT: "begin": {},
+// CHECK-NEXT: "end": {}
+// CHECK-NEXT: },
+// CHECK-NEXT: "isImplicit": true,
// CHECK-NEXT: "name": "__NSConstantString",
// CHECK-NEXT: "type": {
// CHECK-NEXT: "qualType": "__NSConstantString_tag"
diff --git a/clang/test/AST/ast-dump-file-line-json.c b/clang/test/AST/ast-dump-file-line-json.c
index da1c8dbd755d5..1b5e8de80df79 100644
--- a/clang/test/AST/ast-dump-file-line-json.c
+++ b/clang/test/AST/ast-dump-file-line-json.c
@@ -76,6 +76,52 @@ int e;
// CHECK-NEXT: "end": {}
// CHECK-NEXT: },
// CHECK-NEXT: "isImplicit": true,
+// CHECK-NEXT: "name": "__int256_t",
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "__int256"
+// CHECK-NEXT: },
+// CHECK-NEXT: "inner": [
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "BuiltinType",
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "__int256"
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+// CHECK-NEXT: ]
+// CHECK-NEXT: },
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "TypedefDecl",
+// CHECK-NEXT: "loc": {},
+// CHECK-NEXT: "range": {
+// CHECK-NEXT: "begin": {},
+// CHECK-NEXT: "end": {}
+// CHECK-NEXT: },
+// CHECK-NEXT: "isImplicit": true,
+// CHECK-NEXT: "name": "__uint256_t",
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "unsigned __int256"
+// CHECK-NEXT: },
+// CHECK-NEXT: "inner": [
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "BuiltinType",
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "unsigned __int256"
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+// CHECK-NEXT: ]
+// CHECK-NEXT: },
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "TypedefDecl",
+// CHECK-NEXT: "loc": {},
+// CHECK-NEXT: "range": {
+// CHECK-NEXT: "begin": {},
+// CHECK-NEXT: "end": {}
+// CHECK-NEXT: },
+// CHECK-NEXT: "isImplicit": true,
// CHECK-NEXT: "name": "__NSConstantString",
// CHECK-NEXT: "type": {
// CHECK-NEXT: "qualType": "struct __NSConstantString_tag"
diff --git a/clang/test/AST/ast-dump-lambda-json.cpp b/clang/test/AST/ast-dump-lambda-json.cpp
index fc28cc8164e17..3616426aa6f8a 100644
--- a/clang/test/AST/ast-dump-lambda-json.cpp
+++ b/clang/test/AST/ast-dump-lambda-json.cpp
@@ -87,6 +87,52 @@ void Test() {
// CHECK-NEXT: "end": {}
// CHECK-NEXT: },
// CHECK-NEXT: "isImplicit": true,
+// CHECK-NEXT: "name": "__int256_t",
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "__int256"
+// CHECK-NEXT: },
+// CHECK-NEXT: "inner": [
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "BuiltinType",
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "__int256"
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+// CHECK-NEXT: ]
+// CHECK-NEXT: },
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "TypedefDecl",
+// CHECK-NEXT: "loc": {},
+// CHECK-NEXT: "range": {
+// CHECK-NEXT: "begin": {},
+// CHECK-NEXT: "end": {}
+// CHECK-NEXT: },
+// CHECK-NEXT: "isImplicit": true,
+// CHECK-NEXT: "name": "__uint256_t",
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "unsigned __int256"
+// CHECK-NEXT: },
+// CHECK-NEXT: "inner": [
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "BuiltinType",
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "unsigned __int256"
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+// CHECK-NEXT: ]
+// CHECK-NEXT: },
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "TypedefDecl",
+// CHECK-NEXT: "loc": {},
+// CHECK-NEXT: "range": {
+// CHECK-NEXT: "begin": {},
+// CHECK-NEXT: "end": {}
+// CHECK-NEXT: },
+// CHECK-NEXT: "isImplicit": true,
// CHECK-NEXT: "name": "__NSConstantString",
// CHECK-NEXT: "type": {
// CHECK-NEXT: "qualType": "__NSConstantString_tag"
diff --git a/clang/test/AST/ast-dump-template-decls-json.cpp b/clang/test/AST/ast-dump-template-decls-json.cpp
index 70f1d3b55f3ee..0e4e6b3ab4544 100644
--- a/clang/test/AST/ast-dump-template-decls-json.cpp
+++ b/clang/test/AST/ast-dump-template-decls-json.cpp
@@ -128,6 +128,52 @@ W(int)->W<1>;
// CHECK-NEXT: "end": {}
// CHECK-NEXT: },
// CHECK-NEXT: "isImplicit": true,
+// CHECK-NEXT: "name": "__int256_t",
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "__int256"
+// CHECK-NEXT: },
+// CHECK-NEXT: "inner": [
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "BuiltinType",
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "__int256"
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+// CHECK-NEXT: ]
+// CHECK-NEXT: },
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "TypedefDecl",
+// CHECK-NEXT: "loc": {},
+// CHECK-NEXT: "range": {
+// CHECK-NEXT: "begin": {},
+// CHECK-NEXT: "end": {}
+// CHECK-NEXT: },
+// CHECK-NEXT: "isImplicit": true,
+// CHECK-NEXT: "name": "__uint256_t",
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "unsigned __int256"
+// CHECK-NEXT: },
+// CHECK-NEXT: "inner": [
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "BuiltinType",
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "unsigned __int256"
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+// CHECK-NEXT: ]
+// CHECK-NEXT: },
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "TypedefDecl",
+// CHECK-NEXT: "loc": {},
+// CHECK-NEXT: "range": {
+// CHECK-NEXT: "begin": {},
+// CHECK-NEXT: "end": {}
+// CHECK-NEXT: },
+// CHECK-NEXT: "isImplicit": true,
// CHECK-NEXT: "name": "__NSConstantString",
// CHECK-NEXT: "type": {
// CHECK-NEXT: "qualType": "__NSConstantString_tag"
diff --git a/clang/test/AST/ast-dump-template-json-win32-mangler-crash.cpp b/clang/test/AST/ast-dump-template-json-win32-mangler-crash.cpp
index 43eae10b27b3a..54e9040740786 100644
--- a/clang/test/AST/ast-dump-template-json-win32-mangler-crash.cpp
+++ b/clang/test/AST/ast-dump-template-json-win32-mangler-crash.cpp
@@ -137,6 +137,52 @@ int main()
// CHECK-NEXT: "end": {}
// CHECK-NEXT: },
// CHECK-NEXT: "isImplicit": true,
+// CHECK-NEXT: "name": "__int256_t",
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "__int256"
+// CHECK-NEXT: },
+// CHECK-NEXT: "inner": [
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "BuiltinType",
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "__int256"
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+// CHECK-NEXT: ]
+// CHECK-NEXT: },
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "TypedefDecl",
+// CHECK-NEXT: "loc": {},
+// CHECK-NEXT: "range": {
+// CHECK-NEXT: "begin": {},
+// CHECK-NEXT: "end": {}
+// CHECK-NEXT: },
+// CHECK-NEXT: "isImplicit": true,
+// CHECK-NEXT: "name": "__uint256_t",
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "unsigned __int256"
+// CHECK-NEXT: },
+// CHECK-NEXT: "inner": [
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "BuiltinType",
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "unsigned __int256"
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+// CHECK-NEXT: ]
+// CHECK-NEXT: },
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "TypedefDecl",
+// CHECK-NEXT: "loc": {},
+// CHECK-NEXT: "range": {
+// CHECK-NEXT: "begin": {},
+// CHECK-NEXT: "end": {}
+// CHECK-NEXT: },
+// CHECK-NEXT: "isImplicit": true,
// CHECK-NEXT: "name": "__NSConstantString",
// CHECK-NEXT: "type": {
// CHECK-NEXT: "qualType": "__NSConstantString_tag"
diff --git a/clang/test/AST/ast-dump-templates.cpp b/clang/test/AST/ast-dump-templates.cpp
index 8cf9b6a29e332..377202e07fb71 100644
--- a/clang/test/AST/ast-dump-templates.cpp
+++ b/clang/test/AST/ast-dump-templates.cpp
@@ -343,6 +343,52 @@ namespace TestAbbreviatedTemplateDecls {
// JSON-NEXT: "end": {}
// JSON-NEXT: },
// JSON-NEXT: "isImplicit": true,
+// JSON-NEXT: "name": "__int256_t",
+// JSON-NEXT: "type": {
+// JSON-NEXT: "qualType": "__int256"
+// JSON-NEXT: },
+// JSON-NEXT: "inner": [
+// JSON-NEXT: {
+// JSON-NEXT: "id": "0x{{.*}}",
+// JSON-NEXT: "kind": "BuiltinType",
+// JSON-NEXT: "type": {
+// JSON-NEXT: "qualType": "__int256"
+// JSON-NEXT: }
+// JSON-NEXT: }
+// JSON-NEXT: ]
+// JSON-NEXT: },
+// JSON-NEXT: {
+// JSON-NEXT: "id": "0x{{.*}}",
+// JSON-NEXT: "kind": "TypedefDecl",
+// JSON-NEXT: "loc": {},
+// JSON-NEXT: "range": {
+// JSON-NEXT: "begin": {},
+// JSON-NEXT: "end": {}
+// JSON-NEXT: },
+// JSON-NEXT: "isImplicit": true,
+// JSON-NEXT: "name": "__uint256_t",
+// JSON-NEXT: "type": {
+// JSON-NEXT: "qualType": "unsigned __int256"
+// JSON-NEXT: },
+// JSON-NEXT: "inner": [
+// JSON-NEXT: {
+// JSON-NEXT: "id": "0x{{.*}}",
+// JSON-NEXT: "kind": "BuiltinType",
+// JSON-NEXT: "type": {
+// JSON-NEXT: "qualType": "unsigned __int256"
+// JSON-NEXT: }
+// JSON-NEXT: }
+// JSON-NEXT: ]
+// JSON-NEXT: },
+// JSON-NEXT: {
+// JSON-NEXT: "id": "0x{{.*}}",
+// JSON-NEXT: "kind": "TypedefDecl",
+// JSON-NEXT: "loc": {},
+// JSON-NEXT: "range": {
+// JSON-NEXT: "begin": {},
+// JSON-NEXT: "end": {}
+// JSON-NEXT: },
+// JSON-NEXT: "isImplicit": true,
// JSON-NEXT: "name": "__NSConstantString",
// JSON-NEXT: "type": {
// JSON-NEXT: "qualType": "__NSConstantString_tag"
diff --git a/clang/test/CXX/drs/cwg4xx.cpp b/clang/test/CXX/drs/cwg4xx.cpp
index 44385224aa388..920bbac855285 100644
--- a/clang/test/CXX/drs/cwg4xx.cpp
+++ b/clang/test/CXX/drs/cwg4xx.cpp
@@ -545,6 +545,8 @@ namespace cwg425 { // cwg425: 2.7
// expected-note-re at -11 {{built-in candidate operator*{{.*}}}}
// expected-note-re at -12 {{built-in candidate operator*{{.*}}}}
// expected-note-re at -13 {{built-in candidate operator*{{.*}}}}
+ // expected-note-re at -14 {{built-in candidate operator*{{.*}}}}
+ // expected-note-re at -15 {{built-in candidate operator*{{.*}}}}
template<typename T> struct is_float;
template<> struct is_float<float> { typedef void type; };
diff --git a/clang/test/CodeGen/AArch64/aarch64-arguments-int256.c b/clang/test/CodeGen/AArch64/aarch64-arguments-int256.c
new file mode 100644
index 0000000000000..e2a63645dc918
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/aarch64-arguments-int256.c
@@ -0,0 +1,39 @@
+// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s
+
+// Verify AArch64 IR generation for __int256_t arguments and returns.
+
+// CHECK-LABEL: define{{.*}} i256 @f_ret256(i256 noundef %a)
+__int256_t f_ret256(__int256_t a) { return a; }
+
+// CHECK-LABEL: define{{.*}} i256 @f_ret256u(i256 noundef %a)
+__uint256_t f_ret256u(__uint256_t a) { return a; }
+
+// Multiple 256-bit args
+// CHECK-LABEL: define{{.*}} i256 @f_two256(i256 noundef %a, i256 noundef %b)
+__int256_t f_two256(__int256_t a, __int256_t b) { return a + b; }
+
+// Mixed argument sizes: 256-bit with smaller types
+// CHECK-LABEL: define{{.*}} i256 @f_mixed(i64 noundef %x, i256 noundef %a, i32 noundef %y)
+__int256_t f_mixed(long long x, __int256_t a, int y) { return a; }
+
+// Register exhaustion: 3 i256 args still passed directly
+// CHECK-LABEL: define{{.*}} i256 @f_three256(i256 noundef %a, i256 noundef %b, i256 noundef %c)
+__int256_t f_three256(__int256_t a, __int256_t b, __int256_t c) { return a + b + c; }
+
+// Struct containing a 256-bit integer: passed/returned via sret/indirect
+struct s256 { __int256_t val; };
+
+// CHECK-LABEL: define{{.*}} void @f_struct256(ptr dead_on_unwind noalias writable sret(%struct.s256) align 16 %{{.*}}, ptr noundef dead_on_return %s)
+struct s256 f_struct256(struct s256 s) { return s; }
+
+// Nested struct with __int256: also indirect
+struct nested256 { int x; __int256_t val; int y; };
+
+// CHECK-LABEL: define{{.*}} void @f_nested256(ptr dead_on_unwind noalias writable sret(%struct.nested256) align 16 %{{.*}}, ptr noundef dead_on_return %s)
+struct nested256 f_nested256(struct nested256 s) { return s; }
+
+// Packed struct with __int256
+struct __attribute__((packed)) packed256 { char c; __int256_t val; };
+
+// CHECK-LABEL: define{{.*}} void @f_packed256(ptr dead_on_unwind noalias writable sret(%struct.packed256) align 1 %{{.*}}, ptr noundef dead_on_return %s)
+struct packed256 f_packed256(struct packed256 s) { return s; }
diff --git a/clang/test/CodeGen/AArch64/aarch64-int256-args.c b/clang/test/CodeGen/AArch64/aarch64-int256-args.c
new file mode 100644
index 0000000000000..9ee68e9583c5a
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/aarch64-int256-args.c
@@ -0,0 +1,30 @@
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -emit-llvm -o - %s | FileCheck %s
+
+// Verify AArch64 handles many __int256 arguments (register exhaustion).
+// Each __int256 consumes 4 GPRs (x0-x3, x4-x7), so the 3rd+ arg must
+// spill to the stack when the backend lowers this.
+
+// CHECK-LABEL: define{{.*}} i256 @f_five(i256 noundef %a, i256 noundef %b, i256 noundef %c, i256 noundef %d, i256 noundef %e)
+// CHECK: add nsw i256
+__int256 f_five(__int256 a, __int256 b, __int256 c, __int256 d, __int256 e) {
+ return a + b + c + d + e;
+}
+
+// Mixed argument sizes: smaller args consume individual GPRs, then __int256
+// takes 4 GPRs each.
+// CHECK-LABEL: define{{.*}} i256 @f_mixed(i32 noundef %x, i256 noundef %a, i64 noundef %y, i256 noundef %b, i32 noundef %z)
+// CHECK: add nsw i256
+__int256 f_mixed(int x, __int256 a, long long y, __int256 b, int z) {
+ return a + b;
+}
+
+// Struct containing __int256: must go indirect per AAPCS (>16 bytes)
+struct s256 { __int256 val; };
+
+// CHECK-LABEL: define{{.*}} void @f_struct(ptr{{.*}}sret(%struct.s256) align 16 %{{.*}}, ptr noundef dead_on_return %s)
+struct s256 f_struct(struct s256 s) { return s; }
+
+// Verify direct scalar __int256 return (even though struct s256 is indirect)
+// CHECK-LABEL: define{{.*}} i256 @f_scalar_ret(i256 noundef %x)
+// CHECK: ret i256
+__int256 f_scalar_ret(__int256 x) { return x; }
diff --git a/clang/test/CodeGen/X86/win64-int256.c b/clang/test/CodeGen/X86/win64-int256.c
new file mode 100644
index 0000000000000..b8767c0211bc4
--- /dev/null
+++ b/clang/test/CodeGen/X86/win64-int256.c
@@ -0,0 +1,22 @@
+// RUN: %clang_cc1 -triple x86_64-windows-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=GNU
+// RUN: %clang_cc1 -triple x86_64-windows-msvc -emit-llvm -o - %s | FileCheck %s --check-prefix=MSVC
+
+// Verify __int256 ABI on Windows targets (both GNU and MSVC).
+// On Win64, __int256 is passed/returned indirectly (pointer args, sret return).
+
+// GNU-LABEL: define dso_local void @f_ret(ptr dead_on_unwind noalias writable sret(i256) align 16 %agg.result, ptr noundef dead_on_return %0)
+// MSVC-LABEL: define dso_local void @f_ret(ptr dead_on_unwind noalias writable sret(i256) align 16 %agg.result, ptr noundef dead_on_return %0)
+__int256 f_ret(__int256 a) { return a; }
+
+// GNU-LABEL: define dso_local void @f_two(ptr dead_on_unwind noalias writable sret(i256) align 16 %agg.result, ptr noundef dead_on_return %0, ptr noundef dead_on_return %1)
+// MSVC-LABEL: define dso_local void @f_two(ptr dead_on_unwind noalias writable sret(i256) align 16 %agg.result, ptr noundef dead_on_return %0, ptr noundef dead_on_return %1)
+__int256 f_two(__int256 a, __int256 b) { return a + b; }
+
+// GNU-LABEL: define dso_local i32 @f_narrow(ptr noundef dead_on_return %0)
+// MSVC-LABEL: define dso_local i32 @f_narrow(ptr noundef dead_on_return %0)
+int f_narrow(__int256 a) { return (int)a; }
+
+// Mixed: small args passed in registers, __int256 via pointer
+// GNU-LABEL: define dso_local void @f_mixed(ptr dead_on_unwind noalias writable sret(i256) align 16 %agg.result, i32 noundef %x, ptr noundef dead_on_return %0, i32 noundef %y)
+// MSVC-LABEL: define dso_local void @f_mixed(ptr dead_on_unwind noalias writable sret(i256) align 16 %agg.result, i32 noundef %x, ptr noundef dead_on_return %0, i32 noundef %y)
+__int256 f_mixed(int x, __int256 a, int y) { return a; }
diff --git a/clang/test/CodeGen/X86/x86_64-PR42672.c b/clang/test/CodeGen/X86/x86_64-PR42672.c
index 42894c0c4cb57..3285b084d285f 100644
--- a/clang/test/CodeGen/X86/x86_64-PR42672.c
+++ b/clang/test/CodeGen/X86/x86_64-PR42672.c
@@ -61,10 +61,12 @@ void odd_struct(void) {
// CHECK-IMPOSSIBLE_ODD: impossible constraint in asm: cannot store value into a register
// Check Clang reports an error if attempting to return a big structure via a register.
+// Use 5 x long long (40 bytes / 320 bits) since 4 x long long (32 bytes / 256 bits)
+// can be represented as __uint256_t on targets with __int256 support.
void big_struct(void) {
#ifdef IMPOSSIBLE_BIG
struct {
- long long int v1, v2, v3, v4;
+ long long int v1, v2, v3, v4, v5;
} str;
asm("nop"
: "=r"(str));
diff --git a/clang/test/CodeGen/X86/x86_64-arguments-int256.c b/clang/test/CodeGen/X86/x86_64-arguments-int256.c
new file mode 100644
index 0000000000000..86def39e81e95
--- /dev/null
+++ b/clang/test/CodeGen/X86/x86_64-arguments-int256.c
@@ -0,0 +1,39 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s
+
+// Verify X86-64 IR generation for __int256_t arguments and returns.
+// Per the SysV ABI, types exceeding two eightbytes (128 bits) are passed
+// and returned in memory (sret/byval).
+
+// CHECK-LABEL: define{{.*}} void @f_ret256(ptr dead_on_unwind noalias writable sret(i256) align 16 %{{.*}}, ptr noundef byval(i256) align 16 %0)
+__int256_t f_ret256(__int256_t a) { return a; }
+
+// CHECK-LABEL: define{{.*}} void @f_ret256u(ptr dead_on_unwind noalias writable sret(i256) align 16 %{{.*}}, ptr noundef byval(i256) align 16 %0)
+__uint256_t f_ret256u(__uint256_t a) { return a; }
+
+// Multiple 256-bit args
+// CHECK-LABEL: define{{.*}} void @f_two256(ptr dead_on_unwind noalias writable sret(i256) align 16 %{{.*}}, ptr noundef byval(i256) align 16 %0, ptr noundef byval(i256) align 16 %1)
+__int256_t f_two256(__int256_t a, __int256_t b) { return a + b; }
+
+// Mixed argument sizes: 256-bit with smaller types
+// CHECK-LABEL: define{{.*}} void @f_mixed(ptr dead_on_unwind noalias writable sret(i256) align 16 %{{.*}}, i64 noundef %x, ptr noundef byval(i256) align 16 %0, i32 noundef %y)
+__int256_t f_mixed(long long x, __int256_t a, int y) { return a; }
+
+// 128-bit: still returned directly in registers (2 eightbytes)
+// CHECK-LABEL: define{{.*}} i128 @f_ret128(i128 noundef %a)
+__int128_t f_ret128(__int128_t a) { return a; }
+
+// 3 i256 args: all passed via byval pointers
+// CHECK-LABEL: define{{.*}} void @f_three256(ptr dead_on_unwind noalias writable sret(i256) align 16 %{{.*}}, ptr noundef byval(i256) align 16 %0, ptr noundef byval(i256) align 16 %1, ptr noundef byval(i256) align 16 %2)
+__int256_t f_three256(__int256_t a, __int256_t b, __int256_t c) { return a + b + c; }
+
+// Struct containing a 256-bit integer: passed/returned via sret/byval
+struct s256 { __int256_t val; };
+
+// CHECK-LABEL: define{{.*}} void @f_struct256(ptr dead_on_unwind noalias writable sret(%struct.s256) align 16 %{{.*}}, ptr noundef byval(%struct.s256) align 16 %s)
+struct s256 f_struct256(struct s256 s) { return s; }
+
+// Nested struct with __int256
+struct nested256 { int x; __int256_t val; int y; };
+
+// CHECK-LABEL: define{{.*}} void @f_nested256(ptr dead_on_unwind noalias writable sret(%struct.nested256) align 16 %{{.*}}, ptr noundef byval(%struct.nested256) align 16 %s)
+struct nested256 f_nested256(struct nested256 s) { return s; }
diff --git a/clang/test/CodeGen/X86/x86_64-atomic-i256.c b/clang/test/CodeGen/X86/x86_64-atomic-i256.c
new file mode 100644
index 0000000000000..73aa91d60c5c7
--- /dev/null
+++ b/clang/test/CodeGen/X86/x86_64-atomic-i256.c
@@ -0,0 +1,32 @@
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -o - %s | FileCheck %s
+
+// Verify that _Atomic __int256 operations generate the correct libcalls.
+// __int256 exceeds the widest inline atomic (the 128-bit cmpxchg16b), so
+// all operations must route through __atomic_* libcalls with size=32.
+
+_Atomic __int256_t glob;
+
+// CHECK-LABEL: define{{.*}} void @atomic_load(ptr{{.*}}sret(i256)
+// CHECK: call void @__atomic_load(i64 noundef 32, ptr noundef @glob, ptr noundef %{{.*}}, i32 noundef 5)
+__int256_t atomic_load(void) {
+ return __c11_atomic_load(&glob, __ATOMIC_SEQ_CST);
+}
+
+// CHECK-LABEL: define{{.*}} void @atomic_store(ptr noundef byval(i256) align 16 %0)
+// CHECK: call void @__atomic_store(i64 noundef 32, ptr noundef @glob, ptr noundef %{{.*}}, i32 noundef 3)
+void atomic_store(__int256_t val) {
+ __c11_atomic_store(&glob, val, __ATOMIC_RELEASE);
+}
+
+// CHECK-LABEL: define{{.*}} void @atomic_exchange(ptr{{.*}}sret(i256){{.*}}, ptr noundef byval(i256) align 16 %0)
+// CHECK: call void @__atomic_exchange(i64 noundef 32, ptr noundef @glob, ptr noundef %{{.*}}, ptr noundef %{{.*}}, i32 noundef 5)
+__int256_t atomic_exchange(__int256_t val) {
+ return __c11_atomic_exchange(&glob, val, __ATOMIC_SEQ_CST);
+}
+
+// CHECK-LABEL: define{{.*}} i1 @atomic_cas(
+// CHECK: call{{.*}} i1 @__atomic_compare_exchange(i64 noundef 32, ptr noundef @glob, ptr noundef %{{.*}}, ptr noundef %{{.*}}, i32 noundef 4, i32 noundef 2)
+_Bool atomic_cas(__int256_t *expected, __int256_t desired) {
+ return __c11_atomic_compare_exchange_strong(
+ &glob, expected, desired, __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE);
+}
diff --git a/clang/test/CodeGen/debug-info-int256.c b/clang/test/CodeGen/debug-info-int256.c
new file mode 100644
index 0000000000000..eeee2dddfd7f6
--- /dev/null
+++ b/clang/test/CodeGen/debug-info-int256.c
@@ -0,0 +1,12 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -debug-info-kind=standalone -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -debug-info-kind=standalone -emit-llvm -o - %s | FileCheck %s
+
+// Verify DWARF debug info encoding for __int256_t and __uint256_t.
+
+__int256_t s256;
+__uint256_t u256;
+
+// CHECK-DAG: !DIBasicType(name: "__int256", size: 256, encoding: DW_ATE_signed)
+// CHECK-DAG: !DIBasicType(name: "unsigned __int256", size: 256, encoding: DW_ATE_unsigned)
+// CHECK-DAG: !DIDerivedType(tag: DW_TAG_typedef, name: "__int256_t"
+// CHECK-DAG: !DIDerivedType(tag: DW_TAG_typedef, name: "__uint256_t"
diff --git a/clang/test/CodeGen/float-conv-int256.c b/clang/test/CodeGen/float-conv-int256.c
new file mode 100644
index 0000000000000..639eb9fe956c6
--- /dev/null
+++ b/clang/test/CodeGen/float-conv-int256.c
@@ -0,0 +1,63 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s
+
+// Test float <-> __int256_t conversions.
+
+// === Signed -> Float ===
+
+// CHECK-LABEL: define {{.*}}@int256_to_double
+// CHECK: sitofp i256 %{{.*}} to double
+double int256_to_double(__int256_t x) { return (double)x; }
+
+// CHECK-LABEL: define {{.*}}@int256_to_float
+// CHECK: sitofp i256 %{{.*}} to float
+float int256_to_float(__int256_t x) { return (float)x; }
+
+// CHECK-LABEL: define {{.*}}@int256_to_longdouble
+// CHECK: sitofp i256 %{{.*}} to x86_fp80
+long double int256_to_longdouble(__int256_t x) { return (long double)x; }
+
+// === Unsigned -> Float ===
+
+// CHECK-LABEL: define {{.*}}@uint256_to_double
+// CHECK: uitofp i256 %{{.*}} to double
+double uint256_to_double(__uint256_t x) { return (double)x; }
+
+// CHECK-LABEL: define {{.*}}@uint256_to_float
+// CHECK: uitofp i256 %{{.*}} to float
+float uint256_to_float(__uint256_t x) { return (float)x; }
+
+// CHECK-LABEL: define {{.*}}@uint256_to_longdouble
+// CHECK: uitofp i256 %{{.*}} to x86_fp80
+long double uint256_to_longdouble(__uint256_t x) { return (long double)x; }
+
+// === Float -> Signed ===
+
+// CHECK-LABEL: define {{.*}}@double_to_int256
+// CHECK: fptosi double %{{.*}} to i256
+__int256_t double_to_int256(double x) { return (__int256_t)x; }
+
+// CHECK-LABEL: define {{.*}}@float_to_int256
+// CHECK: fptosi float %{{.*}} to i256
+__int256_t float_to_int256(float x) { return (__int256_t)x; }
+
+// === Float -> Unsigned ===
+
+// CHECK-LABEL: define {{.*}}@double_to_uint256
+// CHECK: fptoui double %{{.*}} to i256
+__uint256_t double_to_uint256(double x) { return (__uint256_t)x; }
+
+// CHECK-LABEL: define {{.*}}@float_to_uint256
+// CHECK: fptoui float %{{.*}} to i256
+__uint256_t float_to_uint256(float x) { return (__uint256_t)x; }
+
+// === Long Double -> Unsigned ===
+
+// CHECK-LABEL: define {{.*}}@longdouble_to_uint256
+// CHECK: fptoui x86_fp80 %{{.*}} to i256
+__uint256_t longdouble_to_uint256(long double x) { return (__uint256_t)x; }
+
+// === Long Double -> Signed ===
+
+// CHECK-LABEL: define {{.*}}@longdouble_to_int256
+// CHECK: fptosi x86_fp80 %{{.*}} to i256
+__int256_t longdouble_to_int256(long double x) { return (__int256_t)x; }
diff --git a/clang/test/CodeGen/int256-func-ptr.c b/clang/test/CodeGen/int256-func-ptr.c
new file mode 100644
index 0000000000000..e59d2493baba9
--- /dev/null
+++ b/clang/test/CodeGen/int256-func-ptr.c
@@ -0,0 +1,34 @@
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=X86
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=AARCH64
+
+// Verify __int256 works correctly through function pointers and extern decls.
+
+typedef __int256 (*binop_t)(__int256, __int256);
+typedef int (*pred_t)(__int256, __int256);
+
+// X86-LABEL: define{{.*}} void @call_binop(ptr{{.*}}sret(i256){{.*}}, ptr noundef %fn, ptr noundef byval(i256) align 16 %0, ptr noundef byval(i256) align 16 %1)
+// X86: call void %{{.*}}(ptr{{.*}}sret(i256){{.*}}, ptr noundef byval(i256) align 16 %{{.*}}, ptr noundef byval(i256) align 16 %{{.*}})
+// AARCH64-LABEL: define{{.*}} i256 @call_binop(ptr noundef %fn, i256 noundef %a, i256 noundef %b)
+// AARCH64: call i256 %{{.*}}(i256 noundef %{{.*}}, i256 noundef %{{.*}})
+__int256 call_binop(binop_t fn, __int256 a, __int256 b) {
+ return fn(a, b);
+}
+
+// X86-LABEL: define{{.*}} i32 @call_pred(ptr noundef %fn, ptr noundef byval(i256) align 16 %0, ptr noundef byval(i256) align 16 %1)
+// X86: call i32 %{{.*}}(ptr noundef byval(i256) align 16 %{{.*}}, ptr noundef byval(i256) align 16 %{{.*}})
+// AARCH64-LABEL: define{{.*}} i32 @call_pred(ptr noundef %fn, i256 noundef %a, i256 noundef %b)
+// AARCH64: call i32 %{{.*}}(i256 noundef %{{.*}}, i256 noundef %{{.*}})
+int call_pred(pred_t fn, __int256 a, __int256 b) {
+ return fn(a, b);
+}
+
+// Cross-TU: extern function with __int256 params
+extern __int256 extern_add(__int256 a, __int256 b);
+
+// X86-LABEL: define{{.*}} void @call_extern(ptr{{.*}}sret(i256){{.*}}, ptr noundef byval(i256) align 16 %0, ptr noundef byval(i256) align 16 %1)
+// X86: call void @extern_add(ptr{{.*}}sret(i256){{.*}}, ptr noundef byval(i256) align 16 %{{.*}}, ptr noundef byval(i256) align 16 %{{.*}})
+// AARCH64-LABEL: define{{.*}} i256 @call_extern(i256 noundef %a, i256 noundef %b)
+// AARCH64: call i256 @extern_add(i256 noundef %{{.*}}, i256 noundef %{{.*}})
+__int256 call_extern(__int256 a, __int256 b) {
+ return extern_add(a, b);
+}
diff --git a/clang/test/CodeGen/int256-globals.c b/clang/test/CodeGen/int256-globals.c
new file mode 100644
index 0000000000000..f974f37402522
--- /dev/null
+++ b/clang/test/CodeGen/int256-globals.c
@@ -0,0 +1,39 @@
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -o - %s | FileCheck %s
+
+// Verify __int256 global/static/extern variable declarations and access.
+
+// CHECK-DAG: @global_s = global i256 0, align 16
+__int256_t global_s;
+
+// CHECK-DAG: @global_u = global i256 42, align 16
+__uint256_t global_u = 42;
+
+// CHECK-DAG: @static_s = internal global i256 0, align 16
+static __int256_t static_s;
+
+// CHECK-DAG: @extern_s = external global i256, align 16
+extern __int256_t extern_s;
+
+// CHECK-LABEL: define{{.*}} void @read_global(ptr{{.*}}sret(i256)
+// CHECK: load i256, ptr @global_s, align 16
+__int256_t read_global(void) { return global_s; }
+
+// CHECK-LABEL: define{{.*}} void @write_global(ptr{{.*}}byval(i256) align 16
+// CHECK: store i256 %{{.*}}, ptr @global_s, align 16
+void write_global(__int256_t v) { global_s = v; }
+
+// CHECK-LABEL: define{{.*}} void @read_static(ptr{{.*}}sret(i256)
+// CHECK: load i256, ptr @static_s, align 16
+__int256_t read_static(void) { return static_s; }
+
+// CHECK-LABEL: define{{.*}} void @write_static(ptr{{.*}}byval(i256) align 16
+// CHECK: store i256 %{{.*}}, ptr @static_s, align 16
+void write_static(__int256_t v) { static_s = v; }
+
+// CHECK-LABEL: define{{.*}} void @read_extern(ptr{{.*}}sret(i256)
+// CHECK: load i256, ptr @extern_s, align 16
+__int256_t read_extern(void) { return extern_s; }
+
+// CHECK-LABEL: define{{.*}} void @read_global_u(ptr{{.*}}sret(i256)
+// CHECK: load i256, ptr @global_u, align 16
+__uint256_t read_global_u(void) { return global_u; }
diff --git a/clang/test/CodeGen/overflow-builtins-int256.c b/clang/test/CodeGen/overflow-builtins-int256.c
new file mode 100644
index 0000000000000..86795d4ddcc34
--- /dev/null
+++ b/clang/test/CodeGen/overflow-builtins-int256.c
@@ -0,0 +1,59 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s
+
+// Test overflow builtins with __int256_t and __uint256_t.
+
+void overflowed(void);
+
+// CHECK-LABEL: define {{.*}}@test_sadd_overflow_int256
+// CHECK: call { i256, i1 } @llvm.sadd.with.overflow.i256(i256 %{{.+}}, i256 %{{.+}})
+int test_sadd_overflow_int256(__int256_t x, __int256_t y) {
+ __int256_t r;
+ if (__builtin_add_overflow(x, y, &r))
+ overflowed();
+ return (int)r;
+}
+
+// CHECK-LABEL: define {{.*}}@test_uadd_overflow_uint256
+// CHECK: call { i256, i1 } @llvm.uadd.with.overflow.i256(i256 %{{.+}}, i256 %{{.+}})
+int test_uadd_overflow_uint256(__uint256_t x, __uint256_t y) {
+ __uint256_t r;
+ if (__builtin_add_overflow(x, y, &r))
+ overflowed();
+ return (int)r;
+}
+
+// CHECK-LABEL: define {{.*}}@test_ssub_overflow_int256
+// CHECK: call { i256, i1 } @llvm.ssub.with.overflow.i256(i256 %{{.+}}, i256 %{{.+}})
+int test_ssub_overflow_int256(__int256_t x, __int256_t y) {
+ __int256_t r;
+ if (__builtin_sub_overflow(x, y, &r))
+ overflowed();
+ return (int)r;
+}
+
+// CHECK-LABEL: define {{.*}}@test_usub_overflow_uint256
+// CHECK: call { i256, i1 } @llvm.usub.with.overflow.i256(i256 %{{.+}}, i256 %{{.+}})
+int test_usub_overflow_uint256(__uint256_t x, __uint256_t y) {
+ __uint256_t r;
+ if (__builtin_sub_overflow(x, y, &r))
+ overflowed();
+ return (int)r;
+}
+
+// CHECK-LABEL: define {{.*}}@test_smul_overflow_int256
+// CHECK: call { i256, i1 } @llvm.smul.with.overflow.i256(i256 %{{.+}}, i256 %{{.+}})
+int test_smul_overflow_int256(__int256_t x, __int256_t y) {
+ __int256_t r;
+ if (__builtin_mul_overflow(x, y, &r))
+ overflowed();
+ return (int)r;
+}
+
+// CHECK-LABEL: define {{.*}}@test_umul_overflow_uint256
+// CHECK: call { i256, i1 } @llvm.umul.with.overflow.i256(i256 %{{.+}}, i256 %{{.+}})
+int test_umul_overflow_uint256(__uint256_t x, __uint256_t y) {
+ __uint256_t r;
+ if (__builtin_mul_overflow(x, y, &r))
+ overflowed();
+ return (int)r;
+}
diff --git a/clang/test/CodeGen/uint256_t.c b/clang/test/CodeGen/uint256_t.c
new file mode 100644
index 0000000000000..30ceb6a785f18
--- /dev/null
+++ b/clang/test/CodeGen/uint256_t.c
@@ -0,0 +1,239 @@
+// RUN: %clang_cc1 %s -emit-llvm -o - -triple=x86_64-apple-darwin9 | FileCheck %s
+
+// Basic arithmetic code generation for __uint256_t / __int256_t.
+// Verifies that all operations lower to i256 LLVM IR.
+// On x86-64, __int256 is passed/returned via byval/sret (Memory class).
+
+// CHECK-LABEL: define{{.*}} void @add256(ptr{{.*}}sret(i256)
+// CHECK: add nsw i256
+__int256_t add256(__int256_t a, __int256_t b) { return a + b; }
+
+// CHECK-LABEL: define{{.*}} void @sub256(ptr{{.*}}sret(i256)
+// CHECK: sub nsw i256
+__int256_t sub256(__int256_t a, __int256_t b) { return a - b; }
+
+// CHECK-LABEL: define{{.*}} void @mul256(ptr{{.*}}sret(i256)
+// CHECK: mul i256
+__uint256_t mul256(__uint256_t a, __uint256_t b) { return a * b; }
+
+// CHECK-LABEL: define{{.*}} void @div256(ptr{{.*}}sret(i256)
+// CHECK: udiv i256
+__uint256_t div256(__uint256_t a, __uint256_t b) { return a / b; }
+
+// CHECK-LABEL: define{{.*}} void @sdiv256(ptr{{.*}}sret(i256)
+// CHECK: sdiv i256
+__int256_t sdiv256(__int256_t a, __int256_t b) { return a / b; }
+
+// Bitwise operations -- core of Hamming distance / popcount patterns
+// CHECK-LABEL: define{{.*}} void @xor256(ptr{{.*}}sret(i256)
+// CHECK: xor i256
+__uint256_t xor256(__uint256_t a, __uint256_t b) { return a ^ b; }
+
+// CHECK-LABEL: define{{.*}} void @and256(ptr{{.*}}sret(i256)
+// CHECK: and i256
+__uint256_t and256(__uint256_t a, __uint256_t b) { return a & b; }
+
+// CHECK-LABEL: define{{.*}} void @or256(ptr{{.*}}sret(i256)
+// CHECK: or i256
+__uint256_t or256(__uint256_t a, __uint256_t b) { return a | b; }
+
+// CHECK-LABEL: define{{.*}} void @not256(ptr{{.*}}sret(i256)
+// CHECK: xor i256 %{{.*}}, -1
+__uint256_t not256(__uint256_t a) { return ~a; }
+
+// CHECK-LABEL: define{{.*}} void @shl256(ptr{{.*}}sret(i256)
+// CHECK: shl i256
+__uint256_t shl256(__uint256_t a, __uint256_t b) { return a << b; }
+
+// CHECK-LABEL: define{{.*}} void @lshr256(ptr{{.*}}sret(i256)
+// CHECK: lshr i256
+__uint256_t lshr256(__uint256_t a, __uint256_t b) { return a >> b; }
+
+// CHECK-LABEL: define{{.*}} void @ashr256(ptr{{.*}}sret(i256)
+// CHECK: ashr i256
+__int256_t ashr256(__int256_t a, __int256_t b) { return a >> b; }
+
+// Widening conversion from uint64_t
+// CHECK-LABEL: define{{.*}} void @widen(ptr{{.*}}sret(i256){{.*}}, i64
+// CHECK: zext i64 %{{.*}} to i256
+__uint256_t widen(unsigned long long x) { return (__uint256_t)x; }
+
+// Narrowing conversion to uint64_t
+// CHECK-LABEL: define{{.*}} i64 @narrow(ptr{{.*}}byval(i256)
+// CHECK: trunc i256 %{{.*}} to i64
+unsigned long long narrow(__uint256_t x) { return (unsigned long long)x; }
+
+// Conversion between i128 and i256
+// CHECK-LABEL: define{{.*}} void @from128(ptr{{.*}}sret(i256){{.*}}, i128
+// CHECK: sext i128 %{{.*}} to i256
+__int256_t from128(__int128_t x) { return (__int256_t)x; }
+
+// CHECK-LABEL: define{{.*}} i128 @to128(ptr{{.*}}byval(i256)
+// CHECK: trunc i256 %{{.*}} to i128
+__int128_t to128(__int256_t x) { return (__int128_t)x; }
+
+// Comparison
+// CHECK-LABEL: define{{.*}} i32 @cmp_eq(ptr{{.*}}byval(i256){{.*}}, ptr{{.*}}byval(i256)
+// CHECK: icmp eq i256
+int cmp_eq(__int256_t a, __int256_t b) { return a == b; }
+
+// CHECK-LABEL: define{{.*}} i32 @cmp_slt(ptr{{.*}}byval(i256){{.*}}, ptr{{.*}}byval(i256)
+// CHECK: icmp slt i256
+int cmp_slt(__int256_t a, __int256_t b) { return a < b; }
+
+// CHECK-LABEL: define{{.*}} i32 @cmp_ult(ptr{{.*}}byval(i256){{.*}}, ptr{{.*}}byval(i256)
+// CHECK: icmp ult i256
+int cmp_ult(__uint256_t a, __uint256_t b) { return a < b; }
+
+// Unsigned remainder
+// CHECK-LABEL: define{{.*}} void @urem256(ptr{{.*}}sret(i256)
+// CHECK: urem i256
+__uint256_t urem256(__uint256_t a, __uint256_t b) { return a % b; }
+
+// Signed remainder
+// CHECK-LABEL: define{{.*}} void @srem256(ptr{{.*}}sret(i256)
+// CHECK: srem i256
+__int256_t srem256(__int256_t a, __int256_t b) { return a % b; }
+
+// Unary minus
+// CHECK-LABEL: define{{.*}} void @neg256(ptr{{.*}}sret(i256)
+// CHECK: sub nsw i256 0,
+__int256_t neg256(__int256_t a) { return -a; }
+
+// Bool conversion
+// CHECK-LABEL: define{{.*}} i32 @bool256(ptr{{.*}}byval(i256)
+// CHECK: icmp ne i256 %{{.*}}, 0
+int bool256(__uint256_t a) { return !!a; }
+
+// ===----------------------------------------------------------------------===
+// Comprehensive cast / conversion tests
+// ===----------------------------------------------------------------------===
+
+// --- Widening: signed small -> signed i256 (sign-extend) ---
+
+// CHECK-LABEL: define{{.*}} void @widen_schar(ptr{{.*}}sret(i256)
+// CHECK: sext i8 %{{.*}} to i256
+__int256_t widen_schar(signed char x) { return (__int256_t)x; }
+
+// CHECK-LABEL: define{{.*}} void @widen_short(ptr{{.*}}sret(i256)
+// CHECK: sext i16 %{{.*}} to i256
+__int256_t widen_short(short x) { return (__int256_t)x; }
+
+// CHECK-LABEL: define{{.*}} void @widen_int(ptr{{.*}}sret(i256)
+// CHECK: sext i32 %{{.*}} to i256
+__int256_t widen_int(int x) { return (__int256_t)x; }
+
+// CHECK-LABEL: define{{.*}} void @widen_long(ptr{{.*}}sret(i256)
+// CHECK: sext i64 %{{.*}} to i256
+__int256_t widen_long(long long x) { return (__int256_t)x; }
+
+// --- Widening: unsigned small -> unsigned i256 (zero-extend) ---
+
+// CHECK-LABEL: define{{.*}} void @widen_uchar(ptr{{.*}}sret(i256)
+// CHECK: zext i8 %{{.*}} to i256
+__uint256_t widen_uchar(unsigned char x) { return (__uint256_t)x; }
+
+// CHECK-LABEL: define{{.*}} void @widen_ushort(ptr{{.*}}sret(i256)
+// CHECK: zext i16 %{{.*}} to i256
+__uint256_t widen_ushort(unsigned short x) { return (__uint256_t)x; }
+
+// CHECK-LABEL: define{{.*}} void @widen_uint(ptr{{.*}}sret(i256)
+// CHECK: zext i32 %{{.*}} to i256
+__uint256_t widen_uint(unsigned int x) { return (__uint256_t)x; }
+
+// CHECK-LABEL: define{{.*}} void @widen_ulong(ptr{{.*}}sret(i256)
+// CHECK: zext i64 %{{.*}} to i256
+__uint256_t widen_ulong(unsigned long long x) { return (__uint256_t)x; }
+
+// --- Widening: unsigned i128 -> unsigned i256 (zero-extend) ---
+
+// CHECK-LABEL: define{{.*}} void @widen_u128(ptr{{.*}}sret(i256)
+// CHECK: zext i128 %{{.*}} to i256
+__uint256_t widen_u128(__uint128_t x) { return (__uint256_t)x; }
+
+// --- Widening: signed small -> unsigned i256 (sign-extend) ---
+// C semantics: conversion to the wider unsigned type is value reduction
+// modulo 2^256, which for a negative source lowers to a sext directly to i256.
+
+// CHECK-LABEL: define{{.*}} void @widen_schar_to_u256(ptr{{.*}}sret(i256)
+// CHECK: sext i8 %{{.*}} to i256
+__uint256_t widen_schar_to_u256(signed char x) { return (__uint256_t)x; }
+
+// CHECK-LABEL: define{{.*}} void @widen_int_to_u256(ptr{{.*}}sret(i256)
+// CHECK: sext i32 %{{.*}} to i256
+__uint256_t widen_int_to_u256(int x) { return (__uint256_t)x; }
+
+// --- Narrowing: i256 -> small types (truncate) ---
+
+// CHECK-LABEL: define{{.*}} signext i8 @narrow_to_schar(ptr{{.*}}byval(i256)
+// CHECK: trunc i256 %{{.*}} to i8
+signed char narrow_to_schar(__int256_t x) { return (signed char)x; }
+
+// CHECK-LABEL: define{{.*}} zeroext i8 @narrow_to_uchar(ptr{{.*}}byval(i256)
+// CHECK: trunc i256 %{{.*}} to i8
+unsigned char narrow_to_uchar(__uint256_t x) { return (unsigned char)x; }
+
+// CHECK-LABEL: define{{.*}} signext i16 @narrow_to_short(ptr{{.*}}byval(i256)
+// CHECK: trunc i256 %{{.*}} to i16
+short narrow_to_short(__int256_t x) { return (short)x; }
+
+// CHECK-LABEL: define{{.*}} zeroext i16 @narrow_to_ushort(ptr{{.*}}byval(i256)
+// CHECK: trunc i256 %{{.*}} to i16
+unsigned short narrow_to_ushort(__uint256_t x) { return (unsigned short)x; }
+
+// CHECK-LABEL: define{{.*}} i32 @narrow_to_int(ptr{{.*}}byval(i256)
+// CHECK: trunc i256 %{{.*}} to i32
+int narrow_to_int(__int256_t x) { return (int)x; }
+
+// CHECK-LABEL: define{{.*}} i32 @narrow_to_uint(ptr{{.*}}byval(i256)
+// CHECK: trunc i256 %{{.*}} to i32
+unsigned int narrow_to_uint(__uint256_t x) { return (unsigned int)x; }
+
+// CHECK-LABEL: define{{.*}} i64 @narrow_to_long(ptr{{.*}}byval(i256)
+// CHECK: trunc i256 %{{.*}} to i64
+long long narrow_to_long(__int256_t x) { return (long long)x; }
+
+// CHECK-LABEL: define{{.*}} i64 @narrow_to_ulong(ptr{{.*}}byval(i256)
+// CHECK: trunc i256 %{{.*}} to i64
+unsigned long long narrow_to_ulong(__uint256_t x) {
+ return (unsigned long long)x;
+}
+
+// --- Narrowing: i256 -> i128 (unsigned) ---
+
+// CHECK-LABEL: define{{.*}} i128 @narrow_to_u128(ptr{{.*}}byval(i256)
+// CHECK: trunc i256 %{{.*}} to i128
+__uint128_t narrow_to_u128(__uint256_t x) { return (__uint128_t)x; }
+
+// --- Cross-sign: signed <-> unsigned i256 (no-op, same bit pattern) ---
+
+// CHECK-LABEL: define{{.*}} void @signed_to_unsigned(ptr{{.*}}sret(i256){{.*}}, ptr{{.*}}byval(i256)
+// CHECK-NOT: ext
+// CHECK-NOT: trunc
+// CHECK: ret void
+__uint256_t signed_to_unsigned(__int256_t x) { return (__uint256_t)x; }
+
+// CHECK-LABEL: define{{.*}} void @unsigned_to_signed(ptr{{.*}}sret(i256){{.*}}, ptr{{.*}}byval(i256)
+// CHECK-NOT: ext
+// CHECK-NOT: trunc
+// CHECK: ret void
+__int256_t unsigned_to_signed(__uint256_t x) { return (__int256_t)x; }
+
+// --- Multi-step: negative char -> signed i256 (sign-extension across
+// 248 bits) ---
+// This verifies that a negative 'signed char' value implicitly converted to
+// __int256_t is sign-extended, not zero-extended into a large positive value.
+
+// CHECK-LABEL: define{{.*}} void @neg_char_to_i256(ptr{{.*}}sret(i256)
+// CHECK: sext i8 %{{.*}} to i256
+__int256_t neg_char_to_i256(signed char x) { return x; }
+
+// --- Implicit conversions (no explicit cast) ---
+
+// CHECK-LABEL: define{{.*}} void @implicit_int_to_i256(ptr{{.*}}sret(i256)
+// CHECK: sext i32 %{{.*}} to i256
+__int256_t implicit_int_to_i256(int x) { return x; }
+
+// CHECK-LABEL: define{{.*}} i32 @implicit_i256_to_int(ptr{{.*}}byval(i256)
+// CHECK: trunc i256 %{{.*}} to i32
+int implicit_i256_to_int(__int256_t x) { return x; }
diff --git a/clang/test/CodeGen/varargs-int256.c b/clang/test/CodeGen/varargs-int256.c
new file mode 100644
index 0000000000000..22e61d22d598e
--- /dev/null
+++ b/clang/test/CodeGen/varargs-int256.c
@@ -0,0 +1,67 @@
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=X86
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=AARCH64
+
+// Test that __int256 works correctly with variadic functions (va_arg).
+
+typedef __builtin_va_list va_list;
+
+// x86_64: return via sret (Memory class per SysV ABI)
+// X86-LABEL: define{{.*}} void @va_int256(ptr dead_on_unwind noalias writable sret(i256) align 16 %{{.*}}, i32 noundef %n, ...)
+// X86: load i256, ptr %{{.*}}, align 16
+
+// AArch64: return directly (4 GPRs)
+// AARCH64-LABEL: define{{.*}} i256 @va_int256(i32 noundef %n, ...)
+// AARCH64: load i256, ptr %{{.*}}, align
+__int256 va_int256(int n, ...) {
+ va_list ap;
+ __builtin_va_start(ap, n);
+ __int256 v = __builtin_va_arg(ap, __int256);
+ __builtin_va_end(ap);
+ return v;
+}
+
+// Test passing __int256 to a variadic function call.
+void callee(int, ...);
+
+// x86_64: __int256 passed via byval pointer
+// X86-LABEL: define{{.*}} void @pass_int256(ptr noundef byval(i256) align 16 %0)
+// X86: call void (i32, ...) @callee(i32 noundef 1, ptr noundef byval(i256) align 16 %
+
+// AArch64: __int256 passed directly
+// AARCH64-LABEL: define{{.*}} void @pass_int256(i256 noundef %x)
+// AARCH64: call void (i32, ...) @callee(i32 noundef 1, i256 noundef %
+void pass_int256(__int256 x) {
+ callee(1, x);
+}
+
+// Multiple va_arg fetches of __int256
+// X86-LABEL: define{{.*}} void @va_two(ptr{{.*}}sret(i256){{.*}}, i32 noundef %n, ...)
+// X86: load i256, ptr %{{.*}}, align 16
+// X86: load i256, ptr %{{.*}}, align 16
+// X86: add nsw i256
+
+// AARCH64-LABEL: define{{.*}} i256 @va_two(i32 noundef %n, ...)
+// AARCH64: load i256
+// AARCH64: load i256
+// AARCH64: add nsw i256
+__int256 va_two(int n, ...) {
+ va_list ap;
+ __builtin_va_start(ap, n);
+ __int256 a = __builtin_va_arg(ap, __int256);
+ __int256 b = __builtin_va_arg(ap, __int256);
+ __builtin_va_end(ap);
+ return a + b;
+}
+
+// Mixed sizes in varargs: int, __int256, long long
+// X86-LABEL: define{{.*}} i64 @va_mixed(i32 noundef %n, ...)
+// AARCH64-LABEL: define{{.*}} i64 @va_mixed(i32 noundef %n, ...)
+long long va_mixed(int n, ...) {
+ va_list ap;
+ __builtin_va_start(ap, n);
+ int x = __builtin_va_arg(ap, int);
+ __int256 big = __builtin_va_arg(ap, __int256);
+ long long y = __builtin_va_arg(ap, long long);
+ __builtin_va_end(ap);
+ return x + (long long)big + y;
+}
diff --git a/clang/test/CodeGenCXX/mangle-int256.cpp b/clang/test/CodeGenCXX/mangle-int256.cpp
new file mode 100644
index 0000000000000..758cbf9e47619
--- /dev/null
+++ b/clang/test/CodeGenCXX/mangle-int256.cpp
@@ -0,0 +1,32 @@
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=x86_64-apple-darwin9 | FileCheck %s --check-prefix=ITANIUM
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=x86_64-pc-windows-msvc | FileCheck %s --check-prefix=MS
+
+// Verify Itanium C++ name mangling for __int256_t / __uint256_t.
+// These use vendor-extended type mangling since there are no standard
+// single-letter codes for 256-bit integers (unlike 'n'/'o' for 128-bit).
+
+// Verify Microsoft C++ name mangling for __int256_t / __uint256_t.
+// These use $$_L / $$_M (extending the _L / _M pattern for __int128).
+
+// ITANIUM-LABEL: define{{.*}} void @_Z3f01u7__int256u8__uint256
+// MS-LABEL: define{{.*}} void @"?f01@@YAX$$_L$$_M at Z"
+void f01(__int256_t, __uint256_t) {}
+
+// ITANIUM-LABEL: define{{.*}} void @_Z3f02no
+// MS-LABEL: define{{.*}} void @"?f02@@YAX_L_M at Z"
+void f02(__int128_t, __uint128_t) {}
+
+// Overloading: __int256_t vs __int128_t should produce different manglings
+// ITANIUM-LABEL: define{{.*}} void @_Z3f03n
+// MS-LABEL: define{{.*}} void @"?f03@@YAX_L at Z"
+void f03(__int128_t) {}
+// ITANIUM-LABEL: define{{.*}} void @_Z3f03u7__int256
+// MS-LABEL: define{{.*}} void @"?f03@@YAX$$_L at Z"
+void f03(__int256_t) {}
+
+// ITANIUM-LABEL: define{{.*}} void @_Z3f04o
+// MS-LABEL: define{{.*}} void @"?f04@@YAX_M at Z"
+void f04(__uint128_t) {}
+// ITANIUM-LABEL: define{{.*}} void @_Z3f04u8__uint256
+// MS-LABEL: define{{.*}} void @"?f04@@YAX$$_M at Z"
+void f04(__uint256_t) {}
diff --git a/clang/test/Modules/decl-params-determinisim.m b/clang/test/Modules/decl-params-determinisim.m
index db4ed33265388..cddad068837b8 100644
--- a/clang/test/Modules/decl-params-determinisim.m
+++ b/clang/test/Modules/decl-params-determinisim.m
@@ -28,23 +28,23 @@
// CHECK: <TYPE_FUNCTION_PROTO
// CHECK-NEXT: <DECL_PARM_VAR
-// CHECK-SAME: op5=13
-// CHECK-NEXT: <DECL_PARM_VAR
-// CHECK-SAME: op5=14
-// CHECK-NEXT: <DECL_PARM_VAR
// CHECK-SAME: op5=15
// CHECK-NEXT: <DECL_PARM_VAR
// CHECK-SAME: op5=16
+// CHECK-NEXT: <DECL_PARM_VAR
+// CHECK-SAME: op5=17
+// CHECK-NEXT: <DECL_PARM_VAR
+// CHECK-SAME: op5=18
/// Decl records start at 43
// CHECK: <DECL_RECORD
-// CHECK-SAME: op5=54
-// CHECK-NEXT: <DECL_RECORD
-// CHECK-SAME: op5=55
-// CHECK-NEXT: <DECL_RECORD
// CHECK-SAME: op5=56
// CHECK-NEXT: <DECL_RECORD
// CHECK-SAME: op5=57
+// CHECK-NEXT: <DECL_RECORD
+// CHECK-SAME: op5=58
+// CHECK-NEXT: <DECL_RECORD
+// CHECK-SAME: op5=59
//--- headers/a.h
void f(struct A0 *a0,
diff --git a/clang/test/Preprocessor/init-aarch64.c b/clang/test/Preprocessor/init-aarch64.c
index 09e3fc926a309..77c1cb23a56e1 100644
--- a/clang/test/Preprocessor/init-aarch64.c
+++ b/clang/test/Preprocessor/init-aarch64.c
@@ -267,6 +267,7 @@
// AARCH64-NEXT: #define __SIZEOF_DOUBLE__ 8
// AARCH64-NEXT: #define __SIZEOF_FLOAT__ 4
// AARCH64-NEXT: #define __SIZEOF_INT128__ 16
+// AARCH64-NEXT: #define __SIZEOF_INT256__ 32
// AARCH64-NEXT: #define __SIZEOF_INT__ 4
// AARCH64-NEXT: #define __SIZEOF_LONG_DOUBLE__ 16
// AARCH64-NEXT: #define __SIZEOF_LONG_LONG__ 8
diff --git a/clang/test/Preprocessor/init.c b/clang/test/Preprocessor/init.c
index 80b7a6399e5f4..912226a7906b6 100644
--- a/clang/test/Preprocessor/init.c
+++ b/clang/test/Preprocessor/init.c
@@ -1939,6 +1939,7 @@
// WEBASSEMBLY-NEXT:#define __SIZEOF_DOUBLE__ 8
// WEBASSEMBLY-NEXT:#define __SIZEOF_FLOAT__ 4
// WEBASSEMBLY-NEXT:#define __SIZEOF_INT128__ 16
+// WEBASSEMBLY64-NEXT:#define __SIZEOF_INT256__ 32
// WEBASSEMBLY-NEXT:#define __SIZEOF_INT__ 4
// WEBASSEMBLY-NEXT:#define __SIZEOF_LONG_DOUBLE__ 16
// WEBASSEMBLY-NEXT:#define __SIZEOF_LONG_LONG__ 8
diff --git a/clang/test/Sema/256bitint.c b/clang/test/Sema/256bitint.c
new file mode 100644
index 0000000000000..b63c193f438be
--- /dev/null
+++ b/clang/test/Sema/256bitint.c
@@ -0,0 +1,72 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64-apple-darwin9 %s -DHAVE
+// RUN: %clang_cc1 -fsyntax-only -verify -triple i686-linux-gnu %s -DHAVE_NOT
+// RUN: %clang_cc1 -fsyntax-only -verify -triple aarch64-linux-gnu %s -DHAVE
+// RUN: %clang_cc1 -fsyntax-only -verify -triple arm-linux-gnueabi %s -DHAVE_NOT
+// RUN: %clang_cc1 -fsyntax-only -verify -triple powerpc64-linux-gnu %s -DHAVE
+// RUN: %clang_cc1 -fsyntax-only -verify -triple riscv64-linux-gnu %s -DHAVE
+// RUN: %clang_cc1 -fsyntax-only -verify -triple wasm32-unknown-unknown %s -DHAVE_NOT
+// RUN: %clang_cc1 -fsyntax-only -verify -triple wasm64-unknown-unknown %s -DHAVE
+
+#ifdef HAVE
+// expected-no-diagnostics
+
+// __int256 is supported on all 64-bit targets
+
+__int256_t b256s = (__int256_t)0;
+__uint256_t b256u = (__uint256_t)-1;
+
+// Explicit signed/unsigned qualifiers
+__int256 i256 = (__int256)0;
+signed __int256 si256 = (signed __int256)0;
+unsigned __int256 ui256 = (unsigned __int256)-1;
+
+// sizeof / alignof
+int sz[sizeof(__int256_t) == 32 ? 1 : -1];
+int al[_Alignof(__int256_t) == 16 ? 1 : -1];
+int sz2[sizeof(__uint256_t) == 32 ? 1 : -1];
+int al2[_Alignof(__uint256_t) == 16 ? 1 : -1];
+
+// __SIZEOF_INT256__ predefined macro
+int sizemacro[__SIZEOF_INT256__ == 32 ? 1 : -1];
+
+// Basic arithmetic
+__int256_t arith_add(__int256_t a, __int256_t b) { return a + b; }
+__int256_t arith_sub(__int256_t a, __int256_t b) { return a - b; }
+__int256_t arith_mul(__int256_t a, __int256_t b) { return a * b; }
+__int256_t arith_div(__int256_t a, __int256_t b) { return a / b; }
+__int256_t arith_rem(__int256_t a, __int256_t b) { return a % b; }
+
+// Bitwise operations (key for Hamming distance / popcount use cases)
+__uint256_t bit_and(__uint256_t a, __uint256_t b) { return a & b; }
+__uint256_t bit_or(__uint256_t a, __uint256_t b) { return a | b; }
+__uint256_t bit_xor(__uint256_t a, __uint256_t b) { return a ^ b; }
+__uint256_t bit_not(__uint256_t a) { return ~a; }
+__uint256_t bit_shl(__uint256_t a, __uint256_t b) { return a << b; }
+__uint256_t bit_shr(__uint256_t a, __uint256_t b) { return a >> b; }
+
+// Comparisons
+int cmp_eq(__int256_t a, __int256_t b) { return a == b; }
+int cmp_lt(__int256_t a, __int256_t b) { return a < b; }
+int cmp_gt(__int256_t a, __int256_t b) { return a > b; }
+
+// Conversions between int256 and int128
+__int256_t from128(__int128_t x) { return (__int256_t)x; }
+__int128_t to128(__int256_t x) { return (__int128_t)x; }
+
+// Conversion from smaller types
+__int256_t from64(long long x) { return (__int256_t)x; }
+__uint256_t fromu64(unsigned long long x) { return (__uint256_t)x; }
+
+// Typedef equivalence
+typedef __int256_t MyInt256;
+MyInt256 typedef_test(MyInt256 a) { return a; }
+
+#else
+
+__int256 n; // expected-error {{__int256 is not supported on this target}}
+
+#if defined(__SIZEOF_INT256__)
+#error __SIZEOF_INT256__ should not be defined
+#endif
+
+#endif
diff --git a/clang/test/Sema/atomic-builtins-int256.c b/clang/test/Sema/atomic-builtins-int256.c
new file mode 100644
index 0000000000000..4fbb0fffcb5f4
--- /dev/null
+++ b/clang/test/Sema/atomic-builtins-int256.c
@@ -0,0 +1,29 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64-unknown-linux-gnu %s
+
+// Verify that __sync_* builtins reject __int256 (max atomic width is 16 bytes).
+// The __c11_atomic_* builtins accept __int256 (via libcalls) and are tested
+// separately in atomic-int256.c and CodeGen/X86/x86_64-atomic-i256.c.
+
+__int256 test_sync_add(__int256 *addr, __int256 val) {
+ return __sync_fetch_and_add(addr, val); // expected-error {{address argument to atomic builtin must be a pointer to 1,2,4,8 or 16 byte type}}
+}
+
+__int256 test_sync_sub(__int256 *addr, __int256 val) {
+ return __sync_fetch_and_sub(addr, val); // expected-error {{address argument to atomic builtin must be a pointer to 1,2,4,8 or 16 byte type}}
+}
+
+__int256 test_sync_or(__int256 *addr, __int256 val) {
+ return __sync_fetch_and_or(addr, val); // expected-error {{address argument to atomic builtin must be a pointer to 1,2,4,8 or 16 byte type}}
+}
+
+__int256 test_sync_and(__int256 *addr, __int256 val) {
+ return __sync_fetch_and_and(addr, val); // expected-error {{address argument to atomic builtin must be a pointer to 1,2,4,8 or 16 byte type}}
+}
+
+__int256 test_sync_xor(__int256 *addr, __int256 val) {
+ return __sync_fetch_and_xor(addr, val); // expected-error {{address argument to atomic builtin must be a pointer to 1,2,4,8 or 16 byte type}}
+}
+
+_Bool test_sync_cas(__int256 *addr, __int256 oldval, __int256 newval) {
+ return __sync_bool_compare_and_swap(addr, oldval, newval); // expected-error {{address argument to atomic builtin must be a pointer to 1,2,4,8 or 16 byte type}}
+}
diff --git a/clang/test/Sema/atomic-int256.c b/clang/test/Sema/atomic-int256.c
new file mode 100644
index 0000000000000..6257338e50ad4
--- /dev/null
+++ b/clang/test/Sema/atomic-int256.c
@@ -0,0 +1,26 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64-unknown-linux-gnu %s
+// expected-no-diagnostics
+
+// __int256 is never lock-free (256 bits > max atomic width on any current target)
+_Static_assert(!__atomic_always_lock_free(32, 0), "__int256 should not be always lock-free");
+
+// _Atomic __int256_t variables should compile
+_Atomic __int256_t atomic_s256;
+_Atomic __uint256_t atomic_u256;
+
+// Atomic load/store should compile (will use libcalls)
+__int256_t load_atomic(void) {
+ return __c11_atomic_load(&atomic_s256, __ATOMIC_SEQ_CST);
+}
+
+void store_atomic(__int256_t val) {
+ __c11_atomic_store(&atomic_s256, val, __ATOMIC_SEQ_CST);
+}
+
+__uint256_t load_atomic_unsigned(void) {
+ return __c11_atomic_load(&atomic_u256, __ATOMIC_SEQ_CST);
+}
+
+void store_atomic_unsigned(__uint256_t val) {
+ __c11_atomic_store(&atomic_u256, val, __ATOMIC_SEQ_CST);
+}
diff --git a/clang/test/Sema/bitfield-int256.c b/clang/test/Sema/bitfield-int256.c
new file mode 100644
index 0000000000000..89a0bdc242668
--- /dev/null
+++ b/clang/test/Sema/bitfield-int256.c
@@ -0,0 +1,42 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64-linux-gnu %s
+
+// Test __int256 bitfield support.
+
+struct S1 {
+ __int256 x : 200;
+ __int256 y : 56;
+};
+
+_Static_assert(sizeof(struct S1) == 32, "S1 should be 32 bytes");
+
+struct S2 {
+ char a;
+ __int256 x : 100;
+};
+
+struct S3 {
+ unsigned __int256 x : 256; // Full width bitfield
+};
+
+_Static_assert(sizeof(struct S3) == 32, "S3 should be 32 bytes");
+
+struct S4 {
+ __int256 x : 1; // Single bit signed
+ unsigned __int256 y : 1; // Single bit unsigned
+};
+
+// Test bitfield access
+int test_bitfield(void) {
+ struct S1 s = {};
+ s.x = 42;
+ s.y = -1;
+ return (int)s.x + (int)s.y;
+}
+
+// Test zero-width bitfield
+struct S5 {
+ __int256 : 0; // Zero-width bitfield for alignment
+ int x;
+};
+
+// expected-no-diagnostics
diff --git a/clang/test/Sema/const-eval.c b/clang/test/Sema/const-eval.c
index 53face901d75e..9be6cd73b4355 100644
--- a/clang/test/Sema/const-eval.c
+++ b/clang/test/Sema/const-eval.c
@@ -143,6 +143,11 @@ void *PR28739b = &PR28739b + (__int128)(unsigned long)-1; // ex
__int128 PR28739c = (&PR28739c + (__int128)(unsigned long)-1) - &PR28739c; // expected-warning {{refers past the last possible element}}
void *PR28739d = &(&PR28739d)[(__int128)(unsigned long)-1]; // expected-warning {{refers past the last possible element}}
+#ifdef __SIZEOF_INT256__
+// __int256 pointer arithmetic -- same pattern as __int128 above.
+__int256 PR28739_256 = (&PR28739_256 + (__int256)(unsigned long)-1) - &PR28739_256; // expected-warning {{refers past the last possible element}}
+#endif
+
struct PR35214_X {
int k;
int arr[];
diff --git a/clang/test/Sema/constant-builtins-2.c b/clang/test/Sema/constant-builtins-2.c
index bb005981b6daf..20dbd6c584cf3 100644
--- a/clang/test/Sema/constant-builtins-2.c
+++ b/clang/test/Sema/constant-builtins-2.c
@@ -315,6 +315,12 @@ char clz55[__builtin_clzg((unsigned __int128)0xf, 42) == BITSIZE(__int128) - 4 ?
char clz56[__builtin_clzg((unsigned __int128)(1 << (BITSIZE(__int128) - 1))) == 0 ? 1 : -1]; // expected-error {{variable length array declaration not allowed at file scope}}
char clz57[__builtin_clzg((unsigned __int128)(1 << (BITSIZE(__int128) - 1)), 42) == 0 ? 1 : -1]; // expected-error {{variable length array declaration not allowed at file scope}}
#endif
+#ifdef __SIZEOF_INT256__
+int clz256_0 = __builtin_clzg((unsigned __int256)0); // expected-error {{not a compile-time constant}}
+char clz256_1[__builtin_clzg((unsigned __int256)0, 42) == 42 ? 1 : -1];
+char clz256_2[__builtin_clzg((unsigned __int256)0x1) == BITSIZE(__int256) - 1 ? 1 : -1];
+char clz256_3[__builtin_clzg((unsigned __int256)0xf) == BITSIZE(__int256) - 4 ? 1 : -1];
+#endif
int clz58 = __builtin_clzg((unsigned _BitInt(128))0); // expected-error {{not a compile-time constant}}
char clz59[__builtin_clzg((unsigned _BitInt(128))0, 42) == 42 ? 1 : -1];
char clz60[__builtin_clzg((unsigned _BitInt(128))0x1) == BITSIZE(_BitInt(128)) - 1 ? 1 : -1];
@@ -381,6 +387,12 @@ char ctz53[__builtin_ctzg((unsigned __int128)0x10, 42) == 4 ? 1 : -1];
char ctz54[__builtin_ctzg((unsigned __int128)1 << (BITSIZE(__int128) - 1)) == BITSIZE(__int128) - 1 ? 1 : -1];
char ctz55[__builtin_ctzg((unsigned __int128)1 << (BITSIZE(__int128) - 1), 42) == BITSIZE(__int128) - 1 ? 1 : -1];
#endif
+#ifdef __SIZEOF_INT256__
+int ctz256_0 = __builtin_ctzg((unsigned __int256)0); // expected-error {{not a compile-time constant}}
+char ctz256_1[__builtin_ctzg((unsigned __int256)0, 42) == 42 ? 1 : -1];
+char ctz256_2[__builtin_ctzg((unsigned __int256)0x1) == 0 ? 1 : -1];
+char ctz256_3[__builtin_ctzg((unsigned __int256)0x10) == 4 ? 1 : -1];
+#endif
int ctz56 = __builtin_ctzg((unsigned _BitInt(128))0); // expected-error {{not a compile-time constant}}
char ctz57[__builtin_ctzg((unsigned _BitInt(128))0, 42) == 42 ? 1 : -1];
char ctz58[__builtin_ctzg((unsigned _BitInt(128))0x1) == 0 ? 1 : -1];
@@ -408,6 +420,9 @@ char popcount15[__builtin_popcountg(~0ULL) == BITSIZE(long long) ? 1 : -1];
#ifdef __SIZEOF_INT128__
char popcount16[__builtin_popcountg(~(unsigned __int128)0) == BITSIZE(__int128) ? 1 : -1];
#endif
+#ifdef __SIZEOF_INT256__
+char popcount256[__builtin_popcountg(~(unsigned __int256)0) == BITSIZE(__int256) ? 1 : -1];
+#endif
char popcount17[__builtin_popcountg(~(unsigned _BitInt(128))0) == BITSIZE(_BitInt(128)) ? 1 : -1];
char parity1[__builtin_parity(0) == 0 ? 1 : -1];
diff --git a/clang/test/Sema/enum.c b/clang/test/Sema/enum.c
index f0da5f097fa80..9f9e4d9baeabc 100644
--- a/clang/test/Sema/enum.c
+++ b/clang/test/Sema/enum.c
@@ -206,7 +206,9 @@ _Static_assert(
long long : 0,
unsigned long long : 0,
__int128_t : 0,
- __uint128_t : 1
+ __uint128_t : 1,
+ __int256_t : 0,
+ __uint256_t : 0
)
);
diff --git a/clang/test/Sema/struct-layout-int256.c b/clang/test/Sema/struct-layout-int256.c
new file mode 100644
index 0000000000000..6f56ee921a086
--- /dev/null
+++ b/clang/test/Sema/struct-layout-int256.c
@@ -0,0 +1,70 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64-linux-gnu %s
+
+// Test struct layout, alignment, and padding with __int256.
+
+// Basic alignment and size
+_Static_assert(sizeof(__int256) == 32, "");
+_Static_assert(_Alignof(__int256) == 16, "");
+_Static_assert(sizeof(unsigned __int256) == 32, "");
+
+// Struct with __int256 member
+struct Basic {
+ __int256 x;
+};
+_Static_assert(sizeof(struct Basic) == 32, "");
+_Static_assert(_Alignof(struct Basic) == 16, "");
+
+// Struct with padding before __int256
+struct Padded {
+ char a;
+ __int256 x;
+};
+// 'a' at offset 0 (1 byte), 15 bytes padding, 'x' at offset 16
+_Static_assert(sizeof(struct Padded) == 48, "");
+_Static_assert(_Alignof(struct Padded) == 16, "");
+
+// Struct with multiple __int256 members
+struct Multi {
+ __int256 x;
+ __int256 y;
+};
+_Static_assert(sizeof(struct Multi) == 64, "");
+
+// Nested struct
+struct Nested {
+ struct Basic inner;
+ int z;
+};
+_Static_assert(sizeof(struct Nested) == 48, ""); // 32 + 4 + 12 padding
+
+// Union with __int256
+union U {
+ __int256 x;
+ char bytes[32];
+ long long parts[4];
+};
+_Static_assert(sizeof(union U) == 32, "");
+_Static_assert(_Alignof(union U) == 16, "");
+
+// Array of __int256
+struct ArrayMember {
+ __int256 arr[2];
+};
+_Static_assert(sizeof(struct ArrayMember) == 64, "");
+
+// Packed struct
+struct __attribute__((packed)) Packed {
+ char a;
+ __int256 x;
+};
+_Static_assert(sizeof(struct Packed) == 33, "");
+_Static_assert(_Alignof(struct Packed) == 1, "");
+
+// Aligned struct override
+struct __attribute__((aligned(64))) OverAligned {
+ __int256 x;
+};
+_Static_assert(sizeof(struct OverAligned) == 64, "");
+_Static_assert(_Alignof(struct OverAligned) == 64, "");
+
+// expected-no-diagnostics
diff --git a/clang/test/Sema/tautological-constant-compare.c b/clang/test/Sema/tautological-constant-compare.c
index 04b8a1416be0b..561979a3665a7 100644
--- a/clang/test/Sema/tautological-constant-compare.c
+++ b/clang/test/Sema/tautological-constant-compare.c
@@ -486,6 +486,11 @@ int main(void)
if (i128 == -1) // used to crash
return 0;
#endif
+#if __SIZEOF_INT256__
+ __int256 i256 = value();
+ if (i256 == -1) // mirrors __int128 test above
+ return 0;
+#endif
enum E {
diff --git a/clang/test/Sema/types.c b/clang/test/Sema/types.c
index 2be0e6544f3d7..baae91f61bc2e 100644
--- a/clang/test/Sema/types.c
+++ b/clang/test/Sema/types.c
@@ -39,6 +39,31 @@ typedef unsigned __int128 check_uint_128;
typedef __uint128_t check_uint_128; // expected-note {{here}}
typedef int check_uint_128; // expected-error {{different types ('int' vs '__uint128_t' (aka 'unsigned __int128'))}}
+#ifdef __SIZEOF_INT256__
+// __int256_t / __uint256_t are available (mirrors __int128_t tests above).
+void a256(void) {
+ __int256_t s;
+ __uint256_t t;
+}
+
+// __int256 is a keyword
+int c256(void) {
+ __int256 i;
+ unsigned __int256 j;
+ long unsigned __int256 k; // expected-error {{'long __int256' is invalid}}
+ int __int256; // expected-error {{cannot combine with previous}} expected-warning {{does not declare anything}}
+}
+
+// __int256_t is __int256; __uint256_t is unsigned __int256.
+typedef __int256 check_int_256;
+typedef __int256_t check_int_256; // expected-note {{here}}
+typedef int check_int_256; // expected-error {{different types ('int' vs '__int256_t' (aka '__int256'))}}
+
+typedef unsigned __int256 check_uint_256;
+typedef __uint256_t check_uint_256; // expected-note {{here}}
+typedef int check_uint_256; // expected-error {{different types ('int' vs '__uint256_t' (aka 'unsigned __int256'))}}
+#endif
+
// Array type merging should convert array size to whatever matches the target
// pointer size.
extern int i[1LL];
diff --git a/clang/test/SemaCUDA/int256.cu b/clang/test/SemaCUDA/int256.cu
new file mode 100644
index 0000000000000..ece1e099e0d5c
--- /dev/null
+++ b/clang/test/SemaCUDA/int256.cu
@@ -0,0 +1,30 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa \
+// RUN: -aux-triple x86_64-unknown-linux-gnu \
+// RUN: -fcuda-is-device -verify -fsyntax-only %s
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa \
+// RUN: -aux-triple x86_64-unknown-linux-gnu \
+// RUN: -fcuda-is-device -verify -fsyntax-only %s
+// RUN: %clang_cc1 -triple nvptx \
+// RUN: -aux-triple x86_64-unknown-linux-gnu \
+// RUN: -fcuda-is-device -verify -fsyntax-only %s
+
+// Verify that __int256 is allowed in CUDA device code when the host target
+// supports it, matching the __int128 behavior (see allow-int128.cu).
+// In CUDA mode, the host type system is shared with the device — type support
+// diagnostics are deferred and not emitted for CUDA device compilations.
+
+// expected-no-diagnostics
+
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+
+__int256 h_glb;
+__device__ __int256 d_glb;
+
+__device__ __int256 bar() {
+ return d_glb;
+}
+
+__global__ void kernel() {
+ bar();
+}
diff --git a/clang/test/SemaCXX/deleted-operator.cpp b/clang/test/SemaCXX/deleted-operator.cpp
index 64b2b22e5661c..2f8c882db51b9 100644
--- a/clang/test/SemaCXX/deleted-operator.cpp
+++ b/clang/test/SemaCXX/deleted-operator.cpp
@@ -8,8 +8,8 @@ struct PR10757 {
int PR10757f() {
PR10757 a1;
// FIXME: We get a ridiculous number of "built-in candidate" notes here...
- if(~a1) {} // expected-error {{overload resolution selected deleted operator}} expected-note 6-8 {{built-in candidate}}
- if(a1==a1) {} // expected-error {{overload resolution selected deleted operator}} expected-note 1-144 {{built-in candidate}}
+ if(~a1) {} // expected-error {{overload resolution selected deleted operator}} expected-note 6-10 {{built-in candidate}}
+ if(a1==a1) {} // expected-error {{overload resolution selected deleted operator}} expected-note 1-196 {{built-in candidate}}
}
struct DelOpDel {
diff --git a/clang/test/SemaCXX/int256-templates.cpp b/clang/test/SemaCXX/int256-templates.cpp
new file mode 100644
index 0000000000000..857cd13db2230
--- /dev/null
+++ b/clang/test/SemaCXX/int256-templates.cpp
@@ -0,0 +1,219 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++20 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -std=c++20 -fsyntax-only -verify %s
+//
+// Test __int256 behavior with C++ templates, SFINAE, concepts, and conversions.
+//
+// This exercises advanced C++ interactions that upstream reviewers are likely
+// to probe: NTTP (non-type template parameters), SFINAE, implicit/explicit
+// conversions, constexpr template metaprogramming, and aggregate initialization.
+//
+// Uses Clang builtin type traits (__is_integral, etc.) to avoid depending on
+// standard library headers, which are not available in %clang_cc1 tests.
+// expected-no-diagnostics
+
+// Minimal enable_if for SFINAE testing without <type_traits>
+template <bool B, typename T = void> struct enable_if {};
+template <typename T> struct enable_if<true, T> { using type = T; };
+template <bool B, typename T = void> using enable_if_t = typename enable_if<B, T>::type;
+
+// Minimal is_same
+template <typename T, typename U> struct is_same { static constexpr bool value = false; };
+template <typename T> struct is_same<T, T> { static constexpr bool value = true; };
+
+// Minimal conditional
+template <bool B, typename T, typename F> struct conditional { using type = F; };
+template <typename T, typename F> struct conditional<true, T, F> { using type = T; };
+template <bool B, typename T, typename F> using conditional_t = typename conditional<B, T, F>::type;
+
+// ========================================================================
+// 1. Non-type template parameter (NTTP)
+// ========================================================================
+
+// __int256 can be used as a non-type template parameter in C++20.
+template <__int256_t V>
+struct IntConstant {
+ static constexpr __int256_t value = V;
+};
+
+static_assert(IntConstant<0>::value == 0);
+static_assert(IntConstant<42>::value == 42);
+static_assert(IntConstant<-1>::value == -1);
+
+// Large NTTP value
+static_assert(IntConstant<((__int256_t)1 << 200)>::value == ((__int256_t)1 << 200));
+
+// Unsigned NTTP
+template <__uint256_t V>
+struct UIntConstant {
+ static constexpr __uint256_t value = V;
+};
+
+static_assert(UIntConstant<0>::value == 0);
+static_assert(UIntConstant<~(__uint256_t)0>::value == ~(__uint256_t)0);
+
+// ========================================================================
+// 2. SFINAE on __is_integral
+// ========================================================================
+
+// Clang builtin __is_integral works for __int256 types.
+static_assert(__is_integral(__int256_t));
+static_assert(__is_integral(__uint256_t));
+static_assert(__is_integral(const __int256_t));
+static_assert(__is_integral(volatile __uint256_t));
+
+// SFINAE: enable_if selects the correct overload.
+template <typename T, enable_if_t<__is_integral(T)>* = nullptr>
+constexpr int classify(T) { return 1; } // integral
+
+template <typename T, enable_if_t<__is_floating_point(T)>* = nullptr>
+constexpr int classify(T) { return 2; } // floating
+
+static_assert(classify((__int256_t)42) == 1);
+static_assert(classify((__uint256_t)42) == 1);
+static_assert(classify(3.14) == 2);
+
+// ========================================================================
+// 3. Builtin type traits for __int256
+// ========================================================================
+
+// __is_signed / __is_unsigned
+static_assert(__is_signed(__int256_t));
+static_assert(!__is_unsigned(__int256_t));
+static_assert(__is_unsigned(__uint256_t));
+static_assert(!__is_signed(__uint256_t));
+
+// __is_arithmetic
+static_assert(__is_arithmetic(__int256_t));
+static_assert(__is_arithmetic(__uint256_t));
+
+// __is_fundamental
+static_assert(__is_fundamental(__int256_t));
+static_assert(__is_fundamental(__uint256_t));
+
+// __is_scalar
+static_assert(__is_scalar(__int256_t));
+static_assert(__is_scalar(__uint256_t));
+
+// __is_trivially_copyable
+static_assert(__is_trivially_copyable(__int256_t));
+static_assert(__is_trivially_copyable(__uint256_t));
+
+// __is_standard_layout
+static_assert(__is_standard_layout(__int256_t));
+static_assert(__is_standard_layout(__uint256_t));
+
+// __is_trivially_constructible
+static_assert(__is_trivially_constructible(__int256_t));
+static_assert(__is_trivially_destructible(__int256_t));
+
+// __is_constructible from various integer types
+static_assert(__is_constructible(__int256_t, int));
+static_assert(__is_constructible(__int256_t, long long));
+static_assert(__is_constructible(__int256_t, __int128_t));
+static_assert(__is_constructible(__uint256_t, unsigned));
+static_assert(__is_constructible(__uint256_t, __uint128_t));
+
+// __is_convertible (implicit conversions)
+static_assert(__is_convertible_to(int, __int256_t));
+static_assert(__is_convertible_to(__int128_t, __int256_t));
+static_assert(__is_convertible_to(__int256_t, __int128_t));
+
+// ========================================================================
+// 4. Implicit conversions: __int128 <-> __int256
+// ========================================================================
+
+// __int128 -> __int256: implicit widening (no data loss)
+constexpr __int256_t widen_s(__int128_t x) { return x; }
+constexpr __uint256_t widen_u(__uint128_t x) { return x; }
+
+static_assert(widen_s(42) == 42);
+static_assert(widen_s(-1) == -1);
+static_assert(widen_u(42) == 42);
+
+// __int256 -> __int128: implicit narrowing (may lose data)
+constexpr __int128_t narrow_s(__int256_t x) { return x; }
+constexpr __uint128_t narrow_u(__uint256_t x) { return x; }
+
+static_assert(narrow_s(42) == 42);
+static_assert(narrow_u(42) == 42);
+
+// int -> __int256: implicit widening
+constexpr __int256_t from_int(int x) { return x; }
+static_assert(from_int(42) == 42);
+static_assert(from_int(-1) == -1);
+
+// ========================================================================
+// 5. Template argument deduction
+// ========================================================================
+
+template <typename T>
+constexpr T identity(T x) { return x; }
+
+static_assert(identity((__int256_t)42) == 42);
+static_assert(identity((__uint256_t)42) == 42);
+
+// Deduction with auto
+constexpr auto auto_val = (__int256_t)100;
+static_assert(is_same<decltype(auto_val), const __int256_t>::value);
+
+// ========================================================================
+// 6. constexpr template metaprogramming
+// ========================================================================
+
+// Recursive constexpr factorial
+template <typename T>
+constexpr T factorial(T n) {
+ return n <= 1 ? T(1) : n * factorial(n - 1);
+}
+
+// 20! = 2432902008176640000 (fits in 64-bit)
+static_assert(factorial((__int256_t)20) == 2432902008176640000LL);
+
+// 34! = 295232799039604140847618609643520000000 (doesn't fit in signed 128-bit)
+constexpr __int256_t fact34 = factorial((__int256_t)34);
+// Verify lower 64 bits (computed from 34! mod 2^64)
+static_assert((unsigned long long)fact34 == 0x445DA75B00000000ULL);
+
+// ========================================================================
+// 7. Variadic templates
+// ========================================================================
+
+template <typename... Ts>
+constexpr auto sum(Ts... args) {
+ return (args + ...);
+}
+
+static_assert(sum((__int256_t)1, (__int256_t)2, (__int256_t)3) == 6);
+
+// ========================================================================
+// 8. Conditional type selection
+// ========================================================================
+
+static_assert(sizeof(conditional_t<true, __int256_t, __int128_t>) == 32);
+static_assert(sizeof(conditional_t<false, __int256_t, __int128_t>) == 16);
+
+// ========================================================================
+// 9. Array and aggregate initialization
+// ========================================================================
+
+struct Pair256 {
+ __int256_t first;
+ __uint256_t second;
+};
+
+constexpr Pair256 p = {42, 100};
+static_assert(p.first == 42);
+static_assert(p.second == 100);
+
+constexpr __int256_t arr[] = {1, 2, 3, 4, 5};
+static_assert(arr[0] + arr[4] == 6);
+
+// ========================================================================
+// 10. sizeof / alignof
+// ========================================================================
+
+static_assert(sizeof(__int256_t) == 32);
+static_assert(sizeof(__uint256_t) == 32);
+static_assert(alignof(__int256_t) == 16);
+static_assert(alignof(__uint256_t) == 16);
+static_assert(sizeof(__int256_t) == 2 * sizeof(__int128_t));
diff --git a/clang/test/SemaCXX/int256-type-traits.cpp b/clang/test/SemaCXX/int256-type-traits.cpp
new file mode 100644
index 0000000000000..c756532306823
--- /dev/null
+++ b/clang/test/SemaCXX/int256-type-traits.cpp
@@ -0,0 +1,74 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64-unknown-linux-gnu -std=c++20 %s
+// expected-no-diagnostics
+
+// --- Type classification traits ---
+
+static_assert(__is_integral(__int256));
+static_assert(__is_integral(unsigned __int256));
+static_assert(__is_integral(__int256_t));
+static_assert(__is_integral(__uint256_t));
+
+static_assert(__is_arithmetic(__int256));
+static_assert(__is_arithmetic(unsigned __int256));
+
+static_assert(__is_scalar(__int256));
+static_assert(__is_scalar(unsigned __int256));
+
+static_assert(__is_fundamental(__int256));
+static_assert(__is_fundamental(unsigned __int256));
+
+// --- Signedness traits ---
+
+static_assert(__is_signed(__int256));
+static_assert(!__is_unsigned(__int256));
+static_assert(!__is_signed(unsigned __int256));
+static_assert(__is_unsigned(unsigned __int256));
+
+static_assert(__is_signed(__int256_t));
+static_assert(__is_unsigned(__uint256_t));
+
+// --- __builtin_is_implicit_lifetime ---
+
+static_assert(__builtin_is_implicit_lifetime(__int256));
+static_assert(__builtin_is_implicit_lifetime(unsigned __int256));
+
+// --- __make_signed / __make_unsigned ---
+
+static_assert(__is_same(__make_signed(__int256), __int256));
+static_assert(__is_same(__make_signed(unsigned __int256), __int256));
+static_assert(__is_same(__make_unsigned(__int256), unsigned __int256));
+static_assert(__is_same(__make_unsigned(unsigned __int256), unsigned __int256));
+
+// With cv-qualifiers
+static_assert(__is_same(__make_signed(const __int256), const __int256));
+static_assert(__is_same(__make_signed(volatile unsigned __int256), volatile __int256));
+static_assert(__is_same(__make_signed(const volatile unsigned __int256), const volatile __int256));
+static_assert(__is_same(__make_unsigned(const __int256), const unsigned __int256));
+static_assert(__is_same(__make_unsigned(volatile __int256), volatile unsigned __int256));
+
+// --- Enum with __int256 underlying type ---
+
+enum E256 : __int256_t { E256_Zero = 0, E256_One = 1 };
+enum U256 : __uint256_t { U256_Zero = 0, U256_One = 1 };
+
+static_assert(__is_same(__make_signed(E256), __int256_t));
+static_assert(__is_same(__make_unsigned(E256), __uint256_t));
+static_assert(__is_same(__make_signed(U256), __int256_t));
+static_assert(__is_same(__make_unsigned(U256), __uint256_t));
+
+// --- sizeof / alignof ---
+
+static_assert(sizeof(__int256) == 32);
+static_assert(alignof(__int256) == 16);
+static_assert(sizeof(unsigned __int256) == 32);
+static_assert(alignof(unsigned __int256) == 16);
+static_assert(sizeof(__int256_t) == 32);
+static_assert(sizeof(__uint256_t) == 32);
+
+// --- Overload resolution ---
+
+constexpr int select_overload(__int128) { return 128; }
+constexpr int select_overload(__int256_t) { return 256; }
+
+static_assert(select_overload((__int256_t)0) == 256);
+static_assert(select_overload((__int128)0) == 128);
diff --git a/clang/test/SemaCXX/overloaded-builtin-operators.cpp b/clang/test/SemaCXX/overloaded-builtin-operators.cpp
index 0c76df79e6e14..7243969896181 100644
--- a/clang/test/SemaCXX/overloaded-builtin-operators.cpp
+++ b/clang/test/SemaCXX/overloaded-builtin-operators.cpp
@@ -195,7 +195,7 @@ struct A {
void test_dr425(A a) {
(void)(1.0f * a); // expected-error{{ambiguous}} \
- // expected-note 12{{candidate}}
+ // expected-note 14{{candidate}}
}
// pr5432
@@ -238,7 +238,7 @@ namespace PR8477 {
// FIXME: It would be nice to report fewer candidates here.
(void)(foo - foo); // expected-error{{use of overloaded operator '-' is ambiguous}} \
// expected-note 4{{built-in candidate operator-}} \
- // expected-note{{142 candidates omitted}}
+ // expected-note{{194 candidates omitted}}
return foo[zero] == zero;
}
}
diff --git a/clang/test/SemaSYCL/int256.cpp b/clang/test/SemaSYCL/int256.cpp
new file mode 100644
index 0000000000000..a1516a0c6f05c
--- /dev/null
+++ b/clang/test/SemaSYCL/int256.cpp
@@ -0,0 +1,74 @@
+// RUN: %clang_cc1 -triple spir64 -aux-triple x86_64-unknown-linux-gnu \
+// RUN: -fsycl-is-device -verify -fsyntax-only %s
+
+// Verify that __int256 is rejected in SYCL device code on targets
+// that don't support it, mirroring the __int128 restriction test.
+
+typedef __uint256_t BIGTY;
+
+template <class T>
+class Z {
+public:
+  // expected-note@+1 {{'field' defined here}}
+ T field;
+  // expected-note@+1 2{{'field1' defined here}}
+ __int256 field1;
+};
+
+void host_ok(void) {
+ __int256 A;
+ int B = sizeof(__int256);
+ Z<__int256> C;
+ C.field1 = A;
+}
+
+void usage() {
+  // expected-note@+1 {{'A' defined here}}
+ __int256 A;
+ Z<__int256> C;
+  // expected-error@+3 2{{expression requires 256 bit size '__int256' type support, but target 'spir64' does not support it}}
+  // expected-error@+2 {{'A' requires 256 bit size '__int256' type support, but target 'spir64' does not support it}}
+  // expected-error@+1 {{'field1' requires 256 bit size '__int256' type support, but target 'spir64' does not support it}}
+ C.field1 = A;
+}
+
+template <typename Name, typename Func>
+__attribute__((sycl_kernel)) void kernel(Func kernelFunc) {
+  // expected-note@+1 2{{called by 'kernel}}
+ kernelFunc();
+}
+
+int main() {
+  // expected-note@+1 {{'CapturedToDevice' defined here}}
+ __int256 CapturedToDevice = 1;
+ host_ok();
+ kernel<class variables>([=]() {
+    // expected-error@+1 {{'CapturedToDevice' requires 256 bit size '__int256' type support, but target 'spir64' does not support it}}
+ auto C = CapturedToDevice;
+ Z<__int256> S;
+    // expected-error@+2 {{expression requires 256 bit size '__int256' type support, but target 'spir64' does not support it}}
+    // expected-error@+1 {{'field1' requires 256 bit size '__int256' type support, but target 'spir64' does not support it}}
+    S.field1 += 1;
+    // expected-error@+2 {{expression requires 256 bit size '__int256' type support, but target 'spir64' does not support it}}
+    // expected-error@+1 {{'field' requires 256 bit size '__int256' type support, but target 'spir64' does not support it}}
+ S.field = 1;
+ });
+
+ kernel<class functions>([=]() {
+    // expected-note@+1 {{called by 'operator()'}}
+ usage();
+ });
+
+ kernel<class ok>([=]() {
+ Z<__int256> S;
+ auto A = sizeof(CapturedToDevice);
+ });
+
+ return 0;
+}
+
+// no error expected for host-side functions
+BIGTY zoo(BIGTY h) {
+ h = 1;
+ return h;
+}
>From bdd7361adb4234bfb935704aaeb74641a01788a8 Mon Sep 17 00:00:00 2001
From: Xavier Roche <xavier.roche at algolia.com>
Date: Tue, 24 Feb 2026 21:38:46 +0100
Subject: [PATCH 04/17] [clang][docs] Add __int256 documentation and release
notes
Document the new __int256/__uint256 builtin type in LanguageExtensions.rst
with usage examples, target availability, and comparison to _BitInt(256).
Add release note entry for Clang 22.
Assisted-by: Claude (Anthropic)
Co-Authored-By: Claude Opus 4.6 <noreply at anthropic.com>
---
clang/docs/LanguageExtensions.rst | 101 +++++++++++++++++++++++++++++-
clang/docs/ReleaseNotes.rst | 5 ++
2 files changed, 104 insertions(+), 2 deletions(-)
diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index a3e487f910725..a1e058d959148 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -451,6 +451,101 @@ favor of the standard type.
Note: the ABI for ``_BitInt(N)`` is still in the process of being stabilized,
so this type should not yet be used in interfaces that require ABI stability.
+``__int256``
+------------
+
+Clang supports ``__int256`` as a builtin 256-bit integer type on targets that
+opt in (currently x86-64 and AArch64). It is the 256-bit analogue of
+``__int128`` — a first-class builtin type with full type trait integration,
+proper alignment, and register-based calling conventions.
+
+**Type spellings:**
+
+- ``__int256``, ``signed __int256``, ``__int256_t`` — signed 256-bit integer
+- ``unsigned __int256``, ``__uint256_t`` — unsigned 256-bit integer
+
+**Feature detection:**
+
+Use ``__SIZEOF_INT256__`` (defined as ``32`` when available) or
+``__is_target_feature("int256")`` for preprocessor-level detection:
+
+.. code-block:: c
+
+ #ifdef __SIZEOF_INT256__
+ // __int256 is available
+ #endif
+
+**Properties:**
+
+- Size: 32 bytes (256 bits)
+- Alignment: 16 bytes (matching ``__int128``)
+- ABI: register-based on x86-64 (arguments in GPRs, return via sret)
+- Integer rank: above ``__int128`` (correct implicit conversion rules)
+
+**Supported operations:**
+
+All standard integer operations work: arithmetic (``+``, ``-``, ``*``, ``/``,
+``%``), bitwise (``&``, ``|``, ``^``, ``~``, ``<<``, ``>>``), comparisons
+(``==``, ``!=``, ``<``, ``>``, ``<=``, ``>=``), and conversions to/from other
+integer and floating-point types.
+
+**Supported builtins:**
+
+- ``__builtin_popcountg``, ``__builtin_clzg``, ``__builtin_ctzg``
+- ``__builtin_add_overflow``, ``__builtin_sub_overflow``, ``__builtin_mul_overflow``
+- ``__builtin_ffs`` (expanded inline via ``cttz``)
+
+**C++ type traits:**
+
+In C++ mode, ``__int256`` is a full integral type:
+
+- ``__is_integral(__int256)`` is ``true``
+- ``__is_arithmetic``, ``__is_scalar``, ``__is_fundamental`` are ``true``
+- ``__is_signed(__int256)`` is ``true``; ``__is_unsigned(unsigned __int256)`` is ``true``
+- ``__make_signed(unsigned __int256)`` yields ``__int256``
+- ``__make_unsigned(__int256)`` yields ``unsigned __int256``
+- Enums with ``__int256_t`` or ``__uint256_t`` as underlying type are supported
+
+**Relationship to** ``_BitInt(256)``:
+
+Both ``__int256`` and ``_BitInt(256)`` produce identical ``i256`` IR operations,
+but they differ in ABI, alignment, and type trait behavior:
+
+.. list-table::
+ :header-rows: 1
+
+ * - Property
+ - ``__int256``
+ - ``_BitInt(256)``
+ * - Alignment
+ - 16 bytes
+ - 8 bytes
+ * - x86-64 SysV args
+ - Direct (4 GPRs)
+ - Indirect (byval)
+ * - x86-64 SysV return
+ - Indirect (sret)
+ - Indirect (sret)
+ * - AArch64 args
+ - Direct (4 GPRs: x0-x3)
+ - Indirect (byval)
+ * - AArch64 return
+ - Direct (4 GPRs: x0-x3)
+ - Indirect (sret)
+ * - Win64 ABI
+ - Indirect
+ - Indirect
+ * - ``__is_integral``
+ - ``true``
+ - ``false``
+ * - ``std::numeric_limits``
+ - Fully specialized
+ - Not specialized
+
+The ABI difference has measurable performance impact: the register-based
+calling convention avoids memory round-trips for ``__int256`` function
+arguments on x86-64 and both arguments and return values on AArch64.
+
C keywords supported in all language modes
------------------------------------------
@@ -4561,7 +4656,8 @@ argument can be of any unsigned integer type or fixed boolean vector.
``__builtin_popcountg`` is meant to be a type-generic alternative to the
``__builtin_popcount{,l,ll}`` builtins, with support for other integer types,
-such as ``unsigned __int128`` and C23 ``unsigned _BitInt(N)``.
+such as ``unsigned __int128``, ``unsigned __int256``, and C23
+``unsigned _BitInt(N)``.
``__builtin_clzg`` and ``__builtin_ctzg``
-----------------------------------------
@@ -4608,7 +4704,8 @@ only one argument is provided, then the behavior is undefined.
``__builtin_clzg`` (respectively ``__builtin_ctzg``) is meant to be a
type-generic alternative to the ``__builtin_clz{,l,ll}`` (respectively
``__builtin_ctz{,l,ll}``) builtins, with support for other integer types, such
-as ``unsigned __int128`` and C23 ``unsigned _BitInt(N)``.
+as ``unsigned __int128``, ``unsigned __int256``, and C23
+``unsigned _BitInt(N)``.
``__builtin_counted_by_ref``
----------------------------
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 668097236fe97..066de124e0451 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -145,6 +145,11 @@ C23 Feature Support
Non-comprehensive list of changes in this release
-------------------------------------------------
+- Added ``__int256`` and ``unsigned __int256`` as builtin extended integer types
+  on targets that support them (currently x86-64 and AArch64), analogous to
+  ``__int128``. These types provide native 256-bit integer arithmetic with
+  compiler-rt runtime support for arithmetic, division, shifts, and float conversions.
+
- Added ``__builtin_stdc_rotate_left`` and ``__builtin_stdc_rotate_right``
for bit rotation of unsigned integers including ``_BitInt`` types. Rotation
counts are normalized modulo the bit-width and support negative values.
>From 1c832b7711ed63bbfa6557628c76afc34cd0e32b Mon Sep 17 00:00:00 2001
From: Xavier Roche <xavier.roche at algolia.com>
Date: Tue, 24 Feb 2026 21:38:59 +0100
Subject: [PATCH 05/17] [llvm] Add i256 data layout, libcall routing, and
codegen support
- Add i256:256 alignment to all 64-bit target data layout strings
(X86-64, AArch64, RISC-V 64, PPC64, SystemZ, etc.)
- Register i256 division/modulo runtime libcalls (udivoi4, etc.) in
RuntimeLibcalls.td for X86-64 and AArch64
- Add i256 type legalization in X86 and AArch64 backends
(setOperationAction for div/rem/mul to LibCall)
- Add LegalizeDAG support for 256-bit libcall expansion
- Add llvm-libgcc version script entries for new builtins
Shifts are NOT registered as libcalls (uses default ExpandThroughStack)
to avoid sanitizer link failures.
Assisted-by: Claude (Anthropic)
Co-Authored-By: Claude Opus 4.6 <noreply at anthropic.com>
---
llvm-libgcc/gcc_s.ver.in | 10 +++
llvm/include/llvm/IR/RuntimeLibcalls.td | 70 +++++++++++++++++--
llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 69 +++++++++---------
llvm/lib/CodeGen/TargetLoweringBase.cpp | 54 ++++++++++++++
.../Target/AArch64/AArch64ISelLowering.cpp | 4 +-
llvm/lib/Target/X86/X86ISelLowering.cpp | 7 ++
llvm/lib/TargetParser/TargetDataLayout.cpp | 22 +++---
7 files changed, 185 insertions(+), 51 deletions(-)
diff --git a/llvm-libgcc/gcc_s.ver.in b/llvm-libgcc/gcc_s.ver.in
index e0bbf0e071553..22dc316fd0fb9 100644
--- a/llvm-libgcc/gcc_s.ver.in
+++ b/llvm-libgcc/gcc_s.ver.in
@@ -67,6 +67,16 @@ GCC_4.3.0 { __bswapdi2; __bswapsi2; __emutls_get_address; };
GCC_3.4.4 { __absvti2; __addvti3; __mulvti3; __negvti2; __subvti3; };
GCC_4.2.0 { __floatuntidf; __floatuntisf; };
GCC_7.0.0 { __divmodti4; };
+
+ // 256-bit integer builtins (compiler-rt only, requires __int256 support)
+ COMPILER_RT_256 {
+ __ashloi3; __ashroi3; __lshroi3; __multi5; __divoi3; __udivoi3; __modoi3;
+ __umodoi3; __negoi2; __cmpoi2; __ucmpoi2; __udivmodoi4; __divmodoi4;
+ __clzoi2; __ctzoi2; __popcountoi2; __parityoi2; __ffsoi2;
+ __absvoi2; __addvoi3; __subvoi3; __mulvoi3; __negvoi2; __muloi5;
+ __fixsfoi; __fixdfoi; __fixunssfoi; __fixunsdfoi;
+ __floatoisf; __floatoidf; __floatunoisf; __floatunoidf;
+ };
#endif
#if defined(GLOBAL_X86)
diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.td b/llvm/include/llvm/IR/RuntimeLibcalls.td
index e4a926d3cb1d3..e35fbbd41e54c 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.td
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.td
@@ -81,21 +81,21 @@ def ExceptionModelIsSjLj : RuntimeLibcallPredicate<
//--------------------------------------------------------------------
// Integer
-foreach IntTy = ["I16", "I32", "I64", "I128"] in {
+foreach IntTy = ["I16", "I32", "I64", "I128", "I256"] in {
def SHL_#IntTy : RuntimeLibcall;
def SRL_#IntTy : RuntimeLibcall;
def SRA_#IntTy : RuntimeLibcall;
}
-foreach IntTy = ["I8", "I16", "I32", "I64", "I128"] in {
+foreach IntTy = ["I8", "I16", "I32", "I64", "I128", "I256"] in {
def MUL_#IntTy : RuntimeLibcall;
}
-foreach IntTy = ["I32", "I64", "I128" ] in {
+foreach IntTy = ["I32", "I64", "I128", "I256"] in {
def MULO_#IntTy : RuntimeLibcall;
}
-foreach IntTy = ["I8", "I16", "I32", "I64", "I128"] in {
+foreach IntTy = ["I8", "I16", "I32", "I64", "I128", "I256"] in {
def SDIV_#IntTy : RuntimeLibcall;
def UDIV_#IntTy : RuntimeLibcall;
def SREM_#IntTy : RuntimeLibcall;
@@ -108,7 +108,7 @@ foreach IntTy = ["I32", "I64" ] in {
def NEG_#IntTy : RuntimeLibcall;
}
-foreach IntTy = ["I32", "I64", "I128"] in {
+foreach IntTy = ["I32", "I64", "I128", "I256"] in {
def CTLZ_#IntTy : RuntimeLibcall;
def CTPOP_#IntTy : RuntimeLibcall;
}
@@ -305,12 +305,16 @@ def FPTOSINT_F64_I128 : RuntimeLibcall;
def FPTOSINT_F80_I32 : RuntimeLibcall;
def FPTOSINT_F80_I64 : RuntimeLibcall;
def FPTOSINT_F80_I128 : RuntimeLibcall;
+def FPTOSINT_F80_I256 : RuntimeLibcall;
def FPTOSINT_F128_I32 : RuntimeLibcall;
def FPTOSINT_F128_I64 : RuntimeLibcall;
def FPTOSINT_F128_I128 : RuntimeLibcall;
def FPTOSINT_PPCF128_I32 : RuntimeLibcall;
def FPTOSINT_PPCF128_I64 : RuntimeLibcall;
def FPTOSINT_PPCF128_I128 : RuntimeLibcall;
+def FPTOSINT_F32_I256 : RuntimeLibcall;
+def FPTOSINT_F64_I256 : RuntimeLibcall;
+def FPTOSINT_F128_I256 : RuntimeLibcall;
def FPTOUINT_F16_I32 : RuntimeLibcall;
def FPTOUINT_F16_I64 : RuntimeLibcall;
def FPTOUINT_F16_I128 : RuntimeLibcall;
@@ -323,12 +327,16 @@ def FPTOUINT_F64_I128 : RuntimeLibcall;
def FPTOUINT_F80_I32 : RuntimeLibcall;
def FPTOUINT_F80_I64 : RuntimeLibcall;
def FPTOUINT_F80_I128 : RuntimeLibcall;
+def FPTOUINT_F80_I256 : RuntimeLibcall;
def FPTOUINT_F128_I32 : RuntimeLibcall;
def FPTOUINT_F128_I64 : RuntimeLibcall;
def FPTOUINT_F128_I128 : RuntimeLibcall;
def FPTOUINT_PPCF128_I32 : RuntimeLibcall;
def FPTOUINT_PPCF128_I64 : RuntimeLibcall;
def FPTOUINT_PPCF128_I128 : RuntimeLibcall;
+def FPTOUINT_F32_I256 : RuntimeLibcall;
+def FPTOUINT_F64_I256 : RuntimeLibcall;
+def FPTOUINT_F128_I256 : RuntimeLibcall;
def SINTTOFP_I32_F16 : RuntimeLibcall;
def SINTTOFP_I32_F32 : RuntimeLibcall;
def SINTTOFP_I32_F64 : RuntimeLibcall;
@@ -348,6 +356,10 @@ def SINTTOFP_I128_F64 : RuntimeLibcall;
def SINTTOFP_I128_F80 : RuntimeLibcall;
def SINTTOFP_I128_F128 : RuntimeLibcall;
def SINTTOFP_I128_PPCF128 : RuntimeLibcall;
+def SINTTOFP_I256_F32 : RuntimeLibcall;
+def SINTTOFP_I256_F64 : RuntimeLibcall;
+def SINTTOFP_I256_F80 : RuntimeLibcall;
+def SINTTOFP_I256_F128 : RuntimeLibcall;
def UINTTOFP_I32_F16 : RuntimeLibcall;
def UINTTOFP_I32_F32 : RuntimeLibcall;
def UINTTOFP_I32_F64 : RuntimeLibcall;
@@ -367,6 +379,10 @@ def UINTTOFP_I128_F64 : RuntimeLibcall;
def UINTTOFP_I128_F80 : RuntimeLibcall;
def UINTTOFP_I128_F128 : RuntimeLibcall;
def UINTTOFP_I128_PPCF128 : RuntimeLibcall;
+def UINTTOFP_I256_F32 : RuntimeLibcall;
+def UINTTOFP_I256_F64 : RuntimeLibcall;
+def UINTTOFP_I256_F80 : RuntimeLibcall;
+def UINTTOFP_I256_F128 : RuntimeLibcall;
def CONVERT_F128_PPCF128 : RuntimeLibcall;
def CONVERT_PPCF128_F128 : RuntimeLibcall;
@@ -926,24 +942,28 @@ def __divhi3 : RuntimeLibcallImpl<SDIV_I16>;
def __divsi3 : RuntimeLibcallImpl<SDIV_I32>;
def __divdi3 : RuntimeLibcallImpl<SDIV_I64>;
def __divti3 : RuntimeLibcallImpl<SDIV_I128>;
+def __divoi3 : RuntimeLibcallImpl<SDIV_I256>;
def __udivqi3 : RuntimeLibcallImpl<UDIV_I8>;
def __udivhi3 : RuntimeLibcallImpl<UDIV_I16>;
def __udivsi3 : RuntimeLibcallImpl<UDIV_I32>;
def __udivdi3 : RuntimeLibcallImpl<UDIV_I64>;
def __udivti3 : RuntimeLibcallImpl<UDIV_I128>;
+def __udivoi3 : RuntimeLibcallImpl<UDIV_I256>;
def __modqi3 : RuntimeLibcallImpl<SREM_I8>;
def __modhi3 : RuntimeLibcallImpl<SREM_I16>;
def __modsi3 : RuntimeLibcallImpl<SREM_I32>;
def __moddi3 : RuntimeLibcallImpl<SREM_I64>;
def __modti3 : RuntimeLibcallImpl<SREM_I128>;
+def __modoi3 : RuntimeLibcallImpl<SREM_I256>;
def __umodqi3 : RuntimeLibcallImpl<UREM_I8>;
def __umodhi3 : RuntimeLibcallImpl<UREM_I16>;
def __umodsi3 : RuntimeLibcallImpl<UREM_I32>;
def __umoddi3 : RuntimeLibcallImpl<UREM_I64>;
def __umodti3 : RuntimeLibcallImpl<UREM_I128>;
+def __umodoi3 : RuntimeLibcallImpl<UREM_I256>;
def __negsi2 : RuntimeLibcallImpl<NEG_I32>;
def __negdi2 : RuntimeLibcallImpl<NEG_I64>;
@@ -951,10 +971,12 @@ def __negdi2 : RuntimeLibcallImpl<NEG_I64>;
def __clzsi2 : RuntimeLibcallImpl<CTLZ_I32>;
def __clzdi2 : RuntimeLibcallImpl<CTLZ_I64>;
def __clzti2 : RuntimeLibcallImpl<CTLZ_I128>;
+def __clzoi2 : RuntimeLibcallImpl<CTLZ_I256>;
def __popcountsi2 : RuntimeLibcallImpl<CTPOP_I32>;
def __popcountdi2 : RuntimeLibcallImpl<CTPOP_I64>;
def __popcountti2 : RuntimeLibcallImpl<CTPOP_I128>;
+def __popcountoi2 : RuntimeLibcallImpl<CTPOP_I256>;
def __addsf3 : RuntimeLibcallImpl<ADD_F32>;
def __adddf3 : RuntimeLibcallImpl<ADD_F64>;
@@ -1023,15 +1045,19 @@ def __fixhfti : RuntimeLibcallImpl<FPTOSINT_F16_I128>;
def __fixsfsi : RuntimeLibcallImpl<FPTOSINT_F32_I32>;
def __fixsfdi : RuntimeLibcallImpl<FPTOSINT_F32_I64>;
def __fixsfti : RuntimeLibcallImpl<FPTOSINT_F32_I128>;
+def __fixsfoi : RuntimeLibcallImpl<FPTOSINT_F32_I256>;
def __fixdfsi : RuntimeLibcallImpl<FPTOSINT_F64_I32>;
def __fixdfdi : RuntimeLibcallImpl<FPTOSINT_F64_I64>;
def __fixdfti : RuntimeLibcallImpl<FPTOSINT_F64_I128>;
+def __fixdfoi : RuntimeLibcallImpl<FPTOSINT_F64_I256>;
def __fixxfsi : RuntimeLibcallImpl<FPTOSINT_F80_I32>;
def __fixxfdi : RuntimeLibcallImpl<FPTOSINT_F80_I64>;
def __fixxfti : RuntimeLibcallImpl<FPTOSINT_F80_I128>;
+def __fixxfoi : RuntimeLibcallImpl<FPTOSINT_F80_I256>;
def __fixtfsi : RuntimeLibcallImpl<FPTOSINT_F128_I32>;
def __fixtfdi_f128 : RuntimeLibcallImpl<FPTOSINT_F128_I64, "__fixtfdi">;
def __fixtfti_f128 : RuntimeLibcallImpl<FPTOSINT_F128_I128, "__fixtfti">;
+def __fixtfoi_f128 : RuntimeLibcallImpl<FPTOSINT_F128_I256, "__fixtfoi">;
def __gcc_qtou : RuntimeLibcallImpl<FPTOSINT_PPCF128_I32>;
def __fixtfdi_ppcf128 : RuntimeLibcallImpl<FPTOSINT_PPCF128_I64, "__fixtfdi">;
def __fixtfti_ppcf128 : RuntimeLibcallImpl<FPTOSINT_PPCF128_I128, "__fixtfti">;
@@ -1041,15 +1067,19 @@ def __fixunshfti : RuntimeLibcallImpl<FPTOUINT_F16_I128>;
def __fixunssfsi : RuntimeLibcallImpl<FPTOUINT_F32_I32>;
def __fixunssfdi : RuntimeLibcallImpl<FPTOUINT_F32_I64>;
def __fixunssfti : RuntimeLibcallImpl<FPTOUINT_F32_I128>;
+def __fixunssfoi : RuntimeLibcallImpl<FPTOUINT_F32_I256>;
def __fixunsdfsi : RuntimeLibcallImpl<FPTOUINT_F64_I32>;
def __fixunsdfdi : RuntimeLibcallImpl<FPTOUINT_F64_I64>;
def __fixunsdfti : RuntimeLibcallImpl<FPTOUINT_F64_I128>;
+def __fixunsdfoi : RuntimeLibcallImpl<FPTOUINT_F64_I256>;
def __fixunsxfsi : RuntimeLibcallImpl<FPTOUINT_F80_I32>;
def __fixunsxfdi : RuntimeLibcallImpl<FPTOUINT_F80_I64>;
def __fixunsxfti : RuntimeLibcallImpl<FPTOUINT_F80_I128>;
+def __fixunsxfoi : RuntimeLibcallImpl<FPTOUINT_F80_I256>;
def __fixunstfsi_f128 : RuntimeLibcallImpl<FPTOUINT_F128_I32, "__fixunstfsi">;
def __fixunstfdi_f128 : RuntimeLibcallImpl<FPTOUINT_F128_I64, "__fixunstfdi">;
def __fixunstfti_f128 : RuntimeLibcallImpl<FPTOUINT_F128_I128, "__fixunstfti">;
+def __fixunstfoi_f128 : RuntimeLibcallImpl<FPTOUINT_F128_I256, "__fixunstfoi">;
def __fixunstfsi_ppcf128 : RuntimeLibcallImpl<FPTOUINT_PPCF128_I32, "__fixunstfsi">;
def __fixunstfdi_ppcf128 : RuntimeLibcallImpl<FPTOUINT_PPCF128_I64, "__fixunstfdi">;
def __fixunstfti_ppcf128 : RuntimeLibcallImpl<FPTOUINT_PPCF128_I128, "__fixunstfti">;
@@ -1072,6 +1102,10 @@ def __floattidf : RuntimeLibcallImpl<SINTTOFP_I128_F64>;
def __floattixf : RuntimeLibcallImpl<SINTTOFP_I128_F80>;
def __floattitf_f128 : RuntimeLibcallImpl<SINTTOFP_I128_F128, "__floattitf">;
def __floattitf_ppcf128 : RuntimeLibcallImpl<SINTTOFP_I128_PPCF128, "__floattitf">;
+def __floatoisf : RuntimeLibcallImpl<SINTTOFP_I256_F32>;
+def __floatoidf : RuntimeLibcallImpl<SINTTOFP_I256_F64>;
+def __floatoixf : RuntimeLibcallImpl<SINTTOFP_I256_F80>;
+def __floatoitf_f128 : RuntimeLibcallImpl<SINTTOFP_I256_F128, "__floatoitf">;
def __floatunsihf : RuntimeLibcallImpl<UINTTOFP_I32_F16>;
def __floatunsisf : RuntimeLibcallImpl<UINTTOFP_I32_F32>;
def __floatunsidf : RuntimeLibcallImpl<UINTTOFP_I32_F64>;
@@ -1091,6 +1125,10 @@ def __floatuntidf : RuntimeLibcallImpl<UINTTOFP_I128_F64>;
def __floatuntixf : RuntimeLibcallImpl<UINTTOFP_I128_F80>;
def __floatuntitf_f128 : RuntimeLibcallImpl<UINTTOFP_I128_F128, "__floatuntitf">;
def __floatuntitf_ppcf128 : RuntimeLibcallImpl<UINTTOFP_I128_PPCF128, "__floatuntitf">;
+def __floatunoisf : RuntimeLibcallImpl<UINTTOFP_I256_F32>;
+def __floatunoidf : RuntimeLibcallImpl<UINTTOFP_I256_F64>;
+def __floatunoixf : RuntimeLibcallImpl<UINTTOFP_I256_F80>;
+def __floatunoitf_f128 : RuntimeLibcallImpl<UINTTOFP_I256_F128, "__floatunoitf">;
def __extendkftf2 : RuntimeLibcallImpl<CONVERT_F128_PPCF128>;
def __trunctfkf2 : RuntimeLibcallImpl<CONVERT_PPCF128_F128>;
@@ -1788,6 +1826,17 @@ defset list<RuntimeLibcallImpl> Int128RTLibcalls = {
def __multi3 : RuntimeLibcallImpl<MUL_I128>;
}
+defset list<RuntimeLibcallImpl> Int256RTLibcalls = {
+ // i256 libcalls are intentionally NOT registered. The backend expands i256
+ // operations inline (shifts via ExpandToParts, multiplication via
+ // forceExpandMultiply). Registering libcalls like __muloi3 causes an ABI
+ // mismatch on targets where __int256 uses indirect passing (sret/byval):
+ // the backend generates calls with split-scalar convention (4 x i64 in regs)
+ // while the compiled builtins expect indirect pointers. Additionally,
+ // registering libcalls causes link failures in sanitizer runtimes that embed
+ // UBSan but don't link against compiler-rt builtins.
+}
+
//--------------------------------------------------------------------
// compiler-rt only, not available by default
//--------------------------------------------------------------------
@@ -1800,6 +1849,10 @@ defset list<RuntimeLibcallImpl> CompilerRTOnlyInt128Libcalls = {
def __muloti4 : RuntimeLibcallImpl<MULO_I128>;
}
+defset list<RuntimeLibcallImpl> CompilerRTOnlyInt256Libcalls = {
+ def __muloi5 : RuntimeLibcallImpl<MULO_I256>;
+}
+
//--------------------------------------------------------------------
// Define implementation other libcalls
//--------------------------------------------------------------------
@@ -1976,7 +2029,9 @@ defvar DefaultRuntimeLibcallImpls_f128 =
defvar DefaultRuntimeLibcallImplsBaseList =
!listremove(
!listremove(
- !listremove(AllDefaultRuntimeLibcallImpls, Int128RTLibcalls),
+ !listremove(
+ !listremove(AllDefaultRuntimeLibcallImpls, Int128RTLibcalls),
+ Int256RTLibcalls),
DefaultRuntimeLibcallImpls_f80),
DefaultRuntimeLibcallImpls_ppcf128);
@@ -2170,6 +2225,7 @@ def AArch64SystemLibrary : SystemRuntimeLibrary<
LibmHasFrexpF128, LibmHasLdexpF128,
AArch64LibcallImpls,
LibcallImpls<(add Int128RTLibcalls), isAArch64_ILP64>,
+ Int256RTLibcalls,
LibcallImpls<(add bzero), isOSDarwin>,
DarwinExp10, DarwinSinCosStret, DarwinMemsetPattern,
MacOSUnlockedIO,
@@ -3419,7 +3475,7 @@ def X86_32SystemLibrary
def X86_64SystemLibrary
: SystemRuntimeLibrary<isX86_64,
- (add X86CommonLibcalls, Int128RTLibcalls)>;
+ (add X86CommonLibcalls, Int128RTLibcalls, Int256RTLibcalls)>;
//===----------------------------------------------------------------------===//
// XCore Runtime Libcalls
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index eb20e7982a102..817f98cec23b8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -153,7 +153,8 @@ class SelectionDAGLegalize {
SDValue ExpandIntLibCall(SDNode *Node, bool isSigned, RTLIB::Libcall Call_I8,
RTLIB::Libcall Call_I16, RTLIB::Libcall Call_I32,
- RTLIB::Libcall Call_I64, RTLIB::Libcall Call_I128);
+ RTLIB::Libcall Call_I64, RTLIB::Libcall Call_I128,
+ RTLIB::Libcall Call_I256);
void ExpandArgFPLibCall(SDNode *Node,
RTLIB::Libcall Call_F32, RTLIB::Libcall Call_F64,
RTLIB::Libcall Call_F80, RTLIB::Libcall Call_F128,
@@ -161,7 +162,8 @@ class SelectionDAGLegalize {
SmallVectorImpl<SDValue> &Results);
SDValue ExpandBitCountingLibCall(SDNode *Node, RTLIB::Libcall CallI32,
RTLIB::Libcall CallI64,
- RTLIB::Libcall CallI128);
+ RTLIB::Libcall CallI128,
+ RTLIB::Libcall CallI256);
void ExpandDivRemLibCall(SDNode *Node, SmallVectorImpl<SDValue> &Results);
SDValue ExpandSincosStretLibCall(SDNode *Node) const;
@@ -2269,12 +2271,10 @@ void SelectionDAGLegalize::ExpandFastFPLibCall(
ExpandFPLibCall(Node, LC, Results);
}
-SDValue SelectionDAGLegalize::ExpandIntLibCall(SDNode* Node, bool isSigned,
- RTLIB::Libcall Call_I8,
- RTLIB::Libcall Call_I16,
- RTLIB::Libcall Call_I32,
- RTLIB::Libcall Call_I64,
- RTLIB::Libcall Call_I128) {
+SDValue SelectionDAGLegalize::ExpandIntLibCall(
+ SDNode *Node, bool isSigned, RTLIB::Libcall Call_I8,
+ RTLIB::Libcall Call_I16, RTLIB::Libcall Call_I32, RTLIB::Libcall Call_I64,
+ RTLIB::Libcall Call_I128, RTLIB::Libcall Call_I256) {
RTLIB::Libcall LC;
switch (Node->getSimpleValueType(0).SimpleTy) {
default: llvm_unreachable("Unexpected request for libcall!");
@@ -2283,6 +2283,9 @@ SDValue SelectionDAGLegalize::ExpandIntLibCall(SDNode* Node, bool isSigned,
case MVT::i32: LC = Call_I32; break;
case MVT::i64: LC = Call_I64; break;
case MVT::i128: LC = Call_I128; break;
+ case MVT::i256:
+ LC = Call_I256;
+ break;
}
return ExpandLibCall(LC, Node, isSigned).first;
}
@@ -2305,7 +2308,7 @@ void SelectionDAGLegalize::ExpandArgFPLibCall(SDNode* Node,
SDValue SelectionDAGLegalize::ExpandBitCountingLibCall(
SDNode *Node, RTLIB::Libcall CallI32, RTLIB::Libcall CallI64,
- RTLIB::Libcall CallI128) {
+ RTLIB::Libcall CallI128, RTLIB::Libcall CallI256) {
RTLIB::Libcall LC;
switch (Node->getSimpleValueType(0).SimpleTy) {
default:
@@ -2319,6 +2322,9 @@ SDValue SelectionDAGLegalize::ExpandBitCountingLibCall(
case MVT::i128:
LC = CallI128;
break;
+ case MVT::i256:
+ LC = CallI256;
+ break;
}
// Bit-counting libcalls have one unsigned argument and return `int`.
@@ -5325,28 +5331,24 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
break;
}
case ISD::SREM:
- Results.push_back(ExpandIntLibCall(Node, true,
- RTLIB::SREM_I8,
- RTLIB::SREM_I16, RTLIB::SREM_I32,
- RTLIB::SREM_I64, RTLIB::SREM_I128));
+ Results.push_back(ExpandIntLibCall(
+ Node, true, RTLIB::SREM_I8, RTLIB::SREM_I16, RTLIB::SREM_I32,
+ RTLIB::SREM_I64, RTLIB::SREM_I128, RTLIB::SREM_I256));
break;
case ISD::UREM:
- Results.push_back(ExpandIntLibCall(Node, false,
- RTLIB::UREM_I8,
- RTLIB::UREM_I16, RTLIB::UREM_I32,
- RTLIB::UREM_I64, RTLIB::UREM_I128));
+ Results.push_back(ExpandIntLibCall(
+ Node, false, RTLIB::UREM_I8, RTLIB::UREM_I16, RTLIB::UREM_I32,
+ RTLIB::UREM_I64, RTLIB::UREM_I128, RTLIB::UREM_I256));
break;
case ISD::SDIV:
- Results.push_back(ExpandIntLibCall(Node, true,
- RTLIB::SDIV_I8,
- RTLIB::SDIV_I16, RTLIB::SDIV_I32,
- RTLIB::SDIV_I64, RTLIB::SDIV_I128));
+ Results.push_back(ExpandIntLibCall(
+ Node, true, RTLIB::SDIV_I8, RTLIB::SDIV_I16, RTLIB::SDIV_I32,
+ RTLIB::SDIV_I64, RTLIB::SDIV_I128, RTLIB::SDIV_I256));
break;
case ISD::UDIV:
- Results.push_back(ExpandIntLibCall(Node, false,
- RTLIB::UDIV_I8,
- RTLIB::UDIV_I16, RTLIB::UDIV_I32,
- RTLIB::UDIV_I64, RTLIB::UDIV_I128));
+ Results.push_back(ExpandIntLibCall(
+ Node, false, RTLIB::UDIV_I8, RTLIB::UDIV_I16, RTLIB::UDIV_I32,
+ RTLIB::UDIV_I64, RTLIB::UDIV_I128, RTLIB::UDIV_I256));
break;
case ISD::SDIVREM:
case ISD::UDIVREM:
@@ -5354,18 +5356,19 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
ExpandDivRemLibCall(Node, Results);
break;
case ISD::MUL:
- Results.push_back(ExpandIntLibCall(Node, false,
- RTLIB::MUL_I8,
- RTLIB::MUL_I16, RTLIB::MUL_I32,
- RTLIB::MUL_I64, RTLIB::MUL_I128));
+ Results.push_back(ExpandIntLibCall(
+ Node, false, RTLIB::MUL_I8, RTLIB::MUL_I16, RTLIB::MUL_I32,
+ RTLIB::MUL_I64, RTLIB::MUL_I128, RTLIB::MUL_I256));
break;
case ISD::CTLZ_ZERO_UNDEF:
- Results.push_back(ExpandBitCountingLibCall(
- Node, RTLIB::CTLZ_I32, RTLIB::CTLZ_I64, RTLIB::CTLZ_I128));
+ Results.push_back(
+ ExpandBitCountingLibCall(Node, RTLIB::CTLZ_I32, RTLIB::CTLZ_I64,
+ RTLIB::CTLZ_I128, RTLIB::CTLZ_I256));
break;
case ISD::CTPOP:
- Results.push_back(ExpandBitCountingLibCall(
- Node, RTLIB::CTPOP_I32, RTLIB::CTPOP_I64, RTLIB::CTPOP_I128));
+ Results.push_back(
+ ExpandBitCountingLibCall(Node, RTLIB::CTPOP_I32, RTLIB::CTPOP_I64,
+ RTLIB::CTPOP_I128, RTLIB::CTPOP_I256));
break;
case ISD::RESET_FPENV: {
// It is legalized to call 'fesetenv(FE_DFL_ENV)'. On most targets
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index cc5a4219536ac..355063a91ec40 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -130,6 +130,8 @@ LLVM_ABI RTLIB::Libcall RTLIB::getSHL(EVT VT) {
return RTLIB::SHL_I64;
if (VT == MVT::i128)
return RTLIB::SHL_I128;
+ if (VT == MVT::i256)
+ return RTLIB::SHL_I256;
return RTLIB::UNKNOWN_LIBCALL;
}
@@ -143,6 +145,8 @@ LLVM_ABI RTLIB::Libcall RTLIB::getSRL(EVT VT) {
return RTLIB::SRL_I64;
if (VT == MVT::i128)
return RTLIB::SRL_I128;
+ if (VT == MVT::i256)
+ return RTLIB::SRL_I256;
return RTLIB::UNKNOWN_LIBCALL;
}
@@ -156,6 +160,8 @@ LLVM_ABI RTLIB::Libcall RTLIB::getSRA(EVT VT) {
return RTLIB::SRA_I64;
if (VT == MVT::i128)
return RTLIB::SRA_I128;
+ if (VT == MVT::i256)
+ return RTLIB::SRA_I256;
return RTLIB::UNKNOWN_LIBCALL;
}
@@ -169,6 +175,8 @@ LLVM_ABI RTLIB::Libcall RTLIB::getMUL(EVT VT) {
return RTLIB::MUL_I64;
if (VT == MVT::i128)
return RTLIB::MUL_I128;
+ if (VT == MVT::i256)
+ return RTLIB::MUL_I256;
return RTLIB::UNKNOWN_LIBCALL;
}
@@ -179,6 +187,8 @@ LLVM_ABI RTLIB::Libcall RTLIB::getMULO(EVT VT) {
return RTLIB::MULO_I64;
if (VT == MVT::i128)
return RTLIB::MULO_I128;
+ if (VT == MVT::i256)
+ return RTLIB::MULO_I256;
return RTLIB::UNKNOWN_LIBCALL;
}
@@ -191,6 +201,8 @@ LLVM_ABI RTLIB::Libcall RTLIB::getSDIV(EVT VT) {
return RTLIB::SDIV_I64;
if (VT == MVT::i128)
return RTLIB::SDIV_I128;
+ if (VT == MVT::i256)
+ return RTLIB::SDIV_I256;
return RTLIB::UNKNOWN_LIBCALL;
}
@@ -203,6 +215,8 @@ LLVM_ABI RTLIB::Libcall RTLIB::getUDIV(EVT VT) {
return RTLIB::UDIV_I64;
if (VT == MVT::i128)
return RTLIB::UDIV_I128;
+ if (VT == MVT::i256)
+ return RTLIB::UDIV_I256;
return RTLIB::UNKNOWN_LIBCALL;
}
@@ -215,6 +229,8 @@ LLVM_ABI RTLIB::Libcall RTLIB::getSREM(EVT VT) {
return RTLIB::SREM_I64;
if (VT == MVT::i128)
return RTLIB::SREM_I128;
+ if (VT == MVT::i256)
+ return RTLIB::SREM_I256;
return RTLIB::UNKNOWN_LIBCALL;
}
@@ -227,6 +243,8 @@ LLVM_ABI RTLIB::Libcall RTLIB::getUREM(EVT VT) {
return RTLIB::UREM_I64;
if (VT == MVT::i128)
return RTLIB::UREM_I128;
+ if (VT == MVT::i256)
+ return RTLIB::UREM_I256;
return RTLIB::UNKNOWN_LIBCALL;
}
@@ -237,6 +255,8 @@ LLVM_ABI RTLIB::Libcall RTLIB::getCTPOP(EVT VT) {
return RTLIB::CTPOP_I64;
if (VT == MVT::i128)
return RTLIB::CTPOP_I128;
+ if (VT == MVT::i256)
+ return RTLIB::CTPOP_I256;
return RTLIB::UNKNOWN_LIBCALL;
}
@@ -356,6 +376,8 @@ RTLIB::Libcall RTLIB::getFPTOSINT(EVT OpVT, EVT RetVT) {
return FPTOSINT_F32_I64;
if (RetVT == MVT::i128)
return FPTOSINT_F32_I128;
+ if (RetVT == MVT::i256)
+ return FPTOSINT_F32_I256;
} else if (OpVT == MVT::f64) {
if (RetVT == MVT::i32)
return FPTOSINT_F64_I32;
@@ -363,6 +385,8 @@ RTLIB::Libcall RTLIB::getFPTOSINT(EVT OpVT, EVT RetVT) {
return FPTOSINT_F64_I64;
if (RetVT == MVT::i128)
return FPTOSINT_F64_I128;
+ if (RetVT == MVT::i256)
+ return FPTOSINT_F64_I256;
} else if (OpVT == MVT::f80) {
if (RetVT == MVT::i32)
return FPTOSINT_F80_I32;
@@ -370,6 +394,8 @@ RTLIB::Libcall RTLIB::getFPTOSINT(EVT OpVT, EVT RetVT) {
return FPTOSINT_F80_I64;
if (RetVT == MVT::i128)
return FPTOSINT_F80_I128;
+ if (RetVT == MVT::i256)
+ return FPTOSINT_F80_I256;
} else if (OpVT == MVT::f128) {
if (RetVT == MVT::i32)
return FPTOSINT_F128_I32;
@@ -377,6 +403,8 @@ RTLIB::Libcall RTLIB::getFPTOSINT(EVT OpVT, EVT RetVT) {
return FPTOSINT_F128_I64;
if (RetVT == MVT::i128)
return FPTOSINT_F128_I128;
+ if (RetVT == MVT::i256)
+ return FPTOSINT_F128_I256;
} else if (OpVT == MVT::ppcf128) {
if (RetVT == MVT::i32)
return FPTOSINT_PPCF128_I32;
@@ -405,6 +433,8 @@ RTLIB::Libcall RTLIB::getFPTOUINT(EVT OpVT, EVT RetVT) {
return FPTOUINT_F32_I64;
if (RetVT == MVT::i128)
return FPTOUINT_F32_I128;
+ if (RetVT == MVT::i256)
+ return FPTOUINT_F32_I256;
} else if (OpVT == MVT::f64) {
if (RetVT == MVT::i32)
return FPTOUINT_F64_I32;
@@ -412,6 +442,8 @@ RTLIB::Libcall RTLIB::getFPTOUINT(EVT OpVT, EVT RetVT) {
return FPTOUINT_F64_I64;
if (RetVT == MVT::i128)
return FPTOUINT_F64_I128;
+ if (RetVT == MVT::i256)
+ return FPTOUINT_F64_I256;
} else if (OpVT == MVT::f80) {
if (RetVT == MVT::i32)
return FPTOUINT_F80_I32;
@@ -419,6 +451,8 @@ RTLIB::Libcall RTLIB::getFPTOUINT(EVT OpVT, EVT RetVT) {
return FPTOUINT_F80_I64;
if (RetVT == MVT::i128)
return FPTOUINT_F80_I128;
+ if (RetVT == MVT::i256)
+ return FPTOUINT_F80_I256;
} else if (OpVT == MVT::f128) {
if (RetVT == MVT::i32)
return FPTOUINT_F128_I32;
@@ -426,6 +460,8 @@ RTLIB::Libcall RTLIB::getFPTOUINT(EVT OpVT, EVT RetVT) {
return FPTOUINT_F128_I64;
if (RetVT == MVT::i128)
return FPTOUINT_F128_I128;
+ if (RetVT == MVT::i256)
+ return FPTOUINT_F128_I256;
} else if (OpVT == MVT::ppcf128) {
if (RetVT == MVT::i32)
return FPTOUINT_PPCF128_I32;
@@ -481,6 +517,15 @@ RTLIB::Libcall RTLIB::getSINTTOFP(EVT OpVT, EVT RetVT) {
return SINTTOFP_I128_F128;
if (RetVT == MVT::ppcf128)
return SINTTOFP_I128_PPCF128;
+ } else if (OpVT == MVT::i256) {
+ if (RetVT == MVT::f32)
+ return SINTTOFP_I256_F32;
+ if (RetVT == MVT::f64)
+ return SINTTOFP_I256_F64;
+ if (RetVT == MVT::f80)
+ return SINTTOFP_I256_F80;
+ if (RetVT == MVT::f128)
+ return SINTTOFP_I256_F128;
}
return UNKNOWN_LIBCALL;
}
@@ -529,6 +574,15 @@ RTLIB::Libcall RTLIB::getUINTTOFP(EVT OpVT, EVT RetVT) {
return UINTTOFP_I128_F128;
if (RetVT == MVT::ppcf128)
return UINTTOFP_I128_PPCF128;
+ } else if (OpVT == MVT::i256) {
+ if (RetVT == MVT::f32)
+ return UINTTOFP_I256_F32;
+ if (RetVT == MVT::f64)
+ return UINTTOFP_I256_F64;
+ if (RetVT == MVT::f80)
+ return UINTTOFP_I256_F80;
+ if (RetVT == MVT::f128)
+ return UINTTOFP_I256_F128;
}
return UNKNOWN_LIBCALL;
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 01e07a70aaaf4..9cdbc4417e636 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1241,7 +1241,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setHasExtractBitsInsn(true);
- setMaxDivRemBitWidthSupported(128);
+ setMaxDivRemBitWidthSupported(256);
+
+ setMaxLargeFPConvertBitWidthSupported(256);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
if (Subtarget->hasSME())
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2188f6466682b..7f98d9f2bb14e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -173,6 +173,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
else
setMaxAtomicSizeInBitsSupported(32);
+ // Note: i256 div/rem and FP conversions are intentionally NOT routed to
+ // libcalls on x86-64. The x86-64 SysV ABI classifies __int256 as MEMORY
+ // (> 2 eightbytes), so the frontend uses indirect passing (sret/byval).
+ // Backend-generated libcalls pass i256 as a split scalar (4 x i64 in
+ // registers), creating an ABI mismatch with the compiled builtins.
+ // Instead, ExpandLargeDivRem and ExpandLargeFPConvert expand these
+ // operations at the IR level.
setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
setMaxLargeFPConvertBitWidthSupported(128);
diff --git a/llvm/lib/TargetParser/TargetDataLayout.cpp b/llvm/lib/TargetParser/TargetDataLayout.cpp
index b985c1eec4244..a0e9d6e6aea75 100644
--- a/llvm/lib/TargetParser/TargetDataLayout.cpp
+++ b/llvm/lib/TargetParser/TargetDataLayout.cpp
@@ -85,8 +85,8 @@ static std::string computeAArch64DataLayout(const Triple &TT) {
if (TT.getArch() == Triple::aarch64_32)
return "e-m:o-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-"
"n32:64-S128-Fn32";
- return "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-"
- "Fn32";
+ return "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-"
+ "n32:64-S128-Fn32";
}
if (TT.isOSBinFormatCOFF())
return "e-m:w-p270:32:32-p271:32:32-p272:64:64-p:64:64-i32:32-i64:64-i128:"
@@ -200,9 +200,10 @@ static std::string computeMipsDataLayout(const Triple &TT, StringRef ABIName) {
// 32 bit registers are always available and the stack is at least 64 bit
// aligned. On N64 64 bit registers are also available and the stack is
// 128 bit aligned.
- if (ABI == MipsABI::N64 || ABI == MipsABI::N32)
- Ret += "-i128:128-n32:64-S128";
- else
+ if (ABI == MipsABI::N64 || ABI == MipsABI::N32) {
+ Ret += "-i128:128";
+ Ret += "-n32:64-S128";
+ } else
Ret += "-n32-S64";
return Ret;
@@ -242,9 +243,10 @@ static std::string computePowerDataLayout(const Triple &T, StringRef ABIName) {
Ret += "-i64:64";
// PPC64 has 32 and 64 bit registers, PPC32 has only 32 bit ones.
- if (is64Bit)
- Ret += "-i128:128-n32:64";
- else
+ if (is64Bit) {
+ Ret += "-i128:128";
+ Ret += "-n32:64";
+ } else
Ret += "-n32";
// The ABI alignment for doubles on AIX is 4 bytes.
@@ -410,9 +412,9 @@ static std::string computeX86DataLayout(const Triple &TT) {
// Some ABIs align 64 bit integers and doubles to 64 bits, others to 32.
// 128 bit integers are not specified in the 32-bit ABIs but are used
// internally for lowering f128, so we match the alignment to that.
- if (Is64Bit || TT.isOSWindows())
+ if (Is64Bit || TT.isOSWindows()) {
Ret += "-i64:64-i128:128";
- else if (TT.isOSIAMCU())
+ } else if (TT.isOSIAMCU())
Ret += "-i64:32-f64:32";
else
Ret += "-i128:128-f64:32:64";
>From 7e1a66ca5b3845c81262a2a929b04004fa1b3e0f Mon Sep 17 00:00:00 2001
From: Xavier Roche <xavier.roche at algolia.com>
Date: Tue, 24 Feb 2026 21:39:26 +0100
Subject: [PATCH 06/17] [llvm][test] Add and update i256 codegen tests
Add new i256-specific tests and regenerate affected existing tests:
- AArch64: bitcount (NEON/SVE/CSSC), comparisons, division, multiply,
shifts, wide-scalar shift legalization, GlobalISel multiway splits
- X86: comparisons, division, multiply, shifts (i128/i256/i512),
div-rem recomposition, overflow multiply, APX i1024 multiply,
expand-large-fp (fptosi/fptoui/sitofp/uitofp for i129+), various
regressions (dagcombine-cse, scheduler-backtracking, pr38539)
- RISC-V: i256 arithmetic (add, sub, mul, div, shifts)
Existing tests regenerated with update_llc_test_checks.py to reflect
i256 data layout alignment changes.
Assisted-by: Claude (Anthropic)
Co-Authored-By: Claude Opus 4.6 <noreply at anthropic.com>
---
.../GlobalISel/split-wide-shifts-multiway.ll | 210 +-
llvm/test/CodeGen/AArch64/div-i256.ll | 1091 +---
llvm/test/CodeGen/AArch64/shift-i256.ll | 29 +-
...lar-shift-by-byte-multiple-legalization.ll | 102 +-
.../AArch64/wide-scalar-shift-legalization.ll | 98 +-
llvm/test/CodeGen/RISCV/i256-arith.ll | 1442 +++++
llvm/test/CodeGen/X86/apx/mul-i1024.ll | 3014 ++++-----
llvm/test/CodeGen/X86/bittest-big-integer.ll | 2 +-
llvm/test/CodeGen/X86/cmp-i256.ll | 450 ++
llvm/test/CodeGen/X86/dagcombine-cse.ll | 38 +-
llvm/test/CodeGen/X86/div-i256.ll | 5475 +++++++++++++++++
.../CodeGen/X86/expand-large-fp-optnone.ll | 240 +-
llvm/test/CodeGen/X86/fp-i129.ll | 96 +-
llvm/test/CodeGen/X86/i128-sdiv.ll | 327 +-
llvm/test/CodeGen/X86/memfold-mov32r0.ll | 4 +-
llvm/test/CodeGen/X86/mul-i1024.ll | 1686 ++---
llvm/test/CodeGen/X86/mul-i512.ll | 392 +-
.../CodeGen/X86/scheduler-backtracking.ll | 168 +-
.../CodeGen/X86/shift-i256-narrow-amount.ll | 382 ++
llvm/test/CodeGen/X86/shift-i256.ll | 122 +-
llvm/test/CodeGen/X86/shift-i512.ll | 2736 ++++----
llvm/test/CodeGen/X86/smul-with-overflow.ll | 218 +-
.../X86/smulo-128-legalisation-lowering.ll | 290 +-
llvm/test/CodeGen/X86/udivmodei5.ll | 4968 ++++++++++++++-
llvm/test/CodeGen/X86/umul-with-overflow.ll | 186 +-
...lar-shift-by-byte-multiple-legalization.ll | 4510 ++++++++------
.../X86/wide-scalar-shift-legalization.ll | 1066 ++--
...ad-of-small-alloca-with-zero-upper-half.ll | 778 ++-
.../CodeGen/X86/widen-load-of-small-alloca.ll | 222 +-
.../X86/expand-large-fp-convert-fptosi129.ll | 186 +-
.../X86/expand-large-fp-convert-fptoui129.ll | 162 +-
.../X86/expand-large-fp-convert-si129tofp.ll | 553 +-
.../X86/expand-large-fp-convert-ui129tofp.ll | 553 +-
.../X86/expand-large-fp-optnone.ll | 84 +-
.../Transforms/ExpandIRInsts/X86/sdiv129.ll | 4 +-
.../Transforms/ExpandIRInsts/X86/srem129.ll | 4 +-
.../Transforms/ExpandIRInsts/X86/udiv129.ll | 4 +-
.../Transforms/ExpandIRInsts/X86/urem129.ll | 4 +-
38 files changed, 21229 insertions(+), 10667 deletions(-)
create mode 100644 llvm/test/CodeGen/RISCV/i256-arith.ll
create mode 100644 llvm/test/CodeGen/X86/cmp-i256.ll
create mode 100644 llvm/test/CodeGen/X86/div-i256.ll
create mode 100644 llvm/test/CodeGen/X86/shift-i256-narrow-amount.ll
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll b/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll
index d669c49cb019b..e477a78d546e9 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll
@@ -7,8 +7,13 @@ target triple = "arm64-apple-macosx14.0.0"
define void @test_shl_i512(ptr %result, ptr %input, i32 %shift) {
; SDAG-LABEL: test_shl_i512:
; SDAG: ; %bb.0: ; %entry
-; SDAG-NEXT: sub sp, sp, #128
-; SDAG-NEXT: .cfi_def_cfa_offset 128
+; SDAG-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; SDAG-NEXT: sub x9, sp, #144
+; SDAG-NEXT: mov x29, sp
+; SDAG-NEXT: and sp, x9, #0xffffffffffffffe0
+; SDAG-NEXT: .cfi_def_cfa w29, 16
+; SDAG-NEXT: .cfi_offset w30, -8
+; SDAG-NEXT: .cfi_offset w29, -16
; SDAG-NEXT: ldp x9, x8, [x1, #48]
; SDAG-NEXT: movi.2d v0, #0000000000000000
; SDAG-NEXT: ldp q1, q2, [x1]
@@ -64,7 +69,8 @@ define void @test_shl_i512(ptr %result, ptr %input, i32 %shift) {
; SDAG-NEXT: orr x9, x11, x16
; SDAG-NEXT: stp x13, x10, [x0, #16]
; SDAG-NEXT: stp x8, x9, [x0]
-; SDAG-NEXT: add sp, sp, #128
+; SDAG-NEXT: mov sp, x29
+; SDAG-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
; SDAG-NEXT: ret
;
; GISEL-LABEL: test_shl_i512:
@@ -360,8 +366,13 @@ entry:
define void @test_lshr_i512(ptr %result, ptr %input, i32 %shift) {
; SDAG-LABEL: test_lshr_i512:
; SDAG: ; %bb.0: ; %entry
-; SDAG-NEXT: sub sp, sp, #128
-; SDAG-NEXT: .cfi_def_cfa_offset 128
+; SDAG-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; SDAG-NEXT: sub x9, sp, #144
+; SDAG-NEXT: mov x29, sp
+; SDAG-NEXT: and sp, x9, #0xffffffffffffffe0
+; SDAG-NEXT: .cfi_def_cfa w29, 16
+; SDAG-NEXT: .cfi_offset w30, -8
+; SDAG-NEXT: .cfi_offset w29, -16
; SDAG-NEXT: ldp x9, x8, [x1, #48]
; SDAG-NEXT: movi.2d v0, #0000000000000000
; SDAG-NEXT: ldp q1, q2, [x1]
@@ -416,7 +427,8 @@ define void @test_lshr_i512(ptr %result, ptr %input, i32 %shift) {
; SDAG-NEXT: orr x9, x12, x10
; SDAG-NEXT: orr x8, x8, x15
; SDAG-NEXT: stp x9, x8, [x0]
-; SDAG-NEXT: add sp, sp, #128
+; SDAG-NEXT: mov sp, x29
+; SDAG-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
; SDAG-NEXT: ret
;
; GISEL-LABEL: test_lshr_i512:
@@ -706,8 +718,13 @@ entry:
define void @test_ashr_i512(ptr %result, ptr %input, i32 %shift) {
; SDAG-LABEL: test_ashr_i512:
; SDAG: ; %bb.0: ; %entry
-; SDAG-NEXT: sub sp, sp, #128
-; SDAG-NEXT: .cfi_def_cfa_offset 128
+; SDAG-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; SDAG-NEXT: sub x9, sp, #144
+; SDAG-NEXT: mov x29, sp
+; SDAG-NEXT: and sp, x9, #0xffffffffffffffe0
+; SDAG-NEXT: .cfi_def_cfa w29, 16
+; SDAG-NEXT: .cfi_offset w30, -8
+; SDAG-NEXT: .cfi_offset w29, -16
; SDAG-NEXT: ldp x9, x8, [x1, #48]
; SDAG-NEXT: mov x11, sp
; SDAG-NEXT: ldp q0, q1, [x1]
@@ -764,7 +781,8 @@ define void @test_ashr_i512(ptr %result, ptr %input, i32 %shift) {
; SDAG-NEXT: orr x9, x12, x10
; SDAG-NEXT: orr x8, x8, x15
; SDAG-NEXT: stp x9, x8, [x0]
-; SDAG-NEXT: add sp, sp, #128
+; SDAG-NEXT: mov sp, x29
+; SDAG-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
; SDAG-NEXT: ret
;
; GISEL-LABEL: test_ashr_i512:
@@ -1086,14 +1104,16 @@ entry:
define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
; SDAG-LABEL: test_shl_i1024:
; SDAG: ; %bb.0: ; %entry
-; SDAG-NEXT: sub sp, sp, #352
-; SDAG-NEXT: stp x28, x27, [sp, #256] ; 16-byte Folded Spill
-; SDAG-NEXT: stp x26, x25, [sp, #272] ; 16-byte Folded Spill
-; SDAG-NEXT: stp x24, x23, [sp, #288] ; 16-byte Folded Spill
-; SDAG-NEXT: stp x22, x21, [sp, #304] ; 16-byte Folded Spill
-; SDAG-NEXT: stp x20, x19, [sp, #320] ; 16-byte Folded Spill
-; SDAG-NEXT: stp x29, x30, [sp, #336] ; 16-byte Folded Spill
-; SDAG-NEXT: .cfi_def_cfa_offset 352
+; SDAG-NEXT: stp x28, x27, [sp, #-96]! ; 16-byte Folded Spill
+; SDAG-NEXT: sub x9, sp, #256
+; SDAG-NEXT: stp x26, x25, [sp, #16] ; 16-byte Folded Spill
+; SDAG-NEXT: stp x24, x23, [sp, #32] ; 16-byte Folded Spill
+; SDAG-NEXT: stp x22, x21, [sp, #48] ; 16-byte Folded Spill
+; SDAG-NEXT: stp x20, x19, [sp, #64] ; 16-byte Folded Spill
+; SDAG-NEXT: stp x29, x30, [sp, #80] ; 16-byte Folded Spill
+; SDAG-NEXT: add x29, sp, #80
+; SDAG-NEXT: and sp, x9, #0xffffffffffffffe0
+; SDAG-NEXT: .cfi_def_cfa w29, 16
; SDAG-NEXT: .cfi_offset w30, -8
; SDAG-NEXT: .cfi_offset w29, -16
; SDAG-NEXT: .cfi_offset w19, -24
@@ -1120,7 +1140,6 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
; SDAG-NEXT: lsr x9, x8, #3
; SDAG-NEXT: stp q0, q0, [sp]
; SDAG-NEXT: stp q0, q0, [sp, #32]
-; SDAG-NEXT: ldp x29, x30, [sp, #336] ; 16-byte Folded Reload
; SDAG-NEXT: and x9, x9, #0x78
; SDAG-NEXT: stp q0, q0, [sp, #64]
; SDAG-NEXT: stp q0, q0, [sp, #96]
@@ -1161,33 +1180,33 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
; SDAG-NEXT: lsr x3, x3, x4
; SDAG-NEXT: lsr x5, x5, x4
; SDAG-NEXT: lsr x7, x7, x4
-; SDAG-NEXT: lsr x22, x22, #1
; SDAG-NEXT: lsr x25, x25, x4
; SDAG-NEXT: lsr x4, x28, x4
; SDAG-NEXT: orr x1, x1, x20
; SDAG-NEXT: lsl x20, x23, x8
; SDAG-NEXT: lsl x23, x24, x8
+; SDAG-NEXT: lsr x22, x22, #1
; SDAG-NEXT: lsr x28, x26, #1
-; SDAG-NEXT: lsr x22, x22, x10
; SDAG-NEXT: lsl x24, x27, x8
-; SDAG-NEXT: orr x4, x23, x4
; SDAG-NEXT: lsl x6, x6, x8
-; SDAG-NEXT: lsl x2, x2, x8
+; SDAG-NEXT: orr x4, x23, x4
+; SDAG-NEXT: lsr x22, x22, x10
+; SDAG-NEXT: lsl x13, x13, x8
; SDAG-NEXT: lsr x27, x28, x10
; SDAG-NEXT: stp x4, x1, [x0, #112]
; SDAG-NEXT: lsl x1, x26, x8
; SDAG-NEXT: orr x20, x20, x22
; SDAG-NEXT: lsr x4, x9, #1
-; SDAG-NEXT: lsl x13, x13, x8
+; SDAG-NEXT: lsl x14, x14, x8
; SDAG-NEXT: orr x22, x24, x27
; SDAG-NEXT: orr x1, x1, x25
; SDAG-NEXT: stp x21, x20, [x0, #80]
-; SDAG-NEXT: lsr x20, x17, #1
; SDAG-NEXT: stp x1, x22, [x0, #96]
; SDAG-NEXT: lsr x1, x11, #1
+; SDAG-NEXT: lsr x20, x17, #1
; SDAG-NEXT: lsr x21, x12, #1
-; SDAG-NEXT: lsl x14, x14, x8
; SDAG-NEXT: lsl x15, x15, x8
+; SDAG-NEXT: lsl x2, x2, x8
; SDAG-NEXT: lsr x20, x20, x10
; SDAG-NEXT: lsl x17, x17, x8
; SDAG-NEXT: orr x6, x6, x7
@@ -1196,25 +1215,26 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
; SDAG-NEXT: lsr x1, x1, x10
; SDAG-NEXT: lsl x11, x11, x8
; SDAG-NEXT: lsr x10, x4, x10
-; SDAG-NEXT: stp x6, x19, [x0, #64]
-; SDAG-NEXT: orr x2, x2, x20
; SDAG-NEXT: lsl x8, x9, x8
+; SDAG-NEXT: orr x2, x2, x20
; SDAG-NEXT: orr x17, x17, x5
-; SDAG-NEXT: ldp x20, x19, [sp, #320] ; 16-byte Folded Reload
; SDAG-NEXT: orr x15, x15, x7
-; SDAG-NEXT: ldp x22, x21, [sp, #304] ; 16-byte Folded Reload
; SDAG-NEXT: orr x12, x12, x3
-; SDAG-NEXT: ldp x24, x23, [sp, #288] ; 16-byte Folded Reload
; SDAG-NEXT: orr x14, x14, x1
-; SDAG-NEXT: ldp x26, x25, [sp, #272] ; 16-byte Folded Reload
; SDAG-NEXT: orr x11, x11, x16
-; SDAG-NEXT: ldp x28, x27, [sp, #256] ; 16-byte Folded Reload
; SDAG-NEXT: orr x9, x13, x10
+; SDAG-NEXT: stp x6, x19, [x0, #64]
; SDAG-NEXT: stp x17, x2, [x0, #48]
; SDAG-NEXT: stp x12, x15, [x0, #32]
; SDAG-NEXT: stp x11, x14, [x0, #16]
; SDAG-NEXT: stp x8, x9, [x0]
-; SDAG-NEXT: add sp, sp, #352
+; SDAG-NEXT: sub sp, x29, #80
+; SDAG-NEXT: ldp x29, x30, [sp, #80] ; 16-byte Folded Reload
+; SDAG-NEXT: ldp x20, x19, [sp, #64] ; 16-byte Folded Reload
+; SDAG-NEXT: ldp x22, x21, [sp, #48] ; 16-byte Folded Reload
+; SDAG-NEXT: ldp x24, x23, [sp, #32] ; 16-byte Folded Reload
+; SDAG-NEXT: ldp x26, x25, [sp, #16] ; 16-byte Folded Reload
+; SDAG-NEXT: ldp x28, x27, [sp], #96 ; 16-byte Folded Reload
; SDAG-NEXT: ret
;
; GISEL-LABEL: test_shl_i1024:
@@ -2354,23 +2374,28 @@ entry:
define void @test_lshr_i1024(ptr %result, ptr %input, i32 %shift) {
; SDAG-LABEL: test_lshr_i1024:
; SDAG: ; %bb.0: ; %entry
-; SDAG-NEXT: sub sp, sp, #336
-; SDAG-NEXT: stp x28, x27, [sp, #256] ; 16-byte Folded Spill
-; SDAG-NEXT: stp x26, x25, [sp, #272] ; 16-byte Folded Spill
-; SDAG-NEXT: stp x24, x23, [sp, #288] ; 16-byte Folded Spill
-; SDAG-NEXT: stp x22, x21, [sp, #304] ; 16-byte Folded Spill
-; SDAG-NEXT: stp x20, x19, [sp, #320] ; 16-byte Folded Spill
-; SDAG-NEXT: .cfi_def_cfa_offset 336
-; SDAG-NEXT: .cfi_offset w19, -8
-; SDAG-NEXT: .cfi_offset w20, -16
-; SDAG-NEXT: .cfi_offset w21, -24
-; SDAG-NEXT: .cfi_offset w22, -32
-; SDAG-NEXT: .cfi_offset w23, -40
-; SDAG-NEXT: .cfi_offset w24, -48
-; SDAG-NEXT: .cfi_offset w25, -56
-; SDAG-NEXT: .cfi_offset w26, -64
-; SDAG-NEXT: .cfi_offset w27, -72
-; SDAG-NEXT: .cfi_offset w28, -80
+; SDAG-NEXT: stp x28, x27, [sp, #-96]! ; 16-byte Folded Spill
+; SDAG-NEXT: sub x9, sp, #256
+; SDAG-NEXT: stp x26, x25, [sp, #16] ; 16-byte Folded Spill
+; SDAG-NEXT: stp x24, x23, [sp, #32] ; 16-byte Folded Spill
+; SDAG-NEXT: stp x22, x21, [sp, #48] ; 16-byte Folded Spill
+; SDAG-NEXT: stp x20, x19, [sp, #64] ; 16-byte Folded Spill
+; SDAG-NEXT: stp x29, x30, [sp, #80] ; 16-byte Folded Spill
+; SDAG-NEXT: add x29, sp, #80
+; SDAG-NEXT: and sp, x9, #0xffffffffffffffe0
+; SDAG-NEXT: .cfi_def_cfa w29, 16
+; SDAG-NEXT: .cfi_offset w30, -8
+; SDAG-NEXT: .cfi_offset w29, -16
+; SDAG-NEXT: .cfi_offset w19, -24
+; SDAG-NEXT: .cfi_offset w20, -32
+; SDAG-NEXT: .cfi_offset w21, -40
+; SDAG-NEXT: .cfi_offset w22, -48
+; SDAG-NEXT: .cfi_offset w23, -56
+; SDAG-NEXT: .cfi_offset w24, -64
+; SDAG-NEXT: .cfi_offset w25, -72
+; SDAG-NEXT: .cfi_offset w26, -80
+; SDAG-NEXT: .cfi_offset w27, -88
+; SDAG-NEXT: .cfi_offset w28, -96
; SDAG-NEXT: ldp x8, x9, [x1, #112]
; SDAG-NEXT: movi.2d v0, #0000000000000000
; SDAG-NEXT: ldp q1, q2, [x1]
@@ -2435,7 +2460,6 @@ define void @test_lshr_i1024(ptr %result, ptr %input, i32 %shift) {
; SDAG-NEXT: lsl x1, x27, x1
; SDAG-NEXT: lsl x23, x23, x15
; SDAG-NEXT: orr x5, x22, x5
-; SDAG-NEXT: ldp x28, x27, [sp, #256] ; 16-byte Folded Reload
; SDAG-NEXT: orr x19, x25, x19
; SDAG-NEXT: lsl x25, x26, #1
; SDAG-NEXT: orr x20, x23, x20
@@ -2452,32 +2476,34 @@ define void @test_lshr_i1024(ptr %result, ptr %input, i32 %shift) {
; SDAG-NEXT: lsl x1, x9, #1
; SDAG-NEXT: stp x7, x26, [x0, #112]
; SDAG-NEXT: lsl x7, x10, #1
+; SDAG-NEXT: lsl x5, x12, #1
; SDAG-NEXT: orr x3, x3, x21
; SDAG-NEXT: orr x13, x19, x13
-; SDAG-NEXT: lsl x5, x12, #1
; SDAG-NEXT: lsr x9, x9, x8
; SDAG-NEXT: stp x13, x3, [x0, #48]
; SDAG-NEXT: lsl x13, x1, x15
; SDAG-NEXT: lsr x23, x23, x8
+; SDAG-NEXT: stp x4, x6, [x0, #64]
; SDAG-NEXT: lsr x12, x12, x8
+; SDAG-NEXT: lsl x4, x5, x15
; SDAG-NEXT: lsr x8, x10, x8
; SDAG-NEXT: lsl x10, x7, x15
-; SDAG-NEXT: stp x4, x6, [x0, #64]
-; SDAG-NEXT: lsl x4, x5, x15
; SDAG-NEXT: orr x9, x9, x17
; SDAG-NEXT: orr x11, x13, x11
-; SDAG-NEXT: ldp x20, x19, [sp, #320] ; 16-byte Folded Reload
-; SDAG-NEXT: stp x11, x9, [x0, #16]
-; SDAG-NEXT: orr x9, x10, x23
; SDAG-NEXT: orr x12, x12, x2
-; SDAG-NEXT: ldp x22, x21, [sp, #304] ; 16-byte Folded Reload
; SDAG-NEXT: orr x16, x4, x16
-; SDAG-NEXT: ldp x24, x23, [sp, #288] ; 16-byte Folded Reload
+; SDAG-NEXT: stp x11, x9, [x0, #16]
; SDAG-NEXT: orr x8, x8, x14
-; SDAG-NEXT: ldp x26, x25, [sp, #272] ; 16-byte Folded Reload
+; SDAG-NEXT: orr x9, x10, x23
; SDAG-NEXT: stp x16, x12, [x0, #32]
; SDAG-NEXT: stp x9, x8, [x0]
-; SDAG-NEXT: add sp, sp, #336
+; SDAG-NEXT: sub sp, x29, #80
+; SDAG-NEXT: ldp x29, x30, [sp, #80] ; 16-byte Folded Reload
+; SDAG-NEXT: ldp x20, x19, [sp, #64] ; 16-byte Folded Reload
+; SDAG-NEXT: ldp x22, x21, [sp, #48] ; 16-byte Folded Reload
+; SDAG-NEXT: ldp x24, x23, [sp, #32] ; 16-byte Folded Reload
+; SDAG-NEXT: ldp x26, x25, [sp, #16] ; 16-byte Folded Reload
+; SDAG-NEXT: ldp x28, x27, [sp], #96 ; 16-byte Folded Reload
; SDAG-NEXT: ret
;
; GISEL-LABEL: test_lshr_i1024:
@@ -3574,23 +3600,28 @@ entry:
define void @test_ashr_i1024(ptr %result, ptr %input, i32 %shift) {
; SDAG-LABEL: test_ashr_i1024:
; SDAG: ; %bb.0: ; %entry
-; SDAG-NEXT: sub sp, sp, #336
-; SDAG-NEXT: stp x28, x27, [sp, #256] ; 16-byte Folded Spill
-; SDAG-NEXT: stp x26, x25, [sp, #272] ; 16-byte Folded Spill
-; SDAG-NEXT: stp x24, x23, [sp, #288] ; 16-byte Folded Spill
-; SDAG-NEXT: stp x22, x21, [sp, #304] ; 16-byte Folded Spill
-; SDAG-NEXT: stp x20, x19, [sp, #320] ; 16-byte Folded Spill
-; SDAG-NEXT: .cfi_def_cfa_offset 336
-; SDAG-NEXT: .cfi_offset w19, -8
-; SDAG-NEXT: .cfi_offset w20, -16
-; SDAG-NEXT: .cfi_offset w21, -24
-; SDAG-NEXT: .cfi_offset w22, -32
-; SDAG-NEXT: .cfi_offset w23, -40
-; SDAG-NEXT: .cfi_offset w24, -48
-; SDAG-NEXT: .cfi_offset w25, -56
-; SDAG-NEXT: .cfi_offset w26, -64
-; SDAG-NEXT: .cfi_offset w27, -72
-; SDAG-NEXT: .cfi_offset w28, -80
+; SDAG-NEXT: stp x28, x27, [sp, #-96]! ; 16-byte Folded Spill
+; SDAG-NEXT: sub x9, sp, #256
+; SDAG-NEXT: stp x26, x25, [sp, #16] ; 16-byte Folded Spill
+; SDAG-NEXT: stp x24, x23, [sp, #32] ; 16-byte Folded Spill
+; SDAG-NEXT: stp x22, x21, [sp, #48] ; 16-byte Folded Spill
+; SDAG-NEXT: stp x20, x19, [sp, #64] ; 16-byte Folded Spill
+; SDAG-NEXT: stp x29, x30, [sp, #80] ; 16-byte Folded Spill
+; SDAG-NEXT: add x29, sp, #80
+; SDAG-NEXT: and sp, x9, #0xffffffffffffffe0
+; SDAG-NEXT: .cfi_def_cfa w29, 16
+; SDAG-NEXT: .cfi_offset w30, -8
+; SDAG-NEXT: .cfi_offset w29, -16
+; SDAG-NEXT: .cfi_offset w19, -24
+; SDAG-NEXT: .cfi_offset w20, -32
+; SDAG-NEXT: .cfi_offset w21, -40
+; SDAG-NEXT: .cfi_offset w22, -48
+; SDAG-NEXT: .cfi_offset w23, -56
+; SDAG-NEXT: .cfi_offset w24, -64
+; SDAG-NEXT: .cfi_offset w25, -72
+; SDAG-NEXT: .cfi_offset w26, -80
+; SDAG-NEXT: .cfi_offset w27, -88
+; SDAG-NEXT: .cfi_offset w28, -96
; SDAG-NEXT: ldp x8, x9, [x1, #112]
; SDAG-NEXT: mov x11, sp
; SDAG-NEXT: ldp q0, q1, [x1]
@@ -3659,7 +3690,6 @@ define void @test_ashr_i1024(ptr %result, ptr %input, i32 %shift) {
; SDAG-NEXT: lsl x1, x27, x1
; SDAG-NEXT: lsl x23, x23, x15
; SDAG-NEXT: orr x5, x22, x5
-; SDAG-NEXT: ldp x28, x27, [sp, #256] ; 16-byte Folded Reload
; SDAG-NEXT: orr x19, x25, x19
; SDAG-NEXT: lsl x25, x26, #1
; SDAG-NEXT: orr x20, x23, x20
@@ -3676,32 +3706,34 @@ define void @test_ashr_i1024(ptr %result, ptr %input, i32 %shift) {
; SDAG-NEXT: lsl x1, x9, #1
; SDAG-NEXT: stp x7, x26, [x0, #112]
; SDAG-NEXT: lsl x7, x10, #1
+; SDAG-NEXT: lsl x5, x12, #1
; SDAG-NEXT: orr x3, x3, x21
; SDAG-NEXT: orr x13, x19, x13
-; SDAG-NEXT: lsl x5, x12, #1
; SDAG-NEXT: lsr x9, x9, x8
; SDAG-NEXT: stp x13, x3, [x0, #48]
; SDAG-NEXT: lsl x13, x1, x15
; SDAG-NEXT: lsr x23, x23, x8
+; SDAG-NEXT: stp x4, x6, [x0, #64]
; SDAG-NEXT: lsr x12, x12, x8
+; SDAG-NEXT: lsl x4, x5, x15
; SDAG-NEXT: lsr x8, x10, x8
; SDAG-NEXT: lsl x10, x7, x15
-; SDAG-NEXT: stp x4, x6, [x0, #64]
-; SDAG-NEXT: lsl x4, x5, x15
; SDAG-NEXT: orr x9, x9, x17
; SDAG-NEXT: orr x11, x13, x11
-; SDAG-NEXT: ldp x20, x19, [sp, #320] ; 16-byte Folded Reload
-; SDAG-NEXT: stp x11, x9, [x0, #16]
-; SDAG-NEXT: orr x9, x10, x23
; SDAG-NEXT: orr x12, x12, x2
-; SDAG-NEXT: ldp x22, x21, [sp, #304] ; 16-byte Folded Reload
; SDAG-NEXT: orr x16, x4, x16
-; SDAG-NEXT: ldp x24, x23, [sp, #288] ; 16-byte Folded Reload
+; SDAG-NEXT: stp x11, x9, [x0, #16]
; SDAG-NEXT: orr x8, x8, x14
-; SDAG-NEXT: ldp x26, x25, [sp, #272] ; 16-byte Folded Reload
+; SDAG-NEXT: orr x9, x10, x23
; SDAG-NEXT: stp x16, x12, [x0, #32]
; SDAG-NEXT: stp x9, x8, [x0]
-; SDAG-NEXT: add sp, sp, #336
+; SDAG-NEXT: sub sp, x29, #80
+; SDAG-NEXT: ldp x29, x30, [sp, #80] ; 16-byte Folded Reload
+; SDAG-NEXT: ldp x20, x19, [sp, #64] ; 16-byte Folded Reload
+; SDAG-NEXT: ldp x22, x21, [sp, #48] ; 16-byte Folded Reload
+; SDAG-NEXT: ldp x24, x23, [sp, #32] ; 16-byte Folded Reload
+; SDAG-NEXT: ldp x26, x25, [sp, #16] ; 16-byte Folded Reload
+; SDAG-NEXT: ldp x28, x27, [sp], #96 ; 16-byte Folded Reload
; SDAG-NEXT: ret
;
; GISEL-LABEL: test_ashr_i1024:
diff --git a/llvm/test/CodeGen/AArch64/div-i256.ll b/llvm/test/CodeGen/AArch64/div-i256.ll
index 48ac1963f465a..c18346062edbf 100644
--- a/llvm/test/CodeGen/AArch64/div-i256.ll
+++ b/llvm/test/CodeGen/AArch64/div-i256.ll
@@ -3,198 +3,10 @@
define i256 @udiv256(i256 %a, i256 %b) nounwind {
; CHECK-LABEL: udiv256:
-; CHECK: // %bb.0: // %_udiv-special-cases
-; CHECK-NEXT: orr x8, x5, x7
-; CHECK-NEXT: orr x9, x4, x6
-; CHECK-NEXT: orr x10, x0, x2
-; CHECK-NEXT: orr x8, x9, x8
-; CHECK-NEXT: orr x9, x1, x3
-; CHECK-NEXT: clz x11, x5
-; CHECK-NEXT: cmp x8, #0
-; CHECK-NEXT: orr x8, x10, x9
-; CHECK-NEXT: clz x10, x7
-; CHECK-NEXT: ccmp x8, #0, #4, ne
-; CHECK-NEXT: clz x8, x6
-; CHECK-NEXT: clz x12, x1
-; CHECK-NEXT: cset w9, eq
-; CHECK-NEXT: add x8, x8, #64
-; CHECK-NEXT: cmp x7, #0
-; CHECK-NEXT: csel x8, x10, x8, ne
-; CHECK-NEXT: clz x10, x4
-; CHECK-NEXT: cmp x5, #0
-; CHECK-NEXT: add x10, x10, #64
-; CHECK-NEXT: csel x10, x11, x10, ne
-; CHECK-NEXT: orr x11, x6, x7
-; CHECK-NEXT: add x10, x10, #128
-; CHECK-NEXT: cmp x11, #0
-; CHECK-NEXT: clz x11, x3
-; CHECK-NEXT: csel x8, x8, x10, ne
-; CHECK-NEXT: clz x10, x2
-; CHECK-NEXT: cmp x3, #0
-; CHECK-NEXT: add x10, x10, #64
-; CHECK-NEXT: csel x10, x11, x10, ne
-; CHECK-NEXT: clz x11, x0
-; CHECK-NEXT: cmp x1, #0
-; CHECK-NEXT: add x11, x11, #64
-; CHECK-NEXT: csel x11, x12, x11, ne
-; CHECK-NEXT: orr x12, x2, x3
-; CHECK-NEXT: add x11, x11, #128
-; CHECK-NEXT: cmp x12, #0
-; CHECK-NEXT: csel x10, x10, x11, ne
-; CHECK-NEXT: subs x15, x8, x10
-; CHECK-NEXT: mov w8, #255 // =0xff
-; CHECK-NEXT: ngcs x13, xzr
-; CHECK-NEXT: ngcs x14, xzr
-; CHECK-NEXT: ngc x12, xzr
-; CHECK-NEXT: cmp x8, x15
-; CHECK-NEXT: ngcs xzr, x13
-; CHECK-NEXT: ngcs xzr, x14
-; CHECK-NEXT: ngcs xzr, x12
-; CHECK-NEXT: csinc w16, w9, wzr, hs
-; CHECK-NEXT: cmp w16, #0
-; CHECK-NEXT: csel x8, xzr, x3, ne
-; CHECK-NEXT: csel x9, xzr, x2, ne
-; CHECK-NEXT: csel x10, xzr, x1, ne
-; CHECK-NEXT: csel x11, xzr, x0, ne
-; CHECK-NEXT: tbnz w16, #0, .LBB0_6
-; CHECK-NEXT: // %bb.1: // %_udiv-special-cases
-; CHECK-NEXT: eor x16, x15, #0xff
-; CHECK-NEXT: orr x17, x13, x12
-; CHECK-NEXT: orr x16, x16, x14
-; CHECK-NEXT: orr x16, x16, x17
-; CHECK-NEXT: cbz x16, .LBB0_6
-; CHECK-NEXT: // %bb.2: // %udiv-bb1
-; CHECK-NEXT: sub sp, sp, #208
-; CHECK-NEXT: mov w8, #255 // =0xff
-; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: add x9, sp, #64
-; CHECK-NEXT: sub x11, x8, x15
-; CHECK-NEXT: add x9, x9, #32
-; CHECK-NEXT: stp x28, x27, [sp, #128] // 16-byte Folded Spill
-; CHECK-NEXT: lsr x8, x11, #3
-; CHECK-NEXT: stp x26, x25, [sp, #144] // 16-byte Folded Spill
-; CHECK-NEXT: and x18, x11, #0x3f
-; CHECK-NEXT: stp x24, x23, [sp, #160] // 16-byte Folded Spill
-; CHECK-NEXT: eor x18, x18, #0x3f
-; CHECK-NEXT: and x8, x8, #0x18
-; CHECK-NEXT: stp x22, x21, [sp, #176] // 16-byte Folded Spill
-; CHECK-NEXT: stp x20, x19, [sp, #192] // 16-byte Folded Spill
-; CHECK-NEXT: sub x16, x9, x8
-; CHECK-NEXT: adds x8, x15, #1
-; CHECK-NEXT: stp x0, x1, [sp, #96]
-; CHECK-NEXT: adcs x9, x13, xzr
-; CHECK-NEXT: mvn w19, w11
-; CHECK-NEXT: stp x2, x3, [sp, #112]
-; CHECK-NEXT: adcs x10, x14, xzr
-; CHECK-NEXT: stp q0, q0, [sp, #64]
-; CHECK-NEXT: ldp x15, x17, [x16, #8]
-; CHECK-NEXT: ldr x13, [x16, #24]
-; CHECK-NEXT: ldr x16, [x16]
-; CHECK-NEXT: lsr x14, x15, #1
-; CHECK-NEXT: lsr x20, x17, #1
-; CHECK-NEXT: lsr x21, x16, #1
-; CHECK-NEXT: lsl x17, x17, x11
-; CHECK-NEXT: lsr x19, x14, x19
-; CHECK-NEXT: adcs x14, x12, xzr
-; CHECK-NEXT: lsl x12, x13, x11
-; CHECK-NEXT: lsr x13, x20, x18
-; CHECK-NEXT: lsl x20, x15, x11
-; CHECK-NEXT: lsr x18, x21, x18
-; CHECK-NEXT: lsl x11, x16, x11
-; CHECK-NEXT: cset w21, hs
-; CHECK-NEXT: mov x16, xzr
-; CHECK-NEXT: orr x15, x12, x13
-; CHECK-NEXT: orr x13, x17, x19
-; CHECK-NEXT: orr x12, x20, x18
-; CHECK-NEXT: tbnz w21, #0, .LBB0_5
-; CHECK-NEXT: // %bb.3: // %udiv-preheader
-; CHECK-NEXT: lsr x20, x8, #3
-; CHECK-NEXT: stp x0, x1, [sp]
-; CHECK-NEXT: mov x1, sp
-; CHECK-NEXT: stp q0, q0, [sp, #32]
-; CHECK-NEXT: mov x18, xzr
-; CHECK-NEXT: mov x19, xzr
-; CHECK-NEXT: and x0, x20, #0x18
-; CHECK-NEXT: stp x2, x3, [sp, #16]
-; CHECK-NEXT: and x2, x8, #0x3f
-; CHECK-NEXT: add x0, x1, x0
-; CHECK-NEXT: eor x2, x2, #0x3f
-; CHECK-NEXT: mvn w20, w8
-; CHECK-NEXT: ldp x1, x3, [x0, #16]
-; CHECK-NEXT: mov x17, xzr
-; CHECK-NEXT: ldp x24, x21, [x0]
-; CHECK-NEXT: subs x0, x4, #1
-; CHECK-NEXT: lsl x22, x3, #1
-; CHECK-NEXT: lsl x23, x1, #1
-; CHECK-NEXT: lsr x25, x1, x8
-; CHECK-NEXT: lsl x26, x21, #1
-; CHECK-NEXT: mov x1, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: lsr x27, x24, x8
-; CHECK-NEXT: lsl x22, x22, x2
-; CHECK-NEXT: lsl x20, x23, x20
-; CHECK-NEXT: lsr x23, x21, x8
-; CHECK-NEXT: lsl x26, x26, x2
-; CHECK-NEXT: adcs x2, x5, x1
-; CHECK-NEXT: orr x21, x22, x25
-; CHECK-NEXT: lsr x22, x3, x8
-; CHECK-NEXT: adcs x3, x6, x1
-; CHECK-NEXT: orr x24, x23, x20
-; CHECK-NEXT: orr x23, x26, x27
-; CHECK-NEXT: adc x20, x7, x1
-; CHECK-NEXT: .LBB0_4: // %udiv-do-while
-; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: extr x25, x23, x15, #63
-; CHECK-NEXT: extr x26, x24, x23, #63
-; CHECK-NEXT: extr x27, x21, x24, #63
-; CHECK-NEXT: extr x22, x22, x21, #63
-; CHECK-NEXT: extr x15, x15, x13, #63
-; CHECK-NEXT: extr x13, x13, x12, #63
-; CHECK-NEXT: cmp x0, x25
-; CHECK-NEXT: sbcs xzr, x2, x26
-; CHECK-NEXT: orr x13, x19, x13
-; CHECK-NEXT: orr x15, x17, x15
-; CHECK-NEXT: sbcs xzr, x3, x27
-; CHECK-NEXT: mov x17, xzr
-; CHECK-NEXT: sbc x21, x20, x22
-; CHECK-NEXT: asr x28, x21, #63
-; CHECK-NEXT: and x21, x28, x4
-; CHECK-NEXT: subs x23, x25, x21
-; CHECK-NEXT: and x21, x28, x5
-; CHECK-NEXT: and x25, x28, x7
-; CHECK-NEXT: sbcs x24, x26, x21
-; CHECK-NEXT: and x21, x28, x6
-; CHECK-NEXT: sbcs x21, x27, x21
-; CHECK-NEXT: sbc x22, x22, x25
-; CHECK-NEXT: subs x8, x8, #1
-; CHECK-NEXT: extr x25, x12, x11, #63
-; CHECK-NEXT: adcs x9, x9, x1
-; CHECK-NEXT: orr x11, x16, x11, lsl #1
-; CHECK-NEXT: and x16, x28, #0x1
-; CHECK-NEXT: adcs x10, x10, x1
-; CHECK-NEXT: orr x12, x18, x25
-; CHECK-NEXT: adc x14, x14, x1
-; CHECK-NEXT: orr x19, x8, x10
-; CHECK-NEXT: orr x18, x9, x14
-; CHECK-NEXT: orr x25, x19, x18
-; CHECK-NEXT: mov x18, xzr
-; CHECK-NEXT: mov x19, xzr
-; CHECK-NEXT: cbnz x25, .LBB0_4
-; CHECK-NEXT: .LBB0_5: // %udiv-loop-exit
-; CHECK-NEXT: ldp x20, x19, [sp, #192] // 16-byte Folded Reload
-; CHECK-NEXT: extr x10, x12, x11, #63
-; CHECK-NEXT: ldp x22, x21, [sp, #176] // 16-byte Folded Reload
-; CHECK-NEXT: extr x9, x13, x12, #63
-; CHECK-NEXT: ldp x24, x23, [sp, #160] // 16-byte Folded Reload
-; CHECK-NEXT: extr x8, x15, x13, #63
-; CHECK-NEXT: ldp x26, x25, [sp, #144] // 16-byte Folded Reload
-; CHECK-NEXT: orr x11, x16, x11, lsl #1
-; CHECK-NEXT: ldp x28, x27, [sp, #128] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #208
-; CHECK-NEXT: .LBB0_6: // %udiv-end
-; CHECK-NEXT: mov x0, x11
-; CHECK-NEXT: mov x1, x10
-; CHECK-NEXT: mov x2, x9
-; CHECK-NEXT: mov x3, x8
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: bl __udivoi3
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%r = udiv i256 %a, %b
ret i256 %r
@@ -202,223 +14,10 @@ define i256 @udiv256(i256 %a, i256 %b) nounwind {
define i256 @sdiv256(i256 %a, i256 %b) nounwind {
; CHECK-LABEL: sdiv256:
-; CHECK: // %bb.0: // %_udiv-special-cases
-; CHECK-NEXT: asr x12, x3, #63
-; CHECK-NEXT: asr x13, x7, #63
-; CHECK-NEXT: eor x8, x0, x12
-; CHECK-NEXT: eor x9, x1, x12
-; CHECK-NEXT: eor x10, x4, x13
-; CHECK-NEXT: subs x14, x8, x12
-; CHECK-NEXT: eor x8, x2, x12
-; CHECK-NEXT: eor x11, x5, x13
-; CHECK-NEXT: sbcs x15, x9, x12
-; CHECK-NEXT: eor x9, x3, x12
-; CHECK-NEXT: clz x16, x14
-; CHECK-NEXT: sbcs x18, x8, x12
-; CHECK-NEXT: clz x17, x15
-; CHECK-NEXT: add x16, x16, #64
-; CHECK-NEXT: sbc x0, x9, x12
-; CHECK-NEXT: subs x8, x10, x13
-; CHECK-NEXT: eor x10, x6, x13
-; CHECK-NEXT: sbcs x9, x11, x13
-; CHECK-NEXT: eor x11, x7, x13
-; CHECK-NEXT: orr x1, x14, x18
-; CHECK-NEXT: sbcs x10, x10, x13
-; CHECK-NEXT: orr x4, x15, x0
-; CHECK-NEXT: clz x5, x8
-; CHECK-NEXT: sbc x11, x11, x13
-; CHECK-NEXT: orr x2, x8, x10
-; CHECK-NEXT: orr x1, x1, x4
-; CHECK-NEXT: orr x3, x9, x11
-; CHECK-NEXT: add x5, x5, #64
-; CHECK-NEXT: orr x6, x10, x11
-; CHECK-NEXT: orr x2, x2, x3
-; CHECK-NEXT: clz x3, x10
-; CHECK-NEXT: eor x12, x13, x12
-; CHECK-NEXT: cmp x2, #0
-; CHECK-NEXT: add x3, x3, #64
-; CHECK-NEXT: clz x2, x9
-; CHECK-NEXT: ccmp x1, #0, #4, ne
-; CHECK-NEXT: clz x1, x11
-; CHECK-NEXT: cset w4, eq
-; CHECK-NEXT: cmp x11, #0
-; CHECK-NEXT: csel x1, x1, x3, ne
-; CHECK-NEXT: cmp x9, #0
-; CHECK-NEXT: clz x3, x18
-; CHECK-NEXT: csel x2, x2, x5, ne
-; CHECK-NEXT: cmp x6, #0
-; CHECK-NEXT: add x3, x3, #64
-; CHECK-NEXT: add x2, x2, #128
-; CHECK-NEXT: clz x5, x0
-; CHECK-NEXT: csel x1, x1, x2, ne
-; CHECK-NEXT: cmp x0, #0
-; CHECK-NEXT: csel x2, x5, x3, ne
-; CHECK-NEXT: cmp x15, #0
-; CHECK-NEXT: orr x3, x18, x0
-; CHECK-NEXT: csel x16, x17, x16, ne
-; CHECK-NEXT: cmp x3, #0
-; CHECK-NEXT: mov w3, #255 // =0xff
-; CHECK-NEXT: add x16, x16, #128
-; CHECK-NEXT: csel x16, x2, x16, ne
-; CHECK-NEXT: subs x2, x1, x16
-; CHECK-NEXT: ngcs x16, xzr
-; CHECK-NEXT: ngcs x17, xzr
-; CHECK-NEXT: ngc x1, xzr
-; CHECK-NEXT: cmp x3, x2
-; CHECK-NEXT: ngcs xzr, x16
-; CHECK-NEXT: ngcs xzr, x17
-; CHECK-NEXT: ngcs xzr, x1
-; CHECK-NEXT: csinc w5, w4, wzr, hs
-; CHECK-NEXT: cmp w5, #0
-; CHECK-NEXT: csel x13, xzr, x0, ne
-; CHECK-NEXT: csel x4, xzr, x18, ne
-; CHECK-NEXT: csel x7, xzr, x15, ne
-; CHECK-NEXT: csel x3, xzr, x14, ne
-; CHECK-NEXT: tbnz w5, #0, .LBB1_6
-; CHECK-NEXT: // %bb.1: // %_udiv-special-cases
-; CHECK-NEXT: eor x5, x2, #0xff
-; CHECK-NEXT: orr x6, x16, x1
-; CHECK-NEXT: orr x5, x5, x17
-; CHECK-NEXT: orr x5, x5, x6
-; CHECK-NEXT: cbz x5, .LBB1_6
-; CHECK-NEXT: // %bb.2: // %udiv-bb1
-; CHECK-NEXT: sub sp, sp, #224
-; CHECK-NEXT: mov w13, #255 // =0xff
-; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: add x4, sp, #64
-; CHECK-NEXT: sub x3, x13, x2
-; CHECK-NEXT: add x4, x4, #32
-; CHECK-NEXT: stp x0, x29, [sp, #120] // 8-byte Folded Spill
-; CHECK-NEXT: lsr x13, x3, #3
-; CHECK-NEXT: stp x28, x27, [sp, #144] // 16-byte Folded Spill
-; CHECK-NEXT: stp x26, x25, [sp, #160] // 16-byte Folded Spill
-; CHECK-NEXT: and x13, x13, #0x18
-; CHECK-NEXT: stp x24, x23, [sp, #176] // 16-byte Folded Spill
-; CHECK-NEXT: stp x22, x21, [sp, #192] // 16-byte Folded Spill
-; CHECK-NEXT: sub x4, x4, x13
-; CHECK-NEXT: adds x13, x2, #1
-; CHECK-NEXT: stp x20, x19, [sp, #208] // 16-byte Folded Spill
-; CHECK-NEXT: and x19, x3, #0x3f
-; CHECK-NEXT: adcs x16, x16, xzr
-; CHECK-NEXT: stp x14, x15, [sp, #96]
-; CHECK-NEXT: mvn w20, w3
-; CHECK-NEXT: eor x19, x19, #0x3f
-; CHECK-NEXT: str x18, [sp, #112]
-; CHECK-NEXT: adcs x17, x17, xzr
-; CHECK-NEXT: stp q0, q0, [sp, #64]
-; CHECK-NEXT: ldp x2, x6, [x4, #8]
-; CHECK-NEXT: ldr x7, [x4]
-; CHECK-NEXT: ldr x5, [x4, #24]
-; CHECK-NEXT: lsr x22, x7, #1
-; CHECK-NEXT: lsr x4, x2, #1
-; CHECK-NEXT: lsr x21, x6, #1
-; CHECK-NEXT: lsl x5, x5, x3
-; CHECK-NEXT: lsl x6, x6, x3
-; CHECK-NEXT: lsl x2, x2, x3
-; CHECK-NEXT: lsr x20, x4, x20
-; CHECK-NEXT: lsr x21, x21, x19
-; CHECK-NEXT: lsr x19, x22, x19
-; CHECK-NEXT: adcs x4, x1, xzr
-; CHECK-NEXT: lsl x1, x7, x3
-; CHECK-NEXT: cset w22, hs
-; CHECK-NEXT: orr x5, x5, x21
-; CHECK-NEXT: orr x3, x6, x20
-; CHECK-NEXT: orr x2, x2, x19
-; CHECK-NEXT: mov x6, xzr
-; CHECK-NEXT: tbnz w22, #0, .LBB1_5
-; CHECK-NEXT: // %bb.3: // %udiv-preheader
-; CHECK-NEXT: lsr x21, x13, #3
-; CHECK-NEXT: stp x14, x15, [sp]
-; CHECK-NEXT: mov x15, sp
-; CHECK-NEXT: stp q0, q0, [sp, #32]
-; CHECK-NEXT: mov x19, xzr
-; CHECK-NEXT: mov x20, xzr
-; CHECK-NEXT: and x14, x21, #0x18
-; CHECK-NEXT: stp x18, x0, [sp, #16]
-; CHECK-NEXT: and x18, x13, #0x3f
-; CHECK-NEXT: add x14, x15, x14
-; CHECK-NEXT: eor x18, x18, #0x3f
-; CHECK-NEXT: mvn w21, w13
-; CHECK-NEXT: ldp x15, x0, [x14, #16]
-; CHECK-NEXT: mov x7, xzr
-; CHECK-NEXT: ldp x25, x22, [x14]
-; CHECK-NEXT: subs x14, x8, #1
-; CHECK-NEXT: lsl x23, x0, #1
-; CHECK-NEXT: lsl x24, x15, #1
-; CHECK-NEXT: lsr x26, x15, x13
-; CHECK-NEXT: lsl x27, x22, #1
-; CHECK-NEXT: mov x15, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: lsr x28, x25, x13
-; CHECK-NEXT: lsl x23, x23, x18
-; CHECK-NEXT: lsl x21, x24, x21
-; CHECK-NEXT: lsr x24, x22, x13
-; CHECK-NEXT: lsl x27, x27, x18
-; CHECK-NEXT: adcs x18, x9, x15
-; CHECK-NEXT: orr x22, x23, x26
-; CHECK-NEXT: lsr x23, x0, x13
-; CHECK-NEXT: adcs x0, x10, x15
-; CHECK-NEXT: orr x25, x24, x21
-; CHECK-NEXT: orr x24, x27, x28
-; CHECK-NEXT: adc x21, x11, x15
-; CHECK-NEXT: .LBB1_4: // %udiv-do-while
-; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: extr x26, x24, x5, #63
-; CHECK-NEXT: extr x27, x25, x24, #63
-; CHECK-NEXT: extr x28, x22, x25, #63
-; CHECK-NEXT: extr x23, x23, x22, #63
-; CHECK-NEXT: extr x5, x5, x3, #63
-; CHECK-NEXT: extr x3, x3, x2, #63
-; CHECK-NEXT: cmp x14, x26
-; CHECK-NEXT: sbcs xzr, x18, x27
-; CHECK-NEXT: orr x3, x20, x3
-; CHECK-NEXT: orr x5, x7, x5
-; CHECK-NEXT: sbcs xzr, x0, x28
-; CHECK-NEXT: mov x7, xzr
-; CHECK-NEXT: sbc x22, x21, x23
-; CHECK-NEXT: asr x29, x22, #63
-; CHECK-NEXT: and x22, x29, x8
-; CHECK-NEXT: subs x24, x26, x22
-; CHECK-NEXT: and x22, x29, x9
-; CHECK-NEXT: and x26, x29, x11
-; CHECK-NEXT: sbcs x25, x27, x22
-; CHECK-NEXT: and x22, x29, x10
-; CHECK-NEXT: sbcs x22, x28, x22
-; CHECK-NEXT: sbc x23, x23, x26
-; CHECK-NEXT: subs x13, x13, #1
-; CHECK-NEXT: extr x26, x2, x1, #63
-; CHECK-NEXT: adcs x16, x16, x15
-; CHECK-NEXT: orr x1, x6, x1, lsl #1
-; CHECK-NEXT: and x6, x29, #0x1
-; CHECK-NEXT: adcs x17, x17, x15
-; CHECK-NEXT: orr x2, x19, x26
-; CHECK-NEXT: adc x4, x4, x15
-; CHECK-NEXT: orr x20, x13, x17
-; CHECK-NEXT: orr x19, x16, x4
-; CHECK-NEXT: orr x26, x20, x19
-; CHECK-NEXT: mov x19, xzr
-; CHECK-NEXT: mov x20, xzr
-; CHECK-NEXT: cbnz x26, .LBB1_4
-; CHECK-NEXT: .LBB1_5: // %udiv-loop-exit
-; CHECK-NEXT: ldp x20, x19, [sp, #208] // 16-byte Folded Reload
-; CHECK-NEXT: extr x7, x2, x1, #63
-; CHECK-NEXT: ldp x22, x21, [sp, #192] // 16-byte Folded Reload
-; CHECK-NEXT: extr x4, x3, x2, #63
-; CHECK-NEXT: ldp x24, x23, [sp, #176] // 16-byte Folded Reload
-; CHECK-NEXT: extr x13, x5, x3, #63
-; CHECK-NEXT: ldp x26, x25, [sp, #160] // 16-byte Folded Reload
-; CHECK-NEXT: orr x3, x6, x1, lsl #1
-; CHECK-NEXT: ldp x28, x27, [sp, #144] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #128] // 8-byte Reload
-; CHECK-NEXT: add sp, sp, #224
-; CHECK-NEXT: .LBB1_6: // %udiv-end
-; CHECK-NEXT: eor x8, x3, x12
-; CHECK-NEXT: eor x9, x7, x12
-; CHECK-NEXT: subs x0, x8, x12
-; CHECK-NEXT: eor x8, x4, x12
-; CHECK-NEXT: sbcs x1, x9, x12
-; CHECK-NEXT: eor x9, x13, x12
-; CHECK-NEXT: sbcs x2, x8, x12
-; CHECK-NEXT: sbc x3, x9, x12
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: bl __divoi3
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%r = sdiv i256 %a, %b
ret i256 %r
@@ -426,232 +25,10 @@ define i256 @sdiv256(i256 %a, i256 %b) nounwind {
define i256 @urem256(i256 %a, i256 %b) nounwind {
; CHECK-LABEL: urem256:
-; CHECK: // %bb.0: // %_udiv-special-cases
-; CHECK-NEXT: orr x8, x5, x7
-; CHECK-NEXT: orr x9, x4, x6
-; CHECK-NEXT: orr x10, x0, x2
-; CHECK-NEXT: orr x8, x9, x8
-; CHECK-NEXT: orr x9, x1, x3
-; CHECK-NEXT: clz x11, x1
-; CHECK-NEXT: cmp x8, #0
-; CHECK-NEXT: orr x8, x10, x9
-; CHECK-NEXT: clz x9, x7
-; CHECK-NEXT: ccmp x8, #0, #4, ne
-; CHECK-NEXT: clz x8, x6
-; CHECK-NEXT: clz x10, x5
-; CHECK-NEXT: cset w13, eq
-; CHECK-NEXT: add x8, x8, #64
-; CHECK-NEXT: cmp x7, #0
-; CHECK-NEXT: csel x8, x9, x8, ne
-; CHECK-NEXT: clz x9, x4
-; CHECK-NEXT: cmp x5, #0
-; CHECK-NEXT: add x9, x9, #64
-; CHECK-NEXT: csel x9, x10, x9, ne
-; CHECK-NEXT: orr x10, x6, x7
-; CHECK-NEXT: add x9, x9, #128
-; CHECK-NEXT: cmp x10, #0
-; CHECK-NEXT: clz x10, x3
-; CHECK-NEXT: csel x8, x8, x9, ne
-; CHECK-NEXT: clz x9, x2
-; CHECK-NEXT: cmp x3, #0
-; CHECK-NEXT: add x9, x9, #64
-; CHECK-NEXT: csel x9, x10, x9, ne
-; CHECK-NEXT: clz x10, x0
-; CHECK-NEXT: cmp x1, #0
-; CHECK-NEXT: add x10, x10, #64
-; CHECK-NEXT: csel x10, x11, x10, ne
-; CHECK-NEXT: orr x11, x2, x3
-; CHECK-NEXT: add x10, x10, #128
-; CHECK-NEXT: cmp x11, #0
-; CHECK-NEXT: csel x9, x9, x10, ne
-; CHECK-NEXT: subs x12, x8, x9
-; CHECK-NEXT: mov w8, #255 // =0xff
-; CHECK-NEXT: ngcs x9, xzr
-; CHECK-NEXT: ngcs x10, xzr
-; CHECK-NEXT: ngc x11, xzr
-; CHECK-NEXT: cmp x8, x12
-; CHECK-NEXT: ngcs xzr, x9
-; CHECK-NEXT: ngcs xzr, x10
-; CHECK-NEXT: ngcs xzr, x11
-; CHECK-NEXT: csinc w15, w13, wzr, hs
-; CHECK-NEXT: cmp w15, #0
-; CHECK-NEXT: csel x13, xzr, x3, ne
-; CHECK-NEXT: csel x17, xzr, x2, ne
-; CHECK-NEXT: csel x14, xzr, x1, ne
-; CHECK-NEXT: csel x8, xzr, x0, ne
-; CHECK-NEXT: tbnz w15, #0, .LBB2_6
-; CHECK-NEXT: // %bb.1: // %_udiv-special-cases
-; CHECK-NEXT: eor x15, x12, #0xff
-; CHECK-NEXT: orr x16, x9, x11
-; CHECK-NEXT: orr x15, x15, x10
-; CHECK-NEXT: orr x15, x15, x16
-; CHECK-NEXT: cbz x15, .LBB2_6
-; CHECK-NEXT: // %bb.2: // %udiv-bb1
-; CHECK-NEXT: sub sp, sp, #256
-; CHECK-NEXT: mov w8, #255 // =0xff
-; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: add x14, sp, #96
-; CHECK-NEXT: sub x13, x8, x12
-; CHECK-NEXT: add x14, x14, #32
-; CHECK-NEXT: stp x29, x30, [sp, #160] // 16-byte Folded Spill
-; CHECK-NEXT: lsr x8, x13, #3
-; CHECK-NEXT: stp x28, x27, [sp, #176] // 16-byte Folded Spill
-; CHECK-NEXT: and x18, x13, #0x3f
-; CHECK-NEXT: stp x26, x25, [sp, #192] // 16-byte Folded Spill
-; CHECK-NEXT: eor x18, x18, #0x3f
-; CHECK-NEXT: and x8, x8, #0x18
-; CHECK-NEXT: stp x24, x23, [sp, #208] // 16-byte Folded Spill
-; CHECK-NEXT: stp x22, x21, [sp, #224] // 16-byte Folded Spill
-; CHECK-NEXT: sub x14, x14, x8
-; CHECK-NEXT: adds x8, x12, #1
-; CHECK-NEXT: stp x20, x19, [sp, #240] // 16-byte Folded Spill
-; CHECK-NEXT: mvn w19, w13
-; CHECK-NEXT: adcs x9, x9, xzr
-; CHECK-NEXT: stp x0, x1, [sp, #128]
-; CHECK-NEXT: adcs x10, x10, xzr
-; CHECK-NEXT: stp x2, x3, [sp, #144]
-; CHECK-NEXT: stp q0, q0, [sp, #96]
-; CHECK-NEXT: ldp x12, x16, [x14, #8]
-; CHECK-NEXT: ldr x15, [x14, #24]
-; CHECK-NEXT: ldr x17, [x14]
-; CHECK-NEXT: stp x1, x2, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: lsl x15, x15, x13
-; CHECK-NEXT: str x0, [sp, #8] // 8-byte Spill
-; CHECK-NEXT: lsr x14, x12, #1
-; CHECK-NEXT: lsr x20, x16, #1
-; CHECK-NEXT: lsl x16, x16, x13
-; CHECK-NEXT: lsl x12, x12, x13
-; CHECK-NEXT: lsr x21, x14, x19
-; CHECK-NEXT: lsr x19, x17, #1
-; CHECK-NEXT: lsr x20, x20, x18
-; CHECK-NEXT: adcs x14, x11, xzr
-; CHECK-NEXT: lsl x11, x17, x13
-; CHECK-NEXT: lsr x18, x19, x18
-; CHECK-NEXT: cset w19, hs
-; CHECK-NEXT: orr x15, x15, x20
-; CHECK-NEXT: orr x13, x16, x21
-; CHECK-NEXT: mov x16, xzr
-; CHECK-NEXT: orr x12, x12, x18
-; CHECK-NEXT: tbnz w19, #0, .LBB2_5
-; CHECK-NEXT: // %bb.3: // %udiv-preheader
-; CHECK-NEXT: lsr x20, x8, #3
-; CHECK-NEXT: stp x0, x1, [sp, #32]
-; CHECK-NEXT: add x1, sp, #32
-; CHECK-NEXT: stp q0, q0, [sp, #64]
-; CHECK-NEXT: and x21, x8, #0x3f
-; CHECK-NEXT: mvn w22, w8
-; CHECK-NEXT: and x0, x20, #0x18
-; CHECK-NEXT: stp x2, x3, [sp, #48]
-; CHECK-NEXT: eor x26, x21, #0x3f
-; CHECK-NEXT: add x0, x1, x0
-; CHECK-NEXT: mov x21, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: mov x18, xzr
-; CHECK-NEXT: ldp x2, x20, [x0, #16]
-; CHECK-NEXT: mov x19, xzr
-; CHECK-NEXT: ldp x0, x23, [x0]
-; CHECK-NEXT: mov x17, xzr
-; CHECK-NEXT: lsl x24, x20, #1
-; CHECK-NEXT: lsl x25, x2, #1
-; CHECK-NEXT: lsr x27, x2, x8
-; CHECK-NEXT: lsl x28, x23, #1
-; CHECK-NEXT: subs x2, x4, #1
-; CHECK-NEXT: lsr x30, x23, x8
-; CHECK-NEXT: lsl x24, x24, x26
-; CHECK-NEXT: lsl x29, x25, x22
-; CHECK-NEXT: lsr x1, x0, x8
-; CHECK-NEXT: lsl x0, x28, x26
-; CHECK-NEXT: adcs x22, x5, x21
-; CHECK-NEXT: lsr x26, x20, x8
-; CHECK-NEXT: adcs x23, x6, x21
-; CHECK-NEXT: orr x25, x24, x27
-; CHECK-NEXT: orr x28, x30, x29
-; CHECK-NEXT: orr x27, x0, x1
-; CHECK-NEXT: adc x24, x7, x21
-; CHECK-NEXT: .LBB2_4: // %udiv-do-while
-; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: extr x29, x27, x15, #63
-; CHECK-NEXT: extr x30, x28, x27, #63
-; CHECK-NEXT: extr x20, x25, x28, #63
-; CHECK-NEXT: extr x26, x26, x25, #63
-; CHECK-NEXT: extr x15, x15, x13, #63
-; CHECK-NEXT: extr x13, x13, x12, #63
-; CHECK-NEXT: cmp x2, x29
-; CHECK-NEXT: sbcs xzr, x22, x30
-; CHECK-NEXT: orr x13, x19, x13
-; CHECK-NEXT: orr x15, x17, x15
-; CHECK-NEXT: sbcs xzr, x23, x20
-; CHECK-NEXT: mov x17, xzr
-; CHECK-NEXT: sbc x25, x24, x26
-; CHECK-NEXT: asr x0, x25, #63
-; CHECK-NEXT: and x25, x0, x4
-; CHECK-NEXT: subs x27, x29, x25
-; CHECK-NEXT: and x25, x0, x5
-; CHECK-NEXT: sbcs x28, x30, x25
-; CHECK-NEXT: and x25, x0, x6
-; CHECK-NEXT: sbcs x25, x20, x25
-; CHECK-NEXT: and x20, x0, x7
-; CHECK-NEXT: sbc x26, x26, x20
-; CHECK-NEXT: subs x8, x8, #1
-; CHECK-NEXT: extr x20, x12, x11, #63
-; CHECK-NEXT: adcs x9, x9, x21
-; CHECK-NEXT: orr x11, x16, x11, lsl #1
-; CHECK-NEXT: and x16, x0, #0x1
-; CHECK-NEXT: adcs x10, x10, x21
-; CHECK-NEXT: orr x12, x18, x20
-; CHECK-NEXT: adc x14, x14, x21
-; CHECK-NEXT: orr x19, x8, x10
-; CHECK-NEXT: orr x18, x9, x14
-; CHECK-NEXT: orr x0, x19, x18
-; CHECK-NEXT: mov x18, xzr
-; CHECK-NEXT: mov x19, xzr
-; CHECK-NEXT: cbnz x0, .LBB2_4
-; CHECK-NEXT: .LBB2_5: // %udiv-loop-exit
-; CHECK-NEXT: ldp x1, x2, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: extr x14, x12, x11, #63
-; CHECK-NEXT: ldp x20, x19, [sp, #240] // 16-byte Folded Reload
-; CHECK-NEXT: extr x17, x13, x12, #63
-; CHECK-NEXT: ldp x22, x21, [sp, #224] // 16-byte Folded Reload
-; CHECK-NEXT: extr x13, x15, x13, #63
-; CHECK-NEXT: ldp x24, x23, [sp, #208] // 16-byte Folded Reload
-; CHECK-NEXT: orr x8, x16, x11, lsl #1
-; CHECK-NEXT: ldp x26, x25, [sp, #192] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x0, [sp, #8] // 8-byte Reload
-; CHECK-NEXT: ldp x28, x27, [sp, #176] // 16-byte Folded Reload
-; CHECK-NEXT: ldp x29, x30, [sp, #160] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #256
-; CHECK-NEXT: .LBB2_6: // %udiv-end
-; CHECK-NEXT: umulh x9, x17, x4
-; CHECK-NEXT: umulh x10, x8, x6
-; CHECK-NEXT: madd x9, x17, x5, x9
-; CHECK-NEXT: madd x10, x8, x7, x10
-; CHECK-NEXT: mul x11, x8, x6
-; CHECK-NEXT: mul x12, x17, x4
-; CHECK-NEXT: madd x9, x13, x4, x9
-; CHECK-NEXT: madd x10, x14, x6, x10
-; CHECK-NEXT: adds x11, x12, x11
-; CHECK-NEXT: umulh x15, x4, x8
-; CHECK-NEXT: mul x16, x5, x8
-; CHECK-NEXT: adc x9, x9, x10
-; CHECK-NEXT: umulh x13, x5, x8
-; CHECK-NEXT: mul x18, x4, x14
-; CHECK-NEXT: adds x10, x16, x15
-; CHECK-NEXT: umulh x17, x4, x14
-; CHECK-NEXT: cinc x13, x13, hs
-; CHECK-NEXT: mul x12, x5, x14
-; CHECK-NEXT: adds x10, x18, x10
-; CHECK-NEXT: umulh x6, x5, x14
-; CHECK-NEXT: cinc x14, x17, hs
-; CHECK-NEXT: mul x8, x4, x8
-; CHECK-NEXT: adds x13, x13, x14
-; CHECK-NEXT: cset w14, hs
-; CHECK-NEXT: adds x12, x12, x13
-; CHECK-NEXT: adc x13, x6, x14
-; CHECK-NEXT: adds x11, x12, x11
-; CHECK-NEXT: adc x9, x13, x9
-; CHECK-NEXT: subs x0, x0, x8
-; CHECK-NEXT: sbcs x1, x1, x10
-; CHECK-NEXT: sbcs x2, x2, x11
-; CHECK-NEXT: sbc x3, x3, x9
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: bl __umodoi3
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%r = urem i256 %a, %b
ret i256 %r
@@ -659,258 +36,10 @@ define i256 @urem256(i256 %a, i256 %b) nounwind {
define i256 @srem256(i256 %a, i256 %b) nounwind {
; CHECK-LABEL: srem256:
-; CHECK: // %bb.0: // %_udiv-special-cases
-; CHECK-NEXT: sub sp, sp, #256
-; CHECK-NEXT: asr x8, x3, #63
-; CHECK-NEXT: stp x22, x21, [sp, #224] // 16-byte Folded Spill
-; CHECK-NEXT: asr x16, x7, #63
-; CHECK-NEXT: stp x24, x23, [sp, #208] // 16-byte Folded Spill
-; CHECK-NEXT: eor x9, x0, x8
-; CHECK-NEXT: eor x10, x1, x8
-; CHECK-NEXT: eor x11, x2, x8
-; CHECK-NEXT: subs x21, x9, x8
-; CHECK-NEXT: eor x13, x3, x8
-; CHECK-NEXT: eor x14, x4, x16
-; CHECK-NEXT: sbcs x22, x10, x8
-; CHECK-NEXT: eor x15, x5, x16
-; CHECK-NEXT: eor x17, x6, x16
-; CHECK-NEXT: sbcs x23, x11, x8
-; CHECK-NEXT: eor x18, x7, x16
-; CHECK-NEXT: clz x0, x22
-; CHECK-NEXT: sbc x11, x13, x8
-; CHECK-NEXT: subs x13, x14, x16
-; CHECK-NEXT: stp x29, x30, [sp, #160] // 16-byte Folded Spill
-; CHECK-NEXT: sbcs x14, x15, x16
-; CHECK-NEXT: orr x3, x22, x11
-; CHECK-NEXT: clz x4, x13
-; CHECK-NEXT: sbcs x15, x17, x16
-; CHECK-NEXT: add x4, x4, #64
-; CHECK-NEXT: clz x17, x21
-; CHECK-NEXT: sbc x16, x18, x16
-; CHECK-NEXT: orr x1, x13, x15
-; CHECK-NEXT: orr x18, x21, x23
-; CHECK-NEXT: orr x2, x14, x16
-; CHECK-NEXT: orr x18, x18, x3
-; CHECK-NEXT: orr x5, x15, x16
-; CHECK-NEXT: orr x1, x1, x2
-; CHECK-NEXT: clz x2, x15
-; CHECK-NEXT: add x17, x17, #64
-; CHECK-NEXT: cmp x1, #0
-; CHECK-NEXT: add x2, x2, #64
-; CHECK-NEXT: clz x1, x14
-; CHECK-NEXT: ccmp x18, #0, #4, ne
-; CHECK-NEXT: clz x18, x16
-; CHECK-NEXT: stp x28, x27, [sp, #176] // 16-byte Folded Spill
-; CHECK-NEXT: cset w3, eq
-; CHECK-NEXT: cmp x16, #0
-; CHECK-NEXT: stp x26, x25, [sp, #192] // 16-byte Folded Spill
-; CHECK-NEXT: csel x18, x18, x2, ne
-; CHECK-NEXT: cmp x14, #0
-; CHECK-NEXT: clz x2, x23
-; CHECK-NEXT: csel x1, x1, x4, ne
-; CHECK-NEXT: cmp x5, #0
-; CHECK-NEXT: add x2, x2, #64
-; CHECK-NEXT: add x1, x1, #128
-; CHECK-NEXT: clz x4, x11
-; CHECK-NEXT: stp x20, x19, [sp, #240] // 16-byte Folded Spill
-; CHECK-NEXT: csel x18, x18, x1, ne
-; CHECK-NEXT: cmp x11, #0
-; CHECK-NEXT: csel x1, x4, x2, ne
-; CHECK-NEXT: cmp x22, #0
-; CHECK-NEXT: orr x2, x23, x11
-; CHECK-NEXT: csel x17, x0, x17, ne
-; CHECK-NEXT: cmp x2, #0
-; CHECK-NEXT: add x17, x17, #128
-; CHECK-NEXT: csel x17, x1, x17, ne
-; CHECK-NEXT: subs x2, x18, x17
-; CHECK-NEXT: mov w17, #255 // =0xff
-; CHECK-NEXT: ngcs x18, xzr
-; CHECK-NEXT: ngcs x0, xzr
-; CHECK-NEXT: ngc x1, xzr
-; CHECK-NEXT: cmp x17, x2
-; CHECK-NEXT: ngcs xzr, x18
-; CHECK-NEXT: ngcs xzr, x0
-; CHECK-NEXT: ngcs xzr, x1
-; CHECK-NEXT: csinc w5, w3, wzr, hs
-; CHECK-NEXT: cmp w5, #0
-; CHECK-NEXT: csel x3, xzr, x11, ne
-; CHECK-NEXT: csel x7, xzr, x23, ne
-; CHECK-NEXT: csel x4, xzr, x22, ne
-; CHECK-NEXT: csel x17, xzr, x21, ne
-; CHECK-NEXT: tbnz w5, #0, .LBB3_6
-; CHECK-NEXT: // %bb.1: // %_udiv-special-cases
-; CHECK-NEXT: eor x5, x2, #0xff
-; CHECK-NEXT: orr x6, x18, x1
-; CHECK-NEXT: orr x5, x5, x0
-; CHECK-NEXT: orr x5, x5, x6
-; CHECK-NEXT: cbz x5, .LBB3_6
-; CHECK-NEXT: // %bb.2: // %udiv-bb1
-; CHECK-NEXT: mov w9, #255 // =0xff
-; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: add x12, sp, #96
-; CHECK-NEXT: sub x9, x9, x2
-; CHECK-NEXT: add x12, x12, #32
-; CHECK-NEXT: stp x21, x22, [sp, #128]
-; CHECK-NEXT: lsr x10, x9, #3
-; CHECK-NEXT: stp x23, x11, [sp, #144]
-; CHECK-NEXT: adds x17, x2, #1
-; CHECK-NEXT: and x5, x9, #0x3f
-; CHECK-NEXT: adcs x18, x18, xzr
-; CHECK-NEXT: mvn w6, w9
-; CHECK-NEXT: and x10, x10, #0x18
-; CHECK-NEXT: stp q0, q0, [sp, #96]
-; CHECK-NEXT: eor x5, x5, #0x3f
-; CHECK-NEXT: sub x10, x12, x10
-; CHECK-NEXT: adcs x0, x0, xzr
-; CHECK-NEXT: ldp x12, x3, [x10, #8]
-; CHECK-NEXT: ldr x2, [x10, #24]
-; CHECK-NEXT: ldr x10, [x10]
-; CHECK-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: lsl x2, x2, x9
-; CHECK-NEXT: str x23, [sp, #8] // 8-byte Spill
-; CHECK-NEXT: lsr x4, x12, #1
-; CHECK-NEXT: lsr x7, x3, #1
-; CHECK-NEXT: lsr x19, x10, #1
-; CHECK-NEXT: lsl x3, x3, x9
-; CHECK-NEXT: lsl x12, x12, x9
-; CHECK-NEXT: lsr x6, x4, x6
-; CHECK-NEXT: lsr x7, x7, x5
-; CHECK-NEXT: lsr x19, x19, x5
-; CHECK-NEXT: adcs x4, x1, xzr
-; CHECK-NEXT: lsl x1, x10, x9
-; CHECK-NEXT: cset w20, hs
-; CHECK-NEXT: orr x5, x2, x7
-; CHECK-NEXT: orr x3, x3, x6
-; CHECK-NEXT: orr x2, x12, x19
-; CHECK-NEXT: mov x6, xzr
-; CHECK-NEXT: tbnz w20, #0, .LBB3_5
-; CHECK-NEXT: // %bb.3: // %udiv-preheader
-; CHECK-NEXT: lsr x9, x17, #3
-; CHECK-NEXT: add x10, sp, #32
-; CHECK-NEXT: stp q0, q0, [sp, #64]
-; CHECK-NEXT: stp x21, x22, [sp, #32]
-; CHECK-NEXT: and x21, x17, #0x3f
-; CHECK-NEXT: mvn w22, w17
-; CHECK-NEXT: and x9, x9, #0x18
-; CHECK-NEXT: stp x23, x11, [sp, #48]
-; CHECK-NEXT: eor x26, x21, #0x3f
-; CHECK-NEXT: add x9, x10, x9
-; CHECK-NEXT: subs x21, x13, #1
-; CHECK-NEXT: mov x19, xzr
-; CHECK-NEXT: ldp x10, x12, [x9, #16]
-; CHECK-NEXT: mov x20, xzr
-; CHECK-NEXT: ldp x9, x23, [x9]
-; CHECK-NEXT: mov x7, xzr
-; CHECK-NEXT: lsl x24, x12, #1
-; CHECK-NEXT: lsl x25, x10, #1
-; CHECK-NEXT: lsr x10, x10, x17
-; CHECK-NEXT: lsl x27, x23, #1
-; CHECK-NEXT: lsr x28, x23, x17
-; CHECK-NEXT: lsr x9, x9, x17
-; CHECK-NEXT: lsl x24, x24, x26
-; CHECK-NEXT: lsl x25, x25, x22
-; CHECK-NEXT: mov x22, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: lsl x30, x27, x26
-; CHECK-NEXT: adcs x23, x14, x22
-; CHECK-NEXT: lsr x27, x12, x17
-; CHECK-NEXT: orr x26, x24, x10
-; CHECK-NEXT: adcs x24, x15, x22
-; CHECK-NEXT: orr x29, x28, x25
-; CHECK-NEXT: orr x28, x30, x9
-; CHECK-NEXT: adc x25, x16, x22
-; CHECK-NEXT: .LBB3_4: // %udiv-do-while
-; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: extr x30, x28, x5, #63
-; CHECK-NEXT: extr x9, x29, x28, #63
-; CHECK-NEXT: extr x10, x26, x29, #63
-; CHECK-NEXT: extr x27, x27, x26, #63
-; CHECK-NEXT: cmp x21, x30
-; CHECK-NEXT: sbcs xzr, x23, x9
-; CHECK-NEXT: sbcs xzr, x24, x10
-; CHECK-NEXT: sbc x26, x25, x27
-; CHECK-NEXT: asr x12, x26, #63
-; CHECK-NEXT: and x26, x12, x13
-; CHECK-NEXT: subs x28, x30, x26
-; CHECK-NEXT: and x26, x12, x14
-; CHECK-NEXT: sbcs x29, x9, x26
-; CHECK-NEXT: and x9, x12, x15
-; CHECK-NEXT: sbcs x26, x10, x9
-; CHECK-NEXT: and x9, x12, x16
-; CHECK-NEXT: extr x10, x5, x3, #63
-; CHECK-NEXT: sbc x27, x27, x9
-; CHECK-NEXT: subs x17, x17, #1
-; CHECK-NEXT: extr x9, x2, x1, #63
-; CHECK-NEXT: adcs x18, x18, x22
-; CHECK-NEXT: extr x3, x3, x2, #63
-; CHECK-NEXT: orr x1, x6, x1, lsl #1
-; CHECK-NEXT: adcs x0, x0, x22
-; CHECK-NEXT: orr x2, x19, x9
-; CHECK-NEXT: orr x5, x7, x10
-; CHECK-NEXT: adc x4, x4, x22
-; CHECK-NEXT: orr x19, x17, x0
-; CHECK-NEXT: orr x3, x20, x3
-; CHECK-NEXT: orr x9, x18, x4
-; CHECK-NEXT: and x6, x12, #0x1
-; CHECK-NEXT: mov x20, xzr
-; CHECK-NEXT: orr x9, x19, x9
-; CHECK-NEXT: mov x19, xzr
-; CHECK-NEXT: mov x7, xzr
-; CHECK-NEXT: cbnz x9, .LBB3_4
-; CHECK-NEXT: .LBB3_5: // %udiv-loop-exit
-; CHECK-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: extr x4, x2, x1, #63
-; CHECK-NEXT: extr x7, x3, x2, #63
-; CHECK-NEXT: extr x3, x5, x3, #63
-; CHECK-NEXT: orr x17, x6, x1, lsl #1
-; CHECK-NEXT: ldr x23, [sp, #8] // 8-byte Reload
-; CHECK-NEXT: .LBB3_6: // %udiv-end
-; CHECK-NEXT: umulh x10, x17, x15
-; CHECK-NEXT: ldp x20, x19, [sp, #240] // 16-byte Folded Reload
-; CHECK-NEXT: ldp x26, x25, [sp, #192] // 16-byte Folded Reload
-; CHECK-NEXT: umulh x9, x7, x13
-; CHECK-NEXT: ldp x28, x27, [sp, #176] // 16-byte Folded Reload
-; CHECK-NEXT: ldp x29, x30, [sp, #160] // 16-byte Folded Reload
-; CHECK-NEXT: madd x10, x17, x16, x10
-; CHECK-NEXT: madd x9, x7, x14, x9
-; CHECK-NEXT: madd x10, x4, x15, x10
-; CHECK-NEXT: mul x12, x17, x15
-; CHECK-NEXT: mul x15, x7, x13
-; CHECK-NEXT: madd x9, x3, x13, x9
-; CHECK-NEXT: umulh x18, x13, x17
-; CHECK-NEXT: adds x12, x15, x12
-; CHECK-NEXT: mul x0, x14, x17
-; CHECK-NEXT: adc x9, x9, x10
-; CHECK-NEXT: umulh x16, x14, x17
-; CHECK-NEXT: mul x2, x13, x4
-; CHECK-NEXT: adds x10, x0, x18
-; CHECK-NEXT: umulh x1, x13, x4
-; CHECK-NEXT: cinc x15, x16, hs
-; CHECK-NEXT: umulh x3, x14, x4
-; CHECK-NEXT: adds x10, x2, x10
-; CHECK-NEXT: mul x14, x14, x4
-; CHECK-NEXT: cinc x16, x1, hs
-; CHECK-NEXT: mul x13, x13, x17
-; CHECK-NEXT: adds x15, x15, x16
-; CHECK-NEXT: cset w16, hs
-; CHECK-NEXT: adds x14, x14, x15
-; CHECK-NEXT: adc x15, x3, x16
-; CHECK-NEXT: adds x12, x14, x12
-; CHECK-NEXT: adc x9, x15, x9
-; CHECK-NEXT: subs x13, x21, x13
-; CHECK-NEXT: sbcs x10, x22, x10
-; CHECK-NEXT: eor x13, x13, x8
-; CHECK-NEXT: sbcs x12, x23, x12
-; CHECK-NEXT: eor x10, x10, x8
-; CHECK-NEXT: sbc x9, x11, x9
-; CHECK-NEXT: subs x0, x13, x8
-; CHECK-NEXT: eor x11, x12, x8
-; CHECK-NEXT: sbcs x1, x10, x8
-; CHECK-NEXT: ldp x22, x21, [sp, #224] // 16-byte Folded Reload
-; CHECK-NEXT: ldp x24, x23, [sp, #208] // 16-byte Folded Reload
-; CHECK-NEXT: eor x9, x9, x8
-; CHECK-NEXT: sbcs x2, x11, x8
-; CHECK-NEXT: sbc x3, x9, x8
-; CHECK-NEXT: add sp, sp, #256
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: bl __modoi3
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%r = srem i256 %a, %b
ret i256 %r
@@ -932,186 +61,14 @@ define i256 @udiv256_pow2(i256 %a) nounwind {
; Division by constant
define i256 @sdiv256_const(i256 %a) nounwind {
; CHECK-LABEL: sdiv256_const:
-; CHECK: // %bb.0: // %_udiv-special-cases
-; CHECK-NEXT: asr x8, x3, #63
-; CHECK-NEXT: mov w13, #255 // =0xff
-; CHECK-NEXT: eor x9, x0, x8
-; CHECK-NEXT: eor x10, x1, x8
-; CHECK-NEXT: subs x18, x9, x8
-; CHECK-NEXT: eor x9, x2, x8
-; CHECK-NEXT: sbcs x0, x10, x8
-; CHECK-NEXT: eor x10, x3, x8
-; CHECK-NEXT: sbcs x1, x9, x8
-; CHECK-NEXT: clz x9, x18
-; CHECK-NEXT: clz x11, x0
-; CHECK-NEXT: sbcs x5, x10, x8
-; CHECK-NEXT: clz x10, x1
-; CHECK-NEXT: add x9, x9, #64
-; CHECK-NEXT: add x10, x10, #64
-; CHECK-NEXT: clz x12, x5
-; CHECK-NEXT: orr x14, x0, x5
-; CHECK-NEXT: csel x10, x12, x10, ne
-; CHECK-NEXT: cmp x0, #0
-; CHECK-NEXT: orr x12, x1, x5
-; CHECK-NEXT: csel x9, x11, x9, ne
-; CHECK-NEXT: cmp x12, #0
-; CHECK-NEXT: mov w11, #253 // =0xfd
-; CHECK-NEXT: add x9, x9, #128
-; CHECK-NEXT: csel x9, x10, x9, ne
-; CHECK-NEXT: subs x9, x11, x9
-; CHECK-NEXT: ngcs x10, xzr
-; CHECK-NEXT: ngcs x11, xzr
-; CHECK-NEXT: ngc x12, xzr
-; CHECK-NEXT: cmp x13, x9
-; CHECK-NEXT: orr x13, x18, x1
-; CHECK-NEXT: ngcs xzr, x10
-; CHECK-NEXT: orr x13, x13, x14
-; CHECK-NEXT: ngcs xzr, x11
-; CHECK-NEXT: ngcs xzr, x12
-; CHECK-NEXT: ccmp x13, #0, #4, hs
-; CHECK-NEXT: csel x13, xzr, x5, eq
-; CHECK-NEXT: csel x15, xzr, x1, eq
-; CHECK-NEXT: csel x2, xzr, x0, eq
-; CHECK-NEXT: csel x14, xzr, x18, eq
-; CHECK-NEXT: b.eq .LBB5_6
-; CHECK-NEXT: // %bb.1: // %_udiv-special-cases
-; CHECK-NEXT: eor x16, x9, #0xff
-; CHECK-NEXT: orr x17, x10, x12
-; CHECK-NEXT: orr x16, x16, x11
-; CHECK-NEXT: orr x16, x16, x17
-; CHECK-NEXT: cbz x16, .LBB5_6
-; CHECK-NEXT: // %bb.2: // %udiv-bb1
-; CHECK-NEXT: sub sp, sp, #192
-; CHECK-NEXT: mov w13, #255 // =0xff
-; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: add x15, sp, #64
-; CHECK-NEXT: sub x13, x13, x9
-; CHECK-NEXT: add x15, x15, #32
-; CHECK-NEXT: stp x26, x25, [sp, #128] // 16-byte Folded Spill
-; CHECK-NEXT: lsr x14, x13, #3
-; CHECK-NEXT: stp x24, x23, [sp, #144] // 16-byte Folded Spill
-; CHECK-NEXT: adds x9, x9, #1
-; CHECK-NEXT: stp x22, x21, [sp, #160] // 16-byte Folded Spill
-; CHECK-NEXT: and x3, x13, #0x3f
-; CHECK-NEXT: adcs x10, x10, xzr
-; CHECK-NEXT: and x14, x14, #0x18
-; CHECK-NEXT: stp x20, x19, [sp, #176] // 16-byte Folded Spill
-; CHECK-NEXT: mvn w4, w13
-; CHECK-NEXT: stp x18, x0, [sp, #96]
-; CHECK-NEXT: sub x14, x15, x14
-; CHECK-NEXT: eor x3, x3, #0x3f
-; CHECK-NEXT: stp x1, x5, [sp, #112]
-; CHECK-NEXT: adcs x11, x11, xzr
-; CHECK-NEXT: stp q0, q0, [sp, #64]
-; CHECK-NEXT: ldp x16, x2, [x14, #8]
-; CHECK-NEXT: ldr x17, [x14, #24]
-; CHECK-NEXT: ldr x14, [x14]
-; CHECK-NEXT: lsl x17, x17, x13
-; CHECK-NEXT: lsr x15, x16, #1
-; CHECK-NEXT: lsr x6, x2, #1
-; CHECK-NEXT: lsr x7, x14, #1
-; CHECK-NEXT: lsl x2, x2, x13
-; CHECK-NEXT: lsl x19, x16, x13
-; CHECK-NEXT: lsr x4, x15, x4
-; CHECK-NEXT: lsr x6, x6, x3
-; CHECK-NEXT: lsr x3, x7, x3
-; CHECK-NEXT: adcs x15, x12, xzr
-; CHECK-NEXT: lsl x12, x14, x13
-; CHECK-NEXT: cset w7, hs
-; CHECK-NEXT: orr x16, x17, x6
-; CHECK-NEXT: orr x14, x2, x4
-; CHECK-NEXT: orr x13, x19, x3
-; CHECK-NEXT: mov x17, xzr
-; CHECK-NEXT: tbnz w7, #0, .LBB5_5
-; CHECK-NEXT: // %bb.3: // %udiv-preheader
-; CHECK-NEXT: lsr x6, x9, #3
-; CHECK-NEXT: stp x18, x0, [sp]
-; CHECK-NEXT: mov x0, sp
-; CHECK-NEXT: stp q0, q0, [sp, #32]
-; CHECK-NEXT: mvn w7, w9
-; CHECK-NEXT: mov x3, xzr
-; CHECK-NEXT: and x18, x6, #0x18
-; CHECK-NEXT: stp x1, x5, [sp, #16]
-; CHECK-NEXT: and x5, x9, #0x3f
-; CHECK-NEXT: add x0, x0, x18
-; CHECK-NEXT: mov w18, #7 // =0x7
-; CHECK-NEXT: eor x5, x5, #0x3f
-; CHECK-NEXT: ldp x1, x6, [x0, #16]
-; CHECK-NEXT: mov x4, xzr
-; CHECK-NEXT: ldp x23, x21, [x0]
-; CHECK-NEXT: subs x0, x18, #1
-; CHECK-NEXT: mov x2, xzr
-; CHECK-NEXT: lsl x20, x1, #1
-; CHECK-NEXT: lsl x19, x6, #1
-; CHECK-NEXT: lsr x22, x1, x9
-; CHECK-NEXT: mov x1, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: lsr x23, x23, x9
-; CHECK-NEXT: lsl x7, x20, x7
-; CHECK-NEXT: lsl x20, x21, #1
-; CHECK-NEXT: lsl x19, x19, x5
-; CHECK-NEXT: lsr x21, x21, x9
-; CHECK-NEXT: lsl x24, x20, x5
-; CHECK-NEXT: adcs x5, xzr, x1
-; CHECK-NEXT: lsr x20, x6, x9
-; CHECK-NEXT: adcs x6, xzr, x1
-; CHECK-NEXT: orr x19, x19, x22
-; CHECK-NEXT: orr x21, x21, x7
-; CHECK-NEXT: orr x22, x24, x23
-; CHECK-NEXT: adc x7, xzr, x1
-; CHECK-NEXT: .LBB5_4: // %udiv-do-while
-; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: extr x23, x22, x16, #63
-; CHECK-NEXT: extr x24, x21, x22, #63
-; CHECK-NEXT: extr x25, x19, x21, #63
-; CHECK-NEXT: extr x20, x20, x19, #63
-; CHECK-NEXT: extr x16, x16, x14, #63
-; CHECK-NEXT: extr x14, x14, x13, #63
-; CHECK-NEXT: cmp x0, x23
-; CHECK-NEXT: sbcs xzr, x5, x24
-; CHECK-NEXT: orr x14, x4, x14
-; CHECK-NEXT: orr x16, x2, x16
-; CHECK-NEXT: sbcs xzr, x6, x25
-; CHECK-NEXT: mov x2, xzr
-; CHECK-NEXT: sbc x19, x7, x20
-; CHECK-NEXT: asr x26, x19, #63
-; CHECK-NEXT: and x19, x26, x18
-; CHECK-NEXT: subs x22, x23, x19
-; CHECK-NEXT: extr x23, x13, x12, #63
-; CHECK-NEXT: orr x12, x17, x12, lsl #1
-; CHECK-NEXT: sbcs x21, x24, xzr
-; CHECK-NEXT: and x17, x26, #0x1
-; CHECK-NEXT: sbcs x19, x25, xzr
-; CHECK-NEXT: orr x13, x3, x23
-; CHECK-NEXT: sbc x20, x20, xzr
-; CHECK-NEXT: subs x9, x9, #1
-; CHECK-NEXT: adcs x10, x10, x1
-; CHECK-NEXT: adcs x11, x11, x1
-; CHECK-NEXT: adc x15, x15, x1
-; CHECK-NEXT: orr x4, x9, x11
-; CHECK-NEXT: orr x3, x10, x15
-; CHECK-NEXT: orr x23, x4, x3
-; CHECK-NEXT: mov x3, xzr
-; CHECK-NEXT: mov x4, xzr
-; CHECK-NEXT: cbnz x23, .LBB5_4
-; CHECK-NEXT: .LBB5_5: // %udiv-loop-exit
-; CHECK-NEXT: ldp x20, x19, [sp, #176] // 16-byte Folded Reload
-; CHECK-NEXT: extr x2, x13, x12, #63
-; CHECK-NEXT: ldp x22, x21, [sp, #160] // 16-byte Folded Reload
-; CHECK-NEXT: extr x15, x14, x13, #63
-; CHECK-NEXT: ldp x24, x23, [sp, #144] // 16-byte Folded Reload
-; CHECK-NEXT: extr x13, x16, x14, #63
-; CHECK-NEXT: ldp x26, x25, [sp, #128] // 16-byte Folded Reload
-; CHECK-NEXT: orr x14, x17, x12, lsl #1
-; CHECK-NEXT: add sp, sp, #192
-; CHECK-NEXT: .LBB5_6: // %udiv-end
-; CHECK-NEXT: eor x9, x14, x8
-; CHECK-NEXT: eor x10, x2, x8
-; CHECK-NEXT: subs x0, x9, x8
-; CHECK-NEXT: eor x9, x15, x8
-; CHECK-NEXT: sbcs x1, x10, x8
-; CHECK-NEXT: eor x10, x13, x8
-; CHECK-NEXT: sbcs x2, x9, x8
-; CHECK-NEXT: sbc x3, x10, x8
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: mov w4, #7 // =0x7
+; CHECK-NEXT: mov x5, xzr
+; CHECK-NEXT: mov x6, xzr
+; CHECK-NEXT: mov x7, xzr
+; CHECK-NEXT: bl __divoi3
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%r = sdiv i256 %a, 7
ret i256 %r
diff --git a/llvm/test/CodeGen/AArch64/shift-i256.ll b/llvm/test/CodeGen/AArch64/shift-i256.ll
index cde8144643575..ade849bc8b2fd 100644
--- a/llvm/test/CodeGen/AArch64/shift-i256.ll
+++ b/llvm/test/CodeGen/AArch64/shift-i256.ll
@@ -5,7 +5,10 @@
define i256 @shl_i256(i256 %a, i256 %amt) nounwind {
; CHECK-LABEL: shl_i256:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #64
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: sub x9, sp, #80
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: lsr x8, x4, #3
; CHECK-NEXT: mov x9, sp
@@ -33,7 +36,8 @@ define i256 @shl_i256(i256 %a, i256 %amt) nounwind {
; CHECK-NEXT: orr x1, x9, x15
; CHECK-NEXT: orr x2, x12, x13
; CHECK-NEXT: orr x3, x8, x11
-; CHECK-NEXT: add sp, sp, #64
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; CHECK-NEXT: ret
%r = shl i256 %a, %amt
ret i256 %r
@@ -43,7 +47,10 @@ define i256 @shl_i256(i256 %a, i256 %amt) nounwind {
define i256 @lshr_i256(i256 %a, i256 %amt) nounwind {
; CHECK-LABEL: lshr_i256:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #64
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: sub x9, sp, #80
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: lsr x8, x4, #3
; CHECK-NEXT: mov x9, sp
@@ -53,7 +60,7 @@ define i256 @lshr_i256(i256 %a, i256 %amt) nounwind {
; CHECK-NEXT: and x8, x8, #0x18
; CHECK-NEXT: stp x0, x1, [sp]
; CHECK-NEXT: eor x11, x11, #0x3f
-; CHECK-NEXT: add x8, x9, x8
+; CHECK-NEXT: orr x8, x9, x8
; CHECK-NEXT: stp q0, q0, [sp, #32]
; CHECK-NEXT: ldp x9, x10, [x8]
; CHECK-NEXT: ldp x12, x8, [x8, #16]
@@ -70,7 +77,8 @@ define i256 @lshr_i256(i256 %a, i256 %amt) nounwind {
; CHECK-NEXT: orr x0, x14, x9
; CHECK-NEXT: orr x1, x10, x8
; CHECK-NEXT: orr x2, x11, x12
-; CHECK-NEXT: add sp, sp, #64
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; CHECK-NEXT: ret
%r = lshr i256 %a, %amt
ret i256 %r
@@ -80,7 +88,10 @@ define i256 @lshr_i256(i256 %a, i256 %amt) nounwind {
define i256 @ashr_i256(i256 %a, i256 %amt) nounwind {
; CHECK-LABEL: ashr_i256:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #64
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: sub x9, sp, #80
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0
; CHECK-NEXT: lsr x9, x4, #3
; CHECK-NEXT: asr x8, x3, #63
; CHECK-NEXT: mov x10, sp
@@ -91,7 +102,7 @@ define i256 @ashr_i256(i256 %a, i256 %amt) nounwind {
; CHECK-NEXT: stp x0, x1, [sp]
; CHECK-NEXT: eor x13, x13, #0x3f
; CHECK-NEXT: stp x8, x8, [sp, #48]
-; CHECK-NEXT: add x9, x10, x9
+; CHECK-NEXT: orr x9, x10, x9
; CHECK-NEXT: stp x8, x8, [sp, #32]
; CHECK-NEXT: ldp x10, x8, [x9, #8]
; CHECK-NEXT: ldr x11, [x9]
@@ -109,7 +120,8 @@ define i256 @ashr_i256(i256 %a, i256 %amt) nounwind {
; CHECK-NEXT: orr x0, x15, x11
; CHECK-NEXT: orr x1, x10, x12
; CHECK-NEXT: orr x2, x13, x8
-; CHECK-NEXT: add sp, sp, #64
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; CHECK-NEXT: ret
%r = ashr i256 %a, %amt
ret i256 %r
@@ -155,3 +167,4 @@ define i256 @ashr_i256_const(i256 %a) nounwind {
%r = ashr i256 %a, 17
ret i256 %r
}
+
diff --git a/llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll
index b02788ab1b34c..f1555c816b36d 100644
--- a/llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -179,7 +179,10 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; ALL-LABEL: lshr_32bytes:
; ALL: // %bb.0:
-; ALL-NEXT: sub sp, sp, #64
+; ALL-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; ALL-NEXT: sub x9, sp, #80
+; ALL-NEXT: mov x29, sp
+; ALL-NEXT: and sp, x9, #0xffffffffffffffe0
; ALL-NEXT: ldp x9, x8, [x0, #16]
; ALL-NEXT: movi v0.2d, #0000000000000000
; ALL-NEXT: ldr x10, [x1]
@@ -188,7 +191,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; ALL-NEXT: mov x8, sp
; ALL-NEXT: and x9, x10, #0x18
; ALL-NEXT: str q1, [sp]
-; ALL-NEXT: add x8, x8, x9
+; ALL-NEXT: orr x8, x8, x9
; ALL-NEXT: lsl x9, x10, #3
; ALL-NEXT: stp q0, q0, [sp, #32]
; ALL-NEXT: ldp x11, x10, [x8, #16]
@@ -196,21 +199,22 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; ALL-NEXT: ldp x8, x12, [x8]
; ALL-NEXT: and x9, x9, #0x38
; ALL-NEXT: lsl x14, x10, #1
-; ALL-NEXT: lsl x15, x11, #1
+; ALL-NEXT: lsl x16, x11, #1
; ALL-NEXT: lsr x11, x11, x9
-; ALL-NEXT: lsl x16, x12, #1
+; ALL-NEXT: lsl x15, x12, #1
; ALL-NEXT: lsr x10, x10, x9
-; ALL-NEXT: lsr x12, x12, x9
-; ALL-NEXT: lsl x14, x14, x13
; ALL-NEXT: lsr x8, x8, x9
-; ALL-NEXT: lsl x9, x16, x13
+; ALL-NEXT: lsl x14, x14, x13
+; ALL-NEXT: lsr x9, x12, x9
+; ALL-NEXT: lsl x12, x16, x13
; ALL-NEXT: lsl x13, x15, x13
; ALL-NEXT: orr x11, x14, x11
-; ALL-NEXT: orr x8, x9, x8
-; ALL-NEXT: orr x9, x12, x13
+; ALL-NEXT: orr x9, x9, x12
+; ALL-NEXT: orr x8, x13, x8
; ALL-NEXT: stp x11, x10, [x2, #16]
; ALL-NEXT: stp x8, x9, [x2]
-; ALL-NEXT: add sp, sp, #64
+; ALL-NEXT: mov sp, x29
+; ALL-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; ALL-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%byteOff = load i256, ptr %byteOff.ptr, align 1
@@ -223,22 +227,25 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
; ALL-LABEL: lshr_32bytes_dwordOff:
; ALL: // %bb.0:
-; ALL-NEXT: sub sp, sp, #64
+; ALL-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; ALL-NEXT: sub x9, sp, #80
+; ALL-NEXT: mov x29, sp
+; ALL-NEXT: and sp, x9, #0xffffffffffffffe0
; ALL-NEXT: ldp x9, x8, [x0, #16]
; ALL-NEXT: movi v0.2d, #0000000000000000
; ALL-NEXT: ldr x10, [x1]
; ALL-NEXT: ldr q1, [x0]
; ALL-NEXT: stp x9, x8, [sp, #16]
-; ALL-NEXT: ubfiz x8, x10, #3, #2
-; ALL-NEXT: mov x9, sp
+; ALL-NEXT: mov x8, sp
+; ALL-NEXT: bfi x8, x10, #3, #2
; ALL-NEXT: str q1, [sp]
; ALL-NEXT: stp q0, q0, [sp, #32]
-; ALL-NEXT: add x8, x9, x8
-; ALL-NEXT: ldp x10, x9, [x8, #16]
+; ALL-NEXT: ldp x9, x10, [x8, #16]
; ALL-NEXT: ldr q0, [x8]
; ALL-NEXT: str q0, [x2]
-; ALL-NEXT: stp x10, x9, [x2, #16]
-; ALL-NEXT: add sp, sp, #64
+; ALL-NEXT: stp x9, x10, [x2, #16]
+; ALL-NEXT: mov sp, x29
+; ALL-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; ALL-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%dwordOff = load i256, ptr %dwordOff.ptr, align 1
@@ -251,7 +258,10 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; ALL-LABEL: shl_32bytes:
; ALL: // %bb.0:
-; ALL-NEXT: sub sp, sp, #64
+; ALL-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; ALL-NEXT: sub x9, sp, #80
+; ALL-NEXT: mov x29, sp
+; ALL-NEXT: and sp, x9, #0xffffffffffffffe0
; ALL-NEXT: ldp x9, x8, [x0, #16]
; ALL-NEXT: movi v0.2d, #0000000000000000
; ALL-NEXT: ldr x10, [x1]
@@ -283,7 +293,8 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; ALL-NEXT: orr x9, x12, x13
; ALL-NEXT: stp x10, x11, [x2]
; ALL-NEXT: stp x9, x8, [x2, #16]
-; ALL-NEXT: add sp, sp, #64
+; ALL-NEXT: mov sp, x29
+; ALL-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; ALL-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%byteOff = load i256, ptr %byteOff.ptr, align 1
@@ -296,7 +307,10 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
; ALL-LABEL: shl_32bytes_dwordOff:
; ALL: // %bb.0:
-; ALL-NEXT: sub sp, sp, #64
+; ALL-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; ALL-NEXT: sub x9, sp, #80
+; ALL-NEXT: mov x29, sp
+; ALL-NEXT: and sp, x9, #0xffffffffffffffe0
; ALL-NEXT: ldp x9, x8, [x0, #16]
; ALL-NEXT: movi v0.2d, #0000000000000000
; ALL-NEXT: ldr x10, [x1]
@@ -312,7 +326,8 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; ALL-NEXT: ldr q0, [x8]
; ALL-NEXT: str q0, [x2]
; ALL-NEXT: stp x9, x10, [x2, #16]
-; ALL-NEXT: add sp, sp, #64
+; ALL-NEXT: mov sp, x29
+; ALL-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; ALL-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%dwordOff = load i256, ptr %dwordOff.ptr, align 1
@@ -325,7 +340,10 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; ALL-LABEL: ashr_32bytes:
; ALL: // %bb.0:
-; ALL-NEXT: sub sp, sp, #64
+; ALL-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; ALL-NEXT: sub x9, sp, #80
+; ALL-NEXT: mov x29, sp
+; ALL-NEXT: and sp, x9, #0xffffffffffffffe0
; ALL-NEXT: ldp x9, x8, [x0, #16]
; ALL-NEXT: ldr x10, [x1]
; ALL-NEXT: ldr q0, [x0]
@@ -334,7 +352,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; ALL-NEXT: asr x8, x8, #63
; ALL-NEXT: mov x9, sp
; ALL-NEXT: str q0, [sp]
-; ALL-NEXT: add x9, x9, x11
+; ALL-NEXT: orr x9, x9, x11
; ALL-NEXT: stp x8, x8, [sp, #48]
; ALL-NEXT: stp x8, x8, [sp, #32]
; ALL-NEXT: lsl x8, x10, #3
@@ -343,21 +361,22 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; ALL-NEXT: mvn w13, w8
; ALL-NEXT: and x8, x8, #0x38
; ALL-NEXT: lsl x14, x10, #1
-; ALL-NEXT: lsl x15, x11, #1
+; ALL-NEXT: lsl x16, x11, #1
; ALL-NEXT: lsr x11, x11, x8
-; ALL-NEXT: lsl x16, x12, #1
+; ALL-NEXT: lsl x15, x12, #1
; ALL-NEXT: asr x10, x10, x8
-; ALL-NEXT: lsr x12, x12, x8
+; ALL-NEXT: lsr x9, x9, x8
; ALL-NEXT: lsl x14, x14, x13
-; ALL-NEXT: lsr x8, x9, x8
-; ALL-NEXT: lsl x9, x16, x13
+; ALL-NEXT: lsr x8, x12, x8
+; ALL-NEXT: lsl x12, x16, x13
; ALL-NEXT: lsl x13, x15, x13
; ALL-NEXT: orr x11, x14, x11
-; ALL-NEXT: orr x8, x9, x8
-; ALL-NEXT: orr x9, x12, x13
+; ALL-NEXT: orr x8, x8, x12
+; ALL-NEXT: orr x9, x13, x9
; ALL-NEXT: stp x11, x10, [x2, #16]
-; ALL-NEXT: stp x8, x9, [x2]
-; ALL-NEXT: add sp, sp, #64
+; ALL-NEXT: stp x9, x8, [x2]
+; ALL-NEXT: mov sp, x29
+; ALL-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; ALL-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%byteOff = load i256, ptr %byteOff.ptr, align 1
@@ -370,23 +389,26 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
; ALL-LABEL: ashr_32bytes_dwordOff:
; ALL: // %bb.0:
-; ALL-NEXT: sub sp, sp, #64
+; ALL-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; ALL-NEXT: sub x9, sp, #80
+; ALL-NEXT: mov x29, sp
+; ALL-NEXT: and sp, x9, #0xffffffffffffffe0
; ALL-NEXT: ldp x9, x8, [x0, #16]
; ALL-NEXT: ldr x10, [x1]
; ALL-NEXT: ldr q0, [x0]
; ALL-NEXT: stp x9, x8, [sp, #16]
; ALL-NEXT: asr x8, x8, #63
-; ALL-NEXT: ubfiz x9, x10, #3, #2
-; ALL-NEXT: mov x10, sp
+; ALL-NEXT: mov x9, sp
+; ALL-NEXT: bfi x9, x10, #3, #2
; ALL-NEXT: str q0, [sp]
; ALL-NEXT: stp x8, x8, [sp, #48]
; ALL-NEXT: stp x8, x8, [sp, #32]
-; ALL-NEXT: add x8, x10, x9
-; ALL-NEXT: ldp x10, x9, [x8, #16]
-; ALL-NEXT: ldr q0, [x8]
+; ALL-NEXT: ldp x8, x10, [x9, #16]
+; ALL-NEXT: ldr q0, [x9]
; ALL-NEXT: str q0, [x2]
-; ALL-NEXT: stp x10, x9, [x2, #16]
-; ALL-NEXT: add sp, sp, #64
+; ALL-NEXT: stp x8, x10, [x2, #16]
+; ALL-NEXT: mov sp, x29
+; ALL-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; ALL-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%dwordOff = load i256, ptr %dwordOff.ptr, align 1
diff --git a/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll
index 92fd4fe30980c..609cd8909bf5f 100644
--- a/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll
@@ -154,7 +154,10 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; ALL-LABEL: lshr_32bytes:
; ALL: // %bb.0:
-; ALL-NEXT: sub sp, sp, #64
+; ALL-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; ALL-NEXT: sub x9, sp, #80
+; ALL-NEXT: mov x29, sp
+; ALL-NEXT: and sp, x9, #0xffffffffffffffe0
; ALL-NEXT: ldp x9, x8, [x0, #16]
; ALL-NEXT: movi v0.2d, #0000000000000000
; ALL-NEXT: ldr x10, [x1]
@@ -163,30 +166,31 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; ALL-NEXT: lsr x8, x10, #3
; ALL-NEXT: mov x9, sp
; ALL-NEXT: str q1, [sp]
-; ALL-NEXT: and x12, x10, #0x3f
+; ALL-NEXT: and x13, x10, #0x3f
+; ALL-NEXT: mvn w14, w10
; ALL-NEXT: and x8, x8, #0x18
; ALL-NEXT: stp q0, q0, [sp, #32]
-; ALL-NEXT: eor x12, x12, #0x3f
-; ALL-NEXT: add x8, x9, x8
-; ALL-NEXT: ldp x13, x9, [x8]
-; ALL-NEXT: ldp x8, x11, [x8, #16]
+; ALL-NEXT: eor x13, x13, #0x3f
+; ALL-NEXT: orr x8, x9, x8
+; ALL-NEXT: ldp x11, x9, [x8, #16]
+; ALL-NEXT: ldp x12, x8, [x8]
; ALL-NEXT: lsl x15, x9, #1
-; ALL-NEXT: lsr x9, x9, x10
-; ALL-NEXT: lsr x13, x13, x10
-; ALL-NEXT: lsl x14, x11, #1
+; ALL-NEXT: lsl x17, x11, #1
; ALL-NEXT: lsr x11, x11, x10
-; ALL-NEXT: lsl x14, x14, x12
-; ALL-NEXT: lsl x12, x15, x12
-; ALL-NEXT: lsl x15, x8, #1
+; ALL-NEXT: lsl x16, x8, #1
+; ALL-NEXT: lsr x9, x9, x10
+; ALL-NEXT: lsr x12, x12, x10
+; ALL-NEXT: lsl x15, x15, x13
; ALL-NEXT: lsr x8, x8, x10
-; ALL-NEXT: mvn w10, w10
-; ALL-NEXT: lsl x10, x15, x10
-; ALL-NEXT: orr x8, x14, x8
-; ALL-NEXT: stp x8, x11, [x2, #16]
-; ALL-NEXT: orr x11, x12, x13
-; ALL-NEXT: orr x8, x9, x10
-; ALL-NEXT: stp x11, x8, [x2]
-; ALL-NEXT: add sp, sp, #64
+; ALL-NEXT: lsl x10, x17, x14
+; ALL-NEXT: lsl x13, x16, x13
+; ALL-NEXT: orr x11, x15, x11
+; ALL-NEXT: orr x8, x8, x10
+; ALL-NEXT: stp x11, x9, [x2, #16]
+; ALL-NEXT: orr x9, x13, x12
+; ALL-NEXT: stp x9, x8, [x2]
+; ALL-NEXT: mov sp, x29
+; ALL-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; ALL-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%bitOff = load i256, ptr %bitOff.ptr, align 1
@@ -197,7 +201,10 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; ALL-LABEL: shl_32bytes:
; ALL: // %bb.0:
-; ALL-NEXT: sub sp, sp, #64
+; ALL-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; ALL-NEXT: sub x9, sp, #80
+; ALL-NEXT: mov x29, sp
+; ALL-NEXT: and sp, x9, #0xffffffffffffffe0
; ALL-NEXT: ldp x9, x8, [x0, #16]
; ALL-NEXT: movi v0.2d, #0000000000000000
; ALL-NEXT: ldr x10, [x1]
@@ -230,7 +237,8 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; ALL-NEXT: orr x11, x13, x14
; ALL-NEXT: orr x8, x9, x10
; ALL-NEXT: stp x8, x11, [x2, #16]
-; ALL-NEXT: add sp, sp, #64
+; ALL-NEXT: mov sp, x29
+; ALL-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; ALL-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%bitOff = load i256, ptr %bitOff.ptr, align 1
@@ -241,7 +249,10 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; ALL-LABEL: ashr_32bytes:
; ALL: // %bb.0:
-; ALL-NEXT: sub sp, sp, #64
+; ALL-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; ALL-NEXT: sub x9, sp, #80
+; ALL-NEXT: mov x29, sp
+; ALL-NEXT: and sp, x9, #0xffffffffffffffe0
; ALL-NEXT: ldp x9, x8, [x0, #16]
; ALL-NEXT: mov x11, sp
; ALL-NEXT: ldr x10, [x1]
@@ -250,31 +261,32 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; ALL-NEXT: lsr x9, x10, #3
; ALL-NEXT: asr x8, x8, #63
; ALL-NEXT: str q0, [sp]
-; ALL-NEXT: and x12, x10, #0x3f
+; ALL-NEXT: and x13, x10, #0x3f
+; ALL-NEXT: mvn w14, w10
; ALL-NEXT: and x9, x9, #0x18
; ALL-NEXT: stp x8, x8, [sp, #48]
-; ALL-NEXT: eor x12, x12, #0x3f
+; ALL-NEXT: eor x13, x13, #0x3f
; ALL-NEXT: stp x8, x8, [sp, #32]
-; ALL-NEXT: add x8, x11, x9
-; ALL-NEXT: ldp x13, x9, [x8]
-; ALL-NEXT: ldp x8, x11, [x8, #16]
+; ALL-NEXT: orr x8, x11, x9
+; ALL-NEXT: ldp x11, x9, [x8, #16]
+; ALL-NEXT: ldp x12, x8, [x8]
; ALL-NEXT: lsl x15, x9, #1
-; ALL-NEXT: lsr x9, x9, x10
-; ALL-NEXT: lsr x13, x13, x10
-; ALL-NEXT: lsl x14, x11, #1
-; ALL-NEXT: asr x11, x11, x10
-; ALL-NEXT: lsl x14, x14, x12
-; ALL-NEXT: lsl x12, x15, x12
-; ALL-NEXT: lsl x15, x8, #1
+; ALL-NEXT: lsl x17, x11, #1
+; ALL-NEXT: lsr x11, x11, x10
+; ALL-NEXT: lsl x16, x8, #1
+; ALL-NEXT: asr x9, x9, x10
+; ALL-NEXT: lsr x12, x12, x10
+; ALL-NEXT: lsl x15, x15, x13
; ALL-NEXT: lsr x8, x8, x10
-; ALL-NEXT: mvn w10, w10
-; ALL-NEXT: lsl x10, x15, x10
-; ALL-NEXT: orr x8, x14, x8
-; ALL-NEXT: stp x8, x11, [x2, #16]
-; ALL-NEXT: orr x11, x12, x13
-; ALL-NEXT: orr x8, x9, x10
-; ALL-NEXT: stp x11, x8, [x2]
-; ALL-NEXT: add sp, sp, #64
+; ALL-NEXT: lsl x10, x17, x14
+; ALL-NEXT: lsl x13, x16, x13
+; ALL-NEXT: orr x11, x15, x11
+; ALL-NEXT: orr x8, x8, x10
+; ALL-NEXT: stp x11, x9, [x2, #16]
+; ALL-NEXT: orr x9, x13, x12
+; ALL-NEXT: stp x9, x8, [x2]
+; ALL-NEXT: mov sp, x29
+; ALL-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; ALL-NEXT: ret
%src = load i256, ptr %src.ptr, align 1
%bitOff = load i256, ptr %bitOff.ptr, align 1
diff --git a/llvm/test/CodeGen/RISCV/i256-arith.ll b/llvm/test/CodeGen/RISCV/i256-arith.ll
new file mode 100644
index 0000000000000..45da20f332b84
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/i256-arith.ll
@@ -0,0 +1,1442 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=riscv32 | FileCheck %s --check-prefix=RV32
+; RUN: llc < %s -mtriple=riscv64 | FileCheck %s --check-prefix=RV64
+
+; i256 add
+define i256 @add_i256(i256 %a, i256 %b) nounwind {
+; RV32-LABEL: add_i256:
+; RV32: # %bb.0:
+; RV32-NEXT: lw a3, 0(a2)
+; RV32-NEXT: lw a4, 4(a2)
+; RV32-NEXT: lw a6, 8(a2)
+; RV32-NEXT: lw a5, 12(a2)
+; RV32-NEXT: lw t2, 4(a1)
+; RV32-NEXT: lw a7, 0(a1)
+; RV32-NEXT: lw t1, 8(a1)
+; RV32-NEXT: lw t0, 12(a1)
+; RV32-NEXT: add a4, t2, a4
+; RV32-NEXT: add a3, a7, a3
+; RV32-NEXT: sltu a7, a3, a7
+; RV32-NEXT: add a4, a4, a7
+; RV32-NEXT: beq a4, t2, .LBB0_2
+; RV32-NEXT: # %bb.1:
+; RV32-NEXT: sltu a7, a4, t2
+; RV32-NEXT: .LBB0_2:
+; RV32-NEXT: add a6, t1, a6
+; RV32-NEXT: add t2, t0, a5
+; RV32-NEXT: add a5, a6, a7
+; RV32-NEXT: sltu t3, a6, t1
+; RV32-NEXT: sltu a6, a5, a6
+; RV32-NEXT: add t2, t2, t3
+; RV32-NEXT: add a6, t2, a6
+; RV32-NEXT: beq a6, t0, .LBB0_4
+; RV32-NEXT: # %bb.3:
+; RV32-NEXT: sltu t3, a6, t0
+; RV32-NEXT: j .LBB0_5
+; RV32-NEXT: .LBB0_4:
+; RV32-NEXT: sltu t3, a5, t1
+; RV32-NEXT: .LBB0_5:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: lw s0, 16(a2)
+; RV32-NEXT: lw t6, 20(a2)
+; RV32-NEXT: lw t5, 16(a1)
+; RV32-NEXT: lw t2, 20(a1)
+; RV32-NEXT: xor t1, a5, t1
+; RV32-NEXT: xor t0, a6, t0
+; RV32-NEXT: or t0, t1, t0
+; RV32-NEXT: beqz t0, .LBB0_7
+; RV32-NEXT: # %bb.6:
+; RV32-NEXT: mv a7, t3
+; RV32-NEXT: .LBB0_7:
+; RV32-NEXT: lw t4, 24(a2)
+; RV32-NEXT: lw t0, 28(a2)
+; RV32-NEXT: lw a2, 24(a1)
+; RV32-NEXT: lw t3, 28(a1)
+; RV32-NEXT: add s0, t5, s0
+; RV32-NEXT: add t6, t2, t6
+; RV32-NEXT: add a7, s0, a7
+; RV32-NEXT: sltu t1, s0, t5
+; RV32-NEXT: sltu t5, a7, s0
+; RV32-NEXT: add t6, t6, t1
+; RV32-NEXT: add a1, t6, t5
+; RV32-NEXT: sltu s0, a1, t6
+; RV32-NEXT: and t5, t5, s0
+; RV32-NEXT: beq t6, t2, .LBB0_9
+; RV32-NEXT: # %bb.8:
+; RV32-NEXT: sltu t1, t6, t2
+; RV32-NEXT: .LBB0_9:
+; RV32-NEXT: add t4, a2, t4
+; RV32-NEXT: add t0, t3, t0
+; RV32-NEXT: sw a3, 0(a0)
+; RV32-NEXT: sw a4, 4(a0)
+; RV32-NEXT: sw a5, 8(a0)
+; RV32-NEXT: sw a6, 12(a0)
+; RV32-NEXT: add t1, t4, t1
+; RV32-NEXT: sltu a2, t4, a2
+; RV32-NEXT: add t5, t1, t5
+; RV32-NEXT: sltu a3, t1, t4
+; RV32-NEXT: add a2, t0, a2
+; RV32-NEXT: sltu a4, t5, t1
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, a2, a4
+; RV32-NEXT: sw a7, 16(a0)
+; RV32-NEXT: sw a1, 20(a0)
+; RV32-NEXT: sw t5, 24(a0)
+; RV32-NEXT: sw a2, 28(a0)
+; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: add_i256:
+; RV64: # %bb.0:
+; RV64-NEXT: ld a5, 0(a2)
+; RV64-NEXT: ld t0, 8(a2)
+; RV64-NEXT: ld a4, 16(a2)
+; RV64-NEXT: ld a2, 24(a2)
+; RV64-NEXT: ld a7, 8(a1)
+; RV64-NEXT: ld t1, 0(a1)
+; RV64-NEXT: ld a3, 16(a1)
+; RV64-NEXT: ld a6, 24(a1)
+; RV64-NEXT: add t2, a7, t0
+; RV64-NEXT: add a1, t1, a5
+; RV64-NEXT: sltu t0, a1, t1
+; RV64-NEXT: add a5, t2, t0
+; RV64-NEXT: beq a5, a7, .LBB0_2
+; RV64-NEXT: # %bb.1:
+; RV64-NEXT: sltu t0, a5, a7
+; RV64-NEXT: .LBB0_2:
+; RV64-NEXT: add a4, a3, a4
+; RV64-NEXT: add a2, a6, a2
+; RV64-NEXT: add t0, a4, t0
+; RV64-NEXT: sltu a3, a4, a3
+; RV64-NEXT: sltu a4, t0, a4
+; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: add a2, a2, a4
+; RV64-NEXT: sd a1, 0(a0)
+; RV64-NEXT: sd a5, 8(a0)
+; RV64-NEXT: sd t0, 16(a0)
+; RV64-NEXT: sd a2, 24(a0)
+; RV64-NEXT: ret
+ %r = add i256 %a, %b
+ ret i256 %r
+}
+
+; i256 sub
+define i256 @sub_i256(i256 %a, i256 %b) nounwind {
+; RV32-LABEL: sub_i256:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -48
+; RV32-NEXT: sw s0, 44(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s1, 40(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s2, 36(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s3, 32(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s4, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s5, 24(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s6, 20(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s7, 16(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s8, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: lw a3, 0(a2)
+; RV32-NEXT: lw a4, 4(a2)
+; RV32-NEXT: lw a7, 8(a2)
+; RV32-NEXT: lw t3, 12(a2)
+; RV32-NEXT: lw a5, 0(a1)
+; RV32-NEXT: lw t2, 8(a1)
+; RV32-NEXT: lw t5, 12(a1)
+; RV32-NEXT: lw t0, 4(a1)
+; RV32-NEXT: sltu a6, t2, a7
+; RV32-NEXT: mv s2, a6
+; RV32-NEXT: beq t5, t3, .LBB1_2
+; RV32-NEXT: # %bb.1:
+; RV32-NEXT: sltu s2, t5, t3
+; RV32-NEXT: .LBB1_2:
+; RV32-NEXT: sltu t1, a5, a3
+; RV32-NEXT: mv t4, t1
+; RV32-NEXT: beq t0, a4, .LBB1_4
+; RV32-NEXT: # %bb.3:
+; RV32-NEXT: sltu t4, t0, a4
+; RV32-NEXT: .LBB1_4:
+; RV32-NEXT: lw s7, 16(a2)
+; RV32-NEXT: lw s0, 20(a2)
+; RV32-NEXT: lw s8, 16(a1)
+; RV32-NEXT: lw s1, 20(a1)
+; RV32-NEXT: xor t6, t5, t3
+; RV32-NEXT: xor s3, t2, a7
+; RV32-NEXT: or s3, s3, t6
+; RV32-NEXT: mv t6, t4
+; RV32-NEXT: beqz s3, .LBB1_6
+; RV32-NEXT: # %bb.5:
+; RV32-NEXT: mv t6, s2
+; RV32-NEXT: .LBB1_6:
+; RV32-NEXT: lw s4, 24(a2)
+; RV32-NEXT: lw s3, 28(a2)
+; RV32-NEXT: lw s6, 24(a1)
+; RV32-NEXT: lw s5, 28(a1)
+; RV32-NEXT: sub s2, s8, s7
+; RV32-NEXT: sltu a2, s8, s7
+; RV32-NEXT: sub a1, s1, s0
+; RV32-NEXT: sltu s7, s2, t6
+; RV32-NEXT: sub s8, a1, a2
+; RV32-NEXT: snez a1, s8
+; RV32-NEXT: addi a1, a1, -1
+; RV32-NEXT: and a1, a1, s7
+; RV32-NEXT: beq s1, s0, .LBB1_8
+; RV32-NEXT: # %bb.7:
+; RV32-NEXT: sltu a2, s1, s0
+; RV32-NEXT: .LBB1_8:
+; RV32-NEXT: sub s0, s6, s4
+; RV32-NEXT: sltu s1, s6, s4
+; RV32-NEXT: sub s3, s5, s3
+; RV32-NEXT: sub s4, s8, s7
+; RV32-NEXT: sub t6, s2, t6
+; RV32-NEXT: sub t3, t5, t3
+; RV32-NEXT: sub a7, t2, a7
+; RV32-NEXT: sub a4, t0, a4
+; RV32-NEXT: sub a5, a5, a3
+; RV32-NEXT: sub a3, s0, a2
+; RV32-NEXT: sub t0, s3, s1
+; RV32-NEXT: sltu a2, s0, a2
+; RV32-NEXT: sub a6, t3, a6
+; RV32-NEXT: sltu t2, a7, t4
+; RV32-NEXT: sub a7, a7, t4
+; RV32-NEXT: sub a4, a4, t1
+; RV32-NEXT: sltu t1, a3, a1
+; RV32-NEXT: sub a2, t0, a2
+; RV32-NEXT: sub a3, a3, a1
+; RV32-NEXT: sub a1, a6, t2
+; RV32-NEXT: sub a2, a2, t1
+; RV32-NEXT: sw a5, 0(a0)
+; RV32-NEXT: sw a4, 4(a0)
+; RV32-NEXT: sw a7, 8(a0)
+; RV32-NEXT: sw a1, 12(a0)
+; RV32-NEXT: sw t6, 16(a0)
+; RV32-NEXT: sw s4, 20(a0)
+; RV32-NEXT: sw a3, 24(a0)
+; RV32-NEXT: sw a2, 28(a0)
+; RV32-NEXT: lw s0, 44(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s1, 40(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s2, 36(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s3, 32(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s4, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s5, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s6, 20(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s7, 16(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s8, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 48
+; RV32-NEXT: ret
+;
+; RV64-LABEL: sub_i256:
+; RV64: # %bb.0:
+; RV64-NEXT: ld a3, 0(a2)
+; RV64-NEXT: ld a4, 8(a2)
+; RV64-NEXT: ld a5, 16(a2)
+; RV64-NEXT: ld a6, 24(a2)
+; RV64-NEXT: ld t0, 16(a1)
+; RV64-NEXT: ld a7, 24(a1)
+; RV64-NEXT: ld a2, 0(a1)
+; RV64-NEXT: ld a1, 8(a1)
+; RV64-NEXT: sltu t1, t0, a5
+; RV64-NEXT: sub a6, a7, a6
+; RV64-NEXT: sltu a7, a2, a3
+; RV64-NEXT: sub a6, a6, t1
+; RV64-NEXT: mv t1, a7
+; RV64-NEXT: beq a1, a4, .LBB1_2
+; RV64-NEXT: # %bb.1:
+; RV64-NEXT: sltu t1, a1, a4
+; RV64-NEXT: .LBB1_2:
+; RV64-NEXT: sub a5, t0, a5
+; RV64-NEXT: sub a1, a1, a4
+; RV64-NEXT: sub a2, a2, a3
+; RV64-NEXT: sltu a3, a5, t1
+; RV64-NEXT: sub a4, a5, t1
+; RV64-NEXT: sub a1, a1, a7
+; RV64-NEXT: sub a3, a6, a3
+; RV64-NEXT: sd a2, 0(a0)
+; RV64-NEXT: sd a1, 8(a0)
+; RV64-NEXT: sd a4, 16(a0)
+; RV64-NEXT: sd a3, 24(a0)
+; RV64-NEXT: ret
+ %r = sub i256 %a, %b
+ ret i256 %r
+}
+
+; i256 shift left by constant
+define i256 @shl_i256_const(i256 %a) nounwind {
+; RV32-LABEL: shl_i256_const:
+; RV32: # %bb.0:
+; RV32-NEXT: lw a2, 0(a1)
+; RV32-NEXT: lw a3, 4(a1)
+; RV32-NEXT: lw a4, 8(a1)
+; RV32-NEXT: lw a5, 12(a1)
+; RV32-NEXT: lw a6, 16(a1)
+; RV32-NEXT: lw a1, 20(a1)
+; RV32-NEXT: sw a4, 16(a0)
+; RV32-NEXT: sw a5, 20(a0)
+; RV32-NEXT: sw a6, 24(a0)
+; RV32-NEXT: sw a1, 28(a0)
+; RV32-NEXT: sw zero, 0(a0)
+; RV32-NEXT: sw zero, 4(a0)
+; RV32-NEXT: sw a2, 8(a0)
+; RV32-NEXT: sw a3, 12(a0)
+; RV32-NEXT: ret
+;
+; RV64-LABEL: shl_i256_const:
+; RV64: # %bb.0:
+; RV64-NEXT: ld a2, 0(a1)
+; RV64-NEXT: ld a3, 8(a1)
+; RV64-NEXT: ld a1, 16(a1)
+; RV64-NEXT: sd zero, 0(a0)
+; RV64-NEXT: sd a2, 8(a0)
+; RV64-NEXT: sd a3, 16(a0)
+; RV64-NEXT: sd a1, 24(a0)
+; RV64-NEXT: ret
+ %r = shl i256 %a, 64
+ ret i256 %r
+}
+
+; i256 shift left by variable
+define i256 @shl_i256(i256 %a, i256 %b) nounwind {
+; RV32-LABEL: shl_i256:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -80
+; RV32-NEXT: sw s0, 76(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s1, 72(sp) # 4-byte Folded Spill
+; RV32-NEXT: lw a2, 0(a2)
+; RV32-NEXT: lw a3, 0(a1)
+; RV32-NEXT: lw a4, 4(a1)
+; RV32-NEXT: lw a5, 8(a1)
+; RV32-NEXT: lw a6, 12(a1)
+; RV32-NEXT: lw a7, 16(a1)
+; RV32-NEXT: lw t0, 20(a1)
+; RV32-NEXT: lw t1, 24(a1)
+; RV32-NEXT: lw a1, 28(a1)
+; RV32-NEXT: sw zero, 24(sp)
+; RV32-NEXT: sw zero, 28(sp)
+; RV32-NEXT: sw zero, 32(sp)
+; RV32-NEXT: sw zero, 36(sp)
+; RV32-NEXT: sw zero, 8(sp)
+; RV32-NEXT: sw zero, 12(sp)
+; RV32-NEXT: sw zero, 16(sp)
+; RV32-NEXT: sw zero, 20(sp)
+; RV32-NEXT: addi t2, sp, 40
+; RV32-NEXT: sw a7, 56(sp)
+; RV32-NEXT: sw t0, 60(sp)
+; RV32-NEXT: sw t1, 64(sp)
+; RV32-NEXT: sw a1, 68(sp)
+; RV32-NEXT: sw a3, 40(sp)
+; RV32-NEXT: sw a4, 44(sp)
+; RV32-NEXT: sw a5, 48(sp)
+; RV32-NEXT: sw a6, 52(sp)
+; RV32-NEXT: srli a1, a2, 3
+; RV32-NEXT: andi a3, a2, 31
+; RV32-NEXT: andi a1, a1, 28
+; RV32-NEXT: xori a3, a3, 31
+; RV32-NEXT: sub a1, t2, a1
+; RV32-NEXT: lw a4, 0(a1)
+; RV32-NEXT: lw a5, 4(a1)
+; RV32-NEXT: lw a6, 8(a1)
+; RV32-NEXT: lw a7, 12(a1)
+; RV32-NEXT: lw t0, 16(a1)
+; RV32-NEXT: lw t1, 20(a1)
+; RV32-NEXT: lw t2, 24(a1)
+; RV32-NEXT: lw a1, 28(a1)
+; RV32-NEXT: sll t3, a5, a2
+; RV32-NEXT: srli t4, a4, 1
+; RV32-NEXT: sll t5, a6, a2
+; RV32-NEXT: srli a5, a5, 1
+; RV32-NEXT: sll t6, a7, a2
+; RV32-NEXT: srli a6, a6, 1
+; RV32-NEXT: sll s0, t0, a2
+; RV32-NEXT: srli a7, a7, 1
+; RV32-NEXT: sll s1, t1, a2
+; RV32-NEXT: srli t0, t0, 1
+; RV32-NEXT: sll a1, a1, a2
+; RV32-NEXT: sll a4, a4, a2
+; RV32-NEXT: sll a2, t2, a2
+; RV32-NEXT: srli t1, t1, 1
+; RV32-NEXT: srli t2, t2, 1
+; RV32-NEXT: srl t4, t4, a3
+; RV32-NEXT: srl a5, a5, a3
+; RV32-NEXT: srl a6, a6, a3
+; RV32-NEXT: srl a7, a7, a3
+; RV32-NEXT: srl t0, t0, a3
+; RV32-NEXT: srl t1, t1, a3
+; RV32-NEXT: srl a3, t2, a3
+; RV32-NEXT: or t2, t3, t4
+; RV32-NEXT: or a5, t5, a5
+; RV32-NEXT: or a6, t6, a6
+; RV32-NEXT: or a7, s0, a7
+; RV32-NEXT: or t0, s1, t0
+; RV32-NEXT: or a2, a2, t1
+; RV32-NEXT: or a1, a1, a3
+; RV32-NEXT: sw a7, 16(a0)
+; RV32-NEXT: sw t0, 20(a0)
+; RV32-NEXT: sw a2, 24(a0)
+; RV32-NEXT: sw a1, 28(a0)
+; RV32-NEXT: sw a4, 0(a0)
+; RV32-NEXT: sw t2, 4(a0)
+; RV32-NEXT: sw a5, 8(a0)
+; RV32-NEXT: sw a6, 12(a0)
+; RV32-NEXT: lw s0, 76(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s1, 72(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 80
+; RV32-NEXT: ret
+;
+; RV64-LABEL: shl_i256:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -96
+; RV64-NEXT: sd ra, 88(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 80(sp) # 8-byte Folded Spill
+; RV64-NEXT: addi s0, sp, 96
+; RV64-NEXT: andi sp, sp, -32
+; RV64-NEXT: ld a2, 0(a2)
+; RV64-NEXT: ld a3, 0(a1)
+; RV64-NEXT: ld a4, 8(a1)
+; RV64-NEXT: ld a5, 16(a1)
+; RV64-NEXT: ld a1, 24(a1)
+; RV64-NEXT: sd zero, 0(sp)
+; RV64-NEXT: sd zero, 8(sp)
+; RV64-NEXT: sd zero, 16(sp)
+; RV64-NEXT: sd zero, 24(sp)
+; RV64-NEXT: addi a6, sp, 32
+; RV64-NEXT: sd a3, 32(sp)
+; RV64-NEXT: sd a4, 40(sp)
+; RV64-NEXT: sd a5, 48(sp)
+; RV64-NEXT: sd a1, 56(sp)
+; RV64-NEXT: srli a1, a2, 3
+; RV64-NEXT: andi a3, a2, 63
+; RV64-NEXT: andi a1, a1, 24
+; RV64-NEXT: sub a1, a6, a1
+; RV64-NEXT: ld a4, 0(a1)
+; RV64-NEXT: ld a5, 8(a1)
+; RV64-NEXT: ld a6, 16(a1)
+; RV64-NEXT: ld a1, 24(a1)
+; RV64-NEXT: xori a3, a3, 63
+; RV64-NEXT: sll a7, a5, a2
+; RV64-NEXT: srli t0, a4, 1
+; RV64-NEXT: sll a1, a1, a2
+; RV64-NEXT: sll a4, a4, a2
+; RV64-NEXT: sll a2, a6, a2
+; RV64-NEXT: srli a5, a5, 1
+; RV64-NEXT: srli a6, a6, 1
+; RV64-NEXT: srl t0, t0, a3
+; RV64-NEXT: srl a5, a5, a3
+; RV64-NEXT: srl a3, a6, a3
+; RV64-NEXT: or a6, a7, t0
+; RV64-NEXT: or a2, a2, a5
+; RV64-NEXT: or a1, a1, a3
+; RV64-NEXT: sd a4, 0(a0)
+; RV64-NEXT: sd a6, 8(a0)
+; RV64-NEXT: sd a2, 16(a0)
+; RV64-NEXT: sd a1, 24(a0)
+; RV64-NEXT: addi sp, s0, -96
+; RV64-NEXT: ld ra, 88(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 80(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 96
+; RV64-NEXT: ret
+ %r = shl i256 %a, %b
+ ret i256 %r
+}
+
+; i256 multiply
+define i256 @mul_i256(i256 %a, i256 %b) nounwind {
+; RV32-LABEL: mul_i256:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -240
+; RV32-NEXT: sw ra, 236(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 232(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s1, 228(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s2, 224(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s3, 220(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s4, 216(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s5, 212(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s6, 208(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s7, 204(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s8, 200(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s9, 196(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s10, 192(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s11, 188(sp) # 4-byte Folded Spill
+; RV32-NEXT: lw a3, 16(a2)
+; RV32-NEXT: sw a3, 172(sp) # 4-byte Folded Spill
+; RV32-NEXT: lw a3, 20(a2)
+; RV32-NEXT: sw a3, 152(sp) # 4-byte Folded Spill
+; RV32-NEXT: lw a3, 24(a2)
+; RV32-NEXT: sw a3, 124(sp) # 4-byte Folded Spill
+; RV32-NEXT: lw a3, 28(a2)
+; RV32-NEXT: sw a3, 120(sp) # 4-byte Folded Spill
+; RV32-NEXT: lw a3, 16(a1)
+; RV32-NEXT: sw a3, 164(sp) # 4-byte Folded Spill
+; RV32-NEXT: lw a3, 20(a1)
+; RV32-NEXT: sw a3, 116(sp) # 4-byte Folded Spill
+; RV32-NEXT: lw a3, 24(a1)
+; RV32-NEXT: sw a3, 112(sp) # 4-byte Folded Spill
+; RV32-NEXT: lw a3, 28(a1)
+; RV32-NEXT: sw a3, 108(sp) # 4-byte Folded Spill
+; RV32-NEXT: lw s9, 0(a2)
+; RV32-NEXT: lw s8, 4(a2)
+; RV32-NEXT: lw a3, 8(a2)
+; RV32-NEXT: sw a3, 156(sp) # 4-byte Folded Spill
+; RV32-NEXT: lw a2, 12(a2)
+; RV32-NEXT: sw a2, 176(sp) # 4-byte Folded Spill
+; RV32-NEXT: lw s3, 0(a1)
+; RV32-NEXT: lw s2, 4(a1)
+; RV32-NEXT: lw s6, 8(a1)
+; RV32-NEXT: lw s7, 12(a1)
+; RV32-NEXT: sw a0, 136(sp) # 4-byte Folded Spill
+; RV32-NEXT: mv a0, s3
+; RV32-NEXT: li a1, 0
+; RV32-NEXT: mv a2, s9
+; RV32-NEXT: li a3, 0
+; RV32-NEXT: call __muldi3
+; RV32-NEXT: sw a0, 132(sp) # 4-byte Folded Spill
+; RV32-NEXT: mv s0, a1
+; RV32-NEXT: mv a0, s2
+; RV32-NEXT: li a1, 0
+; RV32-NEXT: mv a2, s9
+; RV32-NEXT: li a3, 0
+; RV32-NEXT: call __muldi3
+; RV32-NEXT: add s0, a0, s0
+; RV32-NEXT: sltu a0, s0, a0
+; RV32-NEXT: add s1, a1, a0
+; RV32-NEXT: sw s3, 180(sp) # 4-byte Folded Spill
+; RV32-NEXT: mv a0, s3
+; RV32-NEXT: li a1, 0
+; RV32-NEXT: mv a2, s8
+; RV32-NEXT: li a3, 0
+; RV32-NEXT: call __muldi3
+; RV32-NEXT: add s0, a0, s0
+; RV32-NEXT: sw s0, 128(sp) # 4-byte Folded Spill
+; RV32-NEXT: sltu a0, s0, a0
+; RV32-NEXT: add a0, a1, a0
+; RV32-NEXT: add s0, s1, a0
+; RV32-NEXT: sw s2, 184(sp) # 4-byte Folded Spill
+; RV32-NEXT: mv a0, s2
+; RV32-NEXT: li a1, 0
+; RV32-NEXT: mv a2, s8
+; RV32-NEXT: li a3, 0
+; RV32-NEXT: call __muldi3
+; RV32-NEXT: add s4, a0, s0
+; RV32-NEXT: sltu a2, s0, s1
+; RV32-NEXT: sltu a0, s4, a0
+; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: add s1, a1, a0
+; RV32-NEXT: mv a0, s6
+; RV32-NEXT: li a1, 0
+; RV32-NEXT: mv a2, s9
+; RV32-NEXT: li a3, 0
+; RV32-NEXT: call __muldi3
+; RV32-NEXT: mv s3, a0
+; RV32-NEXT: mv s5, a1
+; RV32-NEXT: mv a0, s7
+; RV32-NEXT: li a1, 0
+; RV32-NEXT: sw s9, 160(sp) # 4-byte Folded Spill
+; RV32-NEXT: mv a2, s9
+; RV32-NEXT: li a3, 0
+; RV32-NEXT: call __muldi3
+; RV32-NEXT: mv s2, a0
+; RV32-NEXT: mv s0, a1
+; RV32-NEXT: add s5, a0, s5
+; RV32-NEXT: sw s6, 144(sp) # 4-byte Folded Spill
+; RV32-NEXT: mv a0, s6
+; RV32-NEXT: li a1, 0
+; RV32-NEXT: sw s8, 168(sp) # 4-byte Folded Spill
+; RV32-NEXT: mv a2, s8
+; RV32-NEXT: li a3, 0
+; RV32-NEXT: call __muldi3
+; RV32-NEXT: add a2, a0, s5
+; RV32-NEXT: add s4, s3, s4
+; RV32-NEXT: add s1, a2, s1
+; RV32-NEXT: sltu s6, s4, s3
+; RV32-NEXT: add s8, s1, s6
+; RV32-NEXT: beq s8, a2, .LBB4_2
+; RV32-NEXT: # %bb.1:
+; RV32-NEXT: sltu s6, s8, a2
+; RV32-NEXT: .LBB4_2:
+; RV32-NEXT: sltu a3, s5, s2
+; RV32-NEXT: sltu a0, a2, a0
+; RV32-NEXT: add a3, s0, a3
+; RV32-NEXT: add a0, a1, a0
+; RV32-NEXT: sw a3, 140(sp) # 4-byte Folded Spill
+; RV32-NEXT: add s0, a3, a0
+; RV32-NEXT: sw s7, 148(sp) # 4-byte Folded Spill
+; RV32-NEXT: mv a0, s7
+; RV32-NEXT: li a1, 0
+; RV32-NEXT: lw a2, 168(sp) # 4-byte Folded Reload
+; RV32-NEXT: li a3, 0
+; RV32-NEXT: call __muldi3
+; RV32-NEXT: sw a1, 92(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 96(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw a0, 88(sp) # 4-byte Folded Spill
+; RV32-NEXT: add a0, a0, s0
+; RV32-NEXT: sw a0, 84(sp) # 4-byte Folded Spill
+; RV32-NEXT: add s6, a0, s6
+; RV32-NEXT: lw s0, 180(sp) # 4-byte Folded Reload
+; RV32-NEXT: mv a0, s0
+; RV32-NEXT: li a1, 0
+; RV32-NEXT: lw s1, 156(sp) # 4-byte Folded Reload
+; RV32-NEXT: mv a2, s1
+; RV32-NEXT: li a3, 0
+; RV32-NEXT: call __muldi3
+; RV32-NEXT: mv s5, a0
+; RV32-NEXT: mv s7, a1
+; RV32-NEXT: lw a0, 184(sp) # 4-byte Folded Reload
+; RV32-NEXT: li a1, 0
+; RV32-NEXT: mv a2, s1
+; RV32-NEXT: li a3, 0
+; RV32-NEXT: call __muldi3
+; RV32-NEXT: mv s3, a0
+; RV32-NEXT: mv s2, a1
+; RV32-NEXT: add s7, a0, s7
+; RV32-NEXT: mv a0, s0
+; RV32-NEXT: li a1, 0
+; RV32-NEXT: lw a2, 176(sp) # 4-byte Folded Reload
+; RV32-NEXT: li a3, 0
+; RV32-NEXT: call __muldi3
+; RV32-NEXT: add a2, a0, s7
+; RV32-NEXT: add s4, s5, s4
+; RV32-NEXT: sw s4, 104(sp) # 4-byte Folded Spill
+; RV32-NEXT: sltu s0, s4, s5
+; RV32-NEXT: add a3, a2, s0
+; RV32-NEXT: add a3, a3, s8
+; RV32-NEXT: beq a3, a2, .LBB4_4
+; RV32-NEXT: # %bb.3:
+; RV32-NEXT: sltu s0, a3, a2
+; RV32-NEXT: .LBB4_4:
+; RV32-NEXT: sw a3, 100(sp) # 4-byte Folded Spill
+; RV32-NEXT: sltu a3, s7, s3
+; RV32-NEXT: sltu a0, a2, a0
+; RV32-NEXT: add s11, s2, a3
+; RV32-NEXT: add a0, a1, a0
+; RV32-NEXT: add s10, s11, a0
+; RV32-NEXT: lw a0, 184(sp) # 4-byte Folded Reload
+; RV32-NEXT: li a1, 0
+; RV32-NEXT: lw a2, 176(sp) # 4-byte Folded Reload
+; RV32-NEXT: li a3, 0
+; RV32-NEXT: call __muldi3
+; RV32-NEXT: mv s5, a0
+; RV32-NEXT: sw a1, 60(sp) # 4-byte Folded Spill
+; RV32-NEXT: add s2, a0, s10
+; RV32-NEXT: add s9, s2, s0
+; RV32-NEXT: add s0, s6, s9
+; RV32-NEXT: lw s8, 144(sp) # 4-byte Folded Reload
+; RV32-NEXT: mv a0, s8
+; RV32-NEXT: li a1, 0
+; RV32-NEXT: lw s1, 156(sp) # 4-byte Folded Reload
+; RV32-NEXT: mv a2, s1
+; RV32-NEXT: li a3, 0
+; RV32-NEXT: call __muldi3
+; RV32-NEXT: mv s3, a0
+; RV32-NEXT: sw a1, 64(sp) # 4-byte Folded Spill
+; RV32-NEXT: add s7, a0, s0
+; RV32-NEXT: lw a0, 160(sp) # 4-byte Folded Reload
+; RV32-NEXT: li a1, 0
+; RV32-NEXT: lw a2, 164(sp) # 4-byte Folded Reload
+; RV32-NEXT: li a3, 0
+; RV32-NEXT: call __muldi3
+; RV32-NEXT: mv s4, a0
+; RV32-NEXT: sw a1, 68(sp) # 4-byte Folded Spill
+; RV32-NEXT: lw a0, 172(sp) # 4-byte Folded Reload
+; RV32-NEXT: li a1, 0
+; RV32-NEXT: lw a2, 180(sp) # 4-byte Folded Reload
+; RV32-NEXT: li a3, 0
+; RV32-NEXT: call __muldi3
+; RV32-NEXT: sw a0, 80(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw a1, 76(sp) # 4-byte Folded Spill
+; RV32-NEXT: add a6, a0, s4
+; RV32-NEXT: sw a6, 72(sp) # 4-byte Folded Spill
+; RV32-NEXT: sltu a7, s0, s6
+; RV32-NEXT: lw a2, 84(sp) # 4-byte Folded Reload
+; RV32-NEXT: sltu a0, s6, a2
+; RV32-NEXT: lw a1, 88(sp) # 4-byte Folded Reload
+; RV32-NEXT: sltu a1, a2, a1
+; RV32-NEXT: lw a2, 140(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw a3, 96(sp) # 4-byte Folded Reload
+; RV32-NEXT: sltu a2, a3, a2
+; RV32-NEXT: sltu a3, s9, s2
+; RV32-NEXT: sltu a4, s2, s5
+; RV32-NEXT: sltu a5, s10, s11
+; RV32-NEXT: add a6, s7, a6
+; RV32-NEXT: lw t0, 92(sp) # 4-byte Folded Reload
+; RV32-NEXT: add a2, t0, a2
+; RV32-NEXT: lw t0, 60(sp) # 4-byte Folded Reload
+; RV32-NEXT: add a5, t0, a5
+; RV32-NEXT: sw a6, 96(sp) # 4-byte Folded Spill
+; RV32-NEXT: sltu s9, a6, s7
+; RV32-NEXT: add a1, a2, a1
+; RV32-NEXT: add a4, a5, a4
+; RV32-NEXT: add s4, a1, a0
+; RV32-NEXT: add a3, a4, a3
+; RV32-NEXT: add a3, s4, a3
+; RV32-NEXT: sw a7, 88(sp) # 4-byte Folded Spill
+; RV32-NEXT: add s11, a3, a7
+; RV32-NEXT: lw a0, 148(sp) # 4-byte Folded Reload
+; RV32-NEXT: li a1, 0
+; RV32-NEXT: mv a2, s1
+; RV32-NEXT: li a3, 0
+; RV32-NEXT: call __muldi3
+; RV32-NEXT: mv s1, a0
+; RV32-NEXT: sw a1, 84(sp) # 4-byte Folded Spill
+; RV32-NEXT: lw s10, 64(sp) # 4-byte Folded Reload
+; RV32-NEXT: add s10, a0, s10
+; RV32-NEXT: mv a0, s8
+; RV32-NEXT: li a1, 0
+; RV32-NEXT: lw a2, 176(sp) # 4-byte Folded Reload
+; RV32-NEXT: li a3, 0
+; RV32-NEXT: call __muldi3
+; RV32-NEXT: mv s2, a0
+; RV32-NEXT: sw a1, 52(sp) # 4-byte Folded Spill
+; RV32-NEXT: add s5, a0, s10
+; RV32-NEXT: sltu s0, s7, s3
+; RV32-NEXT: add s6, s5, s0
+; RV32-NEXT: add s6, s6, s11
+; RV32-NEXT: lw a0, 168(sp) # 4-byte Folded Reload
+; RV32-NEXT: li a1, 0
+; RV32-NEXT: lw a2, 164(sp) # 4-byte Folded Reload
+; RV32-NEXT: li a3, 0
+; RV32-NEXT: call __muldi3
+; RV32-NEXT: sw a1, 48(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw a0, 32(sp) # 4-byte Folded Spill
+; RV32-NEXT: lw s7, 68(sp) # 4-byte Folded Reload
+; RV32-NEXT: add s7, a0, s7
+; RV32-NEXT: lw a0, 160(sp) # 4-byte Folded Reload
+; RV32-NEXT: li a1, 0
+; RV32-NEXT: lw s3, 116(sp) # 4-byte Folded Reload
+; RV32-NEXT: mv a2, s3
+; RV32-NEXT: li a3, 0
+; RV32-NEXT: call __muldi3
+; RV32-NEXT: sw a1, 16(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s7, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw a0, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: add s7, a0, s7
+; RV32-NEXT: lw a0, 152(sp) # 4-byte Folded Reload
+; RV32-NEXT: li a1, 0
+; RV32-NEXT: lw a2, 180(sp) # 4-byte Folded Reload
+; RV32-NEXT: li a3, 0
+; RV32-NEXT: call __muldi3
+; RV32-NEXT: sw a1, 68(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw a0, 64(sp) # 4-byte Folded Spill
+; RV32-NEXT: lw s8, 76(sp) # 4-byte Folded Reload
+; RV32-NEXT: add s8, a0, s8
+; RV32-NEXT: lw a0, 172(sp) # 4-byte Folded Reload
+; RV32-NEXT: li a1, 0
+; RV32-NEXT: lw a2, 184(sp) # 4-byte Folded Reload
+; RV32-NEXT: li a3, 0
+; RV32-NEXT: call __muldi3
+; RV32-NEXT: sw s8, 60(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw a0, 56(sp) # 4-byte Folded Spill
+; RV32-NEXT: add a0, a0, s8
+; RV32-NEXT: sw a0, 76(sp) # 4-byte Folded Spill
+; RV32-NEXT: add a0, a0, s7
+; RV32-NEXT: lw a2, 80(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw a3, 72(sp) # 4-byte Folded Reload
+; RV32-NEXT: sltu a3, a3, a2
+; RV32-NEXT: add a4, a0, a3
+; RV32-NEXT: sw s9, 140(sp) # 4-byte Folded Spill
+; RV32-NEXT: add a0, a4, s9
+; RV32-NEXT: add a2, s6, a0
+; RV32-NEXT: sw a1, 44(sp) # 4-byte Folded Spill
+; RV32-NEXT: beq a2, s6, .LBB4_6
+; RV32-NEXT: # %bb.5:
+; RV32-NEXT: sltu a0, a2, s6
+; RV32-NEXT: sw a0, 140(sp) # 4-byte Folded Spill
+; RV32-NEXT: .LBB4_6:
+; RV32-NEXT: beq s6, s5, .LBB4_8
+; RV32-NEXT: # %bb.7:
+; RV32-NEXT: sltu s0, s6, s5
+; RV32-NEXT: .LBB4_8:
+; RV32-NEXT: sw a4, 72(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw a3, 80(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw a2, 92(sp) # 4-byte Folded Spill
+; RV32-NEXT: lw s6, 88(sp) # 4-byte Folded Reload
+; RV32-NEXT: beq s11, s4, .LBB4_10
+; RV32-NEXT: # %bb.9:
+; RV32-NEXT: sltu s6, s11, s4
+; RV32-NEXT: .LBB4_10:
+; RV32-NEXT: sltu a0, s10, s1
+; RV32-NEXT: sltu a1, s5, s2
+; RV32-NEXT: lw a2, 84(sp) # 4-byte Folded Reload
+; RV32-NEXT: add a0, a2, a0
+; RV32-NEXT: lw a2, 52(sp) # 4-byte Folded Reload
+; RV32-NEXT: add a1, a2, a1
+; RV32-NEXT: sw a0, 24(sp) # 4-byte Folded Spill
+; RV32-NEXT: add s2, a0, a1
+; RV32-NEXT: lw s5, 148(sp) # 4-byte Folded Reload
+; RV32-NEXT: mv a0, s5
+; RV32-NEXT: li a1, 0
+; RV32-NEXT: lw s1, 176(sp) # 4-byte Folded Reload
+; RV32-NEXT: mv a2, s1
+; RV32-NEXT: li a3, 0
+; RV32-NEXT: call __muldi3
+; RV32-NEXT: sw a1, 84(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw a0, 52(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s2, 20(sp) # 4-byte Folded Spill
+; RV32-NEXT: add a0, a0, s2
+; RV32-NEXT: sw a0, 40(sp) # 4-byte Folded Spill
+; RV32-NEXT: add a0, a0, s6
+; RV32-NEXT: sw a0, 36(sp) # 4-byte Folded Spill
+; RV32-NEXT: add s0, a0, s0
+; RV32-NEXT: sw s0, 88(sp) # 4-byte Folded Spill
+; RV32-NEXT: lw a0, 164(sp) # 4-byte Folded Reload
+; RV32-NEXT: mv a1, s3
+; RV32-NEXT: lw a2, 156(sp) # 4-byte Folded Reload
+; RV32-NEXT: mv a3, s1
+; RV32-NEXT: call __muldi3
+; RV32-NEXT: mv s0, a0
+; RV32-NEXT: sw a1, 164(sp) # 4-byte Folded Spill
+; RV32-NEXT: lw a0, 112(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw a1, 108(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw a2, 160(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s4, 168(sp) # 4-byte Folded Reload
+; RV32-NEXT: mv a3, s4
+; RV32-NEXT: call __muldi3
+; RV32-NEXT: sw a1, 156(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw a0, 160(sp) # 4-byte Folded Spill
+; RV32-NEXT: add s1, a0, s0
+; RV32-NEXT: lw a0, 32(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw a1, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT: sltu s6, a1, a0
+; RV32-NEXT: lw a0, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: sltu a0, s7, a0
+; RV32-NEXT: lw a1, 48(sp) # 4-byte Folded Reload
+; RV32-NEXT: add s6, a1, s6
+; RV32-NEXT: lw a1, 16(sp) # 4-byte Folded Reload
+; RV32-NEXT: add a0, a1, a0
+; RV32-NEXT: add s8, s6, a0
+; RV32-NEXT: mv a0, s4
+; RV32-NEXT: li a1, 0
+; RV32-NEXT: mv a2, s3
+; RV32-NEXT: li a3, 0
+; RV32-NEXT: call __muldi3
+; RV32-NEXT: sw a1, 116(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw a0, 176(sp) # 4-byte Folded Spill
+; RV32-NEXT: add s0, a0, s8
+; RV32-NEXT: sw s1, 168(sp) # 4-byte Folded Spill
+; RV32-NEXT: add s1, s0, s1
+; RV32-NEXT: lw a0, 180(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s9, 184(sp) # 4-byte Folded Reload
+; RV32-NEXT: mv a1, s9
+; RV32-NEXT: lw a2, 124(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw a3, 120(sp) # 4-byte Folded Reload
+; RV32-NEXT: call __muldi3
+; RV32-NEXT: mv s4, a0
+; RV32-NEXT: sw a1, 180(sp) # 4-byte Folded Spill
+; RV32-NEXT: lw a0, 144(sp) # 4-byte Folded Reload
+; RV32-NEXT: mv a1, s5
+; RV32-NEXT: lw a2, 172(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s5, 152(sp) # 4-byte Folded Reload
+; RV32-NEXT: mv a3, s5
+; RV32-NEXT: call __muldi3
+; RV32-NEXT: mv s10, a0
+; RV32-NEXT: mv s11, a1
+; RV32-NEXT: add s4, a0, s4
+; RV32-NEXT: lw a0, 64(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw a1, 60(sp) # 4-byte Folded Reload
+; RV32-NEXT: sltu s7, a1, a0
+; RV32-NEXT: lw s2, 76(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw a0, 56(sp) # 4-byte Folded Reload
+; RV32-NEXT: sltu a0, s2, a0
+; RV32-NEXT: lw a1, 68(sp) # 4-byte Folded Reload
+; RV32-NEXT: add s7, a1, s7
+; RV32-NEXT: lw a1, 44(sp) # 4-byte Folded Reload
+; RV32-NEXT: add a0, a1, a0
+; RV32-NEXT: add s3, s7, a0
+; RV32-NEXT: mv a0, s5
+; RV32-NEXT: li a1, 0
+; RV32-NEXT: mv a2, s9
+; RV32-NEXT: li a3, 0
+; RV32-NEXT: call __muldi3
+; RV32-NEXT: add a7, a0, s3
+; RV32-NEXT: add a6, a7, s4
+; RV32-NEXT: add a2, a6, s1
+; RV32-NEXT: lw t3, 80(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw a4, 72(sp) # 4-byte Folded Reload
+; RV32-NEXT: beq a4, s2, .LBB4_12
+; RV32-NEXT: # %bb.11:
+; RV32-NEXT: sltu t3, a4, s2
+; RV32-NEXT: .LBB4_12:
+; RV32-NEXT: lw a3, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw a4, 20(sp) # 4-byte Folded Reload
+; RV32-NEXT: sltu a4, a4, a3
+; RV32-NEXT: sltu a3, s8, s6
+; RV32-NEXT: sltu t0, s3, s7
+; RV32-NEXT: lw a5, 164(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw t1, 156(sp) # 4-byte Folded Reload
+; RV32-NEXT: add a5, t1, a5
+; RV32-NEXT: lw t1, 160(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw t2, 168(sp) # 4-byte Folded Reload
+; RV32-NEXT: sltu t1, t2, t1
+; RV32-NEXT: lw t2, 116(sp) # 4-byte Folded Reload
+; RV32-NEXT: add a3, t2, a3
+; RV32-NEXT: lw t2, 180(sp) # 4-byte Folded Reload
+; RV32-NEXT: add s11, s11, t2
+; RV32-NEXT: sltu t2, s4, s10
+; RV32-NEXT: add a1, a1, t0
+; RV32-NEXT: add t0, a2, t3
+; RV32-NEXT: lw t3, 52(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw t4, 40(sp) # 4-byte Folded Reload
+; RV32-NEXT: sltu t3, t4, t3
+; RV32-NEXT: lw t5, 36(sp) # 4-byte Folded Reload
+; RV32-NEXT: sltu t4, t5, t4
+; RV32-NEXT: lw s3, 88(sp) # 4-byte Folded Reload
+; RV32-NEXT: sltu t5, s3, t5
+; RV32-NEXT: lw t6, 176(sp) # 4-byte Folded Reload
+; RV32-NEXT: sltu t6, s0, t6
+; RV32-NEXT: sltu s0, s1, s0
+; RV32-NEXT: sltu a0, a7, a0
+; RV32-NEXT: sltu a7, a6, a7
+; RV32-NEXT: sltu a6, a2, a6
+; RV32-NEXT: lw s2, 136(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s1, 132(sp) # 4-byte Folded Reload
+; RV32-NEXT: sw s1, 0(s2)
+; RV32-NEXT: lw s1, 128(sp) # 4-byte Folded Reload
+; RV32-NEXT: sw s1, 4(s2)
+; RV32-NEXT: lw s1, 104(sp) # 4-byte Folded Reload
+; RV32-NEXT: sw s1, 8(s2)
+; RV32-NEXT: lw s1, 100(sp) # 4-byte Folded Reload
+; RV32-NEXT: sw s1, 12(s2)
+; RV32-NEXT: add s1, s3, t0
+; RV32-NEXT: lw s4, 84(sp) # 4-byte Folded Reload
+; RV32-NEXT: add a4, s4, a4
+; RV32-NEXT: sltu a2, t0, a2
+; RV32-NEXT: add a5, a5, t1
+; RV32-NEXT: add t2, s11, t2
+; RV32-NEXT: lw t0, 140(sp) # 4-byte Folded Reload
+; RV32-NEXT: add t0, s1, t0
+; RV32-NEXT: sltu t1, s1, s3
+; RV32-NEXT: add a4, a4, t3
+; RV32-NEXT: add a3, a3, t6
+; RV32-NEXT: add a0, a1, a0
+; RV32-NEXT: add a2, a6, a2
+; RV32-NEXT: sltu a1, t0, s1
+; RV32-NEXT: add a4, a4, t4
+; RV32-NEXT: add a3, a3, a5
+; RV32-NEXT: add a0, a0, t2
+; RV32-NEXT: add a4, a4, t5
+; RV32-NEXT: add a3, a3, s0
+; RV32-NEXT: add a0, a0, a7
+; RV32-NEXT: add a0, a0, a3
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: add a0, a4, a0
+; RV32-NEXT: add a0, a0, t1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: lw a1, 96(sp) # 4-byte Folded Reload
+; RV32-NEXT: sw a1, 16(s2)
+; RV32-NEXT: lw a1, 92(sp) # 4-byte Folded Reload
+; RV32-NEXT: sw a1, 20(s2)
+; RV32-NEXT: sw t0, 24(s2)
+; RV32-NEXT: sw a0, 28(s2)
+; RV32-NEXT: lw ra, 236(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 232(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s1, 228(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s2, 224(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s3, 220(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s4, 216(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s5, 212(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s6, 208(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s7, 204(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s8, 200(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s9, 196(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s10, 192(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s11, 188(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 240
+; RV32-NEXT: ret
+;
+; RV64-LABEL: mul_i256:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -80
+; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s1, 56(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s2, 48(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s3, 40(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s4, 32(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s5, 24(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s6, 16(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s7, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s8, 0(sp) # 8-byte Folded Spill
+; RV64-NEXT: ld s3, 0(a2)
+; RV64-NEXT: ld s1, 8(a2)
+; RV64-NEXT: ld s4, 16(a2)
+; RV64-NEXT: ld s5, 24(a2)
+; RV64-NEXT: ld s6, 0(a1)
+; RV64-NEXT: ld s2, 8(a1)
+; RV64-NEXT: ld a2, 16(a1)
+; RV64-NEXT: ld a3, 24(a1)
+; RV64-NEXT: mv s0, a0
+; RV64-NEXT: mv a0, s3
+; RV64-NEXT: mv a1, s1
+; RV64-NEXT: call __multi3
+; RV64-NEXT: mv s7, a0
+; RV64-NEXT: mv s8, a1
+; RV64-NEXT: mv a0, s4
+; RV64-NEXT: mv a1, s5
+; RV64-NEXT: mv a2, s6
+; RV64-NEXT: mv a3, s2
+; RV64-NEXT: call __multi3
+; RV64-NEXT: add a1, a1, s8
+; RV64-NEXT: add s7, a0, s7
+; RV64-NEXT: sltu a0, s7, a0
+; RV64-NEXT: add s8, a1, a0
+; RV64-NEXT: mv a0, s6
+; RV64-NEXT: li a1, 0
+; RV64-NEXT: mv a2, s3
+; RV64-NEXT: li a3, 0
+; RV64-NEXT: call __multi3
+; RV64-NEXT: mv s4, a0
+; RV64-NEXT: mv s5, a1
+; RV64-NEXT: mv a0, s2
+; RV64-NEXT: li a1, 0
+; RV64-NEXT: mv a2, s3
+; RV64-NEXT: li a3, 0
+; RV64-NEXT: call __multi3
+; RV64-NEXT: add s5, a0, s5
+; RV64-NEXT: sltu a0, s5, a0
+; RV64-NEXT: add s3, a1, a0
+; RV64-NEXT: mv a0, s6
+; RV64-NEXT: li a1, 0
+; RV64-NEXT: mv a2, s1
+; RV64-NEXT: li a3, 0
+; RV64-NEXT: call __multi3
+; RV64-NEXT: add s5, a0, s5
+; RV64-NEXT: sltu a0, s5, a0
+; RV64-NEXT: add a0, a1, a0
+; RV64-NEXT: add s6, s3, a0
+; RV64-NEXT: mv a0, s2
+; RV64-NEXT: li a1, 0
+; RV64-NEXT: mv a2, s1
+; RV64-NEXT: li a3, 0
+; RV64-NEXT: call __multi3
+; RV64-NEXT: add a2, a0, s6
+; RV64-NEXT: sltu a3, s6, s3
+; RV64-NEXT: sltu a0, a2, a0
+; RV64-NEXT: add a1, a1, a3
+; RV64-NEXT: add s7, a2, s7
+; RV64-NEXT: add a0, a1, a0
+; RV64-NEXT: sltu a1, s7, a2
+; RV64-NEXT: add a0, a0, s8
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: sd s4, 0(s0)
+; RV64-NEXT: sd s5, 8(s0)
+; RV64-NEXT: sd s7, 16(s0)
+; RV64-NEXT: sd a0, 24(s0)
+; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s1, 56(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s2, 48(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s3, 40(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s4, 32(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s5, 24(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s6, 16(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s7, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s8, 0(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 80
+; RV64-NEXT: ret
+ %r = mul i256 %a, %b
+ ret i256 %r
+}
+
+; i256 bitwise and
+define i256 @and_i256(i256 %a, i256 %b) nounwind {
+; RV32-LABEL: and_i256:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
+; RV32-NEXT: lw a3, 16(a2)
+; RV32-NEXT: lw a4, 20(a2)
+; RV32-NEXT: lw a5, 24(a2)
+; RV32-NEXT: lw a6, 28(a2)
+; RV32-NEXT: lw a7, 16(a1)
+; RV32-NEXT: lw t0, 20(a1)
+; RV32-NEXT: lw t1, 24(a1)
+; RV32-NEXT: lw t2, 28(a1)
+; RV32-NEXT: lw t3, 0(a2)
+; RV32-NEXT: lw t4, 4(a2)
+; RV32-NEXT: lw t5, 8(a2)
+; RV32-NEXT: lw a2, 12(a2)
+; RV32-NEXT: lw t6, 0(a1)
+; RV32-NEXT: lw s0, 4(a1)
+; RV32-NEXT: lw s1, 8(a1)
+; RV32-NEXT: lw a1, 12(a1)
+; RV32-NEXT: and t3, t6, t3
+; RV32-NEXT: and t4, s0, t4
+; RV32-NEXT: and t5, s1, t5
+; RV32-NEXT: and a1, a1, a2
+; RV32-NEXT: and a2, a7, a3
+; RV32-NEXT: and a3, t0, a4
+; RV32-NEXT: and a4, t1, a5
+; RV32-NEXT: and a5, t2, a6
+; RV32-NEXT: sw a2, 16(a0)
+; RV32-NEXT: sw a3, 20(a0)
+; RV32-NEXT: sw a4, 24(a0)
+; RV32-NEXT: sw a5, 28(a0)
+; RV32-NEXT: sw t3, 0(a0)
+; RV32-NEXT: sw t4, 4(a0)
+; RV32-NEXT: sw t5, 8(a0)
+; RV32-NEXT: sw a1, 12(a0)
+; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: and_i256:
+; RV64: # %bb.0:
+; RV64-NEXT: ld a3, 0(a2)
+; RV64-NEXT: ld a4, 8(a2)
+; RV64-NEXT: ld a5, 16(a2)
+; RV64-NEXT: ld a2, 24(a2)
+; RV64-NEXT: ld a6, 0(a1)
+; RV64-NEXT: ld a7, 8(a1)
+; RV64-NEXT: ld t0, 16(a1)
+; RV64-NEXT: ld a1, 24(a1)
+; RV64-NEXT: and a3, a6, a3
+; RV64-NEXT: and a4, a7, a4
+; RV64-NEXT: and a5, t0, a5
+; RV64-NEXT: and a1, a1, a2
+; RV64-NEXT: sd a3, 0(a0)
+; RV64-NEXT: sd a4, 8(a0)
+; RV64-NEXT: sd a5, 16(a0)
+; RV64-NEXT: sd a1, 24(a0)
+; RV64-NEXT: ret
+ %r = and i256 %a, %b
+ ret i256 %r
+}
+
+; i256 bitwise xor (key for Hamming distance)
+define i256 @xor_i256(i256 %a, i256 %b) nounwind {
+; RV32-LABEL: xor_i256:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
+; RV32-NEXT: lw a3, 16(a2)
+; RV32-NEXT: lw a4, 20(a2)
+; RV32-NEXT: lw a5, 24(a2)
+; RV32-NEXT: lw a6, 28(a2)
+; RV32-NEXT: lw a7, 16(a1)
+; RV32-NEXT: lw t0, 20(a1)
+; RV32-NEXT: lw t1, 24(a1)
+; RV32-NEXT: lw t2, 28(a1)
+; RV32-NEXT: lw t3, 0(a2)
+; RV32-NEXT: lw t4, 4(a2)
+; RV32-NEXT: lw t5, 8(a2)
+; RV32-NEXT: lw a2, 12(a2)
+; RV32-NEXT: lw t6, 0(a1)
+; RV32-NEXT: lw s0, 4(a1)
+; RV32-NEXT: lw s1, 8(a1)
+; RV32-NEXT: lw a1, 12(a1)
+; RV32-NEXT: xor t3, t6, t3
+; RV32-NEXT: xor t4, s0, t4
+; RV32-NEXT: xor t5, s1, t5
+; RV32-NEXT: xor a1, a1, a2
+; RV32-NEXT: xor a2, a7, a3
+; RV32-NEXT: xor a3, t0, a4
+; RV32-NEXT: xor a4, t1, a5
+; RV32-NEXT: xor a5, t2, a6
+; RV32-NEXT: sw a2, 16(a0)
+; RV32-NEXT: sw a3, 20(a0)
+; RV32-NEXT: sw a4, 24(a0)
+; RV32-NEXT: sw a5, 28(a0)
+; RV32-NEXT: sw t3, 0(a0)
+; RV32-NEXT: sw t4, 4(a0)
+; RV32-NEXT: sw t5, 8(a0)
+; RV32-NEXT: sw a1, 12(a0)
+; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: xor_i256:
+; RV64: # %bb.0:
+; RV64-NEXT: ld a3, 0(a2)
+; RV64-NEXT: ld a4, 8(a2)
+; RV64-NEXT: ld a5, 16(a2)
+; RV64-NEXT: ld a2, 24(a2)
+; RV64-NEXT: ld a6, 0(a1)
+; RV64-NEXT: ld a7, 8(a1)
+; RV64-NEXT: ld t0, 16(a1)
+; RV64-NEXT: ld a1, 24(a1)
+; RV64-NEXT: xor a3, a6, a3
+; RV64-NEXT: xor a4, a7, a4
+; RV64-NEXT: xor a5, t0, a5
+; RV64-NEXT: xor a1, a1, a2
+; RV64-NEXT: sd a3, 0(a0)
+; RV64-NEXT: sd a4, 8(a0)
+; RV64-NEXT: sd a5, 16(a0)
+; RV64-NEXT: sd a1, 24(a0)
+; RV64-NEXT: ret
+ %r = xor i256 %a, %b
+ ret i256 %r
+}
+
+; i256 popcount (Hamming weight)
+declare i256 @llvm.ctpop.i256(i256)
+define i256 @ctpop_i256(i256 %a) nounwind {
+; RV32-LABEL: ctpop_i256:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s2, 4(sp) # 4-byte Folded Spill
+; RV32-NEXT: lw a3, 16(a1)
+; RV32-NEXT: lw a5, 20(a1)
+; RV32-NEXT: lw a7, 24(a1)
+; RV32-NEXT: lw t0, 28(a1)
+; RV32-NEXT: lw a2, 0(a1)
+; RV32-NEXT: lw a4, 4(a1)
+; RV32-NEXT: lw a6, 8(a1)
+; RV32-NEXT: lw a1, 12(a1)
+; RV32-NEXT: lui t1, 349525
+; RV32-NEXT: addi t1, t1, 1365
+; RV32-NEXT: srli t2, t0, 1
+; RV32-NEXT: srli t3, a7, 1
+; RV32-NEXT: srli t4, a5, 1
+; RV32-NEXT: srli t5, a3, 1
+; RV32-NEXT: srli t6, a1, 1
+; RV32-NEXT: srli s0, a6, 1
+; RV32-NEXT: srli s1, a4, 1
+; RV32-NEXT: srli s2, a2, 1
+; RV32-NEXT: and t2, t2, t1
+; RV32-NEXT: and t3, t3, t1
+; RV32-NEXT: and t4, t4, t1
+; RV32-NEXT: and t5, t5, t1
+; RV32-NEXT: and t6, t6, t1
+; RV32-NEXT: and s0, s0, t1
+; RV32-NEXT: and s1, s1, t1
+; RV32-NEXT: and t1, s2, t1
+; RV32-NEXT: lui s2, 209715
+; RV32-NEXT: addi s2, s2, 819
+; RV32-NEXT: sub t0, t0, t2
+; RV32-NEXT: sub a7, a7, t3
+; RV32-NEXT: sub a5, a5, t4
+; RV32-NEXT: sub a3, a3, t5
+; RV32-NEXT: sub a1, a1, t6
+; RV32-NEXT: sub a6, a6, s0
+; RV32-NEXT: sub a4, a4, s1
+; RV32-NEXT: sub a2, a2, t1
+; RV32-NEXT: and t1, t0, s2
+; RV32-NEXT: srli t0, t0, 2
+; RV32-NEXT: and t2, a7, s2
+; RV32-NEXT: srli a7, a7, 2
+; RV32-NEXT: and t3, a5, s2
+; RV32-NEXT: srli a5, a5, 2
+; RV32-NEXT: and t4, a3, s2
+; RV32-NEXT: srli a3, a3, 2
+; RV32-NEXT: and t5, a1, s2
+; RV32-NEXT: srli a1, a1, 2
+; RV32-NEXT: and t6, a6, s2
+; RV32-NEXT: srli a6, a6, 2
+; RV32-NEXT: and s0, a4, s2
+; RV32-NEXT: srli a4, a4, 2
+; RV32-NEXT: and s1, a2, s2
+; RV32-NEXT: srli a2, a2, 2
+; RV32-NEXT: and t0, t0, s2
+; RV32-NEXT: and a7, a7, s2
+; RV32-NEXT: and a5, a5, s2
+; RV32-NEXT: and a3, a3, s2
+; RV32-NEXT: and a1, a1, s2
+; RV32-NEXT: and a6, a6, s2
+; RV32-NEXT: and a4, a4, s2
+; RV32-NEXT: and a2, a2, s2
+; RV32-NEXT: add t0, t1, t0
+; RV32-NEXT: add a7, t2, a7
+; RV32-NEXT: add a5, t3, a5
+; RV32-NEXT: add a3, t4, a3
+; RV32-NEXT: add a1, t5, a1
+; RV32-NEXT: lui t1, 61681
+; RV32-NEXT: addi t1, t1, -241
+; RV32-NEXT: add a6, t6, a6
+; RV32-NEXT: add a4, s0, a4
+; RV32-NEXT: add a2, s1, a2
+; RV32-NEXT: srli t2, t0, 4
+; RV32-NEXT: srli t3, a7, 4
+; RV32-NEXT: srli t4, a5, 4
+; RV32-NEXT: add t0, t0, t2
+; RV32-NEXT: srli t2, a3, 4
+; RV32-NEXT: add a7, a7, t3
+; RV32-NEXT: srli t3, a1, 4
+; RV32-NEXT: add a5, a5, t4
+; RV32-NEXT: srli t4, a6, 4
+; RV32-NEXT: add a3, a3, t2
+; RV32-NEXT: srli t2, a4, 4
+; RV32-NEXT: add a1, a1, t3
+; RV32-NEXT: srli t3, a2, 4
+; RV32-NEXT: add a6, a6, t4
+; RV32-NEXT: add a4, a4, t2
+; RV32-NEXT: add a2, a2, t3
+; RV32-NEXT: and t0, t0, t1
+; RV32-NEXT: and a7, a7, t1
+; RV32-NEXT: and a5, a5, t1
+; RV32-NEXT: and a3, a3, t1
+; RV32-NEXT: and a1, a1, t1
+; RV32-NEXT: and a6, a6, t1
+; RV32-NEXT: and a4, a4, t1
+; RV32-NEXT: and a2, a2, t1
+; RV32-NEXT: slli t1, t0, 8
+; RV32-NEXT: slli t2, a7, 8
+; RV32-NEXT: slli t3, a5, 8
+; RV32-NEXT: slli t4, a3, 8
+; RV32-NEXT: add t0, t0, t1
+; RV32-NEXT: slli t1, a1, 8
+; RV32-NEXT: add a7, a7, t2
+; RV32-NEXT: slli t2, a6, 8
+; RV32-NEXT: add a5, a5, t3
+; RV32-NEXT: slli t3, a4, 8
+; RV32-NEXT: add a3, a3, t4
+; RV32-NEXT: slli t4, a2, 8
+; RV32-NEXT: add a1, a1, t1
+; RV32-NEXT: add a6, a6, t2
+; RV32-NEXT: add a4, a4, t3
+; RV32-NEXT: add a2, a2, t4
+; RV32-NEXT: slli t1, t0, 16
+; RV32-NEXT: slli t2, a7, 16
+; RV32-NEXT: slli t3, a5, 16
+; RV32-NEXT: slli t4, a3, 16
+; RV32-NEXT: add t0, t0, t1
+; RV32-NEXT: slli t1, a1, 16
+; RV32-NEXT: add a7, a7, t2
+; RV32-NEXT: slli t2, a6, 16
+; RV32-NEXT: add a5, a5, t3
+; RV32-NEXT: slli t3, a4, 16
+; RV32-NEXT: add a3, a3, t4
+; RV32-NEXT: slli t4, a2, 16
+; RV32-NEXT: add a1, a1, t1
+; RV32-NEXT: add a6, a6, t2
+; RV32-NEXT: add a4, a4, t3
+; RV32-NEXT: add a2, a2, t4
+; RV32-NEXT: srli t0, t0, 24
+; RV32-NEXT: srli a7, a7, 24
+; RV32-NEXT: srli a5, a5, 24
+; RV32-NEXT: srli a3, a3, 24
+; RV32-NEXT: srli a1, a1, 24
+; RV32-NEXT: srli a6, a6, 24
+; RV32-NEXT: srli a4, a4, 24
+; RV32-NEXT: srli a2, a2, 24
+; RV32-NEXT: add a7, a7, t0
+; RV32-NEXT: add a3, a3, a5
+; RV32-NEXT: add a1, a6, a1
+; RV32-NEXT: add a2, a2, a4
+; RV32-NEXT: add a7, a3, a7
+; RV32-NEXT: add a5, a2, a1
+; RV32-NEXT: add a1, a5, a7
+; RV32-NEXT: sltu a3, a7, a3
+; RV32-NEXT: sltu a4, a5, a2
+; RV32-NEXT: sltu a2, a1, a5
+; RV32-NEXT: add a3, a4, a3
+; RV32-NEXT: add a3, a3, a2
+; RV32-NEXT: beq a3, a4, .LBB7_2
+; RV32-NEXT: # %bb.1:
+; RV32-NEXT: sltu a2, a3, a4
+; RV32-NEXT: .LBB7_2:
+; RV32-NEXT: sw zero, 16(a0)
+; RV32-NEXT: sw zero, 20(a0)
+; RV32-NEXT: sw zero, 24(a0)
+; RV32-NEXT: sw zero, 28(a0)
+; RV32-NEXT: sw a1, 0(a0)
+; RV32-NEXT: sw a3, 4(a0)
+; RV32-NEXT: sw a2, 8(a0)
+; RV32-NEXT: sw zero, 12(a0)
+; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s2, 4(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: ctpop_i256:
+; RV64: # %bb.0:
+; RV64-NEXT: ld a2, 0(a1)
+; RV64-NEXT: ld a3, 8(a1)
+; RV64-NEXT: ld a4, 16(a1)
+; RV64-NEXT: ld a5, 24(a1)
+; RV64-NEXT: lui a1, 349525
+; RV64-NEXT: lui a6, 209715
+; RV64-NEXT: lui a7, 61681
+; RV64-NEXT: addi a1, a1, 1365
+; RV64-NEXT: addi a6, a6, 819
+; RV64-NEXT: addi a7, a7, -241
+; RV64-NEXT: slli t0, a1, 32
+; RV64-NEXT: slli t1, a6, 32
+; RV64-NEXT: slli t2, a7, 32
+; RV64-NEXT: add t0, a1, t0
+; RV64-NEXT: add a6, a6, t1
+; RV64-NEXT: add a1, a7, t2
+; RV64-NEXT: srli a7, a5, 1
+; RV64-NEXT: srli t1, a4, 1
+; RV64-NEXT: srli t2, a3, 1
+; RV64-NEXT: srli t3, a2, 1
+; RV64-NEXT: and a7, a7, t0
+; RV64-NEXT: and t1, t1, t0
+; RV64-NEXT: and t2, t2, t0
+; RV64-NEXT: and t0, t3, t0
+; RV64-NEXT: sub a5, a5, a7
+; RV64-NEXT: sub a4, a4, t1
+; RV64-NEXT: sub a3, a3, t2
+; RV64-NEXT: sub a2, a2, t0
+; RV64-NEXT: and a7, a5, a6
+; RV64-NEXT: srli a5, a5, 2
+; RV64-NEXT: and t0, a4, a6
+; RV64-NEXT: srli a4, a4, 2
+; RV64-NEXT: and t1, a3, a6
+; RV64-NEXT: srli a3, a3, 2
+; RV64-NEXT: and t2, a2, a6
+; RV64-NEXT: srli a2, a2, 2
+; RV64-NEXT: and a5, a5, a6
+; RV64-NEXT: and a4, a4, a6
+; RV64-NEXT: and a3, a3, a6
+; RV64-NEXT: and a2, a2, a6
+; RV64-NEXT: add a5, a7, a5
+; RV64-NEXT: add a4, t0, a4
+; RV64-NEXT: add a3, t1, a3
+; RV64-NEXT: add a2, t2, a2
+; RV64-NEXT: srli a6, a5, 4
+; RV64-NEXT: srli a7, a4, 4
+; RV64-NEXT: srli t0, a3, 4
+; RV64-NEXT: add a5, a5, a6
+; RV64-NEXT: srli a6, a2, 4
+; RV64-NEXT: add a4, a4, a7
+; RV64-NEXT: add a3, a3, t0
+; RV64-NEXT: add a2, a2, a6
+; RV64-NEXT: and a5, a5, a1
+; RV64-NEXT: and a4, a4, a1
+; RV64-NEXT: and a3, a3, a1
+; RV64-NEXT: and a1, a2, a1
+; RV64-NEXT: slli a2, a5, 8
+; RV64-NEXT: slli a6, a4, 8
+; RV64-NEXT: slli a7, a3, 8
+; RV64-NEXT: slli t0, a1, 8
+; RV64-NEXT: add a2, a5, a2
+; RV64-NEXT: add a4, a4, a6
+; RV64-NEXT: add a3, a3, a7
+; RV64-NEXT: add a1, a1, t0
+; RV64-NEXT: slli a5, a2, 16
+; RV64-NEXT: slli a6, a4, 16
+; RV64-NEXT: slli a7, a3, 16
+; RV64-NEXT: slli t0, a1, 16
+; RV64-NEXT: add a2, a2, a5
+; RV64-NEXT: add a4, a4, a6
+; RV64-NEXT: add a3, a3, a7
+; RV64-NEXT: add a1, a1, t0
+; RV64-NEXT: slli a5, a2, 32
+; RV64-NEXT: slli a6, a4, 32
+; RV64-NEXT: slli a7, a3, 32
+; RV64-NEXT: slli t0, a1, 32
+; RV64-NEXT: add a2, a2, a5
+; RV64-NEXT: add a4, a4, a6
+; RV64-NEXT: add a3, a3, a7
+; RV64-NEXT: add a1, a1, t0
+; RV64-NEXT: srli a2, a2, 56
+; RV64-NEXT: srli a4, a4, 56
+; RV64-NEXT: srli a3, a3, 56
+; RV64-NEXT: srli a1, a1, 56
+; RV64-NEXT: add a2, a4, a2
+; RV64-NEXT: add a1, a1, a3
+; RV64-NEXT: add a2, a1, a2
+; RV64-NEXT: sltu a1, a2, a1
+; RV64-NEXT: sd a2, 0(a0)
+; RV64-NEXT: sd a1, 8(a0)
+; RV64-NEXT: sd zero, 16(a0)
+; RV64-NEXT: sd zero, 24(a0)
+; RV64-NEXT: ret
+ %r = call i256 @llvm.ctpop.i256(i256 %a)
+ ret i256 %r
+}
diff --git a/llvm/test/CodeGen/X86/apx/mul-i1024.ll b/llvm/test/CodeGen/X86/apx/mul-i1024.ll
index 0bb3b179cc305..26939c81ef535 100644
--- a/llvm/test/CodeGen/X86/apx/mul-i1024.ll
+++ b/llvm/test/CodeGen/X86/apx/mul-i1024.ll
@@ -6,1025 +6,638 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; EGPR-LABEL: test_1024:
; EGPR: # %bb.0:
; EGPR-NEXT: pushq %rbp
+; EGPR-NEXT: movq %rsp, %rbp
; EGPR-NEXT: pushq %r15
; EGPR-NEXT: pushq %r14
; EGPR-NEXT: pushq %r13
; EGPR-NEXT: pushq %r12
; EGPR-NEXT: pushq %rbx
-; EGPR-NEXT: subq $104, %rsp
+; EGPR-NEXT: andq $-32, %rsp
+; EGPR-NEXT: subq $1216, %rsp # imm = 0x4C0
; EGPR-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT: movq %rdi, %r26
-; EGPR-NEXT: movq (%rdi), %r13
-; EGPR-NEXT: movq 8(%rdi), %r18
-; EGPR-NEXT: movq 24(%rdi), %r21
-; EGPR-NEXT: movq 16(%rdi), %r17
-; EGPR-NEXT: movq 40(%rdi), %rdi
-; EGPR-NEXT: movq 32(%r26), %r10
-; EGPR-NEXT: movq 56(%r26), %r15
-; EGPR-NEXT: movq 48(%r26), %r12
-; EGPR-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT: movq 24(%rsi), %r25
-; EGPR-NEXT: movq 16(%rsi), %r11
-; EGPR-NEXT: movq (%rsi), %r31
-; EGPR-NEXT: movq 8(%rsi), %r14
-; EGPR-NEXT: movq %r12, %rax
-; EGPR-NEXT: mulq %r31
-; EGPR-NEXT: movq %rdx, %r8
-; EGPR-NEXT: movq %rax, %r19
-; EGPR-NEXT: movq %r15, %rax
-; EGPR-NEXT: mulq %r31
-; EGPR-NEXT: movq %rdx, %r9
-; EGPR-NEXT: movq %rax, %r16
-; EGPR-NEXT: addq %r8, %r16
-; EGPR-NEXT: adcq $0, %r9
-; EGPR-NEXT: movq %r12, %rax
-; EGPR-NEXT: mulq %r14
-; EGPR-NEXT: movq %rdx, %r22
-; EGPR-NEXT: movq %rax, %r8
-; EGPR-NEXT: addq %r16, %r8
-; EGPR-NEXT: adcq %r9, %r22
-; EGPR-NEXT: setb %al
-; EGPR-NEXT: movzbl %al, %ecx
-; EGPR-NEXT: movq %r15, %rax
-; EGPR-NEXT: mulq %r14
-; EGPR-NEXT: movq %rdx, %r9
-; EGPR-NEXT: movq %rax, %r16
-; EGPR-NEXT: addq %r22, %r16
-; EGPR-NEXT: adcq %rcx, %r9
-; EGPR-NEXT: movq %r10, %rax
-; EGPR-NEXT: mulq %r31
-; EGPR-NEXT: movq %rdx, %r22
-; EGPR-NEXT: movq %rax, %r27
-; EGPR-NEXT: movq %rdi, %rax
-; EGPR-NEXT: mulq %r31
-; EGPR-NEXT: movq %rdx, %r23
-; EGPR-NEXT: movq %rax, %r24
-; EGPR-NEXT: addq %r22, %r24
-; EGPR-NEXT: adcq $0, %r23
-; EGPR-NEXT: movq %r10, %rax
-; EGPR-NEXT: mulq %r14
-; EGPR-NEXT: movq %rdx, %r22
-; EGPR-NEXT: movq %rax, %r20
-; EGPR-NEXT: addq %r24, %r20
-; EGPR-NEXT: adcq %r23, %r22
-; EGPR-NEXT: setb %al
-; EGPR-NEXT: movzbl %al, %ecx
-; EGPR-NEXT: movq %rdi, %rax
-; EGPR-NEXT: mulq %r14
-; EGPR-NEXT: movq %rdx, %r23
-; EGPR-NEXT: movq %rax, %r24
-; EGPR-NEXT: addq %r22, %r24
-; EGPR-NEXT: adcq %rcx, %r23
-; EGPR-NEXT: addq %r19, %r24
-; EGPR-NEXT: adcq %r8, %r23
-; EGPR-NEXT: adcq $0, %r16
-; EGPR-NEXT: adcq $0, %r9
-; EGPR-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT: movq %r10, %rax
-; EGPR-NEXT: mulq %r11
-; EGPR-NEXT: movq %rdx, %r8
-; EGPR-NEXT: movq %rax, %r28
-; EGPR-NEXT: movq %rdi, %rax
+; EGPR-NEXT: movq (%rdi), %rax
+; EGPR-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT: movq 8(%rdi), %rax
+; EGPR-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT: movq 16(%rdi), %rax
+; EGPR-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT: movq 24(%rdi), %rax
+; EGPR-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT: movq 96(%rdi), %rax
+; EGPR-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT: movq 104(%rdi), %rax
+; EGPR-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT: movq 112(%rdi), %rax
+; EGPR-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT: movq 120(%rdi), %rax
+; EGPR-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT: movq 64(%rdi), %rax
+; EGPR-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT: movq 72(%rdi), %rax
+; EGPR-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT: movq 80(%rdi), %rax
+; EGPR-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT: movq 88(%rdi), %r13
+; EGPR-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT: movq 32(%rdi), %rax
+; EGPR-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT: movq 40(%rdi), %rdx
+; EGPR-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT: movq 48(%rdi), %rcx
+; EGPR-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT: movq 56(%rdi), %r8
+; EGPR-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT: movq 96(%rsi), %rdi
; EGPR-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT: mulq %r11
-; EGPR-NEXT: movq %rdx, %r19
-; EGPR-NEXT: movq %rax, %r22
-; EGPR-NEXT: addq %r8, %r22
-; EGPR-NEXT: adcq $0, %r19
-; EGPR-NEXT: movq %r10, %rax
-; EGPR-NEXT: mulq %r25
-; EGPR-NEXT: movq %rdx, %rbx
-; EGPR-NEXT: movq %rax, %r29
-; EGPR-NEXT: addq %r22, %r29
-; EGPR-NEXT: adcq %r19, %rbx
-; EGPR-NEXT: setb %al
-; EGPR-NEXT: movzbl %al, %ecx
-; EGPR-NEXT: movq %rdi, %rax
-; EGPR-NEXT: mulq %r25
-; EGPR-NEXT: movq %rdx, %r30
-; EGPR-NEXT: movq %rax, %r8
-; EGPR-NEXT: addq %rbx, %r8
-; EGPR-NEXT: adcq %rcx, %r30
-; EGPR-NEXT: addq %r24, %r28
-; EGPR-NEXT: adcq %r23, %r29
-; EGPR-NEXT: adcq $0, %r8
-; EGPR-NEXT: adcq $0, %r30
-; EGPR-NEXT: addq %r16, %r8
-; EGPR-NEXT: adcq %r9, %r30
-; EGPR-NEXT: setb %al
-; EGPR-NEXT: movzbl %al, %ecx
+; EGPR-NEXT: movq 104(%rsi), %rdi
+; EGPR-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT: movq 112(%rsi), %rdi
+; EGPR-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT: movq 120(%rsi), %rdi
+; EGPR-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT: movq (%rsi), %rdi
+; EGPR-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT: movq 8(%rsi), %rdi
+; EGPR-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT: movq 16(%rsi), %rdi
+; EGPR-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT: movq 24(%rsi), %rdi
+; EGPR-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT: movq 32(%rsi), %r12
; EGPR-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT: movq %r12, %rax
-; EGPR-NEXT: mulq %r11
-; EGPR-NEXT: movq %rdx, %r9
-; EGPR-NEXT: movq %rax, %rsi
+; EGPR-NEXT: movq 40(%rsi), %r15
; EGPR-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT: movq %r15, %rax
-; EGPR-NEXT: mulq %r11
-; EGPR-NEXT: movq %rdx, %r16
-; EGPR-NEXT: movq %rax, %r23
-; EGPR-NEXT: addq %r9, %r23
-; EGPR-NEXT: adcq $0, %r16
-; EGPR-NEXT: movq %r12, %rax
-; EGPR-NEXT: mulq %r25
-; EGPR-NEXT: movq %rdx, %r9
-; EGPR-NEXT: movq %rax, %rdi
-; EGPR-NEXT: addq %r23, %rdi
-; EGPR-NEXT: adcq %r16, %r9
-; EGPR-NEXT: setb %al
-; EGPR-NEXT: movzbl %al, %r10d
-; EGPR-NEXT: movq %r15, %rax
-; EGPR-NEXT: mulq %r25
-; EGPR-NEXT: movq %rdx, %r23
-; EGPR-NEXT: movq %rax, %r24
-; EGPR-NEXT: addq %r9, %r24
-; EGPR-NEXT: adcq %r10, %r23
-; EGPR-NEXT: addq %r8, %rsi
-; EGPR-NEXT: movq %rsi, %r19
-; EGPR-NEXT: adcq %r30, %rdi
-; EGPR-NEXT: adcq %rcx, %r24
-; EGPR-NEXT: adcq $0, %r23
-; EGPR-NEXT: movq %r17, %rax
-; EGPR-NEXT: mulq %r31
-; EGPR-NEXT: movq %rdx, %r8
-; EGPR-NEXT: movq %rax, %rbx
-; EGPR-NEXT: movq %r21, %rax
-; EGPR-NEXT: mulq %r31
-; EGPR-NEXT: movq %rdx, %r9
-; EGPR-NEXT: movq %rax, %r16
-; EGPR-NEXT: addq %r8, %r16
-; EGPR-NEXT: adcq $0, %r9
-; EGPR-NEXT: movq %r17, %rax
-; EGPR-NEXT: mulq %r14
-; EGPR-NEXT: movq %rdx, %r8
-; EGPR-NEXT: movq %rax, %r30
-; EGPR-NEXT: addq %r16, %r30
-; EGPR-NEXT: adcq %r9, %r8
-; EGPR-NEXT: setb %al
-; EGPR-NEXT: movzbl %al, %ecx
-; EGPR-NEXT: movq %r21, %rax
-; EGPR-NEXT: mulq %r14
-; EGPR-NEXT: movq %r14, %rsi
-; EGPR-NEXT: movq %rdx, %r9
-; EGPR-NEXT: movq %rax, %r16
-; EGPR-NEXT: addq %r8, %r16
-; EGPR-NEXT: adcq %rcx, %r9
-; EGPR-NEXT: movq %r13, %rax
-; EGPR-NEXT: mulq %r31
-; EGPR-NEXT: movq %rdx, %r8
-; EGPR-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT: movq %r18, %rax
-; EGPR-NEXT: mulq %r31
-; EGPR-NEXT: movq %rdx, %r14
-; EGPR-NEXT: movq %rax, %r15
-; EGPR-NEXT: addq %r8, %r15
-; EGPR-NEXT: adcq $0, %r14
-; EGPR-NEXT: movq %r13, %rax
-; EGPR-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT: mulq %rsi
-; EGPR-NEXT: movq %rdx, %r12
-; EGPR-NEXT: addq %r15, %rax
-; EGPR-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT: adcq %r14, %r12
-; EGPR-NEXT: setb %cl
-; EGPR-NEXT: movq %r18, %rax
-; EGPR-NEXT: mulq %rsi
-; EGPR-NEXT: movq %rdx, %r8
-; EGPR-NEXT: movq %rax, %r15
-; EGPR-NEXT: addq %r12, %r15
-; EGPR-NEXT: movzbl %cl, %eax
-; EGPR-NEXT: adcq %rax, %r8
-; EGPR-NEXT: addq %rbx, %r15
-; EGPR-NEXT: adcq %r30, %r8
-; EGPR-NEXT: adcq $0, %r16
-; EGPR-NEXT: adcq $0, %r9
-; EGPR-NEXT: movq %r13, %rax
-; EGPR-NEXT: mulq %r11
-; EGPR-NEXT: movq %rdx, %r30
-; EGPR-NEXT: movq %rax, %rsi
-; EGPR-NEXT: movq %r18, %rax
-; EGPR-NEXT: mulq %r11
-; EGPR-NEXT: movq %rdx, %rbx
-; EGPR-NEXT: movq %rax, %r14
-; EGPR-NEXT: addq %r30, %r14
-; EGPR-NEXT: adcq $0, %rbx
-; EGPR-NEXT: movq %r13, %rax
-; EGPR-NEXT: mulq %r25
-; EGPR-NEXT: movq %rdx, %r12
-; EGPR-NEXT: addq %r14, %rax
-; EGPR-NEXT: movq %rax, %r10
-; EGPR-NEXT: adcq %rbx, %r12
-; EGPR-NEXT: setb %cl
-; EGPR-NEXT: movq %r18, %rax
-; EGPR-NEXT: mulq %r25
-; EGPR-NEXT: movq %rdx, %r14
-; EGPR-NEXT: movq %rax, %r30
-; EGPR-NEXT: addq %r12, %r30
-; EGPR-NEXT: movzbl %cl, %eax
-; EGPR-NEXT: adcq %rax, %r14
-; EGPR-NEXT: addq %r15, %rsi
-; EGPR-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT: adcq %r8, %r10
+; EGPR-NEXT: movq 48(%rsi), %rbx
+; EGPR-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT: movq 56(%rsi), %r14
+; EGPR-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT: movq 64(%rsi), %r9
+; EGPR-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT: movq 72(%rsi), %r16
+; EGPR-NEXT: movq %r16, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT: movq 80(%rsi), %r10
; EGPR-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT: adcq $0, %r30
-; EGPR-NEXT: adcq $0, %r14
-; EGPR-NEXT: addq %r16, %r30
-; EGPR-NEXT: adcq %r9, %r14
-; EGPR-NEXT: setb %cl
-; EGPR-NEXT: movq %r17, %rax
+; EGPR-NEXT: movq 88(%rsi), %r11
; EGPR-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT: mulq %r11
-; EGPR-NEXT: movq %rdx, %r8
-; EGPR-NEXT: movq %rax, %rbx
-; EGPR-NEXT: movq %r21, %rax
-; EGPR-NEXT: mulq %r11
-; EGPR-NEXT: movq %rdx, %r9
-; EGPR-NEXT: movq %rax, %r16
-; EGPR-NEXT: addq %r8, %r16
-; EGPR-NEXT: adcq $0, %r9
-; EGPR-NEXT: movq %r17, %rax
-; EGPR-NEXT: mulq %r25
-; EGPR-NEXT: movq %rdx, %r8
-; EGPR-NEXT: movq %rax, %r15
-; EGPR-NEXT: addq %r16, %r15
-; EGPR-NEXT: adcq %r9, %r8
-; EGPR-NEXT: setb %r9b
-; EGPR-NEXT: movq %r21, %rax
-; EGPR-NEXT: mulq %r25
-; EGPR-NEXT: movq %rdx, %r12
-; EGPR-NEXT: movq %rax, %rbp
-; EGPR-NEXT: addq %r8, %rbp
-; EGPR-NEXT: movzbl %r9b, %eax
-; EGPR-NEXT: adcq %rax, %r12
-; EGPR-NEXT: addq %r30, %rbx
-; EGPR-NEXT: adcq %r14, %r15
-; EGPR-NEXT: movzbl %cl, %eax
-; EGPR-NEXT: adcq %rax, %rbp
-; EGPR-NEXT: adcq $0, %r12
-; EGPR-NEXT: addq %r27, %rbx
+; EGPR-NEXT: subq $8, %rsp
+; EGPR-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT: movq %rax, %rsi
+; EGPR-NEXT: pushq %r11
+; EGPR-NEXT: pushq %r10
+; EGPR-NEXT: pushq %r16
+; EGPR-NEXT:    callq __multi5@PLT
+; EGPR-NEXT: addq $24, %rsp
+; EGPR-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; EGPR-NEXT: movq 32(%rsi), %r27
-; EGPR-NEXT: adcq %r20, %r15
-; EGPR-NEXT: adcq %r28, %rbp
-; EGPR-NEXT: adcq %r29, %r12
-; EGPR-NEXT: adcq $0, %r19
-; EGPR-NEXT: movq %r19, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT: adcq $0, %rdi
-; EGPR-NEXT: adcq $0, %r24
-; EGPR-NEXT: adcq $0, %r23
-; EGPR-NEXT: movq %r17, %rax
-; EGPR-NEXT: mulq %r27
-; EGPR-NEXT: movq %rdx, %r8
-; EGPR-NEXT: movq %rax, %r20
-; EGPR-NEXT: movq %r21, %rax
-; EGPR-NEXT: mulq %r27
-; EGPR-NEXT: movq %rdx, %r9
-; EGPR-NEXT: movq %rax, %r16
-; EGPR-NEXT: addq %r8, %r16
-; EGPR-NEXT: adcq $0, %r9
-; EGPR-NEXT: movq 40(%rsi), %rcx
-; EGPR-NEXT: movq %r17, %rax
-; EGPR-NEXT: mulq %rcx
-; EGPR-NEXT: movq %rdx, %r8
-; EGPR-NEXT: movq %rax, %r30
-; EGPR-NEXT: addq %r16, %r30
-; EGPR-NEXT: adcq %r9, %r8
-; EGPR-NEXT: setb %r10b
-; EGPR-NEXT: movq %r21, %rax
-; EGPR-NEXT: mulq %rcx
-; EGPR-NEXT: movq %rdx, %r9
-; EGPR-NEXT: movq %rax, %r16
-; EGPR-NEXT: addq %r8, %r16
-; EGPR-NEXT: movzbl %r10b, %eax
-; EGPR-NEXT: adcq %rax, %r9
-; EGPR-NEXT: movq %r13, %rax
-; EGPR-NEXT: mulq %r27
-; EGPR-NEXT: movq %rdx, %r8
-; EGPR-NEXT: movq %rax, %r19
-; EGPR-NEXT: movq %r18, %rax
-; EGPR-NEXT: mulq %r27
-; EGPR-NEXT: movq %rdx, %r28
-; EGPR-NEXT: movq %rax, %r29
-; EGPR-NEXT: addq %r8, %r29
-; EGPR-NEXT: adcq $0, %r28
-; EGPR-NEXT: movq %r13, %rax
-; EGPR-NEXT: mulq %rcx
-; EGPR-NEXT: movq %rdx, %r8
-; EGPR-NEXT: movq %rax, %r22
-; EGPR-NEXT: addq %r29, %r22
-; EGPR-NEXT: adcq %r28, %r8
-; EGPR-NEXT: setb %r10b
-; EGPR-NEXT: movq %r18, %rax
-; EGPR-NEXT: mulq %rcx
-; EGPR-NEXT: movq %rdx, %r28
-; EGPR-NEXT: movq %rax, %r29
-; EGPR-NEXT: addq %r8, %r29
-; EGPR-NEXT: movzbl %r10b, %eax
-; EGPR-NEXT: adcq %rax, %r28
-; EGPR-NEXT: addq %r20, %r29
-; EGPR-NEXT: adcq %r30, %r28
-; EGPR-NEXT: adcq $0, %r16
-; EGPR-NEXT: adcq $0, %r9
-; EGPR-NEXT: movq 48(%rsi), %r20
-; EGPR-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT: movq %r13, %rax
-; EGPR-NEXT: mulq %r20
-; EGPR-NEXT: movq %rdx, %r8
-; EGPR-NEXT: movq %rax, %r11
-; EGPR-NEXT: movq %r18, %rax
-; EGPR-NEXT: movq %r18, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT: mulq %r20
-; EGPR-NEXT: movq %rdx, %r30
-; EGPR-NEXT: movq %rax, %r14
-; EGPR-NEXT: addq %r8, %r14
-; EGPR-NEXT: adcq $0, %r30
-; EGPR-NEXT: movq 56(%rsi), %r10
-; EGPR-NEXT: movq %r13, %rax
-; EGPR-NEXT: mulq %r10
-; EGPR-NEXT: movq %rdx, %r13
-; EGPR-NEXT: addq %r14, %rax
-; EGPR-NEXT: movq %rax, %r14
-; EGPR-NEXT: adcq %r30, %r13
-; EGPR-NEXT: setb %sil
-; EGPR-NEXT: movq %r18, %rax
-; EGPR-NEXT: mulq %r10
-; EGPR-NEXT: movq %rdx, %r30
-; EGPR-NEXT: movq %rax, %r8
-; EGPR-NEXT: addq %r13, %r8
-; EGPR-NEXT: movzbl %sil, %eax
-; EGPR-NEXT: adcq %rax, %r30
-; EGPR-NEXT: addq %r29, %r11
-; EGPR-NEXT: adcq %r28, %r14
-; EGPR-NEXT: adcq $0, %r8
-; EGPR-NEXT: adcq $0, %r30
-; EGPR-NEXT: addq %r16, %r8
-; EGPR-NEXT: adcq %r9, %r30
-; EGPR-NEXT: setb %r18b
-; EGPR-NEXT: movq %r17, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT: movq %r17, %rax
-; EGPR-NEXT: mulq %r20
-; EGPR-NEXT: movq %rdx, %r9
-; EGPR-NEXT: movq %rax, %r28
-; EGPR-NEXT: movq %r21, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT: movq %r21, %rax
-; EGPR-NEXT: mulq %r20
-; EGPR-NEXT: movq %rdx, %r16
-; EGPR-NEXT: movq %rax, %r29
-; EGPR-NEXT: addq %r9, %r29
-; EGPR-NEXT: adcq $0, %r16
-; EGPR-NEXT: movq %r17, %rax
-; EGPR-NEXT: mulq %r10
-; EGPR-NEXT: movq %rdx, %r9
-; EGPR-NEXT: movq %rax, %r17
-; EGPR-NEXT: addq %r29, %r17
-; EGPR-NEXT: adcq %r16, %r9
-; EGPR-NEXT: setb %r16b
-; EGPR-NEXT: movq %r21, %rax
-; EGPR-NEXT: mulq %r10
-; EGPR-NEXT: movq %rdx, %r13
-; EGPR-NEXT: movq %rax, %r29
-; EGPR-NEXT: addq %r9, %r29
-; EGPR-NEXT: movzbl %r16b, %eax
-; EGPR-NEXT: adcq %rax, %r13
-; EGPR-NEXT: addq %r8, %r28
-; EGPR-NEXT: adcq %r30, %r17
-; EGPR-NEXT: movzbl %r18b, %eax
-; EGPR-NEXT: adcq %rax, %r29
-; EGPR-NEXT: adcq $0, %r13
-; EGPR-NEXT: addq %rbx, %r19
-; EGPR-NEXT: movq %r19, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT: adcq %r15, %r22
-; EGPR-NEXT: movq %r22, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT: adcq %rbp, %r11
-; EGPR-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT: adcq %r12, %r14
-; EGPR-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT: adcq $0, %r28
-; EGPR-NEXT: adcq $0, %r17
-; EGPR-NEXT: adcq $0, %r29
-; EGPR-NEXT: adcq $0, %r13
-; EGPR-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r28 # 8-byte Folded Reload
-; EGPR-NEXT: adcq %rdi, %r17
-; EGPR-NEXT: adcq %r24, %r29
-; EGPR-NEXT: adcq %r23, %r13
-; EGPR-NEXT: setb %r15b
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; EGPR-NEXT: movq %r13, %r8
+; EGPR-NEXT: movq %r12, %r9
+; EGPR-NEXT: pushq %r14
+; EGPR-NEXT: pushq %rbx
+; EGPR-NEXT: pushq %r15
+; EGPR-NEXT:    callq __multi5@PLT
+; EGPR-NEXT: addq $24, %rsp
+; EGPR-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; EGPR-NEXT: movq %rsi, %rax
-; EGPR-NEXT: mulq %r27
-; EGPR-NEXT: movq %rdx, %r8
-; EGPR-NEXT: movq %rax, %r19
-; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r23 # 8-byte Reload
-; EGPR-NEXT: movq %r23, %rax
-; EGPR-NEXT: mulq %r27
-; EGPR-NEXT: movq %rdx, %r9
-; EGPR-NEXT: movq %rax, %r16
-; EGPR-NEXT: addq %r8, %r16
-; EGPR-NEXT: adcq $0, %r9
-; EGPR-NEXT: movq %rsi, %rax
-; EGPR-NEXT: movq %rsi, %r21
-; EGPR-NEXT: mulq %rcx
-; EGPR-NEXT: movq %rdx, %r8
-; EGPR-NEXT: movq %rax, %r22
-; EGPR-NEXT: addq %r16, %r22
-; EGPR-NEXT: adcq %r9, %r8
-; EGPR-NEXT: setb %r18b
-; EGPR-NEXT: movq %r23, %rax
-; EGPR-NEXT: movq %r23, %r14
-; EGPR-NEXT: mulq %rcx
-; EGPR-NEXT: movq %rdx, %r9
-; EGPR-NEXT: movq %rax, %r16
-; EGPR-NEXT: addq %r8, %r16
-; EGPR-NEXT: movzbl %r18b, %eax
-; EGPR-NEXT: adcq %rax, %r9
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; EGPR-NEXT: movq %rbx, %rax
-; EGPR-NEXT: mulq %r27
-; EGPR-NEXT: movq %rdx, %r8
-; EGPR-NEXT: movq %rax, %rdi
+; EGPR-NEXT: movq %rbx, %r9
+; EGPR-NEXT: pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NEXT: pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; EGPR-NEXT: pushq %r14
+; EGPR-NEXT: callq __multi5 at PLT
+; EGPR-NEXT: addq $24, %rsp
+; EGPR-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; EGPR-NEXT: movq %rsi, %rax
-; EGPR-NEXT: mulq %r27
-; EGPR-NEXT: movq %rdx, %r23
-; EGPR-NEXT: movq %rax, %r24
-; EGPR-NEXT: addq %r8, %r24
-; EGPR-NEXT: adcq $0, %r23
-; EGPR-NEXT: movq %rbx, %rax
-; EGPR-NEXT: mulq %rcx
-; EGPR-NEXT: movq %rdx, %r8
-; EGPR-NEXT: addq %r24, %rax
-; EGPR-NEXT: movq %rax, %r11
-; EGPR-NEXT: adcq %r23, %r8
-; EGPR-NEXT: setb %r18b
-; EGPR-NEXT: movq %rsi, %rax
-; EGPR-NEXT: movq %rsi, %r23
-; EGPR-NEXT: mulq %rcx
-; EGPR-NEXT: movq %rdx, %r24
-; EGPR-NEXT: movq %rax, %r30
-; EGPR-NEXT: addq %r8, %r30
-; EGPR-NEXT: movzbl %r18b, %eax
-; EGPR-NEXT: adcq %rax, %r24
-; EGPR-NEXT: addq %r19, %r30
-; EGPR-NEXT: adcq %r22, %r24
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; EGPR-NEXT: movq %r12, %rdx
+; EGPR-NEXT: xorl %ecx, %ecx
+; EGPR-NEXT: xorl %r8d, %r8d
+; EGPR-NEXT: movq %rbx, %r9
+; EGPR-NEXT: pushq $0
+; EGPR-NEXT: pushq $0
+; EGPR-NEXT: pushq %r14
+; EGPR-NEXT: callq __multi5 at PLT
+; EGPR-NEXT: addq $24, %rsp
+; EGPR-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; EGPR-NEXT: movq %r13, %rsi
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; EGPR-NEXT: movq %r15, %rdx
+; EGPR-NEXT: xorl %ecx, %ecx
+; EGPR-NEXT: xorl %r8d, %r8d
+; EGPR-NEXT: movq %rbx, %r9
+; EGPR-NEXT: pushq $0
+; EGPR-NEXT: pushq $0
+; EGPR-NEXT: pushq %r14
+; EGPR-NEXT: callq __multi5 at PLT
+; EGPR-NEXT: addq $24, %rsp
+; EGPR-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT: movq %r13, %rsi
+; EGPR-NEXT: movq %r15, %rdx
+; EGPR-NEXT: xorl %ecx, %ecx
+; EGPR-NEXT: xorl %r8d, %r8d
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; EGPR-NEXT: movq %r13, %r9
+; EGPR-NEXT: pushq $0
+; EGPR-NEXT: pushq $0
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; EGPR-NEXT: pushq %r14
+; EGPR-NEXT: callq __multi5 at PLT
+; EGPR-NEXT: addq $24, %rsp
+; EGPR-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; EGPR-NEXT: movq %r12, %rdx
+; EGPR-NEXT: xorl %ecx, %ecx
+; EGPR-NEXT: xorl %r8d, %r8d
+; EGPR-NEXT: movq %r13, %r9
+; EGPR-NEXT: movq %r13, %r12
+; EGPR-NEXT: pushq $0
+; EGPR-NEXT: pushq $0
+; EGPR-NEXT: pushq %r14
+; EGPR-NEXT: callq __multi5 at PLT
+; EGPR-NEXT: addq $24, %rsp
+; EGPR-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; EGPR-NEXT: movq %r13, %rsi
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; EGPR-NEXT: movq %r15, %rdx
+; EGPR-NEXT: xorl %ecx, %ecx
+; EGPR-NEXT: xorl %r8d, %r8d
+; EGPR-NEXT: movq %rbx, %r9
+; EGPR-NEXT: pushq $0
+; EGPR-NEXT: pushq $0
+; EGPR-NEXT: pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NEXT: callq __multi5 at PLT
+; EGPR-NEXT: addq $24, %rsp
+; EGPR-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT: movq %r13, %rsi
+; EGPR-NEXT: movq %r15, %rdx
+; EGPR-NEXT: movq %r15, %rbx
+; EGPR-NEXT: xorl %ecx, %ecx
+; EGPR-NEXT: xorl %r8d, %r8d
+; EGPR-NEXT: movq %r12, %r9
+; EGPR-NEXT: pushq $0
+; EGPR-NEXT: pushq $0
+; EGPR-NEXT: pushq %r14
+; EGPR-NEXT: callq __multi5 at PLT
+; EGPR-NEXT: addq $24, %rsp
+; EGPR-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT: movq %r13, %rsi
+; EGPR-NEXT: movq %r15, %rdx
+; EGPR-NEXT: xorl %ecx, %ecx
+; EGPR-NEXT: xorl %r8d, %r8d
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; EGPR-NEXT: pushq $0
+; EGPR-NEXT: pushq $0
+; EGPR-NEXT: pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NEXT: callq __multi5 at PLT
+; EGPR-NEXT: addq $24, %rsp
+; EGPR-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT: movq %r13, %rsi
+; EGPR-NEXT: movq %r15, %rdx
+; EGPR-NEXT: xorl %ecx, %ecx
+; EGPR-NEXT: xorl %r8d, %r8d
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; EGPR-NEXT: pushq $0
+; EGPR-NEXT: pushq $0
+; EGPR-NEXT: pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NEXT: callq __multi5 at PLT
+; EGPR-NEXT: addq $24, %rsp
+; EGPR-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; EGPR-NEXT: movq %r14, %rsi
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; EGPR-NEXT: movq %r13, %rcx
+; EGPR-NEXT: movq %r15, %r8
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; EGPR-NEXT: pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NEXT: pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NEXT: pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NEXT: callq __multi5 at PLT
+; EGPR-NEXT: addq $24, %rsp
+; EGPR-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; EGPR-NEXT: xorl %ecx, %ecx
+; EGPR-NEXT: xorl %r8d, %r8d
+; EGPR-NEXT: movq %r13, %r9
+; EGPR-NEXT: pushq $0
+; EGPR-NEXT: pushq $0
+; EGPR-NEXT: pushq %r15
+; EGPR-NEXT: callq __multi5 at PLT
+; EGPR-NEXT: addq $24, %rsp
+; EGPR-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; EGPR-NEXT: movq %r12, %rsi
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; EGPR-NEXT: movq %r15, %rdx
+; EGPR-NEXT: xorl %ecx, %ecx
+; EGPR-NEXT: xorl %r8d, %r8d
+; EGPR-NEXT: movq %r13, %r9
+; EGPR-NEXT: pushq $0
+; EGPR-NEXT: pushq $0
+; EGPR-NEXT: pushq %rbx
+; EGPR-NEXT: callq __multi5 at PLT
+; EGPR-NEXT: addq $24, %rsp
+; EGPR-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT: movq %r12, %rsi
+; EGPR-NEXT: movq %r15, %rdx
+; EGPR-NEXT: xorl %ecx, %ecx
+; EGPR-NEXT: xorl %r8d, %r8d
+; EGPR-NEXT: movq %r14, %r9
+; EGPR-NEXT: pushq $0
+; EGPR-NEXT: pushq $0
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; EGPR-NEXT: pushq %r12
+; EGPR-NEXT: callq __multi5 at PLT
+; EGPR-NEXT: addq $24, %rsp
+; EGPR-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; EGPR-NEXT: xorl %ecx, %ecx
+; EGPR-NEXT: xorl %r8d, %r8d
+; EGPR-NEXT: movq %r14, %r9
+; EGPR-NEXT: pushq $0
+; EGPR-NEXT: pushq $0
+; EGPR-NEXT: pushq %r12
+; EGPR-NEXT: callq __multi5 at PLT
+; EGPR-NEXT: addq $24, %rsp
+; EGPR-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; EGPR-NEXT: movq %r14, %rsi
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; EGPR-NEXT: movq %rbx, %rdx
+; EGPR-NEXT: xorl %ecx, %ecx
+; EGPR-NEXT: xorl %r8d, %r8d
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; EGPR-NEXT: pushq $0
+; EGPR-NEXT: pushq $0
+; EGPR-NEXT: pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NEXT: callq __multi5 at PLT
+; EGPR-NEXT: addq $24, %rsp
+; EGPR-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT: movq %r14, %rsi
+; EGPR-NEXT: movq %rbx, %rdx
+; EGPR-NEXT: xorl %ecx, %ecx
+; EGPR-NEXT: xorl %r8d, %r8d
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; EGPR-NEXT: pushq $0
+; EGPR-NEXT: pushq $0
+; EGPR-NEXT: pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NEXT: callq __multi5 at PLT
+; EGPR-NEXT: addq $24, %rsp
+; EGPR-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; EGPR-NEXT: xorl %ecx, %ecx
+; EGPR-NEXT: xorl %r8d, %r8d
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; EGPR-NEXT: movq %r14, %r9
+; EGPR-NEXT: pushq $0
+; EGPR-NEXT: pushq $0
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; EGPR-NEXT: pushq %rbx
+; EGPR-NEXT: callq __multi5 at PLT
+; EGPR-NEXT: addq $24, %rsp
+; EGPR-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; EGPR-NEXT: movq %r15, %rsi
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; EGPR-NEXT: movq %r13, %rdx
+; EGPR-NEXT: xorl %ecx, %ecx
+; EGPR-NEXT: xorl %r8d, %r8d
+; EGPR-NEXT: movq %r14, %r9
+; EGPR-NEXT: pushq $0
+; EGPR-NEXT: pushq $0
+; EGPR-NEXT: pushq %rbx
+; EGPR-NEXT: callq __multi5 at PLT
+; EGPR-NEXT: addq $24, %rsp
+; EGPR-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT: movq %r15, %rsi
+; EGPR-NEXT: movq %r13, %rdx
+; EGPR-NEXT: xorl %ecx, %ecx
+; EGPR-NEXT: xorl %r8d, %r8d
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; EGPR-NEXT: movq %rbx, %r9
+; EGPR-NEXT: pushq $0
+; EGPR-NEXT: pushq $0
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; EGPR-NEXT: pushq %r13
+; EGPR-NEXT: callq __multi5 at PLT
+; EGPR-NEXT: addq $24, %rsp
+; EGPR-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; EGPR-NEXT: xorl %ecx, %ecx
+; EGPR-NEXT: xorl %r8d, %r8d
+; EGPR-NEXT: movq %rbx, %r9
+; EGPR-NEXT: pushq $0
+; EGPR-NEXT: pushq $0
+; EGPR-NEXT: pushq %r13
+; EGPR-NEXT: callq __multi5 at PLT
+; EGPR-NEXT: addq $24, %rsp
+; EGPR-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; EGPR-NEXT: movq %r15, %rsi
+; EGPR-NEXT: movq %r12, %rdx
+; EGPR-NEXT: xorl %ecx, %ecx
+; EGPR-NEXT: xorl %r8d, %r8d
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; EGPR-NEXT: movq %rbx, %r9
+; EGPR-NEXT: pushq $0
+; EGPR-NEXT: pushq $0
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; EGPR-NEXT: pushq %r14
+; EGPR-NEXT: callq __multi5 at PLT
+; EGPR-NEXT: addq $24, %rsp
+; EGPR-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT: movq %r15, %rsi
+; EGPR-NEXT: movq %r12, %rdx
+; EGPR-NEXT: xorl %ecx, %ecx
+; EGPR-NEXT: xorl %r8d, %r8d
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; EGPR-NEXT: pushq $0
+; EGPR-NEXT: pushq $0
+; EGPR-NEXT: pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NEXT: callq __multi5 at PLT
+; EGPR-NEXT: addq $24, %rsp
+; EGPR-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT: movq %r15, %rsi
+; EGPR-NEXT: movq %r12, %rdx
+; EGPR-NEXT: xorl %ecx, %ecx
+; EGPR-NEXT: xorl %r8d, %r8d
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; EGPR-NEXT: pushq $0
+; EGPR-NEXT: pushq $0
+; EGPR-NEXT: pushq %r13
+; EGPR-NEXT: callq __multi5 at PLT
+; EGPR-NEXT: addq $24, %rsp
+; EGPR-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT: movq %r15, %rsi
+; EGPR-NEXT: movq %r12, %rdx
+; EGPR-NEXT: xorl %ecx, %ecx
+; EGPR-NEXT: xorl %r8d, %r8d
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; EGPR-NEXT: pushq $0
+; EGPR-NEXT: pushq $0
+; EGPR-NEXT: pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NEXT: callq __multi5 at PLT
+; EGPR-NEXT: addq $24, %rsp
+; EGPR-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT: movq %rbx, %rsi
+; EGPR-NEXT: movq %r14, %rdx
+; EGPR-NEXT: xorl %ecx, %ecx
+; EGPR-NEXT: xorl %r8d, %r8d
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; EGPR-NEXT: pushq $0
+; EGPR-NEXT: pushq $0
+; EGPR-NEXT: pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NEXT: callq __multi5 at PLT
+; EGPR-NEXT: addq $24, %rsp
+; EGPR-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT: movq %rbx, %rsi
+; EGPR-NEXT: movq %r14, %rdx
+; EGPR-NEXT: xorl %ecx, %ecx
+; EGPR-NEXT: xorl %r8d, %r8d
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; EGPR-NEXT: pushq $0
+; EGPR-NEXT: pushq $0
+; EGPR-NEXT: pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NEXT: callq __multi5 at PLT
+; EGPR-NEXT: addq $32, %rsp
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %r8
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT: addq {{[0-9]+}}(%rsp), %r8
+; EGPR-NEXT: adcq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT: adcq $0, %rcx
+; EGPR-NEXT: adcq $0, %rax
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %r16
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; EGPR-NEXT: addq {{[0-9]+}}(%rsp), %r8
+; EGPR-NEXT: adcq {{[0-9]+}}(%rsp), %rdi
; EGPR-NEXT: adcq $0, %r16
+; EGPR-NEXT: adcq $0, %r11
+; EGPR-NEXT: addq %rcx, %r16
+; EGPR-NEXT: adcq %rax, %r11
+; EGPR-NEXT: setb %al
+; EGPR-NEXT: movzbl %al, %r17d
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %r18
+; EGPR-NEXT: addq {{[0-9]+}}(%rsp), %r16
+; EGPR-NEXT: adcq {{[0-9]+}}(%rsp), %r11
+; EGPR-NEXT: adcq {{[0-9]+}}(%rsp), %r17
+; EGPR-NEXT: adcq $0, %r18
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %r9
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; EGPR-NEXT: addq {{[0-9]+}}(%rsp), %rax
+; EGPR-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
+; EGPR-NEXT: adcq $0, %r10
; EGPR-NEXT: adcq $0, %r9
-; EGPR-NEXT: movq %rbx, %rax
-; EGPR-NEXT: mulq %r20
-; EGPR-NEXT: movq %rdx, %r8
-; EGPR-NEXT: movq %rax, %rsi
-; EGPR-NEXT: movq %r23, %rax
-; EGPR-NEXT: mulq %r20
-; EGPR-NEXT: movq %rdx, %r19
-; EGPR-NEXT: movq %rax, %r22
-; EGPR-NEXT: addq %r8, %r22
-; EGPR-NEXT: adcq $0, %r19
-; EGPR-NEXT: movq %rbx, %rax
-; EGPR-NEXT: mulq %r10
-; EGPR-NEXT: movq %rdx, %rbx
-; EGPR-NEXT: addq %r22, %rax
-; EGPR-NEXT: movq %rax, %r22
-; EGPR-NEXT: adcq %r19, %rbx
-; EGPR-NEXT: setb %r18b
-; EGPR-NEXT: movq %r23, %rax
-; EGPR-NEXT: mulq %r10
-; EGPR-NEXT: movq %rdx, %r23
-; EGPR-NEXT: movq %rax, %r8
-; EGPR-NEXT: addq %rbx, %r8
-; EGPR-NEXT: movzbl %r18b, %eax
-; EGPR-NEXT: adcq %rax, %r23
-; EGPR-NEXT: addq %r30, %rsi
-; EGPR-NEXT: adcq %r24, %r22
-; EGPR-NEXT: adcq $0, %r8
-; EGPR-NEXT: adcq $0, %r23
-; EGPR-NEXT: addq %r16, %r8
-; EGPR-NEXT: adcq %r9, %r23
-; EGPR-NEXT: setb %r18b
-; EGPR-NEXT: movq %r21, %rax
-; EGPR-NEXT: mulq %r20
-; EGPR-NEXT: movq %rdx, %r9
-; EGPR-NEXT: movq %rax, %r24
-; EGPR-NEXT: movq %r14, %rax
-; EGPR-NEXT: mulq %r20
-; EGPR-NEXT: movq %rdx, %r16
-; EGPR-NEXT: movq %rax, %r19
-; EGPR-NEXT: addq %r9, %r19
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; EGPR-NEXT: addq {{[0-9]+}}(%rsp), %rax
+; EGPR-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
+; EGPR-NEXT: adcq $0, %rdx
+; EGPR-NEXT: adcq $0, %rsi
+; EGPR-NEXT: addq %r10, %rdx
+; EGPR-NEXT: adcq %r9, %rsi
+; EGPR-NEXT: setb %r9b
+; EGPR-NEXT: movzbl %r9b, %r19d
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %r22
+; EGPR-NEXT: addq {{[0-9]+}}(%rsp), %rdx
+; EGPR-NEXT: adcq {{[0-9]+}}(%rsp), %rsi
+; EGPR-NEXT: adcq {{[0-9]+}}(%rsp), %r19
+; EGPR-NEXT: adcq $0, %r22
+; EGPR-NEXT: addq {{[0-9]+}}(%rsp), %rdx
+; EGPR-NEXT: adcq {{[0-9]+}}(%rsp), %rsi
+; EGPR-NEXT: adcq %r8, %r19
+; EGPR-NEXT: adcq %rdi, %r22
; EGPR-NEXT: adcq $0, %r16
-; EGPR-NEXT: movq %r21, %rax
-; EGPR-NEXT: mulq %r10
-; EGPR-NEXT: movq %rdx, %r9
-; EGPR-NEXT: addq %r19, %rax
-; EGPR-NEXT: movq %rax, %r19
-; EGPR-NEXT: adcq %r16, %r9
-; EGPR-NEXT: setb %r16b
-; EGPR-NEXT: movq %r14, %rax
-; EGPR-NEXT: mulq %r10
-; EGPR-NEXT: movq %rdx, %rbp
-; EGPR-NEXT: movq %rax, %r12
-; EGPR-NEXT: addq %r9, %r12
-; EGPR-NEXT: movzbl %r16b, %eax
-; EGPR-NEXT: adcq %rax, %rbp
-; EGPR-NEXT: addq %r8, %r24
-; EGPR-NEXT: adcq %r23, %r19
-; EGPR-NEXT: movzbl %r18b, %eax
-; EGPR-NEXT: adcq %rax, %r12
-; EGPR-NEXT: adcq $0, %rbp
-; EGPR-NEXT: addq %r28, %rdi
-; EGPR-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT: adcq %r17, %r11
-; EGPR-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT: adcq %r29, %rsi
-; EGPR-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT: adcq %r13, %r22
-; EGPR-NEXT: movq %r22, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT: movzbl %r15b, %eax
-; EGPR-NEXT: adcq %rax, %r24
-; EGPR-NEXT: movq %r24, (%rsp) # 8-byte Spill
-; EGPR-NEXT: adcq $0, %r19
-; EGPR-NEXT: movq %r19, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT: adcq $0, %r12
-; EGPR-NEXT: adcq $0, %rbp
-; EGPR-NEXT: movq 64(%r26), %r23
-; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; EGPR-NEXT: movq %rdi, %rax
-; EGPR-NEXT: mulq %r23
-; EGPR-NEXT: movq %rdx, %r8
-; EGPR-NEXT: movq %rax, %r24
-; EGPR-NEXT: movq %r25, %rax
-; EGPR-NEXT: mulq %r23
-; EGPR-NEXT: movq %rdx, %r9
-; EGPR-NEXT: movq %rax, %r16
-; EGPR-NEXT: addq %r8, %r16
+; EGPR-NEXT: adcq $0, %r11
+; EGPR-NEXT: adcq $0, %r17
+; EGPR-NEXT: adcq $0, %r18
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %r23
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %r24
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %r8
+; EGPR-NEXT: addq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT: adcq {{[0-9]+}}(%rsp), %r8
+; EGPR-NEXT: adcq $0, %r24
+; EGPR-NEXT: adcq $0, %r23
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %r9
+; EGPR-NEXT: addq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NEXT: adcq {{[0-9]+}}(%rsp), %r8
+; EGPR-NEXT: adcq $0, %r10
; EGPR-NEXT: adcq $0, %r9
-; EGPR-NEXT: movq 72(%r26), %r28
-; EGPR-NEXT: movq %rdi, %rax
-; EGPR-NEXT: mulq %r28
-; EGPR-NEXT: movq %rdx, %r8
-; EGPR-NEXT: movq %rax, %r30
-; EGPR-NEXT: addq %r16, %r30
-; EGPR-NEXT: adcq %r9, %r8
-; EGPR-NEXT: setb %r18b
-; EGPR-NEXT: movq %r25, %rax
-; EGPR-NEXT: mulq %r28
-; EGPR-NEXT: movq %rdx, %r9
-; EGPR-NEXT: movq %rax, %r16
-; EGPR-NEXT: addq %r8, %r16
-; EGPR-NEXT: movzbl %r18b, %eax
-; EGPR-NEXT: adcq %rax, %r9
-; EGPR-NEXT: movq %r31, %rax
-; EGPR-NEXT: mulq %r23
-; EGPR-NEXT: movq %rdx, %r8
-; EGPR-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; EGPR-NEXT: movq %r11, %rax
-; EGPR-NEXT: mulq %r23
-; EGPR-NEXT: movq %rdx, %r29
-; EGPR-NEXT: movq %rax, %rbx
-; EGPR-NEXT: addq %r8, %rbx
-; EGPR-NEXT: adcq $0, %r29
-; EGPR-NEXT: movq %r31, %rax
-; EGPR-NEXT: mulq %r28
-; EGPR-NEXT: movq %rdx, %r8
-; EGPR-NEXT: addq %rbx, %rax
-; EGPR-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT: adcq %r29, %r8
-; EGPR-NEXT: setb %r18b
-; EGPR-NEXT: movq %r11, %rax
-; EGPR-NEXT: mulq %r28
-; EGPR-NEXT: movq %rdx, %r29
-; EGPR-NEXT: movq %rax, %rbx
-; EGPR-NEXT: addq %r8, %rbx
-; EGPR-NEXT: movzbl %r18b, %eax
-; EGPR-NEXT: adcq %rax, %r29
-; EGPR-NEXT: addq %r24, %rbx
-; EGPR-NEXT: adcq %r30, %r29
-; EGPR-NEXT: adcq $0, %r16
+; EGPR-NEXT: addq %r24, %r10
+; EGPR-NEXT: adcq %r23, %r9
+; EGPR-NEXT: setb %r23b
+; EGPR-NEXT: movzbl %r23b, %r23d
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %r24
+; EGPR-NEXT: addq {{[0-9]+}}(%rsp), %r10
+; EGPR-NEXT: adcq {{[0-9]+}}(%rsp), %r9
+; EGPR-NEXT: adcq {{[0-9]+}}(%rsp), %r23
+; EGPR-NEXT: adcq $0, %r24
+; EGPR-NEXT: addq {{[0-9]+}}(%rsp), %rdx
+; EGPR-NEXT: adcq {{[0-9]+}}(%rsp), %rsi
+; EGPR-NEXT: adcq %r19, %rdi
+; EGPR-NEXT: adcq %r22, %r8
+; EGPR-NEXT: adcq $0, %r10
; EGPR-NEXT: adcq $0, %r9
-; EGPR-NEXT: movq 80(%r26), %r13
-; EGPR-NEXT: movq %r31, %rax
-; EGPR-NEXT: mulq %r13
-; EGPR-NEXT: movq %rdx, %r8
-; EGPR-NEXT: movq %rax, %rsi
-; EGPR-NEXT: movq %r11, %rax
-; EGPR-NEXT: mulq %r13
-; EGPR-NEXT: movq %rdx, %r30
-; EGPR-NEXT: movq %rax, %r14
-; EGPR-NEXT: addq %r8, %r14
-; EGPR-NEXT: adcq $0, %r30
-; EGPR-NEXT: movq 88(%r26), %r18
-; EGPR-NEXT: movq %r31, %rax
-; EGPR-NEXT: mulq %r18
-; EGPR-NEXT: movq %rdx, %r15
-; EGPR-NEXT: movq %rax, %r24
-; EGPR-NEXT: addq %r14, %r24
-; EGPR-NEXT: adcq %r30, %r15
-; EGPR-NEXT: setb %r14b
-; EGPR-NEXT: movq %r11, %rax
-; EGPR-NEXT: mulq %r18
-; EGPR-NEXT: movq %rdx, %r30
-; EGPR-NEXT: movq %rax, %r8
-; EGPR-NEXT: addq %r15, %r8
-; EGPR-NEXT: movzbl %r14b, %eax
-; EGPR-NEXT: adcq %rax, %r30
-; EGPR-NEXT: addq %rbx, %rsi
-; EGPR-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT: adcq %r29, %r24
-; EGPR-NEXT: adcq $0, %r8
-; EGPR-NEXT: adcq $0, %r30
-; EGPR-NEXT: addq %r16, %r8
-; EGPR-NEXT: adcq %r9, %r30
-; EGPR-NEXT: setb %r29b
-; EGPR-NEXT: movq %rdi, %rax
-; EGPR-NEXT: mulq %r13
-; EGPR-NEXT: movq %rdx, %r9
-; EGPR-NEXT: movq %rax, %rsi
-; EGPR-NEXT: movq %r25, %rax
-; EGPR-NEXT: mulq %r13
-; EGPR-NEXT: movq %rdx, %r16
-; EGPR-NEXT: movq %rax, %r14
-; EGPR-NEXT: addq %r9, %r14
-; EGPR-NEXT: adcq $0, %r16
-; EGPR-NEXT: movq %rdi, %rax
-; EGPR-NEXT: mulq %r18
-; EGPR-NEXT: movq %rdx, %r9
-; EGPR-NEXT: movq %rax, %rbx
-; EGPR-NEXT: addq %r14, %rbx
-; EGPR-NEXT: adcq %r16, %r9
-; EGPR-NEXT: setb %r16b
-; EGPR-NEXT: movq %r25, %rax
-; EGPR-NEXT: mulq %r18
-; EGPR-NEXT: movq %rdx, %r14
-; EGPR-NEXT: movq %rax, %r15
-; EGPR-NEXT: addq %r9, %r15
-; EGPR-NEXT: movzbl %r16b, %eax
-; EGPR-NEXT: adcq %rax, %r14
-; EGPR-NEXT: addq %r8, %rsi
-; EGPR-NEXT: adcq %r30, %rbx
-; EGPR-NEXT: movzbl %r29b, %eax
-; EGPR-NEXT: adcq %rax, %r15
+; EGPR-NEXT: adcq $0, %r23
+; EGPR-NEXT: adcq $0, %r24
+; EGPR-NEXT: addq %r16, %r10
+; EGPR-NEXT: adcq %r11, %r9
+; EGPR-NEXT: adcq %r17, %r23
+; EGPR-NEXT: adcq %r18, %r24
+; EGPR-NEXT: setb %r11b
+; EGPR-NEXT: movzbl %r11b, %r25d
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %r19
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %r22
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %r16
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; EGPR-NEXT: addq {{[0-9]+}}(%rsp), %r16
+; EGPR-NEXT: adcq {{[0-9]+}}(%rsp), %r11
+; EGPR-NEXT: adcq $0, %r22
+; EGPR-NEXT: adcq $0, %r19
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %r18
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %r17
+; EGPR-NEXT: addq {{[0-9]+}}(%rsp), %r16
+; EGPR-NEXT: adcq {{[0-9]+}}(%rsp), %r11
+; EGPR-NEXT: adcq $0, %r18
+; EGPR-NEXT: adcq $0, %r17
+; EGPR-NEXT: addq %r22, %r18
+; EGPR-NEXT: adcq %r19, %r17
+; EGPR-NEXT: setb %r19b
+; EGPR-NEXT: movzbl %r19b, %r19d
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %r22
+; EGPR-NEXT: addq {{[0-9]+}}(%rsp), %r18
+; EGPR-NEXT: adcq {{[0-9]+}}(%rsp), %r17
+; EGPR-NEXT: adcq {{[0-9]+}}(%rsp), %r19
+; EGPR-NEXT: adcq $0, %r22
+; EGPR-NEXT: addq {{[0-9]+}}(%rsp), %r10
+; EGPR-NEXT: adcq {{[0-9]+}}(%rsp), %r9
+; EGPR-NEXT: adcq %r23, %r16
+; EGPR-NEXT: adcq %r24, %r11
+; EGPR-NEXT: adcq %r25, %r18
+; EGPR-NEXT: adcq $0, %r17
+; EGPR-NEXT: adcq $0, %r19
+; EGPR-NEXT: adcq $0, %r22
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %r25
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %r26
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %r24
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %r23
+; EGPR-NEXT: addq {{[0-9]+}}(%rsp), %r24
+; EGPR-NEXT: adcq {{[0-9]+}}(%rsp), %r23
+; EGPR-NEXT: adcq $0, %r26
+; EGPR-NEXT: adcq $0, %r25
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %r21
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %r28
+; EGPR-NEXT: addq {{[0-9]+}}(%rsp), %r24
+; EGPR-NEXT: adcq {{[0-9]+}}(%rsp), %r23
+; EGPR-NEXT: adcq $0, %r21
+; EGPR-NEXT: adcq $0, %r28
+; EGPR-NEXT: addq %r26, %r21
+; EGPR-NEXT: adcq %r25, %r28
+; EGPR-NEXT: setb %r25b
+; EGPR-NEXT: movzbl %r25b, %r25d
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %r26
+; EGPR-NEXT: addq {{[0-9]+}}(%rsp), %r21
+; EGPR-NEXT: adcq {{[0-9]+}}(%rsp), %r28
+; EGPR-NEXT: adcq {{[0-9]+}}(%rsp), %r25
+; EGPR-NEXT: adcq $0, %r26
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %r27
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %r30
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %r20
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %r31
+; EGPR-NEXT: addq {{[0-9]+}}(%rsp), %r20
+; EGPR-NEXT: adcq {{[0-9]+}}(%rsp), %r31
+; EGPR-NEXT: adcq {{[0-9]+}}(%rsp), %r30
+; EGPR-NEXT: adcq {{[0-9]+}}(%rsp), %r27
+; EGPR-NEXT: addq %r21, %r20
+; EGPR-NEXT: adcq %r28, %r31
+; EGPR-NEXT: adcq %r25, %r30
+; EGPR-NEXT: adcq %r26, %r27
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %r21
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %r28
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %r25
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %r26
+; EGPR-NEXT: addq {{[0-9]+}}(%rsp), %r25
+; EGPR-NEXT: adcq {{[0-9]+}}(%rsp), %r26
+; EGPR-NEXT: adcq $0, %r28
+; EGPR-NEXT: adcq $0, %r21
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %rbx
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %r14
+; EGPR-NEXT: addq {{[0-9]+}}(%rsp), %r25
+; EGPR-NEXT: adcq {{[0-9]+}}(%rsp), %r26
+; EGPR-NEXT: adcq $0, %rbx
; EGPR-NEXT: adcq $0, %r14
-; EGPR-NEXT: imulq %r27, %r18
-; EGPR-NEXT: movq %r27, %rax
-; EGPR-NEXT: mulq %r13
-; EGPR-NEXT: movq %rax, %r8
-; EGPR-NEXT: addq %r18, %rdx
-; EGPR-NEXT: imulq %rcx, %r13
-; EGPR-NEXT: addq %rdx, %r13
-; EGPR-NEXT: movq %r20, %r9
-; EGPR-NEXT: imulq %r28, %r9
-; EGPR-NEXT: movq %r20, %rax
-; EGPR-NEXT: mulq %r23
-; EGPR-NEXT: movq %rax, %r30
-; EGPR-NEXT: addq %r9, %rdx
-; EGPR-NEXT: imulq %r23, %r10
-; EGPR-NEXT: addq %rdx, %r10
-; EGPR-NEXT: addq %r8, %r30
-; EGPR-NEXT: adcq %r13, %r10
-; EGPR-NEXT: movq %r23, %rax
-; EGPR-NEXT: mulq %r27
-; EGPR-NEXT: movq %rdx, %r8
-; EGPR-NEXT: movq %rax, %r9
-; EGPR-NEXT: movq %r28, %rax
-; EGPR-NEXT: mulq %r27
-; EGPR-NEXT: movq %rdx, %r27
-; EGPR-NEXT: movq %rax, %r20
-; EGPR-NEXT: addq %r8, %r20
-; EGPR-NEXT: adcq $0, %r27
-; EGPR-NEXT: movq %r23, %rax
-; EGPR-NEXT: mulq %rcx
-; EGPR-NEXT: movq %rdx, %r8
-; EGPR-NEXT: movq %rax, %r16
-; EGPR-NEXT: addq %r20, %r16
-; EGPR-NEXT: adcq %r27, %r8
-; EGPR-NEXT: setb %r18b
-; EGPR-NEXT: movq %r28, %rax
-; EGPR-NEXT: mulq %rcx
-; EGPR-NEXT: movq %rdx, %r23
-; EGPR-NEXT: movq %rax, %r20
-; EGPR-NEXT: addq %r8, %r20
-; EGPR-NEXT: movzbl %r18b, %eax
-; EGPR-NEXT: adcq %rax, %r23
-; EGPR-NEXT: addq %r30, %r20
-; EGPR-NEXT: adcq %r10, %r23
-; EGPR-NEXT: movq 112(%r26), %rcx
-; EGPR-NEXT: movq %r31, %rax
-; EGPR-NEXT: mulq %rcx
-; EGPR-NEXT: movq %rax, %r8
-; EGPR-NEXT: imulq %r11, %rcx
-; EGPR-NEXT: addq %rdx, %rcx
-; EGPR-NEXT: movq 120(%r26), %rax
-; EGPR-NEXT: imulq %r31, %rax
-; EGPR-NEXT: addq %rax, %rcx
-; EGPR-NEXT: movq 96(%r26), %r27
-; EGPR-NEXT: movq 104(%r26), %r30
-; EGPR-NEXT: movq %rdi, %rax
-; EGPR-NEXT: imulq %r30, %rdi
-; EGPR-NEXT: mulq %r27
-; EGPR-NEXT: movq %rax, %r21
-; EGPR-NEXT: addq %rdi, %rdx
-; EGPR-NEXT: imulq %r27, %r25
-; EGPR-NEXT: addq %rdx, %r25
-; EGPR-NEXT: addq %r8, %r21
-; EGPR-NEXT: adcq %rcx, %r25
-; EGPR-NEXT: movq %r27, %rax
-; EGPR-NEXT: mulq %r31
-; EGPR-NEXT: movq %rdx, %r8
-; EGPR-NEXT: movq %rax, %r22
-; EGPR-NEXT: movq %r30, %rax
-; EGPR-NEXT: mulq %r31
-; EGPR-NEXT: movq %rdx, %r31
-; EGPR-NEXT: movq %rax, %r28
-; EGPR-NEXT: addq %r8, %r28
-; EGPR-NEXT: adcq $0, %r31
-; EGPR-NEXT: movq %r27, %rax
-; EGPR-NEXT: mulq %r11
-; EGPR-NEXT: movq %rdx, %r8
-; EGPR-NEXT: movq %rax, %r27
-; EGPR-NEXT: addq %r28, %r27
-; EGPR-NEXT: adcq %r31, %r8
-; EGPR-NEXT: setb %cl
-; EGPR-NEXT: movq %r30, %rax
-; EGPR-NEXT: mulq %r11
-; EGPR-NEXT: movq %rdx, %r26
-; EGPR-NEXT: movq %rax, %r31
-; EGPR-NEXT: addq %r8, %r31
-; EGPR-NEXT: movzbl %cl, %eax
-; EGPR-NEXT: adcq %rax, %r26
-; EGPR-NEXT: addq %r21, %r31
-; EGPR-NEXT: adcq %r25, %r26
-; EGPR-NEXT: addq %r9, %r22
-; EGPR-NEXT: adcq %r16, %r27
-; EGPR-NEXT: adcq %r20, %r31
+; EGPR-NEXT: addq %r28, %rbx
+; EGPR-NEXT: adcq %r21, %r14
+; EGPR-NEXT: setb %r21b
+; EGPR-NEXT: movzbl %r21b, %r15d
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %r12
+; EGPR-NEXT: addq {{[0-9]+}}(%rsp), %rbx
+; EGPR-NEXT: adcq {{[0-9]+}}(%rsp), %r14
+; EGPR-NEXT: adcq {{[0-9]+}}(%rsp), %r15
+; EGPR-NEXT: adcq $0, %r12
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %r21
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %r28
+; EGPR-NEXT: addq {{[0-9]+}}(%rsp), %r21
+; EGPR-NEXT: adcq {{[0-9]+}}(%rsp), %r28
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %r29
+; EGPR-NEXT: adcq {{[0-9]+}}(%rsp), %r29
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %r13
+; EGPR-NEXT: adcq {{[0-9]+}}(%rsp), %r13
+; EGPR-NEXT: addq %rbx, %r21
+; EGPR-NEXT: adcq %r14, %r28
+; EGPR-NEXT: adcq %r15, %r29
+; EGPR-NEXT: adcq %r12, %r13
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %rbx
+; EGPR-NEXT: movq {{[0-9]+}}(%rsp), %r14
+; EGPR-NEXT: addq {{[0-9]+}}(%rsp), %rbx
+; EGPR-NEXT: adcq {{[0-9]+}}(%rsp), %r14
+; EGPR-NEXT: adcq %r24, %r25
; EGPR-NEXT: adcq %r23, %r26
-; EGPR-NEXT: addq %rsi, %r22
-; EGPR-NEXT: adcq %rbx, %r27
-; EGPR-NEXT: adcq %r15, %r31
-; EGPR-NEXT: adcq %r14, %r26
-; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; EGPR-NEXT: movq 80(%r11), %rbx
-; EGPR-NEXT: movq %rbx, %rax
-; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r19 # 8-byte Reload
-; EGPR-NEXT: mulq %r19
-; EGPR-NEXT: movq %rax, %r23
-; EGPR-NEXT: movq %rdx, %r8
-; EGPR-NEXT: movq 88(%r11), %r20
-; EGPR-NEXT: movq %r20, %rax
-; EGPR-NEXT: mulq %r19
-; EGPR-NEXT: movq %rdx, %r9
-; EGPR-NEXT: movq %rax, %r16
-; EGPR-NEXT: addq %r8, %r16
-; EGPR-NEXT: adcq $0, %r9
-; EGPR-NEXT: movq %rbx, %rax
-; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r17 # 8-byte Reload
-; EGPR-NEXT: mulq %r17
-; EGPR-NEXT: movq %rdx, %r8
-; EGPR-NEXT: movq %rax, %r30
-; EGPR-NEXT: addq %r16, %r30
-; EGPR-NEXT: adcq %r9, %r8
-; EGPR-NEXT: setb %cl
-; EGPR-NEXT: movq %r20, %rax
-; EGPR-NEXT: mulq %r17
-; EGPR-NEXT: movq %rdx, %r9
-; EGPR-NEXT: movq %rax, %r16
-; EGPR-NEXT: addq %r8, %r16
-; EGPR-NEXT: movzbl %cl, %eax
-; EGPR-NEXT: adcq %rax, %r9
-; EGPR-NEXT: movq 64(%r11), %r15
-; EGPR-NEXT: movq %r15, %rax
-; EGPR-NEXT: mulq %r19
-; EGPR-NEXT: movq %rax, %r25
-; EGPR-NEXT: movq %rdx, %r8
-; EGPR-NEXT: movq 72(%r11), %r14
-; EGPR-NEXT: movq %r14, %rax
-; EGPR-NEXT: mulq %r19
-; EGPR-NEXT: movq %rdx, %r28
-; EGPR-NEXT: movq %rax, %r29
-; EGPR-NEXT: addq %r8, %r29
-; EGPR-NEXT: adcq $0, %r28
-; EGPR-NEXT: movq %r15, %rax
-; EGPR-NEXT: mulq %r17
-; EGPR-NEXT: movq %rdx, %r8
-; EGPR-NEXT: movq %rax, %r21
-; EGPR-NEXT: addq %r29, %r21
-; EGPR-NEXT: adcq %r28, %r8
-; EGPR-NEXT: setb %cl
-; EGPR-NEXT: movq %r14, %rax
-; EGPR-NEXT: mulq %r17
-; EGPR-NEXT: movq %rdx, %r29
-; EGPR-NEXT: movq %rax, %r13
-; EGPR-NEXT: addq %r8, %r13
-; EGPR-NEXT: movzbl %cl, %eax
-; EGPR-NEXT: adcq %rax, %r29
-; EGPR-NEXT: addq %r23, %r13
+; EGPR-NEXT: adcq %r20, %r21
+; EGPR-NEXT: adcq %r31, %r28
; EGPR-NEXT: adcq %r30, %r29
-; EGPR-NEXT: adcq $0, %r16
-; EGPR-NEXT: adcq $0, %r9
-; EGPR-NEXT: movq %r15, %rax
-; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; EGPR-NEXT: mulq %rdi
-; EGPR-NEXT: movq %rdx, %r8
-; EGPR-NEXT: movq %rax, %r28
-; EGPR-NEXT: movq %r14, %rax
-; EGPR-NEXT: mulq %rdi
-; EGPR-NEXT: movq %rdx, %r30
-; EGPR-NEXT: movq %rax, %rcx
-; EGPR-NEXT: addq %r8, %rcx
-; EGPR-NEXT: adcq $0, %r30
-; EGPR-NEXT: movq %r15, %rax
-; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r18 # 8-byte Reload
-; EGPR-NEXT: mulq %r18
-; EGPR-NEXT: movq %rdx, %r10
-; EGPR-NEXT: movq %rax, %r23
-; EGPR-NEXT: addq %rcx, %r23
-; EGPR-NEXT: adcq %r30, %r10
-; EGPR-NEXT: setb %cl
-; EGPR-NEXT: movq %r14, %rax
-; EGPR-NEXT: mulq %r18
-; EGPR-NEXT: movq %rdx, %r30
-; EGPR-NEXT: movq %rax, %r8
-; EGPR-NEXT: addq %r10, %r8
-; EGPR-NEXT: movzbl %cl, %eax
-; EGPR-NEXT: adcq %rax, %r30
-; EGPR-NEXT: addq %r13, %r28
-; EGPR-NEXT: adcq %r29, %r23
-; EGPR-NEXT: adcq $0, %r8
-; EGPR-NEXT: adcq $0, %r30
-; EGPR-NEXT: addq %r16, %r8
-; EGPR-NEXT: adcq %r9, %r30
-; EGPR-NEXT: setb %sil
-; EGPR-NEXT: movq %rbx, %rax
-; EGPR-NEXT: mulq %rdi
-; EGPR-NEXT: movq %rdx, %rcx
-; EGPR-NEXT: movq %rax, %r29
-; EGPR-NEXT: movq %r20, %rax
-; EGPR-NEXT: mulq %rdi
-; EGPR-NEXT: movq %rdx, %r9
-; EGPR-NEXT: movq %rax, %r10
-; EGPR-NEXT: addq %rcx, %r10
-; EGPR-NEXT: adcq $0, %r9
-; EGPR-NEXT: movq %rbx, %rax
-; EGPR-NEXT: mulq %r18
-; EGPR-NEXT: movq %rdx, %rcx
-; EGPR-NEXT: movq %rax, %r13
-; EGPR-NEXT: addq %r10, %r13
-; EGPR-NEXT: adcq %r9, %rcx
-; EGPR-NEXT: setb %r10b
-; EGPR-NEXT: movq %r20, %rax
-; EGPR-NEXT: mulq %r18
-; EGPR-NEXT: movq %rdx, %r16
-; EGPR-NEXT: movq %rax, %r9
-; EGPR-NEXT: addq %rcx, %r9
-; EGPR-NEXT: movzbl %r10b, %eax
-; EGPR-NEXT: adcq %rax, %r16
-; EGPR-NEXT: addq %r8, %r29
-; EGPR-NEXT: adcq %r30, %r13
-; EGPR-NEXT: movzbl %sil, %eax
-; EGPR-NEXT: adcq %rax, %r9
-; EGPR-NEXT: adcq $0, %r16
-; EGPR-NEXT: movq 96(%r11), %rcx
-; EGPR-NEXT: imulq %rcx, %r18
-; EGPR-NEXT: movq %rcx, %rax
-; EGPR-NEXT: mulq %rdi
-; EGPR-NEXT: movq %rax, %r8
-; EGPR-NEXT: addq %r18, %rdx
-; EGPR-NEXT: movq 104(%r11), %r30
-; EGPR-NEXT: movq %rdi, %rax
-; EGPR-NEXT: imulq %r30, %rax
-; EGPR-NEXT: addq %rdx, %rax
-; EGPR-NEXT: movq %rax, %r10
-; EGPR-NEXT: movq 112(%r11), %rax
-; EGPR-NEXT: movq %rax, %rsi
-; EGPR-NEXT: imulq %r17, %rsi
-; EGPR-NEXT: mulq %r19
-; EGPR-NEXT: movq %rax, %rdi
-; EGPR-NEXT: addq %rsi, %rdx
-; EGPR-NEXT: movq 120(%r11), %r18
-; EGPR-NEXT: imulq %r19, %r18
-; EGPR-NEXT: addq %rdx, %r18
-; EGPR-NEXT: addq %r8, %rdi
-; EGPR-NEXT: adcq %r10, %r18
-; EGPR-NEXT: movq %r19, %rax
-; EGPR-NEXT: mulq %rcx
-; EGPR-NEXT: movq %rdx, %r8
-; EGPR-NEXT: movq %rax, %rsi
-; EGPR-NEXT: movq %r17, %rax
-; EGPR-NEXT: mulq %rcx
-; EGPR-NEXT: movq %rdx, %rcx
-; EGPR-NEXT: movq %rax, %r10
-; EGPR-NEXT: addq %r8, %r10
-; EGPR-NEXT: adcq $0, %rcx
-; EGPR-NEXT: movq %r19, %rax
-; EGPR-NEXT: mulq %r30
-; EGPR-NEXT: movq %rdx, %r8
-; EGPR-NEXT: movq %rax, %r11
-; EGPR-NEXT: addq %r10, %r11
-; EGPR-NEXT: adcq %rcx, %r8
-; EGPR-NEXT: setb %cl
-; EGPR-NEXT: movq %r17, %rax
-; EGPR-NEXT: mulq %r30
-; EGPR-NEXT: movq %rdx, %r10
-; EGPR-NEXT: movq %rax, %r17
-; EGPR-NEXT: addq %r8, %r17
-; EGPR-NEXT: movzbl %cl, %eax
-; EGPR-NEXT: adcq %rax, %r10
-; EGPR-NEXT: addq %rdi, %r17
-; EGPR-NEXT: adcq %r18, %r10
-; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; EGPR-NEXT: imulq %r15, %rdi
-; EGPR-NEXT: movq %r15, %rax
-; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; EGPR-NEXT: mulq %r8
-; EGPR-NEXT: movq %rax, %rcx
-; EGPR-NEXT: addq %rdi, %rdx
-; EGPR-NEXT: movq %r8, %rax
-; EGPR-NEXT: imulq %r14, %rax
-; EGPR-NEXT: addq %rdx, %rax
-; EGPR-NEXT: movq %rax, %r18
-; EGPR-NEXT: movq %rbx, %rdi
-; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r19 # 8-byte Reload
-; EGPR-NEXT: imulq %r19, %rdi
-; EGPR-NEXT: movq %rbx, %rax
-; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; EGPR-NEXT: mulq %r8
-; EGPR-NEXT: movq %rax, %r30
-; EGPR-NEXT: addq %rdi, %rdx
-; EGPR-NEXT: imulq %r8, %r20
-; EGPR-NEXT: addq %rdx, %r20
-; EGPR-NEXT: addq %rcx, %r30
-; EGPR-NEXT: adcq %r18, %r20
-; EGPR-NEXT: movq %r8, %rax
-; EGPR-NEXT: movq %r8, %rdi
-; EGPR-NEXT: mulq %r15
-; EGPR-NEXT: movq %rdx, %rcx
-; EGPR-NEXT: movq %rax, %r8
-; EGPR-NEXT: movq %r19, %rax
-; EGPR-NEXT: mulq %r15
-; EGPR-NEXT: movq %rdx, %rbx
-; EGPR-NEXT: movq %rax, %r15
-; EGPR-NEXT: addq %rcx, %r15
-; EGPR-NEXT: adcq $0, %rbx
-; EGPR-NEXT: movq %rdi, %rax
-; EGPR-NEXT: mulq %r14
-; EGPR-NEXT: movq %rdx, %rcx
-; EGPR-NEXT: movq %rax, %r18
-; EGPR-NEXT: addq %r15, %r18
-; EGPR-NEXT: adcq %rbx, %rcx
-; EGPR-NEXT: setb %dil
-; EGPR-NEXT: movq %r19, %rax
-; EGPR-NEXT: mulq %r14
-; EGPR-NEXT: addq %rcx, %rax
-; EGPR-NEXT: movzbl %dil, %ecx
-; EGPR-NEXT: adcq %rcx, %rdx
-; EGPR-NEXT: addq %r30, %rax
-; EGPR-NEXT: adcq %r20, %rdx
-; EGPR-NEXT: addq %rsi, %r8
-; EGPR-NEXT: adcq %r11, %r18
-; EGPR-NEXT: adcq %r17, %rax
-; EGPR-NEXT: adcq %r10, %rdx
-; EGPR-NEXT: addq %r29, %r8
-; EGPR-NEXT: adcq %r13, %r18
-; EGPR-NEXT: adcq %r9, %rax
-; EGPR-NEXT: adcq %r16, %rdx
-; EGPR-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r25 # 8-byte Folded Reload
-; EGPR-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r21 # 8-byte Folded Reload
-; EGPR-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r28 # 8-byte Folded Reload
-; EGPR-NEXT: adcq %r24, %r23
-; EGPR-NEXT: adcq %r22, %r8
-; EGPR-NEXT: adcq %r27, %r18
-; EGPR-NEXT: adcq %r31, %rax
-; EGPR-NEXT: adcq %r26, %rdx
-; EGPR-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r25 # 8-byte Folded Reload
-; EGPR-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r21 # 8-byte Folded Reload
-; EGPR-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r28 # 8-byte Folded Reload
-; EGPR-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r23 # 8-byte Folded Reload
-; EGPR-NEXT: adcq (%rsp), %r8 # 8-byte Folded Reload
-; EGPR-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r18 # 8-byte Folded Reload
-; EGPR-NEXT: adcq %r12, %rax
-; EGPR-NEXT: adcq %rbp, %rdx
-; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; EGPR-NEXT: movq %rsi, (%rcx)
-; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; EGPR-NEXT: movq %rsi, 8(%rcx)
-; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; EGPR-NEXT: movq %rsi, 16(%rcx)
-; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; EGPR-NEXT: movq %rsi, 24(%rcx)
-; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; EGPR-NEXT: movq %rsi, 32(%rcx)
-; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; EGPR-NEXT: movq %rsi, 40(%rcx)
-; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; EGPR-NEXT: movq %rsi, 48(%rcx)
-; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; EGPR-NEXT: movq %rsi, 56(%rcx)
-; EGPR-NEXT: movq %r25, 64(%rcx)
-; EGPR-NEXT: movq %r21, 72(%rcx)
-; EGPR-NEXT: movq %r28, 80(%rcx)
-; EGPR-NEXT: movq %r23, 88(%rcx)
-; EGPR-NEXT: movq %r8, 96(%rcx)
-; EGPR-NEXT: movq %r18, 104(%rcx)
-; EGPR-NEXT: movq %rax, 112(%rcx)
-; EGPR-NEXT: movq %rdx, 120(%rcx)
-; EGPR-NEXT: addq $104, %rsp
+; EGPR-NEXT: adcq %r27, %r13
+; EGPR-NEXT: addq %r10, %rbx
+; EGPR-NEXT: adcq %r9, %r14
+; EGPR-NEXT: adcq %r16, %r25
+; EGPR-NEXT: adcq %r11, %r26
+; EGPR-NEXT: adcq %r18, %r21
+; EGPR-NEXT: adcq %r17, %r28
+; EGPR-NEXT: adcq %r19, %r29
+; EGPR-NEXT: adcq %r22, %r13
+; EGPR-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; EGPR-NEXT: movq %rax, 16(%r9)
+; EGPR-NEXT: movq %rcx, 24(%r9)
+; EGPR-NEXT: movq %rdx, 32(%r9)
+; EGPR-NEXT: movq %rsi, 40(%r9)
+; EGPR-NEXT: movq %rdi, 48(%r9)
+; EGPR-NEXT: movq %r8, 56(%r9)
+; EGPR-NEXT: movq %rbx, 64(%r9)
+; EGPR-NEXT: movq %r14, 72(%r9)
+; EGPR-NEXT: movq %r25, 80(%r9)
+; EGPR-NEXT: movq %r26, 88(%r9)
+; EGPR-NEXT: movq %r21, 96(%r9)
+; EGPR-NEXT: movq %r28, 104(%r9)
+; EGPR-NEXT: movq %r29, 112(%r9)
+; EGPR-NEXT: movq %r13, 120(%r9)
+; EGPR-NEXT: movaps %xmm0, (%r9)
+; EGPR-NEXT: leaq -40(%rbp), %rsp
; EGPR-NEXT: popq %rbx
; EGPR-NEXT: popq %r12
; EGPR-NEXT: popq %r13
@@ -1036,845 +649,608 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; EGPR-NDD-LABEL: test_1024:
; EGPR-NDD: # %bb.0:
; EGPR-NDD-NEXT: pushq %rbp
+; EGPR-NDD-NEXT: movq %rsp, %rbp
; EGPR-NDD-NEXT: pushq %r15
; EGPR-NDD-NEXT: pushq %r14
; EGPR-NDD-NEXT: pushq %r13
; EGPR-NDD-NEXT: pushq %r12
; EGPR-NDD-NEXT: pushq %rbx
-; EGPR-NDD-NEXT: subq $96, %rsp
+; EGPR-NDD-NEXT: andq $-32, %rsp
+; EGPR-NDD-NEXT: subq $1216, %rsp # imm = 0x4C0
; EGPR-NDD-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT: movq %rsi, %r15
-; EGPR-NDD-NEXT: movq %rdi, %r22
-; EGPR-NDD-NEXT: movq (%rdi), %r17
-; EGPR-NDD-NEXT: movq 8(%rdi), %r11
-; EGPR-NDD-NEXT: movq 24(%rdi), %r9
-; EGPR-NDD-NEXT: movq 16(%rdi), %r10
-; EGPR-NDD-NEXT: movq 40(%rdi), %rdi
-; EGPR-NDD-NEXT: movq 32(%r22), %r16
-; EGPR-NDD-NEXT: movq 56(%r22), %r18
-; EGPR-NDD-NEXT: movq 48(%r22), %r25
-; EGPR-NDD-NEXT: movq 24(%rsi), %r14
-; EGPR-NDD-NEXT: movq 16(%rsi), %r26
-; EGPR-NDD-NEXT: movq (%rsi), %r24
-; EGPR-NDD-NEXT: movq 8(%rsi), %r23
-; EGPR-NDD-NEXT: movq %r25, %rax
-; EGPR-NDD-NEXT: mulq %r24
-; EGPR-NDD-NEXT: movq %rdx, %r27
-; EGPR-NDD-NEXT: movq %rax, %r19
-; EGPR-NDD-NEXT: movq %r18, %rax
-; EGPR-NDD-NEXT: mulq %r24
-; EGPR-NDD-NEXT: addq %rax, %r27
-; EGPR-NDD-NEXT: adcq $0, %rdx, %rcx
-; EGPR-NDD-NEXT: movq %r25, %rax
-; EGPR-NDD-NEXT: mulq %r23
-; EGPR-NDD-NEXT: addq %r27, %rax, %rsi
-; EGPR-NDD-NEXT: adcq %rdx, %rcx
-; EGPR-NDD-NEXT: setb %al
-; EGPR-NDD-NEXT: movzbl %al, %r8d
-; EGPR-NDD-NEXT: movq %r18, %rax
-; EGPR-NDD-NEXT: mulq %r23
-; EGPR-NDD-NEXT: addq %rcx, %rax, %r31
-; EGPR-NDD-NEXT: adcq %rdx, %r8
-; EGPR-NDD-NEXT: movq %r16, %rax
-; EGPR-NDD-NEXT: mulq %r24
-; EGPR-NDD-NEXT: movq %rdx, %r30
-; EGPR-NDD-NEXT: movq %rax, %r27
-; EGPR-NDD-NEXT: movq %rdi, %rax
-; EGPR-NDD-NEXT: mulq %r24
-; EGPR-NDD-NEXT: addq %r30, %rax, %rcx
-; EGPR-NDD-NEXT: adcq $0, %rdx, %r30
-; EGPR-NDD-NEXT: movq %r16, %rax
-; EGPR-NDD-NEXT: mulq %r23
-; EGPR-NDD-NEXT: addq %rax, %rcx
-; EGPR-NDD-NEXT: adcq %rdx, %r30
-; EGPR-NDD-NEXT: setb %al
-; EGPR-NDD-NEXT: movzbl %al, %r20d
-; EGPR-NDD-NEXT: movq %rdi, %rax
-; EGPR-NDD-NEXT: mulq %r23
-; EGPR-NDD-NEXT: addq %r30, %rax
-; EGPR-NDD-NEXT: adcq %r20, %rdx
-; EGPR-NDD-NEXT: addq %rax, %r19, %r20
-; EGPR-NDD-NEXT: adcq %rdx, %rsi, %r21
-; EGPR-NDD-NEXT: adcq $0, %r31
-; EGPR-NDD-NEXT: adcq $0, %r8
-; EGPR-NDD-NEXT: movq %r16, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT: movq %r16, %rax
-; EGPR-NDD-NEXT: mulq %r26
-; EGPR-NDD-NEXT: movq %rdx, %r19
-; EGPR-NDD-NEXT: movq %rax, %r30
-; EGPR-NDD-NEXT: movq %rdi, %rax
-; EGPR-NDD-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT: mulq %r26
-; EGPR-NDD-NEXT: addq %rax, %r19
-; EGPR-NDD-NEXT: adcq $0, %rdx, %rsi
-; EGPR-NDD-NEXT: movq %r16, %rax
-; EGPR-NDD-NEXT: mulq %r14
-; EGPR-NDD-NEXT: addq %rax, %r19
-; EGPR-NDD-NEXT: adcq %rdx, %rsi
-; EGPR-NDD-NEXT: setb %al
-; EGPR-NDD-NEXT: movzbl %al, %r28d
-; EGPR-NDD-NEXT: movq %rdi, %rax
-; EGPR-NDD-NEXT: mulq %r14
-; EGPR-NDD-NEXT: addq %rsi, %rax
-; EGPR-NDD-NEXT: adcq %r28, %rdx
-; EGPR-NDD-NEXT: addq %r20, %r30, %rsi
-; EGPR-NDD-NEXT: adcq %r21, %r19, %r20
-; EGPR-NDD-NEXT: adcq $0, %rax
-; EGPR-NDD-NEXT: adcq $0, %rdx
-; EGPR-NDD-NEXT: addq %rax, %r31
-; EGPR-NDD-NEXT: adcq %rdx, %r8
-; EGPR-NDD-NEXT: setb %al
-; EGPR-NDD-NEXT: movzbl %al, %r29d
-; EGPR-NDD-NEXT: movq %r25, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT: movq %r25, %rax
-; EGPR-NDD-NEXT: mulq %r26
-; EGPR-NDD-NEXT: movq %rdx, %r19
-; EGPR-NDD-NEXT: movq %rax, %r30
-; EGPR-NDD-NEXT: movq %r18, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT: movq %r18, %rax
-; EGPR-NDD-NEXT: mulq %r26
-; EGPR-NDD-NEXT: addq %rax, %r19
-; EGPR-NDD-NEXT: adcq $0, %rdx, %r21
-; EGPR-NDD-NEXT: movq %r25, %rax
-; EGPR-NDD-NEXT: mulq %r14
-; EGPR-NDD-NEXT: addq %rax, %r19
-; EGPR-NDD-NEXT: adcq %rdx, %r21
-; EGPR-NDD-NEXT: setb %al
-; EGPR-NDD-NEXT: movzbl %al, %r28d
-; EGPR-NDD-NEXT: movq %r18, %rax
-; EGPR-NDD-NEXT: mulq %r14
-; EGPR-NDD-NEXT: addq %r21, %rax
-; EGPR-NDD-NEXT: adcq %r28, %rdx
-; EGPR-NDD-NEXT: addq %r31, %r30, %r21
-; EGPR-NDD-NEXT: adcq %r8, %r19, %r28
-; EGPR-NDD-NEXT: adcq %rax, %r29
-; EGPR-NDD-NEXT: adcq $0, %rdx, %rdi
-; EGPR-NDD-NEXT: movq %r10, %rax
-; EGPR-NDD-NEXT: mulq %r24
-; EGPR-NDD-NEXT: movq %rdx, %r19
-; EGPR-NDD-NEXT: movq %rax, %r30
-; EGPR-NDD-NEXT: movq %r9, %rax
-; EGPR-NDD-NEXT: mulq %r24
-; EGPR-NDD-NEXT: addq %rax, %r19
-; EGPR-NDD-NEXT: adcq $0, %rdx, %r8
-; EGPR-NDD-NEXT: movq %r10, %rax
-; EGPR-NDD-NEXT: mulq %r23
-; EGPR-NDD-NEXT: addq %rax, %r19
-; EGPR-NDD-NEXT: adcq %rdx, %r8
-; EGPR-NDD-NEXT: setb %al
-; EGPR-NDD-NEXT: movzbl %al, %r31d
-; EGPR-NDD-NEXT: movq %r9, %rax
-; EGPR-NDD-NEXT: mulq %r23
-; EGPR-NDD-NEXT: addq %rax, %r8
-; EGPR-NDD-NEXT: adcq %r31, %rdx, %rbx
-; EGPR-NDD-NEXT: movq %r17, %rax
-; EGPR-NDD-NEXT: mulq %r24
-; EGPR-NDD-NEXT: movq %rdx, %r31
+; EGPR-NDD-NEXT: movq (%rdi), %rax
; EGPR-NDD-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT: movq %r11, %rax
-; EGPR-NDD-NEXT: mulq %r24
-; EGPR-NDD-NEXT: addq %rax, %r31
-; EGPR-NDD-NEXT: adcq $0, %rdx, %r12
-; EGPR-NDD-NEXT: movq %r17, %rax
-; EGPR-NDD-NEXT: mulq %r23
-; EGPR-NDD-NEXT: addq %r31, %rax
+; EGPR-NDD-NEXT: movq 8(%rdi), %rax
; EGPR-NDD-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT: adcq %rdx, %r12
-; EGPR-NDD-NEXT: setb %r31b
-; EGPR-NDD-NEXT: movq %r11, %rax
-; EGPR-NDD-NEXT: mulq %r23
-; EGPR-NDD-NEXT: addq %r12, %rax
-; EGPR-NDD-NEXT: movzbl %r31b, %r31d
-; EGPR-NDD-NEXT: adcq %r31, %rdx
-; EGPR-NDD-NEXT: addq %rax, %r30, %r12
-; EGPR-NDD-NEXT: adcq %rdx, %r19
-; EGPR-NDD-NEXT: adcq $0, %r8
-; EGPR-NDD-NEXT: adcq $0, %rbx
-; EGPR-NDD-NEXT: movq %r17, %rax
-; EGPR-NDD-NEXT: mulq %r26
-; EGPR-NDD-NEXT: movq %rdx, %r30
-; EGPR-NDD-NEXT: movq %rax, %r31
-; EGPR-NDD-NEXT: movq %r11, %rax
-; EGPR-NDD-NEXT: mulq %r26
-; EGPR-NDD-NEXT: addq %rax, %r30
-; EGPR-NDD-NEXT: adcq $0, %rdx, %r13
-; EGPR-NDD-NEXT: movq %r17, %rax
-; EGPR-NDD-NEXT: mulq %r14
-; EGPR-NDD-NEXT: addq %rax, %r30
-; EGPR-NDD-NEXT: adcq %rdx, %r13
-; EGPR-NDD-NEXT: setb %bpl
-; EGPR-NDD-NEXT: movq %r11, %rax
-; EGPR-NDD-NEXT: mulq %r14
-; EGPR-NDD-NEXT: addq %r13, %rax
-; EGPR-NDD-NEXT: movzbl %bpl, %r13d
-; EGPR-NDD-NEXT: adcq %r13, %rdx
-; EGPR-NDD-NEXT: addq %r12, %r31
-; EGPR-NDD-NEXT: movq %r31, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT: adcq %r30, %r19
-; EGPR-NDD-NEXT: movq %r19, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT: adcq $0, %rax
-; EGPR-NDD-NEXT: adcq $0, %rdx
-; EGPR-NDD-NEXT: addq %rax, %r8
-; EGPR-NDD-NEXT: adcq %rdx, %rbx
-; EGPR-NDD-NEXT: setb %r19b
-; EGPR-NDD-NEXT: movq %r10, %r16
-; EGPR-NDD-NEXT: movq %r10, %rax
-; EGPR-NDD-NEXT: mulq %r26
-; EGPR-NDD-NEXT: movq %rdx, %r30
-; EGPR-NDD-NEXT: movq %rax, %r31
-; EGPR-NDD-NEXT: movq %r9, %rax
-; EGPR-NDD-NEXT: mulq %r26
-; EGPR-NDD-NEXT: addq %rax, %r30
-; EGPR-NDD-NEXT: adcq $0, %rdx, %r12
-; EGPR-NDD-NEXT: movq %r10, %rax
-; EGPR-NDD-NEXT: mulq %r14
-; EGPR-NDD-NEXT: addq %rax, %r30
-; EGPR-NDD-NEXT: adcq %rdx, %r12
-; EGPR-NDD-NEXT: setb %bpl
-; EGPR-NDD-NEXT: movq %r9, %rax
-; EGPR-NDD-NEXT: mulq %r14
-; EGPR-NDD-NEXT: addq %r12, %rax
-; EGPR-NDD-NEXT: movzbl %bpl, %r12d
-; EGPR-NDD-NEXT: adcq %r12, %rdx
-; EGPR-NDD-NEXT: addq %r31, %r8
-; EGPR-NDD-NEXT: adcq %r30, %rbx
-; EGPR-NDD-NEXT: movzbl %r19b, %r19d
-; EGPR-NDD-NEXT: adcq %r19, %rax
-; EGPR-NDD-NEXT: adcq $0, %rdx
-; EGPR-NDD-NEXT: addq %r8, %r27, %r12
-; EGPR-NDD-NEXT: movq 32(%r15), %r30
-; EGPR-NDD-NEXT: adcq %rbx, %rcx, %r13
-; EGPR-NDD-NEXT: adcq %rax, %rsi, %rbp
-; EGPR-NDD-NEXT: adcq %rdx, %r20, %rbx
-; EGPR-NDD-NEXT: adcq $0, %r21
-; EGPR-NDD-NEXT: movq %r21, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT: adcq $0, %r28
-; EGPR-NDD-NEXT: adcq $0, %r29
-; EGPR-NDD-NEXT: adcq $0, %rdi
+; EGPR-NDD-NEXT: movq 16(%rdi), %rax
+; EGPR-NDD-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT: movq 24(%rdi), %rax
+; EGPR-NDD-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT: movq 96(%rdi), %rax
+; EGPR-NDD-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT: movq 104(%rdi), %rax
+; EGPR-NDD-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT: movq 112(%rdi), %rax
+; EGPR-NDD-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT: movq 120(%rdi), %rax
+; EGPR-NDD-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT: movq 64(%rdi), %r15
+; EGPR-NDD-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT: movq 72(%rdi), %rax
+; EGPR-NDD-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT: movq 80(%rdi), %rax
+; EGPR-NDD-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT: movq 88(%rdi), %rax
+; EGPR-NDD-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT: movq 32(%rdi), %rax
+; EGPR-NDD-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT: movq 40(%rdi), %rdx
+; EGPR-NDD-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT: movq 48(%rdi), %rcx
+; EGPR-NDD-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT: movq 56(%rdi), %r8
+; EGPR-NDD-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT: movq 96(%rsi), %rdi
; EGPR-NDD-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT: movq %r10, %rax
-; EGPR-NDD-NEXT: mulq %r30
-; EGPR-NDD-NEXT: movq %rdx, %r27
-; EGPR-NDD-NEXT: movq %rax, %r31
-; EGPR-NDD-NEXT: movq %r9, %r19
-; EGPR-NDD-NEXT: movq %r9, %rax
-; EGPR-NDD-NEXT: mulq %r30
-; EGPR-NDD-NEXT: addq %rax, %r27
-; EGPR-NDD-NEXT: adcq $0, %rdx, %rcx
-; EGPR-NDD-NEXT: movq 40(%r15), %r18
-; EGPR-NDD-NEXT: movq %r10, %rax
-; EGPR-NDD-NEXT: mulq %r18
-; EGPR-NDD-NEXT: addq %r27, %rax, %r21
-; EGPR-NDD-NEXT: adcq %rdx, %rcx
-; EGPR-NDD-NEXT: setb %r8b
-; EGPR-NDD-NEXT: movq %r9, %rax
-; EGPR-NDD-NEXT: mulq %r18
-; EGPR-NDD-NEXT: addq %rcx, %rax, %rdi
-; EGPR-NDD-NEXT: movzbl %r8b, %eax
-; EGPR-NDD-NEXT: adcq %rax, %rdx, %rsi
-; EGPR-NDD-NEXT: movq %r17, %rax
-; EGPR-NDD-NEXT: mulq %r30
-; EGPR-NDD-NEXT: movq %rdx, %r20
-; EGPR-NDD-NEXT: movq %rax, %r27
-; EGPR-NDD-NEXT: movq %r11, %r10
-; EGPR-NDD-NEXT: movq %r11, %rax
-; EGPR-NDD-NEXT: mulq %r30
-; EGPR-NDD-NEXT: addq %r20, %rax, %r8
-; EGPR-NDD-NEXT: adcq $0, %rdx, %r20
-; EGPR-NDD-NEXT: movq %r17, %rax
-; EGPR-NDD-NEXT: mulq %r18
-; EGPR-NDD-NEXT: addq %r8, %rax, %r25
-; EGPR-NDD-NEXT: adcq %rdx, %r20
-; EGPR-NDD-NEXT: setb %cl
-; EGPR-NDD-NEXT: movq %r11, %rax
-; EGPR-NDD-NEXT: mulq %r18
-; EGPR-NDD-NEXT: addq %r20, %rax
-; EGPR-NDD-NEXT: movzbl %cl, %ecx
-; EGPR-NDD-NEXT: adcq %rdx, %rcx
-; EGPR-NDD-NEXT: addq %rax, %r31
-; EGPR-NDD-NEXT: adcq %rcx, %r21, %r8
-; EGPR-NDD-NEXT: adcq $0, %rdi
-; EGPR-NDD-NEXT: adcq $0, %rsi, %r9
-; EGPR-NDD-NEXT: movq 48(%r15), %r11
-; EGPR-NDD-NEXT: movq %r17, %rsi
-; EGPR-NDD-NEXT: movq %r17, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT: movq %r17, %rax
-; EGPR-NDD-NEXT: mulq %r11
-; EGPR-NDD-NEXT: movq %rdx, %r20
-; EGPR-NDD-NEXT: movq %rax, %r21
-; EGPR-NDD-NEXT: movq %r10, %rax
-; EGPR-NDD-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT: mulq %r11
-; EGPR-NDD-NEXT: addq %rax, %r20
-; EGPR-NDD-NEXT: adcq $0, %rdx, %rcx
-; EGPR-NDD-NEXT: movq 56(%r15), %r17
-; EGPR-NDD-NEXT: movq %rsi, %rax
-; EGPR-NDD-NEXT: mulq %r17
-; EGPR-NDD-NEXT: addq %rax, %r20
-; EGPR-NDD-NEXT: adcq %rdx, %rcx
-; EGPR-NDD-NEXT: setb %sil
-; EGPR-NDD-NEXT: movq %r10, %rax
-; EGPR-NDD-NEXT: mulq %r17
-; EGPR-NDD-NEXT: addq %rcx, %rax
-; EGPR-NDD-NEXT: movzbl %sil, %ecx
-; EGPR-NDD-NEXT: adcq %rdx, %rcx
-; EGPR-NDD-NEXT: addq %r21, %r31
-; EGPR-NDD-NEXT: adcq %r8, %r20, %r10
-; EGPR-NDD-NEXT: adcq $0, %rax
-; EGPR-NDD-NEXT: adcq $0, %rcx
-; EGPR-NDD-NEXT: addq %rax, %rdi
-; EGPR-NDD-NEXT: adcq %rcx, %r9, %r8
-; EGPR-NDD-NEXT: setb %sil
+; EGPR-NDD-NEXT: movq 104(%rsi), %rdi
+; EGPR-NDD-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT: movq 112(%rsi), %rdi
+; EGPR-NDD-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT: movq 120(%rsi), %rdi
+; EGPR-NDD-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT: movq (%rsi), %rdi
+; EGPR-NDD-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT: movq 8(%rsi), %rdi
+; EGPR-NDD-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT: movq 16(%rsi), %rdi
+; EGPR-NDD-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT: movq 24(%rsi), %rdi
+; EGPR-NDD-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT: movq 32(%rsi), %r13
+; EGPR-NDD-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT: movq 40(%rsi), %r12
+; EGPR-NDD-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT: movq 48(%rsi), %rbx
+; EGPR-NDD-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT: movq 56(%rsi), %r14
+; EGPR-NDD-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT: movq 64(%rsi), %r9
+; EGPR-NDD-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT: movq 72(%rsi), %r16
; EGPR-NDD-NEXT: movq %r16, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT: movq %r16, %rax
-; EGPR-NDD-NEXT: mulq %r11
-; EGPR-NDD-NEXT: movq %rdx, %r20
-; EGPR-NDD-NEXT: movq %rax, %r21
-; EGPR-NDD-NEXT: movq %r19, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT: movq %r19, %rax
-; EGPR-NDD-NEXT: mulq %r11
-; EGPR-NDD-NEXT: addq %rax, %r20
-; EGPR-NDD-NEXT: adcq $0, %rdx, %r9
-; EGPR-NDD-NEXT: movq %r16, %rax
-; EGPR-NDD-NEXT: mulq %r17
-; EGPR-NDD-NEXT: addq %rax, %r20
-; EGPR-NDD-NEXT: adcq %rdx, %r9
-; EGPR-NDD-NEXT: setb %cl
-; EGPR-NDD-NEXT: movq %r19, %rax
-; EGPR-NDD-NEXT: mulq %r17
-; EGPR-NDD-NEXT: addq %r9, %rax
-; EGPR-NDD-NEXT: movzbl %cl, %ecx
-; EGPR-NDD-NEXT: adcq %rdx, %rcx
-; EGPR-NDD-NEXT: addq %r21, %rdi
-; EGPR-NDD-NEXT: adcq %r20, %r8
-; EGPR-NDD-NEXT: movzbl %sil, %edx
-; EGPR-NDD-NEXT: adcq %rdx, %rax
-; EGPR-NDD-NEXT: adcq $0, %rcx
-; EGPR-NDD-NEXT: addq %r12, %r27
-; EGPR-NDD-NEXT: movq %r27, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT: adcq %r13, %r25, %r19
-; EGPR-NDD-NEXT: movq %r19, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT: adcq %rbp, %r31
-; EGPR-NDD-NEXT: movq %r31, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT: adcq %rbx, %r10
+; EGPR-NDD-NEXT: movq 80(%rsi), %r10
; EGPR-NDD-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT: adcq $0, %rdi
-; EGPR-NDD-NEXT: adcq $0, %r8
-; EGPR-NDD-NEXT: adcq $0, %rax
-; EGPR-NDD-NEXT: adcq $0, %rcx
-; EGPR-NDD-NEXT: addq %rdi, {{[-0-9]+}}(%r{{[sb]}}p), %r19 # 8-byte Folded Reload
-; EGPR-NDD-NEXT: adcq %r8, %r28
-; EGPR-NDD-NEXT: adcq %rax, %r29
-; EGPR-NDD-NEXT: adcq %rcx, {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; EGPR-NDD-NEXT: setb %r8b
+; EGPR-NDD-NEXT: movq 88(%rsi), %r11
+; EGPR-NDD-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT: subq $8, %rsp
+; EGPR-NDD-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT: movq %rax, %rsi
+; EGPR-NDD-NEXT: pushq %r11
+; EGPR-NDD-NEXT: pushq %r10
+; EGPR-NDD-NEXT: pushq %r16
+; EGPR-NDD-NEXT: callq __multi5 at PLT
+; EGPR-NDD-NEXT: addq $24, %rsp
+; EGPR-NDD-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT: movq %r15, %rsi
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; EGPR-NDD-NEXT: movq %r13, %r9
+; EGPR-NDD-NEXT: pushq %r14
+; EGPR-NDD-NEXT: pushq %rbx
+; EGPR-NDD-NEXT: pushq %r12
+; EGPR-NDD-NEXT: callq __multi5 at PLT
+; EGPR-NDD-NEXT: addq $24, %rsp
+; EGPR-NDD-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; EGPR-NDD-NEXT: movq %r14, %r9
+; EGPR-NDD-NEXT: pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NDD-NEXT: pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; EGPR-NDD-NEXT: pushq %r15
+; EGPR-NDD-NEXT: callq __multi5 at PLT
+; EGPR-NDD-NEXT: addq $24, %rsp
+; EGPR-NDD-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; EGPR-NDD-NEXT: movq %r13, %rax
-; EGPR-NDD-NEXT: mulq %r30
-; EGPR-NDD-NEXT: movq %rdx, %r27
-; EGPR-NDD-NEXT: movq %rax, %r20
-; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; EGPR-NDD-NEXT: movq %r10, %rax
-; EGPR-NDD-NEXT: mulq %r30
-; EGPR-NDD-NEXT: addq %rax, %r27
-; EGPR-NDD-NEXT: adcq $0, %rdx, %rsi
-; EGPR-NDD-NEXT: movq %r13, %rax
-; EGPR-NDD-NEXT: mulq %r18
-; EGPR-NDD-NEXT: addq %r27, %rax, %rdi
-; EGPR-NDD-NEXT: adcq %rdx, %rsi
-; EGPR-NDD-NEXT: setb %r9b
-; EGPR-NDD-NEXT: movq %r10, %rax
-; EGPR-NDD-NEXT: movq %r10, %r16
-; EGPR-NDD-NEXT: mulq %r18
-; EGPR-NDD-NEXT: addq %rax, %rsi
-; EGPR-NDD-NEXT: movzbl %r9b, %eax
-; EGPR-NDD-NEXT: adcq %rax, %rdx, %r9
-; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r25 # 8-byte Reload
-; EGPR-NDD-NEXT: movq %r25, %rax
-; EGPR-NDD-NEXT: mulq %r30
-; EGPR-NDD-NEXT: movq %rdx, %r21
-; EGPR-NDD-NEXT: movq %rax, %r27
+; EGPR-NDD-NEXT: movq %r13, %rdx
+; EGPR-NDD-NEXT: xorl %ecx, %ecx
+; EGPR-NDD-NEXT: xorl %r8d, %r8d
+; EGPR-NDD-NEXT: movq %r14, %r9
+; EGPR-NDD-NEXT: pushq $0
+; EGPR-NDD-NEXT: pushq $0
+; EGPR-NDD-NEXT: pushq %r15
+; EGPR-NDD-NEXT: callq __multi5 at PLT
+; EGPR-NDD-NEXT: addq $24, %rsp
+; EGPR-NDD-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; EGPR-NDD-NEXT: movq %rbx, %rsi
; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; EGPR-NDD-NEXT: movq %r12, %rax
-; EGPR-NDD-NEXT: mulq %r30
-; EGPR-NDD-NEXT: addq %rax, %r21
-; EGPR-NDD-NEXT: adcq $0, %rdx, %r10
-; EGPR-NDD-NEXT: movq %r25, %rax
-; EGPR-NDD-NEXT: mulq %r18
-; EGPR-NDD-NEXT: addq %r21, %rax, %rbx
-; EGPR-NDD-NEXT: adcq %rdx, %r10
-; EGPR-NDD-NEXT: setb %r31b
-; EGPR-NDD-NEXT: movq %r12, %rax
-; EGPR-NDD-NEXT: mulq %r18
-; EGPR-NDD-NEXT: addq %r10, %rax
-; EGPR-NDD-NEXT: movzbl %r31b, %r10d
-; EGPR-NDD-NEXT: adcq %r10, %rdx
-; EGPR-NDD-NEXT: addq %rax, %r20, %r10
-; EGPR-NDD-NEXT: adcq %rdx, %rdi
-; EGPR-NDD-NEXT: adcq $0, %rsi
-; EGPR-NDD-NEXT: adcq $0, %r9
-; EGPR-NDD-NEXT: movq %r25, %rax
-; EGPR-NDD-NEXT: mulq %r11
-; EGPR-NDD-NEXT: movq %rdx, %r20
-; EGPR-NDD-NEXT: movq %rax, %r21
-; EGPR-NDD-NEXT: movq %r12, %rax
-; EGPR-NDD-NEXT: mulq %r11
-; EGPR-NDD-NEXT: addq %rax, %r20
-; EGPR-NDD-NEXT: adcq $0, %rdx, %r31
-; EGPR-NDD-NEXT: movq %r25, %rax
-; EGPR-NDD-NEXT: mulq %r17
-; EGPR-NDD-NEXT: addq %rax, %r20
-; EGPR-NDD-NEXT: adcq %rdx, %r31
-; EGPR-NDD-NEXT: setb %bpl
-; EGPR-NDD-NEXT: movq %r12, %rax
-; EGPR-NDD-NEXT: mulq %r17
-; EGPR-NDD-NEXT: addq %r31, %rax
-; EGPR-NDD-NEXT: movzbl %bpl, %r31d
-; EGPR-NDD-NEXT: adcq %r31, %rdx
-; EGPR-NDD-NEXT: addq %r21, %r10
-; EGPR-NDD-NEXT: adcq %r20, %rdi
-; EGPR-NDD-NEXT: adcq $0, %rax
-; EGPR-NDD-NEXT: adcq $0, %rdx
-; EGPR-NDD-NEXT: addq %rax, %rsi
-; EGPR-NDD-NEXT: adcq %rdx, %r9
-; EGPR-NDD-NEXT: setb %r31b
-; EGPR-NDD-NEXT: movq %r13, %rax
-; EGPR-NDD-NEXT: mulq %r11
-; EGPR-NDD-NEXT: movq %rdx, %r20
-; EGPR-NDD-NEXT: movq %rax, %r21
-; EGPR-NDD-NEXT: movq %r16, %rax
-; EGPR-NDD-NEXT: mulq %r11
-; EGPR-NDD-NEXT: addq %rax, %r20
-; EGPR-NDD-NEXT: adcq $0, %rdx, %r12
-; EGPR-NDD-NEXT: movq %r13, %rax
-; EGPR-NDD-NEXT: mulq %r17
-; EGPR-NDD-NEXT: addq %rax, %r20
-; EGPR-NDD-NEXT: adcq %rdx, %r12
-; EGPR-NDD-NEXT: setb %bpl
-; EGPR-NDD-NEXT: movq %r16, %rax
-; EGPR-NDD-NEXT: mulq %r17
-; EGPR-NDD-NEXT: addq %r12, %rax
-; EGPR-NDD-NEXT: movzbl %bpl, %r12d
-; EGPR-NDD-NEXT: adcq %r12, %rdx
-; EGPR-NDD-NEXT: addq %r21, %rsi
-; EGPR-NDD-NEXT: adcq %r20, %r9
-; EGPR-NDD-NEXT: movzbl %r31b, %r31d
-; EGPR-NDD-NEXT: adcq %r31, %rax
-; EGPR-NDD-NEXT: adcq $0, %rdx
-; EGPR-NDD-NEXT: addq %r27, %r19
-; EGPR-NDD-NEXT: movq %r19, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT: adcq %rbx, %r28
-; EGPR-NDD-NEXT: movq %r28, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT: adcq %r29, %r10
-; EGPR-NDD-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT: adcq %rdi, %rcx
-; EGPR-NDD-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT: movzbl %r8b, %ecx
+; EGPR-NDD-NEXT: movq %r12, %rdx
+; EGPR-NDD-NEXT: xorl %ecx, %ecx
+; EGPR-NDD-NEXT: xorl %r8d, %r8d
+; EGPR-NDD-NEXT: movq %r14, %r9
+; EGPR-NDD-NEXT: pushq $0
+; EGPR-NDD-NEXT: pushq $0
+; EGPR-NDD-NEXT: pushq %r15
+; EGPR-NDD-NEXT: callq __multi5 at PLT
+; EGPR-NDD-NEXT: addq $24, %rsp
+; EGPR-NDD-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT: movq %rbx, %rsi
+; EGPR-NDD-NEXT: movq %r12, %rdx
+; EGPR-NDD-NEXT: xorl %ecx, %ecx
+; EGPR-NDD-NEXT: xorl %r8d, %r8d
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; EGPR-NDD-NEXT: movq %r12, %r9
+; EGPR-NDD-NEXT: pushq $0
+; EGPR-NDD-NEXT: pushq $0
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; EGPR-NDD-NEXT: pushq %r15
+; EGPR-NDD-NEXT: callq __multi5 at PLT
+; EGPR-NDD-NEXT: addq $24, %rsp
+; EGPR-NDD-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; EGPR-NDD-NEXT: movq %r13, %rdx
+; EGPR-NDD-NEXT: xorl %ecx, %ecx
+; EGPR-NDD-NEXT: xorl %r8d, %r8d
+; EGPR-NDD-NEXT: movq %r12, %r9
+; EGPR-NDD-NEXT: movq %r12, %r13
+; EGPR-NDD-NEXT: pushq $0
+; EGPR-NDD-NEXT: pushq $0
+; EGPR-NDD-NEXT: pushq %r15
+; EGPR-NDD-NEXT: callq __multi5 at PLT
+; EGPR-NDD-NEXT: addq $24, %rsp
+; EGPR-NDD-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; EGPR-NDD-NEXT: movq %r12, %rsi
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; EGPR-NDD-NEXT: movq %rbx, %rdx
+; EGPR-NDD-NEXT: xorl %ecx, %ecx
+; EGPR-NDD-NEXT: xorl %r8d, %r8d
+; EGPR-NDD-NEXT: movq %r14, %r9
+; EGPR-NDD-NEXT: pushq $0
+; EGPR-NDD-NEXT: pushq $0
+; EGPR-NDD-NEXT: pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NDD-NEXT: callq __multi5 at PLT
+; EGPR-NDD-NEXT: addq $24, %rsp
+; EGPR-NDD-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT: movq %r12, %rsi
+; EGPR-NDD-NEXT: movq %rbx, %rdx
+; EGPR-NDD-NEXT: movq %rbx, %r14
+; EGPR-NDD-NEXT: xorl %ecx, %ecx
+; EGPR-NDD-NEXT: xorl %r8d, %r8d
+; EGPR-NDD-NEXT: movq %r13, %r9
+; EGPR-NDD-NEXT: pushq $0
+; EGPR-NDD-NEXT: pushq $0
+; EGPR-NDD-NEXT: pushq %r15
+; EGPR-NDD-NEXT: callq __multi5 at PLT
+; EGPR-NDD-NEXT: addq $24, %rsp
+; EGPR-NDD-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT: movq %r12, %rsi
+; EGPR-NDD-NEXT: movq %rbx, %rdx
+; EGPR-NDD-NEXT: xorl %ecx, %ecx
+; EGPR-NDD-NEXT: xorl %r8d, %r8d
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; EGPR-NDD-NEXT: pushq $0
+; EGPR-NDD-NEXT: pushq $0
+; EGPR-NDD-NEXT: pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NDD-NEXT: callq __multi5 at PLT
+; EGPR-NDD-NEXT: addq $24, %rsp
+; EGPR-NDD-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT: movq %r12, %rsi
+; EGPR-NDD-NEXT: movq %rbx, %rdx
+; EGPR-NDD-NEXT: xorl %ecx, %ecx
+; EGPR-NDD-NEXT: xorl %r8d, %r8d
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; EGPR-NDD-NEXT: pushq $0
+; EGPR-NDD-NEXT: pushq $0
+; EGPR-NDD-NEXT: pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NDD-NEXT: callq __multi5 at PLT
+; EGPR-NDD-NEXT: addq $24, %rsp
+; EGPR-NDD-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; EGPR-NDD-NEXT: movq %r15, %rsi
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; EGPR-NDD-NEXT: movq %r12, %rcx
+; EGPR-NDD-NEXT: movq %rbx, %r8
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; EGPR-NDD-NEXT: pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NDD-NEXT: pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NDD-NEXT: pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NDD-NEXT: callq __multi5 at PLT
+; EGPR-NDD-NEXT: addq $24, %rsp
+; EGPR-NDD-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; EGPR-NDD-NEXT: xorl %ecx, %ecx
+; EGPR-NDD-NEXT: xorl %r8d, %r8d
+; EGPR-NDD-NEXT: movq %r12, %r9
+; EGPR-NDD-NEXT: pushq $0
+; EGPR-NDD-NEXT: pushq $0
+; EGPR-NDD-NEXT: pushq %rbx
+; EGPR-NDD-NEXT: callq __multi5 at PLT
+; EGPR-NDD-NEXT: addq $24, %rsp
+; EGPR-NDD-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; EGPR-NDD-NEXT: movq %r13, %rsi
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; EGPR-NDD-NEXT: movq %rbx, %rdx
+; EGPR-NDD-NEXT: xorl %ecx, %ecx
+; EGPR-NDD-NEXT: xorl %r8d, %r8d
+; EGPR-NDD-NEXT: movq %r12, %r9
+; EGPR-NDD-NEXT: pushq $0
+; EGPR-NDD-NEXT: pushq $0
+; EGPR-NDD-NEXT: pushq %r14
+; EGPR-NDD-NEXT: callq __multi5 at PLT
+; EGPR-NDD-NEXT: addq $24, %rsp
+; EGPR-NDD-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT: movq %r13, %rsi
+; EGPR-NDD-NEXT: movq %rbx, %rdx
+; EGPR-NDD-NEXT: xorl %ecx, %ecx
+; EGPR-NDD-NEXT: xorl %r8d, %r8d
+; EGPR-NDD-NEXT: movq %r15, %r9
+; EGPR-NDD-NEXT: pushq $0
+; EGPR-NDD-NEXT: pushq $0
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; EGPR-NDD-NEXT: pushq %r13
+; EGPR-NDD-NEXT: callq __multi5 at PLT
+; EGPR-NDD-NEXT: addq $24, %rsp
+; EGPR-NDD-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; EGPR-NDD-NEXT: xorl %ecx, %ecx
+; EGPR-NDD-NEXT: xorl %r8d, %r8d
+; EGPR-NDD-NEXT: movq %r15, %r9
+; EGPR-NDD-NEXT: pushq $0
+; EGPR-NDD-NEXT: pushq $0
+; EGPR-NDD-NEXT: pushq %r13
+; EGPR-NDD-NEXT: callq __multi5 at PLT
+; EGPR-NDD-NEXT: addq $24, %rsp
+; EGPR-NDD-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; EGPR-NDD-NEXT: movq %r14, %rsi
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; EGPR-NDD-NEXT: movq %rbx, %rdx
+; EGPR-NDD-NEXT: xorl %ecx, %ecx
+; EGPR-NDD-NEXT: xorl %r8d, %r8d
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; EGPR-NDD-NEXT: pushq $0
+; EGPR-NDD-NEXT: pushq $0
+; EGPR-NDD-NEXT: pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NDD-NEXT: callq __multi5 at PLT
+; EGPR-NDD-NEXT: addq $24, %rsp
+; EGPR-NDD-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT: movq %r14, %rsi
+; EGPR-NDD-NEXT: movq %rbx, %rdx
+; EGPR-NDD-NEXT: xorl %ecx, %ecx
+; EGPR-NDD-NEXT: xorl %r8d, %r8d
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; EGPR-NDD-NEXT: pushq $0
+; EGPR-NDD-NEXT: pushq $0
+; EGPR-NDD-NEXT: pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NDD-NEXT: callq __multi5 at PLT
+; EGPR-NDD-NEXT: addq $24, %rsp
+; EGPR-NDD-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; EGPR-NDD-NEXT: xorl %ecx, %ecx
+; EGPR-NDD-NEXT: xorl %r8d, %r8d
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; EGPR-NDD-NEXT: movq %r14, %r9
+; EGPR-NDD-NEXT: pushq $0
+; EGPR-NDD-NEXT: pushq $0
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; EGPR-NDD-NEXT: pushq %rbx
+; EGPR-NDD-NEXT: callq __multi5 at PLT
+; EGPR-NDD-NEXT: addq $24, %rsp
+; EGPR-NDD-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; EGPR-NDD-NEXT: movq %r15, %rsi
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; EGPR-NDD-NEXT: movq %r12, %rdx
+; EGPR-NDD-NEXT: xorl %ecx, %ecx
+; EGPR-NDD-NEXT: xorl %r8d, %r8d
+; EGPR-NDD-NEXT: movq %r14, %r9
+; EGPR-NDD-NEXT: pushq $0
+; EGPR-NDD-NEXT: pushq $0
+; EGPR-NDD-NEXT: pushq %rbx
+; EGPR-NDD-NEXT: callq __multi5 at PLT
+; EGPR-NDD-NEXT: addq $24, %rsp
+; EGPR-NDD-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT: movq %r15, %rsi
+; EGPR-NDD-NEXT: movq %r12, %rdx
+; EGPR-NDD-NEXT: xorl %ecx, %ecx
+; EGPR-NDD-NEXT: xorl %r8d, %r8d
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; EGPR-NDD-NEXT: movq %rbx, %r9
+; EGPR-NDD-NEXT: pushq $0
+; EGPR-NDD-NEXT: pushq $0
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; EGPR-NDD-NEXT: pushq %r15
+; EGPR-NDD-NEXT: callq __multi5 at PLT
+; EGPR-NDD-NEXT: addq $24, %rsp
+; EGPR-NDD-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; EGPR-NDD-NEXT: xorl %ecx, %ecx
+; EGPR-NDD-NEXT: xorl %r8d, %r8d
+; EGPR-NDD-NEXT: movq %rbx, %r9
+; EGPR-NDD-NEXT: pushq $0
+; EGPR-NDD-NEXT: pushq $0
+; EGPR-NDD-NEXT: pushq %r15
+; EGPR-NDD-NEXT: callq __multi5 at PLT
+; EGPR-NDD-NEXT: addq $24, %rsp
+; EGPR-NDD-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; EGPR-NDD-NEXT: movq %r12, %rsi
+; EGPR-NDD-NEXT: movq %r13, %rdx
+; EGPR-NDD-NEXT: xorl %ecx, %ecx
+; EGPR-NDD-NEXT: xorl %r8d, %r8d
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; EGPR-NDD-NEXT: movq %rbx, %r9
+; EGPR-NDD-NEXT: pushq $0
+; EGPR-NDD-NEXT: pushq $0
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; EGPR-NDD-NEXT: pushq %r14
+; EGPR-NDD-NEXT: callq __multi5 at PLT
+; EGPR-NDD-NEXT: addq $24, %rsp
+; EGPR-NDD-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT: movq %r12, %rsi
+; EGPR-NDD-NEXT: movq %r13, %rdx
+; EGPR-NDD-NEXT: xorl %ecx, %ecx
+; EGPR-NDD-NEXT: xorl %r8d, %r8d
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; EGPR-NDD-NEXT: pushq $0
+; EGPR-NDD-NEXT: pushq $0
+; EGPR-NDD-NEXT: pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NDD-NEXT: callq __multi5 at PLT
+; EGPR-NDD-NEXT: addq $24, %rsp
+; EGPR-NDD-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT: movq %r12, %rsi
+; EGPR-NDD-NEXT: movq %r13, %rdx
+; EGPR-NDD-NEXT: xorl %ecx, %ecx
+; EGPR-NDD-NEXT: xorl %r8d, %r8d
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; EGPR-NDD-NEXT: pushq $0
+; EGPR-NDD-NEXT: pushq $0
+; EGPR-NDD-NEXT: pushq %r15
+; EGPR-NDD-NEXT: callq __multi5 at PLT
+; EGPR-NDD-NEXT: addq $24, %rsp
+; EGPR-NDD-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT: movq %r12, %rsi
+; EGPR-NDD-NEXT: movq %r13, %rdx
+; EGPR-NDD-NEXT: xorl %ecx, %ecx
+; EGPR-NDD-NEXT: xorl %r8d, %r8d
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; EGPR-NDD-NEXT: pushq $0
+; EGPR-NDD-NEXT: pushq $0
+; EGPR-NDD-NEXT: pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NDD-NEXT: callq __multi5 at PLT
+; EGPR-NDD-NEXT: addq $24, %rsp
+; EGPR-NDD-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT: movq %rbx, %rsi
+; EGPR-NDD-NEXT: movq %r14, %rdx
+; EGPR-NDD-NEXT: xorl %ecx, %ecx
+; EGPR-NDD-NEXT: xorl %r8d, %r8d
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; EGPR-NDD-NEXT: pushq $0
+; EGPR-NDD-NEXT: pushq $0
+; EGPR-NDD-NEXT: pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NDD-NEXT: callq __multi5 at PLT
+; EGPR-NDD-NEXT: addq $24, %rsp
+; EGPR-NDD-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; EGPR-NDD-NEXT: movq %rbx, %rsi
+; EGPR-NDD-NEXT: movq %r14, %rdx
+; EGPR-NDD-NEXT: xorl %ecx, %ecx
+; EGPR-NDD-NEXT: xorl %r8d, %r8d
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; EGPR-NDD-NEXT: pushq $0
+; EGPR-NDD-NEXT: pushq $0
+; EGPR-NDD-NEXT: pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; EGPR-NDD-NEXT: callq __multi5 at PLT
+; EGPR-NDD-NEXT: addq $32, %rsp
+; EGPR-NDD-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; EGPR-NDD-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; EGPR-NDD-NEXT: addq {{[0-9]+}}(%rsp), %rax
+; EGPR-NDD-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
+; EGPR-NDD-NEXT: adcq $0, {{[0-9]+}}(%rsp), %rdx
+; EGPR-NDD-NEXT: adcq $0, {{[0-9]+}}(%rsp), %rsi
+; EGPR-NDD-NEXT: addq {{[0-9]+}}(%rsp), %rax, %rdi
+; EGPR-NDD-NEXT: adcq {{[0-9]+}}(%rsp), %rcx, %r8
+; EGPR-NDD-NEXT: adcq $0, {{[0-9]+}}(%rsp), %rax
+; EGPR-NDD-NEXT: adcq $0, {{[0-9]+}}(%rsp), %rcx
+; EGPR-NDD-NEXT: addq %rdx, %rax
; EGPR-NDD-NEXT: adcq %rsi, %rcx
-; EGPR-NDD-NEXT: movq %rcx, (%rsp) # 8-byte Spill
+; EGPR-NDD-NEXT: setb %dl
+; EGPR-NDD-NEXT: movzbl %dl, %edx
+; EGPR-NDD-NEXT: addq {{[0-9]+}}(%rsp), %rax, %rsi
+; EGPR-NDD-NEXT: adcq {{[0-9]+}}(%rsp), %rcx, %r9
+; EGPR-NDD-NEXT: adcq {{[0-9]+}}(%rsp), %rdx
+; EGPR-NDD-NEXT: adcq $0, {{[0-9]+}}(%rsp), %r10
+; EGPR-NDD-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; EGPR-NDD-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; EGPR-NDD-NEXT: addq {{[0-9]+}}(%rsp), %rax
+; EGPR-NDD-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
+; EGPR-NDD-NEXT: adcq $0, {{[0-9]+}}(%rsp), %r11
+; EGPR-NDD-NEXT: adcq $0, {{[0-9]+}}(%rsp), %r16
+; EGPR-NDD-NEXT: addq {{[0-9]+}}(%rsp), %rax, %rbx
+; EGPR-NDD-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
+; EGPR-NDD-NEXT: adcq $0, {{[0-9]+}}(%rsp), %r17
+; EGPR-NDD-NEXT: adcq $0, {{[0-9]+}}(%rsp), %r18
+; EGPR-NDD-NEXT: addq %r17, %r11
+; EGPR-NDD-NEXT: adcq %r18, %r16
+; EGPR-NDD-NEXT: setb %r17b
+; EGPR-NDD-NEXT: movzbl %r17b, %r17d
+; EGPR-NDD-NEXT: addq {{[0-9]+}}(%rsp), %r11
+; EGPR-NDD-NEXT: adcq {{[0-9]+}}(%rsp), %r16
+; EGPR-NDD-NEXT: adcq {{[0-9]+}}(%rsp), %r17
+; EGPR-NDD-NEXT: adcq $0, {{[0-9]+}}(%rsp), %r18
+; EGPR-NDD-NEXT: addq {{[0-9]+}}(%rsp), %r11
+; EGPR-NDD-NEXT: adcq {{[0-9]+}}(%rsp), %r16
+; EGPR-NDD-NEXT: adcq %r17, %rdi
+; EGPR-NDD-NEXT: adcq %r18, %r8
+; EGPR-NDD-NEXT: adcq $0, %rsi, %r17
; EGPR-NDD-NEXT: adcq $0, %r9
-; EGPR-NDD-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT: adcq $0, %rax
-; EGPR-NDD-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT: adcq $0, %rdx
-; EGPR-NDD-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT: movq 64(%r22), %r20
-; EGPR-NDD-NEXT: movq %r26, %rax
-; EGPR-NDD-NEXT: mulq %r20
-; EGPR-NDD-NEXT: movq %rdx, %r27
-; EGPR-NDD-NEXT: movq %rax, %r28
-; EGPR-NDD-NEXT: movq %r14, %rax
-; EGPR-NDD-NEXT: mulq %r20
-; EGPR-NDD-NEXT: addq %rax, %r27
-; EGPR-NDD-NEXT: adcq $0, %rdx, %rcx
-; EGPR-NDD-NEXT: movq 72(%r22), %r21
-; EGPR-NDD-NEXT: movq %r26, %rax
-; EGPR-NDD-NEXT: mulq %r21
-; EGPR-NDD-NEXT: addq %rax, %r27
-; EGPR-NDD-NEXT: adcq %rdx, %rcx
-; EGPR-NDD-NEXT: setb %sil
-; EGPR-NDD-NEXT: movq %r14, %rax
-; EGPR-NDD-NEXT: mulq %r21
-; EGPR-NDD-NEXT: addq %rax, %rcx
-; EGPR-NDD-NEXT: movzbl %sil, %eax
-; EGPR-NDD-NEXT: adcq %rax, %rdx, %rsi
-; EGPR-NDD-NEXT: movq %r24, %rax
-; EGPR-NDD-NEXT: mulq %r20
-; EGPR-NDD-NEXT: movq %rdx, %r29
-; EGPR-NDD-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT: movq %r23, %rax
-; EGPR-NDD-NEXT: mulq %r20
-; EGPR-NDD-NEXT: addq %rax, %r29
-; EGPR-NDD-NEXT: adcq $0, %rdx, %rdi
-; EGPR-NDD-NEXT: movq %r24, %rax
-; EGPR-NDD-NEXT: mulq %r21
-; EGPR-NDD-NEXT: addq %r29, %rax
-; EGPR-NDD-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT: adcq %rdx, %rdi
-; EGPR-NDD-NEXT: setb %r8b
-; EGPR-NDD-NEXT: movq %r23, %rax
-; EGPR-NDD-NEXT: mulq %r21
-; EGPR-NDD-NEXT: addq %rdi, %rax
-; EGPR-NDD-NEXT: movzbl %r8b, %edi
-; EGPR-NDD-NEXT: adcq %rdi, %rdx
-; EGPR-NDD-NEXT: addq %rax, %r28, %rdi
-; EGPR-NDD-NEXT: adcq %rdx, %r27
-; EGPR-NDD-NEXT: adcq $0, %rcx
-; EGPR-NDD-NEXT: adcq $0, %rsi
-; EGPR-NDD-NEXT: movq 80(%r22), %r8
-; EGPR-NDD-NEXT: movq %r24, %rax
-; EGPR-NDD-NEXT: mulq %r8
-; EGPR-NDD-NEXT: movq %rdx, %r28
-; EGPR-NDD-NEXT: movq %rax, %r29
-; EGPR-NDD-NEXT: movq %r23, %rax
-; EGPR-NDD-NEXT: mulq %r8
-; EGPR-NDD-NEXT: addq %rax, %r28
-; EGPR-NDD-NEXT: adcq $0, %rdx, %r9
-; EGPR-NDD-NEXT: movq 88(%r22), %rbx
-; EGPR-NDD-NEXT: movq %r24, %rax
-; EGPR-NDD-NEXT: mulq %rbx
-; EGPR-NDD-NEXT: addq %rax, %r28
-; EGPR-NDD-NEXT: adcq %rdx, %r9
-; EGPR-NDD-NEXT: setb %r10b
-; EGPR-NDD-NEXT: movq %r23, %rax
-; EGPR-NDD-NEXT: mulq %rbx
-; EGPR-NDD-NEXT: addq %r9, %rax
-; EGPR-NDD-NEXT: movzbl %r10b, %r9d
-; EGPR-NDD-NEXT: adcq %r9, %rdx
-; EGPR-NDD-NEXT: addq %r29, %rdi
-; EGPR-NDD-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT: adcq %r27, %r28, %rbp
-; EGPR-NDD-NEXT: adcq $0, %rax
-; EGPR-NDD-NEXT: adcq $0, %rdx
-; EGPR-NDD-NEXT: addq %rax, %rcx
-; EGPR-NDD-NEXT: adcq %rdx, %rsi
-; EGPR-NDD-NEXT: setb %dil
-; EGPR-NDD-NEXT: movq %r26, %rax
-; EGPR-NDD-NEXT: mulq %r8
-; EGPR-NDD-NEXT: movq %rdx, %r28
-; EGPR-NDD-NEXT: movq %rax, %r29
-; EGPR-NDD-NEXT: movq %r14, %rax
-; EGPR-NDD-NEXT: mulq %r8
-; EGPR-NDD-NEXT: addq %rax, %r28
-; EGPR-NDD-NEXT: adcq $0, %rdx, %r9
-; EGPR-NDD-NEXT: movq %r26, %rax
-; EGPR-NDD-NEXT: mulq %rbx
-; EGPR-NDD-NEXT: addq %rax, %r28
-; EGPR-NDD-NEXT: adcq %rdx, %r9
-; EGPR-NDD-NEXT: setb %r10b
-; EGPR-NDD-NEXT: movq %r14, %rax
-; EGPR-NDD-NEXT: mulq %rbx
-; EGPR-NDD-NEXT: addq %r9, %rax
-; EGPR-NDD-NEXT: movzbl %r10b, %r9d
-; EGPR-NDD-NEXT: adcq %r9, %rdx
-; EGPR-NDD-NEXT: addq %rcx, %r29, %r27
-; EGPR-NDD-NEXT: adcq %rsi, %r28, %r12
-; EGPR-NDD-NEXT: movzbl %dil, %r19d
-; EGPR-NDD-NEXT: adcq %rax, %r19
-; EGPR-NDD-NEXT: adcq $0, %rdx, %r29
-; EGPR-NDD-NEXT: imulq %r30, %rbx
-; EGPR-NDD-NEXT: movq %r30, %rax
-; EGPR-NDD-NEXT: mulq %r8
-; EGPR-NDD-NEXT: movq %rax, %r28
-; EGPR-NDD-NEXT: addq %rbx, %rdx
-; EGPR-NDD-NEXT: imulq %r18, %r8
-; EGPR-NDD-NEXT: addq %rdx, %r8
-; EGPR-NDD-NEXT: imulq %r21, %r11, %rcx
-; EGPR-NDD-NEXT: movq %r11, %rax
-; EGPR-NDD-NEXT: mulq %r20
-; EGPR-NDD-NEXT: addq %rdx, %rcx
-; EGPR-NDD-NEXT: imulq %r20, %r17, %r16
-; EGPR-NDD-NEXT: addq %r16, %rcx
-; EGPR-NDD-NEXT: addq %r28, %rax, %rsi
-; EGPR-NDD-NEXT: adcq %rcx, %r8
-; EGPR-NDD-NEXT: movq %r20, %rax
-; EGPR-NDD-NEXT: mulq %r30
-; EGPR-NDD-NEXT: movq %rdx, %r28
-; EGPR-NDD-NEXT: movq %rax, %r31
-; EGPR-NDD-NEXT: movq %r21, %rax
-; EGPR-NDD-NEXT: mulq %r30
-; EGPR-NDD-NEXT: addq %r28, %rax, %rcx
-; EGPR-NDD-NEXT: adcq $0, %rdx, %rdi
-; EGPR-NDD-NEXT: movq %r20, %rax
-; EGPR-NDD-NEXT: mulq %r18
-; EGPR-NDD-NEXT: addq %rax, %rcx
-; EGPR-NDD-NEXT: adcq %rdx, %rdi
-; EGPR-NDD-NEXT: setb %r9b
-; EGPR-NDD-NEXT: movq %r21, %rax
-; EGPR-NDD-NEXT: mulq %r18
-; EGPR-NDD-NEXT: addq %rdi, %rax
-; EGPR-NDD-NEXT: movzbl %r9b, %edi
-; EGPR-NDD-NEXT: adcq %rdi, %rdx
-; EGPR-NDD-NEXT: addq %rax, %rsi
-; EGPR-NDD-NEXT: adcq %rdx, %r8
-; EGPR-NDD-NEXT: movq 112(%r22), %rdi
-; EGPR-NDD-NEXT: movq %r24, %rax
-; EGPR-NDD-NEXT: mulq %rdi
-; EGPR-NDD-NEXT: movq %rax, %r30
-; EGPR-NDD-NEXT: imulq %r23, %rdi
-; EGPR-NDD-NEXT: addq %rdi, %rdx
-; EGPR-NDD-NEXT: imulq 120(%r22), %r24, %rax
-; EGPR-NDD-NEXT: leaq (%rdx,%rax), %r9
-; EGPR-NDD-NEXT: movq 96(%r22), %r20
-; EGPR-NDD-NEXT: movq 104(%r22), %rdi
-; EGPR-NDD-NEXT: imulq %rdi, %r26, %r10
-; EGPR-NDD-NEXT: movq %r26, %rax
-; EGPR-NDD-NEXT: mulq %r20
-; EGPR-NDD-NEXT: addq %r10, %rdx
-; EGPR-NDD-NEXT: imulq %r20, %r14, %r25
-; EGPR-NDD-NEXT: addq %r25, %rdx
-; EGPR-NDD-NEXT: addq %rax, %r30
-; EGPR-NDD-NEXT: adcq %rdx, %r9
-; EGPR-NDD-NEXT: movq %r20, %rax
-; EGPR-NDD-NEXT: mulq %r24
-; EGPR-NDD-NEXT: movq %rdx, %r25
-; EGPR-NDD-NEXT: movq %rax, %r26
-; EGPR-NDD-NEXT: movq %rdi, %rax
-; EGPR-NDD-NEXT: mulq %r24
-; EGPR-NDD-NEXT: addq %rax, %r25
-; EGPR-NDD-NEXT: adcq $0, %rdx, %r10
-; EGPR-NDD-NEXT: movq %r20, %rax
-; EGPR-NDD-NEXT: mulq %r23
-; EGPR-NDD-NEXT: addq %rax, %r25
-; EGPR-NDD-NEXT: adcq %rdx, %r10
-; EGPR-NDD-NEXT: setb %r11b
-; EGPR-NDD-NEXT: movq %rdi, %rax
-; EGPR-NDD-NEXT: mulq %r23
-; EGPR-NDD-NEXT: addq %r10, %rax
-; EGPR-NDD-NEXT: movzbl %r11b, %edi
-; EGPR-NDD-NEXT: adcq %rdi, %rdx
-; EGPR-NDD-NEXT: addq %r30, %rax
-; EGPR-NDD-NEXT: adcq %r9, %rdx
-; EGPR-NDD-NEXT: addq %r31, %r26
-; EGPR-NDD-NEXT: adcq %r25, %rcx
-; EGPR-NDD-NEXT: adcq %rsi, %rax
-; EGPR-NDD-NEXT: adcq %r8, %rdx
-; EGPR-NDD-NEXT: addq %r26, %r27, %rbx
-; EGPR-NDD-NEXT: adcq %rcx, %r12
-; EGPR-NDD-NEXT: adcq %rax, %r19, %r13
-; EGPR-NDD-NEXT: adcq %rdx, %r29, %r28
-; EGPR-NDD-NEXT: movq 80(%r15), %r24
-; EGPR-NDD-NEXT: movq %r24, %rax
-; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r16 # 8-byte Reload
-; EGPR-NDD-NEXT: mulq %r16
-; EGPR-NDD-NEXT: movq %rax, %r30
-; EGPR-NDD-NEXT: movq %rdx, %rdi
-; EGPR-NDD-NEXT: movq 88(%r15), %r22
-; EGPR-NDD-NEXT: movq %r22, %rax
-; EGPR-NDD-NEXT: mulq %r16
-; EGPR-NDD-NEXT: addq %rax, %rdi
-; EGPR-NDD-NEXT: adcq $0, %rdx, %rcx
-; EGPR-NDD-NEXT: movq %r24, %rax
-; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r23 # 8-byte Reload
-; EGPR-NDD-NEXT: mulq %r23
-; EGPR-NDD-NEXT: addq %rax, %rdi
-; EGPR-NDD-NEXT: adcq %rdx, %rcx
-; EGPR-NDD-NEXT: setb %sil
-; EGPR-NDD-NEXT: movq %r22, %rax
-; EGPR-NDD-NEXT: mulq %r23
-; EGPR-NDD-NEXT: addq %rax, %rcx
-; EGPR-NDD-NEXT: movzbl %sil, %eax
-; EGPR-NDD-NEXT: adcq %rax, %rdx, %rsi
-; EGPR-NDD-NEXT: movq 64(%r15), %r26
-; EGPR-NDD-NEXT: movq %r26, %rax
-; EGPR-NDD-NEXT: mulq %r16
-; EGPR-NDD-NEXT: movq %rax, %r21
-; EGPR-NDD-NEXT: movq %rdx, %r31
-; EGPR-NDD-NEXT: movq 72(%r15), %r25
-; EGPR-NDD-NEXT: movq %r25, %rax
-; EGPR-NDD-NEXT: mulq %r16
-; EGPR-NDD-NEXT: addq %rax, %r31
-; EGPR-NDD-NEXT: adcq $0, %rdx, %r8
-; EGPR-NDD-NEXT: movq %r26, %rax
-; EGPR-NDD-NEXT: mulq %r23
-; EGPR-NDD-NEXT: addq %r31, %rax, %r29
-; EGPR-NDD-NEXT: adcq %rdx, %r8
-; EGPR-NDD-NEXT: setb %r9b
-; EGPR-NDD-NEXT: movq %r25, %rax
-; EGPR-NDD-NEXT: mulq %r23
-; EGPR-NDD-NEXT: addq %r8, %rax
-; EGPR-NDD-NEXT: movzbl %r9b, %r8d
-; EGPR-NDD-NEXT: adcq %r8, %rdx
-; EGPR-NDD-NEXT: addq %rax, %r30, %r20
-; EGPR-NDD-NEXT: adcq %rdx, %rdi
-; EGPR-NDD-NEXT: adcq $0, %rcx
-; EGPR-NDD-NEXT: adcq $0, %rsi
-; EGPR-NDD-NEXT: movq %r26, %rax
-; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; EGPR-NDD-NEXT: mulq %r10
-; EGPR-NDD-NEXT: movq %rdx, %r30
-; EGPR-NDD-NEXT: movq %rax, %r31
-; EGPR-NDD-NEXT: movq %r25, %rax
-; EGPR-NDD-NEXT: mulq %r10
-; EGPR-NDD-NEXT: addq %rax, %r30
-; EGPR-NDD-NEXT: adcq $0, %rdx, %r8
-; EGPR-NDD-NEXT: movq %r26, %rax
-; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; EGPR-NDD-NEXT: mulq %r11
-; EGPR-NDD-NEXT: addq %r30, %rax, %r27
-; EGPR-NDD-NEXT: adcq %rdx, %r8
-; EGPR-NDD-NEXT: setb %r9b
-; EGPR-NDD-NEXT: movq %r25, %rax
-; EGPR-NDD-NEXT: mulq %r11
-; EGPR-NDD-NEXT: addq %r8, %rax
-; EGPR-NDD-NEXT: movzbl %r9b, %r8d
-; EGPR-NDD-NEXT: adcq %r8, %rdx
-; EGPR-NDD-NEXT: addq %r31, %r20
-; EGPR-NDD-NEXT: adcq %rdi, %r27
-; EGPR-NDD-NEXT: adcq $0, %rax
-; EGPR-NDD-NEXT: adcq $0, %rdx
-; EGPR-NDD-NEXT: addq %rax, %rcx
-; EGPR-NDD-NEXT: adcq %rdx, %rsi
-; EGPR-NDD-NEXT: setb %dil
-; EGPR-NDD-NEXT: movq %r24, %rax
-; EGPR-NDD-NEXT: mulq %r10
-; EGPR-NDD-NEXT: movq %rdx, %r30
-; EGPR-NDD-NEXT: movq %rax, %r31
-; EGPR-NDD-NEXT: movq %r22, %rax
-; EGPR-NDD-NEXT: mulq %r10
-; EGPR-NDD-NEXT: addq %rax, %r30
-; EGPR-NDD-NEXT: adcq $0, %rdx, %r8
-; EGPR-NDD-NEXT: movq %r24, %rax
-; EGPR-NDD-NEXT: mulq %r11
-; EGPR-NDD-NEXT: addq %r30, %rax, %r19
-; EGPR-NDD-NEXT: adcq %rdx, %r8
+; EGPR-NDD-NEXT: adcq $0, %rdx, %r18
+; EGPR-NDD-NEXT: adcq $0, %r10
+; EGPR-NDD-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; EGPR-NDD-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; EGPR-NDD-NEXT: addq {{[0-9]+}}(%rsp), %rdx
+; EGPR-NDD-NEXT: adcq {{[0-9]+}}(%rsp), %rsi
+; EGPR-NDD-NEXT: adcq $0, {{[0-9]+}}(%rsp), %r19
+; EGPR-NDD-NEXT: adcq $0, {{[0-9]+}}(%rsp), %r22
+; EGPR-NDD-NEXT: addq {{[0-9]+}}(%rsp), %rdx, %r23
+; EGPR-NDD-NEXT: adcq {{[0-9]+}}(%rsp), %rsi, %r24
+; EGPR-NDD-NEXT: adcq $0, {{[0-9]+}}(%rsp), %rdx
+; EGPR-NDD-NEXT: adcq $0, {{[0-9]+}}(%rsp), %rsi
+; EGPR-NDD-NEXT: addq %r19, %rdx
+; EGPR-NDD-NEXT: adcq %r22, %rsi
+; EGPR-NDD-NEXT: setb %r19b
+; EGPR-NDD-NEXT: movzbl %r19b, %r19d
+; EGPR-NDD-NEXT: addq {{[0-9]+}}(%rsp), %rdx, %r22
+; EGPR-NDD-NEXT: adcq {{[0-9]+}}(%rsp), %rsi, %r25
+; EGPR-NDD-NEXT: adcq {{[0-9]+}}(%rsp), %r19
+; EGPR-NDD-NEXT: adcq $0, {{[0-9]+}}(%rsp), %r26
+; EGPR-NDD-NEXT: addq {{[0-9]+}}(%rsp), %r11, %rdx
+; EGPR-NDD-NEXT: adcq {{[0-9]+}}(%rsp), %r16, %rsi
+; EGPR-NDD-NEXT: adcq %r23, %rdi
+; EGPR-NDD-NEXT: adcq %r24, %r8
+; EGPR-NDD-NEXT: adcq $0, %r22
+; EGPR-NDD-NEXT: adcq $0, %r25
+; EGPR-NDD-NEXT: adcq $0, %r19
+; EGPR-NDD-NEXT: adcq $0, %r26
+; EGPR-NDD-NEXT: addq %r22, %r17
+; EGPR-NDD-NEXT: adcq %r9, %r25
+; EGPR-NDD-NEXT: adcq %r19, %r18, %r11
+; EGPR-NDD-NEXT: adcq %r26, %r10, %r16
; EGPR-NDD-NEXT: setb %r9b
-; EGPR-NDD-NEXT: movq %r22, %rax
-; EGPR-NDD-NEXT: mulq %r11
-; EGPR-NDD-NEXT: addq %r8, %rax
-; EGPR-NDD-NEXT: movzbl %r9b, %r8d
-; EGPR-NDD-NEXT: adcq %r8, %rdx
-; EGPR-NDD-NEXT: addq %rcx, %r31
-; EGPR-NDD-NEXT: adcq %rsi, %r19
-; EGPR-NDD-NEXT: movzbl %dil, %ecx
-; EGPR-NDD-NEXT: adcq %rax, %rcx
-; EGPR-NDD-NEXT: adcq $0, %rdx, %rdi
-; EGPR-NDD-NEXT: movq 96(%r15), %r30
-; EGPR-NDD-NEXT: imulq %r11, %r30, %rsi
-; EGPR-NDD-NEXT: movq %r30, %rax
-; EGPR-NDD-NEXT: mulq %r10
-; EGPR-NDD-NEXT: movq %rax, %r18
-; EGPR-NDD-NEXT: addq %rsi, %rdx
-; EGPR-NDD-NEXT: movq 104(%r15), %r8
-; EGPR-NDD-NEXT: imulq %r10, %r8, %rax
-; EGPR-NDD-NEXT: leaq (%rdx,%rax), %rsi
-; EGPR-NDD-NEXT: movq 112(%r15), %rax
-; EGPR-NDD-NEXT: imulq %r23, %rax, %r9
-; EGPR-NDD-NEXT: mulq %r16
-; EGPR-NDD-NEXT: addq %r9, %rdx
-; EGPR-NDD-NEXT: imulq 120(%r15), %r16, %r9
-; EGPR-NDD-NEXT: addq %r9, %rdx
-; EGPR-NDD-NEXT: addq %r18, %rax, %r10
-; EGPR-NDD-NEXT: adcq %rsi, %rdx, %r9
-; EGPR-NDD-NEXT: movq %r16, %rax
-; EGPR-NDD-NEXT: movq %r16, %r18
-; EGPR-NDD-NEXT: mulq %r30
-; EGPR-NDD-NEXT: movq %rdx, %r17
-; EGPR-NDD-NEXT: movq %rax, %rsi
-; EGPR-NDD-NEXT: movq %r23, %rax
-; EGPR-NDD-NEXT: mulq %r30
-; EGPR-NDD-NEXT: addq %r17, %rax, %r11
-; EGPR-NDD-NEXT: adcq $0, %rdx, %r16
-; EGPR-NDD-NEXT: movq %r18, %rax
-; EGPR-NDD-NEXT: mulq %r8
-; EGPR-NDD-NEXT: addq %rax, %r11
-; EGPR-NDD-NEXT: adcq %rdx, %r16
-; EGPR-NDD-NEXT: setb %r17b
-; EGPR-NDD-NEXT: movq %r23, %rax
-; EGPR-NDD-NEXT: mulq %r8
-; EGPR-NDD-NEXT: addq %r16, %rax
-; EGPR-NDD-NEXT: movzbl %r17b, %r8d
-; EGPR-NDD-NEXT: adcq %r8, %rdx
-; EGPR-NDD-NEXT: addq %rax, %r10
-; EGPR-NDD-NEXT: adcq %r9, %rdx, %r17
-; EGPR-NDD-NEXT: imulq {{[-0-9]+}}(%r{{[sb]}}p), %r26, %r8 # 8-byte Folded Reload
-; EGPR-NDD-NEXT: movq %r26, %rax
-; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r16 # 8-byte Reload
-; EGPR-NDD-NEXT: mulq %r16
-; EGPR-NDD-NEXT: movq %rax, %r9
-; EGPR-NDD-NEXT: addq %r8, %rdx
-; EGPR-NDD-NEXT: imulq %r16, %r25, %rax
-; EGPR-NDD-NEXT: leaq (%rdx,%rax), %r8
-; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r23 # 8-byte Reload
-; EGPR-NDD-NEXT: imulq %r23, %r24, %r16
-; EGPR-NDD-NEXT: movq %r24, %rax
-; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r30 # 8-byte Reload
-; EGPR-NDD-NEXT: mulq %r30
-; EGPR-NDD-NEXT: addq %r16, %rdx
-; EGPR-NDD-NEXT: imulq %r30, %r22
-; EGPR-NDD-NEXT: addq %r22, %rdx
-; EGPR-NDD-NEXT: addq %r9, %rax, %r16
-; EGPR-NDD-NEXT: adcq %r8, %rdx, %r18
-; EGPR-NDD-NEXT: movq %r30, %rax
-; EGPR-NDD-NEXT: mulq %r26
-; EGPR-NDD-NEXT: movq %rdx, %r8
-; EGPR-NDD-NEXT: movq %rax, %r9
-; EGPR-NDD-NEXT: movq %r23, %rax
-; EGPR-NDD-NEXT: movq %r23, %r24
-; EGPR-NDD-NEXT: mulq %r26
-; EGPR-NDD-NEXT: addq %rax, %r8
-; EGPR-NDD-NEXT: adcq $0, %rdx, %r22
-; EGPR-NDD-NEXT: movq %r30, %rax
-; EGPR-NDD-NEXT: mulq %r25
-; EGPR-NDD-NEXT: addq %rax, %r8
-; EGPR-NDD-NEXT: adcq %rdx, %r22
-; EGPR-NDD-NEXT: setb %r23b
-; EGPR-NDD-NEXT: movq %r24, %rax
-; EGPR-NDD-NEXT: mulq %r25
-; EGPR-NDD-NEXT: addq %r22, %rax
-; EGPR-NDD-NEXT: movzbl %r23b, %r22d
-; EGPR-NDD-NEXT: adcq %r22, %rdx
-; EGPR-NDD-NEXT: addq %r16, %rax
-; EGPR-NDD-NEXT: adcq %r18, %rdx
-; EGPR-NDD-NEXT: addq %r9, %rsi
-; EGPR-NDD-NEXT: adcq %r11, %r8
-; EGPR-NDD-NEXT: adcq %r10, %rax
-; EGPR-NDD-NEXT: adcq %r17, %rdx
-; EGPR-NDD-NEXT: addq %r31, %rsi
-; EGPR-NDD-NEXT: adcq %r19, %r8
-; EGPR-NDD-NEXT: adcq %rcx, %rax
-; EGPR-NDD-NEXT: adcq %rdi, %rdx
-; EGPR-NDD-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r21 # 8-byte Folded Reload
-; EGPR-NDD-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r29 # 8-byte Folded Reload
-; EGPR-NDD-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r20 # 8-byte Folded Reload
-; EGPR-NDD-NEXT: adcq %rbp, %r27
-; EGPR-NDD-NEXT: adcq %rbx, %rsi
-; EGPR-NDD-NEXT: adcq %r12, %r8
-; EGPR-NDD-NEXT: adcq %r13, %rax
-; EGPR-NDD-NEXT: adcq %r28, %rdx
-; EGPR-NDD-NEXT: addq %r21, {{[-0-9]+}}(%r{{[sb]}}p), %r21 # 8-byte Folded Reload
-; EGPR-NDD-NEXT: adcq %r29, {{[-0-9]+}}(%r{{[sb]}}p), %r29 # 8-byte Folded Reload
-; EGPR-NDD-NEXT: adcq %r20, {{[-0-9]+}}(%r{{[sb]}}p), %r20 # 8-byte Folded Reload
-; EGPR-NDD-NEXT: adcq %r27, {{[-0-9]+}}(%r{{[sb]}}p), %r27 # 8-byte Folded Reload
-; EGPR-NDD-NEXT: adcq %rsi, (%rsp), %rsi # 8-byte Folded Reload
-; EGPR-NDD-NEXT: adcq %r8, {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
-; EGPR-NDD-NEXT: adcq %rax, {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; EGPR-NDD-NEXT: adcq %rdx, {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
-; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; EGPR-NDD-NEXT: movq %rdi, (%rcx)
-; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; EGPR-NDD-NEXT: movq %rdi, 8(%rcx)
-; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; EGPR-NDD-NEXT: movq %rdi, 16(%rcx)
-; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; EGPR-NDD-NEXT: movq %rdi, 24(%rcx)
-; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; EGPR-NDD-NEXT: movq %rdi, 32(%rcx)
-; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; EGPR-NDD-NEXT: movq %rdi, 40(%rcx)
-; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; EGPR-NDD-NEXT: movq %rdi, 48(%rcx)
-; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; EGPR-NDD-NEXT: movq %rdi, 56(%rcx)
-; EGPR-NDD-NEXT: movq %r21, 64(%rcx)
-; EGPR-NDD-NEXT: movq %r29, 72(%rcx)
-; EGPR-NDD-NEXT: movq %r20, 80(%rcx)
-; EGPR-NDD-NEXT: movq %r27, 88(%rcx)
-; EGPR-NDD-NEXT: movq %rsi, 96(%rcx)
-; EGPR-NDD-NEXT: movq %r8, 104(%rcx)
-; EGPR-NDD-NEXT: movq %rax, 112(%rcx)
-; EGPR-NDD-NEXT: movq %rdx, 120(%rcx)
-; EGPR-NDD-NEXT: addq $96, %rsp
+; EGPR-NDD-NEXT: movzbl %r9b, %r18d
+; EGPR-NDD-NEXT: movq {{[0-9]+}}(%rsp), %r9
+; EGPR-NDD-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; EGPR-NDD-NEXT: addq {{[0-9]+}}(%rsp), %r9
+; EGPR-NDD-NEXT: adcq {{[0-9]+}}(%rsp), %r10
+; EGPR-NDD-NEXT: adcq $0, {{[0-9]+}}(%rsp), %r19
+; EGPR-NDD-NEXT: adcq $0, {{[0-9]+}}(%rsp), %r22
+; EGPR-NDD-NEXT: addq {{[0-9]+}}(%rsp), %r9, %r23
+; EGPR-NDD-NEXT: adcq {{[0-9]+}}(%rsp), %r10, %r24
+; EGPR-NDD-NEXT: adcq $0, {{[0-9]+}}(%rsp), %r9
+; EGPR-NDD-NEXT: adcq $0, {{[0-9]+}}(%rsp), %r10
+; EGPR-NDD-NEXT: addq %r19, %r9
+; EGPR-NDD-NEXT: adcq %r22, %r10
+; EGPR-NDD-NEXT: setb %r19b
+; EGPR-NDD-NEXT: movzbl %r19b, %r19d
+; EGPR-NDD-NEXT: addq {{[0-9]+}}(%rsp), %r9, %r22
+; EGPR-NDD-NEXT: adcq {{[0-9]+}}(%rsp), %r10, %r26
+; EGPR-NDD-NEXT: adcq {{[0-9]+}}(%rsp), %r19
+; EGPR-NDD-NEXT: adcq $0, {{[0-9]+}}(%rsp), %r27
+; EGPR-NDD-NEXT: addq {{[0-9]+}}(%rsp), %r17, %r9
+; EGPR-NDD-NEXT: adcq {{[0-9]+}}(%rsp), %r25, %r10
+; EGPR-NDD-NEXT: adcq %r23, %r11
+; EGPR-NDD-NEXT: adcq %r24, %r16
+; EGPR-NDD-NEXT: adcq %r18, %r22, %r17
+; EGPR-NDD-NEXT: adcq $0, %r26, %r18
+; EGPR-NDD-NEXT: adcq $0, %r19
+; EGPR-NDD-NEXT: adcq $0, %r27, %r22
+; EGPR-NDD-NEXT: movq {{[0-9]+}}(%rsp), %r23
+; EGPR-NDD-NEXT: movq {{[0-9]+}}(%rsp), %r24
+; EGPR-NDD-NEXT: addq {{[0-9]+}}(%rsp), %r23
+; EGPR-NDD-NEXT: adcq {{[0-9]+}}(%rsp), %r24
+; EGPR-NDD-NEXT: adcq $0, {{[0-9]+}}(%rsp), %r25
+; EGPR-NDD-NEXT: adcq $0, {{[0-9]+}}(%rsp), %r26
+; EGPR-NDD-NEXT: addq {{[0-9]+}}(%rsp), %r23
+; EGPR-NDD-NEXT: adcq {{[0-9]+}}(%rsp), %r24
+; EGPR-NDD-NEXT: adcq $0, {{[0-9]+}}(%rsp), %r27
+; EGPR-NDD-NEXT: adcq $0, {{[0-9]+}}(%rsp), %r30
+; EGPR-NDD-NEXT: addq %r27, %r25
+; EGPR-NDD-NEXT: adcq %r30, %r26
+; EGPR-NDD-NEXT: setb %r27b
+; EGPR-NDD-NEXT: movzbl %r27b, %r27d
+; EGPR-NDD-NEXT: addq {{[0-9]+}}(%rsp), %r25
+; EGPR-NDD-NEXT: adcq {{[0-9]+}}(%rsp), %r26
+; EGPR-NDD-NEXT: adcq {{[0-9]+}}(%rsp), %r27
+; EGPR-NDD-NEXT: adcq $0, {{[0-9]+}}(%rsp), %r30
+; EGPR-NDD-NEXT: movq {{[0-9]+}}(%rsp), %r31
+; EGPR-NDD-NEXT: movq {{[0-9]+}}(%rsp), %r20
+; EGPR-NDD-NEXT: movq {{[0-9]+}}(%rsp), %r21
+; EGPR-NDD-NEXT: movq {{[0-9]+}}(%rsp), %r28
+; EGPR-NDD-NEXT: addq {{[0-9]+}}(%rsp), %r21
+; EGPR-NDD-NEXT: adcq {{[0-9]+}}(%rsp), %r28
+; EGPR-NDD-NEXT: adcq {{[0-9]+}}(%rsp), %r20
+; EGPR-NDD-NEXT: adcq {{[0-9]+}}(%rsp), %r31
+; EGPR-NDD-NEXT: addq %r21, %r25
+; EGPR-NDD-NEXT: adcq %r28, %r26
+; EGPR-NDD-NEXT: adcq %r20, %r27
+; EGPR-NDD-NEXT: adcq %r31, %r30
+; EGPR-NDD-NEXT: movq {{[0-9]+}}(%rsp), %r31
+; EGPR-NDD-NEXT: movq {{[0-9]+}}(%rsp), %r20
+; EGPR-NDD-NEXT: addq {{[0-9]+}}(%rsp), %r31
+; EGPR-NDD-NEXT: adcq {{[0-9]+}}(%rsp), %r20
+; EGPR-NDD-NEXT: adcq $0, {{[0-9]+}}(%rsp), %r21
+; EGPR-NDD-NEXT: adcq $0, {{[0-9]+}}(%rsp), %r28
+; EGPR-NDD-NEXT: addq {{[0-9]+}}(%rsp), %r31
+; EGPR-NDD-NEXT: adcq {{[0-9]+}}(%rsp), %r20
+; EGPR-NDD-NEXT: adcq $0, {{[0-9]+}}(%rsp), %r29
+; EGPR-NDD-NEXT: adcq $0, {{[0-9]+}}(%rsp), %r14
+; EGPR-NDD-NEXT: addq %r29, %r21
+; EGPR-NDD-NEXT: adcq %r14, %r28
+; EGPR-NDD-NEXT: setb %r29b
+; EGPR-NDD-NEXT: movzbl %r29b, %r29d
+; EGPR-NDD-NEXT: addq {{[0-9]+}}(%rsp), %r21
+; EGPR-NDD-NEXT: adcq {{[0-9]+}}(%rsp), %r28
+; EGPR-NDD-NEXT: adcq {{[0-9]+}}(%rsp), %r29
+; EGPR-NDD-NEXT: adcq $0, {{[0-9]+}}(%rsp), %r14
+; EGPR-NDD-NEXT: movq {{[0-9]+}}(%rsp), %r15
+; EGPR-NDD-NEXT: movq {{[0-9]+}}(%rsp), %r12
+; EGPR-NDD-NEXT: addq {{[0-9]+}}(%rsp), %r15
+; EGPR-NDD-NEXT: adcq {{[0-9]+}}(%rsp), %r12
+; EGPR-NDD-NEXT: movq {{[0-9]+}}(%rsp), %r13
+; EGPR-NDD-NEXT: adcq {{[0-9]+}}(%rsp), %r13
+; EGPR-NDD-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; EGPR-NDD-NEXT: adcq {{[0-9]+}}(%rsp), %rax
+; EGPR-NDD-NEXT: addq %r15, %r21
+; EGPR-NDD-NEXT: adcq %r12, %r28
+; EGPR-NDD-NEXT: adcq %r13, %r29
+; EGPR-NDD-NEXT: adcq %r14, %rax
+; EGPR-NDD-NEXT: movq {{[0-9]+}}(%rsp), %r14
+; EGPR-NDD-NEXT: movq {{[0-9]+}}(%rsp), %r15
+; EGPR-NDD-NEXT: addq {{[0-9]+}}(%rsp), %r14
+; EGPR-NDD-NEXT: adcq {{[0-9]+}}(%rsp), %r15
+; EGPR-NDD-NEXT: adcq %r31, %r23
+; EGPR-NDD-NEXT: adcq %r20, %r24
+; EGPR-NDD-NEXT: adcq %r21, %r25
+; EGPR-NDD-NEXT: adcq %r28, %r26
+; EGPR-NDD-NEXT: adcq %r29, %r27
+; EGPR-NDD-NEXT: adcq %r30, %rax
+; EGPR-NDD-NEXT: addq %r14, %r9
+; EGPR-NDD-NEXT: adcq %r15, %r10
+; EGPR-NDD-NEXT: adcq %r23, %r11
+; EGPR-NDD-NEXT: adcq %r24, %r16
+; EGPR-NDD-NEXT: adcq %r25, %r17
+; EGPR-NDD-NEXT: adcq %r26, %r18
+; EGPR-NDD-NEXT: adcq %r27, %r19
+; EGPR-NDD-NEXT: adcq %r22, %rax
+; EGPR-NDD-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r22 # 8-byte Reload
+; EGPR-NDD-NEXT: movq %rbx, 16(%r22)
+; EGPR-NDD-NEXT: movq %rcx, 24(%r22)
+; EGPR-NDD-NEXT: movq %rdx, 32(%r22)
+; EGPR-NDD-NEXT: movq %rsi, 40(%r22)
+; EGPR-NDD-NEXT: movq %rdi, 48(%r22)
+; EGPR-NDD-NEXT: movq %r8, 56(%r22)
+; EGPR-NDD-NEXT: movq %r9, 64(%r22)
+; EGPR-NDD-NEXT: movq %r10, 72(%r22)
+; EGPR-NDD-NEXT: movq %r11, 80(%r22)
+; EGPR-NDD-NEXT: movq %r16, 88(%r22)
+; EGPR-NDD-NEXT: movq %r17, 96(%r22)
+; EGPR-NDD-NEXT: movq %r18, 104(%r22)
+; EGPR-NDD-NEXT: movq %r19, 112(%r22)
+; EGPR-NDD-NEXT: movq %rax, 120(%r22)
+; EGPR-NDD-NEXT: movaps %xmm0, (%r22)
+; EGPR-NDD-NEXT: leaq -40(%rbp), %rsp
; EGPR-NDD-NEXT: popq %rbx
; EGPR-NDD-NEXT: popq %r12
; EGPR-NDD-NEXT: popq %r13
diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll
index 7070848e3fe3e..e462867360b3b 100644
--- a/llvm/test/CodeGen/X86/bittest-big-integer.ll
+++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll
@@ -1228,7 +1228,7 @@ define i32 @chain_reset_i256(ptr %p0, ptr %p1, ptr %p2, i32 %position) nounwind
; AVX-NEXT: shrl $3, %ecx
; AVX-NEXT: andl $28, %ecx
; AVX-NEXT: andl %eax, (%rdi,%rcx)
-; AVX-NEXT: vmovdqu (%rdi), %ymm0
+; AVX-NEXT: vmovdqa (%rdi), %ymm0
; AVX-NEXT: movl (%rdi), %ecx
; AVX-NEXT: movl (%rsi), %eax
; AVX-NEXT: movl %ecx, (%rsi)
diff --git a/llvm/test/CodeGen/X86/cmp-i256.ll b/llvm/test/CodeGen/X86/cmp-i256.ll
new file mode 100644
index 0000000000000..213a355a8f223
--- /dev/null
+++ b/llvm/test/CodeGen/X86/cmp-i256.ll
@@ -0,0 +1,450 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s --check-prefix=X64
+
+define i32 @icmp_slt_i256(i256 %a, i256 %b) nounwind {
+; X86-LABEL: icmp_slt_i256:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: setl %al
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: icmp_slt_i256:
+; X64: # %bb.0:
+; X64-NEXT: cmpq %r8, %rdi
+; X64-NEXT: sbbq %r9, %rsi
+; X64-NEXT: sbbq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT: setl %al
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: retq
+ %c = icmp slt i256 %a, %b
+ %r = zext i1 %c to i32
+ ret i32 %r
+}
+
+define i32 @icmp_sgt_i256(i256 %a, i256 %b) nounwind {
+; X86-LABEL: icmp_sgt_i256:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: setl %al
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: icmp_sgt_i256:
+; X64: # %bb.0:
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; X64-NEXT: cmpq %rdi, %r8
+; X64-NEXT: sbbq %rsi, %r9
+; X64-NEXT: sbbq %rdx, %r10
+; X64-NEXT: sbbq %rcx, %rax
+; X64-NEXT: setl %al
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: retq
+ %c = icmp sgt i256 %a, %b
+ %r = zext i1 %c to i32
+ ret i32 %r
+}
+
+define i32 @icmp_sle_i256(i256 %a, i256 %b) nounwind {
+; X86-LABEL: icmp_sle_i256:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: setge %al
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: icmp_sle_i256:
+; X64: # %bb.0:
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; X64-NEXT: cmpq %rdi, %r8
+; X64-NEXT: sbbq %rsi, %r9
+; X64-NEXT: sbbq %rdx, %r10
+; X64-NEXT: sbbq %rcx, %rax
+; X64-NEXT: setge %al
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: retq
+ %c = icmp sle i256 %a, %b
+ %r = zext i1 %c to i32
+ ret i32 %r
+}
+
+define i32 @icmp_sge_i256(i256 %a, i256 %b) nounwind {
+; X86-LABEL: icmp_sge_i256:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: setge %al
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: icmp_sge_i256:
+; X64: # %bb.0:
+; X64-NEXT: cmpq %r8, %rdi
+; X64-NEXT: sbbq %r9, %rsi
+; X64-NEXT: sbbq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT: setge %al
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: retq
+ %c = icmp sge i256 %a, %b
+ %r = zext i1 %c to i32
+ ret i32 %r
+}
+
+define i32 @icmp_ult_i256(i256 %a, i256 %b) nounwind {
+; X86-LABEL: icmp_ult_i256:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: setb %al
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: icmp_ult_i256:
+; X64: # %bb.0:
+; X64-NEXT: cmpq %r8, %rdi
+; X64-NEXT: sbbq %r9, %rsi
+; X64-NEXT: sbbq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT: setb %al
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: retq
+ %c = icmp ult i256 %a, %b
+ %r = zext i1 %c to i32
+ ret i32 %r
+}
+
+define i32 @icmp_ugt_i256(i256 %a, i256 %b) nounwind {
+; X86-LABEL: icmp_ugt_i256:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: setb %al
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: icmp_ugt_i256:
+; X64: # %bb.0:
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; X64-NEXT: cmpq %rdi, %r8
+; X64-NEXT: sbbq %rsi, %r9
+; X64-NEXT: sbbq %rdx, %r10
+; X64-NEXT: sbbq %rcx, %rax
+; X64-NEXT: setb %al
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: retq
+ %c = icmp ugt i256 %a, %b
+ %r = zext i1 %c to i32
+ ret i32 %r
+}
+
+define i32 @icmp_ule_i256(i256 %a, i256 %b) nounwind {
+; X86-LABEL: icmp_ule_i256:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: setae %al
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: icmp_ule_i256:
+; X64: # %bb.0:
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; X64-NEXT: cmpq %rdi, %r8
+; X64-NEXT: sbbq %rsi, %r9
+; X64-NEXT: sbbq %rdx, %r10
+; X64-NEXT: sbbq %rcx, %rax
+; X64-NEXT: setae %al
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: retq
+ %c = icmp ule i256 %a, %b
+ %r = zext i1 %c to i32
+ ret i32 %r
+}
+
+define i32 @icmp_uge_i256(i256 %a, i256 %b) nounwind {
+; X86-LABEL: icmp_uge_i256:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: setae %al
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: icmp_uge_i256:
+; X64: # %bb.0:
+; X64-NEXT: cmpq %r8, %rdi
+; X64-NEXT: sbbq %r9, %rsi
+; X64-NEXT: sbbq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT: setae %al
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: retq
+ %c = icmp uge i256 %a, %b
+ %r = zext i1 %c to i32
+ ret i32 %r
+}
+
+; Select based on i256 comparison
+define i256 @select_slt_i256(i256 %a, i256 %b, i256 %x, i256 %y) nounwind {
+; X86-LABEL: select_slt_i256:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: jl .LBB8_1
+; X86-NEXT: # %bb.2:
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: jge .LBB8_5
+; X86-NEXT: .LBB8_4:
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: jge .LBB8_8
+; X86-NEXT: .LBB8_7:
+; X86-NEXT: leal {{[0-9]+}}(%esp), %esi
+; X86-NEXT: jge .LBB8_11
+; X86-NEXT: .LBB8_10:
+; X86-NEXT: leal {{[0-9]+}}(%esp), %edi
+; X86-NEXT: jge .LBB8_14
+; X86-NEXT: .LBB8_13:
+; X86-NEXT: leal {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: jge .LBB8_17
+; X86-NEXT: .LBB8_16:
+; X86-NEXT: leal {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: jge .LBB8_20
+; X86-NEXT: .LBB8_19:
+; X86-NEXT: leal {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: jmp .LBB8_21
+; X86-NEXT: .LBB8_1:
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: jl .LBB8_4
+; X86-NEXT: .LBB8_5:
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: jl .LBB8_7
+; X86-NEXT: .LBB8_8:
+; X86-NEXT: leal {{[0-9]+}}(%esp), %esi
+; X86-NEXT: jl .LBB8_10
+; X86-NEXT: .LBB8_11:
+; X86-NEXT: leal {{[0-9]+}}(%esp), %edi
+; X86-NEXT: jl .LBB8_13
+; X86-NEXT: .LBB8_14:
+; X86-NEXT: leal {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: jl .LBB8_16
+; X86-NEXT: .LBB8_17:
+; X86-NEXT: leal {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: jl .LBB8_19
+; X86-NEXT: .LBB8_20:
+; X86-NEXT: leal {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: .LBB8_21:
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl (%edx), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-NEXT: movl (%edx), %edx
+; X86-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT: movl (%esi), %esi
+; X86-NEXT: movl (%edi), %edi
+; X86-NEXT: movl (%ebx), %ebx
+; X86-NEXT: movl (%ebp), %ebp
+; X86-NEXT: movl (%ecx), %ecx
+; X86-NEXT: jl .LBB8_22
+; X86-NEXT: # %bb.23:
+; X86-NEXT: leal {{[0-9]+}}(%esp), %edx
+; X86-NEXT: jmp .LBB8_24
+; X86-NEXT: .LBB8_22:
+; X86-NEXT: leal {{[0-9]+}}(%esp), %edx
+; X86-NEXT: .LBB8_24:
+; X86-NEXT: movl (%edx), %edx
+; X86-NEXT: movl %edx, 28(%eax)
+; X86-NEXT: movl %ecx, 24(%eax)
+; X86-NEXT: movl %ebp, 20(%eax)
+; X86-NEXT: movl %ebx, 16(%eax)
+; X86-NEXT: movl %edi, 12(%eax)
+; X86-NEXT: movl %esi, 8(%eax)
+; X86-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 4(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: addl $8, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl $4
+;
+; X64-LABEL: select_slt_i256:
+; X64: # %bb.0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: cmpq %r9, %rsi
+; X64-NEXT: sbbq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT: sbbq {{[0-9]+}}(%rsp), %r8
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT: cmovlq %rcx, %rdx
+; X64-NEXT: movq (%rdx), %rcx
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT: cmovlq %rdx, %rsi
+; X64-NEXT: movq (%rsi), %rdx
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: cmovlq %rsi, %rdi
+; X64-NEXT: movq (%rdi), %rsi
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %r8
+; X64-NEXT: cmovlq %rdi, %r8
+; X64-NEXT: movq (%r8), %rdi
+; X64-NEXT: movq %rdi, 24(%rax)
+; X64-NEXT: movq %rsi, 16(%rax)
+; X64-NEXT: movq %rdx, 8(%rax)
+; X64-NEXT: movq %rcx, (%rax)
+; X64-NEXT: retq
+ %c = icmp slt i256 %a, %b
+ %r = select i1 %c, i256 %x, i256 %y
+ ret i256 %r
+}
diff --git a/llvm/test/CodeGen/X86/dagcombine-cse.ll b/llvm/test/CodeGen/X86/dagcombine-cse.ll
index 3efd536adc4d1..f84293a26e102 100644
--- a/llvm/test/CodeGen/X86/dagcombine-cse.ll
+++ b/llvm/test/CodeGen/X86/dagcombine-cse.ll
@@ -106,24 +106,26 @@ define i96 @square_high(i96 %x) nounwind {
;
; X64-LABEL: square_high:
; X64: ## %bb.0: ## %entry
-; X64-NEXT: movl %esi, %ecx
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %r8
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: mulq %rdi
-; X64-NEXT: addq %r8, %rdx
-; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: adcq $0, %rax
-; X64-NEXT: addq %rdx, %r8
-; X64-NEXT: adcq %rsi, %rax
-; X64-NEXT: imulq %rcx, %rcx
-; X64-NEXT: addq %rax, %rcx
-; X64-NEXT: shrdq $32, %rcx, %r8
-; X64-NEXT: shrq $32, %rcx
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: movq %rcx, %rdx
+; X64-NEXT: pushq %rbp
+; X64-NEXT: movq %rsp, %rbp
+; X64-NEXT: andq $-32, %rsp
+; X64-NEXT: subq $64, %rsp
+; X64-NEXT: movq %rdi, %r9
+; X64-NEXT: movl %esi, %edx
+; X64-NEXT: subq $8, %rsp
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq %r9, %rsi
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: xorl %r8d, %r8d
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq %rdx
+; X64-NEXT: callq ___multi5
+; X64-NEXT: addq $32, %rsp
+; X64-NEXT: movl {{[0-9]+}}(%rsp), %edx
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT: movq %rbp, %rsp
+; X64-NEXT: popq %rbp
; X64-NEXT: retq
entry:
%conv = zext i96 %x to i192
diff --git a/llvm/test/CodeGen/X86/div-i256.ll b/llvm/test/CodeGen/X86/div-i256.ll
new file mode 100644
index 0000000000000..b57c6f5dec5d0
--- /dev/null
+++ b/llvm/test/CodeGen/X86/div-i256.ll
@@ -0,0 +1,5475 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s --check-prefix=X64
+
+define i256 @udiv256(i256 %a, i256 %b) nounwind {
+; X86-LABEL: udiv256:
+; X86: # %bb.0: # %_udiv-special-cases
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $288, %esp # imm = 0x120
+; X86-NEXT: movl 48(%ebp), %eax
+; X86-NEXT: movl 72(%ebp), %ebx
+; X86-NEXT: movl 56(%ebp), %ecx
+; X86-NEXT: orl %ebx, %ecx
+; X86-NEXT: orl 64(%ebp), %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: movl 68(%ebp), %edi
+; X86-NEXT: movl 52(%ebp), %ecx
+; X86-NEXT: orl %edi, %ecx
+; X86-NEXT: movl 44(%ebp), %edx
+; X86-NEXT: orl 60(%ebp), %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: sete %al
+; X86-NEXT: movl 24(%ebp), %edx
+; X86-NEXT: orl 40(%ebp), %edx
+; X86-NEXT: movl 16(%ebp), %ecx
+; X86-NEXT: orl 32(%ebp), %ecx
+; X86-NEXT: orl %edx, %ecx
+; X86-NEXT: movl 20(%ebp), %edx
+; X86-NEXT: orl 36(%ebp), %edx
+; X86-NEXT: movl 12(%ebp), %esi
+; X86-NEXT: orl 28(%ebp), %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: sete %ah
+; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: jne .LBB0_1
+; X86-NEXT: # %bb.2: # %_udiv-special-cases
+; X86-NEXT: bsrl %edi, %edx
+; X86-NEXT: xorl $31, %edx
+; X86-NEXT: orl $32, %edx
+; X86-NEXT: jmp .LBB0_3
+; X86-NEXT: .LBB0_1:
+; X86-NEXT: bsrl %ebx, %edx
+; X86-NEXT: xorl $31, %edx
+; X86-NEXT: .LBB0_3: # %_udiv-special-cases
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 64(%ebp), %edx
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: movl 56(%ebp), %ebx
+; X86-NEXT: jne .LBB0_4
+; X86-NEXT: # %bb.5: # %_udiv-special-cases
+; X86-NEXT: bsrl 60(%ebp), %edx
+; X86-NEXT: xorl $31, %edx
+; X86-NEXT: orl $32, %edx
+; X86-NEXT: orl 72(%ebp), %edi
+; X86-NEXT: je .LBB0_7
+; X86-NEXT: jmp .LBB0_8
+; X86-NEXT: .LBB0_4:
+; X86-NEXT: bsrl %edx, %edx
+; X86-NEXT: xorl $31, %edx
+; X86-NEXT: orl 72(%ebp), %edi
+; X86-NEXT: jne .LBB0_8
+; X86-NEXT: .LBB0_7: # %_udiv-special-cases
+; X86-NEXT: orl $64, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: .LBB0_8: # %_udiv-special-cases
+; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: movl 52(%ebp), %edi
+; X86-NEXT: movl 44(%ebp), %esi
+; X86-NEXT: jne .LBB0_9
+; X86-NEXT: # %bb.10: # %_udiv-special-cases
+; X86-NEXT: bsrl %edi, %edx
+; X86-NEXT: xorl $31, %edx
+; X86-NEXT: orl $32, %edx
+; X86-NEXT: jmp .LBB0_11
+; X86-NEXT: .LBB0_9:
+; X86-NEXT: bsrl %ebx, %edx
+; X86-NEXT: xorl $31, %edx
+; X86-NEXT: .LBB0_11: # %_udiv-special-cases
+; X86-NEXT: movl 48(%ebp), %ecx
+; X86-NEXT: testl %ecx, %ecx
+; X86-NEXT: jne .LBB0_12
+; X86-NEXT: # %bb.13: # %_udiv-special-cases
+; X86-NEXT: bsrl %esi, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: orl $32, %esi
+; X86-NEXT: orl %ebx, %edi
+; X86-NEXT: je .LBB0_15
+; X86-NEXT: jmp .LBB0_16
+; X86-NEXT: .LBB0_12:
+; X86-NEXT: bsrl %ecx, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: orl %ebx, %edi
+; X86-NEXT: jne .LBB0_16
+; X86-NEXT: .LBB0_15: # %_udiv-special-cases
+; X86-NEXT: orl $64, %esi
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: .LBB0_16: # %_udiv-special-cases
+; X86-NEXT: movl 64(%ebp), %esi
+; X86-NEXT: orl 72(%ebp), %esi
+; X86-NEXT: movl 60(%ebp), %edi
+; X86-NEXT: orl 68(%ebp), %edi
+; X86-NEXT: orl %esi, %edi
+; X86-NEXT: movl 40(%ebp), %ebx
+; X86-NEXT: jne .LBB0_18
+; X86-NEXT: # %bb.17: # %_udiv-special-cases
+; X86-NEXT: orl $128, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: .LBB0_18: # %_udiv-special-cases
+; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: movl 32(%ebp), %esi
+; X86-NEXT: movl 36(%ebp), %edi
+; X86-NEXT: jne .LBB0_19
+; X86-NEXT: # %bb.20: # %_udiv-special-cases
+; X86-NEXT: bsrl %edi, %edx
+; X86-NEXT: xorl $31, %edx
+; X86-NEXT: orl $32, %edx
+; X86-NEXT: testl %esi, %esi
+; X86-NEXT: je .LBB0_23
+; X86-NEXT: .LBB0_22:
+; X86-NEXT: bsrl %esi, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: orl %ebx, %edi
+; X86-NEXT: je .LBB0_25
+; X86-NEXT: jmp .LBB0_26
+; X86-NEXT: .LBB0_19:
+; X86-NEXT: bsrl %ebx, %edx
+; X86-NEXT: xorl $31, %edx
+; X86-NEXT: testl %esi, %esi
+; X86-NEXT: jne .LBB0_22
+; X86-NEXT: .LBB0_23: # %_udiv-special-cases
+; X86-NEXT: bsrl 28(%ebp), %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: orl $32, %esi
+; X86-NEXT: orl %ebx, %edi
+; X86-NEXT: jne .LBB0_26
+; X86-NEXT: .LBB0_25: # %_udiv-special-cases
+; X86-NEXT: orl $64, %esi
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: .LBB0_26: # %_udiv-special-cases
+; X86-NEXT: movl 24(%ebp), %esi
+; X86-NEXT: testl %esi, %esi
+; X86-NEXT: movl 16(%ebp), %edi
+; X86-NEXT: movl 20(%ebp), %ebx
+; X86-NEXT: jne .LBB0_27
+; X86-NEXT: # %bb.28: # %_udiv-special-cases
+; X86-NEXT: bsrl %ebx, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: orl $32, %esi
+; X86-NEXT: testl %edi, %edi
+; X86-NEXT: je .LBB0_31
+; X86-NEXT: .LBB0_30:
+; X86-NEXT: bsrl %edi, %edi
+; X86-NEXT: xorl $31, %edi
+; X86-NEXT: orl 24(%ebp), %ebx
+; X86-NEXT: je .LBB0_33
+; X86-NEXT: jmp .LBB0_34
+; X86-NEXT: .LBB0_27:
+; X86-NEXT: bsrl %esi, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: testl %edi, %edi
+; X86-NEXT: jne .LBB0_30
+; X86-NEXT: .LBB0_31: # %_udiv-special-cases
+; X86-NEXT: bsrl 12(%ebp), %edi
+; X86-NEXT: xorl $31, %edi
+; X86-NEXT: orl $32, %edi
+; X86-NEXT: orl 24(%ebp), %ebx
+; X86-NEXT: jne .LBB0_34
+; X86-NEXT: .LBB0_33: # %_udiv-special-cases
+; X86-NEXT: orl $64, %edi
+; X86-NEXT: movl %edi, %esi
+; X86-NEXT: .LBB0_34: # %_udiv-special-cases
+; X86-NEXT: orb %ah, %al
+; X86-NEXT: movb %al, (%esp) # 1-byte Spill
+; X86-NEXT: movl 32(%ebp), %ecx
+; X86-NEXT: orl 40(%ebp), %ecx
+; X86-NEXT: movl 28(%ebp), %edi
+; X86-NEXT: orl 36(%ebp), %edi
+; X86-NEXT: orl %ecx, %edi
+; X86-NEXT: jne .LBB0_36
+; X86-NEXT: # %bb.35: # %_udiv-special-cases
+; X86-NEXT: orl $128, %esi
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: .LBB0_36: # %_udiv-special-cases
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: subl %edx, %esi
+; X86-NEXT: movl $0, %edi
+; X86-NEXT: sbbl %edi, %edi
+; X86-NEXT: movl $0, %ebx
+; X86-NEXT: sbbl %ebx, %ebx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %ecx, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: sbbl %edx, %edx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %ecx, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %ecx, %ecx
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmpb $0, (%esp) # 1-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: jne .LBB0_37
+; X86-NEXT: # %bb.38: # %select.false.sink
+; X86-NEXT: movl $255, %eax
+; X86-NEXT: cmpl %esi, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %edi, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %ebx, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: sbbl %ebx, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %edx, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: sbbl %edx, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %ecx, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl %ecx, %eax
+; X86-NEXT: setb %al
+; X86-NEXT: .LBB0_39: # %select.end
+; X86-NEXT: testb %al, %al
+; X86-NEXT: movl $0, (%esp) # 4-byte Folded Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $0, %edi
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: movl $0, %esi
+; X86-NEXT: jne .LBB0_41
+; X86-NEXT: # %bb.40: # %select.end
+; X86-NEXT: movl 16(%ebp), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 12(%ebp), %eax
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: movl 20(%ebp), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 24(%ebp), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 28(%ebp), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%ebp), %edi
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: movl 40(%ebp), %esi
+; X86-NEXT: .LBB0_41: # %select.end
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: jne .LBB0_42
+; X86-NEXT: # %bb.48: # %select.end
+; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: xorl $255, %ecx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: orl %edi, %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl (%esp), %edi # 4-byte Reload
+; X86-NEXT: je .LBB0_49
+; X86-NEXT: # %bb.46: # %udiv-bb1
+; X86-NEXT: movl 12(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 16(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 20(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 24(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 28(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 32(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 40(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: notb %cl
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $28, %al
+; X86-NEXT: negb %al
+; X86-NEXT: movsbl %al, %edx
+; X86-NEXT: movl 264(%esp,%edx), %esi
+; X86-NEXT: movl 268(%esp,%edx), %edi
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 260(%esp,%edx), %edi
+; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 256(%esp,%edx), %esi
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 252(%esp,%edx), %edi
+; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 248(%esp,%edx), %esi
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 240(%esp,%edx), %eax
+; X86-NEXT: movl 244(%esp,%edx), %edx
+; X86-NEXT: shldl %cl, %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shll %cl, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: addl $1, %ecx
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: jb .LBB0_47
+; X86-NEXT: # %bb.43: # %udiv-preheader
+; X86-NEXT: movl 12(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 16(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 20(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 24(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 28(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 32(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 40(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $5, %al
+; X86-NEXT: movzbl %al, %edx
+; X86-NEXT: movl 172(%esp,%edx,4), %edi
+; X86-NEXT: movl 168(%esp,%edx,4), %ebx
+; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shrdl %cl, %edi, %esi
+; X86-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X86-NEXT: movl 164(%esp,%edx,4), %esi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: shrdl %cl, %ebx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 160(%esp,%edx,4), %eax
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: shrdl %cl, %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 156(%esp,%edx,4), %esi
+; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: shrdl %cl, %eax, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 152(%esp,%edx,4), %eax
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: shrdl %cl, %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 144(%esp,%edx,4), %esi
+; X86-NEXT: movl 148(%esp,%edx,4), %edx
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: shrdl %cl, %eax, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shrl %cl, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shrdl %cl, %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 44(%ebp), %eax
+; X86-NEXT: addl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 48(%ebp), %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 52(%ebp), %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 56(%ebp), %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 60(%ebp), %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 64(%ebp), %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 68(%ebp), %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 72(%ebp), %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: .p2align 4
+; X86-NEXT: .LBB0_44: # %udiv-do-while
+; X86-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-NEXT: movl (%esp), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %esi
+; X86-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl $1, %ebx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $1, %edi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl $1, %ecx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl $1, %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: orl %edi, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %eax
+; X86-NEXT: orl %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %ecx
+; X86-NEXT: orl %edi, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %eax
+; X86-NEXT: orl %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %ecx
+; X86-NEXT: orl %edi, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %eax
+; X86-NEXT: orl %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %ecx
+; X86-NEXT: orl %edi, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl %eax, %eax
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: cmpl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl %edx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl %esi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl %ebx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl (%esp), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: sarl $31, %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: andl $1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: andl 72(%ebp), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: andl 68(%ebp), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: andl 64(%ebp), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: andl 60(%ebp), %ebx
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: andl 56(%ebp), %esi
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: andl 52(%ebp), %edx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: andl 48(%ebp), %eax
+; X86-NEXT: andl 44(%ebp), %ecx
+; X86-NEXT: subl %ecx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: sbbl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: sbbl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl %ebx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: adcl $-1, %esi
+; X86-NEXT: adcl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: adcl $-1, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: adcl $-1, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %ebx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: jne .LBB0_44
+; X86-NEXT: .LBB0_45: # %udiv-loop-exit
+; X86-NEXT: shldl $1, %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: leal (%esi,%edx,2), %edi
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: .LBB0_49: # %udiv-end
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %edi, (%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 4(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 8(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 12(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 16(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 20(%eax)
+; X86-NEXT: movl %edx, 24(%eax)
+; X86-NEXT: movl %esi, 28(%eax)
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl $4
+; X86-NEXT: .LBB0_37:
+; X86-NEXT: movb $1, %al
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: jmp .LBB0_39
+; X86-NEXT: .LBB0_47:
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: jmp .LBB0_45
+; X86-NEXT: .LBB0_42:
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl (%esp), %edi # 4-byte Reload
+; X86-NEXT: jmp .LBB0_49
+;
+; X64-LABEL: udiv256:
+; X64: # %bb.0: # %_udiv-special-cases
+; X64-NEXT: pushq %rbp
+; X64-NEXT: movq %rsp, %rbp
+; X64-NEXT: pushq %r15
+; X64-NEXT: pushq %r14
+; X64-NEXT: pushq %r13
+; X64-NEXT: pushq %r12
+; X64-NEXT: pushq %rbx
+; X64-NEXT: andq $-32, %rsp
+; X64-NEXT: subq $288, %rsp # imm = 0x120
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: movq 24(%rbp), %r15
+; X64-NEXT: movq 32(%rbp), %r10
+; X64-NEXT: movq 16(%rbp), %r11
+; X64-NEXT: movq %r11, %rdi
+; X64-NEXT: orq %r10, %rdi
+; X64-NEXT: movq %r9, %rbx
+; X64-NEXT: orq %r15, %rbx
+; X64-NEXT: orq %rdi, %rbx
+; X64-NEXT: sete %bl
+; X64-NEXT: movq %rdx, %r14
+; X64-NEXT: orq %r8, %r14
+; X64-NEXT: movq %rsi, %rdi
+; X64-NEXT: orq %rcx, %rdi
+; X64-NEXT: orq %r14, %rdi
+; X64-NEXT: sete %dil
+; X64-NEXT: orb %bl, %dil
+; X64-NEXT: bsrq %r10, %r14
+; X64-NEXT: xorq $63, %r14
+; X64-NEXT: bsrq %r15, %rbx
+; X64-NEXT: xorq $63, %rbx
+; X64-NEXT: orq $64, %rbx
+; X64-NEXT: testq %r10, %r10
+; X64-NEXT: cmovneq %r14, %rbx
+; X64-NEXT: bsrq %r11, %r14
+; X64-NEXT: xorq $63, %r14
+; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: bsrq %r9, %r9
+; X64-NEXT: xorq $63, %r9
+; X64-NEXT: orq $64, %r9
+; X64-NEXT: testq %r11, %r11
+; X64-NEXT: movq %rdx, %r11
+; X64-NEXT: cmovneq %r14, %r9
+; X64-NEXT: orq $128, %r9
+; X64-NEXT: movq %r15, %rdx
+; X64-NEXT: orq %r10, %rdx
+; X64-NEXT: cmovneq %rbx, %r9
+; X64-NEXT: bsrq %r8, %r10
+; X64-NEXT: xorq $63, %r10
+; X64-NEXT: bsrq %rcx, %rdx
+; X64-NEXT: xorq $63, %rdx
+; X64-NEXT: orq $64, %rdx
+; X64-NEXT: testq %r8, %r8
+; X64-NEXT: cmovneq %r10, %rdx
+; X64-NEXT: bsrq %r11, %r10
+; X64-NEXT: xorq $63, %r10
+; X64-NEXT: bsrq %rsi, %rbx
+; X64-NEXT: xorq $63, %rbx
+; X64-NEXT: orq $64, %rbx
+; X64-NEXT: testq %r11, %r11
+; X64-NEXT: cmovneq %r10, %rbx
+; X64-NEXT: orq $128, %rbx
+; X64-NEXT: movq %rcx, %r10
+; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: orq %r8, %r10
+; X64-NEXT: cmovneq %rdx, %rbx
+; X64-NEXT: xorl %r10d, %r10d
+; X64-NEXT: subq %rbx, %r9
+; X64-NEXT: movl $0, %ebx
+; X64-NEXT: sbbq %rbx, %rbx
+; X64-NEXT: movl $0, %r14d
+; X64-NEXT: sbbq %r14, %r14
+; X64-NEXT: sbbq %r10, %r10
+; X64-NEXT: movq %rsi, %r15
+; X64-NEXT: movq %rcx, %r8
+; X64-NEXT: testb %dil, %dil
+; X64-NEXT: jne .LBB0_1
+; X64-NEXT: # %bb.2: # %select.false.sink
+; X64-NEXT: xorl %edx, %edx
+; X64-NEXT: movl $255, %ecx
+; X64-NEXT: cmpq %r9, %rcx
+; X64-NEXT: movl $0, %ecx
+; X64-NEXT: sbbq %rbx, %rcx
+; X64-NEXT: movl $0, %ecx
+; X64-NEXT: sbbq %r14, %rcx
+; X64-NEXT: sbbq %r10, %rdx
+; X64-NEXT: setb %cl
+; X64-NEXT: .LBB0_3: # %select.end
+; X64-NEXT: xorl %r12d, %r12d
+; X64-NEXT: testb %cl, %cl
+; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: cmovneq %r12, %r11
+; X64-NEXT: movq %r15, %rcx
+; X64-NEXT: cmovneq %r12, %rcx
+; X64-NEXT: movq %r8, %rdx
+; X64-NEXT: cmovneq %r12, %rdx
+; X64-NEXT: cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; X64-NEXT: jne .LBB0_9
+; X64-NEXT: # %bb.4: # %select.end
+; X64-NEXT: movq %r9, %rsi
+; X64-NEXT: xorq $255, %rsi
+; X64-NEXT: orq %r14, %rsi
+; X64-NEXT: movq %rbx, %rdi
+; X64-NEXT: orq %r10, %rdi
+; X64-NEXT: orq %rsi, %rdi
+; X64-NEXT: je .LBB0_9
+; X64-NEXT: # %bb.5: # %udiv-bb1
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r15, %r11
+; X64-NEXT: movq %r15, {{[0-9]+}}(%rsp)
+; X64-NEXT: xorps %xmm0, %xmm0
+; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; X64-NEXT: movq %r12, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %r8, %r15
+; X64-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-NEXT: movl %r9d, %ecx
+; X64-NEXT: notb %cl
+; X64-NEXT: movl %ecx, %eax
+; X64-NEXT: shrb $3, %al
+; X64-NEXT: andb $24, %al
+; X64-NEXT: negb %al
+; X64-NEXT: movsbq %al, %rdi
+; X64-NEXT: movq 240(%rsp,%rdi), %rsi
+; X64-NEXT: movq 248(%rsp,%rdi), %rax
+; X64-NEXT: shldq %cl, %rsi, %rax
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq 224(%rsp,%rdi), %rdx
+; X64-NEXT: movq 232(%rsp,%rdi), %rax
+; X64-NEXT: shldq %cl, %rax, %rsi
+; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: shldq %cl, %rdx, %rax
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: shlq %cl, %rdx
+; X64-NEXT: addq $1, %r9
+; X64-NEXT: adcq $0, %rbx
+; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq $0, %r14
+; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq $0, %r10
+; X64-NEXT: jb .LBB0_10
+; X64-NEXT: # %bb.6: # %udiv-preheader
+; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %r11, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %r12, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %r15, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-NEXT: movl %r9d, %ecx
+; X64-NEXT: shrb $6, %cl
+; X64-NEXT: movzbl %cl, %edi
+; X64-NEXT: movq 152(%rsp,%rdi,8), %r8
+; X64-NEXT: movq %r9, %rcx
+; X64-NEXT: movq 144(%rsp,%rdi,8), %r9
+; X64-NEXT: movq %r9, %rsi
+; X64-NEXT: shrdq %cl, %r8, %rsi
+; X64-NEXT: movq 128(%rsp,%rdi,8), %rbx
+; X64-NEXT: movq 136(%rsp,%rdi,8), %rdi
+; X64-NEXT: movq %rdi, %r14
+; X64-NEXT: shrdq %cl, %r9, %r14
+; X64-NEXT: shrq %cl, %r8
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-NEXT: shrdq %cl, %rdi, %rbx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT: addq $-1, %rax
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq 16(%rbp), %rax
+; X64-NEXT: adcq $-1, %rax
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq 24(%rbp), %rax
+; X64-NEXT: adcq $-1, %rax
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq 32(%rbp), %rax
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: adcq $-1, %rcx
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: xorl %r9d, %r9d
+; X64-NEXT: xorl %edi, %edi
+; X64-NEXT: xorl %r15d, %r15d
+; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64-NEXT: .p2align 4
+; X64-NEXT: .LBB0_7: # %udiv-do-while
+; X64-NEXT: # =>This Inner Loop Header: Depth=1
+; X64-NEXT: shldq $1, %rsi, %r8
+; X64-NEXT: shldq $1, %r14, %rsi
+; X64-NEXT: shldq $1, %rbx, %r14
+; X64-NEXT: shldq $1, %r12, %rbx
+; X64-NEXT: shldq $1, %r10, %r12
+; X64-NEXT: orq %r15, %r12
+; X64-NEXT: shldq $1, %r11, %r10
+; X64-NEXT: orq %rdi, %r10
+; X64-NEXT: shldq $1, %rdx, %r11
+; X64-NEXT: orq %r9, %r11
+; X64-NEXT: addq %rdx, %rdx
+; X64-NEXT: orq %rcx, %rdx
+; X64-NEXT: cmpq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: sbbq %r14, %rcx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: sbbq %rsi, %rcx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT: sbbq %r8, %rdi
+; X64-NEXT: sarq $63, %rdi
+; X64-NEXT: movl %edi, %ecx
+; X64-NEXT: andl $1, %ecx
+; X64-NEXT: movq %rdi, %r9
+; X64-NEXT: andq %rax, %r9
+; X64-NEXT: movq %rdi, %r15
+; X64-NEXT: movq 24(%rbp), %rax
+; X64-NEXT: andq %rax, %r15
+; X64-NEXT: movq %rdi, %r13
+; X64-NEXT: andq 16(%rbp), %r13
+; X64-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
+; X64-NEXT: subq %rdi, %rbx
+; X64-NEXT: sbbq %r13, %r14
+; X64-NEXT: movq 32(%rbp), %rax
+; X64-NEXT: sbbq %r15, %rsi
+; X64-NEXT: sbbq %r9, %r8
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT: addq $-1, %r9
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT: adcq $-1, %rdi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; X64-NEXT: adcq $-1, %r15
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; X64-NEXT: adcq $-1, %r13
+; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: orq %r13, %rdi
+; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: orq %r15, %r9
+; X64-NEXT: orq %rdi, %r9
+; X64-NEXT: movl $0, %r9d
+; X64-NEXT: movl $0, %edi
+; X64-NEXT: movl $0, %r15d
+; X64-NEXT: jne .LBB0_7
+; X64-NEXT: .LBB0_8: # %udiv-loop-exit
+; X64-NEXT: shldq $1, %r10, %r12
+; X64-NEXT: shldq $1, %r11, %r10
+; X64-NEXT: shldq $1, %rdx, %r11
+; X64-NEXT: leaq (%rcx,%rdx,2), %rcx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT: movq %r10, %rdx
+; X64-NEXT: .LBB0_9: # %udiv-end
+; X64-NEXT: movq %rcx, (%rax)
+; X64-NEXT: movq %r11, 8(%rax)
+; X64-NEXT: movq %rdx, 16(%rax)
+; X64-NEXT: movq %r12, 24(%rax)
+; X64-NEXT: leaq -40(%rbp), %rsp
+; X64-NEXT: popq %rbx
+; X64-NEXT: popq %r12
+; X64-NEXT: popq %r13
+; X64-NEXT: popq %r14
+; X64-NEXT: popq %r15
+; X64-NEXT: popq %rbp
+; X64-NEXT: retq
+; X64-NEXT: .LBB0_1:
+; X64-NEXT: movb $1, %cl
+; X64-NEXT: jmp .LBB0_3
+; X64-NEXT: .LBB0_10:
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64-NEXT: jmp .LBB0_8
+ %r = udiv i256 %a, %b
+ ret i256 %r
+}
+
+define i256 @sdiv256(i256 %a, i256 %b) nounwind {
+; X86-LABEL: sdiv256:
+; X86: # %bb.0: # %_udiv-special-cases
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $320, %esp # imm = 0x140
+; X86-NEXT: movl 40(%ebp), %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: sarl $31, %eax
+; X86-NEXT: xorl %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 36(%ebp), %edx
+; X86-NEXT: xorl %eax, %edx
+; X86-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT: movl 32(%ebp), %edx
+; X86-NEXT: xorl %eax, %edx
+; X86-NEXT: movl 28(%ebp), %ecx
+; X86-NEXT: xorl %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 24(%ebp), %esi
+; X86-NEXT: xorl %eax, %esi
+; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: movl 20(%ebp), %esi
+; X86-NEXT: xorl %eax, %esi
+; X86-NEXT: movl 16(%ebp), %ecx
+; X86-NEXT: xorl %eax, %ecx
+; X86-NEXT: movl 12(%ebp), %edi
+; X86-NEXT: xorl %eax, %edi
+; X86-NEXT: subl %eax, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %eax, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: sbbl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %eax, (%esp) # 4-byte Folded Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl 72(%ebp), %eax
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: sarl $31, %ecx
+; X86-NEXT: xorl %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 68(%ebp), %eax
+; X86-NEXT: xorl %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 64(%ebp), %ebx
+; X86-NEXT: xorl %ecx, %ebx
+; X86-NEXT: movl 60(%ebp), %eax
+; X86-NEXT: xorl %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 56(%ebp), %edx
+; X86-NEXT: xorl %ecx, %edx
+; X86-NEXT: movl 52(%ebp), %edi
+; X86-NEXT: xorl %ecx, %edi
+; X86-NEXT: movl 48(%ebp), %eax
+; X86-NEXT: xorl %ecx, %eax
+; X86-NEXT: movl 44(%ebp), %esi
+; X86-NEXT: xorl %ecx, %esi
+; X86-NEXT: subl %ecx, %esi
+; X86-NEXT: sbbl %ecx, %eax
+; X86-NEXT: sbbl %ecx, %edi
+; X86-NEXT: sbbl %ecx, %edx
+; X86-NEXT: sbbl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: sbbl %ecx, %ebx
+; X86-NEXT: sbbl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %ebx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: orl %edi, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: sete %cl
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: orl %edi, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl (%esp), %edx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: orl %esi, %edi
+; X86-NEXT: sete %ch
+; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: jne .LBB1_1
+; X86-NEXT: # %bb.2: # %_udiv-special-cases
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: bsrl %esi, %eax
+; X86-NEXT: xorl $31, %eax
+; X86-NEXT: orl $32, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: jmp .LBB1_3
+; X86-NEXT: .LBB1_1:
+; X86-NEXT: bsrl %edx, %eax
+; X86-NEXT: xorl $31, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: .LBB1_3: # %_udiv-special-cases
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: testl %esi, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: jne .LBB1_4
+; X86-NEXT: # %bb.5: # %_udiv-special-cases
+; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: orl $32, %esi
+; X86-NEXT: jmp .LBB1_6
+; X86-NEXT: .LBB1_4:
+; X86-NEXT: bsrl %esi, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: .LBB1_6: # %_udiv-special-cases
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: jne .LBB1_8
+; X86-NEXT: # %bb.7: # %_udiv-special-cases
+; X86-NEXT: orl $64, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: .LBB1_8: # %_udiv-special-cases
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: testl %esi, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: jne .LBB1_9
+; X86-NEXT: # %bb.10: # %_udiv-special-cases
+; X86-NEXT: bsrl %ebx, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: orl $32, %esi
+; X86-NEXT: testl %edi, %edi
+; X86-NEXT: je .LBB1_13
+; X86-NEXT: .LBB1_12:
+; X86-NEXT: bsrl %edi, %edi
+; X86-NEXT: xorl $31, %edi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: je .LBB1_15
+; X86-NEXT: jmp .LBB1_16
+; X86-NEXT: .LBB1_9:
+; X86-NEXT: bsrl %esi, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: testl %edi, %edi
+; X86-NEXT: jne .LBB1_12
+; X86-NEXT: .LBB1_13: # %_udiv-special-cases
+; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: xorl $31, %edi
+; X86-NEXT: orl $32, %edi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: jne .LBB1_16
+; X86-NEXT: .LBB1_15: # %_udiv-special-cases
+; X86-NEXT: orl $64, %edi
+; X86-NEXT: movl %edi, %esi
+; X86-NEXT: .LBB1_16: # %_udiv-special-cases
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: orl %edx, %ebx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: jne .LBB1_18
+; X86-NEXT: # %bb.17: # %_udiv-special-cases
+; X86-NEXT: orl $128, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: .LBB1_18: # %_udiv-special-cases
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: testl %eax, %eax
+; X86-NEXT: movl (%esp), %ebx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: jne .LBB1_19
+; X86-NEXT: # %bb.20: # %_udiv-special-cases
+; X86-NEXT: bsrl %ebx, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: orl $32, %esi
+; X86-NEXT: testl %edi, %edi
+; X86-NEXT: je .LBB1_23
+; X86-NEXT: .LBB1_22:
+; X86-NEXT: bsrl %edi, %edi
+; X86-NEXT: xorl $31, %edi
+; X86-NEXT: orl %eax, %ebx
+; X86-NEXT: je .LBB1_25
+; X86-NEXT: jmp .LBB1_26
+; X86-NEXT: .LBB1_19:
+; X86-NEXT: bsrl %eax, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: testl %edi, %edi
+; X86-NEXT: jne .LBB1_22
+; X86-NEXT: .LBB1_23: # %_udiv-special-cases
+; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: xorl $31, %edi
+; X86-NEXT: orl $32, %edi
+; X86-NEXT: orl %eax, %ebx
+; X86-NEXT: jne .LBB1_26
+; X86-NEXT: .LBB1_25: # %_udiv-special-cases
+; X86-NEXT: orl $64, %edi
+; X86-NEXT: movl %edi, %esi
+; X86-NEXT: .LBB1_26: # %_udiv-special-cases
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: testl %edi, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: jne .LBB1_27
+; X86-NEXT: # %bb.28: # %_udiv-special-cases
+; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: xorl $31, %edi
+; X86-NEXT: orl $32, %edi
+; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: je .LBB1_31
+; X86-NEXT: .LBB1_30:
+; X86-NEXT: bsrl %ebx, %ebx
+; X86-NEXT: xorl $31, %ebx
+; X86-NEXT: jmp .LBB1_32
+; X86-NEXT: .LBB1_27:
+; X86-NEXT: bsrl %edi, %edi
+; X86-NEXT: xorl $31, %edi
+; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: jne .LBB1_30
+; X86-NEXT: .LBB1_31: # %_udiv-special-cases
+; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: xorl $31, %ebx
+; X86-NEXT: orl $32, %ebx
+; X86-NEXT: .LBB1_32: # %_udiv-special-cases
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: jne .LBB1_34
+; X86-NEXT: # %bb.33: # %_udiv-special-cases
+; X86-NEXT: orl $64, %ebx
+; X86-NEXT: movl %ebx, %edi
+; X86-NEXT: .LBB1_34: # %_udiv-special-cases
+; X86-NEXT: orb %ch, %cl
+; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl (%esp), %edx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: orl %eax, %ebx
+; X86-NEXT: orl %edx, %ebx
+; X86-NEXT: jne .LBB1_36
+; X86-NEXT: # %bb.35: # %_udiv-special-cases
+; X86-NEXT: orl $128, %edi
+; X86-NEXT: movl %edi, %esi
+; X86-NEXT: .LBB1_36: # %_udiv-special-cases
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: subl %esi, %edx
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %esi
+; X86-NEXT: sbbl %esi, %esi
+; X86-NEXT: movl $0, %edi
+; X86-NEXT: sbbl %edi, %edi
+; X86-NEXT: movl $0, %ebx
+; X86-NEXT: sbbl %ebx, %ebx
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %ecx, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: jne .LBB1_37
+; X86-NEXT: # %bb.38: # %select.false.sink
+; X86-NEXT: movl $255, %ecx
+; X86-NEXT: cmpl %edx, %ecx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %esi, %ecx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %edi, %ecx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %ebx, %ecx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %eax, %ecx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl %eax, %ecx
+; X86-NEXT: setb %cl
+; X86-NEXT: .LBB1_39: # %select.end
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl (%esp), %ebx # 4-byte Reload
+; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: testb %cl, %cl
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $0, %esi
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: jne .LBB1_41
+; X86-NEXT: # %bb.40: # %select.end
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: .LBB1_41: # %select.end
+; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: jne .LBB1_42
+; X86-NEXT: # %bb.48: # %select.end
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: xorl $255, %ecx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: je .LBB1_49
+; X86-NEXT: # %bb.46: # %udiv-bb1
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: notb %cl
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $28, %al
+; X86-NEXT: negb %al
+; X86-NEXT: movsbl %al, %edx
+; X86-NEXT: movl 296(%esp,%edx), %esi
+; X86-NEXT: movl 300(%esp,%edx), %edi
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 292(%esp,%edx), %edi
+; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 288(%esp,%edx), %esi
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 284(%esp,%edx), %edi
+; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 280(%esp,%edx), %esi
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 272(%esp,%edx), %edi
+; X86-NEXT: movl (%esp), %ebx # 4-byte Reload
+; X86-NEXT: movl 276(%esp,%edx), %edx
+; X86-NEXT: shldl %cl, %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shll %cl, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: addl $1, %ecx
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: jb .LBB1_47
+; X86-NEXT: # %bb.43: # %udiv-preheader
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $5, %al
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: movl 204(%esp,%eax,4), %edi
+; X86-NEXT: movl 200(%esp,%eax,4), %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shrdl %cl, %edi, %esi
+; X86-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X86-NEXT: movl 196(%esp,%eax,4), %ebx
+; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: shrdl %cl, %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 192(%esp,%eax,4), %esi
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: shrdl %cl, %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 188(%esp,%eax,4), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shrdl %cl, %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 184(%esp,%eax,4), %esi
+; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shrdl %cl, %edx, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 176(%esp,%eax,4), %edx
+; X86-NEXT: movl 180(%esp,%eax,4), %eax
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: shrdl %cl, %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shrl %cl, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shrdl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: addl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: adcl $-1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: .p2align 4
+; X86-NEXT: .LBB1_44: # %udiv-do-while
+; X86-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %edx
+; X86-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl $1, %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl $1, %ecx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: orl %edi, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %ecx
+; X86-NEXT: orl %edi, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %esi
+; X86-NEXT: orl %edi, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %ecx
+; X86-NEXT: orl %edi, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %esi
+; X86-NEXT: orl %edi, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl $1, %eax, %ecx
+; X86-NEXT: orl %edi, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %eax
+; X86-NEXT: orl %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl %ecx, %ecx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl %ebx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl (%esp), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: sarl $31, %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: andl $1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: subl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl %ebx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: adcl $-1, %edx
+; X86-NEXT: adcl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: adcl $-1, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: adcl $-1, %esi
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: orl %edx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: orl %esi, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: jne .LBB1_44
+; X86-NEXT: .LBB1_45: # %udiv-loop-exit
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $1, %edi, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl $1, %ebx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $1, %edi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl $1, %eax, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $1, %edi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: leal (%ebx,%edi,2), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: .LBB1_49: # %udiv-end
+; X86-NEXT: xorl %edi, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: xorl %edi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: xorl %edi, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: xorl %edi, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: xorl %edi, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: xorl %edi, %edx
+; X86-NEXT: xorl %edi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: xorl %edi, %ecx
+; X86-NEXT: subl %edi, %ecx
+; X86-NEXT: sbbl %edi, %eax
+; X86-NEXT: sbbl %edi, %edx
+; X86-NEXT: sbbl %edi, %esi
+; X86-NEXT: sbbl %edi, %ebx
+; X86-NEXT: sbbl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: sbbl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: sbbl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: movl %edi, 4(%eax)
+; X86-NEXT: movl %edx, 8(%eax)
+; X86-NEXT: movl %esi, 12(%eax)
+; X86-NEXT: movl %ebx, 16(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 20(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 24(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 28(%eax)
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl $4
+; X86-NEXT: .LBB1_37:
+; X86-NEXT: movb $1, %cl
+; X86-NEXT: jmp .LBB1_39
+; X86-NEXT: .LBB1_47:
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: jmp .LBB1_45
+; X86-NEXT: .LBB1_42:
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: jmp .LBB1_49
+;
+; X64-LABEL: sdiv256:
+; X64: # %bb.0: # %_udiv-special-cases
+; X64-NEXT: pushq %rbp
+; X64-NEXT: movq %rsp, %rbp
+; X64-NEXT: pushq %r15
+; X64-NEXT: pushq %r14
+; X64-NEXT: pushq %r13
+; X64-NEXT: pushq %r12
+; X64-NEXT: pushq %rbx
+; X64-NEXT: andq $-32, %rsp
+; X64-NEXT: subq $288, %rsp # imm = 0x120
+; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq 32(%rbp), %r13
+; X64-NEXT: movq %r8, %r10
+; X64-NEXT: sarq $63, %r10
+; X64-NEXT: xorq %r10, %r8
+; X64-NEXT: xorq %r10, %rcx
+; X64-NEXT: xorq %r10, %rdx
+; X64-NEXT: xorq %r10, %rsi
+; X64-NEXT: subq %r10, %rsi
+; X64-NEXT: sbbq %r10, %rdx
+; X64-NEXT: sbbq %r10, %rcx
+; X64-NEXT: sbbq %r10, %r8
+; X64-NEXT: movq %r13, %r11
+; X64-NEXT: sarq $63, %r11
+; X64-NEXT: xorq %r11, %r13
+; X64-NEXT: movq 24(%rbp), %rax
+; X64-NEXT: xorq %r11, %rax
+; X64-NEXT: movq 16(%rbp), %r14
+; X64-NEXT: xorq %r11, %r14
+; X64-NEXT: xorq %r11, %r9
+; X64-NEXT: subq %r11, %r9
+; X64-NEXT: sbbq %r11, %r14
+; X64-NEXT: sbbq %r11, %rax
+; X64-NEXT: sbbq %r11, %r13
+; X64-NEXT: movq %r14, %rbx
+; X64-NEXT: orq %r13, %rbx
+; X64-NEXT: movq %r9, %rdi
+; X64-NEXT: orq %rax, %rdi
+; X64-NEXT: orq %rbx, %rdi
+; X64-NEXT: sete %dil
+; X64-NEXT: movq %rdx, %rbx
+; X64-NEXT: orq %r8, %rbx
+; X64-NEXT: movq %rsi, %r15
+; X64-NEXT: orq %rcx, %r15
+; X64-NEXT: orq %rbx, %r15
+; X64-NEXT: sete %bl
+; X64-NEXT: orb %dil, %bl
+; X64-NEXT: bsrq %r13, %rdi
+; X64-NEXT: xorq $63, %rdi
+; X64-NEXT: bsrq %rax, %r15
+; X64-NEXT: xorq $63, %r15
+; X64-NEXT: orq $64, %r15
+; X64-NEXT: testq %r13, %r13
+; X64-NEXT: cmovneq %rdi, %r15
+; X64-NEXT: bsrq %r14, %rdi
+; X64-NEXT: xorq $63, %rdi
+; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: bsrq %r9, %r12
+; X64-NEXT: xorq $63, %r12
+; X64-NEXT: orq $64, %r12
+; X64-NEXT: testq %r14, %r14
+; X64-NEXT: cmovneq %rdi, %r12
+; X64-NEXT: orq $128, %r12
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %rax, %rdi
+; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: orq %r13, %rdi
+; X64-NEXT: cmovneq %r15, %r12
+; X64-NEXT: bsrq %r8, %rdi
+; X64-NEXT: xorq $63, %rdi
+; X64-NEXT: bsrq %rcx, %r9
+; X64-NEXT: xorq $63, %r9
+; X64-NEXT: orq $64, %r9
+; X64-NEXT: testq %r8, %r8
+; X64-NEXT: cmovneq %rdi, %r9
+; X64-NEXT: bsrq %rdx, %rax
+; X64-NEXT: xorq $63, %rax
+; X64-NEXT: movq %rsi, (%rsp) # 8-byte Spill
+; X64-NEXT: bsrq %rsi, %rdi
+; X64-NEXT: xorq $63, %rdi
+; X64-NEXT: orq $64, %rdi
+; X64-NEXT: testq %rdx, %rdx
+; X64-NEXT: cmovneq %rax, %rdi
+; X64-NEXT: orq $128, %rdi
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: orq %r8, %rax
+; X64-NEXT: cmovneq %r9, %rdi
+; X64-NEXT: xorl %r9d, %r9d
+; X64-NEXT: subq %rdi, %r12
+; X64-NEXT: movl $0, %r15d
+; X64-NEXT: sbbq %r15, %r15
+; X64-NEXT: movl $0, %r13d
+; X64-NEXT: sbbq %r13, %r13
+; X64-NEXT: sbbq %r9, %r9
+; X64-NEXT: testb %bl, %bl
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: jne .LBB1_1
+; X64-NEXT: # %bb.2: # %select.false.sink
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: movl $255, %esi
+; X64-NEXT: cmpq %r12, %rsi
+; X64-NEXT: movl $0, %esi
+; X64-NEXT: sbbq %r15, %rsi
+; X64-NEXT: movl $0, %esi
+; X64-NEXT: sbbq %r13, %rsi
+; X64-NEXT: sbbq %r9, %rcx
+; X64-NEXT: setb %cl
+; X64-NEXT: .LBB1_3: # %select.end
+; X64-NEXT: xorq %r10, %r11
+; X64-NEXT: xorl %r10d, %r10d
+; X64-NEXT: testb %cl, %cl
+; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %rdx, %rbx
+; X64-NEXT: cmovneq %r10, %rbx
+; X64-NEXT: movq (%rsp), %rcx # 8-byte Reload
+; X64-NEXT: cmovneq %r10, %rcx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT: cmovneq %r10, %rdx
+; X64-NEXT: cmoveq %r8, %r10
+; X64-NEXT: jne .LBB1_4
+; X64-NEXT: # %bb.10: # %select.end
+; X64-NEXT: movq %r12, %rsi
+; X64-NEXT: xorq $255, %rsi
+; X64-NEXT: orq %r13, %rsi
+; X64-NEXT: movq %r15, %rdi
+; X64-NEXT: orq %r9, %rdi
+; X64-NEXT: orq %rsi, %rdi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT: je .LBB1_11
+; X64-NEXT: # %bb.8: # %udiv-bb1
+; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq (%rsp), %rax # 8-byte Reload
+; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; X64-NEXT: xorps %xmm0, %xmm0
+; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT: movq %r14, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; X64-NEXT: movq %rbx, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-NEXT: movl %r12d, %ecx
+; X64-NEXT: notb %cl
+; X64-NEXT: movl %ecx, %edx
+; X64-NEXT: shrb $3, %dl
+; X64-NEXT: andb $24, %dl
+; X64-NEXT: negb %dl
+; X64-NEXT: movsbq %dl, %rdx
+; X64-NEXT: movq 240(%rsp,%rdx), %rdi
+; X64-NEXT: movq 248(%rsp,%rdx), %rsi
+; X64-NEXT: shldq %cl, %rdi, %rsi
+; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq 224(%rsp,%rdx), %rsi
+; X64-NEXT: movq 232(%rsp,%rdx), %r11
+; X64-NEXT: shldq %cl, %r11, %rdi
+; X64-NEXT: movq %rdi, %rdx
+; X64-NEXT: shldq %cl, %rsi, %r11
+; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: shlq %cl, %rsi
+; X64-NEXT: addq $1, %r12
+; X64-NEXT: adcq $0, %r15
+; X64-NEXT: adcq $0, %r13
+; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq $0, %r9
+; X64-NEXT: jb .LBB1_9
+; X64-NEXT: # %bb.5: # %udiv-preheader
+; X64-NEXT: movq %r15, %r10
+; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %r14, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %rbx, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-NEXT: movl %r12d, %ecx
+; X64-NEXT: shrb $6, %cl
+; X64-NEXT: movzbl %cl, %edi
+; X64-NEXT: movq 152(%rsp,%rdi,8), %r8
+; X64-NEXT: movq 144(%rsp,%rdi,8), %r11
+; X64-NEXT: movq %r11, %rax
+; X64-NEXT: movl %r12d, %ecx
+; X64-NEXT: shrdq %cl, %r8, %rax
+; X64-NEXT: movq %r12, %rcx
+; X64-NEXT: movq 128(%rsp,%rdi,8), %r12
+; X64-NEXT: movq 136(%rsp,%rdi,8), %rdi
+; X64-NEXT: movq %rdi, %r14
+; X64-NEXT: shrdq %cl, %r11, %r14
+; X64-NEXT: shrq %cl, %r8
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-NEXT: shrdq %cl, %rdi, %r12
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: addq $-1, %rcx
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: adcq $-1, %rcx
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: adcq $-1, %rcx
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: adcq $-1, %rcx
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: xorl %r15d, %r15d
+; X64-NEXT: xorl %edi, %edi
+; X64-NEXT: xorl %r13d, %r13d
+; X64-NEXT: movq %r9, (%rsp) # 8-byte Spill
+; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT: .p2align 4
+; X64-NEXT: .LBB1_6: # %udiv-do-while
+; X64-NEXT: # =>This Inner Loop Header: Depth=1
+; X64-NEXT: shldq $1, %rax, %r8
+; X64-NEXT: shldq $1, %r14, %rax
+; X64-NEXT: shldq $1, %r12, %r14
+; X64-NEXT: shldq $1, %r10, %r12
+; X64-NEXT: shldq $1, %rdx, %r10
+; X64-NEXT: orq %r13, %r10
+; X64-NEXT: shldq $1, %rbx, %rdx
+; X64-NEXT: orq %rdi, %rdx
+; X64-NEXT: shldq $1, %rsi, %rbx
+; X64-NEXT: orq %r15, %rbx
+; X64-NEXT: addq %rsi, %rsi
+; X64-NEXT: orq %rcx, %rsi
+; X64-NEXT: cmpq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: sbbq %r14, %rcx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: sbbq %rax, %rcx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT: sbbq %r8, %rdi
+; X64-NEXT: sarq $63, %rdi
+; X64-NEXT: movl %edi, %ecx
+; X64-NEXT: andl $1, %ecx
+; X64-NEXT: movq %rdi, %r15
+; X64-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
+; X64-NEXT: movq %rdi, %r13
+; X64-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; X64-NEXT: movq %rdi, %r11
+; X64-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; X64-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
+; X64-NEXT: subq %rdi, %r12
+; X64-NEXT: sbbq %r11, %r14
+; X64-NEXT: sbbq %r13, %rax
+; X64-NEXT: sbbq %r15, %r8
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64-NEXT: addq $-1, %r11
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT: adcq $-1, %rdi
+; X64-NEXT: adcq $-1, %r9
+; X64-NEXT: movq (%rsp), %r15 # 8-byte Reload
+; X64-NEXT: adcq $-1, %r15
+; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r15, (%rsp) # 8-byte Spill
+; X64-NEXT: orq %r15, %rdi
+; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: orq %r9, %r11
+; X64-NEXT: orq %rdi, %r11
+; X64-NEXT: movl $0, %r15d
+; X64-NEXT: movl $0, %edi
+; X64-NEXT: movl $0, %r13d
+; X64-NEXT: jne .LBB1_6
+; X64-NEXT: .LBB1_7: # %udiv-loop-exit
+; X64-NEXT: shldq $1, %rdx, %r10
+; X64-NEXT: shldq $1, %rbx, %rdx
+; X64-NEXT: shldq $1, %rsi, %rbx
+; X64-NEXT: leaq (%rcx,%rsi,2), %rcx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64-NEXT: .LBB1_11: # %udiv-end
+; X64-NEXT: xorq %r11, %r10
+; X64-NEXT: xorq %r11, %rdx
+; X64-NEXT: xorq %r11, %rbx
+; X64-NEXT: xorq %r11, %rcx
+; X64-NEXT: subq %r11, %rcx
+; X64-NEXT: sbbq %r11, %rbx
+; X64-NEXT: sbbq %r11, %rdx
+; X64-NEXT: sbbq %r11, %r10
+; X64-NEXT: movq %rcx, (%rax)
+; X64-NEXT: movq %rbx, 8(%rax)
+; X64-NEXT: movq %rdx, 16(%rax)
+; X64-NEXT: movq %r10, 24(%rax)
+; X64-NEXT: leaq -40(%rbp), %rsp
+; X64-NEXT: popq %rbx
+; X64-NEXT: popq %r12
+; X64-NEXT: popq %r13
+; X64-NEXT: popq %r14
+; X64-NEXT: popq %r15
+; X64-NEXT: popq %rbp
+; X64-NEXT: retq
+; X64-NEXT: .LBB1_1:
+; X64-NEXT: movb $1, %cl
+; X64-NEXT: jmp .LBB1_3
+; X64-NEXT: .LBB1_9:
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; X64-NEXT: jmp .LBB1_7
+; X64-NEXT: .LBB1_4:
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT: jmp .LBB1_11
+ %r = sdiv i256 %a, %b
+ ret i256 %r
+}
+
+define i256 @urem256(i256 %a, i256 %b) nounwind {
+; X86-LABEL: urem256:
+; X86: # %bb.0: # %_udiv-special-cases
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $288, %esp # imm = 0x120
+; X86-NEXT: movl 48(%ebp), %eax
+; X86-NEXT: movl 72(%ebp), %edi
+; X86-NEXT: movl 56(%ebp), %ecx
+; X86-NEXT: orl %edi, %ecx
+; X86-NEXT: orl 64(%ebp), %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: movl 68(%ebp), %ebx
+; X86-NEXT: movl 52(%ebp), %ecx
+; X86-NEXT: orl %ebx, %ecx
+; X86-NEXT: movl 44(%ebp), %edx
+; X86-NEXT: orl 60(%ebp), %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: sete %cl
+; X86-NEXT: movl 24(%ebp), %edx
+; X86-NEXT: orl 40(%ebp), %edx
+; X86-NEXT: movl 16(%ebp), %eax
+; X86-NEXT: orl 32(%ebp), %eax
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: movl 20(%ebp), %edx
+; X86-NEXT: orl 36(%ebp), %edx
+; X86-NEXT: movl 12(%ebp), %esi
+; X86-NEXT: orl 28(%ebp), %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: sete %ch
+; X86-NEXT: testl %edi, %edi
+; X86-NEXT: jne .LBB2_1
+; X86-NEXT: # %bb.2: # %_udiv-special-cases
+; X86-NEXT: bsrl %ebx, %eax
+; X86-NEXT: xorl $31, %eax
+; X86-NEXT: orl $32, %eax
+; X86-NEXT: jmp .LBB2_3
+; X86-NEXT: .LBB2_1:
+; X86-NEXT: bsrl %edi, %eax
+; X86-NEXT: xorl $31, %eax
+; X86-NEXT: .LBB2_3: # %_udiv-special-cases
+; X86-NEXT: movl 64(%ebp), %esi
+; X86-NEXT: testl %esi, %esi
+; X86-NEXT: movl 40(%ebp), %edx
+; X86-NEXT: jne .LBB2_4
+; X86-NEXT: # %bb.5: # %_udiv-special-cases
+; X86-NEXT: bsrl 60(%ebp), %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: orl $32, %esi
+; X86-NEXT: orl 72(%ebp), %ebx
+; X86-NEXT: je .LBB2_7
+; X86-NEXT: jmp .LBB2_8
+; X86-NEXT: .LBB2_4:
+; X86-NEXT: bsrl %esi, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: orl 72(%ebp), %ebx
+; X86-NEXT: jne .LBB2_8
+; X86-NEXT: .LBB2_7: # %_udiv-special-cases
+; X86-NEXT: orl $64, %esi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: .LBB2_8: # %_udiv-special-cases
+; X86-NEXT: movl 56(%ebp), %esi
+; X86-NEXT: testl %esi, %esi
+; X86-NEXT: movl 44(%ebp), %edi
+; X86-NEXT: movl 48(%ebp), %ebx
+; X86-NEXT: jne .LBB2_9
+; X86-NEXT: # %bb.10: # %_udiv-special-cases
+; X86-NEXT: bsrl 52(%ebp), %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: orl $32, %esi
+; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: je .LBB2_13
+; X86-NEXT: .LBB2_12:
+; X86-NEXT: bsrl %ebx, %edi
+; X86-NEXT: xorl $31, %edi
+; X86-NEXT: jmp .LBB2_14
+; X86-NEXT: .LBB2_9:
+; X86-NEXT: bsrl %esi, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: jne .LBB2_12
+; X86-NEXT: .LBB2_13: # %_udiv-special-cases
+; X86-NEXT: bsrl %edi, %edi
+; X86-NEXT: xorl $31, %edi
+; X86-NEXT: orl $32, %edi
+; X86-NEXT: .LBB2_14: # %_udiv-special-cases
+; X86-NEXT: movl 52(%ebp), %ebx
+; X86-NEXT: orl 56(%ebp), %ebx
+; X86-NEXT: jne .LBB2_16
+; X86-NEXT: # %bb.15: # %_udiv-special-cases
+; X86-NEXT: orl $64, %edi
+; X86-NEXT: movl %edi, %esi
+; X86-NEXT: .LBB2_16: # %_udiv-special-cases
+; X86-NEXT: movl 64(%ebp), %edi
+; X86-NEXT: orl 72(%ebp), %edi
+; X86-NEXT: movl 60(%ebp), %ebx
+; X86-NEXT: orl 68(%ebp), %ebx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: jne .LBB2_18
+; X86-NEXT: # %bb.17: # %_udiv-special-cases
+; X86-NEXT: orl $128, %esi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: .LBB2_18: # %_udiv-special-cases
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: movl 32(%ebp), %edi
+; X86-NEXT: movl 36(%ebp), %ebx
+; X86-NEXT: jne .LBB2_19
+; X86-NEXT: # %bb.20: # %_udiv-special-cases
+; X86-NEXT: bsrl %ebx, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: orl $32, %esi
+; X86-NEXT: testl %edi, %edi
+; X86-NEXT: je .LBB2_23
+; X86-NEXT: .LBB2_22:
+; X86-NEXT: bsrl %edi, %edi
+; X86-NEXT: xorl $31, %edi
+; X86-NEXT: orl %edx, %ebx
+; X86-NEXT: je .LBB2_25
+; X86-NEXT: jmp .LBB2_26
+; X86-NEXT: .LBB2_19:
+; X86-NEXT: bsrl %edx, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: testl %edi, %edi
+; X86-NEXT: jne .LBB2_22
+; X86-NEXT: .LBB2_23: # %_udiv-special-cases
+; X86-NEXT: bsrl 28(%ebp), %edi
+; X86-NEXT: xorl $31, %edi
+; X86-NEXT: orl $32, %edi
+; X86-NEXT: orl %edx, %ebx
+; X86-NEXT: jne .LBB2_26
+; X86-NEXT: .LBB2_25: # %_udiv-special-cases
+; X86-NEXT: orl $64, %edi
+; X86-NEXT: movl %edi, %esi
+; X86-NEXT: .LBB2_26: # %_udiv-special-cases
+; X86-NEXT: movl 24(%ebp), %edx
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: jne .LBB2_27
+; X86-NEXT: # %bb.28: # %_udiv-special-cases
+; X86-NEXT: bsrl 20(%ebp), %edi
+; X86-NEXT: xorl $31, %edi
+; X86-NEXT: orl $32, %edi
+; X86-NEXT: jmp .LBB2_29
+; X86-NEXT: .LBB2_27:
+; X86-NEXT: bsrl %edx, %edi
+; X86-NEXT: xorl $31, %edi
+; X86-NEXT: .LBB2_29: # %_udiv-special-cases
+; X86-NEXT: movl 16(%ebp), %edx
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: jne .LBB2_30
+; X86-NEXT: # %bb.31: # %_udiv-special-cases
+; X86-NEXT: bsrl 12(%ebp), %ebx
+; X86-NEXT: xorl $31, %ebx
+; X86-NEXT: orl $32, %ebx
+; X86-NEXT: jmp .LBB2_32
+; X86-NEXT: .LBB2_30:
+; X86-NEXT: bsrl %edx, %ebx
+; X86-NEXT: xorl $31, %ebx
+; X86-NEXT: .LBB2_32: # %_udiv-special-cases
+; X86-NEXT: movl 20(%ebp), %edx
+; X86-NEXT: orl 24(%ebp), %edx
+; X86-NEXT: jne .LBB2_34
+; X86-NEXT: # %bb.33: # %_udiv-special-cases
+; X86-NEXT: orl $64, %ebx
+; X86-NEXT: movl %ebx, %edi
+; X86-NEXT: .LBB2_34: # %_udiv-special-cases
+; X86-NEXT: orb %ch, %cl
+; X86-NEXT: movl 32(%ebp), %edx
+; X86-NEXT: orl 40(%ebp), %edx
+; X86-NEXT: movl 28(%ebp), %ebx
+; X86-NEXT: orl 36(%ebp), %ebx
+; X86-NEXT: orl %edx, %ebx
+; X86-NEXT: jne .LBB2_36
+; X86-NEXT: # %bb.35: # %_udiv-special-cases
+; X86-NEXT: orl $128, %edi
+; X86-NEXT: movl %edi, %esi
+; X86-NEXT: .LBB2_36: # %_udiv-special-cases
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: subl %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: sbbl %edx, %edx
+; X86-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT: movl $0, %esi
+; X86-NEXT: sbbl %esi, %esi
+; X86-NEXT: movl $0, %edi
+; X86-NEXT: sbbl %edi, %edi
+; X86-NEXT: movl $0, %ebx
+; X86-NEXT: sbbl %ebx, %ebx
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: sbbl %edx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: sbbl %edx, %edx
+; X86-NEXT: testb %cl, %cl
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: jne .LBB2_37
+; X86-NEXT: # %bb.38: # %select.false.sink
+; X86-NEXT: movl $255, %ecx
+; X86-NEXT: cmpl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl (%esp), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %esi, %ecx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %edi, %ecx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %ebx, %ecx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %eax, %ecx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: sbbl %edi, %ecx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %edx, %ecx
+; X86-NEXT: setb %cl
+; X86-NEXT: movl 24(%ebp), %esi
+; X86-NEXT: movl 32(%ebp), %edx
+; X86-NEXT: movl 36(%ebp), %ebx
+; X86-NEXT: .LBB2_39: # %select.end
+; X86-NEXT: testb %cl, %cl
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: jne .LBB2_41
+; X86-NEXT: # %bb.40: # %select.end
+; X86-NEXT: movl 16(%ebp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 20(%ebp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 28(%ebp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: movl 40(%ebp), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: .LBB2_41: # %select.end
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: jne .LBB2_42
+; X86-NEXT: # %bb.48: # %select.end
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: xorl $255, %ecx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: je .LBB2_49
+; X86-NEXT: # %bb.46: # %udiv-bb1
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 16(%ebp), %ecx
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 20(%ebp), %ecx
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 24(%ebp), %edi
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 28(%ebp), %ecx
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 32(%ebp), %ebx
+; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 36(%ebp), %ecx
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 40(%ebp), %esi
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: notb %cl
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: shrb $3, %dl
+; X86-NEXT: andb $28, %dl
+; X86-NEXT: negb %dl
+; X86-NEXT: movsbl %dl, %edx
+; X86-NEXT: movl 264(%esp,%edx), %esi
+; X86-NEXT: movl 268(%esp,%edx), %edi
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 260(%esp,%edx), %edi
+; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 256(%esp,%edx), %esi
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 252(%esp,%edx), %edi
+; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 248(%esp,%edx), %esi
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 240(%esp,%edx), %edi
+; X86-NEXT: movl 244(%esp,%edx), %edx
+; X86-NEXT: shldl %cl, %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shll %cl, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: addl $1, %ecx
+; X86-NEXT: adcl $0, (%esp) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: jb .LBB2_47
+; X86-NEXT: # %bb.43: # %udiv-preheader
+; X86-NEXT: movl 12(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 16(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 20(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 24(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 28(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 32(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 40(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $5, %al
+; X86-NEXT: movzbl %al, %esi
+; X86-NEXT: movl 172(%esp,%esi,4), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 168(%esp,%esi,4), %edi
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shrdl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 164(%esp,%esi,4), %edx
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: shrdl %cl, %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 160(%esp,%esi,4), %eax
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: shrdl %cl, %edx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 156(%esp,%esi,4), %edx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: shrdl %cl, %eax, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 152(%esp,%esi,4), %eax
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: shrdl %cl, %edx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 144(%esp,%esi,4), %edi
+; X86-NEXT: movl 148(%esp,%esi,4), %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: shrdl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shrdl %cl, %edx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 44(%ebp), %eax
+; X86-NEXT: addl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 48(%ebp), %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 52(%ebp), %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 56(%ebp), %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 60(%ebp), %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 64(%ebp), %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 68(%ebp), %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 72(%ebp), %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: .p2align 4
+; X86-NEXT: .LBB2_44: # %udiv-do-while
+; X86-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $1, %edi, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $1, %edi, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $1, %edi, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl $1, %ebx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl $1, %ebx, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %ebx
+; X86-NEXT: orl %edx, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl $1, %ebx, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $1, %edi, %ebx
+; X86-NEXT: orl %edx, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl $1, %ebx, %edi
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl $1, %ecx, %ebx
+; X86-NEXT: orl %edx, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl $1, %eax, %ecx
+; X86-NEXT: orl %edx, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl %eax, %eax
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: sarl $31, %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: andl $1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: andl 72(%ebp), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: andl 68(%ebp), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: andl 64(%ebp), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: andl 60(%ebp), %edi
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: andl 56(%ebp), %esi
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: andl 52(%ebp), %edx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: andl 48(%ebp), %eax
+; X86-NEXT: andl 44(%ebp), %ecx
+; X86-NEXT: subl %ecx, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: sbbl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: sbbl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: sbbl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: addl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-NEXT: adcl $-1, %edx
+; X86-NEXT: adcl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: adcl $-1, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: adcl $-1, %edi
+; X86-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edi, %ecx
+; X86-NEXT: orl %edx, %ecx
+; X86-NEXT: movl %ebx, %edi
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %esi, %edx
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: jne .LBB2_44
+; X86-NEXT: .LBB2_45: # %udiv-loop-exit
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl $1, %ecx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl $1, %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: leal (%edx,%eax,2), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: .LBB2_49: # %udiv-end
+; X86-NEXT: movl 52(%ebp), %ebx
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 56(%ebp), %eax
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %esi, %ebx
+; X86-NEXT: setb %cl
+; X86-NEXT: movl 56(%ebp), %eax
+; X86-NEXT: mull %edi
+; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movzbl %cl, %eax
+; X86-NEXT: adcl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: movl 44(%ebp), %edi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 48(%ebp), %eax
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: addl %esi, %ecx
+; X86-NEXT: adcl $0, %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: mull %ebx
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %esi, %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: setb %cl
+; X86-NEXT: movl 48(%ebp), %eax
+; X86-NEXT: mull %ebx
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: movzbl %cl, %ecx
+; X86-NEXT: adcl %ecx, %edx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 48(%ebp), %eax
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: addl %esi, %ecx
+; X86-NEXT: adcl $0, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: addl %ecx, %esi
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: movl 48(%ebp), %eax
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: addl (%esp), %ecx # 4-byte Folded Reload
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT: adcl %eax, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: adcl $0, %edx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: movl 52(%ebp), %esi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 56(%ebp), %eax
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: addl (%esp), %ecx # 4-byte Folded Reload
+; X86-NEXT: adcl $0, %edx
+; X86-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull %edi
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: adcl (%esp), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: setb (%esp) # 1-byte Folded Spill
+; X86-NEXT: movl 56(%ebp), %eax
+; X86-NEXT: mull %edi
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload
+; X86-NEXT: adcl %eax, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT: adcl %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl $0, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 60(%ebp), %eax
+; X86-NEXT: imull %eax, %edi
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl %edi, %edx
+; X86-NEXT: movl 64(%ebp), %edi
+; X86-NEXT: imull %edi, %ebx
+; X86-NEXT: addl %edx, %ebx
+; X86-NEXT: movl 68(%ebp), %eax
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: imull %edi, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: mull %esi
+; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: movl 72(%ebp), %ecx
+; X86-NEXT: imull %esi, %ecx
+; X86-NEXT: addl %edx, %ecx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: adcl %ebx, %ecx
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: movl 60(%ebp), %esi
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull 64(%ebp)
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %edi, %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: setb %bl
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: mull 64(%ebp)
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: movzbl %bl, %esi
+; X86-NEXT: adcl %esi, %edx
+; X86-NEXT: addl (%esp), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: adcl %ecx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 52(%ebp), %esi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: addl %edx, %esi
+; X86-NEXT: movl 56(%ebp), %ecx
+; X86-NEXT: imull %edi, %ecx
+; X86-NEXT: addl %esi, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 44(%ebp), %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: imull %ecx, %esi
+; X86-NEXT: addl %edx, %esi
+; X86-NEXT: movl 48(%ebp), %ecx
+; X86-NEXT: imull %ecx, %ebx
+; X86-NEXT: addl %esi, %ebx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl 44(%ebp), %esi
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: addl %ecx, %esi
+; X86-NEXT: adcl $0, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl 48(%ebp), %ecx
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: addl %esi, %ecx
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: mull 48(%ebp)
+; X86-NEXT: addl %edi, %eax
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X86-NEXT: adcl %esi, %edx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: adcl %ebx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: adcl (%esp), %eax # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 12(%ebp), %esi
+; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 16(%ebp), %ebx
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 20(%ebp), %ebx
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 24(%ebp), %esi
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 28(%ebp), %esi
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl 32(%ebp), %edi
+; X86-NEXT: sbbl %ecx, %edi
+; X86-NEXT: movl 36(%ebp), %ecx
+; X86-NEXT: sbbl %eax, %ecx
+; X86-NEXT: movl 40(%ebp), %eax
+; X86-NEXT: sbbl %edx, %eax
+; X86-NEXT: movl 8(%ebp), %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, (%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, 4(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, 8(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, 12(%edx)
+; X86-NEXT: movl %esi, 16(%edx)
+; X86-NEXT: movl %edi, 20(%edx)
+; X86-NEXT: movl %ecx, 24(%edx)
+; X86-NEXT: movl %eax, 28(%edx)
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl $4
+; X86-NEXT: .LBB2_37:
+; X86-NEXT: movb $1, %cl
+; X86-NEXT: movl 24(%ebp), %esi
+; X86-NEXT: movl 32(%ebp), %edx
+; X86-NEXT: movl 36(%ebp), %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: jmp .LBB2_39
+; X86-NEXT: .LBB2_47:
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: jmp .LBB2_45
+; X86-NEXT: .LBB2_42:
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: jmp .LBB2_49
+;
+; X64-LABEL: urem256:
+; X64: # %bb.0: # %_udiv-special-cases
+; X64-NEXT: pushq %rbp
+; X64-NEXT: movq %rsp, %rbp
+; X64-NEXT: pushq %r15
+; X64-NEXT: pushq %r14
+; X64-NEXT: pushq %r13
+; X64-NEXT: pushq %r12
+; X64-NEXT: pushq %rbx
+; X64-NEXT: andq $-32, %rsp
+; X64-NEXT: subq $288, %rsp # imm = 0x120
+; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq 24(%rbp), %r14
+; X64-NEXT: movq 32(%rbp), %r15
+; X64-NEXT: movq 16(%rbp), %rbx
+; X64-NEXT: movq %rbx, %rax
+; X64-NEXT: orq %r15, %rax
+; X64-NEXT: movq %r9, %rdi
+; X64-NEXT: orq %r14, %rdi
+; X64-NEXT: orq %rax, %rdi
+; X64-NEXT: sete %dil
+; X64-NEXT: movq %rdx, %rax
+; X64-NEXT: orq %r8, %rax
+; X64-NEXT: movq %rsi, %r10
+; X64-NEXT: orq %rcx, %r10
+; X64-NEXT: orq %rax, %r10
+; X64-NEXT: sete %al
+; X64-NEXT: orb %dil, %al
+; X64-NEXT: bsrq %r15, %rdi
+; X64-NEXT: xorq $63, %rdi
+; X64-NEXT: bsrq %r14, %r11
+; X64-NEXT: xorq $63, %r11
+; X64-NEXT: orq $64, %r11
+; X64-NEXT: testq %r15, %r15
+; X64-NEXT: cmovneq %rdi, %r11
+; X64-NEXT: bsrq %rbx, %rdi
+; X64-NEXT: xorq $63, %rdi
+; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: bsrq %r9, %r10
+; X64-NEXT: xorq $63, %r10
+; X64-NEXT: orq $64, %r10
+; X64-NEXT: testq %rbx, %rbx
+; X64-NEXT: cmovneq %rdi, %r10
+; X64-NEXT: orq $128, %r10
+; X64-NEXT: movq %r14, %rdi
+; X64-NEXT: orq %r15, %rdi
+; X64-NEXT: cmovneq %r11, %r10
+; X64-NEXT: bsrq %r8, %rdi
+; X64-NEXT: xorq $63, %rdi
+; X64-NEXT: bsrq %rcx, %r11
+; X64-NEXT: xorq $63, %r11
+; X64-NEXT: orq $64, %r11
+; X64-NEXT: testq %r8, %r8
+; X64-NEXT: cmovneq %rdi, %r11
+; X64-NEXT: bsrq %rdx, %r9
+; X64-NEXT: xorq $63, %r9
+; X64-NEXT: bsrq %rsi, %rdi
+; X64-NEXT: xorq $63, %rdi
+; X64-NEXT: orq $64, %rdi
+; X64-NEXT: testq %rdx, %rdx
+; X64-NEXT: cmovneq %r9, %rdi
+; X64-NEXT: orq $128, %rdi
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: orq %r8, %rcx
+; X64-NEXT: cmovneq %r11, %rdi
+; X64-NEXT: xorl %ebx, %ebx
+; X64-NEXT: subq %rdi, %r10
+; X64-NEXT: movl $0, %r12d
+; X64-NEXT: sbbq %r12, %r12
+; X64-NEXT: movl $0, %r14d
+; X64-NEXT: sbbq %r14, %r14
+; X64-NEXT: sbbq %rbx, %rbx
+; X64-NEXT: testb %al, %al
+; X64-NEXT: jne .LBB2_1
+; X64-NEXT: # %bb.2: # %select.false.sink
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: movl $255, %ecx
+; X64-NEXT: cmpq %r10, %rcx
+; X64-NEXT: movl $0, %ecx
+; X64-NEXT: sbbq %r12, %rcx
+; X64-NEXT: movl $0, %ecx
+; X64-NEXT: sbbq %r14, %rcx
+; X64-NEXT: sbbq %rbx, %rax
+; X64-NEXT: setb %al
+; X64-NEXT: .LBB2_3: # %select.end
+; X64-NEXT: xorl %r11d, %r11d
+; X64-NEXT: testb %al, %al
+; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %rdx, %rdi
+; X64-NEXT: cmovneq %r11, %rdi
+; X64-NEXT: movq %rsi, %rcx
+; X64-NEXT: cmovneq %r11, %rcx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT: cmovneq %r11, %rax
+; X64-NEXT: movq %rax, (%rsp) # 8-byte Spill
+; X64-NEXT: cmoveq %r8, %r11
+; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: jne .LBB2_4
+; X64-NEXT: # %bb.10: # %select.end
+; X64-NEXT: movq %r10, %rdx
+; X64-NEXT: xorq $255, %rdx
+; X64-NEXT: orq %r14, %rdx
+; X64-NEXT: movq %r12, %rsi
+; X64-NEXT: orq %rbx, %rsi
+; X64-NEXT: orq %rdx, %rsi
+; X64-NEXT: movq %r8, %rdx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT: movq (%rsp), %rax # 8-byte Reload
+; X64-NEXT: je .LBB2_11
+; X64-NEXT: # %bb.8: # %udiv-bb1
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NEXT: xorps %xmm0, %xmm0
+; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; X64-NEXT: movl %r10d, %ecx
+; X64-NEXT: notb %cl
+; X64-NEXT: movl %ecx, %eax
+; X64-NEXT: shrb $3, %al
+; X64-NEXT: andb $24, %al
+; X64-NEXT: negb %al
+; X64-NEXT: movsbq %al, %rdx
+; X64-NEXT: movq 240(%rsp,%rdx), %r11
+; X64-NEXT: movq 248(%rsp,%rdx), %rax
+; X64-NEXT: shldq %cl, %r11, %rax
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r8, %rsi
+; X64-NEXT: movq 224(%rsp,%rdx), %r8
+; X64-NEXT: movq 232(%rsp,%rdx), %r9
+; X64-NEXT: shldq %cl, %r9, %r11
+; X64-NEXT: movq %r11, %rax
+; X64-NEXT: shldq %cl, %r8, %r9
+; X64-NEXT: shlq %cl, %r8
+; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: addq $1, %r10
+; X64-NEXT: adcq $0, %r12
+; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq $0, %r14
+; X64-NEXT: adcq $0, %rbx
+; X64-NEXT: movq %rbx, (%rsp) # 8-byte Spill
+; X64-NEXT: jb .LBB2_9
+; X64-NEXT: # %bb.5: # %udiv-preheader
+; X64-NEXT: movq %r14, %r12
+; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %rsi, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NEXT: movl %r10d, %ecx
+; X64-NEXT: shrb $6, %cl
+; X64-NEXT: movzbl %cl, %edx
+; X64-NEXT: movq 152(%rsp,%rdx,8), %rsi
+; X64-NEXT: movq 144(%rsp,%rdx,8), %rdi
+; X64-NEXT: movq %rdi, %r8
+; X64-NEXT: movl %r10d, %ecx
+; X64-NEXT: shrdq %cl, %rsi, %r8
+; X64-NEXT: movq %r10, %rcx
+; X64-NEXT: movq 128(%rsp,%rdx,8), %r10
+; X64-NEXT: movq 136(%rsp,%rdx,8), %rdx
+; X64-NEXT: movq %rdx, %rbx
+; X64-NEXT: shrdq %cl, %rdi, %rbx
+; X64-NEXT: shrq %cl, %rsi
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-NEXT: shrdq %cl, %rdx, %r10
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: addq $-1, %rcx
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq 16(%rbp), %rcx
+; X64-NEXT: adcq $-1, %rcx
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq 24(%rbp), %rcx
+; X64-NEXT: adcq $-1, %rcx
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r15, %rcx
+; X64-NEXT: adcq $-1, %rcx
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: xorl %edi, %edi
+; X64-NEXT: xorl %edx, %edx
+; X64-NEXT: xorl %r14d, %r14d
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; X64-NEXT: .p2align 4
+; X64-NEXT: .LBB2_6: # %udiv-do-while
+; X64-NEXT: # =>This Inner Loop Header: Depth=1
+; X64-NEXT: shldq $1, %r8, %rsi
+; X64-NEXT: shldq $1, %rbx, %r8
+; X64-NEXT: shldq $1, %r10, %rbx
+; X64-NEXT: shldq $1, %r11, %r10
+; X64-NEXT: shldq $1, %rax, %r11
+; X64-NEXT: orq %r14, %r11
+; X64-NEXT: shldq $1, %r9, %rax
+; X64-NEXT: orq %rdx, %rax
+; X64-NEXT: shldq $1, %r12, %r9
+; X64-NEXT: orq %rdi, %r9
+; X64-NEXT: addq %r12, %r12
+; X64-NEXT: orq %rcx, %r12
+; X64-NEXT: cmpq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: sbbq %rbx, %rcx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: sbbq %r8, %rcx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT: sbbq %rsi, %rdx
+; X64-NEXT: sarq $63, %rdx
+; X64-NEXT: movl %edx, %ecx
+; X64-NEXT: andl $1, %ecx
+; X64-NEXT: movq %rdx, %rdi
+; X64-NEXT: andq %r15, %rdi
+; X64-NEXT: movq %rdx, %r14
+; X64-NEXT: movq 24(%rbp), %r15
+; X64-NEXT: andq %r15, %r14
+; X64-NEXT: movq %rdx, %r13
+; X64-NEXT: andq 16(%rbp), %r13
+; X64-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
+; X64-NEXT: subq %rdx, %r10
+; X64-NEXT: sbbq %r13, %rbx
+; X64-NEXT: movq 32(%rbp), %r15
+; X64-NEXT: sbbq %r14, %r8
+; X64-NEXT: sbbq %rdi, %rsi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT: addq $-1, %rdi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT: adcq $-1, %rdx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT: adcq $-1, %r14
+; X64-NEXT: movq (%rsp), %r13 # 8-byte Reload
+; X64-NEXT: adcq $-1, %r13
+; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r13, (%rsp) # 8-byte Spill
+; X64-NEXT: orq %r13, %rdx
+; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: orq %r14, %rdi
+; X64-NEXT: orq %rdx, %rdi
+; X64-NEXT: movl $0, %edi
+; X64-NEXT: movl $0, %edx
+; X64-NEXT: movl $0, %r14d
+; X64-NEXT: jne .LBB2_6
+; X64-NEXT: .LBB2_7: # %udiv-loop-exit
+; X64-NEXT: shldq $1, %rax, %r11
+; X64-NEXT: shldq $1, %r9, %rax
+; X64-NEXT: shldq $1, %r12, %r9
+; X64-NEXT: leaq (%rcx,%r12,2), %rcx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT: movq %r9, %rdi
+; X64-NEXT: .LBB2_11: # %udiv-end
+; X64-NEXT: movq 16(%rbp), %r10
+; X64-NEXT: movq %r10, %rsi
+; X64-NEXT: imulq %rax, %rsi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; X64-NEXT: mulq %rbx
+; X64-NEXT: movq %rdi, %r9
+; X64-NEXT: movq %r15, %rdi
+; X64-NEXT: movq %rax, %r15
+; X64-NEXT: addq %rsi, %rdx
+; X64-NEXT: imulq %rbx, %r11
+; X64-NEXT: addq %rdx, %r11
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: movq 24(%rbp), %rsi
+; X64-NEXT: mulq %rsi
+; X64-NEXT: movq %rax, %r14
+; X64-NEXT: imulq %rcx, %rdi
+; X64-NEXT: addq %rdx, %rdi
+; X64-NEXT: imulq %r9, %rsi
+; X64-NEXT: addq %rdi, %rsi
+; X64-NEXT: addq %r15, %r14
+; X64-NEXT: adcq %r11, %rsi
+; X64-NEXT: movq %rbx, %rax
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rdx, %r12
+; X64-NEXT: movq %rax, %r11
+; X64-NEXT: movq %r10, %rax
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rsi, %r15
+; X64-NEXT: movq %rdx, %r13
+; X64-NEXT: movq %rax, %rdi
+; X64-NEXT: addq %r12, %rdi
+; X64-NEXT: adcq $0, %r13
+; X64-NEXT: movq %rbx, %rax
+; X64-NEXT: mulq %r9
+; X64-NEXT: movq %rdx, %rbx
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: addq %rdi, %rcx
+; X64-NEXT: adcq %r13, %rbx
+; X64-NEXT: setb %sil
+; X64-NEXT: movq %r10, %rax
+; X64-NEXT: mulq %r9
+; X64-NEXT: addq %rbx, %rax
+; X64-NEXT: movzbl %sil, %esi
+; X64-NEXT: adcq %rsi, %rdx
+; X64-NEXT: addq %r14, %rax
+; X64-NEXT: adcq %r15, %rdx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT: subq %r11, %rsi
+; X64-NEXT: sbbq %rcx, %r8
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT: sbbq %rax, %rdi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: sbbq %rdx, %rcx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT: movq %rsi, (%rax)
+; X64-NEXT: movq %r8, 8(%rax)
+; X64-NEXT: movq %rdi, 16(%rax)
+; X64-NEXT: movq %rcx, 24(%rax)
+; X64-NEXT: leaq -40(%rbp), %rsp
+; X64-NEXT: popq %rbx
+; X64-NEXT: popq %r12
+; X64-NEXT: popq %r13
+; X64-NEXT: popq %r14
+; X64-NEXT: popq %r15
+; X64-NEXT: popq %rbp
+; X64-NEXT: retq
+; X64-NEXT: .LBB2_1:
+; X64-NEXT: movb $1, %al
+; X64-NEXT: jmp .LBB2_3
+; X64-NEXT: .LBB2_9:
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; X64-NEXT: jmp .LBB2_7
+; X64-NEXT: .LBB2_4:
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT: movq (%rsp), %rax # 8-byte Reload
+; X64-NEXT: jmp .LBB2_11
+ %r = urem i256 %a, %b
+ ret i256 %r
+}
+
+define i256 @srem256(i256 %a, i256 %b) nounwind {
+; X86-LABEL: srem256:
+; X86: # %bb.0: # %_udiv-special-cases
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $368, %esp # imm = 0x170
+; X86-NEXT: movl 40(%ebp), %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: sarl $31, %eax
+; X86-NEXT: xorl %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 36(%ebp), %ecx
+; X86-NEXT: xorl %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%ebp), %ecx
+; X86-NEXT: xorl %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 28(%ebp), %esi
+; X86-NEXT: xorl %eax, %esi
+; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: movl 24(%ebp), %edi
+; X86-NEXT: xorl %eax, %edi
+; X86-NEXT: movl 20(%ebp), %esi
+; X86-NEXT: xorl %eax, %esi
+; X86-NEXT: movl 16(%ebp), %ecx
+; X86-NEXT: xorl %eax, %ecx
+; X86-NEXT: movl 12(%ebp), %edx
+; X86-NEXT: xorl %eax, %edx
+; X86-NEXT: subl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %eax, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %eax, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl 72(%ebp), %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: sarl $31, %eax
+; X86-NEXT: xorl %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 68(%ebp), %ecx
+; X86-NEXT: xorl %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 64(%ebp), %ecx
+; X86-NEXT: xorl %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 60(%ebp), %ecx
+; X86-NEXT: xorl %eax, %ecx
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: movl 56(%ebp), %edi
+; X86-NEXT: xorl %eax, %edi
+; X86-NEXT: movl 52(%ebp), %ebx
+; X86-NEXT: xorl %eax, %ebx
+; X86-NEXT: movl 48(%ebp), %ecx
+; X86-NEXT: xorl %eax, %ecx
+; X86-NEXT: movl 44(%ebp), %esi
+; X86-NEXT: xorl %eax, %esi
+; X86-NEXT: subl %eax, %esi
+; X86-NEXT: sbbl %eax, %ecx
+; X86-NEXT: sbbl %eax, %ebx
+; X86-NEXT: sbbl %eax, %edi
+; X86-NEXT: sbbl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: sbbl %eax, %edx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: orl %edx, %ecx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: orl %esi, %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: testl %edi, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: jne .LBB3_1
+; X86-NEXT: # %bb.2: # %_udiv-special-cases
+; X86-NEXT: movl %ebx, %edi
+; X86-NEXT: bsrl %ebx, %eax
+; X86-NEXT: xorl $31, %eax
+; X86-NEXT: orl $32, %eax
+; X86-NEXT: jmp .LBB3_3
+; X86-NEXT: .LBB3_1:
+; X86-NEXT: bsrl %edi, %eax
+; X86-NEXT: xorl $31, %eax
+; X86-NEXT: movl %ebx, %edi
+; X86-NEXT: .LBB3_3: # %_udiv-special-cases
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: testl %esi, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: jne .LBB3_4
+; X86-NEXT: # %bb.5: # %_udiv-special-cases
+; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: orl $32, %esi
+; X86-NEXT: jmp .LBB3_6
+; X86-NEXT: .LBB3_4:
+; X86-NEXT: bsrl %esi, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: .LBB3_6: # %_udiv-special-cases
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: jne .LBB3_8
+; X86-NEXT: # %bb.7: # %_udiv-special-cases
+; X86-NEXT: orl $64, %esi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: .LBB3_8: # %_udiv-special-cases
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: testl %esi, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: jne .LBB3_9
+; X86-NEXT: # %bb.10: # %_udiv-special-cases
+; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: orl $32, %esi
+; X86-NEXT: testl %edi, %edi
+; X86-NEXT: je .LBB3_13
+; X86-NEXT: .LBB3_12:
+; X86-NEXT: bsrl %edi, %edi
+; X86-NEXT: xorl $31, %edi
+; X86-NEXT: jmp .LBB3_14
+; X86-NEXT: .LBB3_9:
+; X86-NEXT: bsrl %esi, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: testl %edi, %edi
+; X86-NEXT: jne .LBB3_12
+; X86-NEXT: .LBB3_13: # %_udiv-special-cases
+; X86-NEXT: bsrl %ebx, %edi
+; X86-NEXT: xorl $31, %edi
+; X86-NEXT: orl $32, %edi
+; X86-NEXT: .LBB3_14: # %_udiv-special-cases
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: jne .LBB3_16
+; X86-NEXT: # %bb.15: # %_udiv-special-cases
+; X86-NEXT: orl $64, %edi
+; X86-NEXT: movl %edi, %esi
+; X86-NEXT: .LBB3_16: # %_udiv-special-cases
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: jne .LBB3_18
+; X86-NEXT: # %bb.17: # %_udiv-special-cases
+; X86-NEXT: orl $128, %esi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: .LBB3_18: # %_udiv-special-cases
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: jne .LBB3_19
+; X86-NEXT: # %bb.20: # %_udiv-special-cases
+; X86-NEXT: bsrl %ebx, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: orl $32, %esi
+; X86-NEXT: jmp .LBB3_21
+; X86-NEXT: .LBB3_19:
+; X86-NEXT: bsrl %edx, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: .LBB3_21: # %_udiv-special-cases
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: testl %edi, %edi
+; X86-NEXT: jne .LBB3_22
+; X86-NEXT: # %bb.23: # %_udiv-special-cases
+; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: xorl $31, %edi
+; X86-NEXT: orl $32, %edi
+; X86-NEXT: orl %edx, %ebx
+; X86-NEXT: je .LBB3_25
+; X86-NEXT: jmp .LBB3_26
+; X86-NEXT: .LBB3_22:
+; X86-NEXT: bsrl %edi, %edi
+; X86-NEXT: xorl $31, %edi
+; X86-NEXT: orl %edx, %ebx
+; X86-NEXT: jne .LBB3_26
+; X86-NEXT: .LBB3_25: # %_udiv-special-cases
+; X86-NEXT: orl $64, %edi
+; X86-NEXT: movl %edi, %esi
+; X86-NEXT: .LBB3_26: # %_udiv-special-cases
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: jne .LBB3_27
+; X86-NEXT: # %bb.28: # %_udiv-special-cases
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: bsrl %edx, %edi
+; X86-NEXT: xorl $31, %edi
+; X86-NEXT: orl $32, %edi
+; X86-NEXT: testl %ecx, %ecx
+; X86-NEXT: je .LBB3_31
+; X86-NEXT: .LBB3_30:
+; X86-NEXT: bsrl %ecx, %ebx
+; X86-NEXT: xorl $31, %ebx
+; X86-NEXT: jmp .LBB3_32
+; X86-NEXT: .LBB3_27:
+; X86-NEXT: bsrl %edx, %edi
+; X86-NEXT: xorl $31, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: testl %ecx, %ecx
+; X86-NEXT: jne .LBB3_30
+; X86-NEXT: .LBB3_31: # %_udiv-special-cases
+; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: xorl $31, %ebx
+; X86-NEXT: orl $32, %ebx
+; X86-NEXT: .LBB3_32: # %_udiv-special-cases
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: jne .LBB3_34
+; X86-NEXT: # %bb.33: # %_udiv-special-cases
+; X86-NEXT: orl $64, %ebx
+; X86-NEXT: movl %ebx, %edi
+; X86-NEXT: .LBB3_34: # %_udiv-special-cases
+; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
+; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: orl %edx, %ebx
+; X86-NEXT: jne .LBB3_36
+; X86-NEXT: # %bb.35: # %_udiv-special-cases
+; X86-NEXT: orl $128, %edi
+; X86-NEXT: movl %edi, %esi
+; X86-NEXT: .LBB3_36: # %_udiv-special-cases
+; X86-NEXT: subl %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: sbbl %edx, %edx
+; X86-NEXT: movl $0, %esi
+; X86-NEXT: sbbl %esi, %esi
+; X86-NEXT: movl $0, %edi
+; X86-NEXT: sbbl %edi, %edi
+; X86-NEXT: movl $0, %ebx
+; X86-NEXT: sbbl %ebx, %ebx
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: jne .LBB3_37
+; X86-NEXT: # %bb.38: # %select.false.sink
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $255, %ecx
+; X86-NEXT: cmpl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %edx, %ecx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %esi, %ecx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %edi, %ecx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %ebx, %ecx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: sbbl %edx, %ecx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: sbbl %ebx, %ecx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %eax, %ecx
+; X86-NEXT: setb %cl
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: .LBB3_39: # %select.end
+; X86-NEXT: testb %cl, %cl
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: jne .LBB3_41
+; X86-NEXT: # %bb.40: # %select.end
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: .LBB3_41: # %select.end
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: jne .LBB3_42
+; X86-NEXT: # %bb.48: # %select.end
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: xorl $255, %ecx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: je .LBB3_49
+; X86-NEXT: # %bb.46: # %udiv-bb1
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: notb %cl
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: shrb $3, %dl
+; X86-NEXT: andb $28, %dl
+; X86-NEXT: negb %dl
+; X86-NEXT: movsbl %dl, %edx
+; X86-NEXT: movl 344(%esp,%edx), %esi
+; X86-NEXT: movl 348(%esp,%edx), %edi
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 340(%esp,%edx), %edi
+; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 336(%esp,%edx), %esi
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 332(%esp,%edx), %edi
+; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 328(%esp,%edx), %esi
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 320(%esp,%edx), %edi
+; X86-NEXT: movl 324(%esp,%edx), %edx
+; X86-NEXT: shldl %cl, %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shll %cl, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: addl $1, %ecx
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: jb .LBB3_47
+; X86-NEXT: # %bb.43: # %udiv-preheader
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $5, %al
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: movl 252(%esp,%eax,4), %esi
+; X86-NEXT: movl 248(%esp,%eax,4), %edi
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shrdl %cl, %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 244(%esp,%eax,4), %ebx
+; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: shrdl %cl, %edi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 240(%esp,%eax,4), %edi
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: shrdl %cl, %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 236(%esp,%eax,4), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shrdl %cl, %edi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 232(%esp,%eax,4), %ebx
+; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shrdl %cl, %edi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 224(%esp,%eax,4), %edi
+; X86-NEXT: movl 228(%esp,%eax,4), %eax
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: shrdl %cl, %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shrl %cl, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shrdl %cl, %eax, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: addl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: xorl %ebx, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: .p2align 4
+; X86-NEXT: .LBB3_44: # %udiv-do-while
+; X86-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $1, %edi, %esi
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %edi
+; X86-NEXT: shldl $1, %ecx, %esi
+; X86-NEXT: orl %ebx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %ecx
+; X86-NEXT: orl %ebx, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %esi
+; X86-NEXT: orl %ebx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl $1, %eax, %ecx
+; X86-NEXT: orl %ebx, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %eax
+; X86-NEXT: orl %ebx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %ecx
+; X86-NEXT: orl %ebx, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %esi
+; X86-NEXT: orl %ebx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl %eax, %eax
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmpl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl %edx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: sarl $31, %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: andl $1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: subl %ecx, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: adcl $-1, %esi
+; X86-NEXT: adcl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: adcl $-1, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: adcl $-1, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %ebx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: xorl %ebx, %ebx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: jne .LBB3_44
+; X86-NEXT: .LBB3_45: # %udiv-loop-exit
+; X86-NEXT: shldl $1, %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl $1, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $1, %edi, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: leal (%edi,%esi,2), %edi
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: .LBB3_49: # %udiv-end
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: addl %esi, %ecx
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %edi, %ebx
+; X86-NEXT: setb %cl
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: mull %esi
+; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movzbl %cl, %eax
+; X86-NEXT: adcl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: addl %ebx, %ecx
+; X86-NEXT: adcl $0, %edx
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: mull %esi
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %ebx, %edx
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: mull %esi
+; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X86-NEXT: adcl %esi, %edx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: adcl $0, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: mull %edi
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edi, %esi
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT: adcl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl $0, %eax
+; X86-NEXT: adcl $0, %edx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: adcl $0, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull %esi
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl %esi, %edi
+; X86-NEXT: mull %esi
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT: adcl %eax, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT: adcl %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl $0, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: imull %eax, %ecx
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: addl %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: imull %edi, %ebx
+; X86-NEXT: addl %edx, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl %esi, %ebx
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl %edi, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: adcl $0, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: mull %esi
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull %esi
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT: adcl %ecx, %edx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: addl %edx, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: imull %ecx, %edx
+; X86-NEXT: addl %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: mull %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: imull %ebx, %esi
+; X86-NEXT: addl %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: imull %ecx, %edi
+; X86-NEXT: addl %esi, %edi
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: adcl $0, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: addl %ebx, %ecx
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: setb %bl
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: addl %edi, %eax
+; X86-NEXT: movzbl %bl, %esi
+; X86-NEXT: adcl %esi, %edx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: sbbl %esi, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: sbbl %ecx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl %edx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: xorl %edx, %eax
+; X86-NEXT: xorl %edx, %ecx
+; X86-NEXT: xorl %edx, %esi
+; X86-NEXT: xorl %edx, %edi
+; X86-NEXT: xorl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: xorl %edx, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: xorl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: xorl %edx, %ebx
+; X86-NEXT: subl %edx, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: sbbl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: sbbl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: sbbl %edx, %edi
+; X86-NEXT: sbbl %edx, %esi
+; X86-NEXT: sbbl %edx, %ecx
+; X86-NEXT: sbbl %edx, %eax
+; X86-NEXT: movl 8(%ebp), %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, (%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, 4(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, 8(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, 12(%edx)
+; X86-NEXT: movl %edi, 16(%edx)
+; X86-NEXT: movl %esi, 20(%edx)
+; X86-NEXT: movl %ecx, 24(%edx)
+; X86-NEXT: movl %eax, 28(%edx)
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl $4
+; X86-NEXT: .LBB3_37:
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movb $1, %cl
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: jmp .LBB3_39
+; X86-NEXT: .LBB3_47:
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: jmp .LBB3_45
+; X86-NEXT: .LBB3_42:
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: jmp .LBB3_49
+;
+; X64-LABEL: srem256:
+; X64: # %bb.0: # %_udiv-special-cases
+; X64-NEXT: pushq %rbp
+; X64-NEXT: movq %rsp, %rbp
+; X64-NEXT: pushq %r15
+; X64-NEXT: pushq %r14
+; X64-NEXT: pushq %r13
+; X64-NEXT: pushq %r12
+; X64-NEXT: pushq %rbx
+; X64-NEXT: andq $-32, %rsp
+; X64-NEXT: subq $320, %rsp # imm = 0x140
+; X64-NEXT: movq %rcx, %r14
+; X64-NEXT: movq 32(%rbp), %r15
+; X64-NEXT: movq %r8, %rax
+; X64-NEXT: sarq $63, %rax
+; X64-NEXT: xorq %rax, %r8
+; X64-NEXT: xorq %rax, %r14
+; X64-NEXT: xorq %rax, %rdx
+; X64-NEXT: xorq %rax, %rsi
+; X64-NEXT: subq %rax, %rsi
+; X64-NEXT: sbbq %rax, %rdx
+; X64-NEXT: sbbq %rax, %r14
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: sbbq %rax, %r8
+; X64-NEXT: movq %r15, %rax
+; X64-NEXT: sarq $63, %rax
+; X64-NEXT: xorq %rax, %r15
+; X64-NEXT: movq 24(%rbp), %r13
+; X64-NEXT: xorq %rax, %r13
+; X64-NEXT: movq 16(%rbp), %r10
+; X64-NEXT: xorq %rax, %r10
+; X64-NEXT: xorq %rax, %r9
+; X64-NEXT: subq %rax, %r9
+; X64-NEXT: sbbq %rax, %r10
+; X64-NEXT: sbbq %rax, %r13
+; X64-NEXT: sbbq %rax, %r15
+; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r10, %rax
+; X64-NEXT: orq %r15, %rax
+; X64-NEXT: movq %r9, %rcx
+; X64-NEXT: orq %r13, %rcx
+; X64-NEXT: orq %rax, %rcx
+; X64-NEXT: sete %cl
+; X64-NEXT: movq %rdx, %rax
+; X64-NEXT: orq %r8, %rax
+; X64-NEXT: movq %rsi, %r11
+; X64-NEXT: orq %r14, %r11
+; X64-NEXT: orq %rax, %r11
+; X64-NEXT: sete %al
+; X64-NEXT: orb %cl, %al
+; X64-NEXT: bsrq %r15, %rcx
+; X64-NEXT: xorq $63, %rcx
+; X64-NEXT: bsrq %r13, %r11
+; X64-NEXT: xorq $63, %r11
+; X64-NEXT: orq $64, %r11
+; X64-NEXT: testq %r15, %r15
+; X64-NEXT: cmovneq %rcx, %r11
+; X64-NEXT: bsrq %r10, %rcx
+; X64-NEXT: xorq $63, %rcx
+; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: bsrq %r9, %rbx
+; X64-NEXT: xorq $63, %rbx
+; X64-NEXT: orq $64, %rbx
+; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: testq %r10, %r10
+; X64-NEXT: cmovneq %rcx, %rbx
+; X64-NEXT: orq $128, %rbx
+; X64-NEXT: movq %r13, %rcx
+; X64-NEXT: orq %r15, %rcx
+; X64-NEXT: cmovneq %r11, %rbx
+; X64-NEXT: bsrq %r8, %rcx
+; X64-NEXT: xorq $63, %rcx
+; X64-NEXT: bsrq %r14, %r9
+; X64-NEXT: xorq $63, %r9
+; X64-NEXT: orq $64, %r9
+; X64-NEXT: testq %r8, %r8
+; X64-NEXT: cmovneq %rcx, %r9
+; X64-NEXT: bsrq %rdx, %rcx
+; X64-NEXT: xorq $63, %rcx
+; X64-NEXT: bsrq %rsi, %r11
+; X64-NEXT: xorq $63, %r11
+; X64-NEXT: orq $64, %r11
+; X64-NEXT: testq %rdx, %rdx
+; X64-NEXT: cmovneq %rcx, %r11
+; X64-NEXT: orq $128, %r11
+; X64-NEXT: movq %r14, %rcx
+; X64-NEXT: orq %r8, %rcx
+; X64-NEXT: cmovneq %r9, %r11
+; X64-NEXT: xorl %r10d, %r10d
+; X64-NEXT: subq %r11, %rbx
+; X64-NEXT: movl $0, %r11d
+; X64-NEXT: sbbq %r11, %r11
+; X64-NEXT: movl $0, %r15d
+; X64-NEXT: sbbq %r15, %r15
+; X64-NEXT: sbbq %r10, %r10
+; X64-NEXT: testb %al, %al
+; X64-NEXT: jne .LBB3_1
+; X64-NEXT: # %bb.2: # %select.false.sink
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: movl $255, %ecx
+; X64-NEXT: cmpq %rbx, %rcx
+; X64-NEXT: movl $0, %ecx
+; X64-NEXT: sbbq %r11, %rcx
+; X64-NEXT: movl $0, %ecx
+; X64-NEXT: sbbq %r15, %rcx
+; X64-NEXT: sbbq %r10, %rax
+; X64-NEXT: setb %al
+; X64-NEXT: .LBB3_3: # %select.end
+; X64-NEXT: xorl %r12d, %r12d
+; X64-NEXT: testb %al, %al
+; X64-NEXT: movq %rdx, %r9
+; X64-NEXT: cmovneq %r12, %r9
+; X64-NEXT: movq %rsi, %rcx
+; X64-NEXT: cmovneq %r12, %rcx
+; X64-NEXT: movq %r14, %rax
+; X64-NEXT: cmovneq %r12, %rax
+; X64-NEXT: cmoveq %r8, %r12
+; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: jne .LBB3_9
+; X64-NEXT: # %bb.4: # %select.end
+; X64-NEXT: movq %rbx, %rdx
+; X64-NEXT: xorq $255, %rdx
+; X64-NEXT: orq %r15, %rdx
+; X64-NEXT: movq %r11, %rsi
+; X64-NEXT: orq %r10, %rsi
+; X64-NEXT: orq %rdx, %rsi
+; X64-NEXT: je .LBB3_9
+; X64-NEXT: # %bb.5: # %udiv-bb1
+; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; X64-NEXT: xorps %xmm0, %xmm0
+; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %r14, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-NEXT: movl %ebx, %ecx
+; X64-NEXT: notb %cl
+; X64-NEXT: movl %ecx, %eax
+; X64-NEXT: shrb $3, %al
+; X64-NEXT: andb $24, %al
+; X64-NEXT: negb %al
+; X64-NEXT: movsbq %al, %rdx
+; X64-NEXT: movq 272(%rsp,%rdx), %rax
+; X64-NEXT: movq 280(%rsp,%rdx), %r12
+; X64-NEXT: shldq %cl, %rax, %r12
+; X64-NEXT: movq 256(%rsp,%rdx), %rsi
+; X64-NEXT: movq 264(%rsp,%rdx), %r9
+; X64-NEXT: shldq %cl, %r9, %rax
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: shldq %cl, %rsi, %r9
+; X64-NEXT: shlq %cl, %rsi
+; X64-NEXT: addq $1, %rbx
+; X64-NEXT: adcq $0, %r11
+; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq $0, %r15
+; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq $0, %r10
+; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: jb .LBB3_10
+; X64-NEXT: # %bb.6: # %udiv-preheader
+; X64-NEXT: movq %r10, %rax
+; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %r14, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-NEXT: movl %ebx, %ecx
+; X64-NEXT: shrb $6, %cl
+; X64-NEXT: movzbl %cl, %edx
+; X64-NEXT: movq 184(%rsp,%rdx,8), %r11
+; X64-NEXT: movq 176(%rsp,%rdx,8), %rdi
+; X64-NEXT: movq %rdi, %r10
+; X64-NEXT: movl %ebx, %ecx
+; X64-NEXT: shrdq %cl, %r11, %r10
+; X64-NEXT: movq 160(%rsp,%rdx,8), %r8
+; X64-NEXT: movq 168(%rsp,%rdx,8), %rdx
+; X64-NEXT: movq %rbx, %rcx
+; X64-NEXT: movq %rdx, %rbx
+; X64-NEXT: shrdq %cl, %rdi, %rbx
+; X64-NEXT: shrq %cl, %r11
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-NEXT: shrdq %cl, %rdx, %r8
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: addq $-1, %rcx
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: adcq $-1, %rcx
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r13, %rcx
+; X64-NEXT: adcq $-1, %rcx
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: adcq $-1, %rcx
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: xorl %edi, %edi
+; X64-NEXT: xorl %edx, %edx
+; X64-NEXT: xorl %r14d, %r14d
+; X64-NEXT: movq %rax, %r15
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT: .p2align 4
+; X64-NEXT: .LBB3_7: # %udiv-do-while
+; X64-NEXT: # =>This Inner Loop Header: Depth=1
+; X64-NEXT: shldq $1, %r10, %r11
+; X64-NEXT: shldq $1, %rbx, %r10
+; X64-NEXT: shldq $1, %r8, %rbx
+; X64-NEXT: shldq $1, %r12, %r8
+; X64-NEXT: shldq $1, %rax, %r12
+; X64-NEXT: orq %r14, %r12
+; X64-NEXT: shldq $1, %r9, %rax
+; X64-NEXT: orq %rdx, %rax
+; X64-NEXT: shldq $1, %rsi, %r9
+; X64-NEXT: orq %rdi, %r9
+; X64-NEXT: addq %rsi, %rsi
+; X64-NEXT: orq %rcx, %rsi
+; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: cmpq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: sbbq %rbx, %rcx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: sbbq %r10, %rcx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT: sbbq %r11, %rdx
+; X64-NEXT: sarq $63, %rdx
+; X64-NEXT: movl %edx, %ecx
+; X64-NEXT: andl $1, %ecx
+; X64-NEXT: movq %rdx, %rdi
+; X64-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
+; X64-NEXT: movq %rdx, %r14
+; X64-NEXT: andq %r13, %r14
+; X64-NEXT: movq %r15, %rsi
+; X64-NEXT: movq %r13, %r15
+; X64-NEXT: movq %rdx, %r13
+; X64-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; X64-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
+; X64-NEXT: subq %rdx, %r8
+; X64-NEXT: sbbq %r13, %rbx
+; X64-NEXT: movq %r15, %r13
+; X64-NEXT: movq %rsi, %r15
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT: sbbq %r14, %r10
+; X64-NEXT: sbbq %rdi, %r11
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT: addq $-1, %rdi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT: adcq $-1, %rdx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT: adcq $-1, %r14
+; X64-NEXT: adcq $-1, %r15
+; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: orq %r15, %rdx
+; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: orq %r14, %rdi
+; X64-NEXT: orq %rdx, %rdi
+; X64-NEXT: movl $0, %edi
+; X64-NEXT: movl $0, %edx
+; X64-NEXT: movl $0, %r14d
+; X64-NEXT: jne .LBB3_7
+; X64-NEXT: .LBB3_8: # %udiv-loop-exit
+; X64-NEXT: shldq $1, %rax, %r12
+; X64-NEXT: shldq $1, %r9, %rax
+; X64-NEXT: shldq $1, %rsi, %r9
+; X64-NEXT: leaq (%rcx,%rsi,2), %rcx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT: .LBB3_9: # %udiv-end
+; X64-NEXT: movq %rax, %rsi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; X64-NEXT: imulq %r10, %rsi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; X64-NEXT: mulq %rbx
+; X64-NEXT: movq %rax, %r11
+; X64-NEXT: addq %rsi, %rdx
+; X64-NEXT: imulq %rbx, %r12
+; X64-NEXT: addq %rdx, %r12
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: mulq %r13
+; X64-NEXT: movq %rbx, %rsi
+; X64-NEXT: movq %rax, %rbx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT: imulq %rcx, %rax
+; X64-NEXT: addq %rdx, %rax
+; X64-NEXT: imulq %r9, %r13
+; X64-NEXT: addq %rax, %r13
+; X64-NEXT: addq %r11, %rbx
+; X64-NEXT: adcq %r12, %r13
+; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rdi, %r15
+; X64-NEXT: movq %rdx, %r12
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r10, %rax
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rdx, %r13
+; X64-NEXT: movq %rax, %rdi
+; X64-NEXT: addq %r12, %rdi
+; X64-NEXT: adcq $0, %r13
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: mulq %r9
+; X64-NEXT: movq %rdx, %r11
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: addq %rdi, %rcx
+; X64-NEXT: adcq %r13, %r11
+; X64-NEXT: setb %sil
+; X64-NEXT: movq %r10, %rax
+; X64-NEXT: mulq %r9
+; X64-NEXT: addq %r11, %rax
+; X64-NEXT: movzbl %sil, %esi
+; X64-NEXT: adcq %rsi, %rdx
+; X64-NEXT: addq %rbx, %rax
+; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT: subq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT: sbbq %rcx, %rsi
+; X64-NEXT: sbbq %rax, %r14
+; X64-NEXT: sbbq %rdx, %r8
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT: xorq %rax, %r8
+; X64-NEXT: xorq %rax, %r14
+; X64-NEXT: xorq %rax, %rsi
+; X64-NEXT: xorq %rax, %rdi
+; X64-NEXT: subq %rax, %rdi
+; X64-NEXT: sbbq %rax, %rsi
+; X64-NEXT: sbbq %rax, %r14
+; X64-NEXT: sbbq %rax, %r8
+; X64-NEXT: movq %rdi, (%r15)
+; X64-NEXT: movq %rsi, 8(%r15)
+; X64-NEXT: movq %r14, 16(%r15)
+; X64-NEXT: movq %r8, 24(%r15)
+; X64-NEXT: movq %r15, %rax
+; X64-NEXT: leaq -40(%rbp), %rsp
+; X64-NEXT: popq %rbx
+; X64-NEXT: popq %r12
+; X64-NEXT: popq %r13
+; X64-NEXT: popq %r14
+; X64-NEXT: popq %r15
+; X64-NEXT: popq %rbp
+; X64-NEXT: retq
+; X64-NEXT: .LBB3_1:
+; X64-NEXT: movb $1, %al
+; X64-NEXT: jmp .LBB3_3
+; X64-NEXT: .LBB3_10:
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT: jmp .LBB3_8
+ %r = srem i256 %a, %b
+ ret i256 %r
+}
+
+; Division by power of 2 should optimize to shift
+; udiv i256 %a, 16 is a logical right shift by 4 across the limbs, so no
+; division loop should appear. In the 64-bit lowering the four limbs are
+; funnel-shifted with shrdq $4 and the top limb with shrq $4; in the 32-bit
+; lowering the eight limbs use shldl $28 on limb pairs (the left-shift
+; complement of the 4-bit right shift) plus shrdl $4 / shrl $4 at the ends.
+; NOTE(review): the check lines below look autogenerated
+; (update_llc_test_checks.py style) -- regenerate rather than hand-edit.
+define i256 @udiv256_pow2(i256 %a) nounwind {
+; X86-LABEL: udiv256_pow2:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %ebp, %esi
+; X86-NEXT:    shldl $28, %edx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shldl $28, %ebx, %edx
+; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT:    shldl $28, %ecx, %ebx
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    shldl $28, %edi, %esi
+; X86-NEXT:    shldl $28, %eax, %edi
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shldl $28, %eax, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    shrdl $4, %eax, %ecx
+; X86-NEXT:    shrl $4, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %ebp, 28(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, 24(%eax)
+; X86-NEXT:    movl (%esp), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, 20(%eax)
+; X86-NEXT:    movl %ebx, 16(%eax)
+; X86-NEXT:    movl %esi, 12(%eax)
+; X86-NEXT:    movl %edi, 8(%eax)
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl $4
+;
+; X64-LABEL: udiv256_pow2:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    shrdq $4, %rdx, %rsi
+; X64-NEXT:    shrdq $4, %rcx, %rdx
+; X64-NEXT:    shrdq $4, %r8, %rcx
+; X64-NEXT:    shrq $4, %r8
+; X64-NEXT:    movq %r8, 24(%rdi)
+; X64-NEXT:    movq %rcx, 16(%rdi)
+; X64-NEXT:    movq %rdx, 8(%rdi)
+; X64-NEXT:    movq %rsi, (%rdi)
+; X64-NEXT:    retq
+  %r = udiv i256 %a, 16
+  ret i256 %r
+}
+
+; Division by constant
+define i256 @sdiv256_const(i256 %a) nounwind {
+; X86-LABEL: sdiv256_const:
+; X86: # %bb.0: # %_udiv-special-cases
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $288, %esp # imm = 0x120
+; X86-NEXT: movl 40(%ebp), %edi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: sarl $31, %eax
+; X86-NEXT: xorl %eax, %edi
+; X86-NEXT: movl 36(%ebp), %ecx
+; X86-NEXT: xorl %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%ebp), %ecx
+; X86-NEXT: xorl %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 28(%ebp), %ecx
+; X86-NEXT: xorl %eax, %ecx
+; X86-NEXT: movl 24(%ebp), %edx
+; X86-NEXT: xorl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 20(%ebp), %edx
+; X86-NEXT: xorl %eax, %edx
+; X86-NEXT: movl 16(%ebp), %ebx
+; X86-NEXT: xorl %eax, %ebx
+; X86-NEXT: movl 12(%ebp), %esi
+; X86-NEXT: xorl %eax, %esi
+; X86-NEXT: subl %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %eax, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: sbbl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: sbbl %eax, %edx
+; X86-NEXT: sbbl %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl %eax, %ebx
+; X86-NEXT: sbbl %eax, %ecx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %eax, %edi
+; X86-NEXT: jne .LBB5_1
+; X86-NEXT: # %bb.2: # %_udiv-special-cases
+; X86-NEXT: bsrl %ecx, %eax
+; X86-NEXT: xorl $31, %eax
+; X86-NEXT: orl $32, %eax
+; X86-NEXT: jmp .LBB5_3
+; X86-NEXT: .LBB5_1:
+; X86-NEXT: bsrl %edi, %eax
+; X86-NEXT: xorl $31, %eax
+; X86-NEXT: .LBB5_3: # %_udiv-special-cases
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: jne .LBB5_4
+; X86-NEXT: # %bb.5: # %_udiv-special-cases
+; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: xorl $31, %eax
+; X86-NEXT: orl $32, %eax
+; X86-NEXT: jmp .LBB5_6
+; X86-NEXT: .LBB5_4:
+; X86-NEXT: bsrl %ebx, %eax
+; X86-NEXT: xorl $31, %eax
+; X86-NEXT: .LBB5_6: # %_udiv-special-cases
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edi, %ecx
+; X86-NEXT: jne .LBB5_8
+; X86-NEXT: # %bb.7: # %_udiv-special-cases
+; X86-NEXT: orl $64, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: .LBB5_8: # %_udiv-special-cases
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: jne .LBB5_9
+; X86-NEXT: # %bb.10: # %_udiv-special-cases
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: bsrl %eax, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: orl $32, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: jmp .LBB5_11
+; X86-NEXT: .LBB5_9:
+; X86-NEXT: bsrl %edx, %eax
+; X86-NEXT: xorl $31, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: .LBB5_11: # %_udiv-special-cases
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: testl %ecx, %ecx
+; X86-NEXT: jne .LBB5_12
+; X86-NEXT: # %bb.13: # %_udiv-special-cases
+; X86-NEXT: bsrl %esi, %edi
+; X86-NEXT: xorl $31, %edi
+; X86-NEXT: orl $32, %edi
+; X86-NEXT: jmp .LBB5_14
+; X86-NEXT: .LBB5_12:
+; X86-NEXT: bsrl %ecx, %edi
+; X86-NEXT: xorl $31, %edi
+; X86-NEXT: .LBB5_14: # %_udiv-special-cases
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %ebx, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edx, %ebx
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: jne .LBB5_16
+; X86-NEXT: # %bb.15: # %_udiv-special-cases
+; X86-NEXT: orl $64, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: .LBB5_16: # %_udiv-special-cases
+; X86-NEXT: orl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: jne .LBB5_18
+; X86-NEXT: # %bb.17: # %_udiv-special-cases
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: orl $128, %eax
+; X86-NEXT: .LBB5_18: # %_udiv-special-cases
+; X86-NEXT: movl $253, %esi
+; X86-NEXT: subl %eax, %esi
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: movl $0, %ebx
+; X86-NEXT: sbbl %ebx, %ebx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %ecx, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %ecx, %ecx
+; X86-NEXT: movl $0, %edi
+; X86-NEXT: sbbl %edi, %edi
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: sbbl %edx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: sbbl %edx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: je .LBB5_19
+; X86-NEXT: # %bb.20: # %select.false.sink
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $255, %edx
+; X86-NEXT: cmpl %esi, %edx
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: sbbl %eax, %edx
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: sbbl %ebx, %edx
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: sbbl %ecx, %edx
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: sbbl %edi, %edx
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl %eax, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: sbbl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: setb %cl
+; X86-NEXT: .LBB5_21: # %select.end
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: testb %cl, %cl
+; X86-NEXT: movl $0, %ebx
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $0, %esi
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: jne .LBB5_23
+; X86-NEXT: # %bb.22: # %select.end
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: .LBB5_23: # %select.end
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: jne .LBB5_24
+; X86-NEXT: # %bb.30: # %select.end
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: xorl $255, %edx
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: orl %esi, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: je .LBB5_31
+; X86-NEXT: # %bb.28: # %udiv-bb1
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: notb %cl
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: shrb $3, %dl
+; X86-NEXT: andb $28, %dl
+; X86-NEXT: negb %dl
+; X86-NEXT: movsbl %dl, %edx
+; X86-NEXT: movl 264(%esp,%edx), %esi
+; X86-NEXT: movl 268(%esp,%edx), %edi
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 260(%esp,%edx), %ebx
+; X86-NEXT: shldl %cl, %ebx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 256(%esp,%edx), %esi
+; X86-NEXT: shldl %cl, %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 252(%esp,%edx), %edi
+; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 248(%esp,%edx), %ebx
+; X86-NEXT: shldl %cl, %ebx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl 240(%esp,%edx), %eax
+; X86-NEXT: movl 244(%esp,%edx), %edx
+; X86-NEXT: shldl %cl, %edx, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shll %cl, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: addl $1, %ecx
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: jb .LBB5_29
+; X86-NEXT: # %bb.25: # %udiv-preheader
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $5, %al
+; X86-NEXT: movzbl %al, %ebx
+; X86-NEXT: movl 172(%esp,%ebx,4), %esi
+; X86-NEXT: movl 168(%esp,%ebx,4), %eax
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shrdl %cl, %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 164(%esp,%ebx,4), %edx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: shrdl %cl, %eax, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 160(%esp,%ebx,4), %eax
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: shrdl %cl, %edx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 156(%esp,%ebx,4), %edx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: shrdl %cl, %eax, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 152(%esp,%ebx,4), %eax
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: shrdl %cl, %edx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 144(%esp,%ebx,4), %edi
+; X86-NEXT: movl 148(%esp,%ebx,4), %edx
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: shrdl %cl, %eax, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shrl %cl, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shrdl %cl, %edx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $7, %eax
+; X86-NEXT: addl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: .p2align 4
+; X86-NEXT: .LBB5_26: # %udiv-do-while
+; X86-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl $1, %ebx, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %ebx
+; X86-NEXT: shldl $1, %edi, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %ecx
+; X86-NEXT: orl %edx, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $1, %edi, %eax
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %edi
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $1, %edi, %ecx
+; X86-NEXT: orl %edx, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %edi
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $1, %edi, %ecx
+; X86-NEXT: orl %edx, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl %edi, %edi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmpl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl %ebx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: sbbl %edx, %ecx
+; X86-NEXT: sarl $31, %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: andl $1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $7, %edi
+; X86-NEXT: andl %edi, %ecx
+; X86-NEXT: subl %ecx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl $0, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: sbbl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: sbbl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: sbbl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: sbbl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: sbbl $0, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: adcl $-1, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: adcl $-1, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: adcl $-1, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ecx
+; X86-NEXT: adcl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %ecx, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: jne .LBB5_26
+; X86-NEXT: .LBB5_27: # %udiv-loop-exit
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: leal (%esi,%eax,2), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: .LBB5_31: # %udiv-end
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: xorl %edx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: xorl %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: xorl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: xorl %edx, %ebx
+; X86-NEXT: xorl %edx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: xorl %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: xorl %edx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: xorl %edx, %eax
+; X86-NEXT: subl %edx, %eax
+; X86-NEXT: sbbl %edx, %ecx
+; X86-NEXT: sbbl %edx, %esi
+; X86-NEXT: sbbl %edx, %edi
+; X86-NEXT: sbbl %edx, %ebx
+; X86-NEXT: sbbl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: sbbl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: sbbl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl 8(%ebp), %edx
+; X86-NEXT: movl %eax, (%edx)
+; X86-NEXT: movl %ecx, 4(%edx)
+; X86-NEXT: movl %esi, 8(%edx)
+; X86-NEXT: movl %edi, 12(%edx)
+; X86-NEXT: movl %ebx, 16(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 20(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 24(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 28(%edx)
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl $4
+; X86-NEXT: .LBB5_19:
+; X86-NEXT: movb $1, %cl
+; X86-NEXT: jmp .LBB5_21
+; X86-NEXT: .LBB5_29:
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: jmp .LBB5_27
+; X86-NEXT: .LBB5_24:
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: jmp .LBB5_31
+;
+; X64-LABEL: sdiv256_const:
+; X64: # %bb.0: # %_udiv-special-cases
+; X64-NEXT: pushq %rbp
+; X64-NEXT: movq %rsp, %rbp
+; X64-NEXT: pushq %r15
+; X64-NEXT: pushq %r14
+; X64-NEXT: pushq %r13
+; X64-NEXT: pushq %r12
+; X64-NEXT: pushq %rbx
+; X64-NEXT: andq $-32, %rsp
+; X64-NEXT: subq $256, %rsp # imm = 0x100
+; X64-NEXT: movq %rcx, %r9
+; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r8, %rax
+; X64-NEXT: sarq $63, %rax
+; X64-NEXT: xorq %rax, %r8
+; X64-NEXT: xorq %rax, %r9
+; X64-NEXT: xorq %rax, %rdx
+; X64-NEXT: xorq %rax, %rsi
+; X64-NEXT: subq %rax, %rsi
+; X64-NEXT: sbbq %rax, %rdx
+; X64-NEXT: sbbq %rax, %r9
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: sbbq %rax, %r8
+; X64-NEXT: movq %rdx, %rax
+; X64-NEXT: orq %r8, %rax
+; X64-NEXT: movq %rsi, %rcx
+; X64-NEXT: orq %r9, %rcx
+; X64-NEXT: bsrq %r8, %rdi
+; X64-NEXT: xorq $63, %rdi
+; X64-NEXT: bsrq %r9, %r10
+; X64-NEXT: xorq $63, %r10
+; X64-NEXT: orq $64, %r10
+; X64-NEXT: testq %r8, %r8
+; X64-NEXT: cmovneq %rdi, %r10
+; X64-NEXT: bsrq %rdx, %rdi
+; X64-NEXT: xorq $63, %rdi
+; X64-NEXT: bsrq %rsi, %r11
+; X64-NEXT: xorq $63, %r11
+; X64-NEXT: orq $64, %r11
+; X64-NEXT: testq %rdx, %rdx
+; X64-NEXT: cmovneq %rdi, %r11
+; X64-NEXT: orq $128, %r11
+; X64-NEXT: movq %r9, %rdi
+; X64-NEXT: orq %r8, %rdi
+; X64-NEXT: cmovneq %r10, %r11
+; X64-NEXT: movl $253, %r15d
+; X64-NEXT: subq %r11, %r15
+; X64-NEXT: movl $0, %r11d
+; X64-NEXT: movl $0, %r13d
+; X64-NEXT: sbbq %r13, %r13
+; X64-NEXT: movl $0, %ebx
+; X64-NEXT: sbbq %rbx, %rbx
+; X64-NEXT: sbbq %r11, %r11
+; X64-NEXT: orq %rax, %rcx
+; X64-NEXT: je .LBB5_1
+; X64-NEXT: # %bb.2: # %select.false.sink
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: movl $255, %ecx
+; X64-NEXT: cmpq %r15, %rcx
+; X64-NEXT: movl $0, %ecx
+; X64-NEXT: sbbq %r13, %rcx
+; X64-NEXT: movl $0, %ecx
+; X64-NEXT: sbbq %rbx, %rcx
+; X64-NEXT: sbbq %r11, %rax
+; X64-NEXT: setb %al
+; X64-NEXT: .LBB5_3: # %select.end
+; X64-NEXT: xorl %r14d, %r14d
+; X64-NEXT: testb %al, %al
+; X64-NEXT: movq %rdx, %rax
+; X64-NEXT: cmovneq %r14, %rax
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %rsi, %r10
+; X64-NEXT: cmovneq %r14, %r10
+; X64-NEXT: movq %r9, %r12
+; X64-NEXT: cmovneq %r14, %r12
+; X64-NEXT: cmoveq %r8, %r14
+; X64-NEXT: jne .LBB5_4
+; X64-NEXT: # %bb.10: # %select.end
+; X64-NEXT: movq %r15, %rcx
+; X64-NEXT: xorq $255, %rcx
+; X64-NEXT: orq %rbx, %rcx
+; X64-NEXT: movq %r13, %rdi
+; X64-NEXT: orq %r11, %rdi
+; X64-NEXT: orq %rcx, %rdi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: je .LBB5_11
+; X64-NEXT: # %bb.8: # %udiv-bb1
+; X64-NEXT: movq %rsi, {{[0-9]+}}(%rsp)
+; X64-NEXT: xorps %xmm0, %xmm0
+; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-NEXT: movl %r15d, %ecx
+; X64-NEXT: notb %cl
+; X64-NEXT: movl %ecx, %eax
+; X64-NEXT: shrb $3, %al
+; X64-NEXT: andb $24, %al
+; X64-NEXT: negb %al
+; X64-NEXT: movsbq %al, %rax
+; X64-NEXT: movq 208(%rsp,%rax), %r12
+; X64-NEXT: movq 216(%rsp,%rax), %r14
+; X64-NEXT: shldq %cl, %r12, %r14
+; X64-NEXT: movq 192(%rsp,%rax), %r10
+; X64-NEXT: movq 200(%rsp,%rax), %rax
+; X64-NEXT: shldq %cl, %rax, %r12
+; X64-NEXT: shldq %cl, %r10, %rax
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: shlq %cl, %r10
+; X64-NEXT: addq $1, %r15
+; X64-NEXT: adcq $0, %r13
+; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq $0, %rbx
+; X64-NEXT: adcq $0, %r11
+; X64-NEXT: jb .LBB5_9
+; X64-NEXT: # %bb.5: # %udiv-preheader
+; X64-NEXT: movq %r15, %rcx
+; X64-NEXT: movq %r11, %r13
+; X64-NEXT: movl $7, %r11d
+; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %rsi, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-NEXT: movl %ecx, %eax
+; X64-NEXT: shrb $6, %al
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: movq 120(%rsp,%rax,8), %rdx
+; X64-NEXT: movq 112(%rsp,%rax,8), %rdi
+; X64-NEXT: movq %rdi, %rsi
+; X64-NEXT: shrdq %cl, %rdx, %rsi
+; X64-NEXT: movq 96(%rsp,%rax,8), %r8
+; X64-NEXT: movq 104(%rsp,%rax,8), %rax
+; X64-NEXT: movq %rax, %r9
+; X64-NEXT: shrdq %cl, %rdi, %r9
+; X64-NEXT: shrq %cl, %rdx
+; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-NEXT: shrdq %cl, %rax, %r8
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: addq $-1, %r11
+; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movl $0, %eax
+; X64-NEXT: adcq $-1, %rax
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movl $0, %eax
+; X64-NEXT: adcq $-1, %rax
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movl $0, %eax
+; X64-NEXT: adcq $-1, %rax
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: xorl %edi, %edi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64-NEXT: .p2align 4
+; X64-NEXT: .LBB5_6: # %udiv-do-while
+; X64-NEXT: # =>This Inner Loop Header: Depth=1
+; X64-NEXT: shldq $1, %rsi, %rdx
+; X64-NEXT: shldq $1, %r9, %rsi
+; X64-NEXT: shldq $1, %r8, %r9
+; X64-NEXT: shldq $1, %r14, %r8
+; X64-NEXT: shldq $1, %r12, %r14
+; X64-NEXT: orq %rdi, %r14
+; X64-NEXT: shldq $1, %r11, %r12
+; X64-NEXT: orq %rax, %r12
+; X64-NEXT: shldq $1, %r10, %r11
+; X64-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; X64-NEXT: addq %r10, %r10
+; X64-NEXT: orq %rcx, %r10
+; X64-NEXT: cmpq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT: sbbq %r9, %rax
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT: sbbq %rsi, %rax
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: sbbq %rdx, %rcx
+; X64-NEXT: sarq $63, %rcx
+; X64-NEXT: movl $7, %eax
+; X64-NEXT: andl %ecx, %eax
+; X64-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; X64-NEXT: andl $1, %ecx
+; X64-NEXT: subq %rax, %r8
+; X64-NEXT: sbbq $0, %r9
+; X64-NEXT: sbbq $0, %rsi
+; X64-NEXT: sbbq $0, %rdx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT: addq $-1, %rdi
+; X64-NEXT: adcq $-1, %r15
+; X64-NEXT: adcq $-1, %rbx
+; X64-NEXT: adcq $-1, %r13
+; X64-NEXT: movq %r15, %rax
+; X64-NEXT: orq %r13, %rax
+; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: orq %rbx, %rdi
+; X64-NEXT: orq %rax, %rdi
+; X64-NEXT: movl $0, %eax
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movl $0, %eax
+; X64-NEXT: movl $0, %edi
+; X64-NEXT: jne .LBB5_6
+; X64-NEXT: .LBB5_7: # %udiv-loop-exit
+; X64-NEXT: shldq $1, %r12, %r14
+; X64-NEXT: shldq $1, %r11, %r12
+; X64-NEXT: shldq $1, %r10, %r11
+; X64-NEXT: leaq (%rcx,%r10,2), %r10
+; X64-NEXT: movq %r11, %rcx
+; X64-NEXT: .LBB5_11: # %udiv-end
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT: xorq %rax, %r14
+; X64-NEXT: xorq %rax, %r12
+; X64-NEXT: xorq %rax, %rcx
+; X64-NEXT: xorq %rax, %r10
+; X64-NEXT: subq %rax, %r10
+; X64-NEXT: sbbq %rax, %rcx
+; X64-NEXT: sbbq %rax, %r12
+; X64-NEXT: sbbq %rax, %r14
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT: movq %r10, (%rax)
+; X64-NEXT: movq %rcx, 8(%rax)
+; X64-NEXT: movq %r12, 16(%rax)
+; X64-NEXT: movq %r14, 24(%rax)
+; X64-NEXT: leaq -40(%rbp), %rsp
+; X64-NEXT: popq %rbx
+; X64-NEXT: popq %r12
+; X64-NEXT: popq %r13
+; X64-NEXT: popq %r14
+; X64-NEXT: popq %r15
+; X64-NEXT: popq %rbp
+; X64-NEXT: retq
+; X64-NEXT: .LBB5_1:
+; X64-NEXT: movb $1, %al
+; X64-NEXT: jmp .LBB5_3
+; X64-NEXT: .LBB5_9:
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64-NEXT: jmp .LBB5_7
+; X64-NEXT: .LBB5_4:
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: jmp .LBB5_11
+ %r = sdiv i256 %a, 7
+ ret i256 %r
+}
diff --git a/llvm/test/CodeGen/X86/expand-large-fp-optnone.ll b/llvm/test/CodeGen/X86/expand-large-fp-optnone.ll
index a635d55d2033d..505077f5df5f5 100644
--- a/llvm/test/CodeGen/X86/expand-large-fp-optnone.ll
+++ b/llvm/test/CodeGen/X86/expand-large-fp-optnone.ll
@@ -6,242 +6,12 @@
; Function Attrs: noinline optnone
define double @main(i224 %0) #0 {
; CHECK-LABEL: main:
-; CHECK: # %bb.0: # %entryitofp-entry
-; CHECK-NEXT: pushq %rbp
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: pushq %r15
-; CHECK-NEXT: .cfi_def_cfa_offset 24
-; CHECK-NEXT: pushq %r14
-; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: pushq %r13
-; CHECK-NEXT: .cfi_def_cfa_offset 40
-; CHECK-NEXT: pushq %r12
-; CHECK-NEXT: .cfi_def_cfa_offset 48
-; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: .cfi_def_cfa_offset 56
-; CHECK-NEXT: subq $88, %rsp
-; CHECK-NEXT: .cfi_def_cfa_offset 144
-; CHECK-NEXT: .cfi_offset %rbx, -56
-; CHECK-NEXT: .cfi_offset %r12, -48
-; CHECK-NEXT: .cfi_offset %r13, -40
-; CHECK-NEXT: .cfi_offset %r14, -32
-; CHECK-NEXT: .cfi_offset %r15, -24
-; CHECK-NEXT: .cfi_offset %rbp, -16
-; CHECK-NEXT: movl %ecx, %eax
-; CHECK-NEXT: movq %rdi, %r8
-; CHECK-NEXT: orq %rdx, %r8
-; CHECK-NEXT: movq %rsi, %r9
-; CHECK-NEXT: orq %rax, %r9
-; CHECK-NEXT: xorps %xmm0, %xmm0
-; CHECK-NEXT: orq %r9, %r8
-; CHECK-NEXT: je .LBB0_10
-; CHECK-NEXT: jmp .LBB0_1
-; CHECK-NEXT: .LBB0_1: # %itofp-if-end
-; CHECK-NEXT: movslq %ecx, %rax
-; CHECK-NEXT: movq %rax, %r9
-; CHECK-NEXT: sarq $31, %r9
-; CHECK-NEXT: sarq $63, %rax
-; CHECK-NEXT: xorq %rax, %rcx
-; CHECK-NEXT: xorq %rax, %rdx
-; CHECK-NEXT: xorq %rax, %rsi
-; CHECK-NEXT: xorq %r9, %rdi
-; CHECK-NEXT: subq %r9, %rdi
-; CHECK-NEXT: sbbq %rax, %rsi
-; CHECK-NEXT: sbbq %rax, %rdx
-; CHECK-NEXT: sbbq %rax, %rcx
-; CHECK-NEXT: movq %rcx, %r8
-; CHECK-NEXT: shldq $32, %rdx, %r8
-; CHECK-NEXT: bsrq %r8, %rax
-; CHECK-NEXT: xorl $63, %eax
-; CHECK-NEXT: movq %rdx, %r10
-; CHECK-NEXT: shldq $32, %rsi, %r10
-; CHECK-NEXT: bsrq %r10, %r11
-; CHECK-NEXT: xorl $63, %r11d
-; CHECK-NEXT: orl $64, %r11d
-; CHECK-NEXT: testq %r8, %r8
-; CHECK-NEXT: cmovnel %eax, %r11d
-; CHECK-NEXT: movq %rsi, %rbx
-; CHECK-NEXT: shldq $32, %rdi, %rbx
-; CHECK-NEXT: bsrq %rbx, %r14
-; CHECK-NEXT: xorl $63, %r14d
-; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: shlq $32, %rax
-; CHECK-NEXT: bsrq %rax, %rax
-; CHECK-NEXT: xorl $63, %eax
-; CHECK-NEXT: orl $64, %eax
-; CHECK-NEXT: testq %rbx, %rbx
-; CHECK-NEXT: cmovnel %r14d, %eax
-; CHECK-NEXT: subl $-128, %eax
-; CHECK-NEXT: orq %r8, %r10
-; CHECK-NEXT: cmovnel %r11d, %eax
-; CHECK-NEXT: movl $224, %r11d
-; CHECK-NEXT: subl %eax, %r11d
-; CHECK-NEXT: movl $223, %r10d
-; CHECK-NEXT: subl %eax, %r10d
-; CHECK-NEXT: cmpl $53, %r11d
-; CHECK-NEXT: jle .LBB0_8
-; CHECK-NEXT: # %bb.2: # %itofp-if-then4
-; CHECK-NEXT: movl %r11d, %r8d
-; CHECK-NEXT: subl $54, %r8d
-; CHECK-NEXT: je .LBB0_4
-; CHECK-NEXT: jmp .LBB0_3
-; CHECK-NEXT: .LBB0_3: # %itofp-if-then4
-; CHECK-NEXT: movl %r11d, %r8d
-; CHECK-NEXT: subl $55, %r8d
-; CHECK-NEXT: jne .LBB0_5
-; CHECK-NEXT: # %bb.11:
-; CHECK-NEXT: jmp .LBB0_6
-; CHECK-NEXT: .LBB0_4: # %itofp-sw-bb
-; CHECK-NEXT: movq %rsi, %rax
-; CHECK-NEXT: shldq $1, %rdi, %rax
-; CHECK-NEXT: movq %rdx, %r8
-; CHECK-NEXT: shldq $1, %rsi, %r8
-; CHECK-NEXT: shldq $1, %rdx, %rcx
-; CHECK-NEXT: addq %rdi, %rdi
-; CHECK-NEXT: movq %rax, %rsi
-; CHECK-NEXT: movq %r8, %rdx
-; CHECK-NEXT: jmp .LBB0_6
-; CHECK-NEXT: .LBB0_5: # %itofp-sw-default
-; CHECK-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movl %ecx, %r8d
-; CHECK-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb $-87, %r8b
-; CHECK-NEXT: subb %al, %r8b
-; CHECK-NEXT: movb %r8b, %bl
-; CHECK-NEXT: shrb $6, %bl
-; CHECK-NEXT: movzbl %bl, %r12d
-; CHECK-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq $0, (%rsp)
-; CHECK-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq -24(%rsp,%r12,8), %rbx
-; CHECK-NEXT: movq -32(%rsp,%r12,8), %r13
-; CHECK-NEXT: movq %rcx, %rbp
-; CHECK-NEXT: movb %r8b, %cl
-; CHECK-NEXT: movq %r13, %r14
-; CHECK-NEXT: shrdq %cl, %rbx, %r14
-; CHECK-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: movq -48(%rsp,%r12,8), %r15
-; CHECK-NEXT: movq -40(%rsp,%r12,8), %r12
-; CHECK-NEXT: movb %r8b, %cl
-; CHECK-NEXT: movq %r12, %r14
-; CHECK-NEXT: shrdq %cl, %r13, %r14
-; CHECK-NEXT: movb %r8b, %cl
-; CHECK-NEXT: shrq %cl, %rbx
-; CHECK-NEXT: movb %r8b, %cl
-; CHECK-NEXT: shrdq %cl, %r12, %r15
-; CHECK-NEXT: addb $55, %al
-; CHECK-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq %rbp, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb %al, %cl
-; CHECK-NEXT: shrb $3, %cl
-; CHECK-NEXT: andb $24, %cl
-; CHECK-NEXT: negb %cl
-; CHECK-NEXT: movsbq %cl, %rdx
-; CHECK-NEXT: movq -80(%rsp,%rdx), %rsi
-; CHECK-NEXT: movq -72(%rsp,%rdx), %rdi
-; CHECK-NEXT: movq -64(%rsp,%rdx), %r8
-; CHECK-NEXT: movb %al, %cl
-; CHECK-NEXT: movq %r8, %r12
-; CHECK-NEXT: shldq %cl, %rdi, %r12
-; CHECK-NEXT: movb %al, %cl
-; CHECK-NEXT: movq %rsi, %r13
-; CHECK-NEXT: shlq %cl, %r13
-; CHECK-NEXT: orq %r12, %r13
-; CHECK-NEXT: movq -56(%rsp,%rdx), %rdx
-; CHECK-NEXT: movb %al, %cl
-; CHECK-NEXT: shldq %cl, %r8, %rdx
-; CHECK-NEXT: movl %edx, %edx
-; CHECK-NEXT: movb %al, %cl
-; CHECK-NEXT: shldq %cl, %rsi, %rdi
-; CHECK-NEXT: orq %rdx, %rdi
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: orq %rdi, %r13
-; CHECK-NEXT: setne %al
-; CHECK-NEXT: orq %rax, %r15
-; CHECK-NEXT: movq %r15, %rdi
-; CHECK-NEXT: movq %r14, %rsi
-; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; CHECK-NEXT: movq %rbx, %rcx
-; CHECK-NEXT: jmp .LBB0_6
-; CHECK-NEXT: .LBB0_6: # %itofp-sw-epilog
-; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: shrl $2, %eax
-; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: orq %rax, %rdi
-; CHECK-NEXT: addq $1, %rdi
-; CHECK-NEXT: adcq $0, %rsi
-; CHECK-NEXT: adcq $0, %rdx
-; CHECK-NEXT: adcq $0, %rcx
-; CHECK-NEXT: movq %rsi, %rdx
-; CHECK-NEXT: shldq $62, %rdi, %rdx
-; CHECK-NEXT: movq %rdx, %rax
-; CHECK-NEXT: shrq $32, %rax
-; CHECK-NEXT: btq $55, %rdi
-; CHECK-NEXT: jae .LBB0_9
-; CHECK-NEXT: jmp .LBB0_7
-; CHECK-NEXT: .LBB0_7: # %itofp-if-then20
-; CHECK-NEXT: shldq $61, %rdi, %rsi
-; CHECK-NEXT: movq %rsi, %rax
-; CHECK-NEXT: shrq $32, %rax
-; CHECK-NEXT: movq %rsi, %rdx
-; CHECK-NEXT: movl %r11d, %r10d
-; CHECK-NEXT: jmp .LBB0_9
-; CHECK-NEXT: .LBB0_8: # %itofp-if-else
-; CHECK-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq %rsi, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: addb $85, %al
-; CHECK-NEXT: movb %al, %cl
-; CHECK-NEXT: shrb $3, %cl
-; CHECK-NEXT: andb $24, %cl
-; CHECK-NEXT: negb %cl
-; CHECK-NEXT: movsbq %cl, %rcx
-; CHECK-NEXT: movq 48(%rsp,%rcx), %rdx
-; CHECK-NEXT: movb %al, %cl
-; CHECK-NEXT: shlq %cl, %rdx
-; CHECK-NEXT: movq %rdx, %rax
-; CHECK-NEXT: shrq $32, %rax
-; CHECK-NEXT: .LBB0_9: # %itofp-if-end26
-; CHECK-NEXT: andl $-2147483648, %r9d # imm = 0x80000000
-; CHECK-NEXT: shll $20, %r10d
-; CHECK-NEXT: addl $1072693248, %r10d # imm = 0x3FF00000
-; CHECK-NEXT: andl $1048575, %eax # imm = 0xFFFFF
-; CHECK-NEXT: orl %r9d, %eax
-; CHECK-NEXT: orl %r10d, %eax
-; CHECK-NEXT: movl %eax, %eax
-; CHECK-NEXT: shlq $32, %rax
-; CHECK-NEXT: movabsq $4294967295, %rcx # imm = 0xFFFFFFFF
-; CHECK-NEXT: andq %rcx, %rdx
-; CHECK-NEXT: orq %rdx, %rax
-; CHECK-NEXT: movq %rax, %xmm0
-; CHECK-NEXT: .LBB0_10: # %itofp-return
-; CHECK-NEXT: addq $88, %rsp
-; CHECK-NEXT: .cfi_def_cfa_offset 56
-; CHECK-NEXT: popq %rbx
-; CHECK-NEXT: .cfi_def_cfa_offset 48
-; CHECK-NEXT: popq %r12
-; CHECK-NEXT: .cfi_def_cfa_offset 40
-; CHECK-NEXT: popq %r13
-; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: popq %r14
-; CHECK-NEXT: .cfi_def_cfa_offset 24
-; CHECK-NEXT: popq %r15
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: movslq %ecx, %rcx
+; CHECK-NEXT: callq __floatoidf at PLT
+; CHECK-NEXT: popq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
entry:
diff --git a/llvm/test/CodeGen/X86/fp-i129.ll b/llvm/test/CodeGen/X86/fp-i129.ll
index c55c19abbd9b8..cf260b8cc4773 100644
--- a/llvm/test/CodeGen/X86/fp-i129.ll
+++ b/llvm/test/CodeGen/X86/fp-i129.ll
@@ -1,94 +1,136 @@
; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=CHECK,X64
+; On i686, these are expanded inline. On x86_64 with MaxLargeFPConvertBitWidthSupported=256,
+; i129 is promoted to i256 and uses __fix*oi / __float*oi libcalls.
define i129 @fptosi_float(float %a) nounwind {
-; CHECK-LABEL: fptosi_float:
-; CHECK-NOT: call
+; X86-LABEL: fptosi_float:
+; X86-NOT: __fixsfoi
+;
+; X64-LABEL: fptosi_float:
+; X64: callq __fixsfoi at PLT
%res = fptosi float %a to i129
ret i129 %res
}
define i129 @fptosi_double(double %a) nounwind {
-; CHECK-LABEL: fptosi_double:
-; CHECK-NOT: call
+; X86-LABEL: fptosi_double:
+; X86-NOT: __fixdfoi
+;
+; X64-LABEL: fptosi_double:
+; X64: callq __fixdfoi at PLT
%res = fptosi double %a to i129
ret i129 %res
}
define i129 @fptosi_fp128(fp128 %a) nounwind {
-; CHECK-LABEL: fptosi_fp128:
-; CHECK-NOT: call
+; X86-LABEL: fptosi_fp128:
+; X86-NOT: __fixtfoi
+;
+; X64-LABEL: fptosi_fp128:
+; X64: callq __fixtfoi at PLT
%res = fptosi fp128 %a to i129
ret i129 %res
}
define i129 @fptoui_float(float %a) nounwind {
-; CHECK-LABEL: fptoui_float:
-; CHECK-NOT: call
+; X86-LABEL: fptoui_float:
+; X86-NOT: __fixunssfoi
+;
+; X64-LABEL: fptoui_float:
+; X64: callq __fixunssfoi at PLT
%res = fptoui float %a to i129
ret i129 %res
}
define i129 @fptoui_double(double %a) nounwind {
-; CHECK-LABEL: fptoui_double:
-; CHECK-NOT: call
+; X86-LABEL: fptoui_double:
+; X86-NOT: __fixunsdfoi
+;
+; X64-LABEL: fptoui_double:
+; X64: callq __fixunsdfoi at PLT
%res = fptoui double %a to i129
ret i129 %res
}
define i129 @fptoui_fp128(fp128 %a) nounwind {
-; CHECK-LABEL: fptoui_fp128:
-; CHECK-NOT: call
+; X86-LABEL: fptoui_fp128:
+; X86-NOT: __fixunstfoi
+;
+; X64-LABEL: fptoui_fp128:
+; X64: callq __fixunstfoi at PLT
%res = fptoui fp128 %a to i129
ret i129 %res
}
define float @sitofp_float(i129 %a) nounwind {
-; CHECK-LABEL: sitofp_float:
-; CHECK-NOT: call
+; X86-LABEL: sitofp_float:
+; X86-NOT: __floatoisf
+;
+; X64-LABEL: sitofp_float:
+; X64: callq __floatoisf at PLT
%res = sitofp i129 %a to float
ret float %res
}
define double @sitofp_double(i129 %a) nounwind {
-; CHECK-LABEL: sitofp_double:
-; CHECK-NOT: call
+; X86-LABEL: sitofp_double:
+; X86-NOT: __floatoidf
+;
+; X64-LABEL: sitofp_double:
+; X64: callq __floatoidf at PLT
%res = sitofp i129 %a to double
ret double %res
}
define fp128 @sitofp_fp128(i129 %a) nounwind {
-; CHECK-LABEL: sitofp_fp128:
-; CHECK-NOT: call
+; X86-LABEL: sitofp_fp128:
+; X86-NOT: __floatoitf
+;
+; X64-LABEL: sitofp_fp128:
+; X64: callq __floatoitf at PLT
%res = sitofp i129 %a to fp128
ret fp128 %res
}
define float @uitofp_float(i129 %a) nounwind {
-; CHECK-LABEL: uitofp_float:
-; CHECK-NOT: call
+; X86-LABEL: uitofp_float:
+; X86-NOT: __floatunoisf
+;
+; X64-LABEL: uitofp_float:
+; X64: callq __floatunoisf at PLT
%res = uitofp i129 %a to float
ret float %res
}
define double @uitofp_double(i129 %a) nounwind {
-; CHECK-LABEL: uitofp_double:
-; CHECK-NOT: call
+; X86-LABEL: uitofp_double:
+; X86-NOT: __floatunoidf
+;
+; X64-LABEL: uitofp_double:
+; X64: callq __floatunoidf at PLT
%res = uitofp i129 %a to double
ret double %res
}
define fp128 @uitofp_fp128(i129 %a) nounwind {
-; CHECK-LABEL: uitofp_fp128:
-; CHECK-NOT: call
+; X86-LABEL: uitofp_fp128:
+; X86-NOT: __floatunoitf
+;
+; X64-LABEL: uitofp_fp128:
+; X64: callq __floatunoitf at PLT
%res = uitofp i129 %a to fp128
ret fp128 %res
}
-; higher sizes
+; i257 is wider than MaxLargeFPConvertBitWidthSupported=256, so the FP conversion
+; is expanded inline. The inline expansion uses i256 multiply/shift libcalls.
define i257 @fptosi257_double(double %a) nounwind {
-; CHECK-LABEL: fptosi257_double:
-; CHECK-NOT: call
+; X86-LABEL: fptosi257_double:
+; X86-NOT: __fixdfoi
+;
+; X64-LABEL: fptosi257_double:
+; X64-NOT: __fixdfoi
%res = fptosi double %a to i257
ret i257 %res
}
diff --git a/llvm/test/CodeGen/X86/i128-sdiv.ll b/llvm/test/CodeGen/X86/i128-sdiv.ll
index f78e34ef60569..0cbe783b69c0e 100644
--- a/llvm/test/CodeGen/X86/i128-sdiv.ll
+++ b/llvm/test/CodeGen/X86/i128-sdiv.ll
@@ -118,8 +118,330 @@ define i128 @test2(i128 %x) nounwind {
define i128 @test3(i128 %x) nounwind {
; X86-LABEL: test3:
-; X86 doesn't have __divti3, so the urem is expanded into a loop.
-; X86: udiv-do-while
+; X86: # %bb.0: # %_udiv-special-cases
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $160, %esp
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: sarl $31, %ebx
+; X86-NEXT: xorl %ebx, %eax
+; X86-NEXT: movl 32(%ebp), %edi
+; X86-NEXT: xorl %ebx, %edi
+; X86-NEXT: movl 28(%ebp), %edx
+; X86-NEXT: xorl %ebx, %edx
+; X86-NEXT: movl 24(%ebp), %esi
+; X86-NEXT: xorl %ebx, %esi
+; X86-NEXT: subl %ebx, %esi
+; X86-NEXT: sbbl %ebx, %edx
+; X86-NEXT: sbbl %ebx, %edi
+; X86-NEXT: sbbl %ebx, %eax
+; X86-NEXT: jne .LBB2_1
+; X86-NEXT: # %bb.2: # %_udiv-special-cases
+; X86-NEXT: bsrl %edi, %ecx
+; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: orl $32, %ecx
+; X86-NEXT: jmp .LBB2_3
+; X86-NEXT: .LBB2_1:
+; X86-NEXT: bsrl %eax, %ecx
+; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: .LBB2_3: # %_udiv-special-cases
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: jne .LBB2_4
+; X86-NEXT: # %bb.5: # %_udiv-special-cases
+; X86-NEXT: bsrl %esi, %ecx
+; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: orl $32, %ecx
+; X86-NEXT: jmp .LBB2_6
+; X86-NEXT: .LBB2_4:
+; X86-NEXT: bsrl %edx, %ecx
+; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: .LBB2_6: # %_udiv-special-cases
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edi, %esi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: jne .LBB2_8
+; X86-NEXT: # %bb.7: # %_udiv-special-cases
+; X86-NEXT: orl $64, %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: .LBB2_8: # %_udiv-special-cases
+; X86-NEXT: movl $61, %ecx
+; X86-NEXT: subl %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %edi
+; X86-NEXT: sbbl %edi, %edi
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: sbbl %edx, %edx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: je .LBB2_9
+; X86-NEXT: # %bb.10: # %select.false.sink
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $127, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: cmpl %esi, %ecx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %edi, %ecx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %eax, %ecx
+; X86-NEXT: sbbl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: setb %cl
+; X86-NEXT: .LBB2_11: # %select.end
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: testb %cl, %cl
+; X86-NEXT: movl $0, %edi
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: jne .LBB2_13
+; X86-NEXT: # %bb.12: # %select.end
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: .LBB2_13: # %select.end
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: notl %ebx
+; X86-NEXT: jne .LBB2_14
+; X86-NEXT: # %bb.20: # %select.end
+; X86-NEXT: movl %esi, %ecx
+; X86-NEXT: xorl $127, %ecx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: je .LBB2_21
+; X86-NEXT: # %bb.18: # %udiv-bb1
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: xorb $127, %cl
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: negb %al
+; X86-NEXT: movsbl %al, %eax
+; X86-NEXT: movl 136(%esp,%eax), %edx
+; X86-NEXT: movl 140(%esp,%eax), %edi
+; X86-NEXT: shldl %cl, %edx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 128(%esp,%eax), %edi
+; X86-NEXT: movl 132(%esp,%eax), %eax
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shll %cl, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl $1, %ebx
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: jb .LBB2_19
+; X86-NEXT: # %bb.15: # %udiv-preheader
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: movzbl %al, %edi
+; X86-NEXT: movl 92(%esp,%edi), %eax
+; X86-NEXT: movl 88(%esp,%edi), %esi
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: shrdl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 80(%esp,%edi), %ebx
+; X86-NEXT: movl 84(%esp,%edi), %edi
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: shrdl %cl, %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shrl %cl, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shrdl %cl, %edi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $3, %eax
+; X86-NEXT: addl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $4, %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: .p2align 4
+; X86-NEXT: .LBB2_16: # %udiv-do-while
+; X86-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %eax
+; X86-NEXT: shldl $1, %edi, %edx
+; X86-NEXT: shldl $1, %ecx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: orl %esi, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl $1, %ebx, %ecx
+; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %ebx
+; X86-NEXT: orl %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl %ecx, %ecx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: sbbl %esi, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: sarl $31, %ecx
+; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: andl $1, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: movl $4, %ebx
+; X86-NEXT: andl %ebx, %edi
+; X86-NEXT: movl $3, %ebx
+; X86-NEXT: andl %ebx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: subl %ecx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl $0, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %esi, %ecx
+; X86-NEXT: sbbl %edi, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: sbbl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: addl $-1, %edi
+; X86-NEXT: adcl $-1, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: adcl $-1, %edx
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %esi, %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: orl %ecx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: jne .LBB2_16
+; X86-NEXT: .LBB2_17: # %udiv-loop-exit
+; X86-NEXT: shldl $1, %ecx, %edi
+; X86-NEXT: shldl $1, %ebx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: leal (%esi,%eax,2), %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: .LBB2_21: # %udiv-end
+; X86-NEXT: xorl %ebx, %edx
+; X86-NEXT: xorl %ebx, %ecx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: xorl %ebx, %edx
+; X86-NEXT: xorl %ebx, %edi
+; X86-NEXT: subl %ebx, %edi
+; X86-NEXT: sbbl %ebx, %edx
+; X86-NEXT: sbbl %ebx, %ecx
+; X86-NEXT: sbbl %ebx, %esi
+; X86-NEXT: movl %edi, (%eax)
+; X86-NEXT: movl %edx, 4(%eax)
+; X86-NEXT: movl %ecx, 8(%eax)
+; X86-NEXT: movl %esi, 12(%eax)
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl $4
+; X86-NEXT: .LBB2_9:
+; X86-NEXT: movb $1, %cl
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: jmp .LBB2_11
+; X86-NEXT: .LBB2_19:
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: jmp .LBB2_17
+; X86-NEXT: .LBB2_14:
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: jmp .LBB2_21
;
; X64-LABEL: test3:
; X64: # %bb.0:
@@ -129,6 +451,7 @@ define i128 @test3(i128 %x) nounwind {
; X64-NEXT: callq __divti3 at PLT
; X64-NEXT: popq %rcx
; X64-NEXT: retq
+; X86 doesn't have __divti3, so the sdiv is expanded into a division loop.
%tmp = sdiv i128 %x, -73786976294838206467
ret i128 %tmp
}
diff --git a/llvm/test/CodeGen/X86/memfold-mov32r0.ll b/llvm/test/CodeGen/X86/memfold-mov32r0.ll
index f7cbf6c33c94c..985b8a597dee2 100644
--- a/llvm/test/CodeGen/X86/memfold-mov32r0.ll
+++ b/llvm/test/CodeGen/X86/memfold-mov32r0.ll
@@ -3,7 +3,7 @@
; CHECK: movq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
define i32 @test() nounwind {
entry:
- %div = udiv i256 0, 0
- store i256 %div, ptr null, align 16
+ %div = udiv i512 0, 0
+ store i512 %div, ptr null, align 16
ret i32 0
}
diff --git a/llvm/test/CodeGen/X86/mul-i1024.ll b/llvm/test/CodeGen/X86/mul-i1024.ll
index bb93e34fda7c4..07096cd3482ec 100644
--- a/llvm/test/CodeGen/X86/mul-i1024.ll
+++ b/llvm/test/CodeGen/X86/mul-i1024.ll
@@ -4817,1126 +4817,668 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; X64-LABEL: test_1024:
; X64: # %bb.0:
; X64-NEXT: pushq %rbp
+; X64-NEXT: movq %rsp, %rbp
; X64-NEXT: pushq %r15
; X64-NEXT: pushq %r14
; X64-NEXT: pushq %r13
; X64-NEXT: pushq %r12
; X64-NEXT: pushq %rbx
-; X64-NEXT: subq $240, %rsp
+; X64-NEXT: andq $-32, %rsp
+; X64-NEXT: subq $1216, %rsp # imm = 0x4C0
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq 40(%rdi), %rbx
-; X64-NEXT: movq 32(%rdi), %r12
-; X64-NEXT: movq 56(%rdi), %r15
-; X64-NEXT: movq 48(%rdi), %r10
-; X64-NEXT: movq (%rsi), %r11
-; X64-NEXT: movq 8(%rsi), %r14
-; X64-NEXT: movq %rsi, %r13
-; X64-NEXT: movq %r10, %rax
-; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %r11
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: movq %r15, %rax
-; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %r11
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %r9
-; X64-NEXT: addq %rcx, %r9
-; X64-NEXT: adcq $0, %rsi
-; X64-NEXT: movq %r10, %rax
-; X64-NEXT: mulq %r14
-; X64-NEXT: movq %rdx, %r10
-; X64-NEXT: movq %rax, %r8
-; X64-NEXT: addq %r9, %r8
-; X64-NEXT: adcq %rsi, %r10
-; X64-NEXT: setb %al
-; X64-NEXT: movzbl %al, %r9d
-; X64-NEXT: movq %r15, %rax
-; X64-NEXT: mulq %r14
-; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: addq %r10, %rsi
-; X64-NEXT: adcq %r9, %rcx
-; X64-NEXT: movq %r12, %rax
-; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %r11
-; X64-NEXT: movq %rdx, %r9
+; X64-NEXT: movq (%rdi), %rax
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: mulq %r11
-; X64-NEXT: movq %rdx, %r10
-; X64-NEXT: movq %rax, %r11
-; X64-NEXT: addq %r9, %r11
-; X64-NEXT: adcq $0, %r10
-; X64-NEXT: movq %r12, %rax
-; X64-NEXT: mulq %r14
-; X64-NEXT: movq %rdx, %r9
-; X64-NEXT: addq %r11, %rax
+; X64-NEXT: movq 8(%rdi), %rax
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %r10, %r9
-; X64-NEXT: setb %r10b
-; X64-NEXT: movq %rbx, %r11
-; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: mulq %r14
-; X64-NEXT: movq %rdx, %rbx
-; X64-NEXT: movq %rax, %r15
-; X64-NEXT: addq %r9, %r15
-; X64-NEXT: movzbl %r10b, %eax
-; X64-NEXT: adcq %rax, %rbx
-; X64-NEXT: addq %rdi, %r15
-; X64-NEXT: adcq %r8, %rbx
-; X64-NEXT: adcq $0, %rsi
-; X64-NEXT: adcq $0, %rcx
-; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq 16(%r13), %r8
-; X64-NEXT: movq %r12, %r10
-; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %r12, %rax
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: movq %rax, %r14
-; X64-NEXT: movq %r11, %rax
-; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %r9
-; X64-NEXT: movq %rax, %r12
-; X64-NEXT: addq %rdi, %r12
-; X64-NEXT: adcq $0, %r9
-; X64-NEXT: movq 24(%r13), %rbp
-; X64-NEXT: movq %r10, %rax
-; X64-NEXT: mulq %rbp
-; X64-NEXT: movq %rdx, %r13
-; X64-NEXT: addq %r12, %rax
-; X64-NEXT: movq %rax, %r12
-; X64-NEXT: adcq %r9, %r13
-; X64-NEXT: setb %r10b
-; X64-NEXT: movq %r11, %rax
-; X64-NEXT: mulq %rbp
-; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: movq %rax, %r9
-; X64-NEXT: addq %r13, %r9
-; X64-NEXT: movzbl %r10b, %eax
-; X64-NEXT: adcq %rax, %rdi
-; X64-NEXT: addq %r15, %r14
-; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %rbx, %r12
-; X64-NEXT: movq %r12, (%rsp) # 8-byte Spill
-; X64-NEXT: adcq $0, %r9
-; X64-NEXT: adcq $0, %rdi
-; X64-NEXT: addq %rsi, %r9
-; X64-NEXT: adcq %rcx, %rdi
-; X64-NEXT: setb %r10b
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; X64-NEXT: movq %r15, %rax
-; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %r11
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %rcx, %rbx
-; X64-NEXT: adcq $0, %rsi
-; X64-NEXT: movq %r15, %rax
-; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %rbp
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: addq %rbx, %rax
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: adcq %rsi, %rcx
-; X64-NEXT: setb %sil
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: mulq %rbp
-; X64-NEXT: addq %rcx, %rax
-; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: movzbl %sil, %eax
-; X64-NEXT: adcq %rax, %rdx
-; X64-NEXT: addq %r9, %r11
-; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %rdi, %rbx
+; X64-NEXT: movq 16(%rdi), %rax
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq 24(%rdi), %rax
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq 96(%rdi), %rax
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq 104(%rdi), %rax
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq 112(%rdi), %rax
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq 120(%rdi), %rax
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq 64(%rdi), %rax
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq 72(%rdi), %rax
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq 80(%rdi), %rax
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq 88(%rdi), %rax
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq 32(%rdi), %rbx
; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movzbl %r10b, %eax
-; X64-NEXT: adcq %rax, %rcx
-; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq $0, %rdx
+; X64-NEXT: movq 40(%rdi), %rdx
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT: movq 16(%r14), %r11
-; X64-NEXT: movq %r11, %rax
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; X64-NEXT: mulq %r10
-; X64-NEXT: movq %rax, %r9
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq 24(%r14), %r8
-; X64-NEXT: movq %r8, %rax
+; X64-NEXT: movq 48(%rdi), %rcx
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq 56(%rdi), %r8
; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %r10
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: addq %rcx, %rdi
-; X64-NEXT: adcq $0, %rsi
-; X64-NEXT: movq %r11, %rax
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; X64-NEXT: mulq %r13
-; X64-NEXT: movq %rdx, %r15
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %rdi, %rbx
-; X64-NEXT: adcq %rsi, %r15
-; X64-NEXT: setb %sil
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: mulq %r13
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: addq %r15, %rdi
-; X64-NEXT: movzbl %sil, %eax
-; X64-NEXT: adcq %rax, %rcx
-; X64-NEXT: movq (%r14), %rbp
-; X64-NEXT: movq %rbp, %rax
-; X64-NEXT: mulq %r10
+; X64-NEXT: movq 96(%rsi), %rax
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq 8(%r14), %r14
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: mulq %r10
-; X64-NEXT: movq %rdx, %r15
-; X64-NEXT: movq %rax, %r12
-; X64-NEXT: addq %rsi, %r12
-; X64-NEXT: adcq $0, %r15
-; X64-NEXT: movq %rbp, %rax
-; X64-NEXT: mulq %r13
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: addq %r12, %rax
+; X64-NEXT: movq 104(%rsi), %rax
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %r15, %rsi
-; X64-NEXT: setb %r10b
-; X64-NEXT: movq %r14, %r15
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: mulq %r13
-; X64-NEXT: movq %rdx, %r12
-; X64-NEXT: movq %rax, %r13
-; X64-NEXT: addq %rsi, %r13
-; X64-NEXT: movzbl %r10b, %eax
-; X64-NEXT: adcq %rax, %r12
-; X64-NEXT: addq %r9, %r13
-; X64-NEXT: adcq %rbx, %r12
-; X64-NEXT: adcq $0, %rdi
-; X64-NEXT: adcq $0, %rcx
-; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rbp, %rax
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %r10
-; X64-NEXT: movq %r14, %rax
+; X64-NEXT: movq 112(%rsi), %rax
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq 120(%rsi), %rax
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq (%rsi), %rax
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq 8(%rsi), %rax
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq 16(%rsi), %rax
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq 24(%rsi), %rax
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq 32(%rsi), %r12
+; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq 40(%rsi), %r13
+; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq 48(%rsi), %r15
+; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq 56(%rsi), %r14
; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %r9
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %rsi, %rbx
-; X64-NEXT: adcq $0, %r9
-; X64-NEXT: movq %rbp, %rax
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT: mulq %r14
-; X64-NEXT: movq %rdx, %rbp
-; X64-NEXT: addq %rbx, %rax
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: adcq %r9, %rbp
-; X64-NEXT: setb %r9b
-; X64-NEXT: movq %r15, %rax
-; X64-NEXT: mulq %r14
-; X64-NEXT: movq %rdx, %r15
-; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: addq %rbp, %rsi
-; X64-NEXT: movzbl %r9b, %eax
-; X64-NEXT: adcq %rax, %r15
-; X64-NEXT: addq %r13, %r10
+; X64-NEXT: movq 64(%rsi), %r9
+; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq 72(%rsi), %rax
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq 80(%rsi), %r10
; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %r12, %rbx
-; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq $0, %rsi
-; X64-NEXT: adcq $0, %r15
-; X64-NEXT: addq %rdi, %rsi
-; X64-NEXT: adcq %rcx, %r15
-; X64-NEXT: setb %r10b
-; X64-NEXT: movq %r11, %rax
-; X64-NEXT: movq %r8, %rdi
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: movq %rax, %r12
-; X64-NEXT: addq %rcx, %r12
-; X64-NEXT: adcq $0, %rdi
-; X64-NEXT: movq %r11, %rax
-; X64-NEXT: movq %r11, %rbp
-; X64-NEXT: mulq %r14
-; X64-NEXT: movq %rdx, %r13
-; X64-NEXT: movq %rax, %r11
-; X64-NEXT: addq %r12, %r11
-; X64-NEXT: adcq %rdi, %r13
-; X64-NEXT: setb %dil
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: movq %r8, %r9
-; X64-NEXT: mulq %r14
-; X64-NEXT: addq %r13, %rax
-; X64-NEXT: movzbl %dil, %ecx
-; X64-NEXT: adcq %rcx, %rdx
-; X64-NEXT: addq %rsi, %rbx
-; X64-NEXT: adcq %r15, %r11
-; X64-NEXT: movzbl %r10b, %ecx
-; X64-NEXT: adcq %rcx, %rax
-; X64-NEXT: adcq $0, %rdx
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
-; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; X64-NEXT: movq 88(%rsi), %r11
; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq (%rsp), %rdx # 8-byte Folded Reload
-; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; X64-NEXT: adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; X64-NEXT: adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; X64-NEXT: adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; X64-NEXT: subq $8, %rsp
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq %rbx, %rsi
+; X64-NEXT: pushq %r11
+; X64-NEXT: pushq %r10
+; X64-NEXT: pushq %rax
+; X64-NEXT: callq __multi5 at PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT: movq 32(%r8), %rcx
-; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rbp, %rax
-; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %r12
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: movq %rax, %r11
-; X64-NEXT: addq %rsi, %r11
-; X64-NEXT: adcq $0, %rdi
-; X64-NEXT: movq 40(%r8), %rbx
-; X64-NEXT: movq %rbp, %rax
-; X64-NEXT: mulq %rbx
-; X64-NEXT: movq %rdx, %r15
-; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: addq %r11, %rsi
-; X64-NEXT: adcq %rdi, %r15
-; X64-NEXT: setb %r10b
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: mulq %rbx
-; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: movq %rax, %r11
-; X64-NEXT: addq %r15, %r11
-; X64-NEXT: movzbl %r10b, %eax
-; X64-NEXT: adcq %rax, %rdi
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rdx, %r15
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rdx, %r13
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: addq %r15, %rbp
-; X64-NEXT: adcq $0, %r13
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: movq %rbx, %rcx
-; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %rbx
-; X64-NEXT: movq %rdx, %r10
-; X64-NEXT: addq %rbp, %rax
-; X64-NEXT: movq %rax, (%rsp) # 8-byte Spill
-; X64-NEXT: adcq %r13, %r10
-; X64-NEXT: setb %bl
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rdx, %r15
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: addq %r10, %rbp
-; X64-NEXT: movzbl %bl, %eax
-; X64-NEXT: adcq %rax, %r15
-; X64-NEXT: addq %r12, %rbp
-; X64-NEXT: adcq %rsi, %r15
-; X64-NEXT: adcq $0, %r11
-; X64-NEXT: adcq $0, %rdi
-; X64-NEXT: movq 48(%r8), %rcx
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: movq %r14, %r12
-; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rdx, %r10
-; X64-NEXT: movq %rax, %r13
-; X64-NEXT: addq %rsi, %r13
-; X64-NEXT: adcq $0, %r10
-; X64-NEXT: movq 56(%r8), %rsi
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %r14
-; X64-NEXT: movq %rax, %r9
-; X64-NEXT: addq %r13, %r9
-; X64-NEXT: adcq %r10, %r14
-; X64-NEXT: setb %r8b
-; X64-NEXT: movq %r12, %rax
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %r13
-; X64-NEXT: addq %r14, %r13
-; X64-NEXT: movzbl %r8b, %eax
-; X64-NEXT: adcq %rax, %rsi
-; X64-NEXT: addq %rbp, %rbx
-; X64-NEXT: adcq %r15, %r9
-; X64-NEXT: adcq $0, %r13
-; X64-NEXT: adcq $0, %rsi
-; X64-NEXT: addq %r11, %r13
-; X64-NEXT: adcq %rdi, %rsi
-; X64-NEXT: setb %r11b
+; X64-NEXT: movq %r12, %r9
+; X64-NEXT: pushq %r14
+; X64-NEXT: pushq %r15
+; X64-NEXT: pushq %r13
+; X64-NEXT: callq __multi5 at PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: movq %rax, %r12
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; X64-NEXT: movq %rbx, %r9
+; X64-NEXT: pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT: pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT: pushq %r14
+; X64-NEXT: callq __multi5 at PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; X64-NEXT: movq %r12, %rdx
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: xorl %r8d, %r8d
+; X64-NEXT: movq %rbx, %r9
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq %r14
+; X64-NEXT: callq __multi5 at PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; X64-NEXT: movq %r13, %rsi
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; X64-NEXT: movq %r15, %rax
-; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rdx, %r10
-; X64-NEXT: movq %rax, %r14
-; X64-NEXT: addq %rdi, %r14
-; X64-NEXT: adcq $0, %r10
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rdx, %r8
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: addq %r14, %rbp
-; X64-NEXT: adcq %r10, %r8
-; X64-NEXT: setb %r10b
-; X64-NEXT: movq %r15, %rax
-; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rdx, %r15
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: addq %r8, %rdi
-; X64-NEXT: movzbl %r10b, %eax
-; X64-NEXT: adcq %rax, %r15
-; X64-NEXT: addq %r13, %r12
-; X64-NEXT: adcq %rsi, %rbp
-; X64-NEXT: movzbl %r11b, %eax
-; X64-NEXT: adcq %rax, %rdi
-; X64-NEXT: adcq $0, %r15
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT: addq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT: adcq %rax, (%rsp) # 8-byte Folded Spill
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
-; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
-; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq $0, %r12
-; X64-NEXT: adcq $0, %rbp
-; X64-NEXT: adcq $0, %rdi
-; X64-NEXT: adcq $0, %r15
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
-; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
-; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
-; X64-NEXT: setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; X64-NEXT: movq %r10, %rax
+; X64-NEXT: movq %r15, %rdx
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: xorl %r8d, %r8d
+; X64-NEXT: movq %rbx, %r9
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq %r14
+; X64-NEXT: callq __multi5 at PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq %r13, %rsi
+; X64-NEXT: movq %r15, %rdx
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: xorl %r8d, %r8d
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; X64-NEXT: movq %r13, %r9
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq $0
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT: mulq %r14
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %rbp
+; X64-NEXT: pushq %r14
+; X64-NEXT: callq __multi5 at PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT: movq %r12, %rdx
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: xorl %r8d, %r8d
+; X64-NEXT: movq %r13, %r9
+; X64-NEXT: movq %r13, %r12
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq %r14
+; X64-NEXT: callq __multi5 at PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; X64-NEXT: movq %r13, %rsi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; X64-NEXT: movq %r15, %rdx
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: xorl %r8d, %r8d
+; X64-NEXT: movq %rbx, %r9
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT: callq __multi5 at PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq %r13, %rsi
+; X64-NEXT: movq %r15, %rdx
+; X64-NEXT: movq %r15, %rbx
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: xorl %r8d, %r8d
+; X64-NEXT: movq %r12, %r9
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq %r14
+; X64-NEXT: callq __multi5 at PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq %r13, %rsi
+; X64-NEXT: movq %r15, %rdx
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: xorl %r8d, %r8d
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: mulq %r14
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %r8
-; X64-NEXT: addq %rcx, %r8
-; X64-NEXT: adcq $0, %rsi
-; X64-NEXT: movq %r10, %rax
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; X64-NEXT: mulq %r11
-; X64-NEXT: movq %rdx, %r10
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %r8, %rbx
-; X64-NEXT: adcq %rsi, %r10
-; X64-NEXT: setb %r8b
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: mulq %r11
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: addq %r10, %rsi
-; X64-NEXT: movzbl %r8b, %eax
-; X64-NEXT: adcq %rax, %rcx
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT: callq __multi5 at PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq %r13, %rsi
+; X64-NEXT: movq %r15, %rdx
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: xorl %r8d, %r8d
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: mulq %r14
-; X64-NEXT: movq %rdx, %r8
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; X64-NEXT: movq %r12, %rax
-; X64-NEXT: mulq %r14
-; X64-NEXT: movq %rdx, %r10
-; X64-NEXT: movq %rax, %r14
-; X64-NEXT: addq %r8, %r14
-; X64-NEXT: adcq $0, %r10
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: movq %r11, %r13
-; X64-NEXT: mulq %r11
-; X64-NEXT: movq %rdx, %r8
-; X64-NEXT: addq %r14, %rax
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %r10, %r8
-; X64-NEXT: setb %r10b
-; X64-NEXT: movq %r12, %rax
-; X64-NEXT: movq %r12, %r11
-; X64-NEXT: mulq %r13
-; X64-NEXT: movq %rdx, %r12
-; X64-NEXT: movq %rax, %r13
-; X64-NEXT: addq %r8, %r13
-; X64-NEXT: movzbl %r10b, %eax
-; X64-NEXT: adcq %rax, %r12
-; X64-NEXT: addq %rbp, %r13
-; X64-NEXT: adcq %rbx, %r12
-; X64-NEXT: adcq $0, %rsi
-; X64-NEXT: adcq $0, %rcx
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; X64-NEXT: mulq %r10
-; X64-NEXT: movq %rdx, %r8
-; X64-NEXT: movq %rax, %r14
-; X64-NEXT: movq %r11, %rax
-; X64-NEXT: movq %r11, %rbx
-; X64-NEXT: mulq %r10
-; X64-NEXT: movq %rdx, %rbp
-; X64-NEXT: movq %rax, %r10
-; X64-NEXT: addq %r8, %r10
-; X64-NEXT: adcq $0, %rbp
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rdx, %r8
-; X64-NEXT: movq %rax, %r11
-; X64-NEXT: addq %r10, %r11
-; X64-NEXT: adcq %rbp, %r8
-; X64-NEXT: setb %r10b
-; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: mulq %r9
-; X64-NEXT: movq %r9, %rbp
-; X64-NEXT: movq %rdx, %r9
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %r8, %rbx
-; X64-NEXT: movzbl %r10b, %eax
-; X64-NEXT: adcq %rax, %r9
-; X64-NEXT: addq %r13, %r14
-; X64-NEXT: movq %r14, %r13
-; X64-NEXT: adcq %r12, %r11
-; X64-NEXT: adcq $0, %rbx
-; X64-NEXT: adcq $0, %r9
-; X64-NEXT: addq %rsi, %rbx
-; X64-NEXT: adcq %rcx, %r9
-; X64-NEXT: setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT: callq __multi5 at PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT: movq %r14, %rax
+; X64-NEXT: movq %r14, %rsi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT: movq %r13, %rcx
+; X64-NEXT: movq %r15, %r8
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT: pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT: pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT: pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT: callq __multi5 at PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %r10
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: xorl %r8d, %r8d
+; X64-NEXT: movq %r13, %r9
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq %r15
+; X64-NEXT: callq __multi5 at PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; X64-NEXT: movq %r15, %rsi
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; X64-NEXT: movq %r12, %rax
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %r8
-; X64-NEXT: addq %rcx, %r8
-; X64-NEXT: adcq $0, %rsi
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: mulq %rbp
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: addq %r8, %rax
-; X64-NEXT: movq %rax, %r8
-; X64-NEXT: adcq %rsi, %rcx
-; X64-NEXT: setb %sil
-; X64-NEXT: movq %r12, %rax
-; X64-NEXT: mulq %rbp
-; X64-NEXT: addq %rcx, %rax
-; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: movzbl %sil, %eax
-; X64-NEXT: adcq %rax, %rdx
-; X64-NEXT: addq %rbx, %r10
-; X64-NEXT: adcq %r9, %r8
-; X64-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; X64-NEXT: adcq %rax, %rcx
-; X64-NEXT: adcq $0, %rdx
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT: addq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT: adcq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; X64-NEXT: adcq %rdi, %r13
-; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %r15, %r11
-; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; X64-NEXT: adcq %rax, %r10
-; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq $0, %r8
-; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq $0, %rcx
-; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq $0, %rdx
-; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r12, %rdx
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: xorl %r8d, %r8d
+; X64-NEXT: movq %r13, %r9
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq %rbx
+; X64-NEXT: callq __multi5 at PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq %r15, %rsi
+; X64-NEXT: movq %r12, %rdx
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: xorl %r8d, %r8d
+; X64-NEXT: movq %r14, %r9
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq $0
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; X64-NEXT: pushq %r12
+; X64-NEXT: callq __multi5 at PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: xorl %r8d, %r8d
+; X64-NEXT: movq %r14, %r9
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq %r12
+; X64-NEXT: callq __multi5 at PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT: movq %r14, %rsi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; X64-NEXT: movq %rbx, %rdx
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: xorl %r8d, %r8d
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT: callq __multi5 at PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq %r14, %rsi
+; X64-NEXT: movq %rbx, %rdx
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: xorl %r8d, %r8d
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT: callq __multi5 at PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: xorl %r8d, %r8d
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT: movq %r14, %r9
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq $0
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; X64-NEXT: pushq %rbx
+; X64-NEXT: callq __multi5 at PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; X64-NEXT: movq %r15, %rsi
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; X64-NEXT: movq 64(%r13), %r15
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: mulq %r15
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %r11
+; X64-NEXT: movq %r13, %rdx
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: xorl %r8d, %r8d
+; X64-NEXT: movq %r14, %r9
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq %rbx
+; X64-NEXT: callq __multi5 at PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq %r15, %rsi
+; X64-NEXT: movq %r13, %rdx
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: xorl %r8d, %r8d
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; X64-NEXT: movq %rbx, %r9
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq $0
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; X64-NEXT: pushq %r13
+; X64-NEXT: callq __multi5 at PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: xorl %r8d, %r8d
+; X64-NEXT: movq %rbx, %r9
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq %r13
+; X64-NEXT: callq __multi5 at PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; X64-NEXT: movq %r15, %rsi
+; X64-NEXT: movq %r12, %rdx
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: xorl %r8d, %r8d
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; X64-NEXT: movq %rbx, %r9
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq $0
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: mulq %r15
-; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: movq %rax, %r8
-; X64-NEXT: addq %rsi, %r8
+; X64-NEXT: pushq %r14
+; X64-NEXT: callq __multi5 at PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq %r15, %rsi
+; X64-NEXT: movq %r12, %rdx
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: xorl %r8d, %r8d
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT: callq __multi5 at PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq %r15, %rsi
+; X64-NEXT: movq %r12, %rdx
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: xorl %r8d, %r8d
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq %r13
+; X64-NEXT: callq __multi5 at PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq %r15, %rsi
+; X64-NEXT: movq %r12, %rdx
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: xorl %r8d, %r8d
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT: callq __multi5 at PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq %rbx, %rsi
+; X64-NEXT: movq %r14, %rdx
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: xorl %r8d, %r8d
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT: callq __multi5 at PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq %rbx, %rsi
+; X64-NEXT: movq %r14, %rdx
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: xorl %r8d, %r8d
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT: callq __multi5 at PLT
+; X64-NEXT: addq $32, %rsp
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT: addq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rdx
; X64-NEXT: adcq $0, %rdi
-; X64-NEXT: movq 72(%r13), %rsi
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rdx, %r10
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %r8, %rbx
-; X64-NEXT: adcq %rdi, %r10
-; X64-NEXT: setb %r8b
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: movq %rax, %r9
-; X64-NEXT: addq %r10, %r9
-; X64-NEXT: movzbl %r8b, %eax
-; X64-NEXT: adcq %rax, %rdi
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; X64-NEXT: movq %r12, %rax
-; X64-NEXT: movq %r15, %rcx
-; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %r15
-; X64-NEXT: movq %rdx, %r8
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; X64-NEXT: movq %r15, %rax
-; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rdx, %r10
-; X64-NEXT: movq %rax, %r14
-; X64-NEXT: addq %r8, %r14
-; X64-NEXT: adcq $0, %r10
-; X64-NEXT: movq %r12, %rax
-; X64-NEXT: movq %r12, %rcx
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rdx, %r8
-; X64-NEXT: addq %r14, %rax
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %r10, %r8
-; X64-NEXT: setb %r10b
-; X64-NEXT: movq %r15, %rax
-; X64-NEXT: movq %r15, %r12
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rdx, %r15
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: addq %r8, %rbp
-; X64-NEXT: movzbl %r10b, %eax
-; X64-NEXT: adcq %rax, %r15
-; X64-NEXT: addq %r11, %rbp
-; X64-NEXT: adcq %rbx, %r15
+; X64-NEXT: adcq $0, %rcx
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; X64-NEXT: addq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT: adcq $0, %rax
+; X64-NEXT: adcq $0, %r11
+; X64-NEXT: addq %rdi, %rax
+; X64-NEXT: adcq %rcx, %r11
+; X64-NEXT: setb %cl
+; X64-NEXT: movzbl %cl, %ecx
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9
+; X64-NEXT: addq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %r11
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
; X64-NEXT: adcq $0, %r9
-; X64-NEXT: adcq $0, %rdi
-; X64-NEXT: movq 80(%r13), %r14
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: mulq %r14
-; X64-NEXT: movq %rdx, %r8
-; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: movq %r12, %rax
-; X64-NEXT: mulq %r14
-; X64-NEXT: movq %rdx, %r10
-; X64-NEXT: movq %rax, %r11
-; X64-NEXT: addq %r8, %r11
-; X64-NEXT: adcq $0, %r10
-; X64-NEXT: movq 88(%r13), %rbx
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: mulq %rbx
-; X64-NEXT: movq %rdx, %r8
-; X64-NEXT: addq %r11, %rax
-; X64-NEXT: movq %rax, %r11
-; X64-NEXT: adcq %r10, %r8
-; X64-NEXT: setb %r10b
-; X64-NEXT: movq %r12, %rax
-; X64-NEXT: mulq %rbx
-; X64-NEXT: movq %rdx, %r12
-; X64-NEXT: movq %rax, %r13
-; X64-NEXT: addq %r8, %r13
-; X64-NEXT: movzbl %r10b, %eax
-; X64-NEXT: adcq %rax, %r12
-; X64-NEXT: addq %rbp, %rsi
-; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %r15, %r11
-; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq $0, %r13
-; X64-NEXT: adcq $0, %r12
-; X64-NEXT: addq %r9, %r13
-; X64-NEXT: adcq %rdi, %r12
-; X64-NEXT: setb %bpl
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: mulq %r14
-; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: mulq %r14
-; X64-NEXT: movq %rdx, %r8
-; X64-NEXT: movq %rax, %r10
-; X64-NEXT: addq %rdi, %r10
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r8
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbx
+; X64-NEXT: addq {{[0-9]+}}(%rsp), %r10
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rbx
; X64-NEXT: adcq $0, %r8
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: movq %r9, %r15
-; X64-NEXT: mulq %rbx
-; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: addq %r10, %rax
-; X64-NEXT: movq %rax, %r10
-; X64-NEXT: adcq %r8, %rdi
-; X64-NEXT: setb %r8b
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: movq %rcx, %r9
-; X64-NEXT: mulq %rbx
-; X64-NEXT: addq %rdi, %rax
-; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: movzbl %r8b, %eax
-; X64-NEXT: adcq %rax, %rdx
-; X64-NEXT: addq %r13, %rsi
-; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %r12, %r10
+; X64-NEXT: adcq $0, %rdi
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r14
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r15
+; X64-NEXT: addq {{[0-9]+}}(%rsp), %r10
; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movzbl %bpl, %eax
-; X64-NEXT: adcq %rax, %rcx
-; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq $0, %rdx
-; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT: imulq %rax, %rbx
-; X64-NEXT: movq %rax, %r12
-; X64-NEXT: mulq %r14
-; X64-NEXT: movq %rax, %r8
-; X64-NEXT: addq %rbx, %rdx
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT: imulq %rcx, %r14
-; X64-NEXT: addq %rdx, %r14
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT: movq %rax, %r10
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT: imulq %rsi, %r10
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; X64-NEXT: mulq %r11
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: addq %r10, %rdx
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; X64-NEXT: imulq %r11, %rbx
-; X64-NEXT: addq %rdx, %rbx
-; X64-NEXT: addq %r8, %rdi
-; X64-NEXT: adcq %r14, %rbx
-; X64-NEXT: movq %r11, %rax
-; X64-NEXT: mulq %r12
-; X64-NEXT: movq %rdx, %r8
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: mulq %r12
-; X64-NEXT: movq %rdx, %r10
-; X64-NEXT: movq %rax, %r14
-; X64-NEXT: addq %r8, %r14
-; X64-NEXT: adcq $0, %r10
-; X64-NEXT: movq %r11, %rax
-; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rdx, %r8
-; X64-NEXT: addq %r14, %rax
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %r10, %r8
-; X64-NEXT: setb %r10b
-; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %r14
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rbx
+; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq $0, %r14
+; X64-NEXT: adcq $0, %r15
; X64-NEXT: addq %r8, %r14
-; X64-NEXT: movzbl %r10b, %eax
-; X64-NEXT: adcq %rax, %rsi
-; X64-NEXT: addq %rdi, %r14
-; X64-NEXT: adcq %rbx, %rsi
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT: movq 112(%rcx), %r10
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: mulq %r10
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; X64-NEXT: imulq %r11, %r10
-; X64-NEXT: addq %rdx, %r10
-; X64-NEXT: movq 120(%rcx), %rax
-; X64-NEXT: imulq %rdi, %rax
-; X64-NEXT: movq %rdi, %r12
-; X64-NEXT: addq %rax, %r10
-; X64-NEXT: movq 96(%rcx), %r13
-; X64-NEXT: movq 104(%rcx), %r8
-; X64-NEXT: movq %r15, %rax
-; X64-NEXT: movq %r15, %rbx
-; X64-NEXT: imulq %r8, %rbx
-; X64-NEXT: mulq %r13
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: addq %rbx, %rdx
-; X64-NEXT: imulq %r13, %r9
-; X64-NEXT: addq %rdx, %r9
-; X64-NEXT: addq %rbp, %rdi
-; X64-NEXT: adcq %r10, %r9
-; X64-NEXT: movq %r9, %r15
-; X64-NEXT: movq %r13, %rax
-; X64-NEXT: mulq %r12
-; X64-NEXT: movq %rdx, %r10
-; X64-NEXT: movq %rax, %r9
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: mulq %r12
-; X64-NEXT: movq %rdx, %rbp
-; X64-NEXT: movq %rax, %r12
-; X64-NEXT: addq %r10, %r12
-; X64-NEXT: adcq $0, %rbp
-; X64-NEXT: movq %r13, %rax
-; X64-NEXT: mulq %r11
+; X64-NEXT: adcq %rdi, %r15
+; X64-NEXT: setb %dil
+; X64-NEXT: movzbl %dil, %edi
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r8
+; X64-NEXT: addq {{[0-9]+}}(%rsp), %r14
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %r15
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: adcq $0, %r8
+; X64-NEXT: addq {{[0-9]+}}(%rsp), %r14
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %r15
+; X64-NEXT: adcq %rsi, %rdi
+; X64-NEXT: adcq %rdx, %r8
+; X64-NEXT: adcq $0, %rax
+; X64-NEXT: adcq $0, %r11
+; X64-NEXT: adcq $0, %rcx
+; X64-NEXT: adcq $0, %r9
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT: addq {{[0-9]+}}(%rsp), %rdx
; X64-NEXT: movq %rdx, %r10
-; X64-NEXT: movq %rax, %r13
-; X64-NEXT: addq %r12, %r13
-; X64-NEXT: adcq %rbp, %r10
-; X64-NEXT: setb %bl
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: mulq %r11
-; X64-NEXT: addq %r10, %rax
-; X64-NEXT: movzbl %bl, %r8d
-; X64-NEXT: adcq %r8, %rdx
-; X64-NEXT: addq %rdi, %rax
-; X64-NEXT: adcq %r15, %rdx
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; X64-NEXT: adcq %r14, %rax
-; X64-NEXT: adcq %rsi, %rdx
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
-; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
-; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT: movq 80(%r14), %r10
-; X64-NEXT: movq %r10, %rax
-; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; X64-NEXT: mulq %rbx
-; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq 88(%r14), %r15
-; X64-NEXT: movq %r15, %rax
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT: movq %rsi, %rbx
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT: adcq $0, %rdx
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT: adcq $0, %rsi
+; X64-NEXT: addq {{[0-9]+}}(%rsp), %r10
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rbx
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r12
+; X64-NEXT: adcq $0, %r12
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r13
+; X64-NEXT: adcq $0, %r13
+; X64-NEXT: addq %rdx, %r12
+; X64-NEXT: adcq %rsi, %r13
+; X64-NEXT: setb %dl
+; X64-NEXT: addq {{[0-9]+}}(%rsp), %r12
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %r13
+; X64-NEXT: movzbl %dl, %edx
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT: adcq $0, %rsi
+; X64-NEXT: addq {{[0-9]+}}(%rsp), %r14
+; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %r15
; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %rbx
-; X64-NEXT: movq %rdx, %r8
-; X64-NEXT: movq %rax, %r9
-; X64-NEXT: addq %rcx, %r9
-; X64-NEXT: adcq $0, %r8
-; X64-NEXT: movq %r10, %rax
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; X64-NEXT: mulq %r10
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: addq %r9, %rdi
-; X64-NEXT: adcq %r8, %rcx
-; X64-NEXT: setb %r8b
-; X64-NEXT: movq %r15, %rax
-; X64-NEXT: mulq %r10
-; X64-NEXT: movq %rdx, %r15
-; X64-NEXT: movq %rax, %r12
-; X64-NEXT: addq %rcx, %r12
-; X64-NEXT: movzbl %r8b, %eax
-; X64-NEXT: adcq %rax, %r15
-; X64-NEXT: movq 64(%r14), %rcx
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: mulq %rbx
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdx, %r11
-; X64-NEXT: movq 72(%r14), %r8
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: mulq %rbx
-; X64-NEXT: movq %rdx, %rbx
-; X64-NEXT: movq %rax, %r14
-; X64-NEXT: addq %r11, %r14
-; X64-NEXT: adcq $0, %rbx
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: movq %rcx, %r9
-; X64-NEXT: mulq %r10
-; X64-NEXT: movq %rdx, %r11
-; X64-NEXT: addq %r14, %rax
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %rbx, %r11
-; X64-NEXT: setb %cl
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: mulq %r10
-; X64-NEXT: movq %rdx, %rbx
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: addq %r11, %rbp
-; X64-NEXT: movzbl %cl, %eax
-; X64-NEXT: adcq %rax, %rbx
-; X64-NEXT: addq %rsi, %rbp
-; X64-NEXT: adcq %rdi, %rbx
+; X64-NEXT: adcq %rdi, %r10
+; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq %r8, %rbx
+; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: adcq $0, %r12
-; X64-NEXT: adcq $0, %r15
-; X64-NEXT: movq %r9, %rcx
-; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT: mulq %r14
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %r9
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: movq %r8, %r10
-; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %r14
-; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: movq %rax, %r11
-; X64-NEXT: addq %rsi, %r11
-; X64-NEXT: adcq $0, %rdi
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %r13
-; X64-NEXT: addq %r11, %rax
-; X64-NEXT: movq %rax, %r11
-; X64-NEXT: adcq %rdi, %r13
-; X64-NEXT: setb %cl
-; X64-NEXT: movq %r10, %rax
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: addq %r13, %rdi
-; X64-NEXT: movzbl %cl, %eax
-; X64-NEXT: adcq %rax, %rsi
-; X64-NEXT: addq %rbp, %r9
-; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %rbx, %r11
-; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq $0, %rdi
+; X64-NEXT: adcq $0, %r13
+; X64-NEXT: adcq $0, %rdx
; X64-NEXT: adcq $0, %rsi
-; X64-NEXT: addq %r12, %rdi
-; X64-NEXT: adcq %r15, %rsi
-; X64-NEXT: setb %cl
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; X64-NEXT: movq %r10, %rax
-; X64-NEXT: mulq %r14
-; X64-NEXT: movq %rdx, %r9
-; X64-NEXT: movq %rax, %r15
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; X64-NEXT: movq %rbp, %rax
-; X64-NEXT: mulq %r14
-; X64-NEXT: movq %rdx, %r11
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %r9, %rbx
-; X64-NEXT: adcq $0, %r11
-; X64-NEXT: movq %r10, %rax
-; X64-NEXT: movq %r8, %r9
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %r13
-; X64-NEXT: addq %rbx, %rax
-; X64-NEXT: movq %rax, %r10
+; X64-NEXT: addq %rax, %r12
; X64-NEXT: adcq %r11, %r13
-; X64-NEXT: setb %r8b
-; X64-NEXT: movq %rbp, %rax
-; X64-NEXT: mulq %r9
-; X64-NEXT: addq %r13, %rax
-; X64-NEXT: movq %rax, %r11
-; X64-NEXT: movzbl %r8b, %eax
-; X64-NEXT: adcq %rax, %rdx
-; X64-NEXT: addq %rdi, %r15
-; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %rsi, %r10
+; X64-NEXT: adcq %rcx, %rdx
+; X64-NEXT: adcq %r9, %rsi
+; X64-NEXT: setb %al
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT: addq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT: movq %rcx, %r8
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT: movq %rcx, %r9
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT: adcq $0, %rcx
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: adcq $0, %rdi
+; X64-NEXT: addq {{[0-9]+}}(%rsp), %r8
+; X64-NEXT: movq %r8, %r10
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %r9
+; X64-NEXT: movq %r9, %r11
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r8
+; X64-NEXT: adcq $0, %r8
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9
+; X64-NEXT: adcq $0, %r9
+; X64-NEXT: addq %rcx, %r8
+; X64-NEXT: adcq %rdi, %r9
+; X64-NEXT: setb %cl
+; X64-NEXT: addq {{[0-9]+}}(%rsp), %r8
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %r9
+; X64-NEXT: movzbl %cl, %ecx
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT: movq %rcx, %rdi
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT: adcq $0, %rcx
+; X64-NEXT: addq {{[0-9]+}}(%rsp), %r12
+; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %r13
+; X64-NEXT: adcq %rdx, %r10
; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movzbl %cl, %eax
-; X64-NEXT: adcq %rax, %r11
+; X64-NEXT: adcq %rsi, %r11
; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq $0, %rdx
-; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT: movq 96(%rcx), %rsi
-; X64-NEXT: imulq %rsi, %r9
-; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: mulq %r14
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: addq %r9, %rdx
-; X64-NEXT: movq 104(%rcx), %r9
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: imulq %r9, %rax
-; X64-NEXT: addq %rdx, %rax
-; X64-NEXT: movq %rax, %r11
-; X64-NEXT: movq 112(%rcx), %rax
-; X64-NEXT: movq %rcx, %r14
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: adcq %rax, %r8
+; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq $0, %r9
+; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq $0, %rdi
+; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq $0, %rcx
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT: addq {{[0-9]+}}(%rsp), %rax
; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; X64-NEXT: imulq %r10, %rcx
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; X64-NEXT: mulq %rbx
-; X64-NEXT: movq %rax, %r8
-; X64-NEXT: addq %rcx, %rdx
-; X64-NEXT: movq 120(%r14), %r13
-; X64-NEXT: imulq %rbx, %r13
-; X64-NEXT: addq %rdx, %r13
-; X64-NEXT: addq %rdi, %r8
-; X64-NEXT: adcq %r11, %r13
-; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: movq %rbx, %rcx
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rdx, %rdi
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT: adcq $0, %rdx
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT: adcq $0, %rsi
+; X64-NEXT: addq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rax
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %r10, %rax
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rdx, %r11
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %rdi, %rbx
-; X64-NEXT: adcq $0, %r11
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %r12
-; X64-NEXT: addq %rbx, %r12
-; X64-NEXT: adcq %r11, %rcx
-; X64-NEXT: setb %sil
-; X64-NEXT: movq %r10, %rax
-; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rdx, %rbx
-; X64-NEXT: movq %rax, %r9
-; X64-NEXT: addq %rcx, %r9
-; X64-NEXT: movzbl %sil, %eax
-; X64-NEXT: adcq %rax, %rbx
-; X64-NEXT: addq %r8, %r9
-; X64-NEXT: adcq %r13, %rbx
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; X64-NEXT: imulq %r10, %rdi
-; X64-NEXT: movq %r10, %rax
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: addq %rdi, %rdx
-; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT: imulq %r14, %rax
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT: adcq $0, %rax
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT: adcq $0, %rcx
; X64-NEXT: addq %rdx, %rax
-; X64-NEXT: movq %rax, %r13
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT: imulq %r8, %rsi
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rax, %r11
-; X64-NEXT: addq %rsi, %rdx
-; X64-NEXT: imulq %rdi, %rbp
-; X64-NEXT: addq %rdx, %rbp
-; X64-NEXT: addq %rcx, %r11
-; X64-NEXT: adcq %r13, %rbp
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: mulq %r10
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %r13
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: movq %r8, %r15
-; X64-NEXT: mulq %r10
-; X64-NEXT: movq %rdx, %r8
-; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: addq %rcx, %rsi
+; X64-NEXT: adcq %rsi, %rcx
+; X64-NEXT: setb %dl
+; X64-NEXT: addq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT: movzbl %dl, %edx
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT: adcq $0, %rsi
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: addq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r12
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %r12
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r15
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %r15
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r14
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %r14
+; X64-NEXT: addq %rax, %rdi
+; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq %rcx, %r12
+; X64-NEXT: adcq %rdx, %r15
+; X64-NEXT: adcq %rsi, %r14
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbx
+; X64-NEXT: addq {{[0-9]+}}(%rsp), %rbx
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %r11
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT: adcq $0, %rax
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT: adcq $0, %rcx
+; X64-NEXT: addq {{[0-9]+}}(%rsp), %rbx
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %r11
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: adcq $0, %rdi
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r8
; X64-NEXT: adcq $0, %r8
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: mulq %r14
-; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: addq %rsi, %rcx
-; X64-NEXT: adcq %r8, %rdi
-; X64-NEXT: setb %sil
-; X64-NEXT: movq %r15, %rax
-; X64-NEXT: mulq %r14
-; X64-NEXT: addq %rdi, %rax
-; X64-NEXT: movzbl %sil, %esi
-; X64-NEXT: adcq %rsi, %rdx
-; X64-NEXT: addq %r11, %rax
-; X64-NEXT: adcq %rbp, %rdx
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; X64-NEXT: adcq %r12, %rcx
+; X64-NEXT: addq %rax, %rdi
+; X64-NEXT: adcq %rcx, %r8
+; X64-NEXT: setb %al
+; X64-NEXT: addq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %r8
+; X64-NEXT: movzbl %al, %r10d
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %r10
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9
+; X64-NEXT: adcq $0, %r9
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT: addq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT: addq %rdi, %rsi
+; X64-NEXT: adcq %r8, %rdx
+; X64-NEXT: adcq %r10, %rcx
; X64-NEXT: adcq %r9, %rax
-; X64-NEXT: adcq %rbx, %rdx
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r8
+; X64-NEXT: addq {{[0-9]+}}(%rsp), %r8
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; X64-NEXT: movq %rsi, %r8
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
-; X64-NEXT: movq %rdi, %r9
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
+; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; X64-NEXT: adcq %r12, %rdx
+; X64-NEXT: adcq %r15, %rcx
+; X64-NEXT: adcq %r14, %rax
+; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; X64-NEXT: adcq %r13, %rdi
+; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT: movq %rdi, (%rsi)
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT: movq %rdi, 8(%rsi)
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT: movq %rdi, 16(%rsi)
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT: movq %rdi, 24(%rsi)
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT: movq %rdi, 32(%rsi)
-; X64-NEXT: movq (%rsp), %rdi # 8-byte Reload
-; X64-NEXT: movq %rdi, 40(%rsi)
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT: movq %rdi, 48(%rsi)
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT: movq %rdi, 56(%rsi)
-; X64-NEXT: movq %r8, 64(%rsi)
-; X64-NEXT: movq %r9, 72(%rsi)
-; X64-NEXT: movq %r10, 80(%rsi)
-; X64-NEXT: movq %r11, 88(%rsi)
-; X64-NEXT: movq %r13, 96(%rsi)
-; X64-NEXT: movq %rcx, 104(%rsi)
-; X64-NEXT: movq %rax, 112(%rsi)
-; X64-NEXT: movq %rdx, 120(%rsi)
-; X64-NEXT: addq $240, %rsp
+; X64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; X64-NEXT: movq %r10, 16(%r9)
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; X64-NEXT: movq %r10, 24(%r9)
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; X64-NEXT: movq %r10, 32(%r9)
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; X64-NEXT: movq %r10, 40(%r9)
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; X64-NEXT: movq %r10, 48(%r9)
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; X64-NEXT: movq %r10, 56(%r9)
+; X64-NEXT: movq %r8, 64(%r9)
+; X64-NEXT: movq %rdi, 72(%r9)
+; X64-NEXT: movq %rbx, 80(%r9)
+; X64-NEXT: movq %r11, 88(%r9)
+; X64-NEXT: movq %rsi, 96(%r9)
+; X64-NEXT: movq %rdx, 104(%r9)
+; X64-NEXT: movq %rcx, 112(%r9)
+; X64-NEXT: movq %rax, 120(%r9)
+; X64-NEXT: movaps %xmm0, (%r9)
+; X64-NEXT: leaq -40(%rbp), %rsp
; X64-NEXT: popq %rbx
; X64-NEXT: popq %r12
; X64-NEXT: popq %r13
diff --git a/llvm/test/CodeGen/X86/mul-i512.ll b/llvm/test/CodeGen/X86/mul-i512.ll
index 2421aabdbcd99..b91a6e184c400 100644
--- a/llvm/test/CodeGen/X86/mul-i512.ll
+++ b/llvm/test/CodeGen/X86/mul-i512.ll
@@ -1174,277 +1174,151 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
; X64-LABEL: test_512:
; X64: # %bb.0:
; X64-NEXT: pushq %rbp
+; X64-NEXT: movq %rsp, %rbp
; X64-NEXT: pushq %r15
; X64-NEXT: pushq %r14
; X64-NEXT: pushq %r13
; X64-NEXT: pushq %r12
; X64-NEXT: pushq %rbx
-; X64-NEXT: pushq %rax
-; X64-NEXT: movq %rdx, (%rsp) # 8-byte Spill
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq (%rdi), %rbx
-; X64-NEXT: movq 8(%rdi), %rdi
-; X64-NEXT: movq 24(%rax), %r14
-; X64-NEXT: movq 16(%rax), %rax
-; X64-NEXT: movq (%rsi), %r8
-; X64-NEXT: movq 8(%rsi), %r11
-; X64-NEXT: movq %rsi, %r13
-; X64-NEXT: movq %rax, %rsi
+; X64-NEXT: andq $-32, %rsp
+; X64-NEXT: subq $288, %rsp # imm = 0x120
+; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq 32(%rdi), %rax
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %r9
-; X64-NEXT: movq %rax, %r10
-; X64-NEXT: addq %rcx, %r10
-; X64-NEXT: adcq $0, %r9
-; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: mulq %r11
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %r15
-; X64-NEXT: addq %r10, %r15
-; X64-NEXT: adcq %r9, %rcx
-; X64-NEXT: setb %al
-; X64-NEXT: movzbl %al, %esi
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %r11
-; X64-NEXT: movq %rax, %r9
-; X64-NEXT: addq %rcx, %r9
-; X64-NEXT: adcq %rsi, %rdx
-; X64-NEXT: movq %rdx, %r12
-; X64-NEXT: movq %rbx, %rsi
-; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq 40(%rdi), %rax
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %rbx
-; X64-NEXT: movq %rax, %r14
-; X64-NEXT: addq %rcx, %r14
-; X64-NEXT: adcq $0, %rbx
-; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: movq %rsi, %r8
-; X64-NEXT: mulq %r11
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: addq %r14, %rax
+; X64-NEXT: movq 48(%rdi), %rax
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %rbx, %rcx
-; X64-NEXT: setb %sil
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: mulq %r11
-; X64-NEXT: movq %rdx, %r14
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %rcx, %rbx
-; X64-NEXT: movzbl %sil, %eax
-; X64-NEXT: adcq %rax, %r14
-; X64-NEXT: addq %rbp, %rbx
-; X64-NEXT: adcq %r15, %r14
-; X64-NEXT: adcq $0, %r9
-; X64-NEXT: adcq $0, %r12
-; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq 56(%rdi), %rax
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq (%rdi), %r9
+; X64-NEXT: movq 8(%rdi), %r10
+; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq 16(%rdi), %r12
+; X64-NEXT: movq 24(%rdi), %r13
+; X64-NEXT: movq 16(%rsi), %rax
+; X64-NEXT: movq %rax, (%rsp) # 8-byte Spill
+; X64-NEXT: movq 24(%rsi), %rax
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq (%rsi), %r15
+; X64-NEXT: movq 8(%rsi), %r14
+; X64-NEXT: movq 32(%rsi), %rax
+; X64-NEXT: movq 40(%rsi), %rdx
+; X64-NEXT: movq 48(%rsi), %rcx
+; X64-NEXT: movq 56(%rsi), %r8
+; X64-NEXT: subq $8, %rsp
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq %rax, %rsi
+; X64-NEXT: movq %r9, %rbx
+; X64-NEXT: pushq %r13
+; X64-NEXT: pushq %r12
+; X64-NEXT: pushq %r10
+; X64-NEXT: callq __multi5 at PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq %r12, %rsi
+; X64-NEXT: movq %r13, %rdx
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: xorl %r8d, %r8d
+; X64-NEXT: movq %r15, %r9
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq %r14
+; X64-NEXT: callq __multi5 at PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq %r12, %rsi
+; X64-NEXT: movq %r13, %rdx
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: xorl %r8d, %r8d
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; X64-NEXT: movq %r13, %r9
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq $0
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; X64-NEXT: pushq %r12
+; X64-NEXT: callq __multi5 at PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq %r15, %rsi
+; X64-NEXT: movq %r14, %rdx
+; X64-NEXT: movq %r13, %rcx
+; X64-NEXT: movq %r12, %r8
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT: pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT: pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT: pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT: callq __multi5 at PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq %rbx, %r13
+; X64-NEXT: movq %rbx, %rsi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; X64-NEXT: movq %rbx, %rdx
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: xorl %r8d, %r8d
+; X64-NEXT: movq %r15, %r9
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq %r14
+; X64-NEXT: callq __multi5 at PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
; X64-NEXT: movq %r13, %rsi
-; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq 16(%r13), %r10
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %r10
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %r13
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movq %rdi, %r12
-; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %r10
-; X64-NEXT: movq %rdx, %r15
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: addq %rcx, %rbp
-; X64-NEXT: adcq $0, %r15
-; X64-NEXT: movq 24(%rsi), %rsi
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %r11
-; X64-NEXT: addq %rbp, %r11
-; X64-NEXT: adcq %r15, %rcx
-; X64-NEXT: setb %dil
-; X64-NEXT: movq %r12, %rax
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rdx, %r15
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: addq %rcx, %rbp
-; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: adcq %rax, %r15
-; X64-NEXT: addq %rbx, %r13
-; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %r14, %r11
-; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq $0, %rbp
-; X64-NEXT: adcq $0, %r15
-; X64-NEXT: addq %r9, %rbp
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
-; X64-NEXT: setb %dil
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: mulq %r10
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %r11
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: mulq %r10
-; X64-NEXT: movq %rdx, %r9
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %rcx, %rbx
-; X64-NEXT: adcq $0, %r9
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: addq %rbx, %rax
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: adcq %r9, %rcx
-; X64-NEXT: setb %r8b
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: mulq %rsi
-; X64-NEXT: addq %rcx, %rax
-; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: movzbl %r8b, %eax
-; X64-NEXT: adcq %rax, %rdx
-; X64-NEXT: addq %rbp, %r11
-; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %r15, %rbx
-; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: adcq %rax, %rcx
-; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq $0, %rdx
-; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT: movq 32(%r8), %r15
-; X64-NEXT: imulq %r15, %rsi
-; X64-NEXT: movq %r15, %rax
-; X64-NEXT: mulq %r10
-; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: addq %rsi, %rdx
-; X64-NEXT: movq 40(%r8), %rsi
-; X64-NEXT: imulq %rsi, %r10
-; X64-NEXT: addq %rdx, %r10
-; X64-NEXT: movq 48(%r8), %rax
-; X64-NEXT: movq %rax, %rdi
+; X64-NEXT: movq %rbx, %rdx
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: xorl %r8d, %r8d
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; X64-NEXT: imulq %r9, %rdi
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; X64-NEXT: mulq %r11
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %rdi, %rdx
-; X64-NEXT: movq 56(%r8), %r8
-; X64-NEXT: imulq %r11, %r8
-; X64-NEXT: addq %rdx, %r8
-; X64-NEXT: addq %rcx, %rbx
-; X64-NEXT: adcq %r10, %r8
-; X64-NEXT: movq %r11, %rax
-; X64-NEXT: mulq %r15
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: mulq %r15
-; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: movq %rax, %r15
-; X64-NEXT: addq %rcx, %r15
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq %r12
+; X64-NEXT: callq __multi5 at PLT
+; X64-NEXT: addq $32, %rsp
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r8
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT: addq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT: adcq $0, %r8
; X64-NEXT: adcq $0, %rdi
-; X64-NEXT: movq %r11, %rax
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %r13
-; X64-NEXT: addq %r15, %r13
-; X64-NEXT: adcq %rdi, %rcx
-; X64-NEXT: setb %dil
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rdx, %r12
-; X64-NEXT: movq %rax, %r10
-; X64-NEXT: addq %rcx, %r10
-; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: adcq %rax, %r12
-; X64-NEXT: addq %rbx, %r10
-; X64-NEXT: adcq %r8, %r12
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT: movq 48(%r8), %rsi
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT: imulq %r14, %rsi
-; X64-NEXT: addq %rdx, %rsi
-; X64-NEXT: movq %r8, %rdx
-; X64-NEXT: movq 56(%r8), %rax
-; X64-NEXT: imulq %rdi, %rax
-; X64-NEXT: movq %rdi, %r8
-; X64-NEXT: addq %rax, %rsi
-; X64-NEXT: movq 32(%rdx), %rbp
-; X64-NEXT: movq 40(%rdx), %r9
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: imulq %r9, %rdi
-; X64-NEXT: mulq %rbp
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %rdi, %rdx
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; X64-NEXT: imulq %rbp, %r11
-; X64-NEXT: addq %rdx, %r11
-; X64-NEXT: addq %rcx, %rbx
-; X64-NEXT: adcq %rsi, %r11
-; X64-NEXT: movq %rbp, %rax
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %r15
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: addq %rcx, %rdi
-; X64-NEXT: adcq $0, %r15
-; X64-NEXT: movq %rbp, %rax
-; X64-NEXT: mulq %r14
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %r8
-; X64-NEXT: addq %rdi, %r8
-; X64-NEXT: adcq %r15, %rcx
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT: addq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT: adcq $0, %rsi
+; X64-NEXT: adcq $0, %rdx
+; X64-NEXT: addq %r8, %rsi
+; X64-NEXT: adcq %rdi, %rdx
; X64-NEXT: setb %dil
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: mulq %r14
-; X64-NEXT: addq %rcx, %rax
-; X64-NEXT: movzbl %dil, %ecx
-; X64-NEXT: adcq %rcx, %rdx
-; X64-NEXT: addq %rbx, %rax
-; X64-NEXT: adcq %r11, %rdx
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; X64-NEXT: adcq %r13, %r8
-; X64-NEXT: adcq %r10, %rax
-; X64-NEXT: adcq %r12, %rdx
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
-; X64-NEXT: movq (%rsp), %rcx # 8-byte Reload
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT: movq %rdi, (%rcx)
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT: movq %rdi, 8(%rcx)
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT: movq %rdi, 16(%rcx)
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT: movq %rdi, 24(%rcx)
-; X64-NEXT: movq %rsi, 32(%rcx)
-; X64-NEXT: movq %r8, 40(%rcx)
-; X64-NEXT: movq %rax, 48(%rcx)
-; X64-NEXT: movq %rdx, 56(%rcx)
-; X64-NEXT: addq $8, %rsp
+; X64-NEXT: movzbl %dil, %edi
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r8
+; X64-NEXT: addq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: adcq $0, %r8
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r14
+; X64-NEXT: addq {{[0-9]+}}(%rsp), %r11
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %r14
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %r10
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %r9
+; X64-NEXT: addq %rsi, %r11
+; X64-NEXT: adcq %rdx, %r14
+; X64-NEXT: adcq %rdi, %r10
+; X64-NEXT: adcq %r8, %r9
+; X64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT: movaps %xmm0, (%rdx)
+; X64-NEXT: movq %rax, 16(%rdx)
+; X64-NEXT: movq %rcx, 24(%rdx)
+; X64-NEXT: movq %r11, 32(%rdx)
+; X64-NEXT: movq %r14, 40(%rdx)
+; X64-NEXT: movq %r10, 48(%rdx)
+; X64-NEXT: movq %r9, 56(%rdx)
+; X64-NEXT: leaq -40(%rbp), %rsp
; X64-NEXT: popq %rbx
; X64-NEXT: popq %r12
; X64-NEXT: popq %r13
diff --git a/llvm/test/CodeGen/X86/scheduler-backtracking.ll b/llvm/test/CodeGen/X86/scheduler-backtracking.ll
index 28029793211f0..2b79d7e145368 100644
--- a/llvm/test/CodeGen/X86/scheduler-backtracking.ll
+++ b/llvm/test/CodeGen/X86/scheduler-backtracking.ll
@@ -12,141 +12,163 @@
define i256 @test1(i256 %a) nounwind {
; ILP-LABEL: test1:
; ILP: # %bb.0:
+; ILP-NEXT: pushq %rbp
+; ILP-NEXT: movq %rsp, %rbp
+; ILP-NEXT: andq $-32, %rsp
+; ILP-NEXT: subq $96, %rsp
; ILP-NEXT: movq %rdi, %rax
; ILP-NEXT: xorps %xmm0, %xmm0
-; ILP-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; ILP-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; ILP-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; ILP-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; ILP-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; ILP-NEXT: movaps %xmm0, (%rsp)
; ILP-NEXT: leal (%rsi,%rsi), %ecx
; ILP-NEXT: addb $3, %cl
-; ILP-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; ILP-NEXT: movq $1, -{{[0-9]+}}(%rsp)
+; ILP-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; ILP-NEXT: movq $1, {{[0-9]+}}(%rsp)
; ILP-NEXT: movl %ecx, %edx
; ILP-NEXT: shrb $3, %dl
; ILP-NEXT: andb $24, %dl
; ILP-NEXT: negb %dl
-; ILP-NEXT: movsbq %dl, %rdx
-; ILP-NEXT: movq -24(%rsp,%rdx), %rsi
-; ILP-NEXT: movq -16(%rsp,%rdx), %rdi
-; ILP-NEXT: shldq %cl, %rsi, %rdi
-; ILP-NEXT: movq -40(%rsp,%rdx), %r8
-; ILP-NEXT: movq -32(%rsp,%rdx), %rdx
+; ILP-NEXT: movsbq %dl, %rsi
+; ILP-NEXT: movq 48(%rsp,%rsi), %rdx
+; ILP-NEXT: movq 56(%rsp,%rsi), %rdi
+; ILP-NEXT: shldq %cl, %rdx, %rdi
+; ILP-NEXT: movq 32(%rsp,%rsi), %r8
+; ILP-NEXT: movq 40(%rsp,%rsi), %rsi
; ILP-NEXT: movq %r8, %r9
; ILP-NEXT: shlq %cl, %r9
-; ILP-NEXT: movq %rdx, %r10
+; ILP-NEXT: movq %rsi, %r10
; ILP-NEXT: shldq %cl, %r8, %r10
; ILP-NEXT: movq %rdi, 24(%rax)
; ILP-NEXT: movq %r10, 8(%rax)
; ILP-NEXT: movq %r9, (%rax)
-; ILP-NEXT: shlq %cl, %rsi
+; ILP-NEXT: shlq %cl, %rdx
; ILP-NEXT: notb %cl
-; ILP-NEXT: shrq %rdx
+; ILP-NEXT: shrq %rsi
; ILP-NEXT: # kill: def $cl killed $cl killed $ecx
-; ILP-NEXT: shrq %cl, %rdx
-; ILP-NEXT: orq %rsi, %rdx
-; ILP-NEXT: movq %rdx, 16(%rax)
+; ILP-NEXT: shrq %cl, %rsi
+; ILP-NEXT: orq %rdx, %rsi
+; ILP-NEXT: movq %rsi, 16(%rax)
+; ILP-NEXT: movq %rbp, %rsp
+; ILP-NEXT: popq %rbp
; ILP-NEXT: retq
;
; HYBRID-LABEL: test1:
; HYBRID: # %bb.0:
+; HYBRID-NEXT: pushq %rbp
+; HYBRID-NEXT: movq %rsp, %rbp
+; HYBRID-NEXT: andq $-32, %rsp
+; HYBRID-NEXT: subq $96, %rsp
; HYBRID-NEXT: movq %rdi, %rax
; HYBRID-NEXT: xorps %xmm0, %xmm0
-; HYBRID-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; HYBRID-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; HYBRID-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; HYBRID-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; HYBRID-NEXT: movq $1, -{{[0-9]+}}(%rsp)
+; HYBRID-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; HYBRID-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; HYBRID-NEXT: movaps %xmm0, (%rsp)
+; HYBRID-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; HYBRID-NEXT: movq $1, {{[0-9]+}}(%rsp)
; HYBRID-NEXT: leal (%rsi,%rsi), %ecx
; HYBRID-NEXT: addb $3, %cl
; HYBRID-NEXT: movl %ecx, %edx
; HYBRID-NEXT: shrb $3, %dl
; HYBRID-NEXT: andb $24, %dl
; HYBRID-NEXT: negb %dl
-; HYBRID-NEXT: movsbq %dl, %rdx
-; HYBRID-NEXT: movq -24(%rsp,%rdx), %rsi
-; HYBRID-NEXT: movq -16(%rsp,%rdx), %rdi
-; HYBRID-NEXT: shldq %cl, %rsi, %rdi
+; HYBRID-NEXT: movsbq %dl, %rsi
+; HYBRID-NEXT: movq 48(%rsp,%rsi), %rdx
+; HYBRID-NEXT: movq 56(%rsp,%rsi), %rdi
+; HYBRID-NEXT: shldq %cl, %rdx, %rdi
; HYBRID-NEXT: movq %rdi, 24(%rax)
-; HYBRID-NEXT: movq -40(%rsp,%rdx), %rdi
-; HYBRID-NEXT: movq -32(%rsp,%rdx), %rdx
-; HYBRID-NEXT: movq %rdx, %r8
+; HYBRID-NEXT: movq 32(%rsp,%rsi), %rdi
+; HYBRID-NEXT: movq 40(%rsp,%rsi), %rsi
+; HYBRID-NEXT: movq %rsi, %r8
; HYBRID-NEXT: shldq %cl, %rdi, %r8
; HYBRID-NEXT: movq %r8, 8(%rax)
; HYBRID-NEXT: shlq %cl, %rdi
; HYBRID-NEXT: movq %rdi, (%rax)
-; HYBRID-NEXT: shlq %cl, %rsi
+; HYBRID-NEXT: shlq %cl, %rdx
; HYBRID-NEXT: notb %cl
-; HYBRID-NEXT: shrq %rdx
+; HYBRID-NEXT: shrq %rsi
; HYBRID-NEXT: # kill: def $cl killed $cl killed $ecx
-; HYBRID-NEXT: shrq %cl, %rdx
-; HYBRID-NEXT: orq %rsi, %rdx
-; HYBRID-NEXT: movq %rdx, 16(%rax)
+; HYBRID-NEXT: shrq %cl, %rsi
+; HYBRID-NEXT: orq %rdx, %rsi
+; HYBRID-NEXT: movq %rsi, 16(%rax)
+; HYBRID-NEXT: movq %rbp, %rsp
+; HYBRID-NEXT: popq %rbp
; HYBRID-NEXT: retq
;
; BURR-LABEL: test1:
; BURR: # %bb.0:
+; BURR-NEXT: pushq %rbp
+; BURR-NEXT: movq %rsp, %rbp
+; BURR-NEXT: andq $-32, %rsp
+; BURR-NEXT: subq $96, %rsp
; BURR-NEXT: movq %rdi, %rax
; BURR-NEXT: xorps %xmm0, %xmm0
-; BURR-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; BURR-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; BURR-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; BURR-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; BURR-NEXT: movq $1, -{{[0-9]+}}(%rsp)
+; BURR-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; BURR-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; BURR-NEXT: movaps %xmm0, (%rsp)
+; BURR-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; BURR-NEXT: movq $1, {{[0-9]+}}(%rsp)
; BURR-NEXT: leal (%rsi,%rsi), %ecx
; BURR-NEXT: addb $3, %cl
; BURR-NEXT: movl %ecx, %edx
; BURR-NEXT: shrb $3, %dl
; BURR-NEXT: andb $24, %dl
; BURR-NEXT: negb %dl
-; BURR-NEXT: movsbq %dl, %rdx
-; BURR-NEXT: movq -24(%rsp,%rdx), %rsi
-; BURR-NEXT: movq -16(%rsp,%rdx), %rdi
-; BURR-NEXT: shldq %cl, %rsi, %rdi
+; BURR-NEXT: movsbq %dl, %rsi
+; BURR-NEXT: movq 48(%rsp,%rsi), %rdx
+; BURR-NEXT: movq 56(%rsp,%rsi), %rdi
+; BURR-NEXT: shldq %cl, %rdx, %rdi
; BURR-NEXT: movq %rdi, 24(%rax)
-; BURR-NEXT: movq -40(%rsp,%rdx), %rdi
-; BURR-NEXT: movq -32(%rsp,%rdx), %rdx
-; BURR-NEXT: movq %rdx, %r8
+; BURR-NEXT: movq 32(%rsp,%rsi), %rdi
+; BURR-NEXT: movq 40(%rsp,%rsi), %rsi
+; BURR-NEXT: movq %rsi, %r8
; BURR-NEXT: shldq %cl, %rdi, %r8
; BURR-NEXT: movq %r8, 8(%rax)
; BURR-NEXT: shlq %cl, %rdi
; BURR-NEXT: movq %rdi, (%rax)
-; BURR-NEXT: shlq %cl, %rsi
+; BURR-NEXT: shlq %cl, %rdx
; BURR-NEXT: notb %cl
-; BURR-NEXT: shrq %rdx
+; BURR-NEXT: shrq %rsi
; BURR-NEXT: # kill: def $cl killed $cl killed $ecx
-; BURR-NEXT: shrq %cl, %rdx
-; BURR-NEXT: orq %rsi, %rdx
-; BURR-NEXT: movq %rdx, 16(%rax)
+; BURR-NEXT: shrq %cl, %rsi
+; BURR-NEXT: orq %rdx, %rsi
+; BURR-NEXT: movq %rsi, 16(%rax)
+; BURR-NEXT: movq %rbp, %rsp
+; BURR-NEXT: popq %rbp
; BURR-NEXT: retq
;
; SRC-LABEL: test1:
; SRC: # %bb.0:
+; SRC-NEXT: pushq %rbp
+; SRC-NEXT: movq %rsp, %rbp
+; SRC-NEXT: andq $-32, %rsp
+; SRC-NEXT: subq $96, %rsp
; SRC-NEXT: movq %rdi, %rax
; SRC-NEXT: leal (%rsi,%rsi), %edx
; SRC-NEXT: addb $3, %dl
; SRC-NEXT: xorps %xmm0, %xmm0
-; SRC-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SRC-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SRC-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SRC-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; SRC-NEXT: movq $1, -{{[0-9]+}}(%rsp)
+; SRC-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SRC-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SRC-NEXT: movaps %xmm0, (%rsp)
+; SRC-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; SRC-NEXT: movq $1, {{[0-9]+}}(%rsp)
; SRC-NEXT: movl %edx, %ecx
; SRC-NEXT: shrb $3, %cl
; SRC-NEXT: andb $24, %cl
; SRC-NEXT: negb %cl
; SRC-NEXT: movsbq %cl, %rsi
-; SRC-NEXT: movq -24(%rsp,%rsi), %rdi
+; SRC-NEXT: movq 48(%rsp,%rsi), %rdi
; SRC-NEXT: movq %rdi, %r8
; SRC-NEXT: movl %edx, %ecx
; SRC-NEXT: shlq %cl, %r8
; SRC-NEXT: notb %cl
-; SRC-NEXT: movq -40(%rsp,%rsi), %r9
-; SRC-NEXT: movq -32(%rsp,%rsi), %r10
+; SRC-NEXT: movq 32(%rsp,%rsi), %r9
+; SRC-NEXT: movq 40(%rsp,%rsi), %r10
; SRC-NEXT: movq %r10, %r11
; SRC-NEXT: shrq %r11
; SRC-NEXT: shrq %cl, %r11
; SRC-NEXT: orq %r8, %r11
-; SRC-NEXT: movq -16(%rsp,%rsi), %rsi
+; SRC-NEXT: movq 56(%rsp,%rsi), %rsi
; SRC-NEXT: movl %edx, %ecx
; SRC-NEXT: shldq %cl, %rdi, %rsi
; SRC-NEXT: movq %r9, %rdi
@@ -156,10 +178,16 @@ define i256 @test1(i256 %a) nounwind {
; SRC-NEXT: movq %r10, 8(%rax)
; SRC-NEXT: movq %rdi, (%rax)
; SRC-NEXT: movq %r11, 16(%rax)
+; SRC-NEXT: movq %rbp, %rsp
+; SRC-NEXT: popq %rbp
; SRC-NEXT: retq
;
; LIN-LABEL: test1:
; LIN: # %bb.0:
+; LIN-NEXT: pushq %rbp
+; LIN-NEXT: movq %rsp, %rbp
+; LIN-NEXT: andq $-32, %rsp
+; LIN-NEXT: subq $96, %rsp
; LIN-NEXT: movq %rdi, %rax
; LIN-NEXT: leal (%rsi,%rsi), %edx
; LIN-NEXT: addb $3, %dl
@@ -169,21 +197,21 @@ define i256 @test1(i256 %a) nounwind {
; LIN-NEXT: negb %cl
; LIN-NEXT: movsbq %cl, %rsi
; LIN-NEXT: xorps %xmm0, %xmm0
-; LIN-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; LIN-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; LIN-NEXT: movq $1, -{{[0-9]+}}(%rsp)
-; LIN-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; LIN-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; LIN-NEXT: movq -40(%rsp,%rsi), %rdi
+; LIN-NEXT: movaps %xmm0, (%rsp)
+; LIN-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; LIN-NEXT: movq $1, {{[0-9]+}}(%rsp)
+; LIN-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; LIN-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; LIN-NEXT: movq 32(%rsp,%rsi), %rdi
; LIN-NEXT: movq %rdi, %r8
; LIN-NEXT: movl %edx, %ecx
; LIN-NEXT: shlq %cl, %r8
; LIN-NEXT: movq %r8, (%rax)
-; LIN-NEXT: movq -32(%rsp,%rsi), %r8
+; LIN-NEXT: movq 40(%rsp,%rsi), %r8
; LIN-NEXT: movq %r8, %r9
; LIN-NEXT: shldq %cl, %rdi, %r9
; LIN-NEXT: movq %r9, 8(%rax)
-; LIN-NEXT: movq -24(%rsp,%rsi), %rdi
+; LIN-NEXT: movq 48(%rsp,%rsi), %rdi
; LIN-NEXT: movq %rdi, %r9
; LIN-NEXT: shlq %cl, %r9
; LIN-NEXT: shrq %r8
@@ -191,10 +219,12 @@ define i256 @test1(i256 %a) nounwind {
; LIN-NEXT: shrq %cl, %r8
; LIN-NEXT: orq %r9, %r8
; LIN-NEXT: movq %r8, 16(%rax)
-; LIN-NEXT: movq -16(%rsp,%rsi), %rsi
+; LIN-NEXT: movq 56(%rsp,%rsi), %rsi
; LIN-NEXT: movl %edx, %ecx
; LIN-NEXT: shldq %cl, %rdi, %rsi
; LIN-NEXT: movq %rsi, 24(%rax)
+; LIN-NEXT: movq %rbp, %rsp
+; LIN-NEXT: popq %rbp
; LIN-NEXT: retq
%b = add i256 %a, 1
%m = shl i256 %b, 1
diff --git a/llvm/test/CodeGen/X86/shift-i256-narrow-amount.ll b/llvm/test/CodeGen/X86/shift-i256-narrow-amount.ll
new file mode 100644
index 0000000000000..a053175ae4c95
--- /dev/null
+++ b/llvm/test/CodeGen/X86/shift-i256-narrow-amount.ll
@@ -0,0 +1,382 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-- -O2 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-- -O2 | FileCheck %s --check-prefix=X64
+;
+; Test i256 shifts with narrow (i32) shift amounts. These use the
+; ExpandShiftWithUnknownAmountBit path (parts expansion) rather than the
+; shift-through-stack approach, because ExpandShiftWithKnownAmountBit
+; cannot determine the high bit of the shift amount.
+
+define void @shl_i256_by_i32(i256 %x, i32 %amt, ptr %r) nounwind {
+; X86-LABEL: shl_i256_by_i32:
+; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $112, %esp
+; X86-NEXT: movzbl 40(%ebp), %ecx
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl 12(%ebp), %edx
+; X86-NEXT: movl 16(%ebp), %esi
+; X86-NEXT: movl 36(%ebp), %edi
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 32(%ebp), %edi
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 28(%ebp), %edi
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 24(%ebp), %edi
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 20(%ebp), %edi
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $28, %al
+; X86-NEXT: negb %al
+; X86-NEXT: movsbl %al, %edi
+; X86-NEXT: movl 68(%esp,%edi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 72(%esp,%edi), %eax
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: shldl %cl, %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 76(%esp,%edi), %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 80(%esp,%edi), %eax
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: shldl %cl, %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 84(%esp,%edi), %edx
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: shldl %cl, %eax, %ebx
+; X86-NEXT: movl 88(%esp,%edi), %esi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl 64(%esp,%edi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 92(%esp,%edi), %edx
+; X86-NEXT: shldl %cl, %esi, %edx
+; X86-NEXT: movl 44(%ebp), %edi
+; X86-NEXT: movl %edx, 28(%edi)
+; X86-NEXT: movl %eax, 24(%edi)
+; X86-NEXT: movl %ebx, 20(%edi)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 16(%edi)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 12(%edi)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 8(%edi)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: shll %cl, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl %cl, %esi, %edx
+; X86-NEXT: movl %edx, 4(%edi)
+; X86-NEXT: movl %eax, (%edi)
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl
+;
+; X64-LABEL: shl_i256_by_i32:
+; X64: # %bb.0: # %entry
+; X64-NEXT: pushq %rbp
+; X64-NEXT: movq %rsp, %rbp
+; X64-NEXT: andq $-32, %rsp
+; X64-NEXT: subq $96, %rsp
+; X64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %rsi, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NEXT: xorps %xmm0, %xmm0
+; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm0, (%rsp)
+; X64-NEXT: movl %r8d, %eax
+; X64-NEXT: shrb $3, %al
+; X64-NEXT: andb $24, %al
+; X64-NEXT: negb %al
+; X64-NEXT: movsbq %al, %rax
+; X64-NEXT: movq 32(%rsp,%rax), %rdx
+; X64-NEXT: movq 40(%rsp,%rax), %rsi
+; X64-NEXT: movq 48(%rsp,%rax), %rdi
+; X64-NEXT: movq %rdi, %r10
+; X64-NEXT: movl %r8d, %ecx
+; X64-NEXT: shldq %cl, %rsi, %r10
+; X64-NEXT: movq 56(%rsp,%rax), %rax
+; X64-NEXT: shldq %cl, %rdi, %rax
+; X64-NEXT: movq %rdx, %rdi
+; X64-NEXT: shlq %cl, %rdi
+; X64-NEXT: shldq %cl, %rdx, %rsi
+; X64-NEXT: movq %rax, 24(%r9)
+; X64-NEXT: movq %r10, 16(%r9)
+; X64-NEXT: movq %rsi, 8(%r9)
+; X64-NEXT: movq %rdi, (%r9)
+; X64-NEXT: movq %rbp, %rsp
+; X64-NEXT: popq %rbp
+; X64-NEXT: retq
+entry:
+ %amt256 = zext i32 %amt to i256
+ %res = shl i256 %x, %amt256
+ store i256 %res, ptr %r
+ ret void
+}
+
+define void @lshr_i256_by_i32(i256 %x, i32 %amt, ptr %r) nounwind {
+; X86-LABEL: lshr_i256_by_i32:
+; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $112, %esp
+; X86-NEXT: movzbl 40(%ebp), %ecx
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl 12(%ebp), %edx
+; X86-NEXT: movl 16(%ebp), %esi
+; X86-NEXT: movl 36(%ebp), %edi
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 32(%ebp), %edi
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 28(%ebp), %edi
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 24(%ebp), %edi
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 20(%ebp), %edi
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $5, %al
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: movl 40(%esp,%eax,4), %edx
+; X86-NEXT: movl 36(%esp,%eax,4), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shrdl %cl, %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 44(%esp,%eax,4), %esi
+; X86-NEXT: shrdl %cl, %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 48(%esp,%eax,4), %ebx
+; X86-NEXT: shrdl %cl, %ebx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 52(%esp,%eax,4), %esi
+; X86-NEXT: shrdl %cl, %esi, %ebx
+; X86-NEXT: movl 56(%esp,%eax,4), %edx
+; X86-NEXT: shrdl %cl, %edx, %esi
+; X86-NEXT: movl 32(%esp,%eax,4), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 60(%esp,%eax,4), %eax
+; X86-NEXT: shrdl %cl, %eax, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: shrl %cl, %eax
+; X86-NEXT: movl 44(%ebp), %ecx
+; X86-NEXT: movl %eax, 28(%ecx)
+; X86-NEXT: movl %edx, 24(%ecx)
+; X86-NEXT: movl %esi, 20(%ecx)
+; X86-NEXT: movl %ebx, 16(%ecx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 12(%ecx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 8(%ecx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 4(%ecx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, (%ecx)
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl
+;
+; X64-LABEL: lshr_i256_by_i32:
+; X64: # %bb.0: # %entry
+; X64-NEXT: pushq %rbp
+; X64-NEXT: movq %rsp, %rbp
+; X64-NEXT: andq $-32, %rsp
+; X64-NEXT: subq $96, %rsp
+; X64-NEXT: xorps %xmm0, %xmm0
+; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %rsi, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %rdi, (%rsp)
+; X64-NEXT: movl %r8d, %eax
+; X64-NEXT: shrb $6, %al
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: movq 16(%rsp,%rax,8), %rdx
+; X64-NEXT: movq (%rsp,%rax,8), %rsi
+; X64-NEXT: movq 8(%rsp,%rax,8), %rdi
+; X64-NEXT: movq %rdi, %r10
+; X64-NEXT: movl %r8d, %ecx
+; X64-NEXT: shrdq %cl, %rdx, %r10
+; X64-NEXT: movq 24(%rsp,%rax,8), %rax
+; X64-NEXT: shrdq %cl, %rax, %rdx
+; X64-NEXT: shrdq %cl, %rdi, %rsi
+; X64-NEXT: shrq %cl, %rax
+; X64-NEXT: movq %rax, 24(%r9)
+; X64-NEXT: movq %rdx, 16(%r9)
+; X64-NEXT: movq %r10, 8(%r9)
+; X64-NEXT: movq %rsi, (%r9)
+; X64-NEXT: movq %rbp, %rsp
+; X64-NEXT: popq %rbp
+; X64-NEXT: retq
+entry:
+ %amt256 = zext i32 %amt to i256
+ %res = lshr i256 %x, %amt256
+ store i256 %res, ptr %r
+ ret void
+}
+
+define void @ashr_i256_by_i32(i256 %x, i32 %amt, ptr %r) nounwind {
+; X86-LABEL: ashr_i256_by_i32:
+; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $112, %esp
+; X86-NEXT: movzbl 40(%ebp), %ecx
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl 12(%ebp), %edx
+; X86-NEXT: movl 16(%ebp), %esi
+; X86-NEXT: movl 32(%ebp), %edi
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 28(%ebp), %edi
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 24(%ebp), %edi
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 20(%ebp), %edi
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 36(%ebp), %edi
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: sarl $31, %edi
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $5, %al
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: movl 40(%esp,%eax,4), %edx
+; X86-NEXT: movl 36(%esp,%eax,4), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shrdl %cl, %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 44(%esp,%eax,4), %esi
+; X86-NEXT: shrdl %cl, %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 48(%esp,%eax,4), %ebx
+; X86-NEXT: shrdl %cl, %ebx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 52(%esp,%eax,4), %esi
+; X86-NEXT: shrdl %cl, %esi, %ebx
+; X86-NEXT: movl 56(%esp,%eax,4), %edx
+; X86-NEXT: shrdl %cl, %edx, %esi
+; X86-NEXT: movl 32(%esp,%eax,4), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 60(%esp,%eax,4), %eax
+; X86-NEXT: shrdl %cl, %eax, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: sarl %cl, %eax
+; X86-NEXT: movl 44(%ebp), %ecx
+; X86-NEXT: movl %eax, 28(%ecx)
+; X86-NEXT: movl %edx, 24(%ecx)
+; X86-NEXT: movl %esi, 20(%ecx)
+; X86-NEXT: movl %ebx, 16(%ecx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 12(%ecx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 8(%ecx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 4(%ecx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, (%ecx)
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl
+;
+; X64-LABEL: ashr_i256_by_i32:
+; X64: # %bb.0: # %entry
+; X64-NEXT: pushq %rbp
+; X64-NEXT: movq %rsp, %rbp
+; X64-NEXT: andq $-32, %rsp
+; X64-NEXT: subq $96, %rsp
+; X64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %rsi, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %rdi, (%rsp)
+; X64-NEXT: sarq $63, %rcx
+; X64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NEXT: movl %r8d, %eax
+; X64-NEXT: shrb $6, %al
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: movq 16(%rsp,%rax,8), %rdx
+; X64-NEXT: movq (%rsp,%rax,8), %rsi
+; X64-NEXT: movq 8(%rsp,%rax,8), %rdi
+; X64-NEXT: movq %rdi, %r10
+; X64-NEXT: movl %r8d, %ecx
+; X64-NEXT: shrdq %cl, %rdx, %r10
+; X64-NEXT: movq 24(%rsp,%rax,8), %rax
+; X64-NEXT: shrdq %cl, %rax, %rdx
+; X64-NEXT: shrdq %cl, %rdi, %rsi
+; X64-NEXT: sarq %cl, %rax
+; X64-NEXT: movq %rax, 24(%r9)
+; X64-NEXT: movq %rdx, 16(%r9)
+; X64-NEXT: movq %r10, 8(%r9)
+; X64-NEXT: movq %rsi, (%r9)
+; X64-NEXT: movq %rbp, %rsp
+; X64-NEXT: popq %rbp
+; X64-NEXT: retq
+entry:
+ %amt256 = zext i32 %amt to i256
+ %res = ashr i256 %x, %amt256
+ store i256 %res, ptr %r
+ ret void
+}
diff --git a/llvm/test/CodeGen/X86/shift-i256.ll b/llvm/test/CodeGen/X86/shift-i256.ll
index 128e2199fb56f..0663bec48899a 100644
--- a/llvm/test/CodeGen/X86/shift-i256.ll
+++ b/llvm/test/CodeGen/X86/shift-i256.ll
@@ -88,31 +88,35 @@ define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone {
;
; CHECK-X64-O0-LABEL: shift1:
; CHECK-X64-O0: # %bb.0: # %entry
-; CHECK-X64-O0-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; CHECK-X64-O0-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; CHECK-X64-O0-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; CHECK-X64-O0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O0-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O0-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; CHECK-X64-O0-NEXT: pushq %rbp
+; CHECK-X64-O0-NEXT: movq %rsp, %rbp
+; CHECK-X64-O0-NEXT: andq $-32, %rsp
+; CHECK-X64-O0-NEXT: subq $128, %rsp
+; CHECK-X64-O0-NEXT: movq 24(%rbp), %rax
+; CHECK-X64-O0-NEXT: movq 16(%rbp), %rax
+; CHECK-X64-O0-NEXT: movq 32(%rbp), %rax
+; CHECK-X64-O0-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; CHECK-X64-O0-NEXT: movq %rsi, {{[0-9]+}}(%rsp)
+; CHECK-X64-O0-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; CHECK-X64-O0-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
; CHECK-X64-O0-NEXT: sarq $63, %rcx
-; CHECK-X64-O0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; CHECK-X64-O0-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; CHECK-X64-O0-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; CHECK-X64-O0-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; CHECK-X64-O0-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
; CHECK-X64-O0-NEXT: movb %r8b, %cl
; CHECK-X64-O0-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-X64-O0-NEXT: movb %cl, %dl
; CHECK-X64-O0-NEXT: shrb $6, %dl
; CHECK-X64-O0-NEXT: movzbl %dl, %edx
; CHECK-X64-O0-NEXT: movl %edx, %edi
-; CHECK-X64-O0-NEXT: movq -56(%rsp,%rdi,8), %rsi
-; CHECK-X64-O0-NEXT: movq -72(%rsp,%rdi,8), %r8
-; CHECK-X64-O0-NEXT: movq -64(%rsp,%rdi,8), %r9
+; CHECK-X64-O0-NEXT: movq 48(%rsp,%rdi,8), %rsi
+; CHECK-X64-O0-NEXT: movq 32(%rsp,%rdi,8), %r8
+; CHECK-X64-O0-NEXT: movq 40(%rsp,%rdi,8), %r9
; CHECK-X64-O0-NEXT: movq %r9, %rdx
; CHECK-X64-O0-NEXT: shrdq %cl, %rsi, %rdx
; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-X64-O0-NEXT: movq -48(%rsp,%rdi,8), %rdi
+; CHECK-X64-O0-NEXT: movq 56(%rsp,%rdi,8), %rdi
; CHECK-X64-O0-NEXT: shrdq %cl, %rdi, %rsi
; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
; CHECK-X64-O0-NEXT: shrdq %cl, %r9, %r8
@@ -124,30 +128,36 @@ define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone {
; CHECK-X64-O0-NEXT: movq %rsi, 16(%rax)
; CHECK-X64-O0-NEXT: movq %rdx, 8(%rax)
; CHECK-X64-O0-NEXT: movq %rcx, (%rax)
+; CHECK-X64-O0-NEXT: movq %rbp, %rsp
+; CHECK-X64-O0-NEXT: popq %rbp
; CHECK-X64-O0-NEXT: retq
;
; CHECK-X64-O2-LABEL: shift1:
; CHECK-X64-O2: # %bb.0: # %entry
-; CHECK-X64-O2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; CHECK-X64-O2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT: pushq %rbp
+; CHECK-X64-O2-NEXT: movq %rsp, %rbp
+; CHECK-X64-O2-NEXT: andq $-32, %rsp
+; CHECK-X64-O2-NEXT: subq $96, %rsp
+; CHECK-X64-O2-NEXT: movq 32(%rbp), %rax
+; CHECK-X64-O2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT: movq %rsi, {{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT: movq %rdi, (%rsp)
; CHECK-X64-O2-NEXT: sarq $63, %rcx
-; CHECK-X64-O2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
; CHECK-X64-O2-NEXT: movl %r8d, %ecx
; CHECK-X64-O2-NEXT: shrb $6, %cl
; CHECK-X64-O2-NEXT: movzbl %cl, %edx
-; CHECK-X64-O2-NEXT: movq -56(%rsp,%rdx,8), %rsi
-; CHECK-X64-O2-NEXT: movq -72(%rsp,%rdx,8), %rdi
-; CHECK-X64-O2-NEXT: movq -64(%rsp,%rdx,8), %r9
+; CHECK-X64-O2-NEXT: movq 16(%rsp,%rdx,8), %rsi
+; CHECK-X64-O2-NEXT: movq (%rsp,%rdx,8), %rdi
+; CHECK-X64-O2-NEXT: movq 8(%rsp,%rdx,8), %r9
; CHECK-X64-O2-NEXT: movq %r9, %r10
; CHECK-X64-O2-NEXT: movl %r8d, %ecx
; CHECK-X64-O2-NEXT: shrdq %cl, %rsi, %r10
-; CHECK-X64-O2-NEXT: movq -48(%rsp,%rdx,8), %rdx
+; CHECK-X64-O2-NEXT: movq 24(%rsp,%rdx,8), %rdx
; CHECK-X64-O2-NEXT: shrdq %cl, %rdx, %rsi
; CHECK-X64-O2-NEXT: shrdq %cl, %r9, %rdi
; CHECK-X64-O2-NEXT: sarq %cl, %rdx
@@ -155,6 +165,8 @@ define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone {
; CHECK-X64-O2-NEXT: movq %rsi, 16(%rax)
; CHECK-X64-O2-NEXT: movq %r10, 8(%rax)
; CHECK-X64-O2-NEXT: movq %rdi, (%rax)
+; CHECK-X64-O2-NEXT: movq %rbp, %rsp
+; CHECK-X64-O2-NEXT: popq %rbp
; CHECK-X64-O2-NEXT: retq
entry:
%0 = ashr i256 %x, %a
@@ -245,15 +257,19 @@ define i256 @shift2(i256 %c) nounwind
;
; CHECK-X64-O0-LABEL: shift2:
; CHECK-X64-O0: # %bb.0:
+; CHECK-X64-O0-NEXT: pushq %rbp
+; CHECK-X64-O0-NEXT: movq %rsp, %rbp
+; CHECK-X64-O0-NEXT: andq $-32, %rsp
+; CHECK-X64-O0-NEXT: subq $128, %rsp
; CHECK-X64-O0-NEXT: movq %rdi, %rax
-; CHECK-X64-O0-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O0-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O0-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O0-NEXT: movq $1, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O0-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O0-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O0-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O0-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; CHECK-X64-O0-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; CHECK-X64-O0-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; CHECK-X64-O0-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; CHECK-X64-O0-NEXT: movq $1, {{[0-9]+}}(%rsp)
+; CHECK-X64-O0-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; CHECK-X64-O0-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; CHECK-X64-O0-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; CHECK-X64-O0-NEXT: movq $0, {{[0-9]+}}(%rsp)
; CHECK-X64-O0-NEXT: movb %sil, %cl
; CHECK-X64-O0-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-X64-O0-NEXT: movb %cl, %dl
@@ -261,13 +277,13 @@ define i256 @shift2(i256 %c) nounwind
; CHECK-X64-O0-NEXT: andb $24, %dl
; CHECK-X64-O0-NEXT: negb %dl
; CHECK-X64-O0-NEXT: movsbq %dl, %r8
-; CHECK-X64-O0-NEXT: movq -40(%rsp,%r8), %r9
-; CHECK-X64-O0-NEXT: movq -32(%rsp,%r8), %rdx
-; CHECK-X64-O0-NEXT: movq -24(%rsp,%r8), %r10
+; CHECK-X64-O0-NEXT: movq 64(%rsp,%r8), %r9
+; CHECK-X64-O0-NEXT: movq 72(%rsp,%r8), %rdx
+; CHECK-X64-O0-NEXT: movq 80(%rsp,%r8), %r10
; CHECK-X64-O0-NEXT: movq %r10, %rsi
; CHECK-X64-O0-NEXT: shldq %cl, %rdx, %rsi
; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
-; CHECK-X64-O0-NEXT: movq -16(%rsp,%r8), %r8
+; CHECK-X64-O0-NEXT: movq 88(%rsp,%r8), %r8
; CHECK-X64-O0-NEXT: shldq %cl, %r10, %r8
; CHECK-X64-O0-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %cl # 1-byte Reload
; CHECK-X64-O0-NEXT: movq %r9, %r10
@@ -280,29 +296,35 @@ define i256 @shift2(i256 %c) nounwind
; CHECK-X64-O0-NEXT: movq %rsi, 16(%rdi)
; CHECK-X64-O0-NEXT: movq %rdx, 8(%rdi)
; CHECK-X64-O0-NEXT: movq %rcx, (%rdi)
+; CHECK-X64-O0-NEXT: movq %rbp, %rsp
+; CHECK-X64-O0-NEXT: popq %rbp
; CHECK-X64-O0-NEXT: retq
;
; CHECK-X64-O2-LABEL: shift2:
; CHECK-X64-O2: # %bb.0:
+; CHECK-X64-O2-NEXT: pushq %rbp
+; CHECK-X64-O2-NEXT: movq %rsp, %rbp
+; CHECK-X64-O2-NEXT: andq $-32, %rsp
+; CHECK-X64-O2-NEXT: subq $96, %rsp
; CHECK-X64-O2-NEXT: movq %rsi, %rcx
; CHECK-X64-O2-NEXT: movq %rdi, %rax
; CHECK-X64-O2-NEXT: xorps %xmm0, %xmm0
-; CHECK-X64-O2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-X64-O2-NEXT: movq $1, -{{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT: movaps %xmm0, (%rsp)
+; CHECK-X64-O2-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; CHECK-X64-O2-NEXT: movq $1, {{[0-9]+}}(%rsp)
; CHECK-X64-O2-NEXT: movl %ecx, %edx
; CHECK-X64-O2-NEXT: shrb $3, %dl
; CHECK-X64-O2-NEXT: andb $24, %dl
; CHECK-X64-O2-NEXT: negb %dl
; CHECK-X64-O2-NEXT: movsbq %dl, %rdx
-; CHECK-X64-O2-NEXT: movq -40(%rsp,%rdx), %rsi
-; CHECK-X64-O2-NEXT: movq -32(%rsp,%rdx), %rdi
-; CHECK-X64-O2-NEXT: movq -24(%rsp,%rdx), %r8
+; CHECK-X64-O2-NEXT: movq 32(%rsp,%rdx), %rsi
+; CHECK-X64-O2-NEXT: movq 40(%rsp,%rdx), %rdi
+; CHECK-X64-O2-NEXT: movq 48(%rsp,%rdx), %r8
; CHECK-X64-O2-NEXT: movq %r8, %r9
; CHECK-X64-O2-NEXT: shldq %cl, %rdi, %r9
-; CHECK-X64-O2-NEXT: movq -16(%rsp,%rdx), %rdx
+; CHECK-X64-O2-NEXT: movq 56(%rsp,%rdx), %rdx
; CHECK-X64-O2-NEXT: shldq %cl, %r8, %rdx
; CHECK-X64-O2-NEXT: movq %rsi, %r8
; CHECK-X64-O2-NEXT: shlq %cl, %r8
@@ -312,6 +334,8 @@ define i256 @shift2(i256 %c) nounwind
; CHECK-X64-O2-NEXT: movq %r9, 16(%rax)
; CHECK-X64-O2-NEXT: movq %rdi, 8(%rax)
; CHECK-X64-O2-NEXT: movq %r8, (%rax)
+; CHECK-X64-O2-NEXT: movq %rbp, %rsp
+; CHECK-X64-O2-NEXT: popq %rbp
; CHECK-X64-O2-NEXT: retq
{
%b = shl i256 1, %c ; %c must not be a constant
diff --git a/llvm/test/CodeGen/X86/shift-i512.ll b/llvm/test/CodeGen/X86/shift-i512.ll
index f60585e978104..9e3b9ca717df0 100644
--- a/llvm/test/CodeGen/X86/shift-i512.ll
+++ b/llvm/test/CodeGen/X86/shift-i512.ll
@@ -9,111 +9,118 @@
define i512 @shl_i512(i512 %a0, i512 %a1) nounwind {
; SSE-LABEL: shl_i512:
; SSE: # %bb.0:
+; SSE-NEXT: pushq %rbp
+; SSE-NEXT: movq %rsp, %rbp
; SSE-NEXT: pushq %r14
; SSE-NEXT: pushq %rbx
-; SSE-NEXT: pushq %rax
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: andq $-32, %rsp
+; SSE-NEXT: subq $128, %rsp
+; SSE-NEXT: movq 40(%rbp), %rax
+; SSE-NEXT: movaps 16(%rbp), %xmm0
+; SSE-NEXT: movq 32(%rbp), %r10
+; SSE-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rsi, {{[0-9]+}}(%rsp)
; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, (%rsp)
; SSE-NEXT: movl %eax, %ecx
; SSE-NEXT: andl $63, %ecx
; SSE-NEXT: shrl $3, %eax
; SSE-NEXT: andl $56, %eax
; SSE-NEXT: negl %eax
; SSE-NEXT: cltq
-; SSE-NEXT: movq -56(%rsp,%rax), %rdx
-; SSE-NEXT: movq -48(%rsp,%rax), %r9
+; SSE-NEXT: movq 72(%rsp,%rax), %rdx
+; SSE-NEXT: movq 80(%rsp,%rax), %r9
; SSE-NEXT: movq %r9, %rsi
; SSE-NEXT: shldq %cl, %rdx, %rsi
-; SSE-NEXT: movq -40(%rsp,%rax), %r10
+; SSE-NEXT: movq 88(%rsp,%rax), %r10
; SSE-NEXT: movq %r10, %r8
; SSE-NEXT: shldq %cl, %r9, %r8
-; SSE-NEXT: movq -32(%rsp,%rax), %r9
-; SSE-NEXT: movq %r9, %r11
-; SSE-NEXT: shldq %cl, %r10, %r11
-; SSE-NEXT: movq -24(%rsp,%rax), %r10
+; SSE-NEXT: movq 96(%rsp,%rax), %r11
+; SSE-NEXT: movq %r11, %r9
+; SSE-NEXT: shldq %cl, %r10, %r9
+; SSE-NEXT: movq 104(%rsp,%rax), %r10
; SSE-NEXT: movq %r10, %rbx
-; SSE-NEXT: shldq %cl, %r9, %rbx
-; SSE-NEXT: movq -16(%rsp,%rax), %r9
-; SSE-NEXT: movq %r9, %r14
+; SSE-NEXT: shldq %cl, %r11, %rbx
+; SSE-NEXT: movq 112(%rsp,%rax), %r11
+; SSE-NEXT: movq %r11, %r14
; SSE-NEXT: shldq %cl, %r10, %r14
-; SSE-NEXT: movq -8(%rsp,%rax), %r10
-; SSE-NEXT: shldq %cl, %r9, %r10
-; SSE-NEXT: movq -64(%rsp,%rax), %rax
-; SSE-NEXT: movq %rax, %r9
-; SSE-NEXT: shlq %cl, %r9
+; SSE-NEXT: movq 120(%rsp,%rax), %r10
+; SSE-NEXT: shldq %cl, %r11, %r10
+; SSE-NEXT: movq 64(%rsp,%rax), %rax
+; SSE-NEXT: movq %rax, %r11
+; SSE-NEXT: shlq %cl, %r11
; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
; SSE-NEXT: shldq %cl, %rax, %rdx
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: movq %r10, 56(%rdi)
; SSE-NEXT: movq %r14, 48(%rdi)
; SSE-NEXT: movq %rbx, 40(%rdi)
-; SSE-NEXT: movq %r11, 32(%rdi)
+; SSE-NEXT: movq %r9, 32(%rdi)
; SSE-NEXT: movq %r8, 24(%rdi)
; SSE-NEXT: movq %rsi, 16(%rdi)
; SSE-NEXT: movq %rdx, 8(%rdi)
-; SSE-NEXT: movq %r9, (%rdi)
-; SSE-NEXT: addq $8, %rsp
+; SSE-NEXT: movq %r11, (%rdi)
+; SSE-NEXT: leaq -16(%rbp), %rsp
; SSE-NEXT: popq %rbx
; SSE-NEXT: popq %r14
+; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
; AVX2-LABEL: shl_i512:
; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: movq %rsp, %rbp
; AVX2-NEXT: pushq %r14
; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: pushq %rax
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $128, %rsp
+; AVX2-NEXT: movq 40(%rbp), %rax
+; AVX2-NEXT: vmovaps 16(%rbp), %xmm0
+; AVX2-NEXT: movq 32(%rbp), %r10
+; AVX2-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %xmm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm0, (%rsp)
+; AVX2-NEXT: movq %rsi, {{[0-9]+}}(%rsp)
; AVX2-NEXT: movl %eax, %ecx
; AVX2-NEXT: andl $63, %ecx
; AVX2-NEXT: shrl $3, %eax
; AVX2-NEXT: andl $56, %eax
; AVX2-NEXT: negl %eax
; AVX2-NEXT: movslq %eax, %r8
-; AVX2-NEXT: movq -56(%rsp,%r8), %rdx
-; AVX2-NEXT: movq -48(%rsp,%r8), %rax
+; AVX2-NEXT: movq 72(%rsp,%r8), %rdx
+; AVX2-NEXT: movq 80(%rsp,%r8), %rax
; AVX2-NEXT: movq %rax, %rsi
; AVX2-NEXT: shldq %cl, %rdx, %rsi
-; AVX2-NEXT: movq -40(%rsp,%r8), %r10
+; AVX2-NEXT: movq 88(%rsp,%r8), %r10
; AVX2-NEXT: movq %r10, %r9
; AVX2-NEXT: shldq %cl, %rax, %r9
-; AVX2-NEXT: movq -32(%rsp,%r8), %rax
+; AVX2-NEXT: movq 96(%rsp,%r8), %rax
; AVX2-NEXT: movq %rax, %r11
; AVX2-NEXT: shldq %cl, %r10, %r11
-; AVX2-NEXT: movq -24(%rsp,%r8), %r10
+; AVX2-NEXT: movq 104(%rsp,%r8), %r10
; AVX2-NEXT: movq %r10, %rbx
; AVX2-NEXT: shldq %cl, %rax, %rbx
-; AVX2-NEXT: movq -16(%rsp,%r8), %rax
+; AVX2-NEXT: movq 112(%rsp,%r8), %rax
; AVX2-NEXT: movq %rax, %r14
; AVX2-NEXT: shldq %cl, %r10, %r14
-; AVX2-NEXT: movq -8(%rsp,%r8), %r10
+; AVX2-NEXT: movq 120(%rsp,%r8), %r10
; AVX2-NEXT: shldq %cl, %rax, %r10
; AVX2-NEXT: movq %rdi, %rax
-; AVX2-NEXT: movq -64(%rsp,%r8), %rdi
+; AVX2-NEXT: movq 64(%rsp,%r8), %rdi
; AVX2-NEXT: shlxq %rcx, %rdi, %r8
; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
; AVX2-NEXT: shldq %cl, %rdi, %rdx
@@ -125,55 +132,59 @@ define i512 @shl_i512(i512 %a0, i512 %a1) nounwind {
; AVX2-NEXT: movq %rsi, 16(%rax)
; AVX2-NEXT: movq %rdx, 8(%rax)
; AVX2-NEXT: movq %r8, (%rax)
-; AVX2-NEXT: addq $8, %rsp
+; AVX2-NEXT: leaq -16(%rbp), %rsp
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %rbp
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shl_i512:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: movq %rsp, %rbp
; AVX512F-NEXT: pushq %r14
; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: pushq %rax
-; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $128, %rsp
+; AVX512F-NEXT: vmovaps 16(%rbp), %xmm0
+; AVX512F-NEXT: movq 32(%rbp), %rax
+; AVX512F-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovups %xmm0, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq 40(%rbp), %rax
+; AVX512F-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovups %zmm0, (%rsp)
+; AVX512F-NEXT: movq %rsi, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: movl %eax, %ecx
; AVX512F-NEXT: andl $63, %ecx
; AVX512F-NEXT: shrl $3, %eax
; AVX512F-NEXT: andl $56, %eax
; AVX512F-NEXT: negl %eax
; AVX512F-NEXT: movslq %eax, %r8
-; AVX512F-NEXT: movq -56(%rsp,%r8), %rdx
-; AVX512F-NEXT: movq -48(%rsp,%r8), %rax
+; AVX512F-NEXT: movq 72(%rsp,%r8), %rdx
+; AVX512F-NEXT: movq 80(%rsp,%r8), %rax
; AVX512F-NEXT: movq %rax, %rsi
; AVX512F-NEXT: shldq %cl, %rdx, %rsi
-; AVX512F-NEXT: movq -40(%rsp,%r8), %r10
+; AVX512F-NEXT: movq 88(%rsp,%r8), %r10
; AVX512F-NEXT: movq %r10, %r9
; AVX512F-NEXT: shldq %cl, %rax, %r9
-; AVX512F-NEXT: movq -32(%rsp,%r8), %rax
+; AVX512F-NEXT: movq 96(%rsp,%r8), %rax
; AVX512F-NEXT: movq %rax, %r11
; AVX512F-NEXT: shldq %cl, %r10, %r11
-; AVX512F-NEXT: movq -24(%rsp,%r8), %r10
+; AVX512F-NEXT: movq 104(%rsp,%r8), %r10
; AVX512F-NEXT: movq %r10, %rbx
; AVX512F-NEXT: shldq %cl, %rax, %rbx
-; AVX512F-NEXT: movq -16(%rsp,%r8), %rax
+; AVX512F-NEXT: movq 112(%rsp,%r8), %rax
; AVX512F-NEXT: movq %rax, %r14
; AVX512F-NEXT: shldq %cl, %r10, %r14
-; AVX512F-NEXT: movq -8(%rsp,%r8), %r10
+; AVX512F-NEXT: movq 120(%rsp,%r8), %r10
; AVX512F-NEXT: shldq %cl, %rax, %r10
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: movq -64(%rsp,%r8), %rdi
+; AVX512F-NEXT: movq 64(%rsp,%r8), %rdi
; AVX512F-NEXT: shlxq %rcx, %rdi, %r8
; AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx
; AVX512F-NEXT: shldq %cl, %rdi, %rdx
@@ -185,54 +196,59 @@ define i512 @shl_i512(i512 %a0, i512 %a1) nounwind {
; AVX512F-NEXT: movq %rsi, 16(%rax)
; AVX512F-NEXT: movq %rdx, 8(%rax)
; AVX512F-NEXT: movq %r8, (%rax)
-; AVX512F-NEXT: addq $8, %rsp
+; AVX512F-NEXT: leaq -16(%rbp), %rsp
; AVX512F-NEXT: popq %rbx
; AVX512F-NEXT: popq %r14
+; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shl_i512:
; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: pushq %rbp
+; AVX512VL-NEXT: movq %rsp, %rbp
; AVX512VL-NEXT: pushq %r15
; AVX512VL-NEXT: pushq %r14
; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
-; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: andq $-32, %rsp
+; AVX512VL-NEXT: subq $160, %rsp
+; AVX512VL-NEXT: movq 40(%rbp), %rax
+; AVX512VL-NEXT: vmovaps 16(%rbp), %xmm0
+; AVX512VL-NEXT: movq 32(%rbp), %r10
+; AVX512VL-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovups %xmm0, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovaps %ymm0, (%rsp)
+; AVX512VL-NEXT: movq %rsi, {{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movl %eax, %ecx
; AVX512VL-NEXT: andl $63, %ecx
; AVX512VL-NEXT: shrl $3, %eax
; AVX512VL-NEXT: andl $56, %eax
; AVX512VL-NEXT: negl %eax
; AVX512VL-NEXT: movslq %eax, %r9
-; AVX512VL-NEXT: movq -56(%rsp,%r9), %rdx
-; AVX512VL-NEXT: movq -48(%rsp,%r9), %rax
+; AVX512VL-NEXT: movq 72(%rsp,%r9), %rdx
+; AVX512VL-NEXT: movq 80(%rsp,%r9), %rax
; AVX512VL-NEXT: movq %rax, %rsi
; AVX512VL-NEXT: shldq %cl, %rdx, %rsi
-; AVX512VL-NEXT: movq -40(%rsp,%r9), %r10
+; AVX512VL-NEXT: movq 88(%rsp,%r9), %r10
; AVX512VL-NEXT: movq %r10, %r8
; AVX512VL-NEXT: shldq %cl, %rax, %r8
-; AVX512VL-NEXT: movq -32(%rsp,%r9), %r11
+; AVX512VL-NEXT: movq 96(%rsp,%r9), %r11
; AVX512VL-NEXT: movq %r11, %rbx
; AVX512VL-NEXT: shldq %cl, %r10, %rbx
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: movq -24(%rsp,%r9), %rdi
+; AVX512VL-NEXT: movq 104(%rsp,%r9), %rdi
; AVX512VL-NEXT: movq %rdi, %r10
; AVX512VL-NEXT: shldq %cl, %r11, %r10
-; AVX512VL-NEXT: movq -64(%rsp,%r9), %r11
-; AVX512VL-NEXT: movq -16(%rsp,%r9), %r14
+; AVX512VL-NEXT: movq 64(%rsp,%r9), %r11
+; AVX512VL-NEXT: movq 112(%rsp,%r9), %r14
; AVX512VL-NEXT: movq %r14, %r15
; AVX512VL-NEXT: shldq %cl, %rdi, %r15
-; AVX512VL-NEXT: movq -8(%rsp,%r9), %rdi
+; AVX512VL-NEXT: movq 120(%rsp,%r9), %rdi
; AVX512VL-NEXT: shldq %cl, %r14, %rdi
; AVX512VL-NEXT: shlxq %rcx, %r11, %r9
; AVX512VL-NEXT: # kill: def $cl killed $cl killed $rcx
@@ -245,55 +261,61 @@ define i512 @shl_i512(i512 %a0, i512 %a1) nounwind {
; AVX512VL-NEXT: movq %rsi, 16(%rax)
; AVX512VL-NEXT: movq %rdx, 8(%rax)
; AVX512VL-NEXT: movq %r9, (%rax)
+; AVX512VL-NEXT: leaq -24(%rbp), %rsp
; AVX512VL-NEXT: popq %rbx
; AVX512VL-NEXT: popq %r14
; AVX512VL-NEXT: popq %r15
+; AVX512VL-NEXT: popq %rbp
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: shl_i512:
; AVX512VBMI: # %bb.0:
+; AVX512VBMI-NEXT: pushq %rbp
+; AVX512VBMI-NEXT: movq %rsp, %rbp
; AVX512VBMI-NEXT: pushq %r15
; AVX512VBMI-NEXT: pushq %r14
; AVX512VBMI-NEXT: pushq %rbx
-; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512VBMI-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
-; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: andq $-32, %rsp
+; AVX512VBMI-NEXT: subq $160, %rsp
+; AVX512VBMI-NEXT: movq 40(%rbp), %rax
+; AVX512VBMI-NEXT: vmovaps 16(%rbp), %xmm0
+; AVX512VBMI-NEXT: movq 32(%rbp), %r10
+; AVX512VBMI-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovups %xmm0, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovaps %ymm0, (%rsp)
+; AVX512VBMI-NEXT: movq %rsi, {{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movl %eax, %ecx
; AVX512VBMI-NEXT: andl $63, %ecx
; AVX512VBMI-NEXT: shrl $3, %eax
; AVX512VBMI-NEXT: andl $56, %eax
; AVX512VBMI-NEXT: negl %eax
; AVX512VBMI-NEXT: movslq %eax, %r9
-; AVX512VBMI-NEXT: movq -56(%rsp,%r9), %rdx
-; AVX512VBMI-NEXT: movq -48(%rsp,%r9), %rax
+; AVX512VBMI-NEXT: movq 72(%rsp,%r9), %rdx
+; AVX512VBMI-NEXT: movq 80(%rsp,%r9), %rax
; AVX512VBMI-NEXT: movq %rax, %rsi
; AVX512VBMI-NEXT: shldq %cl, %rdx, %rsi
-; AVX512VBMI-NEXT: movq -40(%rsp,%r9), %r10
+; AVX512VBMI-NEXT: movq 88(%rsp,%r9), %r10
; AVX512VBMI-NEXT: movq %r10, %r8
; AVX512VBMI-NEXT: shldq %cl, %rax, %r8
-; AVX512VBMI-NEXT: movq -32(%rsp,%r9), %r11
+; AVX512VBMI-NEXT: movq 96(%rsp,%r9), %r11
; AVX512VBMI-NEXT: movq %r11, %rbx
; AVX512VBMI-NEXT: shldq %cl, %r10, %rbx
; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: movq -24(%rsp,%r9), %rdi
+; AVX512VBMI-NEXT: movq 104(%rsp,%r9), %rdi
; AVX512VBMI-NEXT: movq %rdi, %r10
; AVX512VBMI-NEXT: shldq %cl, %r11, %r10
-; AVX512VBMI-NEXT: movq -64(%rsp,%r9), %r11
-; AVX512VBMI-NEXT: movq -16(%rsp,%r9), %r14
+; AVX512VBMI-NEXT: movq 64(%rsp,%r9), %r11
+; AVX512VBMI-NEXT: movq 112(%rsp,%r9), %r14
; AVX512VBMI-NEXT: movq %r14, %r15
; AVX512VBMI-NEXT: shldq %cl, %rdi, %r15
-; AVX512VBMI-NEXT: movq -8(%rsp,%r9), %rdi
+; AVX512VBMI-NEXT: movq 120(%rsp,%r9), %rdi
; AVX512VBMI-NEXT: shldq %cl, %r14, %rdi
; AVX512VBMI-NEXT: shlxq %rcx, %r11, %r9
; AVX512VBMI-NEXT: # kill: def $cl killed $cl killed $rcx
@@ -306,9 +328,11 @@ define i512 @shl_i512(i512 %a0, i512 %a1) nounwind {
; AVX512VBMI-NEXT: movq %rsi, 16(%rax)
; AVX512VBMI-NEXT: movq %rdx, 8(%rax)
; AVX512VBMI-NEXT: movq %r9, (%rax)
+; AVX512VBMI-NEXT: leaq -24(%rbp), %rsp
; AVX512VBMI-NEXT: popq %rbx
; AVX512VBMI-NEXT: popq %r14
; AVX512VBMI-NEXT: popq %r15
+; AVX512VBMI-NEXT: popq %rbp
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%r = shl i512 %a0, %a1
@@ -318,43 +342,47 @@ define i512 @shl_i512(i512 %a0, i512 %a1) nounwind {
define i512 @lshr_i512(i512 %a0, i512 %a1) nounwind {
; SSE-LABEL: lshr_i512:
; SSE: # %bb.0:
+; SSE-NEXT: pushq %rbp
+; SSE-NEXT: movq %rsp, %rbp
; SSE-NEXT: pushq %r15
; SSE-NEXT: pushq %r14
; SSE-NEXT: pushq %rbx
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; SSE-NEXT: andq $-32, %rsp
+; SSE-NEXT: subq $160, %rsp
+; SSE-NEXT: movq 40(%rbp), %rax
+; SSE-NEXT: movaps 16(%rbp), %xmm0
+; SSE-NEXT: movq 32(%rbp), %r10
; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rsi, (%rsp)
; SSE-NEXT: movl %eax, %ecx
; SSE-NEXT: andl $63, %ecx
; SSE-NEXT: shrl $3, %eax
; SSE-NEXT: andl $56, %eax
-; SSE-NEXT: movq -112(%rsp,%rax), %rdx
-; SSE-NEXT: movq -120(%rsp,%rax), %r9
+; SSE-NEXT: movq 16(%rsp,%rax), %rdx
+; SSE-NEXT: movq 8(%rsp,%rax), %r9
; SSE-NEXT: movq %r9, %rsi
; SSE-NEXT: shrdq %cl, %rdx, %rsi
-; SSE-NEXT: movq -104(%rsp,%rax), %r8
+; SSE-NEXT: movq 24(%rsp,%rax), %r8
; SSE-NEXT: shrdq %cl, %r8, %rdx
-; SSE-NEXT: movq -96(%rsp,%rax), %r10
+; SSE-NEXT: movq 32(%rsp,%rax), %r10
; SSE-NEXT: shrdq %cl, %r10, %r8
-; SSE-NEXT: movq -88(%rsp,%rax), %r11
+; SSE-NEXT: movq 40(%rsp,%rax), %r11
; SSE-NEXT: shrdq %cl, %r11, %r10
-; SSE-NEXT: movq -80(%rsp,%rax), %rbx
+; SSE-NEXT: movq 48(%rsp,%rax), %rbx
; SSE-NEXT: shrdq %cl, %rbx, %r11
-; SSE-NEXT: movq -72(%rsp,%rax), %r14
+; SSE-NEXT: movq 56(%rsp,%rax), %r14
; SSE-NEXT: shrdq %cl, %r14, %rbx
-; SSE-NEXT: movq -128(%rsp,%rax), %r15
+; SSE-NEXT: movq (%rsp,%rax), %r15
; SSE-NEXT: shrdq %cl, %r9, %r15
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
@@ -367,47 +395,53 @@ define i512 @lshr_i512(i512 %a0, i512 %a1) nounwind {
; SSE-NEXT: movq %rdx, 16(%rdi)
; SSE-NEXT: movq %rsi, 8(%rdi)
; SSE-NEXT: movq %r15, (%rdi)
+; SSE-NEXT: leaq -24(%rbp), %rsp
; SSE-NEXT: popq %rbx
; SSE-NEXT: popq %r14
; SSE-NEXT: popq %r15
+; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
; AVX2-LABEL: lshr_i512:
; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: movq %rsp, %rbp
; AVX2-NEXT: pushq %r15
; AVX2-NEXT: pushq %r14
; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $160, %rsp
+; AVX2-NEXT: movq 40(%rbp), %rax
+; AVX2-NEXT: vmovaps 16(%rbp), %xmm0
; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX2-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq 32(%rbp), %r10
+; AVX2-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %xmm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rsi, (%rsp)
; AVX2-NEXT: movl %eax, %ecx
; AVX2-NEXT: andl $63, %ecx
; AVX2-NEXT: shrl $3, %eax
; AVX2-NEXT: andl $56, %eax
-; AVX2-NEXT: movq -112(%rsp,%rax), %rdx
-; AVX2-NEXT: movq -120(%rsp,%rax), %r9
+; AVX2-NEXT: movq 16(%rsp,%rax), %rdx
+; AVX2-NEXT: movq 8(%rsp,%rax), %r9
; AVX2-NEXT: movq %r9, %rsi
; AVX2-NEXT: shrdq %cl, %rdx, %rsi
-; AVX2-NEXT: movq -104(%rsp,%rax), %r8
+; AVX2-NEXT: movq 24(%rsp,%rax), %r8
; AVX2-NEXT: shrdq %cl, %r8, %rdx
-; AVX2-NEXT: movq -96(%rsp,%rax), %r10
+; AVX2-NEXT: movq 32(%rsp,%rax), %r10
; AVX2-NEXT: shrdq %cl, %r10, %r8
-; AVX2-NEXT: movq -88(%rsp,%rax), %r11
+; AVX2-NEXT: movq 40(%rsp,%rax), %r11
; AVX2-NEXT: shrdq %cl, %r11, %r10
-; AVX2-NEXT: movq -80(%rsp,%rax), %rbx
+; AVX2-NEXT: movq 48(%rsp,%rax), %rbx
; AVX2-NEXT: shrdq %cl, %rbx, %r11
-; AVX2-NEXT: movq -128(%rsp,%rax), %r14
-; AVX2-NEXT: movq -72(%rsp,%rax), %r15
+; AVX2-NEXT: movq (%rsp,%rax), %r14
+; AVX2-NEXT: movq 56(%rsp,%rax), %r15
; AVX2-NEXT: shrdq %cl, %r15, %rbx
; AVX2-NEXT: shrdq %cl, %r9, %r14
; AVX2-NEXT: movq %rdi, %rax
@@ -420,47 +454,53 @@ define i512 @lshr_i512(i512 %a0, i512 %a1) nounwind {
; AVX2-NEXT: movq %rdx, 16(%rdi)
; AVX2-NEXT: movq %rsi, 8(%rdi)
; AVX2-NEXT: movq %r14, (%rdi)
+; AVX2-NEXT: leaq -24(%rbp), %rsp
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r14
; AVX2-NEXT: popq %r15
+; AVX2-NEXT: popq %rbp
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: lshr_i512:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: movq %rsp, %rbp
; AVX512F-NEXT: pushq %r15
; AVX512F-NEXT: pushq %r14
; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $160, %rsp
+; AVX512F-NEXT: movq 40(%rbp), %rax
+; AVX512F-NEXT: vmovaps 16(%rbp), %xmm0
+; AVX512F-NEXT: movq 32(%rbp), %r10
; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovups %zmm1, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovups %xmm0, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rsi, (%rsp)
; AVX512F-NEXT: movl %eax, %ecx
; AVX512F-NEXT: andl $63, %ecx
; AVX512F-NEXT: shrl $3, %eax
; AVX512F-NEXT: andl $56, %eax
-; AVX512F-NEXT: movq -112(%rsp,%rax), %rdx
-; AVX512F-NEXT: movq -120(%rsp,%rax), %r9
+; AVX512F-NEXT: movq 16(%rsp,%rax), %rdx
+; AVX512F-NEXT: movq 8(%rsp,%rax), %r9
; AVX512F-NEXT: movq %r9, %rsi
; AVX512F-NEXT: shrdq %cl, %rdx, %rsi
-; AVX512F-NEXT: movq -104(%rsp,%rax), %r8
+; AVX512F-NEXT: movq 24(%rsp,%rax), %r8
; AVX512F-NEXT: shrdq %cl, %r8, %rdx
-; AVX512F-NEXT: movq -96(%rsp,%rax), %r10
+; AVX512F-NEXT: movq 32(%rsp,%rax), %r10
; AVX512F-NEXT: shrdq %cl, %r10, %r8
-; AVX512F-NEXT: movq -88(%rsp,%rax), %r11
+; AVX512F-NEXT: movq 40(%rsp,%rax), %r11
; AVX512F-NEXT: shrdq %cl, %r11, %r10
-; AVX512F-NEXT: movq -80(%rsp,%rax), %rbx
+; AVX512F-NEXT: movq 48(%rsp,%rax), %rbx
; AVX512F-NEXT: shrdq %cl, %rbx, %r11
-; AVX512F-NEXT: movq -128(%rsp,%rax), %r14
-; AVX512F-NEXT: movq -72(%rsp,%rax), %r15
+; AVX512F-NEXT: movq (%rsp,%rax), %r14
+; AVX512F-NEXT: movq 56(%rsp,%rax), %r15
; AVX512F-NEXT: shrdq %cl, %r15, %rbx
; AVX512F-NEXT: shrdq %cl, %r9, %r14
; AVX512F-NEXT: movq %rdi, %rax
@@ -473,48 +513,54 @@ define i512 @lshr_i512(i512 %a0, i512 %a1) nounwind {
; AVX512F-NEXT: movq %rdx, 16(%rdi)
; AVX512F-NEXT: movq %rsi, 8(%rdi)
; AVX512F-NEXT: movq %r14, (%rdi)
+; AVX512F-NEXT: leaq -24(%rbp), %rsp
; AVX512F-NEXT: popq %rbx
; AVX512F-NEXT: popq %r14
; AVX512F-NEXT: popq %r15
+; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: lshr_i512:
; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: pushq %rbp
+; AVX512VL-NEXT: movq %rsp, %rbp
; AVX512VL-NEXT: pushq %r15
; AVX512VL-NEXT: pushq %r14
; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
-; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512VL-NEXT: andq $-32, %rsp
+; AVX512VL-NEXT: subq $160, %rsp
+; AVX512VL-NEXT: vmovaps 16(%rbp), %xmm0
+; AVX512VL-NEXT: movq 40(%rbp), %rax
; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq 32(%rbp), %r10
+; AVX512VL-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovups %xmm0, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rsi, (%rsp)
; AVX512VL-NEXT: movl %eax, %ecx
; AVX512VL-NEXT: andl $63, %ecx
; AVX512VL-NEXT: shrl $3, %eax
; AVX512VL-NEXT: andl $56, %eax
-; AVX512VL-NEXT: movq -112(%rsp,%rax), %rdx
-; AVX512VL-NEXT: movq -120(%rsp,%rax), %r9
+; AVX512VL-NEXT: movq 16(%rsp,%rax), %rdx
+; AVX512VL-NEXT: movq 8(%rsp,%rax), %r9
; AVX512VL-NEXT: movq %r9, %rsi
; AVX512VL-NEXT: shrdq %cl, %rdx, %rsi
-; AVX512VL-NEXT: movq -104(%rsp,%rax), %r8
+; AVX512VL-NEXT: movq 24(%rsp,%rax), %r8
; AVX512VL-NEXT: shrdq %cl, %r8, %rdx
-; AVX512VL-NEXT: movq -96(%rsp,%rax), %r10
+; AVX512VL-NEXT: movq 32(%rsp,%rax), %r10
; AVX512VL-NEXT: shrdq %cl, %r10, %r8
-; AVX512VL-NEXT: movq -88(%rsp,%rax), %r11
+; AVX512VL-NEXT: movq 40(%rsp,%rax), %r11
; AVX512VL-NEXT: shrdq %cl, %r11, %r10
-; AVX512VL-NEXT: movq -80(%rsp,%rax), %rbx
+; AVX512VL-NEXT: movq 48(%rsp,%rax), %rbx
; AVX512VL-NEXT: shrdq %cl, %rbx, %r11
-; AVX512VL-NEXT: movq -72(%rsp,%rax), %r14
+; AVX512VL-NEXT: movq 56(%rsp,%rax), %r14
; AVX512VL-NEXT: shrdq %cl, %r14, %rbx
-; AVX512VL-NEXT: movq -128(%rsp,%rax), %r15
+; AVX512VL-NEXT: movq (%rsp,%rax), %r15
; AVX512VL-NEXT: shrdq %cl, %r9, %r15
; AVX512VL-NEXT: movq %rdi, %rax
; AVX512VL-NEXT: shrxq %rcx, %r14, %rcx
@@ -526,49 +572,55 @@ define i512 @lshr_i512(i512 %a0, i512 %a1) nounwind {
; AVX512VL-NEXT: movq %rdx, 16(%rdi)
; AVX512VL-NEXT: movq %rsi, 8(%rdi)
; AVX512VL-NEXT: movq %r15, (%rdi)
+; AVX512VL-NEXT: leaq -24(%rbp), %rsp
; AVX512VL-NEXT: popq %rbx
; AVX512VL-NEXT: popq %r14
; AVX512VL-NEXT: popq %r15
+; AVX512VL-NEXT: popq %rbp
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: lshr_i512:
; AVX512VBMI: # %bb.0:
+; AVX512VBMI-NEXT: pushq %rbp
+; AVX512VBMI-NEXT: movq %rsp, %rbp
; AVX512VBMI-NEXT: pushq %r15
; AVX512VBMI-NEXT: pushq %r14
; AVX512VBMI-NEXT: pushq %rbx
-; AVX512VBMI-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
-; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512VBMI-NEXT: andq $-32, %rsp
+; AVX512VBMI-NEXT: subq $160, %rsp
+; AVX512VBMI-NEXT: vmovaps 16(%rbp), %xmm0
+; AVX512VBMI-NEXT: movq 40(%rbp), %rax
; AVX512VBMI-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq 32(%rbp), %r10
+; AVX512VBMI-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovups %xmm0, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rsi, (%rsp)
; AVX512VBMI-NEXT: movl %eax, %ecx
; AVX512VBMI-NEXT: andl $63, %ecx
; AVX512VBMI-NEXT: shrl $3, %eax
; AVX512VBMI-NEXT: andl $56, %eax
-; AVX512VBMI-NEXT: movq -112(%rsp,%rax), %rdx
-; AVX512VBMI-NEXT: movq -120(%rsp,%rax), %r9
+; AVX512VBMI-NEXT: movq 16(%rsp,%rax), %rdx
+; AVX512VBMI-NEXT: movq 8(%rsp,%rax), %r9
; AVX512VBMI-NEXT: movq %r9, %rsi
; AVX512VBMI-NEXT: shrdq %cl, %rdx, %rsi
-; AVX512VBMI-NEXT: movq -104(%rsp,%rax), %r8
+; AVX512VBMI-NEXT: movq 24(%rsp,%rax), %r8
; AVX512VBMI-NEXT: shrdq %cl, %r8, %rdx
-; AVX512VBMI-NEXT: movq -96(%rsp,%rax), %r10
+; AVX512VBMI-NEXT: movq 32(%rsp,%rax), %r10
; AVX512VBMI-NEXT: shrdq %cl, %r10, %r8
-; AVX512VBMI-NEXT: movq -88(%rsp,%rax), %r11
+; AVX512VBMI-NEXT: movq 40(%rsp,%rax), %r11
; AVX512VBMI-NEXT: shrdq %cl, %r11, %r10
-; AVX512VBMI-NEXT: movq -80(%rsp,%rax), %rbx
+; AVX512VBMI-NEXT: movq 48(%rsp,%rax), %rbx
; AVX512VBMI-NEXT: shrdq %cl, %rbx, %r11
-; AVX512VBMI-NEXT: movq -72(%rsp,%rax), %r14
+; AVX512VBMI-NEXT: movq 56(%rsp,%rax), %r14
; AVX512VBMI-NEXT: shrdq %cl, %r14, %rbx
-; AVX512VBMI-NEXT: movq -128(%rsp,%rax), %r15
+; AVX512VBMI-NEXT: movq (%rsp,%rax), %r15
; AVX512VBMI-NEXT: shrdq %cl, %r9, %r15
; AVX512VBMI-NEXT: movq %rdi, %rax
; AVX512VBMI-NEXT: shrxq %rcx, %r14, %rcx
@@ -580,9 +632,11 @@ define i512 @lshr_i512(i512 %a0, i512 %a1) nounwind {
; AVX512VBMI-NEXT: movq %rdx, 16(%rdi)
; AVX512VBMI-NEXT: movq %rsi, 8(%rdi)
; AVX512VBMI-NEXT: movq %r15, (%rdi)
+; AVX512VBMI-NEXT: leaq -24(%rbp), %rsp
; AVX512VBMI-NEXT: popq %rbx
; AVX512VBMI-NEXT: popq %r14
; AVX512VBMI-NEXT: popq %r15
+; AVX512VBMI-NEXT: popq %rbp
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%r = lshr i512 %a0, %a1
@@ -592,47 +646,51 @@ define i512 @lshr_i512(i512 %a0, i512 %a1) nounwind {
define i512 @ashr_i512(i512 %a0, i512 %a1) nounwind {
; SSE-LABEL: ashr_i512:
; SSE: # %bb.0:
+; SSE-NEXT: pushq %rbp
+; SSE-NEXT: movq %rsp, %rbp
; SSE-NEXT: pushq %r15
; SSE-NEXT: pushq %r14
; SSE-NEXT: pushq %rbx
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: andq $-32, %rsp
+; SSE-NEXT: subq $160, %rsp
+; SSE-NEXT: movq 40(%rbp), %rax
+; SSE-NEXT: movaps 16(%rbp), %xmm0
+; SSE-NEXT: movq 32(%rbp), %r10
+; SSE-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rsi, (%rsp)
; SSE-NEXT: sarq $63, %r10
-; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %r10, {{[0-9]+}}(%rsp)
; SSE-NEXT: movl %eax, %ecx
; SSE-NEXT: andl $63, %ecx
; SSE-NEXT: shrl $3, %eax
; SSE-NEXT: andl $56, %eax
-; SSE-NEXT: movq -112(%rsp,%rax), %rdx
-; SSE-NEXT: movq -120(%rsp,%rax), %r9
+; SSE-NEXT: movq 16(%rsp,%rax), %rdx
+; SSE-NEXT: movq 8(%rsp,%rax), %r9
; SSE-NEXT: movq %r9, %rsi
; SSE-NEXT: shrdq %cl, %rdx, %rsi
-; SSE-NEXT: movq -104(%rsp,%rax), %r8
+; SSE-NEXT: movq 24(%rsp,%rax), %r8
; SSE-NEXT: shrdq %cl, %r8, %rdx
-; SSE-NEXT: movq -96(%rsp,%rax), %r10
+; SSE-NEXT: movq 32(%rsp,%rax), %r10
; SSE-NEXT: shrdq %cl, %r10, %r8
-; SSE-NEXT: movq -88(%rsp,%rax), %r11
+; SSE-NEXT: movq 40(%rsp,%rax), %r11
; SSE-NEXT: shrdq %cl, %r11, %r10
-; SSE-NEXT: movq -80(%rsp,%rax), %rbx
+; SSE-NEXT: movq 48(%rsp,%rax), %rbx
; SSE-NEXT: shrdq %cl, %rbx, %r11
-; SSE-NEXT: movq -72(%rsp,%rax), %r14
+; SSE-NEXT: movq 56(%rsp,%rax), %r14
; SSE-NEXT: shrdq %cl, %r14, %rbx
-; SSE-NEXT: movq -128(%rsp,%rax), %r15
+; SSE-NEXT: movq (%rsp,%rax), %r15
; SSE-NEXT: shrdq %cl, %r9, %r15
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
@@ -645,53 +703,59 @@ define i512 @ashr_i512(i512 %a0, i512 %a1) nounwind {
; SSE-NEXT: movq %rdx, 16(%rdi)
; SSE-NEXT: movq %rsi, 8(%rdi)
; SSE-NEXT: movq %r15, (%rdi)
+; SSE-NEXT: leaq -24(%rbp), %rsp
; SSE-NEXT: popq %rbx
; SSE-NEXT: popq %r14
; SSE-NEXT: popq %r15
+; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
; AVX2-LABEL: ashr_i512:
; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: movq %rsp, %rbp
; AVX2-NEXT: pushq %r15
; AVX2-NEXT: pushq %r14
; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $160, %rsp
+; AVX2-NEXT: vmovaps 16(%rbp), %xmm0
+; AVX2-NEXT: movq 32(%rbp), %r10
+; AVX2-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %xmm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq 40(%rbp), %rax
+; AVX2-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rsi, (%rsp)
; AVX2-NEXT: sarq $63, %r10
-; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %r10, {{[0-9]+}}(%rsp)
; AVX2-NEXT: movl %eax, %ecx
; AVX2-NEXT: andl $63, %ecx
; AVX2-NEXT: shrl $3, %eax
; AVX2-NEXT: andl $56, %eax
-; AVX2-NEXT: movq -112(%rsp,%rax), %rdx
-; AVX2-NEXT: movq -120(%rsp,%rax), %r9
+; AVX2-NEXT: movq 16(%rsp,%rax), %rdx
+; AVX2-NEXT: movq 8(%rsp,%rax), %r9
; AVX2-NEXT: movq %r9, %rsi
; AVX2-NEXT: shrdq %cl, %rdx, %rsi
-; AVX2-NEXT: movq -104(%rsp,%rax), %r8
+; AVX2-NEXT: movq 24(%rsp,%rax), %r8
; AVX2-NEXT: shrdq %cl, %r8, %rdx
-; AVX2-NEXT: movq -96(%rsp,%rax), %r10
+; AVX2-NEXT: movq 32(%rsp,%rax), %r10
; AVX2-NEXT: shrdq %cl, %r10, %r8
-; AVX2-NEXT: movq -88(%rsp,%rax), %r11
+; AVX2-NEXT: movq 40(%rsp,%rax), %r11
; AVX2-NEXT: shrdq %cl, %r11, %r10
-; AVX2-NEXT: movq -80(%rsp,%rax), %rbx
+; AVX2-NEXT: movq 48(%rsp,%rax), %rbx
; AVX2-NEXT: shrdq %cl, %rbx, %r11
-; AVX2-NEXT: movq -128(%rsp,%rax), %r14
-; AVX2-NEXT: movq -72(%rsp,%rax), %r15
+; AVX2-NEXT: movq (%rsp,%rax), %r14
+; AVX2-NEXT: movq 56(%rsp,%rax), %r15
; AVX2-NEXT: shrdq %cl, %r15, %rbx
; AVX2-NEXT: shrdq %cl, %r9, %r14
; AVX2-NEXT: movq %rdi, %rax
@@ -704,53 +768,59 @@ define i512 @ashr_i512(i512 %a0, i512 %a1) nounwind {
; AVX2-NEXT: movq %rdx, 16(%rdi)
; AVX2-NEXT: movq %rsi, 8(%rdi)
; AVX2-NEXT: movq %r14, (%rdi)
+; AVX2-NEXT: leaq -24(%rbp), %rsp
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r14
; AVX2-NEXT: popq %r15
+; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
;
; AVX512F-LABEL: ashr_i512:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: movq %rsp, %rbp
; AVX512F-NEXT: pushq %r15
; AVX512F-NEXT: pushq %r14
; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $160, %rsp
+; AVX512F-NEXT: vmovaps 16(%rbp), %xmm0
+; AVX512F-NEXT: movq 32(%rbp), %r10
+; AVX512F-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovups %xmm0, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq 40(%rbp), %rax
+; AVX512F-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rsi, (%rsp)
; AVX512F-NEXT: sarq $63, %r10
-; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %r10, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: movl %eax, %ecx
; AVX512F-NEXT: andl $63, %ecx
; AVX512F-NEXT: shrl $3, %eax
; AVX512F-NEXT: andl $56, %eax
-; AVX512F-NEXT: movq -112(%rsp,%rax), %rdx
-; AVX512F-NEXT: movq -120(%rsp,%rax), %r9
+; AVX512F-NEXT: movq 16(%rsp,%rax), %rdx
+; AVX512F-NEXT: movq 8(%rsp,%rax), %r9
; AVX512F-NEXT: movq %r9, %rsi
; AVX512F-NEXT: shrdq %cl, %rdx, %rsi
-; AVX512F-NEXT: movq -104(%rsp,%rax), %r8
+; AVX512F-NEXT: movq 24(%rsp,%rax), %r8
; AVX512F-NEXT: shrdq %cl, %r8, %rdx
-; AVX512F-NEXT: movq -96(%rsp,%rax), %r10
+; AVX512F-NEXT: movq 32(%rsp,%rax), %r10
; AVX512F-NEXT: shrdq %cl, %r10, %r8
-; AVX512F-NEXT: movq -88(%rsp,%rax), %r11
+; AVX512F-NEXT: movq 40(%rsp,%rax), %r11
; AVX512F-NEXT: shrdq %cl, %r11, %r10
-; AVX512F-NEXT: movq -80(%rsp,%rax), %rbx
+; AVX512F-NEXT: movq 48(%rsp,%rax), %rbx
; AVX512F-NEXT: shrdq %cl, %rbx, %r11
-; AVX512F-NEXT: movq -128(%rsp,%rax), %r14
-; AVX512F-NEXT: movq -72(%rsp,%rax), %r15
+; AVX512F-NEXT: movq (%rsp,%rax), %r14
+; AVX512F-NEXT: movq 56(%rsp,%rax), %r15
; AVX512F-NEXT: shrdq %cl, %r15, %rbx
; AVX512F-NEXT: shrdq %cl, %r9, %r14
; AVX512F-NEXT: movq %rdi, %rax
@@ -763,54 +833,60 @@ define i512 @ashr_i512(i512 %a0, i512 %a1) nounwind {
; AVX512F-NEXT: movq %rdx, 16(%rdi)
; AVX512F-NEXT: movq %rsi, 8(%rdi)
; AVX512F-NEXT: movq %r14, (%rdi)
+; AVX512F-NEXT: leaq -24(%rbp), %rsp
; AVX512F-NEXT: popq %rbx
; AVX512F-NEXT: popq %r14
; AVX512F-NEXT: popq %r15
+; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: ashr_i512:
; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: pushq %rbp
+; AVX512VL-NEXT: movq %rsp, %rbp
; AVX512VL-NEXT: pushq %r15
; AVX512VL-NEXT: pushq %r14
; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
-; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512VL-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: andq $-32, %rsp
+; AVX512VL-NEXT: subq $160, %rsp
+; AVX512VL-NEXT: vmovaps 16(%rbp), %xmm0
+; AVX512VL-NEXT: movq 32(%rbp), %r10
+; AVX512VL-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovups %xmm0, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq 40(%rbp), %rax
+; AVX512VL-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rsi, (%rsp)
; AVX512VL-NEXT: sarq $63, %r10
-; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %r10, {{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movl %eax, %ecx
; AVX512VL-NEXT: andl $63, %ecx
; AVX512VL-NEXT: shrl $3, %eax
; AVX512VL-NEXT: andl $56, %eax
-; AVX512VL-NEXT: movq -112(%rsp,%rax), %rdx
-; AVX512VL-NEXT: movq -120(%rsp,%rax), %r9
+; AVX512VL-NEXT: movq 16(%rsp,%rax), %rdx
+; AVX512VL-NEXT: movq 8(%rsp,%rax), %r9
; AVX512VL-NEXT: movq %r9, %rsi
; AVX512VL-NEXT: shrdq %cl, %rdx, %rsi
-; AVX512VL-NEXT: movq -104(%rsp,%rax), %r8
+; AVX512VL-NEXT: movq 24(%rsp,%rax), %r8
; AVX512VL-NEXT: shrdq %cl, %r8, %rdx
-; AVX512VL-NEXT: movq -96(%rsp,%rax), %r10
+; AVX512VL-NEXT: movq 32(%rsp,%rax), %r10
; AVX512VL-NEXT: shrdq %cl, %r10, %r8
-; AVX512VL-NEXT: movq -88(%rsp,%rax), %r11
+; AVX512VL-NEXT: movq 40(%rsp,%rax), %r11
; AVX512VL-NEXT: shrdq %cl, %r11, %r10
-; AVX512VL-NEXT: movq -80(%rsp,%rax), %rbx
+; AVX512VL-NEXT: movq 48(%rsp,%rax), %rbx
; AVX512VL-NEXT: shrdq %cl, %rbx, %r11
-; AVX512VL-NEXT: movq -72(%rsp,%rax), %r14
+; AVX512VL-NEXT: movq 56(%rsp,%rax), %r14
; AVX512VL-NEXT: shrdq %cl, %r14, %rbx
-; AVX512VL-NEXT: movq -128(%rsp,%rax), %r15
+; AVX512VL-NEXT: movq (%rsp,%rax), %r15
; AVX512VL-NEXT: shrdq %cl, %r9, %r15
; AVX512VL-NEXT: movq %rdi, %rax
; AVX512VL-NEXT: sarxq %rcx, %r14, %rcx
@@ -822,54 +898,60 @@ define i512 @ashr_i512(i512 %a0, i512 %a1) nounwind {
; AVX512VL-NEXT: movq %rdx, 16(%rdi)
; AVX512VL-NEXT: movq %rsi, 8(%rdi)
; AVX512VL-NEXT: movq %r15, (%rdi)
+; AVX512VL-NEXT: leaq -24(%rbp), %rsp
; AVX512VL-NEXT: popq %rbx
; AVX512VL-NEXT: popq %r14
; AVX512VL-NEXT: popq %r15
+; AVX512VL-NEXT: popq %rbp
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: ashr_i512:
; AVX512VBMI: # %bb.0:
+; AVX512VBMI-NEXT: pushq %rbp
+; AVX512VBMI-NEXT: movq %rsp, %rbp
; AVX512VBMI-NEXT: pushq %r15
; AVX512VBMI-NEXT: pushq %r14
; AVX512VBMI-NEXT: pushq %rbx
-; AVX512VBMI-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
-; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512VBMI-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: andq $-32, %rsp
+; AVX512VBMI-NEXT: subq $160, %rsp
+; AVX512VBMI-NEXT: vmovaps 16(%rbp), %xmm0
+; AVX512VBMI-NEXT: movq 32(%rbp), %r10
+; AVX512VBMI-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovups %xmm0, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq 40(%rbp), %rax
+; AVX512VBMI-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rsi, (%rsp)
; AVX512VBMI-NEXT: sarq $63, %r10
-; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %r10, {{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movl %eax, %ecx
; AVX512VBMI-NEXT: andl $63, %ecx
; AVX512VBMI-NEXT: shrl $3, %eax
; AVX512VBMI-NEXT: andl $56, %eax
-; AVX512VBMI-NEXT: movq -112(%rsp,%rax), %rdx
-; AVX512VBMI-NEXT: movq -120(%rsp,%rax), %r9
+; AVX512VBMI-NEXT: movq 16(%rsp,%rax), %rdx
+; AVX512VBMI-NEXT: movq 8(%rsp,%rax), %r9
; AVX512VBMI-NEXT: movq %r9, %rsi
; AVX512VBMI-NEXT: shrdq %cl, %rdx, %rsi
-; AVX512VBMI-NEXT: movq -104(%rsp,%rax), %r8
+; AVX512VBMI-NEXT: movq 24(%rsp,%rax), %r8
; AVX512VBMI-NEXT: shrdq %cl, %r8, %rdx
-; AVX512VBMI-NEXT: movq -96(%rsp,%rax), %r10
+; AVX512VBMI-NEXT: movq 32(%rsp,%rax), %r10
; AVX512VBMI-NEXT: shrdq %cl, %r10, %r8
-; AVX512VBMI-NEXT: movq -88(%rsp,%rax), %r11
+; AVX512VBMI-NEXT: movq 40(%rsp,%rax), %r11
; AVX512VBMI-NEXT: shrdq %cl, %r11, %r10
-; AVX512VBMI-NEXT: movq -80(%rsp,%rax), %rbx
+; AVX512VBMI-NEXT: movq 48(%rsp,%rax), %rbx
; AVX512VBMI-NEXT: shrdq %cl, %rbx, %r11
-; AVX512VBMI-NEXT: movq -72(%rsp,%rax), %r14
+; AVX512VBMI-NEXT: movq 56(%rsp,%rax), %r14
; AVX512VBMI-NEXT: shrdq %cl, %r14, %rbx
-; AVX512VBMI-NEXT: movq -128(%rsp,%rax), %r15
+; AVX512VBMI-NEXT: movq (%rsp,%rax), %r15
; AVX512VBMI-NEXT: shrdq %cl, %r9, %r15
; AVX512VBMI-NEXT: movq %rdi, %rax
; AVX512VBMI-NEXT: sarxq %rcx, %r14, %rcx
@@ -881,9 +963,11 @@ define i512 @ashr_i512(i512 %a0, i512 %a1) nounwind {
; AVX512VBMI-NEXT: movq %rdx, 16(%rdi)
; AVX512VBMI-NEXT: movq %rsi, 8(%rdi)
; AVX512VBMI-NEXT: movq %r15, (%rdi)
+; AVX512VBMI-NEXT: leaq -24(%rbp), %rsp
; AVX512VBMI-NEXT: popq %rbx
; AVX512VBMI-NEXT: popq %r14
; AVX512VBMI-NEXT: popq %r15
+; AVX512VBMI-NEXT: popq %rbp
; AVX512VBMI-NEXT: retq
%r = ashr i512 %a0, %a1
ret i512 %r
@@ -892,103 +976,110 @@ define i512 @ashr_i512(i512 %a0, i512 %a1) nounwind {
define i512 @shl_i512_load(ptr %p0, i512 %a1) nounwind {
; SSE-LABEL: shl_i512_load:
; SSE: # %bb.0:
+; SSE-NEXT: pushq %rbp
+; SSE-NEXT: movq %rsp, %rbp
; SSE-NEXT: pushq %r14
; SSE-NEXT: pushq %rbx
-; SSE-NEXT: pushq %rax
+; SSE-NEXT: andq $-32, %rsp
+; SSE-NEXT: subq $128, %rsp
; SSE-NEXT: movaps (%rsi), %xmm0
; SSE-NEXT: movaps 16(%rsi), %xmm1
; SSE-NEXT: movaps 32(%rsi), %xmm2
; SSE-NEXT: movaps 48(%rsi), %xmm3
; SSE-NEXT: xorps %xmm4, %xmm4
-; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm4, (%rsp)
+; SSE-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE-NEXT: movl %edx, %ecx
; SSE-NEXT: andl $63, %ecx
; SSE-NEXT: shrl $3, %edx
; SSE-NEXT: andl $56, %edx
; SSE-NEXT: negl %edx
; SSE-NEXT: movslq %edx, %rax
-; SSE-NEXT: movq -56(%rsp,%rax), %rdx
-; SSE-NEXT: movq -48(%rsp,%rax), %r9
+; SSE-NEXT: movq 72(%rsp,%rax), %rdx
+; SSE-NEXT: movq 80(%rsp,%rax), %r9
; SSE-NEXT: movq %r9, %rsi
; SSE-NEXT: shldq %cl, %rdx, %rsi
-; SSE-NEXT: movq -40(%rsp,%rax), %r10
+; SSE-NEXT: movq 88(%rsp,%rax), %r10
; SSE-NEXT: movq %r10, %r8
; SSE-NEXT: shldq %cl, %r9, %r8
-; SSE-NEXT: movq -32(%rsp,%rax), %r9
-; SSE-NEXT: movq %r9, %r11
-; SSE-NEXT: shldq %cl, %r10, %r11
-; SSE-NEXT: movq -24(%rsp,%rax), %r10
+; SSE-NEXT: movq 96(%rsp,%rax), %r11
+; SSE-NEXT: movq %r11, %r9
+; SSE-NEXT: shldq %cl, %r10, %r9
+; SSE-NEXT: movq 104(%rsp,%rax), %r10
; SSE-NEXT: movq %r10, %rbx
-; SSE-NEXT: shldq %cl, %r9, %rbx
-; SSE-NEXT: movq -16(%rsp,%rax), %r9
-; SSE-NEXT: movq %r9, %r14
+; SSE-NEXT: shldq %cl, %r11, %rbx
+; SSE-NEXT: movq 112(%rsp,%rax), %r11
+; SSE-NEXT: movq %r11, %r14
; SSE-NEXT: shldq %cl, %r10, %r14
-; SSE-NEXT: movq -8(%rsp,%rax), %r10
-; SSE-NEXT: shldq %cl, %r9, %r10
-; SSE-NEXT: movq -64(%rsp,%rax), %rax
-; SSE-NEXT: movq %rax, %r9
-; SSE-NEXT: shlq %cl, %r9
+; SSE-NEXT: movq 120(%rsp,%rax), %r10
+; SSE-NEXT: shldq %cl, %r11, %r10
+; SSE-NEXT: movq 64(%rsp,%rax), %rax
+; SSE-NEXT: movq %rax, %r11
+; SSE-NEXT: shlq %cl, %r11
; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
; SSE-NEXT: shldq %cl, %rax, %rdx
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: movq %r10, 56(%rdi)
; SSE-NEXT: movq %r14, 48(%rdi)
; SSE-NEXT: movq %rbx, 40(%rdi)
-; SSE-NEXT: movq %r11, 32(%rdi)
+; SSE-NEXT: movq %r9, 32(%rdi)
; SSE-NEXT: movq %r8, 24(%rdi)
; SSE-NEXT: movq %rsi, 16(%rdi)
; SSE-NEXT: movq %rdx, 8(%rdi)
-; SSE-NEXT: movq %r9, (%rdi)
-; SSE-NEXT: addq $8, %rsp
+; SSE-NEXT: movq %r11, (%rdi)
+; SSE-NEXT: leaq -16(%rbp), %rsp
; SSE-NEXT: popq %rbx
; SSE-NEXT: popq %r14
+; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
; AVX2-LABEL: shl_i512_load:
; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: movq %rsp, %rbp
; AVX2-NEXT: pushq %r14
; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: pushq %rax
-; AVX2-NEXT: vmovups (%rsi), %ymm0
-; AVX2-NEXT: vmovups 32(%rsi), %ymm1
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $128, %rsp
+; AVX2-NEXT: vmovaps (%rsi), %ymm0
+; AVX2-NEXT: vmovaps 32(%rsi), %ymm1
; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm2, (%rsp)
+; AVX2-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: movl %edx, %ecx
; AVX2-NEXT: andl $63, %ecx
; AVX2-NEXT: shrl $3, %edx
; AVX2-NEXT: andl $56, %edx
; AVX2-NEXT: negl %edx
; AVX2-NEXT: movslq %edx, %r8
-; AVX2-NEXT: movq -56(%rsp,%r8), %rdx
-; AVX2-NEXT: movq -48(%rsp,%r8), %rax
+; AVX2-NEXT: movq 72(%rsp,%r8), %rdx
+; AVX2-NEXT: movq 80(%rsp,%r8), %rax
; AVX2-NEXT: movq %rax, %rsi
; AVX2-NEXT: shldq %cl, %rdx, %rsi
-; AVX2-NEXT: movq -40(%rsp,%r8), %r10
+; AVX2-NEXT: movq 88(%rsp,%r8), %r10
; AVX2-NEXT: movq %r10, %r9
; AVX2-NEXT: shldq %cl, %rax, %r9
-; AVX2-NEXT: movq -32(%rsp,%r8), %rax
+; AVX2-NEXT: movq 96(%rsp,%r8), %rax
; AVX2-NEXT: movq %rax, %r11
; AVX2-NEXT: shldq %cl, %r10, %r11
-; AVX2-NEXT: movq -24(%rsp,%r8), %r10
+; AVX2-NEXT: movq 104(%rsp,%r8), %r10
; AVX2-NEXT: movq %r10, %rbx
; AVX2-NEXT: shldq %cl, %rax, %rbx
-; AVX2-NEXT: movq -16(%rsp,%r8), %rax
+; AVX2-NEXT: movq 112(%rsp,%r8), %rax
; AVX2-NEXT: movq %rax, %r14
; AVX2-NEXT: shldq %cl, %r10, %r14
-; AVX2-NEXT: movq -8(%rsp,%r8), %r10
+; AVX2-NEXT: movq 120(%rsp,%r8), %r10
; AVX2-NEXT: shldq %cl, %rax, %r10
; AVX2-NEXT: movq %rdi, %rax
-; AVX2-NEXT: movq -64(%rsp,%r8), %rdi
+; AVX2-NEXT: movq 64(%rsp,%r8), %rdi
; AVX2-NEXT: shlxq %rcx, %rdi, %r8
; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
; AVX2-NEXT: shldq %cl, %rdi, %rdx
@@ -1000,47 +1091,51 @@ define i512 @shl_i512_load(ptr %p0, i512 %a1) nounwind {
; AVX2-NEXT: movq %rsi, 16(%rax)
; AVX2-NEXT: movq %rdx, 8(%rax)
; AVX2-NEXT: movq %r8, (%rax)
-; AVX2-NEXT: addq $8, %rsp
+; AVX2-NEXT: leaq -16(%rbp), %rsp
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %rbp
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shl_i512_load:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: movq %rsp, %rbp
; AVX512F-NEXT: pushq %r14
; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: pushq %rax
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $128, %rsp
; AVX512F-NEXT: vmovups (%rsi), %zmm0
; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovups %zmm1, (%rsp)
+; AVX512F-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: movl %edx, %ecx
; AVX512F-NEXT: andl $63, %ecx
; AVX512F-NEXT: shrl $3, %edx
; AVX512F-NEXT: andl $56, %edx
; AVX512F-NEXT: negl %edx
; AVX512F-NEXT: movslq %edx, %r8
-; AVX512F-NEXT: movq -56(%rsp,%r8), %rdx
-; AVX512F-NEXT: movq -48(%rsp,%r8), %rax
+; AVX512F-NEXT: movq 72(%rsp,%r8), %rdx
+; AVX512F-NEXT: movq 80(%rsp,%r8), %rax
; AVX512F-NEXT: movq %rax, %rsi
; AVX512F-NEXT: shldq %cl, %rdx, %rsi
-; AVX512F-NEXT: movq -40(%rsp,%r8), %r10
+; AVX512F-NEXT: movq 88(%rsp,%r8), %r10
; AVX512F-NEXT: movq %r10, %r9
; AVX512F-NEXT: shldq %cl, %rax, %r9
-; AVX512F-NEXT: movq -32(%rsp,%r8), %rax
+; AVX512F-NEXT: movq 96(%rsp,%r8), %rax
; AVX512F-NEXT: movq %rax, %r11
; AVX512F-NEXT: shldq %cl, %r10, %r11
-; AVX512F-NEXT: movq -24(%rsp,%r8), %r10
+; AVX512F-NEXT: movq 104(%rsp,%r8), %r10
; AVX512F-NEXT: movq %r10, %rbx
; AVX512F-NEXT: shldq %cl, %rax, %rbx
-; AVX512F-NEXT: movq -16(%rsp,%r8), %rax
+; AVX512F-NEXT: movq 112(%rsp,%r8), %rax
; AVX512F-NEXT: movq %rax, %r14
; AVX512F-NEXT: shldq %cl, %r10, %r14
-; AVX512F-NEXT: movq -8(%rsp,%r8), %r10
+; AVX512F-NEXT: movq 120(%rsp,%r8), %r10
; AVX512F-NEXT: shldq %cl, %rax, %r10
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: movq -64(%rsp,%r8), %rdi
+; AVX512F-NEXT: movq 64(%rsp,%r8), %rdi
; AVX512F-NEXT: shlxq %rcx, %rdi, %r8
; AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx
; AVX512F-NEXT: shldq %cl, %rdi, %rdx
@@ -1052,48 +1147,53 @@ define i512 @shl_i512_load(ptr %p0, i512 %a1) nounwind {
; AVX512F-NEXT: movq %rsi, 16(%rax)
; AVX512F-NEXT: movq %rdx, 8(%rax)
; AVX512F-NEXT: movq %r8, (%rax)
-; AVX512F-NEXT: addq $8, %rsp
+; AVX512F-NEXT: leaq -16(%rbp), %rsp
; AVX512F-NEXT: popq %rbx
; AVX512F-NEXT: popq %r14
+; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shl_i512_load:
; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: pushq %rbp
+; AVX512VL-NEXT: movq %rsp, %rbp
; AVX512VL-NEXT: pushq %r15
; AVX512VL-NEXT: pushq %r14
; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: vmovups (%rsi), %ymm0
-; AVX512VL-NEXT: vmovups 32(%rsi), %ymm1
+; AVX512VL-NEXT: andq $-32, %rsp
+; AVX512VL-NEXT: subq $160, %rsp
+; AVX512VL-NEXT: vmovaps (%rsi), %ymm0
+; AVX512VL-NEXT: vmovaps 32(%rsi), %ymm1
; AVX512VL-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovaps %ymm2, (%rsp)
+; AVX512VL-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movl %edx, %ecx
; AVX512VL-NEXT: andl $63, %ecx
; AVX512VL-NEXT: shrl $3, %edx
; AVX512VL-NEXT: andl $56, %edx
; AVX512VL-NEXT: negl %edx
; AVX512VL-NEXT: movslq %edx, %r9
-; AVX512VL-NEXT: movq -56(%rsp,%r9), %rdx
-; AVX512VL-NEXT: movq -48(%rsp,%r9), %rax
+; AVX512VL-NEXT: movq 72(%rsp,%r9), %rdx
+; AVX512VL-NEXT: movq 80(%rsp,%r9), %rax
; AVX512VL-NEXT: movq %rax, %rsi
; AVX512VL-NEXT: shldq %cl, %rdx, %rsi
-; AVX512VL-NEXT: movq -40(%rsp,%r9), %r10
+; AVX512VL-NEXT: movq 88(%rsp,%r9), %r10
; AVX512VL-NEXT: movq %r10, %r8
; AVX512VL-NEXT: shldq %cl, %rax, %r8
-; AVX512VL-NEXT: movq -32(%rsp,%r9), %r11
+; AVX512VL-NEXT: movq 96(%rsp,%r9), %r11
; AVX512VL-NEXT: movq %r11, %rbx
; AVX512VL-NEXT: shldq %cl, %r10, %rbx
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: movq -24(%rsp,%r9), %rdi
+; AVX512VL-NEXT: movq 104(%rsp,%r9), %rdi
; AVX512VL-NEXT: movq %rdi, %r10
; AVX512VL-NEXT: shldq %cl, %r11, %r10
-; AVX512VL-NEXT: movq -64(%rsp,%r9), %r11
-; AVX512VL-NEXT: movq -16(%rsp,%r9), %r14
+; AVX512VL-NEXT: movq 64(%rsp,%r9), %r11
+; AVX512VL-NEXT: movq 112(%rsp,%r9), %r14
; AVX512VL-NEXT: movq %r14, %r15
; AVX512VL-NEXT: shldq %cl, %rdi, %r15
-; AVX512VL-NEXT: movq -8(%rsp,%r9), %rdi
+; AVX512VL-NEXT: movq 120(%rsp,%r9), %rdi
; AVX512VL-NEXT: shldq %cl, %r14, %rdi
; AVX512VL-NEXT: shlxq %rcx, %r11, %r9
; AVX512VL-NEXT: # kill: def $cl killed $cl killed $rcx
@@ -1106,49 +1206,55 @@ define i512 @shl_i512_load(ptr %p0, i512 %a1) nounwind {
; AVX512VL-NEXT: movq %rsi, 16(%rax)
; AVX512VL-NEXT: movq %rdx, 8(%rax)
; AVX512VL-NEXT: movq %r9, (%rax)
+; AVX512VL-NEXT: leaq -24(%rbp), %rsp
; AVX512VL-NEXT: popq %rbx
; AVX512VL-NEXT: popq %r14
; AVX512VL-NEXT: popq %r15
+; AVX512VL-NEXT: popq %rbp
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: shl_i512_load:
; AVX512VBMI: # %bb.0:
+; AVX512VBMI-NEXT: pushq %rbp
+; AVX512VBMI-NEXT: movq %rsp, %rbp
; AVX512VBMI-NEXT: pushq %r15
; AVX512VBMI-NEXT: pushq %r14
; AVX512VBMI-NEXT: pushq %rbx
-; AVX512VBMI-NEXT: vmovups (%rsi), %ymm0
-; AVX512VBMI-NEXT: vmovups 32(%rsi), %ymm1
+; AVX512VBMI-NEXT: andq $-32, %rsp
+; AVX512VBMI-NEXT: subq $160, %rsp
+; AVX512VBMI-NEXT: vmovaps (%rsi), %ymm0
+; AVX512VBMI-NEXT: vmovaps 32(%rsi), %ymm1
; AVX512VBMI-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512VBMI-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovaps %ymm2, (%rsp)
+; AVX512VBMI-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movl %edx, %ecx
; AVX512VBMI-NEXT: andl $63, %ecx
; AVX512VBMI-NEXT: shrl $3, %edx
; AVX512VBMI-NEXT: andl $56, %edx
; AVX512VBMI-NEXT: negl %edx
; AVX512VBMI-NEXT: movslq %edx, %r9
-; AVX512VBMI-NEXT: movq -56(%rsp,%r9), %rdx
-; AVX512VBMI-NEXT: movq -48(%rsp,%r9), %rax
+; AVX512VBMI-NEXT: movq 72(%rsp,%r9), %rdx
+; AVX512VBMI-NEXT: movq 80(%rsp,%r9), %rax
; AVX512VBMI-NEXT: movq %rax, %rsi
; AVX512VBMI-NEXT: shldq %cl, %rdx, %rsi
-; AVX512VBMI-NEXT: movq -40(%rsp,%r9), %r10
+; AVX512VBMI-NEXT: movq 88(%rsp,%r9), %r10
; AVX512VBMI-NEXT: movq %r10, %r8
; AVX512VBMI-NEXT: shldq %cl, %rax, %r8
-; AVX512VBMI-NEXT: movq -32(%rsp,%r9), %r11
+; AVX512VBMI-NEXT: movq 96(%rsp,%r9), %r11
; AVX512VBMI-NEXT: movq %r11, %rbx
; AVX512VBMI-NEXT: shldq %cl, %r10, %rbx
; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: movq -24(%rsp,%r9), %rdi
+; AVX512VBMI-NEXT: movq 104(%rsp,%r9), %rdi
; AVX512VBMI-NEXT: movq %rdi, %r10
; AVX512VBMI-NEXT: shldq %cl, %r11, %r10
-; AVX512VBMI-NEXT: movq -64(%rsp,%r9), %r11
-; AVX512VBMI-NEXT: movq -16(%rsp,%r9), %r14
+; AVX512VBMI-NEXT: movq 64(%rsp,%r9), %r11
+; AVX512VBMI-NEXT: movq 112(%rsp,%r9), %r14
; AVX512VBMI-NEXT: movq %r14, %r15
; AVX512VBMI-NEXT: shldq %cl, %rdi, %r15
-; AVX512VBMI-NEXT: movq -8(%rsp,%r9), %rdi
+; AVX512VBMI-NEXT: movq 120(%rsp,%r9), %rdi
; AVX512VBMI-NEXT: shldq %cl, %r14, %rdi
; AVX512VBMI-NEXT: shlxq %rcx, %r11, %r9
; AVX512VBMI-NEXT: # kill: def $cl killed $cl killed $rcx
@@ -1161,9 +1267,11 @@ define i512 @shl_i512_load(ptr %p0, i512 %a1) nounwind {
; AVX512VBMI-NEXT: movq %rsi, 16(%rax)
; AVX512VBMI-NEXT: movq %rdx, 8(%rax)
; AVX512VBMI-NEXT: movq %r9, (%rax)
+; AVX512VBMI-NEXT: leaq -24(%rbp), %rsp
; AVX512VBMI-NEXT: popq %rbx
; AVX512VBMI-NEXT: popq %r14
; AVX512VBMI-NEXT: popq %r15
+; AVX512VBMI-NEXT: popq %rbp
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = load i512, ptr %p0
@@ -1174,41 +1282,44 @@ define i512 @shl_i512_load(ptr %p0, i512 %a1) nounwind {
define i512 @lshr_i512_load(ptr %p0, i512 %a1) nounwind {
; SSE-LABEL: lshr_i512_load:
; SSE: # %bb.0:
+; SSE-NEXT: pushq %rbp
+; SSE-NEXT: movq %rsp, %rbp
; SSE-NEXT: pushq %r14
; SSE-NEXT: pushq %rbx
-; SSE-NEXT: pushq %rax
+; SSE-NEXT: andq $-32, %rsp
+; SSE-NEXT: subq $128, %rsp
; SSE-NEXT: movaps (%rsi), %xmm0
; SSE-NEXT: movaps 16(%rsi), %xmm1
; SSE-NEXT: movaps 32(%rsi), %xmm2
; SSE-NEXT: movaps 48(%rsi), %xmm3
; SSE-NEXT: xorps %xmm4, %xmm4
-; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, (%rsp)
; SSE-NEXT: movl %edx, %ecx
; SSE-NEXT: andl $63, %ecx
; SSE-NEXT: shrl $3, %edx
; SSE-NEXT: andl $56, %edx
-; SSE-NEXT: movq -112(%rsp,%rdx), %rsi
-; SSE-NEXT: movq -120(%rsp,%rdx), %rax
+; SSE-NEXT: movq 16(%rsp,%rdx), %rsi
+; SSE-NEXT: movq 8(%rsp,%rdx), %rax
; SSE-NEXT: movq %rax, %r8
; SSE-NEXT: shrdq %cl, %rsi, %r8
-; SSE-NEXT: movq -104(%rsp,%rdx), %r9
+; SSE-NEXT: movq 24(%rsp,%rdx), %r9
; SSE-NEXT: shrdq %cl, %r9, %rsi
-; SSE-NEXT: movq -96(%rsp,%rdx), %r10
+; SSE-NEXT: movq 32(%rsp,%rdx), %r10
; SSE-NEXT: shrdq %cl, %r10, %r9
-; SSE-NEXT: movq -88(%rsp,%rdx), %r11
+; SSE-NEXT: movq 40(%rsp,%rdx), %r11
; SSE-NEXT: shrdq %cl, %r11, %r10
-; SSE-NEXT: movq -80(%rsp,%rdx), %rbx
+; SSE-NEXT: movq 48(%rsp,%rdx), %rbx
; SSE-NEXT: shrdq %cl, %rbx, %r11
-; SSE-NEXT: movq -72(%rsp,%rdx), %r14
+; SSE-NEXT: movq 56(%rsp,%rdx), %r14
; SSE-NEXT: shrdq %cl, %r14, %rbx
-; SSE-NEXT: movq -128(%rsp,%rdx), %rdx
+; SSE-NEXT: movq (%rsp,%rdx), %rdx
; SSE-NEXT: shrdq %cl, %rax, %rdx
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
@@ -1221,41 +1332,45 @@ define i512 @lshr_i512_load(ptr %p0, i512 %a1) nounwind {
; SSE-NEXT: movq %rsi, 16(%rdi)
; SSE-NEXT: movq %r8, 8(%rdi)
; SSE-NEXT: movq %rdx, (%rdi)
-; SSE-NEXT: addq $8, %rsp
+; SSE-NEXT: leaq -16(%rbp), %rsp
; SSE-NEXT: popq %rbx
; SSE-NEXT: popq %r14
+; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
; AVX2-LABEL: lshr_i512_load:
; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: movq %rsp, %rbp
; AVX2-NEXT: pushq %r14
; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: pushq %rax
-; AVX2-NEXT: vmovups (%rsi), %ymm0
-; AVX2-NEXT: vmovups 32(%rsi), %ymm1
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $128, %rsp
+; AVX2-NEXT: vmovaps (%rsi), %ymm0
+; AVX2-NEXT: vmovaps 32(%rsi), %ymm1
; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm0, (%rsp)
; AVX2-NEXT: movl %edx, %ecx
; AVX2-NEXT: andl $63, %ecx
; AVX2-NEXT: shrl $3, %edx
; AVX2-NEXT: andl $56, %edx
-; AVX2-NEXT: movq -112(%rsp,%rdx), %rsi
-; AVX2-NEXT: movq -120(%rsp,%rdx), %rax
+; AVX2-NEXT: movq 16(%rsp,%rdx), %rsi
+; AVX2-NEXT: movq 8(%rsp,%rdx), %rax
; AVX2-NEXT: movq %rax, %r8
; AVX2-NEXT: shrdq %cl, %rsi, %r8
-; AVX2-NEXT: movq -104(%rsp,%rdx), %r9
+; AVX2-NEXT: movq 24(%rsp,%rdx), %r9
; AVX2-NEXT: shrdq %cl, %r9, %rsi
-; AVX2-NEXT: movq -96(%rsp,%rdx), %r10
+; AVX2-NEXT: movq 32(%rsp,%rdx), %r10
; AVX2-NEXT: shrdq %cl, %r10, %r9
-; AVX2-NEXT: movq -88(%rsp,%rdx), %r11
+; AVX2-NEXT: movq 40(%rsp,%rdx), %r11
; AVX2-NEXT: shrdq %cl, %r11, %r10
-; AVX2-NEXT: movq -80(%rsp,%rdx), %rbx
+; AVX2-NEXT: movq 48(%rsp,%rdx), %rbx
; AVX2-NEXT: shrdq %cl, %rbx, %r11
-; AVX2-NEXT: movq -128(%rsp,%rdx), %r14
-; AVX2-NEXT: movq -72(%rsp,%rdx), %rdx
+; AVX2-NEXT: movq (%rsp,%rdx), %r14
+; AVX2-NEXT: movq 56(%rsp,%rdx), %rdx
; AVX2-NEXT: shrdq %cl, %rdx, %rbx
; AVX2-NEXT: shrdq %cl, %rax, %r14
; AVX2-NEXT: movq %rdi, %rax
@@ -1268,39 +1383,43 @@ define i512 @lshr_i512_load(ptr %p0, i512 %a1) nounwind {
; AVX2-NEXT: movq %rsi, 16(%rdi)
; AVX2-NEXT: movq %r8, 8(%rdi)
; AVX2-NEXT: movq %r14, (%rdi)
-; AVX2-NEXT: addq $8, %rsp
+; AVX2-NEXT: leaq -16(%rbp), %rsp
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %rbp
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: lshr_i512_load:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: movq %rsp, %rbp
; AVX512F-NEXT: pushq %r14
; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: pushq %rax
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $128, %rsp
; AVX512F-NEXT: vmovups (%rsi), %zmm0
; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovups %zmm1, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovups %zmm0, (%rsp)
; AVX512F-NEXT: movl %edx, %ecx
; AVX512F-NEXT: andl $63, %ecx
; AVX512F-NEXT: shrl $3, %edx
; AVX512F-NEXT: andl $56, %edx
-; AVX512F-NEXT: movq -112(%rsp,%rdx), %rsi
-; AVX512F-NEXT: movq -120(%rsp,%rdx), %rax
+; AVX512F-NEXT: movq 16(%rsp,%rdx), %rsi
+; AVX512F-NEXT: movq 8(%rsp,%rdx), %rax
; AVX512F-NEXT: movq %rax, %r8
; AVX512F-NEXT: shrdq %cl, %rsi, %r8
-; AVX512F-NEXT: movq -104(%rsp,%rdx), %r9
+; AVX512F-NEXT: movq 24(%rsp,%rdx), %r9
; AVX512F-NEXT: shrdq %cl, %r9, %rsi
-; AVX512F-NEXT: movq -96(%rsp,%rdx), %r10
+; AVX512F-NEXT: movq 32(%rsp,%rdx), %r10
; AVX512F-NEXT: shrdq %cl, %r10, %r9
-; AVX512F-NEXT: movq -88(%rsp,%rdx), %r11
+; AVX512F-NEXT: movq 40(%rsp,%rdx), %r11
; AVX512F-NEXT: shrdq %cl, %r11, %r10
-; AVX512F-NEXT: movq -80(%rsp,%rdx), %rbx
+; AVX512F-NEXT: movq 48(%rsp,%rdx), %rbx
; AVX512F-NEXT: shrdq %cl, %rbx, %r11
-; AVX512F-NEXT: movq -128(%rsp,%rdx), %r14
-; AVX512F-NEXT: movq -72(%rsp,%rdx), %rdx
+; AVX512F-NEXT: movq (%rsp,%rdx), %r14
+; AVX512F-NEXT: movq 56(%rsp,%rdx), %rdx
; AVX512F-NEXT: shrdq %cl, %rdx, %rbx
; AVX512F-NEXT: shrdq %cl, %rax, %r14
; AVX512F-NEXT: movq %rdi, %rax
@@ -1313,42 +1432,46 @@ define i512 @lshr_i512_load(ptr %p0, i512 %a1) nounwind {
; AVX512F-NEXT: movq %rsi, 16(%rdi)
; AVX512F-NEXT: movq %r8, 8(%rdi)
; AVX512F-NEXT: movq %r14, (%rdi)
-; AVX512F-NEXT: addq $8, %rsp
+; AVX512F-NEXT: leaq -16(%rbp), %rsp
; AVX512F-NEXT: popq %rbx
; AVX512F-NEXT: popq %r14
+; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: lshr_i512_load:
; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: pushq %rbp
+; AVX512VL-NEXT: movq %rsp, %rbp
; AVX512VL-NEXT: pushq %r14
; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: pushq %rax
-; AVX512VL-NEXT: vmovups (%rsi), %ymm0
-; AVX512VL-NEXT: vmovups 32(%rsi), %ymm1
+; AVX512VL-NEXT: andq $-32, %rsp
+; AVX512VL-NEXT: subq $128, %rsp
+; AVX512VL-NEXT: vmovaps (%rsi), %ymm0
+; AVX512VL-NEXT: vmovaps 32(%rsi), %ymm1
; AVX512VL-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovaps %ymm0, (%rsp)
; AVX512VL-NEXT: movl %edx, %ecx
; AVX512VL-NEXT: andl $63, %ecx
; AVX512VL-NEXT: shrl $3, %edx
; AVX512VL-NEXT: andl $56, %edx
-; AVX512VL-NEXT: movq -112(%rsp,%rdx), %rsi
-; AVX512VL-NEXT: movq -120(%rsp,%rdx), %rax
+; AVX512VL-NEXT: movq 16(%rsp,%rdx), %rsi
+; AVX512VL-NEXT: movq 8(%rsp,%rdx), %rax
; AVX512VL-NEXT: movq %rax, %r8
; AVX512VL-NEXT: shrdq %cl, %rsi, %r8
-; AVX512VL-NEXT: movq -104(%rsp,%rdx), %r9
+; AVX512VL-NEXT: movq 24(%rsp,%rdx), %r9
; AVX512VL-NEXT: shrdq %cl, %r9, %rsi
-; AVX512VL-NEXT: movq -96(%rsp,%rdx), %r10
+; AVX512VL-NEXT: movq 32(%rsp,%rdx), %r10
; AVX512VL-NEXT: shrdq %cl, %r10, %r9
-; AVX512VL-NEXT: movq -88(%rsp,%rdx), %r11
+; AVX512VL-NEXT: movq 40(%rsp,%rdx), %r11
; AVX512VL-NEXT: shrdq %cl, %r11, %r10
-; AVX512VL-NEXT: movq -80(%rsp,%rdx), %rbx
+; AVX512VL-NEXT: movq 48(%rsp,%rdx), %rbx
; AVX512VL-NEXT: shrdq %cl, %rbx, %r11
-; AVX512VL-NEXT: movq -72(%rsp,%rdx), %r14
+; AVX512VL-NEXT: movq 56(%rsp,%rdx), %r14
; AVX512VL-NEXT: shrdq %cl, %r14, %rbx
-; AVX512VL-NEXT: movq -128(%rsp,%rdx), %rdx
+; AVX512VL-NEXT: movq (%rsp,%rdx), %rdx
; AVX512VL-NEXT: shrdq %cl, %rax, %rdx
; AVX512VL-NEXT: movq %rdi, %rax
; AVX512VL-NEXT: shrxq %rcx, %r14, %rcx
@@ -1360,43 +1483,47 @@ define i512 @lshr_i512_load(ptr %p0, i512 %a1) nounwind {
; AVX512VL-NEXT: movq %rsi, 16(%rdi)
; AVX512VL-NEXT: movq %r8, 8(%rdi)
; AVX512VL-NEXT: movq %rdx, (%rdi)
-; AVX512VL-NEXT: addq $8, %rsp
+; AVX512VL-NEXT: leaq -16(%rbp), %rsp
; AVX512VL-NEXT: popq %rbx
; AVX512VL-NEXT: popq %r14
+; AVX512VL-NEXT: popq %rbp
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: lshr_i512_load:
; AVX512VBMI: # %bb.0:
+; AVX512VBMI-NEXT: pushq %rbp
+; AVX512VBMI-NEXT: movq %rsp, %rbp
; AVX512VBMI-NEXT: pushq %r14
; AVX512VBMI-NEXT: pushq %rbx
-; AVX512VBMI-NEXT: pushq %rax
-; AVX512VBMI-NEXT: vmovups (%rsi), %ymm0
-; AVX512VBMI-NEXT: vmovups 32(%rsi), %ymm1
+; AVX512VBMI-NEXT: andq $-32, %rsp
+; AVX512VBMI-NEXT: subq $128, %rsp
+; AVX512VBMI-NEXT: vmovaps (%rsi), %ymm0
+; AVX512VBMI-NEXT: vmovaps 32(%rsi), %ymm1
; AVX512VBMI-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512VBMI-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovaps %ymm0, (%rsp)
; AVX512VBMI-NEXT: movl %edx, %ecx
; AVX512VBMI-NEXT: andl $63, %ecx
; AVX512VBMI-NEXT: shrl $3, %edx
; AVX512VBMI-NEXT: andl $56, %edx
-; AVX512VBMI-NEXT: movq -112(%rsp,%rdx), %rsi
-; AVX512VBMI-NEXT: movq -120(%rsp,%rdx), %rax
+; AVX512VBMI-NEXT: movq 16(%rsp,%rdx), %rsi
+; AVX512VBMI-NEXT: movq 8(%rsp,%rdx), %rax
; AVX512VBMI-NEXT: movq %rax, %r8
; AVX512VBMI-NEXT: shrdq %cl, %rsi, %r8
-; AVX512VBMI-NEXT: movq -104(%rsp,%rdx), %r9
+; AVX512VBMI-NEXT: movq 24(%rsp,%rdx), %r9
; AVX512VBMI-NEXT: shrdq %cl, %r9, %rsi
-; AVX512VBMI-NEXT: movq -96(%rsp,%rdx), %r10
+; AVX512VBMI-NEXT: movq 32(%rsp,%rdx), %r10
; AVX512VBMI-NEXT: shrdq %cl, %r10, %r9
-; AVX512VBMI-NEXT: movq -88(%rsp,%rdx), %r11
+; AVX512VBMI-NEXT: movq 40(%rsp,%rdx), %r11
; AVX512VBMI-NEXT: shrdq %cl, %r11, %r10
-; AVX512VBMI-NEXT: movq -80(%rsp,%rdx), %rbx
+; AVX512VBMI-NEXT: movq 48(%rsp,%rdx), %rbx
; AVX512VBMI-NEXT: shrdq %cl, %rbx, %r11
-; AVX512VBMI-NEXT: movq -72(%rsp,%rdx), %r14
+; AVX512VBMI-NEXT: movq 56(%rsp,%rdx), %r14
; AVX512VBMI-NEXT: shrdq %cl, %r14, %rbx
-; AVX512VBMI-NEXT: movq -128(%rsp,%rdx), %rdx
+; AVX512VBMI-NEXT: movq (%rsp,%rdx), %rdx
; AVX512VBMI-NEXT: shrdq %cl, %rax, %rdx
; AVX512VBMI-NEXT: movq %rdi, %rax
; AVX512VBMI-NEXT: shrxq %rcx, %r14, %rcx
@@ -1408,9 +1535,10 @@ define i512 @lshr_i512_load(ptr %p0, i512 %a1) nounwind {
; AVX512VBMI-NEXT: movq %rsi, 16(%rdi)
; AVX512VBMI-NEXT: movq %r8, 8(%rdi)
; AVX512VBMI-NEXT: movq %rdx, (%rdi)
-; AVX512VBMI-NEXT: addq $8, %rsp
+; AVX512VBMI-NEXT: leaq -16(%rbp), %rsp
; AVX512VBMI-NEXT: popq %rbx
; AVX512VBMI-NEXT: popq %r14
+; AVX512VBMI-NEXT: popq %rbp
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = load i512, ptr %p0
@@ -1421,47 +1549,50 @@ define i512 @lshr_i512_load(ptr %p0, i512 %a1) nounwind {
define i512 @ashr_i512_load(ptr %p0, i512 %a1) nounwind {
; SSE-LABEL: ashr_i512_load:
; SSE: # %bb.0:
+; SSE-NEXT: pushq %rbp
+; SSE-NEXT: movq %rsp, %rbp
; SSE-NEXT: pushq %r14
; SSE-NEXT: pushq %rbx
-; SSE-NEXT: pushq %rax
+; SSE-NEXT: andq $-32, %rsp
+; SSE-NEXT: subq $128, %rsp
; SSE-NEXT: movaps (%rsi), %xmm0
; SSE-NEXT: movaps 16(%rsi), %xmm1
; SSE-NEXT: movaps 32(%rsi), %xmm2
; SSE-NEXT: movq 48(%rsi), %rax
; SSE-NEXT: movq 56(%rsi), %rcx
-; SSE-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, (%rsp)
; SSE-NEXT: sarq $63, %rcx
-; SSE-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
; SSE-NEXT: movl %edx, %ecx
; SSE-NEXT: andl $63, %ecx
; SSE-NEXT: shrl $3, %edx
; SSE-NEXT: andl $56, %edx
-; SSE-NEXT: movq -112(%rsp,%rdx), %rsi
-; SSE-NEXT: movq -120(%rsp,%rdx), %rax
+; SSE-NEXT: movq 16(%rsp,%rdx), %rsi
+; SSE-NEXT: movq 8(%rsp,%rdx), %rax
; SSE-NEXT: movq %rax, %r8
; SSE-NEXT: shrdq %cl, %rsi, %r8
-; SSE-NEXT: movq -104(%rsp,%rdx), %r9
+; SSE-NEXT: movq 24(%rsp,%rdx), %r9
; SSE-NEXT: shrdq %cl, %r9, %rsi
-; SSE-NEXT: movq -96(%rsp,%rdx), %r10
+; SSE-NEXT: movq 32(%rsp,%rdx), %r10
; SSE-NEXT: shrdq %cl, %r10, %r9
-; SSE-NEXT: movq -88(%rsp,%rdx), %r11
+; SSE-NEXT: movq 40(%rsp,%rdx), %r11
; SSE-NEXT: shrdq %cl, %r11, %r10
-; SSE-NEXT: movq -80(%rsp,%rdx), %rbx
+; SSE-NEXT: movq 48(%rsp,%rdx), %rbx
; SSE-NEXT: shrdq %cl, %rbx, %r11
-; SSE-NEXT: movq -72(%rsp,%rdx), %r14
+; SSE-NEXT: movq 56(%rsp,%rdx), %r14
; SSE-NEXT: shrdq %cl, %r14, %rbx
-; SSE-NEXT: movq -128(%rsp,%rdx), %rdx
+; SSE-NEXT: movq (%rsp,%rdx), %rdx
; SSE-NEXT: shrdq %cl, %rax, %rdx
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
@@ -1474,51 +1605,55 @@ define i512 @ashr_i512_load(ptr %p0, i512 %a1) nounwind {
; SSE-NEXT: movq %rsi, 16(%rdi)
; SSE-NEXT: movq %r8, 8(%rdi)
; SSE-NEXT: movq %rdx, (%rdi)
-; SSE-NEXT: addq $8, %rsp
+; SSE-NEXT: leaq -16(%rbp), %rsp
; SSE-NEXT: popq %rbx
; SSE-NEXT: popq %r14
+; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
; AVX2-LABEL: ashr_i512_load:
; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: movq %rsp, %rbp
; AVX2-NEXT: pushq %r14
; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: pushq %rax
-; AVX2-NEXT: vmovups (%rsi), %ymm0
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $128, %rsp
+; AVX2-NEXT: vmovaps (%rsi), %ymm0
; AVX2-NEXT: vmovaps 32(%rsi), %xmm1
; AVX2-NEXT: movq 48(%rsi), %rax
; AVX2-NEXT: movq 56(%rsi), %rcx
-; AVX2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm0, (%rsp)
; AVX2-NEXT: sarq $63, %rcx
-; AVX2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
; AVX2-NEXT: movl %edx, %ecx
; AVX2-NEXT: andl $63, %ecx
; AVX2-NEXT: shrl $3, %edx
; AVX2-NEXT: andl $56, %edx
-; AVX2-NEXT: movq -112(%rsp,%rdx), %rsi
-; AVX2-NEXT: movq -120(%rsp,%rdx), %rax
+; AVX2-NEXT: movq 16(%rsp,%rdx), %rsi
+; AVX2-NEXT: movq 8(%rsp,%rdx), %rax
; AVX2-NEXT: movq %rax, %r8
; AVX2-NEXT: shrdq %cl, %rsi, %r8
-; AVX2-NEXT: movq -104(%rsp,%rdx), %r9
+; AVX2-NEXT: movq 24(%rsp,%rdx), %r9
; AVX2-NEXT: shrdq %cl, %r9, %rsi
-; AVX2-NEXT: movq -96(%rsp,%rdx), %r10
+; AVX2-NEXT: movq 32(%rsp,%rdx), %r10
; AVX2-NEXT: shrdq %cl, %r10, %r9
-; AVX2-NEXT: movq -88(%rsp,%rdx), %r11
+; AVX2-NEXT: movq 40(%rsp,%rdx), %r11
; AVX2-NEXT: shrdq %cl, %r11, %r10
-; AVX2-NEXT: movq -80(%rsp,%rdx), %rbx
+; AVX2-NEXT: movq 48(%rsp,%rdx), %rbx
; AVX2-NEXT: shrdq %cl, %rbx, %r11
-; AVX2-NEXT: movq -128(%rsp,%rdx), %r14
-; AVX2-NEXT: movq -72(%rsp,%rdx), %rdx
+; AVX2-NEXT: movq (%rsp,%rdx), %r14
+; AVX2-NEXT: movq 56(%rsp,%rdx), %rdx
; AVX2-NEXT: shrdq %cl, %rdx, %rbx
; AVX2-NEXT: shrdq %cl, %rax, %r14
; AVX2-NEXT: movq %rdi, %rax
@@ -1531,52 +1666,56 @@ define i512 @ashr_i512_load(ptr %p0, i512 %a1) nounwind {
; AVX2-NEXT: movq %rsi, 16(%rdi)
; AVX2-NEXT: movq %r8, 8(%rdi)
; AVX2-NEXT: movq %r14, (%rdi)
-; AVX2-NEXT: addq $8, %rsp
+; AVX2-NEXT: leaq -16(%rbp), %rsp
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %rbp
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: ashr_i512_load:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: movq %rsp, %rbp
; AVX512F-NEXT: pushq %r14
; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: pushq %rax
-; AVX512F-NEXT: vmovups (%rsi), %ymm0
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $128, %rsp
+; AVX512F-NEXT: vmovaps (%rsi), %ymm0
; AVX512F-NEXT: vmovaps 32(%rsi), %xmm1
; AVX512F-NEXT: movq 48(%rsi), %rax
; AVX512F-NEXT: movq 56(%rsi), %rcx
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovaps %ymm0, (%rsp)
; AVX512F-NEXT: sarq $63, %rcx
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: movl %edx, %ecx
; AVX512F-NEXT: andl $63, %ecx
; AVX512F-NEXT: shrl $3, %edx
; AVX512F-NEXT: andl $56, %edx
-; AVX512F-NEXT: movq -112(%rsp,%rdx), %rsi
-; AVX512F-NEXT: movq -120(%rsp,%rdx), %rax
+; AVX512F-NEXT: movq 16(%rsp,%rdx), %rsi
+; AVX512F-NEXT: movq 8(%rsp,%rdx), %rax
; AVX512F-NEXT: movq %rax, %r8
; AVX512F-NEXT: shrdq %cl, %rsi, %r8
-; AVX512F-NEXT: movq -104(%rsp,%rdx), %r9
+; AVX512F-NEXT: movq 24(%rsp,%rdx), %r9
; AVX512F-NEXT: shrdq %cl, %r9, %rsi
-; AVX512F-NEXT: movq -96(%rsp,%rdx), %r10
+; AVX512F-NEXT: movq 32(%rsp,%rdx), %r10
; AVX512F-NEXT: shrdq %cl, %r10, %r9
-; AVX512F-NEXT: movq -88(%rsp,%rdx), %r11
+; AVX512F-NEXT: movq 40(%rsp,%rdx), %r11
; AVX512F-NEXT: shrdq %cl, %r11, %r10
-; AVX512F-NEXT: movq -80(%rsp,%rdx), %rbx
+; AVX512F-NEXT: movq 48(%rsp,%rdx), %rbx
; AVX512F-NEXT: shrdq %cl, %rbx, %r11
-; AVX512F-NEXT: movq -128(%rsp,%rdx), %r14
-; AVX512F-NEXT: movq -72(%rsp,%rdx), %rdx
+; AVX512F-NEXT: movq (%rsp,%rdx), %r14
+; AVX512F-NEXT: movq 56(%rsp,%rdx), %rdx
; AVX512F-NEXT: shrdq %cl, %rdx, %rbx
; AVX512F-NEXT: shrdq %cl, %rax, %r14
; AVX512F-NEXT: movq %rdi, %rax
@@ -1589,52 +1728,56 @@ define i512 @ashr_i512_load(ptr %p0, i512 %a1) nounwind {
; AVX512F-NEXT: movq %rsi, 16(%rdi)
; AVX512F-NEXT: movq %r8, 8(%rdi)
; AVX512F-NEXT: movq %r14, (%rdi)
-; AVX512F-NEXT: addq $8, %rsp
+; AVX512F-NEXT: leaq -16(%rbp), %rsp
; AVX512F-NEXT: popq %rbx
; AVX512F-NEXT: popq %r14
+; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: ashr_i512_load:
; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: pushq %rbp
+; AVX512VL-NEXT: movq %rsp, %rbp
; AVX512VL-NEXT: pushq %r14
; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: pushq %rax
-; AVX512VL-NEXT: vmovups (%rsi), %ymm0
+; AVX512VL-NEXT: andq $-32, %rsp
+; AVX512VL-NEXT: subq $128, %rsp
+; AVX512VL-NEXT: vmovaps (%rsi), %ymm0
; AVX512VL-NEXT: vmovaps 32(%rsi), %xmm1
; AVX512VL-NEXT: movq 48(%rsi), %rax
; AVX512VL-NEXT: movq 56(%rsi), %rcx
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovaps %ymm0, (%rsp)
; AVX512VL-NEXT: sarq $63, %rcx
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movl %edx, %ecx
; AVX512VL-NEXT: andl $63, %ecx
; AVX512VL-NEXT: shrl $3, %edx
; AVX512VL-NEXT: andl $56, %edx
-; AVX512VL-NEXT: movq -112(%rsp,%rdx), %rsi
-; AVX512VL-NEXT: movq -120(%rsp,%rdx), %rax
+; AVX512VL-NEXT: movq 16(%rsp,%rdx), %rsi
+; AVX512VL-NEXT: movq 8(%rsp,%rdx), %rax
; AVX512VL-NEXT: movq %rax, %r8
; AVX512VL-NEXT: shrdq %cl, %rsi, %r8
-; AVX512VL-NEXT: movq -104(%rsp,%rdx), %r9
+; AVX512VL-NEXT: movq 24(%rsp,%rdx), %r9
; AVX512VL-NEXT: shrdq %cl, %r9, %rsi
-; AVX512VL-NEXT: movq -96(%rsp,%rdx), %r10
+; AVX512VL-NEXT: movq 32(%rsp,%rdx), %r10
; AVX512VL-NEXT: shrdq %cl, %r10, %r9
-; AVX512VL-NEXT: movq -88(%rsp,%rdx), %r11
+; AVX512VL-NEXT: movq 40(%rsp,%rdx), %r11
; AVX512VL-NEXT: shrdq %cl, %r11, %r10
-; AVX512VL-NEXT: movq -80(%rsp,%rdx), %rbx
+; AVX512VL-NEXT: movq 48(%rsp,%rdx), %rbx
; AVX512VL-NEXT: shrdq %cl, %rbx, %r11
-; AVX512VL-NEXT: movq -72(%rsp,%rdx), %r14
+; AVX512VL-NEXT: movq 56(%rsp,%rdx), %r14
; AVX512VL-NEXT: shrdq %cl, %r14, %rbx
-; AVX512VL-NEXT: movq -128(%rsp,%rdx), %rdx
+; AVX512VL-NEXT: movq (%rsp,%rdx), %rdx
; AVX512VL-NEXT: shrdq %cl, %rax, %rdx
; AVX512VL-NEXT: movq %rdi, %rax
; AVX512VL-NEXT: sarxq %rcx, %r14, %rcx
@@ -1646,53 +1789,57 @@ define i512 @ashr_i512_load(ptr %p0, i512 %a1) nounwind {
; AVX512VL-NEXT: movq %rsi, 16(%rdi)
; AVX512VL-NEXT: movq %r8, 8(%rdi)
; AVX512VL-NEXT: movq %rdx, (%rdi)
-; AVX512VL-NEXT: addq $8, %rsp
+; AVX512VL-NEXT: leaq -16(%rbp), %rsp
; AVX512VL-NEXT: popq %rbx
; AVX512VL-NEXT: popq %r14
+; AVX512VL-NEXT: popq %rbp
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: ashr_i512_load:
; AVX512VBMI: # %bb.0:
+; AVX512VBMI-NEXT: pushq %rbp
+; AVX512VBMI-NEXT: movq %rsp, %rbp
; AVX512VBMI-NEXT: pushq %r14
; AVX512VBMI-NEXT: pushq %rbx
-; AVX512VBMI-NEXT: pushq %rax
-; AVX512VBMI-NEXT: vmovups (%rsi), %ymm0
+; AVX512VBMI-NEXT: andq $-32, %rsp
+; AVX512VBMI-NEXT: subq $128, %rsp
+; AVX512VBMI-NEXT: vmovaps (%rsi), %ymm0
; AVX512VBMI-NEXT: vmovaps 32(%rsi), %xmm1
; AVX512VBMI-NEXT: movq 48(%rsi), %rax
; AVX512VBMI-NEXT: movq 56(%rsi), %rcx
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovaps %ymm0, (%rsp)
; AVX512VBMI-NEXT: sarq $63, %rcx
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movl %edx, %ecx
; AVX512VBMI-NEXT: andl $63, %ecx
; AVX512VBMI-NEXT: shrl $3, %edx
; AVX512VBMI-NEXT: andl $56, %edx
-; AVX512VBMI-NEXT: movq -112(%rsp,%rdx), %rsi
-; AVX512VBMI-NEXT: movq -120(%rsp,%rdx), %rax
+; AVX512VBMI-NEXT: movq 16(%rsp,%rdx), %rsi
+; AVX512VBMI-NEXT: movq 8(%rsp,%rdx), %rax
; AVX512VBMI-NEXT: movq %rax, %r8
; AVX512VBMI-NEXT: shrdq %cl, %rsi, %r8
-; AVX512VBMI-NEXT: movq -104(%rsp,%rdx), %r9
+; AVX512VBMI-NEXT: movq 24(%rsp,%rdx), %r9
; AVX512VBMI-NEXT: shrdq %cl, %r9, %rsi
-; AVX512VBMI-NEXT: movq -96(%rsp,%rdx), %r10
+; AVX512VBMI-NEXT: movq 32(%rsp,%rdx), %r10
; AVX512VBMI-NEXT: shrdq %cl, %r10, %r9
-; AVX512VBMI-NEXT: movq -88(%rsp,%rdx), %r11
+; AVX512VBMI-NEXT: movq 40(%rsp,%rdx), %r11
; AVX512VBMI-NEXT: shrdq %cl, %r11, %r10
-; AVX512VBMI-NEXT: movq -80(%rsp,%rdx), %rbx
+; AVX512VBMI-NEXT: movq 48(%rsp,%rdx), %rbx
; AVX512VBMI-NEXT: shrdq %cl, %rbx, %r11
-; AVX512VBMI-NEXT: movq -72(%rsp,%rdx), %r14
+; AVX512VBMI-NEXT: movq 56(%rsp,%rdx), %r14
; AVX512VBMI-NEXT: shrdq %cl, %r14, %rbx
-; AVX512VBMI-NEXT: movq -128(%rsp,%rdx), %rdx
+; AVX512VBMI-NEXT: movq (%rsp,%rdx), %rdx
; AVX512VBMI-NEXT: shrdq %cl, %rax, %rdx
; AVX512VBMI-NEXT: movq %rdi, %rax
; AVX512VBMI-NEXT: sarxq %rcx, %r14, %rcx
@@ -1704,9 +1851,10 @@ define i512 @ashr_i512_load(ptr %p0, i512 %a1) nounwind {
; AVX512VBMI-NEXT: movq %rsi, 16(%rdi)
; AVX512VBMI-NEXT: movq %r8, 8(%rdi)
; AVX512VBMI-NEXT: movq %rdx, (%rdi)
-; AVX512VBMI-NEXT: addq $8, %rsp
+; AVX512VBMI-NEXT: leaq -16(%rbp), %rsp
; AVX512VBMI-NEXT: popq %rbx
; AVX512VBMI-NEXT: popq %r14
+; AVX512VBMI-NEXT: popq %rbp
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = load i512, ptr %p0
@@ -1971,7 +2119,7 @@ define i512 @shl_i512_511(i512 %a0) nounwind {
; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vmovaps %xmm0, 32(%rdi)
; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovups %ymm0, (%rdi)
+; AVX2-NEXT: vmovaps %ymm0, (%rdi)
; AVX2-NEXT: movq $0, 48(%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -1984,7 +2132,7 @@ define i512 @shl_i512_511(i512 %a0) nounwind {
; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vmovaps %xmm0, 32(%rdi)
; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vmovups %ymm0, (%rdi)
+; AVX512F-NEXT: vmovaps %ymm0, (%rdi)
; AVX512F-NEXT: movq $0, 48(%rdi)
; AVX512F-NEXT: retq
;
@@ -1996,7 +2144,7 @@ define i512 @shl_i512_511(i512 %a0) nounwind {
; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vmovaps %xmm0, 32(%rdi)
; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovups %ymm0, (%rdi)
+; AVX512VL-NEXT: vmovaps %ymm0, (%rdi)
; AVX512VL-NEXT: movq $0, 48(%rdi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
@@ -2009,7 +2157,7 @@ define i512 @shl_i512_511(i512 %a0) nounwind {
; AVX512VBMI-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512VBMI-NEXT: vmovaps %xmm0, 32(%rdi)
; AVX512VBMI-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512VBMI-NEXT: vmovups %ymm0, (%rdi)
+; AVX512VBMI-NEXT: vmovaps %ymm0, (%rdi)
; AVX512VBMI-NEXT: movq $0, 48(%rdi)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
@@ -2111,99 +2259,106 @@ define i512 @ashr_i512_511(i512 %a0) nounwind {
define i512 @shl_1_i512(i512 %a0) nounwind {
; SSE-LABEL: shl_1_i512:
; SSE: # %bb.0:
+; SSE-NEXT: pushq %rbp
+; SSE-NEXT: movq %rsp, %rbp
; SSE-NEXT: pushq %r14
; SSE-NEXT: pushq %rbx
-; SSE-NEXT: pushq %rax
+; SSE-NEXT: andq $-32, %rsp
+; SSE-NEXT: subq $128, %rsp
; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, (%rsp)
+; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $1, {{[0-9]+}}(%rsp)
; SSE-NEXT: movl %esi, %ecx
; SSE-NEXT: andl $63, %ecx
; SSE-NEXT: shrl $3, %esi
; SSE-NEXT: andl $56, %esi
; SSE-NEXT: negl %esi
; SSE-NEXT: movslq %esi, %rax
-; SSE-NEXT: movq -56(%rsp,%rax), %rdx
-; SSE-NEXT: movq -48(%rsp,%rax), %r9
+; SSE-NEXT: movq 72(%rsp,%rax), %rdx
+; SSE-NEXT: movq 80(%rsp,%rax), %r9
; SSE-NEXT: movq %r9, %rsi
; SSE-NEXT: shldq %cl, %rdx, %rsi
-; SSE-NEXT: movq -40(%rsp,%rax), %r10
+; SSE-NEXT: movq 88(%rsp,%rax), %r10
; SSE-NEXT: movq %r10, %r8
; SSE-NEXT: shldq %cl, %r9, %r8
-; SSE-NEXT: movq -32(%rsp,%rax), %r9
-; SSE-NEXT: movq %r9, %r11
-; SSE-NEXT: shldq %cl, %r10, %r11
-; SSE-NEXT: movq -24(%rsp,%rax), %r10
+; SSE-NEXT: movq 96(%rsp,%rax), %r11
+; SSE-NEXT: movq %r11, %r9
+; SSE-NEXT: shldq %cl, %r10, %r9
+; SSE-NEXT: movq 104(%rsp,%rax), %r10
; SSE-NEXT: movq %r10, %rbx
-; SSE-NEXT: shldq %cl, %r9, %rbx
-; SSE-NEXT: movq -16(%rsp,%rax), %r9
-; SSE-NEXT: movq %r9, %r14
+; SSE-NEXT: shldq %cl, %r11, %rbx
+; SSE-NEXT: movq 112(%rsp,%rax), %r11
+; SSE-NEXT: movq %r11, %r14
; SSE-NEXT: shldq %cl, %r10, %r14
-; SSE-NEXT: movq -8(%rsp,%rax), %r10
-; SSE-NEXT: shldq %cl, %r9, %r10
-; SSE-NEXT: movq -64(%rsp,%rax), %rax
-; SSE-NEXT: movq %rax, %r9
-; SSE-NEXT: shlq %cl, %r9
+; SSE-NEXT: movq 120(%rsp,%rax), %r10
+; SSE-NEXT: shldq %cl, %r11, %r10
+; SSE-NEXT: movq 64(%rsp,%rax), %rax
+; SSE-NEXT: movq %rax, %r11
+; SSE-NEXT: shlq %cl, %r11
; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
; SSE-NEXT: shldq %cl, %rax, %rdx
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: movq %r10, 56(%rdi)
; SSE-NEXT: movq %r14, 48(%rdi)
; SSE-NEXT: movq %rbx, 40(%rdi)
-; SSE-NEXT: movq %r11, 32(%rdi)
+; SSE-NEXT: movq %r9, 32(%rdi)
; SSE-NEXT: movq %r8, 24(%rdi)
; SSE-NEXT: movq %rsi, 16(%rdi)
; SSE-NEXT: movq %rdx, 8(%rdi)
-; SSE-NEXT: movq %r9, (%rdi)
-; SSE-NEXT: addq $8, %rsp
+; SSE-NEXT: movq %r11, (%rdi)
+; SSE-NEXT: leaq -16(%rbp), %rsp
; SSE-NEXT: popq %rbx
; SSE-NEXT: popq %r14
+; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
; AVX2-LABEL: shl_1_i512:
; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: movq %rsp, %rbp
; AVX2-NEXT: pushq %r14
; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: pushq %rax
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $128, %rsp
; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm0, (%rsp)
; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0]
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: movl %esi, %ecx
; AVX2-NEXT: andl $63, %ecx
; AVX2-NEXT: shrl $3, %esi
; AVX2-NEXT: andl $56, %esi
; AVX2-NEXT: negl %esi
; AVX2-NEXT: movslq %esi, %r8
-; AVX2-NEXT: movq -56(%rsp,%r8), %rdx
-; AVX2-NEXT: movq -48(%rsp,%r8), %rax
+; AVX2-NEXT: movq 72(%rsp,%r8), %rdx
+; AVX2-NEXT: movq 80(%rsp,%r8), %rax
; AVX2-NEXT: movq %rax, %rsi
; AVX2-NEXT: shldq %cl, %rdx, %rsi
-; AVX2-NEXT: movq -40(%rsp,%r8), %r10
+; AVX2-NEXT: movq 88(%rsp,%r8), %r10
; AVX2-NEXT: movq %r10, %r9
; AVX2-NEXT: shldq %cl, %rax, %r9
-; AVX2-NEXT: movq -32(%rsp,%r8), %rax
+; AVX2-NEXT: movq 96(%rsp,%r8), %rax
; AVX2-NEXT: movq %rax, %r11
; AVX2-NEXT: shldq %cl, %r10, %r11
-; AVX2-NEXT: movq -24(%rsp,%r8), %r10
+; AVX2-NEXT: movq 104(%rsp,%r8), %r10
; AVX2-NEXT: movq %r10, %rbx
; AVX2-NEXT: shldq %cl, %rax, %rbx
-; AVX2-NEXT: movq -16(%rsp,%r8), %rax
+; AVX2-NEXT: movq 112(%rsp,%r8), %rax
; AVX2-NEXT: movq %rax, %r14
; AVX2-NEXT: shldq %cl, %r10, %r14
-; AVX2-NEXT: movq -8(%rsp,%r8), %r10
+; AVX2-NEXT: movq 120(%rsp,%r8), %r10
; AVX2-NEXT: shldq %cl, %rax, %r10
; AVX2-NEXT: movq %rdi, %rax
-; AVX2-NEXT: movq -64(%rsp,%r8), %rdi
+; AVX2-NEXT: movq 64(%rsp,%r8), %rdi
; AVX2-NEXT: shlxq %rcx, %rdi, %r8
; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
; AVX2-NEXT: shldq %cl, %rdi, %rdx
@@ -2215,47 +2370,51 @@ define i512 @shl_1_i512(i512 %a0) nounwind {
; AVX2-NEXT: movq %rsi, 16(%rax)
; AVX2-NEXT: movq %rdx, 8(%rax)
; AVX2-NEXT: movq %r8, (%rax)
-; AVX2-NEXT: addq $8, %rsp
+; AVX2-NEXT: leaq -16(%rbp), %rsp
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %rbp
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shl_1_i512:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: movq %rsp, %rbp
; AVX512F-NEXT: pushq %r14
; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: pushq %rax
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $128, %rsp
; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovups %zmm0, (%rsp)
; AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0]
-; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: movl %esi, %ecx
; AVX512F-NEXT: andl $63, %ecx
; AVX512F-NEXT: shrl $3, %esi
; AVX512F-NEXT: andl $56, %esi
; AVX512F-NEXT: negl %esi
; AVX512F-NEXT: movslq %esi, %r8
-; AVX512F-NEXT: movq -56(%rsp,%r8), %rdx
-; AVX512F-NEXT: movq -48(%rsp,%r8), %rax
+; AVX512F-NEXT: movq 72(%rsp,%r8), %rdx
+; AVX512F-NEXT: movq 80(%rsp,%r8), %rax
; AVX512F-NEXT: movq %rax, %rsi
; AVX512F-NEXT: shldq %cl, %rdx, %rsi
-; AVX512F-NEXT: movq -40(%rsp,%r8), %r10
+; AVX512F-NEXT: movq 88(%rsp,%r8), %r10
; AVX512F-NEXT: movq %r10, %r9
; AVX512F-NEXT: shldq %cl, %rax, %r9
-; AVX512F-NEXT: movq -32(%rsp,%r8), %rax
+; AVX512F-NEXT: movq 96(%rsp,%r8), %rax
; AVX512F-NEXT: movq %rax, %r11
; AVX512F-NEXT: shldq %cl, %r10, %r11
-; AVX512F-NEXT: movq -24(%rsp,%r8), %r10
+; AVX512F-NEXT: movq 104(%rsp,%r8), %r10
; AVX512F-NEXT: movq %r10, %rbx
; AVX512F-NEXT: shldq %cl, %rax, %rbx
-; AVX512F-NEXT: movq -16(%rsp,%r8), %rax
+; AVX512F-NEXT: movq 112(%rsp,%r8), %rax
; AVX512F-NEXT: movq %rax, %r14
; AVX512F-NEXT: shldq %cl, %r10, %r14
-; AVX512F-NEXT: movq -8(%rsp,%r8), %r10
+; AVX512F-NEXT: movq 120(%rsp,%r8), %r10
; AVX512F-NEXT: shldq %cl, %rax, %r10
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: movq -64(%rsp,%r8), %rdi
+; AVX512F-NEXT: movq 64(%rsp,%r8), %rdi
; AVX512F-NEXT: shlxq %rcx, %rdi, %r8
; AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx
; AVX512F-NEXT: shldq %cl, %rdi, %rdx
@@ -2267,47 +2426,52 @@ define i512 @shl_1_i512(i512 %a0) nounwind {
; AVX512F-NEXT: movq %rsi, 16(%rax)
; AVX512F-NEXT: movq %rdx, 8(%rax)
; AVX512F-NEXT: movq %r8, (%rax)
-; AVX512F-NEXT: addq $8, %rsp
+; AVX512F-NEXT: leaq -16(%rbp), %rsp
; AVX512F-NEXT: popq %rbx
; AVX512F-NEXT: popq %r14
+; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shl_1_i512:
; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: pushq %rbp
+; AVX512VL-NEXT: movq %rsp, %rbp
; AVX512VL-NEXT: pushq %r15
; AVX512VL-NEXT: pushq %r14
; AVX512VL-NEXT: pushq %rbx
+; AVX512VL-NEXT: andq $-32, %rsp
+; AVX512VL-NEXT: subq $160, %rsp
; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovaps %ymm0, (%rsp)
; AVX512VL-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movl %esi, %ecx
; AVX512VL-NEXT: andl $63, %ecx
; AVX512VL-NEXT: shrl $3, %esi
; AVX512VL-NEXT: andl $56, %esi
; AVX512VL-NEXT: negl %esi
; AVX512VL-NEXT: movslq %esi, %r9
-; AVX512VL-NEXT: movq -56(%rsp,%r9), %rdx
-; AVX512VL-NEXT: movq -48(%rsp,%r9), %rax
+; AVX512VL-NEXT: movq 72(%rsp,%r9), %rdx
+; AVX512VL-NEXT: movq 80(%rsp,%r9), %rax
; AVX512VL-NEXT: movq %rax, %rsi
; AVX512VL-NEXT: shldq %cl, %rdx, %rsi
-; AVX512VL-NEXT: movq -40(%rsp,%r9), %r10
+; AVX512VL-NEXT: movq 88(%rsp,%r9), %r10
; AVX512VL-NEXT: movq %r10, %r8
; AVX512VL-NEXT: shldq %cl, %rax, %r8
-; AVX512VL-NEXT: movq -32(%rsp,%r9), %r11
+; AVX512VL-NEXT: movq 96(%rsp,%r9), %r11
; AVX512VL-NEXT: movq %r11, %rbx
; AVX512VL-NEXT: shldq %cl, %r10, %rbx
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: movq -24(%rsp,%r9), %rdi
+; AVX512VL-NEXT: movq 104(%rsp,%r9), %rdi
; AVX512VL-NEXT: movq %rdi, %r10
; AVX512VL-NEXT: shldq %cl, %r11, %r10
-; AVX512VL-NEXT: movq -64(%rsp,%r9), %r11
-; AVX512VL-NEXT: movq -16(%rsp,%r9), %r14
+; AVX512VL-NEXT: movq 64(%rsp,%r9), %r11
+; AVX512VL-NEXT: movq 112(%rsp,%r9), %r14
; AVX512VL-NEXT: movq %r14, %r15
; AVX512VL-NEXT: shldq %cl, %rdi, %r15
-; AVX512VL-NEXT: movq -8(%rsp,%r9), %rdi
+; AVX512VL-NEXT: movq 120(%rsp,%r9), %rdi
; AVX512VL-NEXT: shldq %cl, %r14, %rdi
; AVX512VL-NEXT: shlxq %rcx, %r11, %r9
; AVX512VL-NEXT: # kill: def $cl killed $cl killed $rcx
@@ -2320,48 +2484,54 @@ define i512 @shl_1_i512(i512 %a0) nounwind {
; AVX512VL-NEXT: movq %rsi, 16(%rax)
; AVX512VL-NEXT: movq %rdx, 8(%rax)
; AVX512VL-NEXT: movq %r9, (%rax)
+; AVX512VL-NEXT: leaq -24(%rbp), %rsp
; AVX512VL-NEXT: popq %rbx
; AVX512VL-NEXT: popq %r14
; AVX512VL-NEXT: popq %r15
+; AVX512VL-NEXT: popq %rbp
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: shl_1_i512:
; AVX512VBMI: # %bb.0:
+; AVX512VBMI-NEXT: pushq %rbp
+; AVX512VBMI-NEXT: movq %rsp, %rbp
; AVX512VBMI-NEXT: pushq %r15
; AVX512VBMI-NEXT: pushq %r14
; AVX512VBMI-NEXT: pushq %rbx
+; AVX512VBMI-NEXT: andq $-32, %rsp
+; AVX512VBMI-NEXT: subq $160, %rsp
; AVX512VBMI-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovaps %ymm0, (%rsp)
; AVX512VBMI-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movl %esi, %ecx
; AVX512VBMI-NEXT: andl $63, %ecx
; AVX512VBMI-NEXT: shrl $3, %esi
; AVX512VBMI-NEXT: andl $56, %esi
; AVX512VBMI-NEXT: negl %esi
; AVX512VBMI-NEXT: movslq %esi, %r9
-; AVX512VBMI-NEXT: movq -56(%rsp,%r9), %rdx
-; AVX512VBMI-NEXT: movq -48(%rsp,%r9), %rax
+; AVX512VBMI-NEXT: movq 72(%rsp,%r9), %rdx
+; AVX512VBMI-NEXT: movq 80(%rsp,%r9), %rax
; AVX512VBMI-NEXT: movq %rax, %rsi
; AVX512VBMI-NEXT: shldq %cl, %rdx, %rsi
-; AVX512VBMI-NEXT: movq -40(%rsp,%r9), %r10
+; AVX512VBMI-NEXT: movq 88(%rsp,%r9), %r10
; AVX512VBMI-NEXT: movq %r10, %r8
; AVX512VBMI-NEXT: shldq %cl, %rax, %r8
-; AVX512VBMI-NEXT: movq -32(%rsp,%r9), %r11
+; AVX512VBMI-NEXT: movq 96(%rsp,%r9), %r11
; AVX512VBMI-NEXT: movq %r11, %rbx
; AVX512VBMI-NEXT: shldq %cl, %r10, %rbx
; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: movq -24(%rsp,%r9), %rdi
+; AVX512VBMI-NEXT: movq 104(%rsp,%r9), %rdi
; AVX512VBMI-NEXT: movq %rdi, %r10
; AVX512VBMI-NEXT: shldq %cl, %r11, %r10
-; AVX512VBMI-NEXT: movq -64(%rsp,%r9), %r11
-; AVX512VBMI-NEXT: movq -16(%rsp,%r9), %r14
+; AVX512VBMI-NEXT: movq 64(%rsp,%r9), %r11
+; AVX512VBMI-NEXT: movq 112(%rsp,%r9), %r14
; AVX512VBMI-NEXT: movq %r14, %r15
; AVX512VBMI-NEXT: shldq %cl, %rdi, %r15
-; AVX512VBMI-NEXT: movq -8(%rsp,%r9), %rdi
+; AVX512VBMI-NEXT: movq 120(%rsp,%r9), %rdi
; AVX512VBMI-NEXT: shldq %cl, %r14, %rdi
; AVX512VBMI-NEXT: shlxq %rcx, %r11, %r9
; AVX512VBMI-NEXT: # kill: def $cl killed $cl killed $rcx
@@ -2374,9 +2544,11 @@ define i512 @shl_1_i512(i512 %a0) nounwind {
; AVX512VBMI-NEXT: movq %rsi, 16(%rax)
; AVX512VBMI-NEXT: movq %rdx, 8(%rax)
; AVX512VBMI-NEXT: movq %r9, (%rax)
+; AVX512VBMI-NEXT: leaq -24(%rbp), %rsp
; AVX512VBMI-NEXT: popq %rbx
; AVX512VBMI-NEXT: popq %r14
; AVX512VBMI-NEXT: popq %r15
+; AVX512VBMI-NEXT: popq %rbp
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%r = shl i512 1, %a0
@@ -2386,39 +2558,42 @@ define i512 @shl_1_i512(i512 %a0) nounwind {
define i512 @lshr_signbit_i512(i512 %a0) nounwind {
; SSE-LABEL: lshr_signbit_i512:
; SSE: # %bb.0:
+; SSE-NEXT: pushq %rbp
+; SSE-NEXT: movq %rsp, %rbp
; SSE-NEXT: pushq %r14
; SSE-NEXT: pushq %rbx
-; SSE-NEXT: pushq %rax
+; SSE-NEXT: andq $-32, %rsp
+; SSE-NEXT: subq $128, %rsp
; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, (%rsp)
+; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp)
; SSE-NEXT: movl %esi, %ecx
; SSE-NEXT: andl $63, %ecx
; SSE-NEXT: shrl $3, %esi
; SSE-NEXT: andl $56, %esi
-; SSE-NEXT: movq -112(%rsp,%rsi), %rdx
-; SSE-NEXT: movq -120(%rsp,%rsi), %rax
+; SSE-NEXT: movq 16(%rsp,%rsi), %rdx
+; SSE-NEXT: movq 8(%rsp,%rsi), %rax
; SSE-NEXT: movq %rax, %r8
; SSE-NEXT: shrdq %cl, %rdx, %r8
-; SSE-NEXT: movq -104(%rsp,%rsi), %r9
+; SSE-NEXT: movq 24(%rsp,%rsi), %r9
; SSE-NEXT: shrdq %cl, %r9, %rdx
-; SSE-NEXT: movq -96(%rsp,%rsi), %r10
+; SSE-NEXT: movq 32(%rsp,%rsi), %r10
; SSE-NEXT: shrdq %cl, %r10, %r9
-; SSE-NEXT: movq -88(%rsp,%rsi), %r11
+; SSE-NEXT: movq 40(%rsp,%rsi), %r11
; SSE-NEXT: shrdq %cl, %r11, %r10
-; SSE-NEXT: movq -80(%rsp,%rsi), %rbx
+; SSE-NEXT: movq 48(%rsp,%rsi), %rbx
; SSE-NEXT: shrdq %cl, %rbx, %r11
-; SSE-NEXT: movq -72(%rsp,%rsi), %r14
+; SSE-NEXT: movq 56(%rsp,%rsi), %r14
; SSE-NEXT: shrdq %cl, %r14, %rbx
-; SSE-NEXT: movq -128(%rsp,%rsi), %rsi
+; SSE-NEXT: movq (%rsp,%rsi), %rsi
; SSE-NEXT: shrdq %cl, %rax, %rsi
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
@@ -2431,40 +2606,44 @@ define i512 @lshr_signbit_i512(i512 %a0) nounwind {
; SSE-NEXT: movq %rdx, 16(%rdi)
; SSE-NEXT: movq %r8, 8(%rdi)
; SSE-NEXT: movq %rsi, (%rdi)
-; SSE-NEXT: addq $8, %rsp
+; SSE-NEXT: leaq -16(%rbp), %rsp
; SSE-NEXT: popq %rbx
; SSE-NEXT: popq %r14
+; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
; AVX2-LABEL: lshr_signbit_i512:
; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: movq %rsp, %rbp
; AVX2-NEXT: pushq %r14
; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: pushq %rax
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $128, %rsp
; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,9223372036854775808]
-; AVX2-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm0, (%rsp)
; AVX2-NEXT: movl %esi, %ecx
; AVX2-NEXT: andl $63, %ecx
; AVX2-NEXT: shrl $3, %esi
; AVX2-NEXT: andl $56, %esi
-; AVX2-NEXT: movq -112(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq -120(%rsp,%rsi), %rax
+; AVX2-NEXT: movq 16(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq 8(%rsp,%rsi), %rax
; AVX2-NEXT: movq %rax, %r8
; AVX2-NEXT: shrdq %cl, %rdx, %r8
-; AVX2-NEXT: movq -104(%rsp,%rsi), %r9
+; AVX2-NEXT: movq 24(%rsp,%rsi), %r9
; AVX2-NEXT: shrdq %cl, %r9, %rdx
-; AVX2-NEXT: movq -96(%rsp,%rsi), %r10
+; AVX2-NEXT: movq 32(%rsp,%rsi), %r10
; AVX2-NEXT: shrdq %cl, %r10, %r9
-; AVX2-NEXT: movq -88(%rsp,%rsi), %r11
+; AVX2-NEXT: movq 40(%rsp,%rsi), %r11
; AVX2-NEXT: shrdq %cl, %r11, %r10
-; AVX2-NEXT: movq -80(%rsp,%rsi), %rbx
+; AVX2-NEXT: movq 48(%rsp,%rsi), %rbx
; AVX2-NEXT: shrdq %cl, %rbx, %r11
-; AVX2-NEXT: movq -128(%rsp,%rsi), %r14
-; AVX2-NEXT: movq -72(%rsp,%rsi), %rsi
+; AVX2-NEXT: movq (%rsp,%rsi), %r14
+; AVX2-NEXT: movq 56(%rsp,%rsi), %rsi
; AVX2-NEXT: shrdq %cl, %rsi, %rbx
; AVX2-NEXT: shrdq %cl, %rax, %r14
; AVX2-NEXT: movq %rdi, %rax
@@ -2477,39 +2656,43 @@ define i512 @lshr_signbit_i512(i512 %a0) nounwind {
; AVX2-NEXT: movq %rdx, 16(%rdi)
; AVX2-NEXT: movq %r8, 8(%rdi)
; AVX2-NEXT: movq %r14, (%rdi)
-; AVX2-NEXT: addq $8, %rsp
+; AVX2-NEXT: leaq -16(%rbp), %rsp
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %rbp
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: lshr_signbit_i512:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: movq %rsp, %rbp
; AVX512F-NEXT: pushq %r14
; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: pushq %rax
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $128, %rsp
; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: vmovaps {{.*#+}} zmm0 = [0,0,0,0,0,0,0,9223372036854775808]
-; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovups %zmm0, (%rsp)
; AVX512F-NEXT: movl %esi, %ecx
; AVX512F-NEXT: andl $63, %ecx
; AVX512F-NEXT: shrl $3, %esi
; AVX512F-NEXT: andl $56, %esi
-; AVX512F-NEXT: movq -112(%rsp,%rsi), %rdx
-; AVX512F-NEXT: movq -120(%rsp,%rsi), %rax
+; AVX512F-NEXT: movq 16(%rsp,%rsi), %rdx
+; AVX512F-NEXT: movq 8(%rsp,%rsi), %rax
; AVX512F-NEXT: movq %rax, %r8
; AVX512F-NEXT: shrdq %cl, %rdx, %r8
-; AVX512F-NEXT: movq -104(%rsp,%rsi), %r9
+; AVX512F-NEXT: movq 24(%rsp,%rsi), %r9
; AVX512F-NEXT: shrdq %cl, %r9, %rdx
-; AVX512F-NEXT: movq -96(%rsp,%rsi), %r10
+; AVX512F-NEXT: movq 32(%rsp,%rsi), %r10
; AVX512F-NEXT: shrdq %cl, %r10, %r9
-; AVX512F-NEXT: movq -88(%rsp,%rsi), %r11
+; AVX512F-NEXT: movq 40(%rsp,%rsi), %r11
; AVX512F-NEXT: shrdq %cl, %r11, %r10
-; AVX512F-NEXT: movq -80(%rsp,%rsi), %rbx
+; AVX512F-NEXT: movq 48(%rsp,%rsi), %rbx
; AVX512F-NEXT: shrdq %cl, %rbx, %r11
-; AVX512F-NEXT: movq -128(%rsp,%rsi), %r14
-; AVX512F-NEXT: movq -72(%rsp,%rsi), %rsi
+; AVX512F-NEXT: movq (%rsp,%rsi), %r14
+; AVX512F-NEXT: movq 56(%rsp,%rsi), %rsi
; AVX512F-NEXT: shrdq %cl, %rsi, %rbx
; AVX512F-NEXT: shrdq %cl, %rax, %r14
; AVX512F-NEXT: movq %rdi, %rax
@@ -2522,41 +2705,45 @@ define i512 @lshr_signbit_i512(i512 %a0) nounwind {
; AVX512F-NEXT: movq %rdx, 16(%rdi)
; AVX512F-NEXT: movq %r8, 8(%rdi)
; AVX512F-NEXT: movq %r14, (%rdi)
-; AVX512F-NEXT: addq $8, %rsp
+; AVX512F-NEXT: leaq -16(%rbp), %rsp
; AVX512F-NEXT: popq %rbx
; AVX512F-NEXT: popq %r14
+; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: lshr_signbit_i512:
; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: pushq %rbp
+; AVX512VL-NEXT: movq %rsp, %rbp
; AVX512VL-NEXT: pushq %r14
; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: pushq %rax
+; AVX512VL-NEXT: andq $-32, %rsp
+; AVX512VL-NEXT: subq $128, %rsp
; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
; AVX512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,9223372036854775808]
-; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovaps %ymm0, (%rsp)
; AVX512VL-NEXT: movl %esi, %ecx
; AVX512VL-NEXT: andl $63, %ecx
; AVX512VL-NEXT: shrl $3, %esi
; AVX512VL-NEXT: andl $56, %esi
-; AVX512VL-NEXT: movq -112(%rsp,%rsi), %rdx
-; AVX512VL-NEXT: movq -120(%rsp,%rsi), %rax
+; AVX512VL-NEXT: movq 16(%rsp,%rsi), %rdx
+; AVX512VL-NEXT: movq 8(%rsp,%rsi), %rax
; AVX512VL-NEXT: movq %rax, %r8
; AVX512VL-NEXT: shrdq %cl, %rdx, %r8
-; AVX512VL-NEXT: movq -104(%rsp,%rsi), %r9
+; AVX512VL-NEXT: movq 24(%rsp,%rsi), %r9
; AVX512VL-NEXT: shrdq %cl, %r9, %rdx
-; AVX512VL-NEXT: movq -96(%rsp,%rsi), %r10
+; AVX512VL-NEXT: movq 32(%rsp,%rsi), %r10
; AVX512VL-NEXT: shrdq %cl, %r10, %r9
-; AVX512VL-NEXT: movq -88(%rsp,%rsi), %r11
+; AVX512VL-NEXT: movq 40(%rsp,%rsi), %r11
; AVX512VL-NEXT: shrdq %cl, %r11, %r10
-; AVX512VL-NEXT: movq -80(%rsp,%rsi), %rbx
+; AVX512VL-NEXT: movq 48(%rsp,%rsi), %rbx
; AVX512VL-NEXT: shrdq %cl, %rbx, %r11
-; AVX512VL-NEXT: movq -72(%rsp,%rsi), %r14
+; AVX512VL-NEXT: movq 56(%rsp,%rsi), %r14
; AVX512VL-NEXT: shrdq %cl, %r14, %rbx
-; AVX512VL-NEXT: movq -128(%rsp,%rsi), %rsi
+; AVX512VL-NEXT: movq (%rsp,%rsi), %rsi
; AVX512VL-NEXT: shrdq %cl, %rax, %rsi
; AVX512VL-NEXT: movq %rdi, %rax
; AVX512VL-NEXT: shrxq %rcx, %r14, %rcx
@@ -2568,42 +2755,46 @@ define i512 @lshr_signbit_i512(i512 %a0) nounwind {
; AVX512VL-NEXT: movq %rdx, 16(%rdi)
; AVX512VL-NEXT: movq %r8, 8(%rdi)
; AVX512VL-NEXT: movq %rsi, (%rdi)
-; AVX512VL-NEXT: addq $8, %rsp
+; AVX512VL-NEXT: leaq -16(%rbp), %rsp
; AVX512VL-NEXT: popq %rbx
; AVX512VL-NEXT: popq %r14
+; AVX512VL-NEXT: popq %rbp
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: lshr_signbit_i512:
; AVX512VBMI: # %bb.0:
+; AVX512VBMI-NEXT: pushq %rbp
+; AVX512VBMI-NEXT: movq %rsp, %rbp
; AVX512VBMI-NEXT: pushq %r14
; AVX512VBMI-NEXT: pushq %rbx
-; AVX512VBMI-NEXT: pushq %rax
+; AVX512VBMI-NEXT: andq $-32, %rsp
+; AVX512VBMI-NEXT: subq $128, %rsp
; AVX512VBMI-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,9223372036854775808]
-; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovaps %ymm0, (%rsp)
; AVX512VBMI-NEXT: movl %esi, %ecx
; AVX512VBMI-NEXT: andl $63, %ecx
; AVX512VBMI-NEXT: shrl $3, %esi
; AVX512VBMI-NEXT: andl $56, %esi
-; AVX512VBMI-NEXT: movq -112(%rsp,%rsi), %rdx
-; AVX512VBMI-NEXT: movq -120(%rsp,%rsi), %rax
+; AVX512VBMI-NEXT: movq 16(%rsp,%rsi), %rdx
+; AVX512VBMI-NEXT: movq 8(%rsp,%rsi), %rax
; AVX512VBMI-NEXT: movq %rax, %r8
; AVX512VBMI-NEXT: shrdq %cl, %rdx, %r8
-; AVX512VBMI-NEXT: movq -104(%rsp,%rsi), %r9
+; AVX512VBMI-NEXT: movq 24(%rsp,%rsi), %r9
; AVX512VBMI-NEXT: shrdq %cl, %r9, %rdx
-; AVX512VBMI-NEXT: movq -96(%rsp,%rsi), %r10
+; AVX512VBMI-NEXT: movq 32(%rsp,%rsi), %r10
; AVX512VBMI-NEXT: shrdq %cl, %r10, %r9
-; AVX512VBMI-NEXT: movq -88(%rsp,%rsi), %r11
+; AVX512VBMI-NEXT: movq 40(%rsp,%rsi), %r11
; AVX512VBMI-NEXT: shrdq %cl, %r11, %r10
-; AVX512VBMI-NEXT: movq -80(%rsp,%rsi), %rbx
+; AVX512VBMI-NEXT: movq 48(%rsp,%rsi), %rbx
; AVX512VBMI-NEXT: shrdq %cl, %rbx, %r11
-; AVX512VBMI-NEXT: movq -72(%rsp,%rsi), %r14
+; AVX512VBMI-NEXT: movq 56(%rsp,%rsi), %r14
; AVX512VBMI-NEXT: shrdq %cl, %r14, %rbx
-; AVX512VBMI-NEXT: movq -128(%rsp,%rsi), %rsi
+; AVX512VBMI-NEXT: movq (%rsp,%rsi), %rsi
; AVX512VBMI-NEXT: shrdq %cl, %rax, %rsi
; AVX512VBMI-NEXT: movq %rdi, %rax
; AVX512VBMI-NEXT: shrxq %rcx, %r14, %rcx
@@ -2615,9 +2806,10 @@ define i512 @lshr_signbit_i512(i512 %a0) nounwind {
; AVX512VBMI-NEXT: movq %rdx, 16(%rdi)
; AVX512VBMI-NEXT: movq %r8, 8(%rdi)
; AVX512VBMI-NEXT: movq %rsi, (%rdi)
-; AVX512VBMI-NEXT: addq $8, %rsp
+; AVX512VBMI-NEXT: leaq -16(%rbp), %rsp
; AVX512VBMI-NEXT: popq %rbx
; AVX512VBMI-NEXT: popq %r14
+; AVX512VBMI-NEXT: popq %rbp
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%s = shl i512 1, 511
@@ -2628,43 +2820,46 @@ define i512 @lshr_signbit_i512(i512 %a0) nounwind {
define i512 @ashr_signbit_i512(i512 %a0) nounwind {
; SSE-LABEL: ashr_signbit_i512:
; SSE: # %bb.0:
+; SSE-NEXT: pushq %rbp
+; SSE-NEXT: movq %rsp, %rbp
; SSE-NEXT: pushq %r14
; SSE-NEXT: pushq %rbx
-; SSE-NEXT: pushq %rax
+; SSE-NEXT: andq $-32, %rsp
+; SSE-NEXT: subq $128, %rsp
; SSE-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $-1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $-1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $-1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $-1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $-1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $-1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $-1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $-1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, (%rsp)
+; SSE-NEXT: movq $-1, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $-1, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $-1, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $-1, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $-1, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $-1, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $-1, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $-1, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp)
; SSE-NEXT: movl %esi, %ecx
; SSE-NEXT: andl $63, %ecx
; SSE-NEXT: shrl $3, %esi
; SSE-NEXT: andl $56, %esi
-; SSE-NEXT: movq -112(%rsp,%rsi), %rdx
-; SSE-NEXT: movq -120(%rsp,%rsi), %rax
+; SSE-NEXT: movq 16(%rsp,%rsi), %rdx
+; SSE-NEXT: movq 8(%rsp,%rsi), %rax
; SSE-NEXT: movq %rax, %r8
; SSE-NEXT: shrdq %cl, %rdx, %r8
-; SSE-NEXT: movq -104(%rsp,%rsi), %r9
+; SSE-NEXT: movq 24(%rsp,%rsi), %r9
; SSE-NEXT: shrdq %cl, %r9, %rdx
-; SSE-NEXT: movq -96(%rsp,%rsi), %r10
+; SSE-NEXT: movq 32(%rsp,%rsi), %r10
; SSE-NEXT: shrdq %cl, %r10, %r9
-; SSE-NEXT: movq -88(%rsp,%rsi), %r11
+; SSE-NEXT: movq 40(%rsp,%rsi), %r11
; SSE-NEXT: shrdq %cl, %r11, %r10
-; SSE-NEXT: movq -80(%rsp,%rsi), %rbx
+; SSE-NEXT: movq 48(%rsp,%rsi), %rbx
; SSE-NEXT: shrdq %cl, %rbx, %r11
-; SSE-NEXT: movq -72(%rsp,%rsi), %r14
+; SSE-NEXT: movq 56(%rsp,%rsi), %r14
; SSE-NEXT: shrdq %cl, %r14, %rbx
-; SSE-NEXT: movq -128(%rsp,%rsi), %rsi
+; SSE-NEXT: movq (%rsp,%rsi), %rsi
; SSE-NEXT: shrdq %cl, %rax, %rsi
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
@@ -2677,41 +2872,45 @@ define i512 @ashr_signbit_i512(i512 %a0) nounwind {
; SSE-NEXT: movq %rdx, 16(%rdi)
; SSE-NEXT: movq %r8, 8(%rdi)
; SSE-NEXT: movq %rsi, (%rdi)
-; SSE-NEXT: addq $8, %rsp
+; SSE-NEXT: leaq -16(%rbp), %rsp
; SSE-NEXT: popq %rbx
; SSE-NEXT: popq %r14
+; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
; AVX2-LABEL: ashr_signbit_i512:
; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: movq %rsp, %rbp
; AVX2-NEXT: pushq %r14
; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: pushq %rax
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $128, %rsp
; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,9223372036854775808]
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm0, (%rsp)
; AVX2-NEXT: movl %esi, %ecx
; AVX2-NEXT: andl $63, %ecx
; AVX2-NEXT: shrl $3, %esi
; AVX2-NEXT: andl $56, %esi
-; AVX2-NEXT: movq -112(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq -120(%rsp,%rsi), %rax
+; AVX2-NEXT: movq 16(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq 8(%rsp,%rsi), %rax
; AVX2-NEXT: movq %rax, %r8
; AVX2-NEXT: shrdq %cl, %rdx, %r8
-; AVX2-NEXT: movq -104(%rsp,%rsi), %r9
+; AVX2-NEXT: movq 24(%rsp,%rsi), %r9
; AVX2-NEXT: shrdq %cl, %r9, %rdx
-; AVX2-NEXT: movq -96(%rsp,%rsi), %r10
+; AVX2-NEXT: movq 32(%rsp,%rsi), %r10
; AVX2-NEXT: shrdq %cl, %r10, %r9
-; AVX2-NEXT: movq -88(%rsp,%rsi), %r11
+; AVX2-NEXT: movq 40(%rsp,%rsi), %r11
; AVX2-NEXT: shrdq %cl, %r11, %r10
-; AVX2-NEXT: movq -80(%rsp,%rsi), %rbx
+; AVX2-NEXT: movq 48(%rsp,%rsi), %rbx
; AVX2-NEXT: shrdq %cl, %rbx, %r11
-; AVX2-NEXT: movq -128(%rsp,%rsi), %r14
-; AVX2-NEXT: movq -72(%rsp,%rsi), %rsi
+; AVX2-NEXT: movq (%rsp,%rsi), %r14
+; AVX2-NEXT: movq 56(%rsp,%rsi), %rsi
; AVX2-NEXT: shrdq %cl, %rsi, %rbx
; AVX2-NEXT: shrdq %cl, %rax, %r14
; AVX2-NEXT: movq %rdi, %rax
@@ -2724,39 +2923,43 @@ define i512 @ashr_signbit_i512(i512 %a0) nounwind {
; AVX2-NEXT: movq %rdx, 16(%rdi)
; AVX2-NEXT: movq %r8, 8(%rdi)
; AVX2-NEXT: movq %r14, (%rdi)
-; AVX2-NEXT: addq $8, %rsp
+; AVX2-NEXT: leaq -16(%rbp), %rsp
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %rbp
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: ashr_signbit_i512:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: movq %rsp, %rbp
; AVX512F-NEXT: pushq %r14
; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: pushq %rax
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $128, %rsp
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 = -1
-; AVX512F-NEXT: vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: vmovaps {{.*#+}} zmm0 = [0,0,0,0,0,0,0,9223372036854775808]
-; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovups %zmm0, (%rsp)
; AVX512F-NEXT: movl %esi, %ecx
; AVX512F-NEXT: andl $63, %ecx
; AVX512F-NEXT: shrl $3, %esi
; AVX512F-NEXT: andl $56, %esi
-; AVX512F-NEXT: movq -112(%rsp,%rsi), %rdx
-; AVX512F-NEXT: movq -120(%rsp,%rsi), %rax
+; AVX512F-NEXT: movq 16(%rsp,%rsi), %rdx
+; AVX512F-NEXT: movq 8(%rsp,%rsi), %rax
; AVX512F-NEXT: movq %rax, %r8
; AVX512F-NEXT: shrdq %cl, %rdx, %r8
-; AVX512F-NEXT: movq -104(%rsp,%rsi), %r9
+; AVX512F-NEXT: movq 24(%rsp,%rsi), %r9
; AVX512F-NEXT: shrdq %cl, %r9, %rdx
-; AVX512F-NEXT: movq -96(%rsp,%rsi), %r10
+; AVX512F-NEXT: movq 32(%rsp,%rsi), %r10
; AVX512F-NEXT: shrdq %cl, %r10, %r9
-; AVX512F-NEXT: movq -88(%rsp,%rsi), %r11
+; AVX512F-NEXT: movq 40(%rsp,%rsi), %r11
; AVX512F-NEXT: shrdq %cl, %r11, %r10
-; AVX512F-NEXT: movq -80(%rsp,%rsi), %rbx
+; AVX512F-NEXT: movq 48(%rsp,%rsi), %rbx
; AVX512F-NEXT: shrdq %cl, %rbx, %r11
-; AVX512F-NEXT: movq -128(%rsp,%rsi), %r14
-; AVX512F-NEXT: movq -72(%rsp,%rsi), %rsi
+; AVX512F-NEXT: movq (%rsp,%rsi), %r14
+; AVX512F-NEXT: movq 56(%rsp,%rsi), %rsi
; AVX512F-NEXT: shrdq %cl, %rsi, %rbx
; AVX512F-NEXT: shrdq %cl, %rax, %r14
; AVX512F-NEXT: movq %rdi, %rax
@@ -2769,42 +2972,46 @@ define i512 @ashr_signbit_i512(i512 %a0) nounwind {
; AVX512F-NEXT: movq %rdx, 16(%rdi)
; AVX512F-NEXT: movq %r8, 8(%rdi)
; AVX512F-NEXT: movq %r14, (%rdi)
-; AVX512F-NEXT: addq $8, %rsp
+; AVX512F-NEXT: leaq -16(%rbp), %rsp
; AVX512F-NEXT: popq %rbx
; AVX512F-NEXT: popq %r14
+; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: ashr_signbit_i512:
; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: pushq %rbp
+; AVX512VL-NEXT: movq %rsp, %rbp
; AVX512VL-NEXT: pushq %r14
; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: pushq %rax
+; AVX512VL-NEXT: andq $-32, %rsp
+; AVX512VL-NEXT: subq $128, %rsp
; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
-; AVX512VL-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
; AVX512VL-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,9223372036854775808]
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovaps %ymm0, (%rsp)
; AVX512VL-NEXT: movl %esi, %ecx
; AVX512VL-NEXT: andl $63, %ecx
; AVX512VL-NEXT: shrl $3, %esi
; AVX512VL-NEXT: andl $56, %esi
-; AVX512VL-NEXT: movq -112(%rsp,%rsi), %rdx
-; AVX512VL-NEXT: movq -120(%rsp,%rsi), %rax
+; AVX512VL-NEXT: movq 16(%rsp,%rsi), %rdx
+; AVX512VL-NEXT: movq 8(%rsp,%rsi), %rax
; AVX512VL-NEXT: movq %rax, %r8
; AVX512VL-NEXT: shrdq %cl, %rdx, %r8
-; AVX512VL-NEXT: movq -104(%rsp,%rsi), %r9
+; AVX512VL-NEXT: movq 24(%rsp,%rsi), %r9
; AVX512VL-NEXT: shrdq %cl, %r9, %rdx
-; AVX512VL-NEXT: movq -96(%rsp,%rsi), %r10
+; AVX512VL-NEXT: movq 32(%rsp,%rsi), %r10
; AVX512VL-NEXT: shrdq %cl, %r10, %r9
-; AVX512VL-NEXT: movq -88(%rsp,%rsi), %r11
+; AVX512VL-NEXT: movq 40(%rsp,%rsi), %r11
; AVX512VL-NEXT: shrdq %cl, %r11, %r10
-; AVX512VL-NEXT: movq -80(%rsp,%rsi), %rbx
+; AVX512VL-NEXT: movq 48(%rsp,%rsi), %rbx
; AVX512VL-NEXT: shrdq %cl, %rbx, %r11
-; AVX512VL-NEXT: movq -72(%rsp,%rsi), %r14
+; AVX512VL-NEXT: movq 56(%rsp,%rsi), %r14
; AVX512VL-NEXT: shrdq %cl, %r14, %rbx
-; AVX512VL-NEXT: movq -128(%rsp,%rsi), %rsi
+; AVX512VL-NEXT: movq (%rsp,%rsi), %rsi
; AVX512VL-NEXT: shrdq %cl, %rax, %rsi
; AVX512VL-NEXT: movq %rdi, %rax
; AVX512VL-NEXT: sarxq %rcx, %r14, %rcx
@@ -2816,43 +3023,47 @@ define i512 @ashr_signbit_i512(i512 %a0) nounwind {
; AVX512VL-NEXT: movq %rdx, 16(%rdi)
; AVX512VL-NEXT: movq %r8, 8(%rdi)
; AVX512VL-NEXT: movq %rsi, (%rdi)
-; AVX512VL-NEXT: addq $8, %rsp
+; AVX512VL-NEXT: leaq -16(%rbp), %rsp
; AVX512VL-NEXT: popq %rbx
; AVX512VL-NEXT: popq %r14
+; AVX512VL-NEXT: popq %rbp
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: ashr_signbit_i512:
; AVX512VBMI: # %bb.0:
+; AVX512VBMI-NEXT: pushq %rbp
+; AVX512VBMI-NEXT: movq %rsp, %rbp
; AVX512VBMI-NEXT: pushq %r14
; AVX512VBMI-NEXT: pushq %rbx
-; AVX512VBMI-NEXT: pushq %rax
+; AVX512VBMI-NEXT: andq $-32, %rsp
+; AVX512VBMI-NEXT: subq $128, %rsp
; AVX512VBMI-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
-; AVX512VBMI-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,9223372036854775808]
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovaps %ymm0, (%rsp)
; AVX512VBMI-NEXT: movl %esi, %ecx
; AVX512VBMI-NEXT: andl $63, %ecx
; AVX512VBMI-NEXT: shrl $3, %esi
; AVX512VBMI-NEXT: andl $56, %esi
-; AVX512VBMI-NEXT: movq -112(%rsp,%rsi), %rdx
-; AVX512VBMI-NEXT: movq -120(%rsp,%rsi), %rax
+; AVX512VBMI-NEXT: movq 16(%rsp,%rsi), %rdx
+; AVX512VBMI-NEXT: movq 8(%rsp,%rsi), %rax
; AVX512VBMI-NEXT: movq %rax, %r8
; AVX512VBMI-NEXT: shrdq %cl, %rdx, %r8
-; AVX512VBMI-NEXT: movq -104(%rsp,%rsi), %r9
+; AVX512VBMI-NEXT: movq 24(%rsp,%rsi), %r9
; AVX512VBMI-NEXT: shrdq %cl, %r9, %rdx
-; AVX512VBMI-NEXT: movq -96(%rsp,%rsi), %r10
+; AVX512VBMI-NEXT: movq 32(%rsp,%rsi), %r10
; AVX512VBMI-NEXT: shrdq %cl, %r10, %r9
-; AVX512VBMI-NEXT: movq -88(%rsp,%rsi), %r11
+; AVX512VBMI-NEXT: movq 40(%rsp,%rsi), %r11
; AVX512VBMI-NEXT: shrdq %cl, %r11, %r10
-; AVX512VBMI-NEXT: movq -80(%rsp,%rsi), %rbx
+; AVX512VBMI-NEXT: movq 48(%rsp,%rsi), %rbx
; AVX512VBMI-NEXT: shrdq %cl, %rbx, %r11
-; AVX512VBMI-NEXT: movq -72(%rsp,%rsi), %r14
+; AVX512VBMI-NEXT: movq 56(%rsp,%rsi), %r14
; AVX512VBMI-NEXT: shrdq %cl, %r14, %rbx
-; AVX512VBMI-NEXT: movq -128(%rsp,%rsi), %rsi
+; AVX512VBMI-NEXT: movq (%rsp,%rsi), %rsi
; AVX512VBMI-NEXT: shrdq %cl, %rax, %rsi
; AVX512VBMI-NEXT: movq %rdi, %rax
; AVX512VBMI-NEXT: sarxq %rcx, %r14, %rcx
@@ -2864,9 +3075,10 @@ define i512 @ashr_signbit_i512(i512 %a0) nounwind {
; AVX512VBMI-NEXT: movq %rdx, 16(%rdi)
; AVX512VBMI-NEXT: movq %r8, 8(%rdi)
; AVX512VBMI-NEXT: movq %rsi, (%rdi)
-; AVX512VBMI-NEXT: addq $8, %rsp
+; AVX512VBMI-NEXT: leaq -16(%rbp), %rsp
; AVX512VBMI-NEXT: popq %rbx
; AVX512VBMI-NEXT: popq %r14
+; AVX512VBMI-NEXT: popq %rbp
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%s = shl i512 1, 511
@@ -2877,130 +3089,150 @@ define i512 @ashr_signbit_i512(i512 %a0) nounwind {
define i64 @lshr_extract_i512_i64(i512 %a0, i512 %a1) nounwind {
; SSE-LABEL: lshr_extract_i512_i64:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rax
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: pushq %rbp
+; SSE-NEXT: movq %rsp, %rbp
+; SSE-NEXT: andq $-32, %rsp
+; SSE-NEXT: subq $160, %rsp
+; SSE-NEXT: movq 32(%rbp), %r10
+; SSE-NEXT: movaps 16(%rbp), %xmm0
; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rsi, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rdi, (%rsp)
; SSE-NEXT: movl %r10d, %ecx
; SSE-NEXT: shrl $3, %ecx
; SSE-NEXT: andl $56, %ecx
-; SSE-NEXT: movq -128(%rsp,%rcx), %rax
-; SSE-NEXT: movq -120(%rsp,%rcx), %rdx
+; SSE-NEXT: movq (%rsp,%rcx), %rax
+; SSE-NEXT: movq 8(%rsp,%rcx), %rdx
; SSE-NEXT: movl %r10d, %ecx
; SSE-NEXT: shrdq %cl, %rdx, %rax
-; SSE-NEXT: popq %rcx
+; SSE-NEXT: movq %rbp, %rsp
+; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
; AVX2-LABEL: lshr_extract_i512_i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %rax
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $160, %rsp
+; AVX2-NEXT: movq 32(%rbp), %r10
+; AVX2-NEXT: vmovaps 16(%rbp), %xmm0
; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rsi, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rdi, (%rsp)
; AVX2-NEXT: movl %r10d, %ecx
; AVX2-NEXT: shrl $3, %ecx
; AVX2-NEXT: andl $56, %ecx
-; AVX2-NEXT: movq -128(%rsp,%rcx), %rax
-; AVX2-NEXT: movq -120(%rsp,%rcx), %rdx
+; AVX2-NEXT: movq (%rsp,%rcx), %rax
+; AVX2-NEXT: movq 8(%rsp,%rcx), %rdx
; AVX2-NEXT: movl %r10d, %ecx
; AVX2-NEXT: shrdq %cl, %rdx, %rax
-; AVX2-NEXT: popq %rcx
+; AVX2-NEXT: movq %rbp, %rsp
+; AVX2-NEXT: popq %rbp
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: lshr_extract_i512_i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %rax
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: movq %rsp, %rbp
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $160, %rsp
+; AVX512F-NEXT: movq 32(%rbp), %r10
+; AVX512F-NEXT: vmovaps 16(%rbp), %xmm0
; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovups %zmm1, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rsi, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rdi, (%rsp)
; AVX512F-NEXT: movl %r10d, %ecx
; AVX512F-NEXT: shrl $3, %ecx
; AVX512F-NEXT: andl $56, %ecx
-; AVX512F-NEXT: movq -128(%rsp,%rcx), %rax
-; AVX512F-NEXT: movq -120(%rsp,%rcx), %rdx
+; AVX512F-NEXT: movq (%rsp,%rcx), %rax
+; AVX512F-NEXT: movq 8(%rsp,%rcx), %rdx
; AVX512F-NEXT: movl %r10d, %ecx
; AVX512F-NEXT: shrdq %cl, %rdx, %rax
-; AVX512F-NEXT: popq %rcx
+; AVX512F-NEXT: movq %rbp, %rsp
+; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: lshr_extract_i512_i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %rax
-; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
+; AVX512VL-NEXT: pushq %rbp
+; AVX512VL-NEXT: movq %rsp, %rbp
+; AVX512VL-NEXT: andq $-32, %rsp
+; AVX512VL-NEXT: subq $160, %rsp
+; AVX512VL-NEXT: movq 32(%rbp), %r10
+; AVX512VL-NEXT: vmovaps 16(%rbp), %xmm0
; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rsi, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rdi, (%rsp)
; AVX512VL-NEXT: movl %r10d, %ecx
; AVX512VL-NEXT: shrl $3, %ecx
; AVX512VL-NEXT: andl $56, %ecx
-; AVX512VL-NEXT: movq -128(%rsp,%rcx), %rax
-; AVX512VL-NEXT: movq -120(%rsp,%rcx), %rdx
+; AVX512VL-NEXT: movq (%rsp,%rcx), %rax
+; AVX512VL-NEXT: movq 8(%rsp,%rcx), %rdx
; AVX512VL-NEXT: movl %r10d, %ecx
; AVX512VL-NEXT: shrdq %cl, %rdx, %rax
-; AVX512VL-NEXT: popq %rcx
+; AVX512VL-NEXT: movq %rbp, %rsp
+; AVX512VL-NEXT: popq %rbp
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: lshr_extract_i512_i64:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %rax
-; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512VBMI-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
+; AVX512VBMI-NEXT: pushq %rbp
+; AVX512VBMI-NEXT: movq %rsp, %rbp
+; AVX512VBMI-NEXT: andq $-32, %rsp
+; AVX512VBMI-NEXT: subq $160, %rsp
+; AVX512VBMI-NEXT: movq 32(%rbp), %r10
+; AVX512VBMI-NEXT: vmovaps 16(%rbp), %xmm0
; AVX512VBMI-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rsi, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rdi, (%rsp)
; AVX512VBMI-NEXT: movl %r10d, %ecx
; AVX512VBMI-NEXT: shrl $3, %ecx
; AVX512VBMI-NEXT: andl $56, %ecx
-; AVX512VBMI-NEXT: movq -128(%rsp,%rcx), %rax
-; AVX512VBMI-NEXT: movq -120(%rsp,%rcx), %rdx
+; AVX512VBMI-NEXT: movq (%rsp,%rcx), %rax
+; AVX512VBMI-NEXT: movq 8(%rsp,%rcx), %rdx
; AVX512VBMI-NEXT: movl %r10d, %ecx
; AVX512VBMI-NEXT: shrdq %cl, %rdx, %rax
-; AVX512VBMI-NEXT: popq %rcx
+; AVX512VBMI-NEXT: movq %rbp, %rsp
+; AVX512VBMI-NEXT: popq %rbp
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%b = lshr i512 %a0, %a1
@@ -3011,36 +3243,40 @@ define i64 @lshr_extract_i512_i64(i512 %a0, i512 %a1) nounwind {
define i64 @ashr_extract_i512_i64(i512 %a0, i512 %a1) nounwind {
; CHECK-LABEL: ashr_extract_i512_i64:
; CHECK: # %bb.0:
-; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: movq %rsp, %rbp
+; CHECK-NEXT: andq $-32, %rsp
+; CHECK-NEXT: subq $160, %rsp
; CHECK-NEXT: movq %rcx, %rax
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; CHECK-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq 32(%rbp), %rcx
+; CHECK-NEXT: movq 16(%rbp), %r10
+; CHECK-NEXT: movq 24(%rbp), %r11
+; CHECK-NEXT: movq %r11, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq %rsi, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq %rdi, (%rsp)
; CHECK-NEXT: sarq $63, %r11
-; CHECK-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq %r11, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq %r11, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq %r11, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq %r11, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq %r11, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq %r11, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq %r11, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq %r11, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movl %ecx, %edx
; CHECK-NEXT: shrl $3, %edx
; CHECK-NEXT: andl $56, %edx
-; CHECK-NEXT: movq -128(%rsp,%rdx), %rax
-; CHECK-NEXT: movq -120(%rsp,%rdx), %rdx
+; CHECK-NEXT: movq (%rsp,%rdx), %rax
+; CHECK-NEXT: movq 8(%rsp,%rdx), %rdx
; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx
; CHECK-NEXT: shrdq %cl, %rdx, %rax
-; CHECK-NEXT: popq %rcx
+; CHECK-NEXT: movq %rbp, %rsp
+; CHECK-NEXT: popq %rbp
; CHECK-NEXT: retq
%b = ashr i512 %a0, %a1
%r = trunc i512 %b to i64
@@ -3050,112 +3286,132 @@ define i64 @ashr_extract_i512_i64(i512 %a0, i512 %a1) nounwind {
define i64 @lshr_extract_load_i512_i64(ptr %p0, i512 %a1) nounwind {
; SSE-LABEL: lshr_extract_load_i512_i64:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rax
+; SSE-NEXT: pushq %rbp
+; SSE-NEXT: movq %rsp, %rbp
+; SSE-NEXT: andq $-32, %rsp
+; SSE-NEXT: subq $160, %rsp
; SSE-NEXT: movq %rsi, %rcx
; SSE-NEXT: movaps (%rdi), %xmm0
; SSE-NEXT: movaps 16(%rdi), %xmm1
; SSE-NEXT: movaps 32(%rdi), %xmm2
; SSE-NEXT: movaps 48(%rdi), %xmm3
; SSE-NEXT: xorps %xmm4, %xmm4
-; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, (%rsp)
; SSE-NEXT: movl %ecx, %edx
; SSE-NEXT: shrl $3, %edx
; SSE-NEXT: andl $56, %edx
-; SSE-NEXT: movq -128(%rsp,%rdx), %rax
-; SSE-NEXT: movq -120(%rsp,%rdx), %rdx
+; SSE-NEXT: movq (%rsp,%rdx), %rax
+; SSE-NEXT: movq 8(%rsp,%rdx), %rdx
; SSE-NEXT: # kill: def $cl killed $cl killed $rcx
; SSE-NEXT: shrdq %cl, %rdx, %rax
-; SSE-NEXT: popq %rcx
+; SSE-NEXT: movq %rbp, %rsp
+; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
; AVX2-LABEL: lshr_extract_load_i512_i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %rax
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $160, %rsp
; AVX2-NEXT: movq %rsi, %rcx
-; AVX2-NEXT: vmovups (%rdi), %ymm0
-; AVX2-NEXT: vmovups 32(%rdi), %ymm1
+; AVX2-NEXT: vmovaps (%rdi), %ymm0
+; AVX2-NEXT: vmovaps 32(%rdi), %ymm1
; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm0, (%rsp)
; AVX2-NEXT: movl %ecx, %edx
; AVX2-NEXT: shrl $3, %edx
; AVX2-NEXT: andl $56, %edx
-; AVX2-NEXT: movq -128(%rsp,%rdx), %rax
-; AVX2-NEXT: movq -120(%rsp,%rdx), %rdx
+; AVX2-NEXT: movq (%rsp,%rdx), %rax
+; AVX2-NEXT: movq 8(%rsp,%rdx), %rdx
; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
; AVX2-NEXT: shrdq %cl, %rdx, %rax
-; AVX2-NEXT: popq %rcx
+; AVX2-NEXT: movq %rbp, %rsp
+; AVX2-NEXT: popq %rbp
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: lshr_extract_load_i512_i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %rax
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: movq %rsp, %rbp
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $160, %rsp
; AVX512F-NEXT: movq %rsi, %rcx
; AVX512F-NEXT: vmovups (%rdi), %zmm0
; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovups %zmm1, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovups %zmm0, (%rsp)
; AVX512F-NEXT: movl %ecx, %edx
; AVX512F-NEXT: shrl $3, %edx
; AVX512F-NEXT: andl $56, %edx
-; AVX512F-NEXT: movq -128(%rsp,%rdx), %rax
-; AVX512F-NEXT: movq -120(%rsp,%rdx), %rdx
+; AVX512F-NEXT: movq (%rsp,%rdx), %rax
+; AVX512F-NEXT: movq 8(%rsp,%rdx), %rdx
; AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx
; AVX512F-NEXT: shrdq %cl, %rdx, %rax
-; AVX512F-NEXT: popq %rcx
+; AVX512F-NEXT: movq %rbp, %rsp
+; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: lshr_extract_load_i512_i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %rax
-; AVX512VL-NEXT: vmovups (%rdi), %ymm0
-; AVX512VL-NEXT: vmovups 32(%rdi), %ymm1
+; AVX512VL-NEXT: pushq %rbp
+; AVX512VL-NEXT: movq %rsp, %rbp
+; AVX512VL-NEXT: andq $-32, %rsp
+; AVX512VL-NEXT: subq $160, %rsp
+; AVX512VL-NEXT: vmovaps (%rdi), %ymm0
+; AVX512VL-NEXT: vmovaps 32(%rdi), %ymm1
; AVX512VL-NEXT: movq %rsi, %rcx
; AVX512VL-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovaps %ymm0, (%rsp)
; AVX512VL-NEXT: movl %ecx, %edx
; AVX512VL-NEXT: shrl $3, %edx
; AVX512VL-NEXT: andl $56, %edx
-; AVX512VL-NEXT: movq -128(%rsp,%rdx), %rax
-; AVX512VL-NEXT: movq -120(%rsp,%rdx), %rdx
+; AVX512VL-NEXT: movq (%rsp,%rdx), %rax
+; AVX512VL-NEXT: movq 8(%rsp,%rdx), %rdx
; AVX512VL-NEXT: # kill: def $cl killed $cl killed $rcx
; AVX512VL-NEXT: shrdq %cl, %rdx, %rax
-; AVX512VL-NEXT: popq %rcx
+; AVX512VL-NEXT: movq %rbp, %rsp
+; AVX512VL-NEXT: popq %rbp
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: lshr_extract_load_i512_i64:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %rax
-; AVX512VBMI-NEXT: vmovups (%rdi), %ymm0
-; AVX512VBMI-NEXT: vmovups 32(%rdi), %ymm1
+; AVX512VBMI-NEXT: pushq %rbp
+; AVX512VBMI-NEXT: movq %rsp, %rbp
+; AVX512VBMI-NEXT: andq $-32, %rsp
+; AVX512VBMI-NEXT: subq $160, %rsp
+; AVX512VBMI-NEXT: vmovaps (%rdi), %ymm0
+; AVX512VBMI-NEXT: vmovaps 32(%rdi), %ymm1
; AVX512VBMI-NEXT: movq %rsi, %rcx
; AVX512VBMI-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512VBMI-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovaps %ymm0, (%rsp)
; AVX512VBMI-NEXT: movl %ecx, %edx
; AVX512VBMI-NEXT: shrl $3, %edx
; AVX512VBMI-NEXT: andl $56, %edx
-; AVX512VBMI-NEXT: movq -128(%rsp,%rdx), %rax
-; AVX512VBMI-NEXT: movq -120(%rsp,%rdx), %rdx
+; AVX512VBMI-NEXT: movq (%rsp,%rdx), %rax
+; AVX512VBMI-NEXT: movq 8(%rsp,%rdx), %rdx
; AVX512VBMI-NEXT: # kill: def $cl killed $cl killed $rcx
; AVX512VBMI-NEXT: shrdq %cl, %rdx, %rax
-; AVX512VBMI-NEXT: popq %rcx
+; AVX512VBMI-NEXT: movq %rbp, %rsp
+; AVX512VBMI-NEXT: popq %rbp
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = load i512, ptr %p0
@@ -3167,161 +3423,181 @@ define i64 @lshr_extract_load_i512_i64(ptr %p0, i512 %a1) nounwind {
define i64 @ashr_extract_load_i512_i64(ptr %p0, i512 %a1) nounwind {
; SSE-LABEL: ashr_extract_load_i512_i64:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rax
+; SSE-NEXT: pushq %rbp
+; SSE-NEXT: movq %rsp, %rbp
+; SSE-NEXT: andq $-32, %rsp
+; SSE-NEXT: subq $160, %rsp
; SSE-NEXT: movq %rsi, %rcx
; SSE-NEXT: movaps (%rdi), %xmm0
; SSE-NEXT: movaps 16(%rdi), %xmm1
; SSE-NEXT: movaps 32(%rdi), %xmm2
; SSE-NEXT: movq 48(%rdi), %rax
; SSE-NEXT: movq 56(%rdi), %rdx
-; SSE-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, (%rsp)
; SSE-NEXT: sarq $63, %rdx
-; SSE-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
; SSE-NEXT: movl %ecx, %edx
; SSE-NEXT: shrl $3, %edx
; SSE-NEXT: andl $56, %edx
-; SSE-NEXT: movq -128(%rsp,%rdx), %rax
-; SSE-NEXT: movq -120(%rsp,%rdx), %rdx
+; SSE-NEXT: movq (%rsp,%rdx), %rax
+; SSE-NEXT: movq 8(%rsp,%rdx), %rdx
; SSE-NEXT: # kill: def $cl killed $cl killed $rcx
; SSE-NEXT: shrdq %cl, %rdx, %rax
-; SSE-NEXT: popq %rcx
+; SSE-NEXT: movq %rbp, %rsp
+; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
; AVX2-LABEL: ashr_extract_load_i512_i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %rax
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $160, %rsp
; AVX2-NEXT: movq %rsi, %rcx
-; AVX2-NEXT: vmovups (%rdi), %ymm0
+; AVX2-NEXT: vmovaps (%rdi), %ymm0
; AVX2-NEXT: vmovaps 32(%rdi), %xmm1
; AVX2-NEXT: movq 48(%rdi), %rax
; AVX2-NEXT: movq 56(%rdi), %rdx
-; AVX2-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm0, (%rsp)
; AVX2-NEXT: sarq $63, %rdx
-; AVX2-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
; AVX2-NEXT: movl %ecx, %edx
; AVX2-NEXT: shrl $3, %edx
; AVX2-NEXT: andl $56, %edx
-; AVX2-NEXT: movq -128(%rsp,%rdx), %rax
-; AVX2-NEXT: movq -120(%rsp,%rdx), %rdx
+; AVX2-NEXT: movq (%rsp,%rdx), %rax
+; AVX2-NEXT: movq 8(%rsp,%rdx), %rdx
; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
; AVX2-NEXT: shrdq %cl, %rdx, %rax
-; AVX2-NEXT: popq %rcx
+; AVX2-NEXT: movq %rbp, %rsp
+; AVX2-NEXT: popq %rbp
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: ashr_extract_load_i512_i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %rax
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: movq %rsp, %rbp
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $160, %rsp
; AVX512F-NEXT: movq %rsi, %rcx
-; AVX512F-NEXT: vmovups (%rdi), %ymm0
+; AVX512F-NEXT: vmovaps (%rdi), %ymm0
; AVX512F-NEXT: vmovaps 32(%rdi), %xmm1
; AVX512F-NEXT: movq 48(%rdi), %rax
; AVX512F-NEXT: movq 56(%rdi), %rdx
-; AVX512F-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovaps %ymm0, (%rsp)
; AVX512F-NEXT: sarq $63, %rdx
-; AVX512F-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: movl %ecx, %edx
; AVX512F-NEXT: shrl $3, %edx
; AVX512F-NEXT: andl $56, %edx
-; AVX512F-NEXT: movq -128(%rsp,%rdx), %rax
-; AVX512F-NEXT: movq -120(%rsp,%rdx), %rdx
+; AVX512F-NEXT: movq (%rsp,%rdx), %rax
+; AVX512F-NEXT: movq 8(%rsp,%rdx), %rdx
; AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx
; AVX512F-NEXT: shrdq %cl, %rdx, %rax
-; AVX512F-NEXT: popq %rcx
+; AVX512F-NEXT: movq %rbp, %rsp
+; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: ashr_extract_load_i512_i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %rax
+; AVX512VL-NEXT: pushq %rbp
+; AVX512VL-NEXT: movq %rsp, %rbp
+; AVX512VL-NEXT: andq $-32, %rsp
+; AVX512VL-NEXT: subq $160, %rsp
; AVX512VL-NEXT: movq %rsi, %rcx
-; AVX512VL-NEXT: vmovups (%rdi), %ymm0
+; AVX512VL-NEXT: vmovaps (%rdi), %ymm0
; AVX512VL-NEXT: vmovaps 32(%rdi), %xmm1
; AVX512VL-NEXT: movq 48(%rdi), %rax
; AVX512VL-NEXT: movq 56(%rdi), %rdx
-; AVX512VL-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovaps %ymm0, (%rsp)
+; AVX512VL-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX512VL-NEXT: sarq $63, %rdx
-; AVX512VL-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movl %ecx, %edx
; AVX512VL-NEXT: shrl $3, %edx
; AVX512VL-NEXT: andl $56, %edx
-; AVX512VL-NEXT: movq -128(%rsp,%rdx), %rax
-; AVX512VL-NEXT: movq -120(%rsp,%rdx), %rdx
+; AVX512VL-NEXT: movq (%rsp,%rdx), %rax
+; AVX512VL-NEXT: movq 8(%rsp,%rdx), %rdx
; AVX512VL-NEXT: # kill: def $cl killed $cl killed $rcx
; AVX512VL-NEXT: shrdq %cl, %rdx, %rax
-; AVX512VL-NEXT: popq %rcx
+; AVX512VL-NEXT: movq %rbp, %rsp
+; AVX512VL-NEXT: popq %rbp
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: ashr_extract_load_i512_i64:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %rax
+; AVX512VBMI-NEXT: pushq %rbp
+; AVX512VBMI-NEXT: movq %rsp, %rbp
+; AVX512VBMI-NEXT: andq $-32, %rsp
+; AVX512VBMI-NEXT: subq $160, %rsp
; AVX512VBMI-NEXT: movq %rsi, %rcx
-; AVX512VBMI-NEXT: vmovups (%rdi), %ymm0
+; AVX512VBMI-NEXT: vmovaps (%rdi), %ymm0
; AVX512VBMI-NEXT: vmovaps 32(%rdi), %xmm1
; AVX512VBMI-NEXT: movq 48(%rdi), %rax
; AVX512VBMI-NEXT: movq 56(%rdi), %rdx
-; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovaps %ymm0, (%rsp)
+; AVX512VBMI-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: sarq $63, %rdx
-; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movl %ecx, %edx
; AVX512VBMI-NEXT: shrl $3, %edx
; AVX512VBMI-NEXT: andl $56, %edx
-; AVX512VBMI-NEXT: movq -128(%rsp,%rdx), %rax
-; AVX512VBMI-NEXT: movq -120(%rsp,%rdx), %rdx
+; AVX512VBMI-NEXT: movq (%rsp,%rdx), %rax
+; AVX512VBMI-NEXT: movq 8(%rsp,%rdx), %rdx
; AVX512VBMI-NEXT: # kill: def $cl killed $cl killed $rcx
; AVX512VBMI-NEXT: shrdq %cl, %rdx, %rax
-; AVX512VBMI-NEXT: popq %rcx
+; AVX512VBMI-NEXT: movq %rbp, %rsp
+; AVX512VBMI-NEXT: popq %rbp
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = load i512, ptr %p0
@@ -3333,82 +3609,102 @@ define i64 @ashr_extract_load_i512_i64(ptr %p0, i512 %a1) nounwind {
define i64 @lshr_extract_idx_load_i512_i64(ptr %p0, i512 %a1) nounwind {
; SSE-LABEL: lshr_extract_idx_load_i512_i64:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rax
+; SSE-NEXT: pushq %rbp
+; SSE-NEXT: movq %rsp, %rbp
+; SSE-NEXT: andq $-32, %rsp
+; SSE-NEXT: subq $160, %rsp
; SSE-NEXT: movaps (%rdi), %xmm0
; SSE-NEXT: movaps 16(%rdi), %xmm1
; SSE-NEXT: movaps 32(%rdi), %xmm2
; SSE-NEXT: movaps 48(%rdi), %xmm3
; SSE-NEXT: xorps %xmm4, %xmm4
-; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, (%rsp)
; SSE-NEXT: andl $7, %esi
-; SSE-NEXT: movq -128(%rsp,%rsi,8), %rax
-; SSE-NEXT: popq %rcx
+; SSE-NEXT: movq (%rsp,%rsi,8), %rax
+; SSE-NEXT: movq %rbp, %rsp
+; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
; AVX2-LABEL: lshr_extract_idx_load_i512_i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %rax
-; AVX2-NEXT: vmovups (%rdi), %ymm0
-; AVX2-NEXT: vmovups 32(%rdi), %ymm1
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $160, %rsp
+; AVX2-NEXT: vmovaps (%rdi), %ymm0
+; AVX2-NEXT: vmovaps 32(%rdi), %ymm1
; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm0, (%rsp)
; AVX2-NEXT: andl $7, %esi
-; AVX2-NEXT: movq -128(%rsp,%rsi,8), %rax
-; AVX2-NEXT: popq %rcx
+; AVX2-NEXT: movq (%rsp,%rsi,8), %rax
+; AVX2-NEXT: movq %rbp, %rsp
+; AVX2-NEXT: popq %rbp
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: lshr_extract_idx_load_i512_i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %rax
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: movq %rsp, %rbp
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $160, %rsp
; AVX512F-NEXT: vmovups (%rdi), %zmm0
; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovups %zmm1, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovups %zmm0, (%rsp)
; AVX512F-NEXT: andl $7, %esi
-; AVX512F-NEXT: movq -128(%rsp,%rsi,8), %rax
-; AVX512F-NEXT: popq %rcx
+; AVX512F-NEXT: movq (%rsp,%rsi,8), %rax
+; AVX512F-NEXT: movq %rbp, %rsp
+; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: lshr_extract_idx_load_i512_i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %rax
-; AVX512VL-NEXT: vmovups (%rdi), %ymm0
-; AVX512VL-NEXT: vmovups 32(%rdi), %ymm1
+; AVX512VL-NEXT: pushq %rbp
+; AVX512VL-NEXT: movq %rsp, %rbp
+; AVX512VL-NEXT: andq $-32, %rsp
+; AVX512VL-NEXT: subq $160, %rsp
+; AVX512VL-NEXT: vmovaps (%rdi), %ymm0
+; AVX512VL-NEXT: vmovaps 32(%rdi), %ymm1
; AVX512VL-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovaps %ymm0, (%rsp)
; AVX512VL-NEXT: andl $7, %esi
-; AVX512VL-NEXT: movq -128(%rsp,%rsi,8), %rax
-; AVX512VL-NEXT: popq %rcx
+; AVX512VL-NEXT: movq (%rsp,%rsi,8), %rax
+; AVX512VL-NEXT: movq %rbp, %rsp
+; AVX512VL-NEXT: popq %rbp
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: lshr_extract_idx_load_i512_i64:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %rax
-; AVX512VBMI-NEXT: vmovups (%rdi), %ymm0
-; AVX512VBMI-NEXT: vmovups 32(%rdi), %ymm1
+; AVX512VBMI-NEXT: pushq %rbp
+; AVX512VBMI-NEXT: movq %rsp, %rbp
+; AVX512VBMI-NEXT: andq $-32, %rsp
+; AVX512VBMI-NEXT: subq $160, %rsp
+; AVX512VBMI-NEXT: vmovaps (%rdi), %ymm0
+; AVX512VBMI-NEXT: vmovaps 32(%rdi), %ymm1
; AVX512VBMI-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512VBMI-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovaps %ymm0, (%rsp)
; AVX512VBMI-NEXT: andl $7, %esi
-; AVX512VBMI-NEXT: movq -128(%rsp,%rsi,8), %rax
-; AVX512VBMI-NEXT: popq %rcx
+; AVX512VBMI-NEXT: movq (%rsp,%rsi,8), %rax
+; AVX512VBMI-NEXT: movq %rbp, %rsp
+; AVX512VBMI-NEXT: popq %rbp
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = load i512, ptr %p0
@@ -3421,131 +3717,151 @@ define i64 @lshr_extract_idx_load_i512_i64(ptr %p0, i512 %a1) nounwind {
define i64 @ashr_extract_idx_load_i512_i64(ptr %p0, i512 %a1) nounwind {
; SSE-LABEL: ashr_extract_idx_load_i512_i64:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rax
+; SSE-NEXT: pushq %rbp
+; SSE-NEXT: movq %rsp, %rbp
+; SSE-NEXT: andq $-32, %rsp
+; SSE-NEXT: subq $160, %rsp
; SSE-NEXT: movaps (%rdi), %xmm0
; SSE-NEXT: movaps 16(%rdi), %xmm1
; SSE-NEXT: movaps 32(%rdi), %xmm2
; SSE-NEXT: movq 48(%rdi), %rax
; SSE-NEXT: movq 56(%rdi), %rcx
-; SSE-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, (%rsp)
; SSE-NEXT: sarq $63, %rcx
-; SSE-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
; SSE-NEXT: andl $7, %esi
-; SSE-NEXT: movq -128(%rsp,%rsi,8), %rax
-; SSE-NEXT: popq %rcx
+; SSE-NEXT: movq (%rsp,%rsi,8), %rax
+; SSE-NEXT: movq %rbp, %rsp
+; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
; AVX2-LABEL: ashr_extract_idx_load_i512_i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %rax
-; AVX2-NEXT: vmovups (%rdi), %ymm0
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $160, %rsp
+; AVX2-NEXT: vmovaps (%rdi), %ymm0
; AVX2-NEXT: vmovaps 32(%rdi), %xmm1
; AVX2-NEXT: movq 48(%rdi), %rax
; AVX2-NEXT: movq 56(%rdi), %rcx
-; AVX2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps %ymm0, (%rsp)
+; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX2-NEXT: sarq $63, %rcx
-; AVX2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
; AVX2-NEXT: andl $7, %esi
-; AVX2-NEXT: movq -128(%rsp,%rsi,8), %rax
-; AVX2-NEXT: popq %rcx
+; AVX2-NEXT: movq (%rsp,%rsi,8), %rax
+; AVX2-NEXT: movq %rbp, %rsp
+; AVX2-NEXT: popq %rbp
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: ashr_extract_idx_load_i512_i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %rax
-; AVX512F-NEXT: vmovups (%rdi), %ymm0
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: movq %rsp, %rbp
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $160, %rsp
+; AVX512F-NEXT: vmovaps (%rdi), %ymm0
; AVX512F-NEXT: vmovaps 32(%rdi), %xmm1
; AVX512F-NEXT: movq 48(%rdi), %rax
; AVX512F-NEXT: movq 56(%rdi), %rcx
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovaps %ymm0, (%rsp)
+; AVX512F-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: sarq $63, %rcx
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: andl $7, %esi
-; AVX512F-NEXT: movq -128(%rsp,%rsi,8), %rax
-; AVX512F-NEXT: popq %rcx
+; AVX512F-NEXT: movq (%rsp,%rsi,8), %rax
+; AVX512F-NEXT: movq %rbp, %rsp
+; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: ashr_extract_idx_load_i512_i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %rax
-; AVX512VL-NEXT: vmovups (%rdi), %ymm0
+; AVX512VL-NEXT: pushq %rbp
+; AVX512VL-NEXT: movq %rsp, %rbp
+; AVX512VL-NEXT: andq $-32, %rsp
+; AVX512VL-NEXT: subq $160, %rsp
+; AVX512VL-NEXT: vmovaps (%rdi), %ymm0
; AVX512VL-NEXT: vmovaps 32(%rdi), %xmm1
; AVX512VL-NEXT: movq 48(%rdi), %rax
; AVX512VL-NEXT: movq 56(%rdi), %rcx
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovaps %ymm0, (%rsp)
+; AVX512VL-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX512VL-NEXT: sarq $63, %rcx
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
; AVX512VL-NEXT: andl $7, %esi
-; AVX512VL-NEXT: movq -128(%rsp,%rsi,8), %rax
-; AVX512VL-NEXT: popq %rcx
+; AVX512VL-NEXT: movq (%rsp,%rsi,8), %rax
+; AVX512VL-NEXT: movq %rbp, %rsp
+; AVX512VL-NEXT: popq %rbp
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: ashr_extract_idx_load_i512_i64:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %rax
-; AVX512VBMI-NEXT: vmovups (%rdi), %ymm0
+; AVX512VBMI-NEXT: pushq %rbp
+; AVX512VBMI-NEXT: movq %rsp, %rbp
+; AVX512VBMI-NEXT: andq $-32, %rsp
+; AVX512VBMI-NEXT: subq $160, %rsp
+; AVX512VBMI-NEXT: vmovaps (%rdi), %ymm0
; AVX512VBMI-NEXT: vmovaps 32(%rdi), %xmm1
; AVX512VBMI-NEXT: movq 48(%rdi), %rax
; AVX512VBMI-NEXT: movq 56(%rdi), %rcx
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovaps %ymm0, (%rsp)
+; AVX512VBMI-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: sarq $63, %rcx
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: andl $7, %esi
-; AVX512VBMI-NEXT: movq -128(%rsp,%rsi,8), %rax
-; AVX512VBMI-NEXT: popq %rcx
+; AVX512VBMI-NEXT: movq (%rsp,%rsi,8), %rax
+; AVX512VBMI-NEXT: movq %rbp, %rsp
+; AVX512VBMI-NEXT: popq %rbp
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = load i512, ptr %p0
diff --git a/llvm/test/CodeGen/X86/smul-with-overflow.ll b/llvm/test/CodeGen/X86/smul-with-overflow.ll
index df167338268c4..672aacc4771d0 100644
--- a/llvm/test/CodeGen/X86/smul-with-overflow.ll
+++ b/llvm/test/CodeGen/X86/smul-with-overflow.ll
@@ -696,143 +696,117 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
; X64-LABEL: smul_ovf:
; X64: # %bb.0:
; X64-NEXT: pushq %rbp
+; X64-NEXT: movq %rsp, %rbp
; X64-NEXT: pushq %r15
; X64-NEXT: pushq %r14
; X64-NEXT: pushq %r13
; X64-NEXT: pushq %r12
; X64-NEXT: pushq %rbx
+; X64-NEXT: andq $-32, %rsp
+; X64-NEXT: subq $192, %rsp
+; X64-NEXT: movq %r9, %r13
+; X64-NEXT: movq %r8, %r9
+; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movq %rcx, %r14
-; X64-NEXT: movq %rdx, %r15
-; X64-NEXT: movq %rsi, %rbx
+; X64-NEXT: movq %rdx, %rbx
+; X64-NEXT: movq %rsi, %r12
; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %r13
-; X64-NEXT: andl $1, %r13d
-; X64-NEXT: negq %r13
; X64-NEXT: andl $1, %r14d
; X64-NEXT: negq %r14
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %r11
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: movq %rax, %r12
-; X64-NEXT: addq %rdx, %r12
-; X64-NEXT: adcq $0, %r11
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: mulq %r9
-; X64-NEXT: addq %rax, %r12
-; X64-NEXT: adcq %rdx, %r11
-; X64-NEXT: setb %cl
-; X64-NEXT: movzbl %cl, %ecx
-; X64-NEXT: addq %rax, %r11
-; X64-NEXT: adcq %rdx, %rcx
-; X64-NEXT: addq %rdi, %r11
-; X64-NEXT: adcq %r12, %rcx
-; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %r10
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %r15, %rax
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %r8
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: addq %r10, %rbp
-; X64-NEXT: adcq $0, %r8
-; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: addq %rbp, %rax
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %r8, %rsi
-; X64-NEXT: setb %al
-; X64-NEXT: movzbl %al, %ebp
-; X64-NEXT: movq %r15, %rax
-; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rdx, %r8
-; X64-NEXT: movq %rax, %r10
-; X64-NEXT: addq %rsi, %r10
-; X64-NEXT: adcq %rbp, %r8
-; X64-NEXT: addq %rdi, %r10
-; X64-NEXT: adcq %r12, %r8
-; X64-NEXT: adcq $0, %r11
-; X64-NEXT: adcq $0, %rcx
-; X64-NEXT: movq %r13, %rax
-; X64-NEXT: mulq %rbx
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %r9
-; X64-NEXT: movq %r15, %rax
-; X64-NEXT: mulq %r13
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %rsi, %rbx
-; X64-NEXT: movq %rdx, %r15
-; X64-NEXT: adcq $0, %r15
-; X64-NEXT: addq %r9, %rbx
-; X64-NEXT: adcq %rsi, %r15
-; X64-NEXT: setb %sil
-; X64-NEXT: movzbl %sil, %esi
-; X64-NEXT: addq %rax, %r15
-; X64-NEXT: adcq %rdx, %rsi
-; X64-NEXT: addq %r9, %r15
-; X64-NEXT: adcq %rbx, %rsi
-; X64-NEXT: addq %r9, %r10
-; X64-NEXT: adcq %r8, %rbx
-; X64-NEXT: adcq $0, %r15
+; X64-NEXT: movq 16(%rbp), %r15
+; X64-NEXT: andl $1, %r15d
+; X64-NEXT: negq %r15
+; X64-NEXT: subq $8, %rsp
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: xorl %r8d, %r8d
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq %r13
+; X64-NEXT: callq __multi5 at PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq %r12, %rsi
+; X64-NEXT: movq %rbx, %rdx
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: xorl %r8d, %r8d
+; X64-NEXT: movq %r15, %r9
+; X64-NEXT: pushq %r15
+; X64-NEXT: pushq %r15
+; X64-NEXT: pushq %r15
+; X64-NEXT: callq __multi5 at PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq %r14, %rsi
+; X64-NEXT: movq %r14, %rdx
+; X64-NEXT: movq %r14, %rcx
+; X64-NEXT: movq %r14, %r8
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq %r13
+; X64-NEXT: callq __multi5 at PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq %r14, %rsi
+; X64-NEXT: movq %r14, %rdx
+; X64-NEXT: movq %r14, %rcx
+; X64-NEXT: movq %r14, %r8
+; X64-NEXT: movq %r15, %r9
+; X64-NEXT: pushq %r15
+; X64-NEXT: pushq %r15
+; X64-NEXT: pushq %r15
+; X64-NEXT: callq __multi5 at PLT
+; X64-NEXT: addq $32, %rsp
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT: addq {{[0-9]+}}(%rsp), %r11
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT: adcq $0, %r9
; X64-NEXT: adcq $0, %rsi
-; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: sarq $63, %rax
-; X64-NEXT: movq %rcx, %rdi
-; X64-NEXT: sarq $63, %rdi
-; X64-NEXT: addq %r11, %r15
-; X64-NEXT: adcq %rcx, %rsi
-; X64-NEXT: movq %rdi, %r9
-; X64-NEXT: adcq %rax, %r9
-; X64-NEXT: adcq %rax, %rdi
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: mulq %r13
-; X64-NEXT: movq %rax, %r8
-; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: addq %rdx, %rcx
-; X64-NEXT: movq %rdx, %r11
-; X64-NEXT: adcq $0, %r11
-; X64-NEXT: addq %rax, %rcx
-; X64-NEXT: adcq %rdx, %r11
-; X64-NEXT: setb %al
-; X64-NEXT: addq %r8, %r11
-; X64-NEXT: movzbl %al, %r12d
-; X64-NEXT: adcq %rdx, %r12
-; X64-NEXT: movq %r13, %rax
-; X64-NEXT: imulq %r14
-; X64-NEXT: addq %rax, %rax
-; X64-NEXT: adcq %rdx, %rdx
-; X64-NEXT: addq %r11, %rax
-; X64-NEXT: adcq %r12, %rdx
-; X64-NEXT: addq %r8, %r15
-; X64-NEXT: adcq %rsi, %rcx
-; X64-NEXT: adcq %r9, %rax
-; X64-NEXT: adcq %rdi, %rdx
-; X64-NEXT: movq %rbx, %rsi
+; X64-NEXT: movq %rsi, %r10
+; X64-NEXT: sarq $63, %r10
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: addq {{[0-9]+}}(%rsp), %r11
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT: adcq $0, %rdx
+; X64-NEXT: adcq $0, %rdi
+; X64-NEXT: movq %rdi, %r8
+; X64-NEXT: sarq $63, %r8
+; X64-NEXT: addq %r9, %rdx
+; X64-NEXT: adcq %rsi, %rdi
+; X64-NEXT: movq %r10, %r9
+; X64-NEXT: adcq %r8, %r9
+; X64-NEXT: adcq %r10, %r8
+; X64-NEXT: addq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %r9
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %r8
+; X64-NEXT: movq %rcx, %rsi
; X64-NEXT: sarq $63, %rsi
-; X64-NEXT: xorq %rsi, %rax
-; X64-NEXT: xorq %rsi, %r15
-; X64-NEXT: orq %rax, %r15
-; X64-NEXT: xorq %rsi, %rdx
-; X64-NEXT: xorq %rcx, %rsi
-; X64-NEXT: orq %rdx, %rsi
-; X64-NEXT: orq %r15, %rsi
-; X64-NEXT: movl %r10d, %edx
+; X64-NEXT: xorq %rsi, %rdi
+; X64-NEXT: xorq %rsi, %r8
+; X64-NEXT: orq %rdi, %r8
+; X64-NEXT: xorq %rsi, %r9
+; X64-NEXT: xorq %rdx, %rsi
+; X64-NEXT: orq %r9, %rsi
+; X64-NEXT: orq %r8, %rsi
+; X64-NEXT: movl %r11d, %edx
; X64-NEXT: andl $1, %edx
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: negq %rcx
-; X64-NEXT: xorq %rcx, %rbx
-; X64-NEXT: xorq %r10, %rcx
-; X64-NEXT: orq %rbx, %rcx
-; X64-NEXT: orq %rsi, %rcx
+; X64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT: movq %rcx, 8(%rax)
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT: movq %rcx, (%rax)
+; X64-NEXT: movaps %xmm0, (%rax)
; X64-NEXT: movb %dl, 16(%rax)
+; X64-NEXT: negq %rdx
+; X64-NEXT: xorq %rdx, %rcx
+; X64-NEXT: xorq %r11, %rdx
+; X64-NEXT: orq %rcx, %rdx
+; X64-NEXT: orq %rsi, %rdx
; X64-NEXT: setne 32(%rax)
+; X64-NEXT: leaq -40(%rbp), %rsp
; X64-NEXT: popq %rbx
; X64-NEXT: popq %r12
; X64-NEXT: popq %r13
diff --git a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
index 13596e1b18768..631c3b3c44358 100644
--- a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
@@ -296,215 +296,121 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
; X64: ## %bb.0:
; X64-NEXT: pushq %rbp
; X64-NEXT: .cfi_def_cfa_offset 16
+; X64-NEXT: .cfi_offset %rbp, -16
+; X64-NEXT: movq %rsp, %rbp
+; X64-NEXT: .cfi_def_cfa_register %rbp
; X64-NEXT: pushq %r15
-; X64-NEXT: .cfi_def_cfa_offset 24
; X64-NEXT: pushq %r14
-; X64-NEXT: .cfi_def_cfa_offset 32
; X64-NEXT: pushq %r13
-; X64-NEXT: .cfi_def_cfa_offset 40
; X64-NEXT: pushq %r12
-; X64-NEXT: .cfi_def_cfa_offset 48
; X64-NEXT: pushq %rbx
-; X64-NEXT: .cfi_def_cfa_offset 56
+; X64-NEXT: andq $-32, %rsp
+; X64-NEXT: subq $192, %rsp
; X64-NEXT: .cfi_offset %rbx, -56
; X64-NEXT: .cfi_offset %r12, -48
; X64-NEXT: .cfi_offset %r13, -40
; X64-NEXT: .cfi_offset %r14, -32
; X64-NEXT: .cfi_offset %r15, -24
-; X64-NEXT: .cfi_offset %rbp, -16
-; X64-NEXT: movq %r8, %r12
-; X64-NEXT: movq %rcx, %rbx
-; X64-NEXT: movq %rdx, %r8
-; X64-NEXT: movq %rsi, %r10
-; X64-NEXT: movq %rdi, %r11
-; X64-NEXT: movq %rdx, %rax
+; X64-NEXT: movq %r9, %r14
+; X64-NEXT: movq %r8, %r13
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: mulq %r12
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: mulq %r12
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %r14
-; X64-NEXT: addq %rsi, %r14
-; X64-NEXT: adcq $0, %rcx
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %r13
-; X64-NEXT: addq %r14, %r13
-; X64-NEXT: adcq %rcx, %rsi
-; X64-NEXT: setb %al
-; X64-NEXT: movzbl %al, %ecx
-; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rdx, %r8
-; X64-NEXT: movq %rax, %r14
-; X64-NEXT: addq %rsi, %r14
-; X64-NEXT: adcq %rcx, %r8
-; X64-NEXT: movq %rbx, %rcx
-; X64-NEXT: sarq $63, %rcx
-; X64-NEXT: movq %r9, %rsi
-; X64-NEXT: imulq %rcx, %rsi
-; X64-NEXT: movq %r12, %rax
-; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rdx, %r15
-; X64-NEXT: addq %rax, %r15
-; X64-NEXT: addq %rsi, %r15
-; X64-NEXT: addq %rax, %r14
-; X64-NEXT: adcq %r8, %r15
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: mulq %r12
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: movq %r10, %rax
-; X64-NEXT: mulq %r12
-; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: movq %rax, %r12
-; X64-NEXT: addq %rsi, %r12
-; X64-NEXT: adcq $0, %rdi
-; X64-NEXT: movq %r11, %rax
-; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rdx, %rbx
-; X64-NEXT: addq %r12, %rax
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: adcq %rdi, %rbx
-; X64-NEXT: setb %dil
-; X64-NEXT: movq %r10, %rax
-; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rdx, %rbp
-; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: addq %rbx, %rsi
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %r8
-; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: adcq %rax, %rbp
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Folded Reload
-; X64-NEXT: adcq %r13, %rbp
-; X64-NEXT: adcq $0, %r14
-; X64-NEXT: adcq $0, %r15
-; X64-NEXT: movq %r15, %r12
+; X64-NEXT: movq %rsi, %r15
+; X64-NEXT: movq %rdi, %rbx
+; X64-NEXT: subq $8, %rsp
+; X64-NEXT: movq %rcx, %r12
; X64-NEXT: sarq $63, %r12
-; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: movq %r11, %rax
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: movq %r10, %rax
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %r13
-; X64-NEXT: movq %rax, %r9
-; X64-NEXT: addq %rdi, %r9
-; X64-NEXT: adcq $0, %r13
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; X64-NEXT: movq %r11, %rax
-; X64-NEXT: mulq %rdi
-; X64-NEXT: movq %rdi, %r11
-; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: addq %r9, %rax
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: adcq %r13, %rdi
-; X64-NEXT: setb %r8b
-; X64-NEXT: movq %r10, %rax
-; X64-NEXT: mulq %r11
-; X64-NEXT: movq %rdx, %r9
-; X64-NEXT: movq %rax, %r13
-; X64-NEXT: addq %rdi, %r13
-; X64-NEXT: movzbl %r8b, %eax
-; X64-NEXT: adcq %rax, %r9
-; X64-NEXT: movq %r11, %rdi
-; X64-NEXT: movq %r11, %r8
-; X64-NEXT: sarq $63, %rdi
-; X64-NEXT: imulq %rdi, %r10
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: mulq {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Folded Reload
-; X64-NEXT: movq %rdx, %r11
-; X64-NEXT: addq %r10, %r11
-; X64-NEXT: addq %rax, %r11
-; X64-NEXT: addq %rax, %r13
-; X64-NEXT: adcq %r9, %r11
-; X64-NEXT: addq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Folded Spill
-; X64-NEXT: adcq %rbp, %rbx
-; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: adcq $0, %r13
-; X64-NEXT: adcq $0, %r11
-; X64-NEXT: movq %r11, %rbp
-; X64-NEXT: sarq $63, %rbp
-; X64-NEXT: addq %r14, %r13
-; X64-NEXT: adcq %r15, %r11
-; X64-NEXT: movq %r12, %rax
-; X64-NEXT: adcq %rbp, %rax
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT: adcq %r12, %rbp
-; X64-NEXT: movq %r8, %rbx
-; X64-NEXT: imulq %rcx, %r8
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %r15
-; X64-NEXT: movq %r15, %rax
-; X64-NEXT: mulq %rcx
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rax, %r9
-; X64-NEXT: addq %rax, %rsi
-; X64-NEXT: addq %r8, %rsi
-; X64-NEXT: movq %rdi, %rcx
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 ## 8-byte Reload
-; X64-NEXT: imulq %r12, %rcx
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Reload
-; X64-NEXT: mulq %rdi
+; X64-NEXT: movq %rcx, %rdx
+; X64-NEXT: movq %r12, %rcx
+; X64-NEXT: movq %r12, %r8
+; X64-NEXT: movq %r13, %r9
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq %r14
+; X64-NEXT: callq ___multi5
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq %rbx, %rsi
+; X64-NEXT: movq %r15, %rdx
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: xorl %r8d, %r8d
+; X64-NEXT: movq %r13, %r9
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq %r14
+; X64-NEXT: callq ___multi5
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: movq 24(%rbp), %rax
; X64-NEXT: movq %rax, %r14
-; X64-NEXT: movq %rdx, %r10
-; X64-NEXT: addq %rcx, %r10
-; X64-NEXT: addq %rax, %r10
-; X64-NEXT: addq %r9, %r14
-; X64-NEXT: adcq %rsi, %r10
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movq %rdi, %rcx
-; X64-NEXT: mulq %r15
-; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: movq %r12, %rax
-; X64-NEXT: mulq %r15
-; X64-NEXT: movq %rdx, %r9
+; X64-NEXT: sarq $63, %r14
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq %rbx, %rsi
+; X64-NEXT: movq %r15, %rdx
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: xorl %r8d, %r8d
+; X64-NEXT: movq 16(%rbp), %r13
+; X64-NEXT: movq %r13, %r9
+; X64-NEXT: pushq %r14
+; X64-NEXT: pushq %r14
+; X64-NEXT: pushq %rax
; X64-NEXT: movq %rax, %r15
-; X64-NEXT: addq %rdi, %r15
-; X64-NEXT: adcq $0, %r9
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: mulq %rbx
-; X64-NEXT: movq %rdx, %r8
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: addq %r15, %rdi
-; X64-NEXT: adcq %r9, %r8
-; X64-NEXT: setb %cl
-; X64-NEXT: movq %r12, %rax
-; X64-NEXT: mulq %rbx
-; X64-NEXT: addq %r8, %rax
-; X64-NEXT: movzbl %cl, %ecx
-; X64-NEXT: adcq %rcx, %rdx
-; X64-NEXT: addq %r14, %rax
-; X64-NEXT: adcq %r10, %rdx
-; X64-NEXT: addq %r13, %rsi
-; X64-NEXT: adcq %r11, %rdi
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Folded Reload
-; X64-NEXT: adcq %rbp, %rdx
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 ## 8-byte Reload
-; X64-NEXT: movq %r8, %rcx
-; X64-NEXT: sarq $63, %rcx
-; X64-NEXT: xorq %rcx, %rax
-; X64-NEXT: xorq %rcx, %rsi
-; X64-NEXT: orq %rax, %rsi
-; X64-NEXT: xorq %rcx, %rdx
-; X64-NEXT: xorq %rdi, %rcx
-; X64-NEXT: orq %rdx, %rcx
-; X64-NEXT: orq %rsi, %rcx
+; X64-NEXT: callq ___multi5
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx ## 8-byte Reload
+; X64-NEXT: movq %r12, %rcx
+; X64-NEXT: movq %r12, %r8
+; X64-NEXT: movq %r13, %r9
+; X64-NEXT: pushq %r14
+; X64-NEXT: pushq %r14
+; X64-NEXT: pushq %r15
+; X64-NEXT: callq ___multi5
+; X64-NEXT: addq $32, %rsp
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r8
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; X64-NEXT: movq %r8, 24(%rax)
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload
-; X64-NEXT: movq %rcx, (%rax)
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload
-; X64-NEXT: movq %rcx, 8(%rax)
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload
-; X64-NEXT: movq %rcx, 16(%rax)
+; X64-NEXT: addq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT: adcq $0, %r9
+; X64-NEXT: adcq $0, %r8
+; X64-NEXT: movq %r8, %r10
+; X64-NEXT: sarq $63, %r10
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT: addq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT: adcq $0, %rdx
+; X64-NEXT: adcq $0, %rsi
+; X64-NEXT: movq %rsi, %rdi
+; X64-NEXT: sarq $63, %rdi
+; X64-NEXT: addq %r9, %rdx
+; X64-NEXT: adcq %r8, %rsi
+; X64-NEXT: movq %r10, %r8
+; X64-NEXT: adcq %rdi, %r8
+; X64-NEXT: adcq %r10, %rdi
+; X64-NEXT: addq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %r8
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; X64-NEXT: movq 32(%rbp), %r9
+; X64-NEXT: movq %rax, 24(%r9)
+; X64-NEXT: sarq $63, %rax
+; X64-NEXT: xorq %rax, %rsi
+; X64-NEXT: xorq %rax, %rdi
+; X64-NEXT: orq %rsi, %rdi
+; X64-NEXT: xorq %rax, %r8
+; X64-NEXT: xorq %rdx, %rax
+; X64-NEXT: orq %r8, %rax
+; X64-NEXT: orq %rdi, %rax
; X64-NEXT: setne %al
+; X64-NEXT: movaps %xmm0, (%r9)
+; X64-NEXT: movq %rcx, 16(%r9)
+; X64-NEXT: leaq -40(%rbp), %rsp
; X64-NEXT: popq %rbx
; X64-NEXT: popq %r12
; X64-NEXT: popq %r13
diff --git a/llvm/test/CodeGen/X86/udivmodei5.ll b/llvm/test/CodeGen/X86/udivmodei5.ll
index 2c30357180e40..dd76235e29eef 100644
--- a/llvm/test/CodeGen/X86/udivmodei5.ll
+++ b/llvm/test/CodeGen/X86/udivmodei5.ll
@@ -1,10 +1,295 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64
; On i686, this is expanded into a loop. On x86_64, this calls __udivti3.
define i65 @udiv65(i65 %a, i65 %b) nounwind {
; X86-LABEL: udiv65:
-; X86-NOT: call
+; X86: # %bb.0: # %_udiv-special-cases
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $128, %esp
+; X86-NEXT: movl 16(%ebp), %ecx
+; X86-NEXT: andl $1, %ecx
+; X86-NEXT: movl 20(%ebp), %ebx
+; X86-NEXT: movl 28(%ebp), %edi
+; X86-NEXT: movl %edi, %esi
+; X86-NEXT: andl $1, %esi
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: movl 24(%ebp), %edx
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: sete (%esp) # 1-byte Folded Spill
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: shldl $31, %edx, %edi
+; X86-NEXT: shldl $31, %ebx, %edx
+; X86-NEXT: testl %edi, %edi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: jne .LBB0_1
+; X86-NEXT: # %bb.2: # %_udiv-special-cases
+; X86-NEXT: bsrl %edx, %eax
+; X86-NEXT: xorl $31, %eax
+; X86-NEXT: orl $32, %eax
+; X86-NEXT: jmp .LBB0_3
+; X86-NEXT: .LBB0_1:
+; X86-NEXT: bsrl %edi, %eax
+; X86-NEXT: xorl $31, %eax
+; X86-NEXT: .LBB0_3: # %_udiv-special-cases
+; X86-NEXT: shll $31, %ebx
+; X86-NEXT: movl $64, %esi
+; X86-NEXT: jne .LBB0_4
+; X86-NEXT: # %bb.5: # %_udiv-special-cases
+; X86-NEXT: movl $64, %ebx
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: je .LBB0_7
+; X86-NEXT: jmp .LBB0_8
+; X86-NEXT: .LBB0_4:
+; X86-NEXT: bsrl %ebx, %ebx
+; X86-NEXT: xorl $31, %ebx
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: jne .LBB0_8
+; X86-NEXT: .LBB0_7: # %_udiv-special-cases
+; X86-NEXT: addl $64, %ebx
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: .LBB0_8: # %_udiv-special-cases
+; X86-NEXT: movl 16(%ebp), %edi
+; X86-NEXT: shldl $31, %ecx, %edi
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: movl 8(%ebp), %ebx
+; X86-NEXT: shldl $31, %ebx, %edx
+; X86-NEXT: testl %edi, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: jne .LBB0_9
+; X86-NEXT: # %bb.10: # %_udiv-special-cases
+; X86-NEXT: bsrl %edx, %edi
+; X86-NEXT: xorl $31, %edi
+; X86-NEXT: orl $32, %edi
+; X86-NEXT: jmp .LBB0_11
+; X86-NEXT: .LBB0_9:
+; X86-NEXT: bsrl %edi, %edi
+; X86-NEXT: xorl $31, %edi
+; X86-NEXT: .LBB0_11: # %_udiv-special-cases
+; X86-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NEXT: shll $31, %ebx
+; X86-NEXT: je .LBB0_13
+; X86-NEXT: # %bb.12:
+; X86-NEXT: bsrl %ebx, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: .LBB0_13: # %_udiv-special-cases
+; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: jne .LBB0_15
+; X86-NEXT: # %bb.14: # %_udiv-special-cases
+; X86-NEXT: addl $64, %esi
+; X86-NEXT: movl %esi, %edi
+; X86-NEXT: .LBB0_15: # %_udiv-special-cases
+; X86-NEXT: movl $0, (%esp) # 4-byte Folded Spill
+; X86-NEXT: subl %edi, %eax
+; X86-NEXT: movl $0, %edi
+; X86-NEXT: sbbl %edi, %edi
+; X86-NEXT: movl $0, %ebx
+; X86-NEXT: sbbl %ebx, %ebx
+; X86-NEXT: andl $1, %ebx
+; X86-NEXT: testb %cl, %cl
+; X86-NEXT: jne .LBB0_16
+; X86-NEXT: # %bb.17: # %select.false.sink
+; X86-NEXT: movl $64, %ecx
+; X86-NEXT: cmpl %eax, %ecx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %edi, %ecx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %ebx, %ecx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %ecx, %ecx
+; X86-NEXT: setb %cl
+; X86-NEXT: .LBB0_18: # %select.end
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: testb %cl, %cl
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: jne .LBB0_20
+; X86-NEXT: # %bb.19: # %select.end
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-NEXT: movl 8(%ebp), %edx
+; X86-NEXT: movl %esi, %ecx
+; X86-NEXT: .LBB0_20: # %select.end
+; X86-NEXT: jne .LBB0_21
+; X86-NEXT: # %bb.27: # %select.end
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: xorl $64, %esi
+; X86-NEXT: orl %ebx, %esi
+; X86-NEXT: orl %edi, %esi
+; X86-NEXT: movl (%esp), %esi # 4-byte Reload
+; X86-NEXT: je .LBB0_28
+; X86-NEXT: # %bb.25: # %udiv-bb1
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: addl $1, %edx
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl %edi, (%esp) # 4-byte Spill
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: andl $1, %ebx
+; X86-NEXT: movl 8(%ebp), %ecx
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movb $64, %cl
+; X86-NEXT: subb %al, %cl
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: negb %al
+; X86-NEXT: movsbl %al, %eax
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 96(%esp,%eax), %edi
+; X86-NEXT: movl 100(%esp,%eax), %esi
+; X86-NEXT: movl 104(%esp,%eax), %eax
+; X86-NEXT: shldl %cl, %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: shll %cl, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: orl (%esp), %edx # 4-byte Folded Reload
+; X86-NEXT: je .LBB0_26
+; X86-NEXT: # %bb.22: # %udiv-preheader
+; X86-NEXT: andl $1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 12(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: movzbl %al, %edx
+; X86-NEXT: movl 56(%esp,%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 48(%esp,%edx), %eax
+; X86-NEXT: movl 52(%esp,%edx), %edi
+; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shrdl %cl, %edx, %ebx
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shrdl %cl, %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 20(%ebp), %ecx
+; X86-NEXT: addl $-1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 24(%ebp), %ecx
+; X86-NEXT: adcl $-1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: adcl $1, %eax
+; X86-NEXT: andl $1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl %ebx, %edi
+; X86-NEXT: movl %esi, %ecx
+; X86-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: .p2align 4
+; X86-NEXT: .LBB0_23: # %udiv-do-while
+; X86-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %edi
+; X86-NEXT: shrl $31, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl $1, %ecx
+; X86-NEXT: leal (%ecx,%eax,2), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl $1, %ebx, %ecx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: shrl $31, %esi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: addl %ebx, %ebx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl $1, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmpl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: sbbl %edi, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: andl $1, %esi
+; X86-NEXT: negl %esi
+; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: andl $1, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: andl 24(%ebp), %ebx
+; X86-NEXT: andl 20(%ebp), %esi
+; X86-NEXT: subl %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NEXT: sbbl %ebx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: addl $-1, %esi
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: adcl $1, %edx
+; X86-NEXT: andl $1, %edx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: jne .LBB0_23
+; X86-NEXT: .LBB0_24: # %udiv-loop-exit
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %esi
+; X86-NEXT: shrl $31, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: leal (%edx,%eax,2), %edx
+; X86-NEXT: .LBB0_28: # %udiv-end
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl
+; X86-NEXT: .LBB0_16:
+; X86-NEXT: movb $1, %cl
+; X86-NEXT: jmp .LBB0_18
+; X86-NEXT: .LBB0_26:
+; X86-NEXT: movl %esi, %ecx
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: jmp .LBB0_24
+; X86-NEXT: .LBB0_21:
+; X86-NEXT: movl (%esp), %esi # 4-byte Reload
+; X86-NEXT: jmp .LBB0_28
;
; X64-LABEL: udiv65:
; X64: # %bb.0:
@@ -18,53 +303,4712 @@ define i65 @udiv65(i65 %a, i65 %b) nounwind {
ret i65 %res
}
+; On both i686 and x86_64, i129 division is expanded inline (ExpandLargeDivRem).
+; MaxDivRemBitWidthSupported=128 on x86_64, so i129 exceeds the DAG limit.
define i129 @udiv129(i129 %a, i129 %b) nounwind {
; X86-LABEL: udiv129:
-; X86-NOT: call
+; X86: # %bb.0: # %_udiv-special-cases
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $240, %esp
+; X86-NEXT: movl 28(%ebp), %edx
+; X86-NEXT: andl $1, %edx
+; X86-NEXT: movl 48(%ebp), %eax
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: andl $1, %edi
+; X86-NEXT: movl 36(%ebp), %ecx
+; X86-NEXT: orl 44(%ebp), %ecx
+; X86-NEXT: movl 40(%ebp), %ebx
+; X86-NEXT: movl 32(%ebp), %esi
+; X86-NEXT: orl %ebx, %esi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edi, %esi
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: movl 16(%ebp), %ecx
+; X86-NEXT: orl 24(%ebp), %ecx
+; X86-NEXT: movl 12(%ebp), %esi
+; X86-NEXT: orl 20(%ebp), %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: movl 44(%ebp), %edx
+; X86-NEXT: shldl $31, %edx, %eax
+; X86-NEXT: shldl $31, %ebx, %edx
+; X86-NEXT: testl %eax, %eax
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: jne .LBB1_1
+; X86-NEXT: # %bb.2: # %_udiv-special-cases
+; X86-NEXT: bsrl %edx, %ebx
+; X86-NEXT: xorl $31, %ebx
+; X86-NEXT: orl $32, %ebx
+; X86-NEXT: jmp .LBB1_3
+; X86-NEXT: .LBB1_1:
+; X86-NEXT: bsrl %eax, %ebx
+; X86-NEXT: xorl $31, %ebx
+; X86-NEXT: .LBB1_3: # %_udiv-special-cases
+; X86-NEXT: movl 36(%ebp), %edi
+; X86-NEXT: shldl $31, %edi, %esi
+; X86-NEXT: movl 32(%ebp), %ecx
+; X86-NEXT: shldl $31, %ecx, %edi
+; X86-NEXT: testl %esi, %esi
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: jne .LBB1_4
+; X86-NEXT: # %bb.5: # %_udiv-special-cases
+; X86-NEXT: bsrl %edi, %ecx
+; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: orl $32, %ecx
+; X86-NEXT: jmp .LBB1_6
+; X86-NEXT: .LBB1_4:
+; X86-NEXT: bsrl %esi, %ecx
+; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: .LBB1_6: # %_udiv-special-cases
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: orl %eax, %ebx
+; X86-NEXT: jne .LBB1_8
+; X86-NEXT: # %bb.7: # %_udiv-special-cases
+; X86-NEXT: orl $64, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: .LBB1_8: # %_udiv-special-cases
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT: orb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: movl 32(%ebp), %eax
+; X86-NEXT: shll $31, %eax
+; X86-NEXT: bsrl %eax, %ecx
+; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: testl %eax, %eax
+; X86-NEXT: movl 24(%ebp), %eax
+; X86-NEXT: movl 20(%ebp), %ebx
+; X86-NEXT: je .LBB1_9
+; X86-NEXT: # %bb.10: # %_udiv-special-cases
+; X86-NEXT: je .LBB1_11
+; X86-NEXT: .LBB1_12: # %_udiv-special-cases
+; X86-NEXT: orl %esi, %edi
+; X86-NEXT: jne .LBB1_14
+; X86-NEXT: .LBB1_13: # %_udiv-special-cases
+; X86-NEXT: subl $-128, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: .LBB1_14: # %_udiv-special-cases
+; X86-NEXT: movl 28(%ebp), %ecx
+; X86-NEXT: shldl $31, %eax, %ecx
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: shldl $31, %ebx, %edi
+; X86-NEXT: testl %ecx, %ecx
+; X86-NEXT: movl 12(%ebp), %eax
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: jne .LBB1_15
+; X86-NEXT: # %bb.16: # %_udiv-special-cases
+; X86-NEXT: bsrl %edi, %edx
+; X86-NEXT: xorl $31, %edx
+; X86-NEXT: orl $32, %edx
+; X86-NEXT: jmp .LBB1_17
+; X86-NEXT: .LBB1_9: # %_udiv-special-cases
+; X86-NEXT: movl $64, %ecx
+; X86-NEXT: jne .LBB1_12
+; X86-NEXT: .LBB1_11: # %_udiv-special-cases
+; X86-NEXT: movl $128, %ecx
+; X86-NEXT: orl %esi, %edi
+; X86-NEXT: je .LBB1_13
+; X86-NEXT: jmp .LBB1_14
+; X86-NEXT: .LBB1_15:
+; X86-NEXT: bsrl %ecx, %edx
+; X86-NEXT: xorl $31, %edx
+; X86-NEXT: .LBB1_17: # %_udiv-special-cases
+; X86-NEXT: movl 16(%ebp), %ecx
+; X86-NEXT: shldl $31, %ecx, %ebx
+; X86-NEXT: shldl $31, %eax, %ecx
+; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: jne .LBB1_18
+; X86-NEXT: # %bb.19: # %_udiv-special-cases
+; X86-NEXT: bsrl %ecx, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: orl $32, %esi
+; X86-NEXT: jmp .LBB1_20
+; X86-NEXT: .LBB1_18:
+; X86-NEXT: bsrl %ebx, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: .LBB1_20: # %_udiv-special-cases
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: jne .LBB1_22
+; X86-NEXT: # %bb.21: # %_udiv-special-cases
+; X86-NEXT: orl $64, %esi
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: .LBB1_22: # %_udiv-special-cases
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: orl %edi, %ecx
+; X86-NEXT: movl 12(%ebp), %eax
+; X86-NEXT: shll $31, %eax
+; X86-NEXT: jne .LBB1_23
+; X86-NEXT: # %bb.24: # %_udiv-special-cases
+; X86-NEXT: movl $128, %eax
+; X86-NEXT: orl %ebx, %ecx
+; X86-NEXT: je .LBB1_26
+; X86-NEXT: jmp .LBB1_27
+; X86-NEXT: .LBB1_23:
+; X86-NEXT: bsrl %eax, %eax
+; X86-NEXT: xorl $31, %eax
+; X86-NEXT: orl %ebx, %ecx
+; X86-NEXT: jne .LBB1_27
+; X86-NEXT: .LBB1_26: # %_udiv-special-cases
+; X86-NEXT: subl $-128, %eax
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: .LBB1_27: # %_udiv-special-cases
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: subl %edx, %edi
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: sbbl %edx, %edx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %ecx, %ecx
+; X86-NEXT: movl $0, %esi
+; X86-NEXT: sbbl %esi, %esi
+; X86-NEXT: movl $0, %ebx
+; X86-NEXT: sbbl %ebx, %ebx
+; X86-NEXT: andl $1, %ebx
+; X86-NEXT: cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: jne .LBB1_28
+; X86-NEXT: # %bb.29: # %select.false.sink
+; X86-NEXT: movl $128, %eax
+; X86-NEXT: cmpl %edi, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %edx, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %ecx, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %esi, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: movl %ebx, %edi
+; X86-NEXT: sbbl %ebx, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: setb %cl
+; X86-NEXT: .LBB1_30: # %select.end
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: testb %cl, %cl
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: movl $0, %ebx
+; X86-NEXT: jne .LBB1_32
+; X86-NEXT: # %bb.31: # %select.end
+; X86-NEXT: movl 16(%ebp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 20(%ebp), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 24(%ebp), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: .LBB1_32: # %select.end
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: jne .LBB1_33
+; X86-NEXT: # %bb.39: # %select.end
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %esi, %edx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: xorl $128, %esi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: orl %edi, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: je .LBB1_40
+; X86-NEXT: # %bb.37: # %udiv-bb1
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: addl $1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: andl $1, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 12(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 16(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 20(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 24(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movb $-128, %cl
+; X86-NEXT: subb %bl, %cl
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $28, %al
+; X86-NEXT: negb %al
+; X86-NEXT: movsbl %al, %edx
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 200(%esp,%edx), %eax
+; X86-NEXT: movl 204(%esp,%edx), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %eax, %ebx
+; X86-NEXT: movl 196(%esp,%edx), %esi
+; X86-NEXT: shldl %cl, %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 192(%esp,%edx), %eax
+; X86-NEXT: movl 208(%esp,%edx), %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl %cl, %edx, %edi
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shll %cl, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: je .LBB1_38
+; X86-NEXT: # %bb.34: # %udiv-preheader
+; X86-NEXT: andl $1, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 12(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 16(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 20(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 24(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $5, %al
+; X86-NEXT: movzbl %al, %edx
+; X86-NEXT: movl 112(%esp,%edx,4), %eax
+; X86-NEXT: movl 108(%esp,%edx,4), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shrdl %cl, %eax, %edi
+; X86-NEXT: movl 104(%esp,%edx,4), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shrdl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 96(%esp,%edx,4), %eax
+; X86-NEXT: movl 100(%esp,%edx,4), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shrdl %cl, %esi, %edx
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shrdl %cl, %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%ebp), %eax
+; X86-NEXT: addl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 40(%ebp), %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 44(%ebp), %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: adcl $1, %esi
+; X86-NEXT: andl $1, %esi
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: .p2align 4
+; X86-NEXT: .LBB1_35: # %udiv-do-while
+; X86-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl $1, %edx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: shrl $31, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl $1, %eax
+; X86-NEXT: leal (%eax,%edx,2), %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: orl %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %esi
+; X86-NEXT: orl %edi, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %eax
+; X86-NEXT: orl %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shrl $31, %ebx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: addl %esi, %esi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl $1, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: sbbl %ecx, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: andl $1, %ebx
+; X86-NEXT: negl %ebx
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: andl $1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, %edi
+; X86-NEXT: andl 44(%ebp), %edi
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: andl 40(%ebp), %eax
+; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: andl 36(%ebp), %esi
+; X86-NEXT: andl 32(%ebp), %ebx
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: subl %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: sbbl %esi, %edx
+; X86-NEXT: sbbl %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: addl $-1, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: adcl $-1, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ebx
+; X86-NEXT: adcl $1, %ecx
+; X86-NEXT: andl $1, %ecx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %ebx, %eax
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edi, %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: jne .LBB1_35
+; X86-NEXT: .LBB1_36: # %udiv-loop-exit
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $1, %edi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: leal (%ebx,%edi,2), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shrl $31, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: .LBB1_40: # %udiv-end
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, (%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, 4(%eax)
+; X86-NEXT: movl %edx, 8(%eax)
+; X86-NEXT: movl %ecx, 12(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movb %cl, 16(%eax)
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl $4
+; X86-NEXT: .LBB1_28:
+; X86-NEXT: movl %ebx, %edi
+; X86-NEXT: movb $1, %cl
+; X86-NEXT: jmp .LBB1_30
+; X86-NEXT: .LBB1_38:
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: jmp .LBB1_36
+; X86-NEXT: .LBB1_33:
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: jmp .LBB1_40
;
; X64-LABEL: udiv129:
-; X64-NOT: call
+; X64: # %bb.0: # %_udiv-special-cases
+; X64-NEXT: pushq %rbp
+; X64-NEXT: movq %rsp, %rbp
+; X64-NEXT: pushq %r15
+; X64-NEXT: pushq %r14
+; X64-NEXT: pushq %r13
+; X64-NEXT: pushq %r12
+; X64-NEXT: pushq %rbx
+; X64-NEXT: andq $-32, %rsp
+; X64-NEXT: subq $224, %rsp
+; X64-NEXT: movq %rcx, %r13
+; X64-NEXT: movl %edx, %r14d
+; X64-NEXT: andl $1, %r14d
+; X64-NEXT: movl %r9d, %r12d
+; X64-NEXT: andl $1, %r12d
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: orq %r12, %rax
+; X64-NEXT: orq %r8, %rax
+; X64-NEXT: sete %cl
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: orq %r14, %rax
+; X64-NEXT: orq %rsi, %rax
+; X64-NEXT: sete %al
+; X64-NEXT: orb %cl, %al
+; X64-NEXT: shldq $63, %r8, %r9
+; X64-NEXT: bsrq %r9, %rcx
+; X64-NEXT: xorq $63, %rcx
+; X64-NEXT: movq %r8, %r11
+; X64-NEXT: shldq $63, %r13, %r11
+; X64-NEXT: bsrq %r11, %r10
+; X64-NEXT: xorq $63, %r10
+; X64-NEXT: orq $64, %r10
+; X64-NEXT: testq %r9, %r9
+; X64-NEXT: cmovneq %rcx, %r10
+; X64-NEXT: movq %r13, %rcx
+; X64-NEXT: shlq $63, %rcx
+; X64-NEXT: bsrq %rcx, %r15
+; X64-NEXT: xorq $63, %r15
+; X64-NEXT: testq %rcx, %rcx
+; X64-NEXT: movl $128, %ecx
+; X64-NEXT: cmoveq %rcx, %r15
+; X64-NEXT: subq $-128, %r15
+; X64-NEXT: orq %r9, %r11
+; X64-NEXT: cmovneq %r10, %r15
+; X64-NEXT: shldq $63, %rsi, %rdx
+; X64-NEXT: bsrq %rdx, %r11
+; X64-NEXT: xorq $63, %r11
+; X64-NEXT: movq %rsi, %r9
+; X64-NEXT: shldq $63, %rdi, %r9
+; X64-NEXT: bsrq %r9, %r10
+; X64-NEXT: xorq $63, %r10
+; X64-NEXT: orq $64, %r10
+; X64-NEXT: testq %rdx, %rdx
+; X64-NEXT: cmovneq %r11, %r10
+; X64-NEXT: movq %rdi, %rbx
+; X64-NEXT: shlq $63, %rbx
+; X64-NEXT: bsrq %rbx, %r11
+; X64-NEXT: xorq $63, %r11
+; X64-NEXT: testq %rbx, %rbx
+; X64-NEXT: cmoveq %rcx, %r11
+; X64-NEXT: subq $-128, %r11
+; X64-NEXT: orq %rdx, %r9
+; X64-NEXT: cmovneq %r10, %r11
+; X64-NEXT: xorl %r9d, %r9d
+; X64-NEXT: subq %r11, %r15
+; X64-NEXT: movl $0, %ebx
+; X64-NEXT: sbbq %rbx, %rbx
+; X64-NEXT: sbbq %r9, %r9
+; X64-NEXT: andl $1, %r9d
+; X64-NEXT: testb %al, %al
+; X64-NEXT: jne .LBB1_1
+; X64-NEXT: # %bb.2: # %select.false.sink
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpq %r15, %rcx
+; X64-NEXT: movl $0, %ecx
+; X64-NEXT: sbbq %rbx, %rcx
+; X64-NEXT: movl $0, %ecx
+; X64-NEXT: sbbq %r9, %rcx
+; X64-NEXT: sbbq %rax, %rax
+; X64-NEXT: setb %al
+; X64-NEXT: .LBB1_3: # %select.end
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: testb %al, %al
+; X64-NEXT: movq %rsi, %rdx
+; X64-NEXT: cmovneq %rcx, %rdx
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: cmovneq %rcx, %rax
+; X64-NEXT: cmoveq %r14, %rcx
+; X64-NEXT: jne .LBB1_9
+; X64-NEXT: # %bb.4: # %select.end
+; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r15, %r10
+; X64-NEXT: xorq $128, %r10
+; X64-NEXT: orq %r9, %r10
+; X64-NEXT: orq %rbx, %r10
+; X64-NEXT: je .LBB1_9
+; X64-NEXT: # %bb.5: # %udiv-bb1
+; X64-NEXT: movq %r15, %rdx
+; X64-NEXT: addq $1, %rdx
+; X64-NEXT: adcq $0, %rbx
+; X64-NEXT: adcq $0, %r9
+; X64-NEXT: andl $1, %r9d
+; X64-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NEXT: xorps %xmm0, %xmm0
+; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %rsi, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %r14, {{[0-9]+}}(%rsp)
+; X64-NEXT: movb $-128, %cl
+; X64-NEXT: subb %r15b, %cl
+; X64-NEXT: movl %ecx, %eax
+; X64-NEXT: shrb $3, %al
+; X64-NEXT: andb $24, %al
+; X64-NEXT: negb %al
+; X64-NEXT: movsbq %al, %r10
+; X64-NEXT: movq 160(%rsp,%r10), %rax
+; X64-NEXT: movq 168(%rsp,%r10), %r12
+; X64-NEXT: movq 176(%rsp,%r10), %r11
+; X64-NEXT: shldq %cl, %r12, %r11
+; X64-NEXT: shldq %cl, %rax, %r12
+; X64-NEXT: shlq %cl, %rax
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: orq %r9, %rcx
+; X64-NEXT: orq %rbx, %rcx
+; X64-NEXT: je .LBB1_10
+; X64-NEXT: # %bb.6: # %udiv-preheader
+; X64-NEXT: andl $1, %r11d
+; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %rsi, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %r14, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movl %edx, %ecx
+; X64-NEXT: shrb $6, %cl
+; X64-NEXT: movzbl %cl, %ecx
+; X64-NEXT: movq 80(%rsp,%rcx,8), %rdi
+; X64-NEXT: movq 64(%rsp,%rcx,8), %rsi
+; X64-NEXT: movq 72(%rsp,%rcx,8), %r10
+; X64-NEXT: movq %r10, %r14
+; X64-NEXT: movl %edx, %ecx
+; X64-NEXT: shrdq %cl, %rdi, %r14
+; X64-NEXT: shrdq %cl, %r10, %rsi
+; X64-NEXT: movq %r13, %rcx
+; X64-NEXT: addq $-1, %rcx
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r8, %rcx
+; X64-NEXT: adcq $-1, %rcx
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: adcq $1, %rcx
+; X64-NEXT: andl $1, %ecx
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: xorl %r13d, %r13d
+; X64-NEXT: xorl %edi, %edi
+; X64-NEXT: xorl %r15d, %r15d
+; X64-NEXT: movq %r14, %r10
+; X64-NEXT: movq %r12, %rcx
+; X64-NEXT: .p2align 4
+; X64-NEXT: .LBB1_7: # %udiv-do-while
+; X64-NEXT: # =>This Inner Loop Header: Depth=1
+; X64-NEXT: shldq $1, %rsi, %r10
+; X64-NEXT: shrq $63, %r14
+; X64-NEXT: andl $1, %r11d
+; X64-NEXT: leaq (%r11,%rsi,2), %rsi
+; X64-NEXT: shldq $1, %rax, %rcx
+; X64-NEXT: orq %rdi, %rcx
+; X64-NEXT: shrq $63, %r12
+; X64-NEXT: addq %rax, %rax
+; X64-NEXT: orq %r13, %rax
+; X64-NEXT: orl %r12d, %r15d
+; X64-NEXT: movl %r15d, %r11d
+; X64-NEXT: andl $1, %r11d
+; X64-NEXT: cmpq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT: sbbq %r10, %rdi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT: sbbq %r14, %rdi
+; X64-NEXT: andl $1, %edi
+; X64-NEXT: negq %rdi
+; X64-NEXT: movl %edi, %r13d
+; X64-NEXT: andl $1, %r13d
+; X64-NEXT: movq %rdi, %r8
+; X64-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; X64-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
+; X64-NEXT: subq %rdi, %rsi
+; X64-NEXT: sbbq %r8, %r10
+; X64-NEXT: addq $-1, %rdx
+; X64-NEXT: adcq $-1, %rbx
+; X64-NEXT: adcq $1, %r9
+; X64-NEXT: andl $1, %r9d
+; X64-NEXT: movq %rdx, %rdi
+; X64-NEXT: orq %r9, %rdi
+; X64-NEXT: orq %rbx, %rdi
+; X64-NEXT: movl $0, %edi
+; X64-NEXT: movl $0, %r15d
+; X64-NEXT: movq %r10, %r14
+; X64-NEXT: movq %rcx, %r12
+; X64-NEXT: jne .LBB1_7
+; X64-NEXT: .LBB1_8: # %udiv-loop-exit
+; X64-NEXT: movq %rcx, %rdx
+; X64-NEXT: shldq $1, %rax, %rdx
+; X64-NEXT: shrq $63, %rcx
+; X64-NEXT: leaq (%r13,%rax,2), %rax
+; X64-NEXT: .LBB1_9: # %udiv-end
+; X64-NEXT: leaq -40(%rbp), %rsp
+; X64-NEXT: popq %rbx
+; X64-NEXT: popq %r12
+; X64-NEXT: popq %r13
+; X64-NEXT: popq %r14
+; X64-NEXT: popq %r15
+; X64-NEXT: popq %rbp
+; X64-NEXT: retq
+; X64-NEXT: .LBB1_1:
+; X64-NEXT: movb $1, %al
+; X64-NEXT: jmp .LBB1_3
+; X64-NEXT: .LBB1_10:
+; X64-NEXT: xorl %r13d, %r13d
+; X64-NEXT: movq %r12, %rcx
+; X64-NEXT: jmp .LBB1_8
%res = udiv i129 %a, %b
ret i129 %res
}
define i129 @urem129(i129 %a, i129 %b) nounwind {
; X86-LABEL: urem129:
-; X86-NOT: call
+; X86: # %bb.0: # %_udiv-special-cases
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $240, %esp
+; X86-NEXT: movl 28(%ebp), %ebx
+; X86-NEXT: andl $1, %ebx
+; X86-NEXT: movl 48(%ebp), %edx
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: andl $1, %ecx
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: orl 44(%ebp), %eax
+; X86-NEXT: movl 40(%ebp), %edi
+; X86-NEXT: movl 32(%ebp), %esi
+; X86-NEXT: orl %edi, %esi
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: movl 16(%ebp), %eax
+; X86-NEXT: orl 24(%ebp), %eax
+; X86-NEXT: movl 12(%ebp), %esi
+; X86-NEXT: orl 20(%ebp), %esi
+; X86-NEXT: orl %ebx, %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: movl 44(%ebp), %ecx
+; X86-NEXT: shldl $31, %ecx, %edx
+; X86-NEXT: shldl $31, %edi, %ecx
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: jne .LBB2_1
+; X86-NEXT: # %bb.2: # %_udiv-special-cases
+; X86-NEXT: bsrl %ecx, %eax
+; X86-NEXT: xorl $31, %eax
+; X86-NEXT: orl $32, %eax
+; X86-NEXT: jmp .LBB2_3
+; X86-NEXT: .LBB2_1:
+; X86-NEXT: bsrl %edx, %eax
+; X86-NEXT: xorl $31, %eax
+; X86-NEXT: .LBB2_3: # %_udiv-special-cases
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, %esi
+; X86-NEXT: movl 36(%ebp), %edi
+; X86-NEXT: shldl $31, %edi, %esi
+; X86-NEXT: movl 32(%ebp), %eax
+; X86-NEXT: shldl $31, %eax, %edi
+; X86-NEXT: testl %esi, %esi
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: jne .LBB2_4
+; X86-NEXT: # %bb.5: # %_udiv-special-cases
+; X86-NEXT: bsrl %edi, %eax
+; X86-NEXT: xorl $31, %eax
+; X86-NEXT: orl $32, %eax
+; X86-NEXT: jmp .LBB2_6
+; X86-NEXT: .LBB2_4:
+; X86-NEXT: bsrl %esi, %eax
+; X86-NEXT: xorl $31, %eax
+; X86-NEXT: .LBB2_6: # %_udiv-special-cases
+; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: orl %edx, %ebx
+; X86-NEXT: jne .LBB2_8
+; X86-NEXT: # %bb.7: # %_udiv-special-cases
+; X86-NEXT: orl $64, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: .LBB2_8: # %_udiv-special-cases
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT: orb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: orl %ecx, %edi
+; X86-NEXT: movl 32(%ebp), %ecx
+; X86-NEXT: shll $31, %ecx
+; X86-NEXT: bsrl %ecx, %eax
+; X86-NEXT: xorl $31, %eax
+; X86-NEXT: testl %ecx, %ecx
+; X86-NEXT: movl 24(%ebp), %ecx
+; X86-NEXT: movl 20(%ebp), %ebx
+; X86-NEXT: je .LBB2_9
+; X86-NEXT: # %bb.10: # %_udiv-special-cases
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: je .LBB2_11
+; X86-NEXT: .LBB2_12: # %_udiv-special-cases
+; X86-NEXT: orl %esi, %edi
+; X86-NEXT: jne .LBB2_14
+; X86-NEXT: .LBB2_13: # %_udiv-special-cases
+; X86-NEXT: subl $-128, %eax
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: .LBB2_14: # %_udiv-special-cases
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 28(%ebp), %eax
+; X86-NEXT: shldl $31, %ecx, %eax
+; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: shldl $31, %ebx, %edi
+; X86-NEXT: testl %eax, %eax
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: jne .LBB2_15
+; X86-NEXT: # %bb.16: # %_udiv-special-cases
+; X86-NEXT: bsrl %edi, %edx
+; X86-NEXT: xorl $31, %edx
+; X86-NEXT: orl $32, %edx
+; X86-NEXT: jmp .LBB2_17
+; X86-NEXT: .LBB2_9: # %_udiv-special-cases
+; X86-NEXT: movl $64, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: jne .LBB2_12
+; X86-NEXT: .LBB2_11: # %_udiv-special-cases
+; X86-NEXT: movl $128, %eax
+; X86-NEXT: orl %esi, %edi
+; X86-NEXT: je .LBB2_13
+; X86-NEXT: jmp .LBB2_14
+; X86-NEXT: .LBB2_15:
+; X86-NEXT: bsrl %eax, %edx
+; X86-NEXT: xorl $31, %edx
+; X86-NEXT: .LBB2_17: # %_udiv-special-cases
+; X86-NEXT: movl 16(%ebp), %eax
+; X86-NEXT: shldl $31, %eax, %ebx
+; X86-NEXT: shldl $31, %ecx, %eax
+; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: jne .LBB2_18
+; X86-NEXT: # %bb.19: # %_udiv-special-cases
+; X86-NEXT: bsrl %eax, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: orl $32, %esi
+; X86-NEXT: jmp .LBB2_20
+; X86-NEXT: .LBB2_18:
+; X86-NEXT: bsrl %ebx, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: .LBB2_20: # %_udiv-special-cases
+; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: jne .LBB2_22
+; X86-NEXT: # %bb.21: # %_udiv-special-cases
+; X86-NEXT: orl $64, %esi
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: .LBB2_22: # %_udiv-special-cases
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: orl %edi, %eax
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: shll $31, %ecx
+; X86-NEXT: jne .LBB2_23
+; X86-NEXT: # %bb.24: # %_udiv-special-cases
+; X86-NEXT: movl $128, %ecx
+; X86-NEXT: orl %ebx, %eax
+; X86-NEXT: je .LBB2_26
+; X86-NEXT: jmp .LBB2_27
+; X86-NEXT: .LBB2_23:
+; X86-NEXT: bsrl %ecx, %ecx
+; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: orl %ebx, %eax
+; X86-NEXT: jne .LBB2_27
+; X86-NEXT: .LBB2_26: # %_udiv-special-cases
+; X86-NEXT: subl $-128, %ecx
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: .LBB2_27: # %_udiv-special-cases
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: subl %edx, %ecx
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: sbbl %edx, %edx
+; X86-NEXT: movl $0, %edi
+; X86-NEXT: sbbl %edi, %edi
+; X86-NEXT: movl $0, %ebx
+; X86-NEXT: sbbl %ebx, %ebx
+; X86-NEXT: movl $0, %esi
+; X86-NEXT: sbbl %esi, %esi
+; X86-NEXT: andl $1, %esi
+; X86-NEXT: cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: jne .LBB2_28
+; X86-NEXT: # %bb.29: # %select.false.sink
+; X86-NEXT: movl $128, %eax
+; X86-NEXT: cmpl %ecx, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %edx, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %edi, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %ebx, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %esi, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: setb %al
+; X86-NEXT: .LBB2_30: # %select.end
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: testb %al, %al
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: movl $0, %edi
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: movl $0, %ebx
+; X86-NEXT: jne .LBB2_32
+; X86-NEXT: # %bb.31: # %select.end
+; X86-NEXT: movl 16(%ebp), %ebx
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl 20(%ebp), %eax
+; X86-NEXT: movl 24(%ebp), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: .LBB2_32: # %select.end
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: jne .LBB2_38
+; X86-NEXT: # %bb.33: # %select.end
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: xorl $128, %ecx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: orl %edx, %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: je .LBB2_38
+; X86-NEXT: # %bb.34: # %udiv-bb1
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: addl $1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl $0, %edx
+; X86-NEXT: andl $1, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 12(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 16(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 20(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 24(%ebp), %ebx
+; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movb $-128, %cl
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: subb %al, %cl
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $28, %al
+; X86-NEXT: negb %al
+; X86-NEXT: movsbl %al, %eax
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 200(%esp,%eax), %esi
+; X86-NEXT: movl 204(%esp,%eax), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: movl 196(%esp,%eax), %edx
+; X86-NEXT: shldl %cl, %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 192(%esp,%eax), %esi
+; X86-NEXT: movl 208(%esp,%eax), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl %cl, %ebx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shll %cl, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: je .LBB2_39
+; X86-NEXT: # %bb.35: # %udiv-preheader
+; X86-NEXT: andl $1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl 12(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 16(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 20(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 24(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $5, %al
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: movl 112(%esp,%eax,4), %edx
+; X86-NEXT: movl 108(%esp,%eax,4), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shrdl %cl, %edx, %esi
+; X86-NEXT: movl 104(%esp,%eax,4), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shrdl %cl, %edx, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 96(%esp,%eax,4), %edx
+; X86-NEXT: movl 100(%esp,%eax,4), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shrdl %cl, %ebx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shrdl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%ebp), %ecx
+; X86-NEXT: addl $-1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 36(%ebp), %ecx
+; X86-NEXT: adcl $-1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 40(%ebp), %ecx
+; X86-NEXT: adcl $-1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 44(%ebp), %ecx
+; X86-NEXT: adcl $-1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl $1, %ecx
+; X86-NEXT: andl $1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: .p2align 4
+; X86-NEXT: .LBB2_36: # %udiv-do-while
+; X86-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: shrl $31, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl $1, %edx
+; X86-NEXT: leal (%edx,%eax,2), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: orl %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl $1, %ebx, %edx
+; X86-NEXT: orl %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %ebx
+; X86-NEXT: orl %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shrl $31, %edi
+; X86-NEXT: orl %esi, %edi
+; X86-NEXT: addl %edx, %edx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl $1, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmpl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: sbbl %ecx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: andl $1, %edi
+; X86-NEXT: negl %edi
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: andl $1, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, %esi
+; X86-NEXT: andl 44(%ebp), %esi
+; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: andl 40(%ebp), %ebx
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: andl 36(%ebp), %edx
+; X86-NEXT: andl 32(%ebp), %edi
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: subl %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: sbbl %ebx, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: sbbl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: addl $-1, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: adcl $-1, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: adcl $-1, %edi
+; X86-NEXT: adcl $1, %ecx
+; X86-NEXT: andl $1, %ecx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %ebx, %edi
+; X86-NEXT: jne .LBB2_36
+; X86-NEXT: .LBB2_37: # %udiv-loop-exit
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: leal (%esi,%eax,2), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shrl $31, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: .LBB2_38: # %udiv-end
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%ebp), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: addl %edi, %esi
+; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: movl 32(%ebp), %eax
+; X86-NEXT: mull %ebx
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %ecx, %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: setb %cl
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: mull %ebx
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movzbl %cl, %eax
+; X86-NEXT: adcl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 40(%ebp), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 44(%ebp), %eax
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: movl 40(%ebp), %eax
+; X86-NEXT: mull %ebx
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: adcl %ecx, %edx
+; X86-NEXT: imull 44(%ebp), %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %edx, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%ebp), %ebx
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %esi, %ebx
+; X86-NEXT: adcl %ecx, %edx
+; X86-NEXT: imull 36(%ebp), %edi
+; X86-NEXT: addl %edx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: imull 40(%ebp), %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: imull 32(%ebp), %eax
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: addl %edx, %eax
+; X86-NEXT: addl %edi, %eax
+; X86-NEXT: movl 12(%ebp), %edi
+; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl 16(%ebp), %edx
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 20(%ebp), %esi
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl 24(%ebp), %ecx
+; X86-NEXT: sbbl %ebx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: sbbl %eax, %ebx
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %edi, (%eax)
+; X86-NEXT: movl %edx, 4(%eax)
+; X86-NEXT: movl %esi, 8(%eax)
+; X86-NEXT: movl %ecx, 12(%eax)
+; X86-NEXT: andl $1, %ebx
+; X86-NEXT: movb %bl, 16(%eax)
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl $4
+; X86-NEXT: .LBB2_28:
+; X86-NEXT: movb $1, %al
+; X86-NEXT: jmp .LBB2_30
+; X86-NEXT: .LBB2_39:
+; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: jmp .LBB2_37
;
; X64-LABEL: urem129:
-; X64-NOT: call
+; X64: # %bb.0: # %_udiv-special-cases
+; X64-NEXT: pushq %rbp
+; X64-NEXT: movq %rsp, %rbp
+; X64-NEXT: pushq %r15
+; X64-NEXT: pushq %r14
+; X64-NEXT: pushq %r13
+; X64-NEXT: pushq %r12
+; X64-NEXT: pushq %rbx
+; X64-NEXT: andq $-32, %rsp
+; X64-NEXT: subq $224, %rsp
+; X64-NEXT: movq %r8, %r14
+; X64-NEXT: movq %rsi, %r10
+; X64-NEXT: movq %rdi, %r15
+; X64-NEXT: movl %edx, %r13d
+; X64-NEXT: andl $1, %r13d
+; X64-NEXT: movl %r9d, %esi
+; X64-NEXT: andl $1, %esi
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: orq %rsi, %rax
+; X64-NEXT: orq %r8, %rax
+; X64-NEXT: sete %sil
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: orq %r13, %rax
+; X64-NEXT: orq %r10, %rax
+; X64-NEXT: sete %al
+; X64-NEXT: orb %sil, %al
+; X64-NEXT: shldq $63, %r8, %r9
+; X64-NEXT: bsrq %r9, %r8
+; X64-NEXT: xorq $63, %r8
+; X64-NEXT: movq %r14, %rsi
+; X64-NEXT: shldq $63, %rcx, %rsi
+; X64-NEXT: bsrq %rsi, %rdi
+; X64-NEXT: xorq $63, %rdi
+; X64-NEXT: orq $64, %rdi
+; X64-NEXT: testq %r9, %r9
+; X64-NEXT: cmovneq %r8, %rdi
+; X64-NEXT: movq %rcx, %r8
+; X64-NEXT: shlq $63, %r8
+; X64-NEXT: bsrq %r8, %rbx
+; X64-NEXT: xorq $63, %rbx
+; X64-NEXT: testq %r8, %r8
+; X64-NEXT: movl $128, %r11d
+; X64-NEXT: cmoveq %r11, %rbx
+; X64-NEXT: subq $-128, %rbx
+; X64-NEXT: orq %r9, %rsi
+; X64-NEXT: cmovneq %rdi, %rbx
+; X64-NEXT: shldq $63, %r10, %rdx
+; X64-NEXT: bsrq %rdx, %rsi
+; X64-NEXT: xorq $63, %rsi
+; X64-NEXT: movq %r10, %rdi
+; X64-NEXT: shldq $63, %r15, %rdi
+; X64-NEXT: bsrq %rdi, %r8
+; X64-NEXT: xorq $63, %r8
+; X64-NEXT: orq $64, %r8
+; X64-NEXT: testq %rdx, %rdx
+; X64-NEXT: cmovneq %rsi, %r8
+; X64-NEXT: movq %r15, %rsi
+; X64-NEXT: shlq $63, %rsi
+; X64-NEXT: bsrq %rsi, %r9
+; X64-NEXT: xorq $63, %r9
+; X64-NEXT: testq %rsi, %rsi
+; X64-NEXT: cmoveq %r11, %r9
+; X64-NEXT: subq $-128, %r9
+; X64-NEXT: orq %rdx, %rdi
+; X64-NEXT: cmovneq %r8, %r9
+; X64-NEXT: xorl %edx, %edx
+; X64-NEXT: subq %r9, %rbx
+; X64-NEXT: movl $0, %r9d
+; X64-NEXT: sbbq %r9, %r9
+; X64-NEXT: sbbq %rdx, %rdx
+; X64-NEXT: andl $1, %edx
+; X64-NEXT: testb %al, %al
+; X64-NEXT: jne .LBB2_1
+; X64-NEXT: # %bb.2: # %select.false.sink
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpq %rbx, %r11
+; X64-NEXT: movl $0, %esi
+; X64-NEXT: sbbq %r9, %rsi
+; X64-NEXT: movl $0, %esi
+; X64-NEXT: sbbq %rdx, %rsi
+; X64-NEXT: sbbq %rax, %rax
+; X64-NEXT: setb %al
+; X64-NEXT: .LBB2_3: # %select.end
+; X64-NEXT: xorl %edi, %edi
+; X64-NEXT: testb %al, %al
+; X64-NEXT: movq %r10, %rsi
+; X64-NEXT: cmovneq %rdi, %rsi
+; X64-NEXT: movq %r15, %r8
+; X64-NEXT: cmovneq %rdi, %r8
+; X64-NEXT: cmoveq %r13, %rdi
+; X64-NEXT: movq %rcx, (%rsp) # 8-byte Spill
+; X64-NEXT: jne .LBB2_9
+; X64-NEXT: # %bb.4: # %select.end
+; X64-NEXT: movq %rbx, %rax
+; X64-NEXT: xorq $128, %rax
+; X64-NEXT: orq %rdx, %rax
+; X64-NEXT: orq %r9, %rax
+; X64-NEXT: je .LBB2_9
+; X64-NEXT: # %bb.5: # %udiv-bb1
+; X64-NEXT: movq %rbx, %rax
+; X64-NEXT: addq $1, %rax
+; X64-NEXT: adcq $0, %r9
+; X64-NEXT: adcq $0, %rdx
+; X64-NEXT: andl $1, %edx
+; X64-NEXT: movq %r15, {{[0-9]+}}(%rsp)
+; X64-NEXT: xorps %xmm0, %xmm0
+; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %r13, {{[0-9]+}}(%rsp)
+; X64-NEXT: movb $-128, %cl
+; X64-NEXT: subb %bl, %cl
+; X64-NEXT: movl %ecx, %esi
+; X64-NEXT: shrb $3, %sil
+; X64-NEXT: andb $24, %sil
+; X64-NEXT: negb %sil
+; X64-NEXT: movsbq %sil, %rsi
+; X64-NEXT: movq 160(%rsp,%rsi), %rbx
+; X64-NEXT: movq 168(%rsp,%rsi), %r12
+; X64-NEXT: movq 176(%rsp,%rsi), %r11
+; X64-NEXT: shldq %cl, %r12, %r11
+; X64-NEXT: shldq %cl, %rbx, %r12
+; X64-NEXT: shlq %cl, %rbx
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: orq %rdx, %rcx
+; X64-NEXT: orq %r9, %rcx
+; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: je .LBB2_10
+; X64-NEXT: # %bb.6: # %udiv-preheader
+; X64-NEXT: andl $1, %r11d
+; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %r15, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %r13, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movl %eax, %ecx
+; X64-NEXT: shrb $6, %cl
+; X64-NEXT: movzbl %cl, %ecx
+; X64-NEXT: movq 80(%rsp,%rcx,8), %rsi
+; X64-NEXT: movq 64(%rsp,%rcx,8), %r13
+; X64-NEXT: movq 72(%rsp,%rcx,8), %rdi
+; X64-NEXT: movq %rdi, %r10
+; X64-NEXT: movl %eax, %ecx
+; X64-NEXT: shrdq %cl, %rsi, %r10
+; X64-NEXT: shrdq %cl, %rdi, %r13
+; X64-NEXT: movq (%rsp), %rcx # 8-byte Reload
+; X64-NEXT: addq $-1, %rcx
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r14, %rcx
+; X64-NEXT: adcq $-1, %rcx
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT: adcq $1, %rsi
+; X64-NEXT: andl $1, %esi
+; X64-NEXT: xorl %r15d, %r15d
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: xorl %r14d, %r14d
+; X64-NEXT: movq %r10, %r8
+; X64-NEXT: movq %r12, %rdi
+; X64-NEXT: .p2align 4
+; X64-NEXT: .LBB2_7: # %udiv-do-while
+; X64-NEXT: # =>This Inner Loop Header: Depth=1
+; X64-NEXT: shldq $1, %r13, %r8
+; X64-NEXT: shrq $63, %r10
+; X64-NEXT: andl $1, %r11d
+; X64-NEXT: leaq (%r11,%r13,2), %r13
+; X64-NEXT: shldq $1, %rbx, %rdi
+; X64-NEXT: orq %rcx, %rdi
+; X64-NEXT: shrq $63, %r12
+; X64-NEXT: addq %rbx, %rbx
+; X64-NEXT: orq %r15, %rbx
+; X64-NEXT: orl %r12d, %r14d
+; X64-NEXT: movl %r14d, %r11d
+; X64-NEXT: andl $1, %r11d
+; X64-NEXT: cmpq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: sbbq %r8, %rcx
+; X64-NEXT: movq %rsi, %rcx
+; X64-NEXT: sbbq %r10, %rcx
+; X64-NEXT: andl $1, %ecx
+; X64-NEXT: negq %rcx
+; X64-NEXT: movl %ecx, %r15d
+; X64-NEXT: andl $1, %r15d
+; X64-NEXT: movq %rcx, %r10
+; X64-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
+; X64-NEXT: andq (%rsp), %rcx # 8-byte Folded Reload
+; X64-NEXT: subq %rcx, %r13
+; X64-NEXT: sbbq %r10, %r8
+; X64-NEXT: addq $-1, %rax
+; X64-NEXT: adcq $-1, %r9
+; X64-NEXT: adcq $1, %rdx
+; X64-NEXT: andl $1, %edx
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: orq %rdx, %rcx
+; X64-NEXT: orq %r9, %rcx
+; X64-NEXT: movl $0, %ecx
+; X64-NEXT: movl $0, %r14d
+; X64-NEXT: movq %r8, %r10
+; X64-NEXT: movq %rdi, %r12
+; X64-NEXT: jne .LBB2_7
+; X64-NEXT: .LBB2_8: # %udiv-loop-exit
+; X64-NEXT: movq %rdi, %rsi
+; X64-NEXT: shldq $1, %rbx, %rsi
+; X64-NEXT: shrq $63, %rdi
+; X64-NEXT: leaq (%r15,%rbx,2), %r8
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT: .LBB2_9: # %udiv-end
+; X64-NEXT: movq (%rsp), %r12 # 8-byte Reload
+; X64-NEXT: movq %r12, %rax
+; X64-NEXT: mulq %r8
+; X64-NEXT: movq %rdx, %r9
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: movq %r14, %rax
+; X64-NEXT: mulq %r8
+; X64-NEXT: movq %rdx, %rbx
+; X64-NEXT: movq %r14, %r11
+; X64-NEXT: movq %rax, %r14
+; X64-NEXT: addq %r9, %r14
+; X64-NEXT: adcq $0, %rbx
+; X64-NEXT: movq %r12, %rax
+; X64-NEXT: mulq %rsi
+; X64-NEXT: addq %r14, %rax
+; X64-NEXT: adcq %rbx, %rdx
+; X64-NEXT: imulq %rsi, %r11
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT: imulq %r8, %rsi
+; X64-NEXT: imulq %r12, %rdi
+; X64-NEXT: addq %rsi, %rdi
+; X64-NEXT: addq %r11, %rdi
+; X64-NEXT: addq %rdx, %rdi
+; X64-NEXT: subq %rcx, %r15
+; X64-NEXT: sbbq %rax, %r10
+; X64-NEXT: sbbq %rdi, %r13
+; X64-NEXT: andl $1, %r13d
+; X64-NEXT: movq %r15, %rax
+; X64-NEXT: movq %r10, %rdx
+; X64-NEXT: movq %r13, %rcx
+; X64-NEXT: leaq -40(%rbp), %rsp
+; X64-NEXT: popq %rbx
+; X64-NEXT: popq %r12
+; X64-NEXT: popq %r13
+; X64-NEXT: popq %r14
+; X64-NEXT: popq %r15
+; X64-NEXT: popq %rbp
+; X64-NEXT: retq
+; X64-NEXT: .LBB2_1:
+; X64-NEXT: movb $1, %al
+; X64-NEXT: jmp .LBB2_3
+; X64-NEXT: .LBB2_10:
+; X64-NEXT: xorl %r15d, %r15d
+; X64-NEXT: movq %r12, %rdi
+; X64-NEXT: jmp .LBB2_8
%res = urem i129 %a, %b
ret i129 %res
}
define i129 @sdiv129(i129 %a, i129 %b) nounwind {
; X86-LABEL: sdiv129:
-; X86-NOT: call
+; X86: # %bb.0: # %_udiv-special-cases
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $256, %esp # imm = 0x100
+; X86-NEXT: movl 48(%ebp), %eax
+; X86-NEXT: andl $1, %eax
+; X86-NEXT: negl %eax
+; X86-NEXT: movl 28(%ebp), %edx
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: andl $1, %ecx
+; X86-NEXT: negl %ecx
+; X86-NEXT: xorl %ecx, %edx
+; X86-NEXT: movl 24(%ebp), %esi
+; X86-NEXT: xorl %ecx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 20(%ebp), %edi
+; X86-NEXT: xorl %ecx, %edi
+; X86-NEXT: movl 16(%ebp), %esi
+; X86-NEXT: xorl %ecx, %esi
+; X86-NEXT: movl 12(%ebp), %ebx
+; X86-NEXT: xorl %ecx, %ebx
+; X86-NEXT: subl %ecx, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %ecx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %ecx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %ecx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl $1, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 48(%ebp), %ebx
+; X86-NEXT: xorl %eax, %ebx
+; X86-NEXT: movl 44(%ebp), %edi
+; X86-NEXT: xorl %eax, %edi
+; X86-NEXT: movl 40(%ebp), %esi
+; X86-NEXT: xorl %eax, %esi
+; X86-NEXT: movl 36(%ebp), %edx
+; X86-NEXT: xorl %eax, %edx
+; X86-NEXT: movl 32(%ebp), %ecx
+; X86-NEXT: xorl %eax, %ecx
+; X86-NEXT: subl %eax, %ecx
+; X86-NEXT: sbbl %eax, %edx
+; X86-NEXT: sbbl %eax, %esi
+; X86-NEXT: sbbl %eax, %edi
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %eax, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: andl $1, %eax
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: orl %edx, %ecx
+; X86-NEXT: sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: shldl $31, %edi, %eax
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: shldl $31, %esi, %ecx
+; X86-NEXT: testl %eax, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: jne .LBB3_1
+; X86-NEXT: # %bb.2: # %_udiv-special-cases
+; X86-NEXT: bsrl %ecx, %edx
+; X86-NEXT: xorl $31, %edx
+; X86-NEXT: orl $32, %edx
+; X86-NEXT: jmp .LBB3_3
+; X86-NEXT: .LBB3_1:
+; X86-NEXT: bsrl %eax, %edx
+; X86-NEXT: xorl $31, %edx
+; X86-NEXT: .LBB3_3: # %_udiv-special-cases
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl $31, %ebx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $31, %eax, %ebx
+; X86-NEXT: testl %edi, %edi
+; X86-NEXT: jne .LBB3_4
+; X86-NEXT: # %bb.5: # %_udiv-special-cases
+; X86-NEXT: bsrl %ebx, %eax
+; X86-NEXT: xorl $31, %eax
+; X86-NEXT: orl $32, %eax
+; X86-NEXT: jmp .LBB3_6
+; X86-NEXT: .LBB3_4:
+; X86-NEXT: bsrl %edi, %eax
+; X86-NEXT: xorl $31, %eax
+; X86-NEXT: .LBB3_6: # %_udiv-special-cases
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: xorl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: jne .LBB3_8
+; X86-NEXT: # %bb.7: # %_udiv-special-cases
+; X86-NEXT: orl $64, %eax
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: .LBB3_8: # %_udiv-special-cases
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT: orb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl $1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: orl %ecx, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shll $31, %eax
+; X86-NEXT: bsrl %eax, %ecx
+; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: testl %eax, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: je .LBB3_9
+; X86-NEXT: # %bb.10: # %_udiv-special-cases
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: je .LBB3_11
+; X86-NEXT: .LBB3_12: # %_udiv-special-cases
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: jne .LBB3_14
+; X86-NEXT: .LBB3_13: # %_udiv-special-cases
+; X86-NEXT: subl $-128, %ecx
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: .LBB3_14: # %_udiv-special-cases
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $31, %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $31, %edi, %edx
+; X86-NEXT: testl %esi, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: jne .LBB3_15
+; X86-NEXT: # %bb.16: # %_udiv-special-cases
+; X86-NEXT: bsrl %edx, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: orl $32, %esi
+; X86-NEXT: jmp .LBB3_17
+; X86-NEXT: .LBB3_9: # %_udiv-special-cases
+; X86-NEXT: movl $64, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: jne .LBB3_12
+; X86-NEXT: .LBB3_11: # %_udiv-special-cases
+; X86-NEXT: movl $128, %ecx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: je .LBB3_13
+; X86-NEXT: jmp .LBB3_14
+; X86-NEXT: .LBB3_15:
+; X86-NEXT: bsrl %esi, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: .LBB3_17: # %_udiv-special-cases
+; X86-NEXT: shldl $31, %eax, %edi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $31, %eax, %ebx
+; X86-NEXT: testl %edi, %edi
+; X86-NEXT: jne .LBB3_18
+; X86-NEXT: # %bb.19: # %_udiv-special-cases
+; X86-NEXT: bsrl %ebx, %eax
+; X86-NEXT: xorl $31, %eax
+; X86-NEXT: orl $32, %eax
+; X86-NEXT: jmp .LBB3_20
+; X86-NEXT: .LBB3_18:
+; X86-NEXT: bsrl %edi, %eax
+; X86-NEXT: xorl $31, %eax
+; X86-NEXT: .LBB3_20: # %_udiv-special-cases
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: jne .LBB3_22
+; X86-NEXT: # %bb.21: # %_udiv-special-cases
+; X86-NEXT: orl $64, %eax
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: .LBB3_22: # %_udiv-special-cases
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: orl %edx, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shll $31, %eax
+; X86-NEXT: jne .LBB3_23
+; X86-NEXT: # %bb.24: # %_udiv-special-cases
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl $128, %ecx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: je .LBB3_26
+; X86-NEXT: jmp .LBB3_27
+; X86-NEXT: .LBB3_23:
+; X86-NEXT: bsrl %eax, %ecx
+; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: jne .LBB3_27
+; X86-NEXT: .LBB3_26: # %_udiv-special-cases
+; X86-NEXT: subl $-128, %ecx
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: .LBB3_27: # %_udiv-special-cases
+; X86-NEXT: subl %esi, %eax
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: sbbl %edx, %edx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %ecx, %ecx
+; X86-NEXT: movl $0, %esi
+; X86-NEXT: sbbl %esi, %esi
+; X86-NEXT: movl $0, %ebx
+; X86-NEXT: sbbl %ebx, %ebx
+; X86-NEXT: andl $1, %ebx
+; X86-NEXT: cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: jne .LBB3_28
+; X86-NEXT: # %bb.29: # %select.false.sink
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $128, %eax
+; X86-NEXT: cmpl %edi, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %edx, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %ecx, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %esi, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: movl %ebx, %edi
+; X86-NEXT: sbbl %ebx, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: setb %al
+; X86-NEXT: .LBB3_30: # %select.end
+; X86-NEXT: testb %al, %al
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $0, %ebx
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: jne .LBB3_32
+; X86-NEXT: # %bb.31: # %select.end
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: .LBB3_32: # %select.end
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: jne .LBB3_38
+; X86-NEXT: # %bb.33: # %select.end
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: orl %esi, %edx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: xorl $128, %esi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: orl %edi, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: je .LBB3_38
+; X86-NEXT: # %bb.34: # %udiv-bb1
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: addl $1, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: andl $1, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movb $-128, %cl
+; X86-NEXT: subb %al, %cl
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $28, %al
+; X86-NEXT: negb %al
+; X86-NEXT: movsbl %al, %edx
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 216(%esp,%edx), %eax
+; X86-NEXT: movl 220(%esp,%edx), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %eax, %ebx
+; X86-NEXT: movl 212(%esp,%edx), %esi
+; X86-NEXT: shldl %cl, %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 208(%esp,%edx), %eax
+; X86-NEXT: movl 224(%esp,%edx), %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl %cl, %edx, %edi
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shll %cl, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: je .LBB3_39
+; X86-NEXT: # %bb.35: # %udiv-preheader
+; X86-NEXT: andl $1, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $5, %al
+; X86-NEXT: movzbl %al, %edx
+; X86-NEXT: movl 128(%esp,%edx,4), %eax
+; X86-NEXT: movl 124(%esp,%edx,4), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shrdl %cl, %eax, %edi
+; X86-NEXT: movl 120(%esp,%edx,4), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shrdl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 112(%esp,%edx,4), %eax
+; X86-NEXT: movl 116(%esp,%edx,4), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shrdl %cl, %esi, %edx
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shrdl %cl, %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: addl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: adcl $1, %esi
+; X86-NEXT: andl $1, %esi
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: .p2align 4
+; X86-NEXT: .LBB3_36: # %udiv-do-while
+; X86-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl $1, %edx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: shrl $31, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl $1, %eax
+; X86-NEXT: leal (%eax,%edx,2), %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: orl %edi, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %eax
+; X86-NEXT: orl %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %esi
+; X86-NEXT: orl %edi, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shrl $31, %ebx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: addl %eax, %eax
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl $1, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: sbbl %ecx, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: andl $1, %ebx
+; X86-NEXT: negl %ebx
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: andl $1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, %edi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: subl %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: sbbl %esi, %edx
+; X86-NEXT: sbbl %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: addl $-1, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: adcl $-1, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ebx
+; X86-NEXT: adcl $1, %ecx
+; X86-NEXT: andl $1, %ecx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %ebx, %eax
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edi, %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: jne .LBB3_36
+; X86-NEXT: .LBB3_37: # %udiv-loop-exit
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: leal (%edx,%ecx,2), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shrl $31, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: .LBB3_38: # %udiv-end
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: xorl %edx, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: xorl %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: xorl %edx, %edi
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: xorl %edx, %ebx
+; X86-NEXT: subl %edx, %ebx
+; X86-NEXT: sbbl %edx, %edi
+; X86-NEXT: sbbl %edx, %esi
+; X86-NEXT: sbbl %edx, %eax
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %ebx, (%eax)
+; X86-NEXT: movl %edi, 4(%eax)
+; X86-NEXT: movl %esi, 8(%eax)
+; X86-NEXT: movl %edx, 12(%eax)
+; X86-NEXT: andl $1, %ecx
+; X86-NEXT: movb %cl, 16(%eax)
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl $4
+; X86-NEXT: .LBB3_28:
+; X86-NEXT: movl %ebx, %edi
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movb $1, %al
+; X86-NEXT: jmp .LBB3_30
+; X86-NEXT: .LBB3_39:
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: jmp .LBB3_37
;
; X64-LABEL: sdiv129:
-; X64-NOT: call
+; X64: # %bb.0: # %_udiv-special-cases
+; X64-NEXT: pushq %rbp
+; X64-NEXT: movq %rsp, %rbp
+; X64-NEXT: pushq %r15
+; X64-NEXT: pushq %r14
+; X64-NEXT: pushq %r13
+; X64-NEXT: pushq %r12
+; X64-NEXT: pushq %rbx
+; X64-NEXT: andq $-32, %rsp
+; X64-NEXT: subq $224, %rsp
+; X64-NEXT: movq %rcx, %r14
+; X64-NEXT: movl %r9d, %eax
+; X64-NEXT: andl $1, %eax
+; X64-NEXT: negq %rax
+; X64-NEXT: movl %edx, %ecx
+; X64-NEXT: andl $1, %ecx
+; X64-NEXT: negq %rcx
+; X64-NEXT: xorq %rcx, %rdx
+; X64-NEXT: xorq %rcx, %rsi
+; X64-NEXT: xorq %rcx, %rdi
+; X64-NEXT: subq %rcx, %rdi
+; X64-NEXT: sbbq %rcx, %rsi
+; X64-NEXT: sbbq %rcx, %rdx
+; X64-NEXT: movl %edx, %r12d
+; X64-NEXT: andl $1, %r12d
+; X64-NEXT: xorq %rax, %r9
+; X64-NEXT: xorq %rax, %r8
+; X64-NEXT: xorq %rax, %r14
+; X64-NEXT: subq %rax, %r14
+; X64-NEXT: sbbq %rax, %r8
+; X64-NEXT: sbbq %rax, %r9
+; X64-NEXT: movl %r9d, %r10d
+; X64-NEXT: andl $1, %r10d
+; X64-NEXT: xorq %rax, %rcx
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; X64-NEXT: andl $1, %ecx
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r14, %rax
+; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: orq %r10, %rax
+; X64-NEXT: orq %r8, %rax
+; X64-NEXT: sete %cl
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: orq %r12, %rax
+; X64-NEXT: orq %rsi, %rax
+; X64-NEXT: sete %al
+; X64-NEXT: orb %cl, %al
+; X64-NEXT: shldq $63, %r8, %r9
+; X64-NEXT: bsrq %r9, %rcx
+; X64-NEXT: xorq $63, %rcx
+; X64-NEXT: movq %r8, %r11
+; X64-NEXT: shldq $63, %r14, %r11
+; X64-NEXT: bsrq %r11, %r10
+; X64-NEXT: xorq $63, %r10
+; X64-NEXT: orq $64, %r10
+; X64-NEXT: testq %r9, %r9
+; X64-NEXT: cmovneq %rcx, %r10
+; X64-NEXT: movq %r14, %rcx
+; X64-NEXT: shlq $63, %rcx
+; X64-NEXT: bsrq %rcx, %r13
+; X64-NEXT: xorq $63, %r13
+; X64-NEXT: testq %rcx, %rcx
+; X64-NEXT: movl $128, %ecx
+; X64-NEXT: cmoveq %rcx, %r13
+; X64-NEXT: subq $-128, %r13
+; X64-NEXT: orq %r9, %r11
+; X64-NEXT: cmovneq %r10, %r13
+; X64-NEXT: shldq $63, %rsi, %rdx
+; X64-NEXT: bsrq %rdx, %r11
+; X64-NEXT: xorq $63, %r11
+; X64-NEXT: movq %rsi, %r9
+; X64-NEXT: shldq $63, %rdi, %r9
+; X64-NEXT: bsrq %r9, %r10
+; X64-NEXT: xorq $63, %r10
+; X64-NEXT: orq $64, %r10
+; X64-NEXT: testq %rdx, %rdx
+; X64-NEXT: cmovneq %r11, %r10
+; X64-NEXT: movq %rdi, %rbx
+; X64-NEXT: shlq $63, %rbx
+; X64-NEXT: bsrq %rbx, %r11
+; X64-NEXT: xorq $63, %r11
+; X64-NEXT: testq %rbx, %rbx
+; X64-NEXT: cmoveq %rcx, %r11
+; X64-NEXT: subq $-128, %r11
+; X64-NEXT: orq %rdx, %r9
+; X64-NEXT: cmovneq %r10, %r11
+; X64-NEXT: xorl %r9d, %r9d
+; X64-NEXT: subq %r11, %r13
+; X64-NEXT: movl $0, %r15d
+; X64-NEXT: sbbq %r15, %r15
+; X64-NEXT: sbbq %r9, %r9
+; X64-NEXT: andl $1, %r9d
+; X64-NEXT: testb %al, %al
+; X64-NEXT: jne .LBB3_1
+; X64-NEXT: # %bb.2: # %select.false.sink
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpq %r13, %rcx
+; X64-NEXT: movl $0, %ecx
+; X64-NEXT: sbbq %r15, %rcx
+; X64-NEXT: movl $0, %ecx
+; X64-NEXT: sbbq %r9, %rcx
+; X64-NEXT: sbbq %rax, %rax
+; X64-NEXT: setb %al
+; X64-NEXT: .LBB3_3: # %select.end
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: testb %al, %al
+; X64-NEXT: movq %rsi, %rdx
+; X64-NEXT: cmovneq %rcx, %rdx
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: cmovneq %rcx, %rax
+; X64-NEXT: cmoveq %r12, %rcx
+; X64-NEXT: jne .LBB3_9
+; X64-NEXT: # %bb.4: # %select.end
+; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r13, %r8
+; X64-NEXT: xorq $128, %r8
+; X64-NEXT: orq %r9, %r8
+; X64-NEXT: orq %r15, %r8
+; X64-NEXT: je .LBB3_9
+; X64-NEXT: # %bb.5: # %udiv-bb1
+; X64-NEXT: movq %r14, %r11
+; X64-NEXT: movq %r13, %rdx
+; X64-NEXT: addq $1, %rdx
+; X64-NEXT: adcq $0, %r15
+; X64-NEXT: adcq $0, %r9
+; X64-NEXT: andl $1, %r9d
+; X64-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NEXT: xorps %xmm0, %xmm0
+; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %rsi, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %r12, {{[0-9]+}}(%rsp)
+; X64-NEXT: movb $-128, %cl
+; X64-NEXT: subb %r13b, %cl
+; X64-NEXT: movl %ecx, %eax
+; X64-NEXT: shrb $3, %al
+; X64-NEXT: andb $24, %al
+; X64-NEXT: negb %al
+; X64-NEXT: movsbq %al, %r8
+; X64-NEXT: movq 160(%rsp,%r8), %rax
+; X64-NEXT: movq 168(%rsp,%r8), %rbx
+; X64-NEXT: movq 176(%rsp,%r8), %r14
+; X64-NEXT: shldq %cl, %rbx, %r14
+; X64-NEXT: shldq %cl, %rax, %rbx
+; X64-NEXT: shlq %cl, %rax
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: orq %r9, %rcx
+; X64-NEXT: orq %r15, %rcx
+; X64-NEXT: je .LBB3_10
+; X64-NEXT: # %bb.6: # %udiv-preheader
+; X64-NEXT: andl $1, %r14d
+; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %rsi, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %r12, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movl %edx, %ecx
+; X64-NEXT: shrb $6, %cl
+; X64-NEXT: movzbl %cl, %ecx
+; X64-NEXT: movq 80(%rsp,%rcx,8), %rdi
+; X64-NEXT: movq 64(%rsp,%rcx,8), %rsi
+; X64-NEXT: movq 72(%rsp,%rcx,8), %r10
+; X64-NEXT: movq %r10, %r8
+; X64-NEXT: movl %edx, %ecx
+; X64-NEXT: shrdq %cl, %rdi, %r8
+; X64-NEXT: shrdq %cl, %r10, %rsi
+; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: addq $-1, %r11
+; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: adcq $-1, %rcx
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; X64-NEXT: adcq $1, %r12
+; X64-NEXT: andl $1, %r12d
+; X64-NEXT: xorl %r11d, %r11d
+; X64-NEXT: xorl %edi, %edi
+; X64-NEXT: xorl %r13d, %r13d
+; X64-NEXT: movq %r8, %r10
+; X64-NEXT: movq %rbx, %rcx
+; X64-NEXT: .p2align 4
+; X64-NEXT: .LBB3_7: # %udiv-do-while
+; X64-NEXT: # =>This Inner Loop Header: Depth=1
+; X64-NEXT: shldq $1, %rsi, %r10
+; X64-NEXT: shrq $63, %r8
+; X64-NEXT: andl $1, %r14d
+; X64-NEXT: leaq (%r14,%rsi,2), %rsi
+; X64-NEXT: shldq $1, %rax, %rcx
+; X64-NEXT: orq %rdi, %rcx
+; X64-NEXT: shrq $63, %rbx
+; X64-NEXT: addq %rax, %rax
+; X64-NEXT: orq %r11, %rax
+; X64-NEXT: orl %ebx, %r13d
+; X64-NEXT: movl %r13d, %r14d
+; X64-NEXT: andl $1, %r14d
+; X64-NEXT: cmpq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT: sbbq %r10, %rdi
+; X64-NEXT: movq %r12, %rdi
+; X64-NEXT: sbbq %r8, %rdi
+; X64-NEXT: andl $1, %edi
+; X64-NEXT: negq %rdi
+; X64-NEXT: movl %edi, %r11d
+; X64-NEXT: andl $1, %r11d
+; X64-NEXT: movq %rdi, %r8
+; X64-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; X64-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
+; X64-NEXT: subq %rdi, %rsi
+; X64-NEXT: sbbq %r8, %r10
+; X64-NEXT: addq $-1, %rdx
+; X64-NEXT: adcq $-1, %r15
+; X64-NEXT: adcq $1, %r9
+; X64-NEXT: andl $1, %r9d
+; X64-NEXT: movq %rdx, %rdi
+; X64-NEXT: orq %r9, %rdi
+; X64-NEXT: orq %r15, %rdi
+; X64-NEXT: movl $0, %edi
+; X64-NEXT: movl $0, %r13d
+; X64-NEXT: movq %r10, %r8
+; X64-NEXT: movq %rcx, %rbx
+; X64-NEXT: jne .LBB3_7
+; X64-NEXT: .LBB3_8: # %udiv-loop-exit
+; X64-NEXT: movq %rcx, %rdx
+; X64-NEXT: shldq $1, %rax, %rdx
+; X64-NEXT: shrq $63, %rcx
+; X64-NEXT: leaq (%r11,%rax,2), %rax
+; X64-NEXT: .LBB3_9: # %udiv-end
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT: xorq %rdi, %rcx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT: xorq %rsi, %rdx
+; X64-NEXT: xorq %rsi, %rax
+; X64-NEXT: subq %rsi, %rax
+; X64-NEXT: sbbq %rsi, %rdx
+; X64-NEXT: sbbq %rdi, %rcx
+; X64-NEXT: andl $1, %ecx
+; X64-NEXT: leaq -40(%rbp), %rsp
+; X64-NEXT: popq %rbx
+; X64-NEXT: popq %r12
+; X64-NEXT: popq %r13
+; X64-NEXT: popq %r14
+; X64-NEXT: popq %r15
+; X64-NEXT: popq %rbp
+; X64-NEXT: retq
+; X64-NEXT: .LBB3_1:
+; X64-NEXT: movb $1, %al
+; X64-NEXT: jmp .LBB3_3
+; X64-NEXT: .LBB3_10:
+; X64-NEXT: xorl %r11d, %r11d
+; X64-NEXT: movq %rbx, %rcx
+; X64-NEXT: jmp .LBB3_8
%res = sdiv i129 %a, %b
ret i129 %res
}
define i129 @srem129(i129 %a, i129 %b) nounwind {
; X86-LABEL: srem129:
-; X86-NOT: call
+; X86: # %bb.0: # %_udiv-special-cases
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $288, %esp # imm = 0x120
+; X86-NEXT: movl 48(%ebp), %edx
+; X86-NEXT: andl $1, %edx
+; X86-NEXT: negl %edx
+; X86-NEXT: movl 28(%ebp), %edi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: andl $1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: negl %eax
+; X86-NEXT: xorl %eax, %edi
+; X86-NEXT: movl 24(%ebp), %ecx
+; X86-NEXT: xorl %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 20(%ebp), %esi
+; X86-NEXT: xorl %eax, %esi
+; X86-NEXT: movl 16(%ebp), %ecx
+; X86-NEXT: xorl %eax, %ecx
+; X86-NEXT: movl 12(%ebp), %ebx
+; X86-NEXT: xorl %eax, %ebx
+; X86-NEXT: subl %eax, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %eax, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl $1, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 48(%ebp), %ecx
+; X86-NEXT: xorl %edx, %ecx
+; X86-NEXT: movl 44(%ebp), %ebx
+; X86-NEXT: xorl %edx, %ebx
+; X86-NEXT: movl 40(%ebp), %edi
+; X86-NEXT: xorl %edx, %edi
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: xorl %edx, %eax
+; X86-NEXT: movl 32(%ebp), %esi
+; X86-NEXT: xorl %edx, %esi
+; X86-NEXT: subl %edx, %esi
+; X86-NEXT: sbbl %edx, %eax
+; X86-NEXT: sbbl %edx, %edi
+; X86-NEXT: sbbl %edx, %ebx
+; X86-NEXT: sbbl %edx, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl $1, %ecx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %ebx, %eax
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edi, %esi
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: shldl $31, %ebx, %edx
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: shldl $31, %edi, %eax
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: jne .LBB4_1
+; X86-NEXT: # %bb.2: # %_udiv-special-cases
+; X86-NEXT: bsrl %eax, %edx
+; X86-NEXT: xorl $31, %edx
+; X86-NEXT: orl $32, %edx
+; X86-NEXT: jmp .LBB4_3
+; X86-NEXT: .LBB4_1:
+; X86-NEXT: bsrl %edx, %edx
+; X86-NEXT: xorl $31, %edx
+; X86-NEXT: .LBB4_3: # %_udiv-special-cases
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $31, %edi, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $31, %edx, %edi
+; X86-NEXT: testl %esi, %esi
+; X86-NEXT: jne .LBB4_4
+; X86-NEXT: # %bb.5: # %_udiv-special-cases
+; X86-NEXT: bsrl %edi, %ecx
+; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: orl $32, %ecx
+; X86-NEXT: jmp .LBB4_6
+; X86-NEXT: .LBB4_4:
+; X86-NEXT: bsrl %esi, %ecx
+; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: .LBB4_6: # %_udiv-special-cases
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: jne .LBB4_8
+; X86-NEXT: # %bb.7: # %_udiv-special-cases
+; X86-NEXT: orl $64, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: .LBB4_8: # %_udiv-special-cases
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT: orb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: shll $31, %eax
+; X86-NEXT: bsrl %eax, %ecx
+; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: testl %eax, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: je .LBB4_9
+; X86-NEXT: # %bb.10: # %_udiv-special-cases
+; X86-NEXT: je .LBB4_11
+; X86-NEXT: .LBB4_12: # %_udiv-special-cases
+; X86-NEXT: orl %esi, %edi
+; X86-NEXT: jne .LBB4_14
+; X86-NEXT: .LBB4_13: # %_udiv-special-cases
+; X86-NEXT: subl $-128, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: .LBB4_14: # %_udiv-special-cases
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $31, %eax, %ecx
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: shldl $31, %ebx, %edi
+; X86-NEXT: testl %ecx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: jne .LBB4_15
+; X86-NEXT: # %bb.16: # %_udiv-special-cases
+; X86-NEXT: bsrl %edi, %edx
+; X86-NEXT: xorl $31, %edx
+; X86-NEXT: orl $32, %edx
+; X86-NEXT: jmp .LBB4_17
+; X86-NEXT: .LBB4_9: # %_udiv-special-cases
+; X86-NEXT: movl $64, %ecx
+; X86-NEXT: jne .LBB4_12
+; X86-NEXT: .LBB4_11: # %_udiv-special-cases
+; X86-NEXT: movl $128, %ecx
+; X86-NEXT: orl %esi, %edi
+; X86-NEXT: je .LBB4_13
+; X86-NEXT: jmp .LBB4_14
+; X86-NEXT: .LBB4_15:
+; X86-NEXT: bsrl %ecx, %edx
+; X86-NEXT: xorl $31, %edx
+; X86-NEXT: .LBB4_17: # %_udiv-special-cases
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $31, %ecx, %ebx
+; X86-NEXT: shldl $31, %eax, %ecx
+; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: jne .LBB4_18
+; X86-NEXT: # %bb.19: # %_udiv-special-cases
+; X86-NEXT: bsrl %ecx, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: orl $32, %esi
+; X86-NEXT: jmp .LBB4_20
+; X86-NEXT: .LBB4_18:
+; X86-NEXT: bsrl %ebx, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: .LBB4_20: # %_udiv-special-cases
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: jne .LBB4_22
+; X86-NEXT: # %bb.21: # %_udiv-special-cases
+; X86-NEXT: orl $64, %esi
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: .LBB4_22: # %_udiv-special-cases
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: orl %edi, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shll $31, %eax
+; X86-NEXT: jne .LBB4_23
+; X86-NEXT: # %bb.24: # %_udiv-special-cases
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl $128, %esi
+; X86-NEXT: orl %ebx, %ecx
+; X86-NEXT: je .LBB4_26
+; X86-NEXT: jmp .LBB4_27
+; X86-NEXT: .LBB4_23:
+; X86-NEXT: bsrl %eax, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: orl %ebx, %ecx
+; X86-NEXT: jne .LBB4_27
+; X86-NEXT: .LBB4_26: # %_udiv-special-cases
+; X86-NEXT: subl $-128, %esi
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: .LBB4_27: # %_udiv-special-cases
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: subl %edx, %eax
+; X86-NEXT: movl $0, %ebx
+; X86-NEXT: sbbl %ebx, %ebx
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: sbbl %edx, %edx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %ecx, %ecx
+; X86-NEXT: movl $0, %edi
+; X86-NEXT: sbbl %edi, %edi
+; X86-NEXT: andl $1, %edi
+; X86-NEXT: cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: jne .LBB4_28
+; X86-NEXT: # %bb.29: # %select.false.sink
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl $128, %eax
+; X86-NEXT: cmpl %esi, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %ebx, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %edx, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: sbbl %ecx, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %edi, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: setb %al
+; X86-NEXT: .LBB4_30: # %select.end
+; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: testb %al, %al
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $0, %edi
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: jne .LBB4_32
+; X86-NEXT: # %bb.31: # %select.end
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: .LBB4_32: # %select.end
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: jne .LBB4_38
+; X86-NEXT: # %bb.33: # %select.end
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: xorl $128, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: orl %edx, %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: je .LBB4_38
+; X86-NEXT: # %bb.34: # %udiv-bb1
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: addl $1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, %edx
+; X86-NEXT: andl $1, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movb $-128, %cl
+; X86-NEXT: subb %bl, %cl
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $28, %al
+; X86-NEXT: negb %al
+; X86-NEXT: movsbl %al, %eax
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 248(%esp,%eax), %esi
+; X86-NEXT: movl 252(%esp,%eax), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: movl 244(%esp,%eax), %edx
+; X86-NEXT: shldl %cl, %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 240(%esp,%eax), %esi
+; X86-NEXT: movl 256(%esp,%eax), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl %cl, %ebx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shll %cl, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: je .LBB4_39
+; X86-NEXT: # %bb.35: # %udiv-preheader
+; X86-NEXT: andl $1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $5, %al
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: movl 160(%esp,%eax,4), %edx
+; X86-NEXT: movl 156(%esp,%eax,4), %ebx
+; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: shrdl %cl, %edx, %esi
+; X86-NEXT: movl 152(%esp,%eax,4), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shrdl %cl, %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 144(%esp,%eax,4), %ebx
+; X86-NEXT: movl 148(%esp,%eax,4), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shrdl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shrdl %cl, %eax, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: addl $-1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl $1, %ecx
+; X86-NEXT: andl $1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl %esi, %ecx
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: .p2align 4
+; X86-NEXT: .LBB4_36: # %udiv-do-while
+; X86-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %ecx
+; X86-NEXT: shrl $31, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: andl $1, %ebx
+; X86-NEXT: leal (%ebx,%eax,2), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl $1, %ebx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: orl %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $1, %edx, %ebx
+; X86-NEXT: orl %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl $1, %ebx, %edx
+; X86-NEXT: orl %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shrl $31, %edi
+; X86-NEXT: orl %esi, %edi
+; X86-NEXT: addl %ebx, %ebx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl $1, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmpl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: sbbl %ecx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: andl $1, %edi
+; X86-NEXT: negl %edi
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: andl $1, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, %esi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: subl %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %edx, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: sbbl %esi, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: addl $-1, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: adcl $-1, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: adcl $1, %edi
+; X86-NEXT: andl $1, %edi
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edi, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: jne .LBB4_36
+; X86-NEXT: .LBB4_37: # %udiv-loop-exit
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $1, %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: leal (%ecx,%eax,2), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shrl $31, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: .LBB4_38: # %udiv-end
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: mull %edi
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %ecx, %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: setb %cl
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull %edi
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movzbl %cl, %eax
+; X86-NEXT: adcl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: adcl $0, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %edi
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: imull %edi, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %edx, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: adcl $0, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: mull %ecx
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: imull %ecx, %edi
+; X86-NEXT: addl %edx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: adcl %ebx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: addl %esi, %edx
+; X86-NEXT: addl %edi, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl %edx, %eax
+; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: xorl %edx, %ecx
+; X86-NEXT: xorl %edx, %esi
+; X86-NEXT: xorl %edx, %edi
+; X86-NEXT: xorl %edx, %ebx
+; X86-NEXT: subl %edx, %ebx
+; X86-NEXT: sbbl %edx, %edi
+; X86-NEXT: sbbl %edx, %esi
+; X86-NEXT: sbbl %edx, %ecx
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl 8(%ebp), %edx
+; X86-NEXT: movl %ebx, (%edx)
+; X86-NEXT: movl %edi, 4(%edx)
+; X86-NEXT: movl %esi, 8(%edx)
+; X86-NEXT: movl %ecx, 12(%edx)
+; X86-NEXT: andl $1, %eax
+; X86-NEXT: movb %al, 16(%edx)
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl $4
+; X86-NEXT: .LBB4_28:
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: movb $1, %al
+; X86-NEXT: jmp .LBB4_30
+; X86-NEXT: .LBB4_39:
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: jmp .LBB4_37
;
; X64-LABEL: srem129:
-; X64-NOT: call
+; X64: # %bb.0: # %_udiv-special-cases
+; X64-NEXT: pushq %rbp
+; X64-NEXT: movq %rsp, %rbp
+; X64-NEXT: pushq %r15
+; X64-NEXT: pushq %r14
+; X64-NEXT: pushq %r13
+; X64-NEXT: pushq %r12
+; X64-NEXT: pushq %rbx
+; X64-NEXT: andq $-32, %rsp
+; X64-NEXT: subq $256, %rsp # imm = 0x100
+; X64-NEXT: movq %r8, %r14
+; X64-NEXT: movq %rsi, %r10
+; X64-NEXT: movq %rdi, %r11
+; X64-NEXT: movl %r9d, %eax
+; X64-NEXT: andl $1, %eax
+; X64-NEXT: negq %rax
+; X64-NEXT: movl %edx, %r15d
+; X64-NEXT: andl $1, %r15d
+; X64-NEXT: movq %r15, %r12
+; X64-NEXT: negq %r12
+; X64-NEXT: xorq %r12, %rdx
+; X64-NEXT: xorq %r12, %r10
+; X64-NEXT: xorq %r12, %r11
+; X64-NEXT: subq %r12, %r11
+; X64-NEXT: sbbq %r12, %r10
+; X64-NEXT: sbbq %r12, %rdx
+; X64-NEXT: movl %edx, %r13d
+; X64-NEXT: andl $1, %r13d
+; X64-NEXT: xorq %rax, %r9
+; X64-NEXT: xorq %rax, %r14
+; X64-NEXT: xorq %rax, %rcx
+; X64-NEXT: subq %rax, %rcx
+; X64-NEXT: sbbq %rax, %r14
+; X64-NEXT: sbbq %rax, %r9
+; X64-NEXT: movl %r9d, %esi
+; X64-NEXT: andl $1, %esi
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: orq %rsi, %rax
+; X64-NEXT: orq %r14, %rax
+; X64-NEXT: sete %sil
+; X64-NEXT: movq %r11, %rax
+; X64-NEXT: orq %r13, %rax
+; X64-NEXT: orq %r10, %rax
+; X64-NEXT: sete %al
+; X64-NEXT: orb %sil, %al
+; X64-NEXT: shldq $63, %r14, %r9
+; X64-NEXT: bsrq %r9, %r8
+; X64-NEXT: xorq $63, %r8
+; X64-NEXT: movq %r14, %rsi
+; X64-NEXT: shldq $63, %rcx, %rsi
+; X64-NEXT: bsrq %rsi, %rdi
+; X64-NEXT: xorq $63, %rdi
+; X64-NEXT: orq $64, %rdi
+; X64-NEXT: testq %r9, %r9
+; X64-NEXT: cmovneq %r8, %rdi
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: shlq $63, %rcx
+; X64-NEXT: bsrq %rcx, %rbx
+; X64-NEXT: xorq $63, %rbx
+; X64-NEXT: testq %rcx, %rcx
+; X64-NEXT: movl $128, %ecx
+; X64-NEXT: cmoveq %rcx, %rbx
+; X64-NEXT: subq $-128, %rbx
+; X64-NEXT: orq %r9, %rsi
+; X64-NEXT: cmovneq %rdi, %rbx
+; X64-NEXT: shldq $63, %r10, %rdx
+; X64-NEXT: bsrq %rdx, %rsi
+; X64-NEXT: xorq $63, %rsi
+; X64-NEXT: movq %r10, %rdi
+; X64-NEXT: shldq $63, %r11, %rdi
+; X64-NEXT: bsrq %rdi, %r8
+; X64-NEXT: xorq $63, %r8
+; X64-NEXT: orq $64, %r8
+; X64-NEXT: testq %rdx, %rdx
+; X64-NEXT: cmovneq %rsi, %r8
+; X64-NEXT: movq %r11, %rsi
+; X64-NEXT: shlq $63, %rsi
+; X64-NEXT: bsrq %rsi, %r9
+; X64-NEXT: xorq $63, %r9
+; X64-NEXT: testq %rsi, %rsi
+; X64-NEXT: cmoveq %rcx, %r9
+; X64-NEXT: subq $-128, %r9
+; X64-NEXT: orq %rdx, %rdi
+; X64-NEXT: cmovneq %r8, %r9
+; X64-NEXT: xorl %edx, %edx
+; X64-NEXT: subq %r9, %rbx
+; X64-NEXT: movl $0, %r9d
+; X64-NEXT: sbbq %r9, %r9
+; X64-NEXT: sbbq %rdx, %rdx
+; X64-NEXT: andl $1, %edx
+; X64-NEXT: testb %al, %al
+; X64-NEXT: jne .LBB4_1
+; X64-NEXT: # %bb.2: # %select.false.sink
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpq %rbx, %rcx
+; X64-NEXT: movl $0, %ecx
+; X64-NEXT: sbbq %r9, %rcx
+; X64-NEXT: movl $0, %ecx
+; X64-NEXT: sbbq %rdx, %rcx
+; X64-NEXT: sbbq %rax, %rax
+; X64-NEXT: setb %al
+; X64-NEXT: .LBB4_3: # %select.end
+; X64-NEXT: xorl %esi, %esi
+; X64-NEXT: testb %al, %al
+; X64-NEXT: movq %r10, %r8
+; X64-NEXT: cmovneq %rsi, %r8
+; X64-NEXT: movq %r11, %rdi
+; X64-NEXT: cmovneq %rsi, %rdi
+; X64-NEXT: cmoveq %r13, %rsi
+; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: jne .LBB4_9
+; X64-NEXT: # %bb.4: # %select.end
+; X64-NEXT: movq %rbx, %rax
+; X64-NEXT: xorq $128, %rax
+; X64-NEXT: orq %rdx, %rax
+; X64-NEXT: orq %r9, %rax
+; X64-NEXT: je .LBB4_9
+; X64-NEXT: # %bb.5: # %udiv-bb1
+; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %rbx, %rax
+; X64-NEXT: addq $1, %rax
+; X64-NEXT: adcq $0, %r9
+; X64-NEXT: adcq $0, %rdx
+; X64-NEXT: andl $1, %edx
+; X64-NEXT: movq %r11, {{[0-9]+}}(%rsp)
+; X64-NEXT: xorps %xmm0, %xmm0
+; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NEXT: movb $-128, %cl
+; X64-NEXT: subb %bl, %cl
+; X64-NEXT: movl %ecx, %esi
+; X64-NEXT: shrb $3, %sil
+; X64-NEXT: andb $24, %sil
+; X64-NEXT: negb %sil
+; X64-NEXT: movsbq %sil, %rsi
+; X64-NEXT: movq 192(%rsp,%rsi), %rbx
+; X64-NEXT: movq 200(%rsp,%rsi), %r15
+; X64-NEXT: movq 208(%rsp,%rsi), %r8
+; X64-NEXT: shldq %cl, %r15, %r8
+; X64-NEXT: shldq %cl, %rbx, %r15
+; X64-NEXT: shlq %cl, %rbx
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: orq %rdx, %rcx
+; X64-NEXT: orq %r9, %rcx
+; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: je .LBB4_10
+; X64-NEXT: # %bb.6: # %udiv-preheader
+; X64-NEXT: andl $1, %r8d
+; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %r11, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movl %eax, %ecx
+; X64-NEXT: shrb $6, %cl
+; X64-NEXT: movzbl %cl, %ecx
+; X64-NEXT: movq 112(%rsp,%rcx,8), %rsi
+; X64-NEXT: movq 96(%rsp,%rcx,8), %r13
+; X64-NEXT: movq 104(%rsp,%rcx,8), %rdi
+; X64-NEXT: movq %rdi, %r10
+; X64-NEXT: movl %eax, %ecx
+; X64-NEXT: shrdq %cl, %rsi, %r10
+; X64-NEXT: shrdq %cl, %rdi, %r13
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: addq $-1, %rcx
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r14, %rcx
+; X64-NEXT: adcq $-1, %rcx
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64-NEXT: adcq $1, %r11
+; X64-NEXT: andl $1, %r11d
+; X64-NEXT: xorl %edi, %edi
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: xorl %r14d, %r14d
+; X64-NEXT: movq %r10, %r12
+; X64-NEXT: movq %r15, %rsi
+; X64-NEXT: .p2align 4
+; X64-NEXT: .LBB4_7: # %udiv-do-while
+; X64-NEXT: # =>This Inner Loop Header: Depth=1
+; X64-NEXT: shldq $1, %r13, %r12
+; X64-NEXT: shrq $63, %r10
+; X64-NEXT: andl $1, %r8d
+; X64-NEXT: leaq (%r8,%r13,2), %r13
+; X64-NEXT: shldq $1, %rbx, %rsi
+; X64-NEXT: orq %rcx, %rsi
+; X64-NEXT: shrq $63, %r15
+; X64-NEXT: addq %rbx, %rbx
+; X64-NEXT: orq %rdi, %rbx
+; X64-NEXT: orl %r15d, %r14d
+; X64-NEXT: movl %r14d, %r8d
+; X64-NEXT: andl $1, %r8d
+; X64-NEXT: cmpq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: sbbq %r12, %rcx
+; X64-NEXT: movq %r11, %rcx
+; X64-NEXT: sbbq %r10, %rcx
+; X64-NEXT: andl $1, %ecx
+; X64-NEXT: negq %rcx
+; X64-NEXT: movl %ecx, %edi
+; X64-NEXT: andl $1, %edi
+; X64-NEXT: movq %rcx, %r10
+; X64-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
+; X64-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; X64-NEXT: subq %rcx, %r13
+; X64-NEXT: sbbq %r10, %r12
+; X64-NEXT: addq $-1, %rax
+; X64-NEXT: adcq $-1, %r9
+; X64-NEXT: adcq $1, %rdx
+; X64-NEXT: andl $1, %edx
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: orq %rdx, %rcx
+; X64-NEXT: orq %r9, %rcx
+; X64-NEXT: movl $0, %ecx
+; X64-NEXT: movl $0, %r14d
+; X64-NEXT: movq %r12, %r10
+; X64-NEXT: movq %rsi, %r15
+; X64-NEXT: jne .LBB4_7
+; X64-NEXT: .LBB4_8: # %udiv-loop-exit
+; X64-NEXT: movq %rsi, %r8
+; X64-NEXT: shldq $1, %rbx, %r8
+; X64-NEXT: shrq $63, %rsi
+; X64-NEXT: leaq (%rdi,%rbx,2), %rdi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT: .LBB4_9: # %udiv-end
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; X64-NEXT: movq %r15, %rax
+; X64-NEXT: mulq %rdi
+; X64-NEXT: movq %rdx, %r9
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: movq %r14, %rax
+; X64-NEXT: mulq %rdi
+; X64-NEXT: movq %rdx, %rbx
+; X64-NEXT: movq %r14, %r13
+; X64-NEXT: movq %rax, %r14
+; X64-NEXT: addq %r9, %r14
+; X64-NEXT: adcq $0, %rbx
+; X64-NEXT: movq %r15, %rax
+; X64-NEXT: mulq %r8
+; X64-NEXT: addq %r14, %rax
+; X64-NEXT: adcq %rbx, %rdx
+; X64-NEXT: imulq %r8, %r13
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT: imulq %rdi, %r8
+; X64-NEXT: imulq %r15, %rsi
+; X64-NEXT: addq %r8, %rsi
+; X64-NEXT: addq %r13, %rsi
+; X64-NEXT: addq %rdx, %rsi
+; X64-NEXT: subq %rcx, %r11
+; X64-NEXT: sbbq %rax, %r10
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: sbbq %rsi, %rcx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT: xorq %rax, %rcx
+; X64-NEXT: xorq %r12, %r10
+; X64-NEXT: xorq %r12, %r11
+; X64-NEXT: subq %r12, %r11
+; X64-NEXT: sbbq %r12, %r10
+; X64-NEXT: sbbq %rax, %rcx
+; X64-NEXT: andl $1, %ecx
+; X64-NEXT: movq %r11, %rax
+; X64-NEXT: movq %r10, %rdx
+; X64-NEXT: leaq -40(%rbp), %rsp
+; X64-NEXT: popq %rbx
+; X64-NEXT: popq %r12
+; X64-NEXT: popq %r13
+; X64-NEXT: popq %r14
+; X64-NEXT: popq %r15
+; X64-NEXT: popq %rbp
+; X64-NEXT: retq
+; X64-NEXT: .LBB4_1:
+; X64-NEXT: movb $1, %al
+; X64-NEXT: jmp .LBB4_3
+; X64-NEXT: .LBB4_10:
+; X64-NEXT: xorl %edi, %edi
+; X64-NEXT: movq %r15, %rsi
+; X64-NEXT: jmp .LBB4_8
%res = srem i129 %a, %b
ret i129 %res
}
-; Some higher sizes
+; i257 is also expanded inline (wider than MaxDivRemBitWidthSupported=128).
+; The inline expansion uses i256 shifts which are expanded via ExpandToParts.
define i257 @sdiv257(i257 %a, i257 %b) nounwind {
; X86-LABEL: sdiv257:
-; X86-NOT: call
+; X86: # %bb.0: # %_udiv-special-cases
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $480, %esp # imm = 0x1E0
+; X86-NEXT: movl 80(%ebp), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl $1, %eax
+; X86-NEXT: negl %eax
+; X86-NEXT: movl 44(%ebp), %ecx
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: andl $1, %edx
+; X86-NEXT: negl %edx
+; X86-NEXT: xorl %edx, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 40(%ebp), %esi
+; X86-NEXT: xorl %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 36(%ebp), %ecx
+; X86-NEXT: xorl %edx, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%ebp), %edi
+; X86-NEXT: xorl %edx, %edi
+; X86-NEXT: movl 28(%ebp), %ecx
+; X86-NEXT: xorl %edx, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 24(%ebp), %ebx
+; X86-NEXT: xorl %edx, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 20(%ebp), %ebx
+; X86-NEXT: xorl %edx, %ebx
+; X86-NEXT: movl 16(%ebp), %esi
+; X86-NEXT: xorl %edx, %esi
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: xorl %edx, %ecx
+; X86-NEXT: subl %edx, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %edx, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: sbbl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: sbbl %edx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: sbbl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: sbbl %edx, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl $1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: xorl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl 76(%ebp), %ecx
+; X86-NEXT: xorl %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 72(%ebp), %ecx
+; X86-NEXT: xorl %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 68(%ebp), %ebx
+; X86-NEXT: xorl %eax, %ebx
+; X86-NEXT: movl 64(%ebp), %ecx
+; X86-NEXT: xorl %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 60(%ebp), %esi
+; X86-NEXT: xorl %eax, %esi
+; X86-NEXT: movl 56(%ebp), %edi
+; X86-NEXT: xorl %eax, %edi
+; X86-NEXT: movl 52(%ebp), %ecx
+; X86-NEXT: xorl %eax, %ecx
+; X86-NEXT: movl 48(%ebp), %edx
+; X86-NEXT: xorl %eax, %edx
+; X86-NEXT: subl %eax, %edx
+; X86-NEXT: sbbl %eax, %ecx
+; X86-NEXT: sbbl %eax, %edi
+; X86-NEXT: sbbl %eax, %esi
+; X86-NEXT: sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: sbbl %eax, %ebx
+; X86-NEXT: sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %ebx, %ecx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: orl %ebx, %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, %esi
+; X86-NEXT: andl $1, %esi
+; X86-NEXT: orl %esi, %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl %edx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: shldl $31, %ebx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: shldl $31, %edx, %ecx
+; X86-NEXT: testl %eax, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: jne .LBB5_1
+; X86-NEXT: # %bb.2: # %_udiv-special-cases
+; X86-NEXT: bsrl %ecx, %eax
+; X86-NEXT: xorl $31, %eax
+; X86-NEXT: orl $32, %eax
+; X86-NEXT: jmp .LBB5_3
+; X86-NEXT: .LBB5_1:
+; X86-NEXT: bsrl %eax, %eax
+; X86-NEXT: xorl $31, %eax
+; X86-NEXT: .LBB5_3: # %_udiv-special-cases
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $31, %eax, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $31, %edx, %eax
+; X86-NEXT: testl %edi, %edi
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: jne .LBB5_4
+; X86-NEXT: # %bb.5: # %_udiv-special-cases
+; X86-NEXT: bsrl %eax, %eax
+; X86-NEXT: xorl $31, %eax
+; X86-NEXT: orl $32, %eax
+; X86-NEXT: jmp .LBB5_6
+; X86-NEXT: .LBB5_4:
+; X86-NEXT: bsrl %edi, %eax
+; X86-NEXT: xorl $31, %eax
+; X86-NEXT: .LBB5_6: # %_udiv-special-cases
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: jne .LBB5_8
+; X86-NEXT: # %bb.7: # %_udiv-special-cases
+; X86-NEXT: orl $64, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: .LBB5_8: # %_udiv-special-cases
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $31, %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl $31, %edx, %eax
+; X86-NEXT: testl %ecx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: jne .LBB5_9
+; X86-NEXT: # %bb.10: # %_udiv-special-cases
+; X86-NEXT: bsrl %eax, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: orl $32, %esi
+; X86-NEXT: jmp .LBB5_11
+; X86-NEXT: .LBB5_9:
+; X86-NEXT: bsrl %ecx, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: .LBB5_11: # %_udiv-special-cases
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: shldl $31, %ebx, %edx
+; X86-NEXT: shldl $31, %edi, %ebx
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: jne .LBB5_12
+; X86-NEXT: # %bb.13: # %_udiv-special-cases
+; X86-NEXT: bsrl %ebx, %eax
+; X86-NEXT: xorl $31, %eax
+; X86-NEXT: orl $32, %eax
+; X86-NEXT: jmp .LBB5_14
+; X86-NEXT: .LBB5_12:
+; X86-NEXT: bsrl %edx, %eax
+; X86-NEXT: xorl $31, %eax
+; X86-NEXT: .LBB5_14: # %_udiv-special-cases
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: jne .LBB5_16
+; X86-NEXT: # %bb.15: # %_udiv-special-cases
+; X86-NEXT: orl $64, %eax
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: .LBB5_16: # %_udiv-special-cases
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: jne .LBB5_18
+; X86-NEXT: # %bb.17: # %_udiv-special-cases
+; X86-NEXT: orl $128, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: .LBB5_18: # %_udiv-special-cases
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT: orb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl $1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: orl %ecx, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shll $31, %ecx
+; X86-NEXT: bsrl %ecx, %eax
+; X86-NEXT: xorl $31, %eax
+; X86-NEXT: testl %ecx, %ecx
+; X86-NEXT: jne .LBB5_20
+; X86-NEXT: # %bb.19: # %_udiv-special-cases
+; X86-NEXT: movl $64, %eax
+; X86-NEXT: .LBB5_20: # %_udiv-special-cases
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: je .LBB5_21
+; X86-NEXT: # %bb.22: # %_udiv-special-cases
+; X86-NEXT: je .LBB5_23
+; X86-NEXT: .LBB5_24: # %_udiv-special-cases
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: jne .LBB5_26
+; X86-NEXT: .LBB5_25: # %_udiv-special-cases
+; X86-NEXT: addl $256, %eax # imm = 0x100
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: .LBB5_26: # %_udiv-special-cases
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $31, %edi, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $31, %eax, %edi
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: jne .LBB5_27
+; X86-NEXT: # %bb.28: # %_udiv-special-cases
+; X86-NEXT: bsrl %edi, %edi
+; X86-NEXT: xorl $31, %edi
+; X86-NEXT: orl $32, %edi
+; X86-NEXT: jmp .LBB5_29
+; X86-NEXT: .LBB5_21: # %_udiv-special-cases
+; X86-NEXT: movl $128, %eax
+; X86-NEXT: jne .LBB5_24
+; X86-NEXT: .LBB5_23: # %_udiv-special-cases
+; X86-NEXT: movl $256, %eax # imm = 0x100
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: je .LBB5_25
+; X86-NEXT: jmp .LBB5_26
+; X86-NEXT: .LBB5_27:
+; X86-NEXT: bsrl %edx, %edi
+; X86-NEXT: xorl $31, %edi
+; X86-NEXT: .LBB5_29: # %_udiv-special-cases
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl $31, %ecx, %eax
+; X86-NEXT: shldl $31, %ebx, %ecx
+; X86-NEXT: testl %eax, %eax
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: jne .LBB5_30
+; X86-NEXT: # %bb.31: # %_udiv-special-cases
+; X86-NEXT: bsrl %ecx, %eax
+; X86-NEXT: xorl $31, %eax
+; X86-NEXT: orl $32, %eax
+; X86-NEXT: jmp .LBB5_32
+; X86-NEXT: .LBB5_30:
+; X86-NEXT: bsrl %eax, %eax
+; X86-NEXT: xorl $31, %eax
+; X86-NEXT: .LBB5_32: # %_udiv-special-cases
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl %edx, %ecx
+; X86-NEXT: jne .LBB5_34
+; X86-NEXT: # %bb.33: # %_udiv-special-cases
+; X86-NEXT: orl $64, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: .LBB5_34: # %_udiv-special-cases
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $31, %ecx, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $31, %edi, %ecx
+; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: jne .LBB5_35
+; X86-NEXT: # %bb.36: # %_udiv-special-cases
+; X86-NEXT: bsrl %ecx, %ecx
+; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: orl $32, %ecx
+; X86-NEXT: jmp .LBB5_37
+; X86-NEXT: .LBB5_35:
+; X86-NEXT: bsrl %ebx, %ecx
+; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: .LBB5_37: # %_udiv-special-cases
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $31, %esi, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $31, %eax, %esi
+; X86-NEXT: testl %edi, %edi
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: jne .LBB5_38
+; X86-NEXT: # %bb.39: # %_udiv-special-cases
+; X86-NEXT: bsrl %esi, %eax
+; X86-NEXT: xorl $31, %eax
+; X86-NEXT: orl $32, %eax
+; X86-NEXT: jmp .LBB5_40
+; X86-NEXT: .LBB5_38:
+; X86-NEXT: bsrl %edi, %eax
+; X86-NEXT: xorl $31, %eax
+; X86-NEXT: .LBB5_40: # %_udiv-special-cases
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: jne .LBB5_42
+; X86-NEXT: # %bb.41: # %_udiv-special-cases
+; X86-NEXT: orl $64, %eax
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: .LBB5_42: # %_udiv-special-cases
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: jne .LBB5_44
+; X86-NEXT: # %bb.43: # %_udiv-special-cases
+; X86-NEXT: orl $128, %ecx
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: .LBB5_44: # %_udiv-special-cases
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shll $31, %eax
+; X86-NEXT: jne .LBB5_45
+; X86-NEXT: # %bb.46: # %_udiv-special-cases
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl $256, %ecx # imm = 0x100
+; X86-NEXT: orl %ebx, %esi
+; X86-NEXT: je .LBB5_48
+; X86-NEXT: jmp .LBB5_49
+; X86-NEXT: .LBB5_45:
+; X86-NEXT: bsrl %eax, %ecx
+; X86-NEXT: xorl $31, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: orl %ebx, %esi
+; X86-NEXT: jne .LBB5_49
+; X86-NEXT: .LBB5_48: # %_udiv-special-cases
+; X86-NEXT: addl $256, %ecx # imm = 0x100
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: .LBB5_49: # %_udiv-special-cases
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: subl %edx, %eax
+; X86-NEXT: movl $0, %edi
+; X86-NEXT: sbbl %edi, %edi
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %ecx, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %ebx
+; X86-NEXT: sbbl %ebx, %ebx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %ecx, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %ecx, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %esi
+; X86-NEXT: sbbl %esi, %esi
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %ecx, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %ecx, %ecx
+; X86-NEXT: andl $1, %ecx
+; X86-NEXT: cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: jne .LBB5_50
+; X86-NEXT: # %bb.51: # %select.false.sink
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: movl $256, %eax # imm = 0x100
+; X86-NEXT: cmpl %edx, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %edi, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %ebx, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: sbbl %edi, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %esi, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: sbbl %edx, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %ecx, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: setb %al
+; X86-NEXT: .LBB5_52: # %select.end
+; X86-NEXT: testb %al, %al
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $0, %esi
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: jne .LBB5_54
+; X86-NEXT: # %bb.53: # %select.end
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: .LBB5_54: # %select.end
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: jne .LBB5_55
+; X86-NEXT: # %bb.61: # %select.end
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: orl %edi, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: orl %esi, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: xorl $256, %edi # imm = 0x100
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: orl %esi, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: je .LBB5_62
+; X86-NEXT: # %bb.59: # %udiv-bb1
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: addl $1, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: adcl $0, %eax
+; X86-NEXT: andl $1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $256, %edx # imm = 0x100
+; X86-NEXT: subl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %ebx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edi, %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: andl $31, %ecx
+; X86-NEXT: shrl $3, %edx
+; X86-NEXT: andl $60, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: leal {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: subl %edx, %ebx
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 24(%ebx), %esi
+; X86-NEXT: movl 28(%ebx), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %esi, %edx
+; X86-NEXT: movl 20(%ebx), %eax
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 16(%ebx), %esi
+; X86-NEXT: shldl %cl, %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 12(%ebx), %eax
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%ebx), %esi
+; X86-NEXT: shldl %cl, %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: negl %eax
+; X86-NEXT: movl 404(%esp,%eax), %eax
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl (%ebx), %esi
+; X86-NEXT: shldl %cl, %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl %cl, %ebx, %eax
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shll %cl, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: je .LBB5_60
+; X86-NEXT: # %bb.56: # %udiv-preheader
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: andl $1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: andl $31, %ecx
+; X86-NEXT: shrl $3, %eax
+; X86-NEXT: andl $60, %eax
+; X86-NEXT: movl 240(%esp,%eax), %esi
+; X86-NEXT: movl 236(%esp,%eax), %edi
+; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: shrdl %cl, %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 232(%esp,%eax), %ebx
+; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: shrdl %cl, %edi, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 228(%esp,%eax), %edi
+; X86-NEXT: movl %edi, %esi
+; X86-NEXT: shrdl %cl, %ebx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 224(%esp,%eax), %ebx
+; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: shrdl %cl, %edi, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 220(%esp,%eax), %edi
+; X86-NEXT: movl %edi, %esi
+; X86-NEXT: shrdl %cl, %ebx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 216(%esp,%eax), %esi
+; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: shrdl %cl, %edi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 208(%esp,%eax), %edi
+; X86-NEXT: movl 212(%esp,%eax), %eax
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: shrdl %cl, %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shrdl %cl, %eax, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: addl $-1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl $1, %ecx
+; X86-NEXT: andl $1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: xorl %ebx, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: .p2align 4
+; X86-NEXT: .LBB5_57: # %udiv-do-while
+; X86-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $1, %edi, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $1, %edi, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shrl $31, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: andl $1, %esi
+; X86-NEXT: leal (%esi,%eax,2), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %ecx
+; X86-NEXT: orl %ebx, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %esi
+; X86-NEXT: orl %ebx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %ecx
+; X86-NEXT: orl %ebx, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %esi
+; X86-NEXT: orl %ebx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %ecx
+; X86-NEXT: orl %ebx, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %esi
+; X86-NEXT: orl %ebx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %ecx
+; X86-NEXT: orl %ebx, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shrl $31, %edx
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: addl %esi, %esi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl $1, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl %edi, %eax
+; X86-NEXT: andl $1, %eax
+; X86-NEXT: negl %eax
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: andl $1, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: subl %eax, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: addl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: adcl $-1, %edi
+; X86-NEXT: adcl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ebx
+; X86-NEXT: adcl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl $-1, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: adcl $-1, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: adcl $-1, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: adcl $1, %eax
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %ecx, %edi
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %esi, %ebx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: andl $1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: movl $0, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: jne .LBB5_57
+; X86-NEXT: .LBB5_58: # %udiv-loop-exit
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl $1, %ebx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl $1, %ecx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: leal (%edi,%esi,2), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shrl $31, %edx
+; X86-NEXT: movl %ebx, %edi
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: .LBB5_62: # %udiv-end
+; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: xorl %edx, %eax
+; X86-NEXT: xorl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: xorl %edx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: xorl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: xorl %edx, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: xorl %edx, %edi
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: xorl %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: xorl %edx, %ecx
+; X86-NEXT: subl %edx, %ecx
+; X86-NEXT: sbbl %edx, %esi
+; X86-NEXT: sbbl %edx, %edi
+; X86-NEXT: sbbl %edx, %ebx
+; X86-NEXT: sbbl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: sbbl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: sbbl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: sbbl %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: movl %esi, 4(%eax)
+; X86-NEXT: movl %edi, 8(%eax)
+; X86-NEXT: movl %ebx, 12(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 16(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 20(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 24(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 28(%eax)
+; X86-NEXT: andl $1, %edx
+; X86-NEXT: movb %dl, 32(%eax)
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl $4
+; X86-NEXT: .LBB5_50:
+; X86-NEXT: movb $1, %al
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: jmp .LBB5_52
+; X86-NEXT: .LBB5_60:
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: jmp .LBB5_58
+; X86-NEXT: .LBB5_55:
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: jmp .LBB5_62
;
; X64-LABEL: sdiv257:
-; X64-NOT: call
+; X64: # %bb.0: # %_udiv-special-cases
+; X64-NEXT: pushq %rbp
+; X64-NEXT: movq %rsp, %rbp
+; X64-NEXT: pushq %r15
+; X64-NEXT: pushq %r14
+; X64-NEXT: pushq %r13
+; X64-NEXT: pushq %r12
+; X64-NEXT: pushq %rbx
+; X64-NEXT: andq $-32, %rsp
+; X64-NEXT: subq $480, %rsp # imm = 0x1E0
+; X64-NEXT: movq %rcx, %r15
+; X64-NEXT: movq %rdx, %r13
+; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq 48(%rbp), %rax
+; X64-NEXT: movl %eax, %ecx
+; X64-NEXT: andl $1, %ecx
+; X64-NEXT: negq %rcx
+; X64-NEXT: movl %r9d, %r12d
+; X64-NEXT: andl $1, %r12d
+; X64-NEXT: negq %r12
+; X64-NEXT: xorq %r12, %r9
+; X64-NEXT: xorq %r12, %r8
+; X64-NEXT: xorq %r12, %r15
+; X64-NEXT: xorq %r12, %r13
+; X64-NEXT: xorq %r12, %rsi
+; X64-NEXT: subq %r12, %rsi
+; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: sbbq %r12, %r13
+; X64-NEXT: sbbq %r12, %r15
+; X64-NEXT: sbbq %r12, %r8
+; X64-NEXT: sbbq %r12, %r9
+; X64-NEXT: movl %r9d, %edi
+; X64-NEXT: andl $1, %edi
+; X64-NEXT: xorq %rcx, %rax
+; X64-NEXT: movq 40(%rbp), %r14
+; X64-NEXT: xorq %rcx, %r14
+; X64-NEXT: movq 32(%rbp), %r11
+; X64-NEXT: xorq %rcx, %r11
+; X64-NEXT: movq 24(%rbp), %r10
+; X64-NEXT: xorq %rcx, %r10
+; X64-NEXT: movq 16(%rbp), %rbx
+; X64-NEXT: xorq %rcx, %rbx
+; X64-NEXT: subq %rcx, %rbx
+; X64-NEXT: sbbq %rcx, %r10
+; X64-NEXT: sbbq %rcx, %r11
+; X64-NEXT: sbbq %rcx, %r14
+; X64-NEXT: sbbq %rcx, %rax
+; X64-NEXT: xorq %rcx, %r12
+; X64-NEXT: movl %eax, %esi
+; X64-NEXT: andl $1, %esi
+; X64-NEXT: movl %r12d, %ecx
+; X64-NEXT: andl $1, %ecx
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r10, %rcx
+; X64-NEXT: orq %r14, %rcx
+; X64-NEXT: movq %rbx, %rdx
+; X64-NEXT: orq %r11, %rdx
+; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: orq %rsi, %rdx
+; X64-NEXT: orq %rcx, %rdx
+; X64-NEXT: sete %dl
+; X64-NEXT: movq %r13, %rcx
+; X64-NEXT: orq %r8, %rcx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT: orq %r15, %rsi
+; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: orq %rdi, %rsi
+; X64-NEXT: orq %rcx, %rsi
+; X64-NEXT: sete %cl
+; X64-NEXT: orb %dl, %cl
+; X64-NEXT: shldq $63, %r14, %rax
+; X64-NEXT: bsrq %rax, %rdi
+; X64-NEXT: xorq $63, %rdi
+; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r14, %rdx
+; X64-NEXT: shldq $63, %r11, %rdx
+; X64-NEXT: bsrq %rdx, %rsi
+; X64-NEXT: xorq $63, %rsi
+; X64-NEXT: orq $64, %rsi
+; X64-NEXT: testq %rax, %rax
+; X64-NEXT: cmovneq %rdi, %rsi
+; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r11, %rdi
+; X64-NEXT: shldq $63, %r10, %rdi
+; X64-NEXT: bsrq %rdi, %r11
+; X64-NEXT: xorq $63, %r11
+; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: shldq $63, %rbx, %r10
+; X64-NEXT: bsrq %r10, %r14
+; X64-NEXT: xorq $63, %r14
+; X64-NEXT: orq $64, %r14
+; X64-NEXT: testq %rdi, %rdi
+; X64-NEXT: cmovneq %r11, %r14
+; X64-NEXT: orq $128, %r14
+; X64-NEXT: orq %rdx, %r10
+; X64-NEXT: orq %rax, %rdx
+; X64-NEXT: cmovneq %rsi, %r14
+; X64-NEXT: orq %rax, %rdi
+; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %rbx, %rax
+; X64-NEXT: shlq $63, %rax
+; X64-NEXT: bsrq %rax, %rsi
+; X64-NEXT: xorq $63, %rsi
+; X64-NEXT: testq %rax, %rax
+; X64-NEXT: movl $128, %edx
+; X64-NEXT: cmoveq %rdx, %rsi
+; X64-NEXT: movl $256, %eax # imm = 0x100
+; X64-NEXT: cmoveq %rax, %rsi
+; X64-NEXT: addq $256, %rsi # imm = 0x100
+; X64-NEXT: orq %r10, %rdi
+; X64-NEXT: cmovneq %r14, %rsi
+; X64-NEXT: shldq $63, %r8, %r9
+; X64-NEXT: bsrq %r9, %rdi
+; X64-NEXT: xorq $63, %rdi
+; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r8, %r11
+; X64-NEXT: shldq $63, %r15, %r11
+; X64-NEXT: bsrq %r11, %rbx
+; X64-NEXT: xorq $63, %rbx
+; X64-NEXT: orq $64, %rbx
+; X64-NEXT: testq %r9, %r9
+; X64-NEXT: cmovneq %rdi, %rbx
+; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r15, %rdi
+; X64-NEXT: shldq $63, %r13, %rdi
+; X64-NEXT: bsrq %rdi, %r14
+; X64-NEXT: xorq $63, %r14
+; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; X64-NEXT: shldq $63, %r15, %r13
+; X64-NEXT: bsrq %r13, %r8
+; X64-NEXT: xorq $63, %r8
+; X64-NEXT: orq $64, %r8
+; X64-NEXT: testq %rdi, %rdi
+; X64-NEXT: cmovneq %r14, %r8
+; X64-NEXT: orq $128, %r8
+; X64-NEXT: orq %r11, %r13
+; X64-NEXT: orq %r9, %r11
+; X64-NEXT: cmovneq %rbx, %r8
+; X64-NEXT: movq %r15, %rbx
+; X64-NEXT: orq %r9, %rdi
+; X64-NEXT: movq %r15, %r9
+; X64-NEXT: shlq $63, %r9
+; X64-NEXT: bsrq %r9, %r11
+; X64-NEXT: xorq $63, %r11
+; X64-NEXT: testq %r9, %r9
+; X64-NEXT: cmoveq %rdx, %r11
+; X64-NEXT: cmoveq %rax, %r11
+; X64-NEXT: addq $256, %r11 # imm = 0x100
+; X64-NEXT: orq %r13, %rdi
+; X64-NEXT: cmovneq %r8, %r11
+; X64-NEXT: xorl %r9d, %r9d
+; X64-NEXT: subq %r11, %rsi
+; X64-NEXT: movl $0, %r10d
+; X64-NEXT: sbbq %r10, %r10
+; X64-NEXT: movl $0, %r15d
+; X64-NEXT: sbbq %r15, %r15
+; X64-NEXT: movl $0, %r13d
+; X64-NEXT: sbbq %r13, %r13
+; X64-NEXT: sbbq %r9, %r9
+; X64-NEXT: andl $1, %r9d
+; X64-NEXT: testb %cl, %cl
+; X64-NEXT: jne .LBB5_1
+; X64-NEXT: # %bb.2: # %select.false.sink
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: cmpq %rsi, %rax
+; X64-NEXT: movl $0, %eax
+; X64-NEXT: sbbq %r10, %rax
+; X64-NEXT: movl $0, %eax
+; X64-NEXT: sbbq %r15, %rax
+; X64-NEXT: movl $0, %eax
+; X64-NEXT: sbbq %r13, %rax
+; X64-NEXT: movl $0, %eax
+; X64-NEXT: sbbq %r9, %rax
+; X64-NEXT: movl $0, %eax
+; X64-NEXT: sbbq %rax, %rax
+; X64-NEXT: movl $0, %eax
+; X64-NEXT: sbbq %rax, %rax
+; X64-NEXT: sbbq %rcx, %rcx
+; X64-NEXT: setb %al
+; X64-NEXT: .LBB5_3: # %select.end
+; X64-NEXT: xorl %edi, %edi
+; X64-NEXT: testb %al, %al
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64-NEXT: cmovneq %rdi, %r11
+; X64-NEXT: movq %rbx, %rdx
+; X64-NEXT: cmovneq %rdi, %rdx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: cmovneq %rdi, %rcx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT: cmovneq %rdi, %r14
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; X64-NEXT: cmoveq %rbx, %rdi
+; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: jne .LBB5_4
+; X64-NEXT: # %bb.10: # %select.end
+; X64-NEXT: movq %r10, %rdi
+; X64-NEXT: orq %r13, %rdi
+; X64-NEXT: movq %rsi, %r8
+; X64-NEXT: xorq $256, %r8 # imm = 0x100
+; X64-NEXT: orq %r15, %r8
+; X64-NEXT: orq %r9, %r8
+; X64-NEXT: orq %rdi, %r8
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT: je .LBB5_11
+; X64-NEXT: # %bb.8: # %udiv-bb1
+; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %rsi, %r14
+; X64-NEXT: addq $1, %r14
+; X64-NEXT: adcq $0, %r10
+; X64-NEXT: adcq $0, %r15
+; X64-NEXT: adcq $0, %r13
+; X64-NEXT: adcq $0, %r9
+; X64-NEXT: andl $1, %r9d
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; X64-NEXT: xorps %xmm0, %xmm0
+; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; X64-NEXT: movq %r12, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %rbx, {{[0-9]+}}(%rsp)
+; X64-NEXT: movl $256, %eax # imm = 0x100
+; X64-NEXT: subl %esi, %eax
+; X64-NEXT: movl %eax, %ecx
+; X64-NEXT: andl $63, %ecx
+; X64-NEXT: shrl $3, %eax
+; X64-NEXT: andl $56, %eax
+; X64-NEXT: negl %eax
+; X64-NEXT: cltq
+; X64-NEXT: movq 400(%rsp,%rax), %rdi
+; X64-NEXT: movq 408(%rsp,%rax), %rbx
+; X64-NEXT: movq %r13, %rsi
+; X64-NEXT: movq %rbx, %rdx
+; X64-NEXT: shldq %cl, %rdi, %rdx
+; X64-NEXT: movq 384(%rsp,%rax), %r11
+; X64-NEXT: movq 392(%rsp,%rax), %r13
+; X64-NEXT: shldq %cl, %r13, %rdi
+; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq 416(%rsp,%rax), %rdi
+; X64-NEXT: shldq %cl, %rbx, %rdi
+; X64-NEXT: shldq %cl, %r11, %r13
+; X64-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NEXT: shlq %cl, %r11
+; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: orq %rsi, %r10
+; X64-NEXT: movq %r14, %rcx
+; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: orq %r15, %rcx
+; X64-NEXT: orq %r9, %rcx
+; X64-NEXT: orq %r10, %rcx
+; X64-NEXT: je .LBB5_9
+; X64-NEXT: # %bb.5: # %udiv-preheader
+; X64-NEXT: movq %r9, %rsi
+; X64-NEXT: andl $1, %edi
+; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %r12, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; X64-NEXT: movl %r14d, %ecx
+; X64-NEXT: andl $63, %ecx
+; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movl %r14d, %r10d
+; X64-NEXT: shrl $3, %r10d
+; X64-NEXT: andl $56, %r10d
+; X64-NEXT: movq 224(%rsp,%r10), %r8
+; X64-NEXT: movq 216(%rsp,%r10), %r9
+; X64-NEXT: movq %r9, %rax
+; X64-NEXT: shrdq %cl, %r8, %rax
+; X64-NEXT: movq 208(%rsp,%r10), %r8
+; X64-NEXT: movq %r8, %r14
+; X64-NEXT: shrdq %cl, %r9, %r14
+; X64-NEXT: movq 192(%rsp,%r10), %r15
+; X64-NEXT: movq 200(%rsp,%r10), %r9
+; X64-NEXT: movq %r9, %r12
+; X64-NEXT: shrdq %cl, %r8, %r12
+; X64-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NEXT: shrdq %cl, %r9, %r15
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: addq $-1, %rcx
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: adcq $-1, %rcx
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: adcq $-1, %rcx
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: adcq $-1, %rcx
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: adcq $1, %rcx
+; X64-NEXT: andl $1, %ecx
+; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; X64-NEXT: movq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; X64-NEXT: movq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; X64-NEXT: xorl %r9d, %r9d
+; X64-NEXT: xorl %ebx, %ebx
+; X64-NEXT: movq %rax, %r11
+; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; X64-NEXT: .p2align 4
+; X64-NEXT: .LBB5_6: # %udiv-do-while
+; X64-NEXT: # =>This Inner Loop Header: Depth=1
+; X64-NEXT: shldq $1, %r14, %r11
+; X64-NEXT: shldq $1, %r12, %r14
+; X64-NEXT: shldq $1, %r15, %r12
+; X64-NEXT: shrq $63, %rax
+; X64-NEXT: andl $1, %edi
+; X64-NEXT: leaq (%rdi,%r15,2), %r15
+; X64-NEXT: shldq $1, %rcx, %r8
+; X64-NEXT: orq %r9, %r8
+; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: shldq $1, %r13, %rcx
+; X64-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; X64-NEXT: shldq $1, %r10, %r13
+; X64-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; X64-NEXT: shrq $63, %rdx
+; X64-NEXT: addq %r10, %r10
+; X64-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
+; X64-NEXT: orl %edx, %ebx
+; X64-NEXT: movl %ebx, %edi
+; X64-NEXT: andl $1, %edi
+; X64-NEXT: cmpq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT: sbbq %r12, %rdx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT: sbbq %r14, %rdx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT: sbbq %r11, %rdx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT: sbbq %rax, %rdx
+; X64-NEXT: andl $1, %edx
+; X64-NEXT: negq %rdx
+; X64-NEXT: movl %edx, %eax
+; X64-NEXT: andl $1, %eax
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %rdx, %rax
+; X64-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; X64-NEXT: movq %rdx, %r9
+; X64-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
+; X64-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
+; X64-NEXT: subq %rdx, %r15
+; X64-NEXT: sbbq %r9, %r12
+; X64-NEXT: sbbq %r8, %r14
+; X64-NEXT: sbbq %rax, %r11
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT: addq $-1, %rdx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT: adcq $-1, %rax
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT: adcq $-1, %r8
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT: adcq $-1, %r9
+; X64-NEXT: adcq $1, %rsi
+; X64-NEXT: andl $1, %esi
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: orq %r9, %rax
+; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: orq %r8, %rdx
+; X64-NEXT: orq %rsi, %rdx
+; X64-NEXT: orq %rax, %rdx
+; X64-NEXT: movl $0, %eax
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movl $0, %eax
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movl $0, %r9d
+; X64-NEXT: movl $0, %ebx
+; X64-NEXT: movq %r11, %rax
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: jne .LBB5_6
+; X64-NEXT: .LBB5_7: # %udiv-loop-exit
+; X64-NEXT: movq %r8, %r14
+; X64-NEXT: shldq $1, %rcx, %r14
+; X64-NEXT: shldq $1, %r13, %rcx
+; X64-NEXT: shldq $1, %r10, %r13
+; X64-NEXT: shrq $63, %r8
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT: leaq (%rax,%r10,2), %rdx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; X64-NEXT: movq %r13, %r11
+; X64-NEXT: .LBB5_11: # %udiv-end
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT: xorq %rsi, %r8
+; X64-NEXT: xorq %r12, %r14
+; X64-NEXT: xorq %r12, %rcx
+; X64-NEXT: xorq %r12, %r11
+; X64-NEXT: xorq %r12, %rdx
+; X64-NEXT: subq %r12, %rdx
+; X64-NEXT: sbbq %r12, %r11
+; X64-NEXT: sbbq %r12, %rcx
+; X64-NEXT: sbbq %r12, %r14
+; X64-NEXT: sbbq %rsi, %r8
+; X64-NEXT: movq %rdx, (%rax)
+; X64-NEXT: movq %r11, 8(%rax)
+; X64-NEXT: movq %rcx, 16(%rax)
+; X64-NEXT: movq %r14, 24(%rax)
+; X64-NEXT: andl $1, %r8d
+; X64-NEXT: movb %r8b, 32(%rax)
+; X64-NEXT: leaq -40(%rbp), %rsp
+; X64-NEXT: popq %rbx
+; X64-NEXT: popq %r12
+; X64-NEXT: popq %r13
+; X64-NEXT: popq %r14
+; X64-NEXT: popq %r15
+; X64-NEXT: popq %rbp
+; X64-NEXT: retq
+; X64-NEXT: .LBB5_1:
+; X64-NEXT: movb $1, %al
+; X64-NEXT: jmp .LBB5_3
+; X64-NEXT: .LBB5_9:
+; X64-NEXT: movq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; X64-NEXT: jmp .LBB5_7
+; X64-NEXT: .LBB5_4:
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT: jmp .LBB5_11
%res = sdiv i257 %a, %b
ret i257 %res
}
diff --git a/llvm/test/CodeGen/X86/umul-with-overflow.ll b/llvm/test/CodeGen/X86/umul-with-overflow.ll
index ccabb360a990c..0b567dca3b362 100644
--- a/llvm/test/CodeGen/X86/umul-with-overflow.ll
+++ b/llvm/test/CodeGen/X86/umul-with-overflow.ll
@@ -514,99 +514,117 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
; X64-LABEL: test4:
; X64: # %bb.0:
; X64-NEXT: pushq %rbp
+; X64-NEXT: movq %rsp, %rbp
; X64-NEXT: pushq %r15
; X64-NEXT: pushq %r14
; X64-NEXT: pushq %r13
; X64-NEXT: pushq %r12
; X64-NEXT: pushq %rbx
-; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %r8, %r11
-; X64-NEXT: movq %rcx, %r8
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %r12
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9
-; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rdx, %rbx
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rdx, %r14
-; X64-NEXT: movq %rax, %r15
-; X64-NEXT: addq %rbx, %r15
-; X64-NEXT: adcq $0, %r14
-; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: mulq %r12
-; X64-NEXT: movq %rdx, %rbp
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %r15, %rbx
-; X64-NEXT: adcq %r14, %rbp
-; X64-NEXT: setb %al
-; X64-NEXT: movzbl %al, %r10d
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: mulq %r12
-; X64-NEXT: movq %rdx, %r12
-; X64-NEXT: movq %rax, %r13
-; X64-NEXT: addq %rbp, %r13
-; X64-NEXT: adcq %r10, %r12
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: mulq %r9
+; X64-NEXT: andq $-32, %rsp
+; X64-NEXT: subq $256, %rsp # imm = 0x100
+; X64-NEXT: movq %r9, %r13
+; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %rcx, (%rsp) # 8-byte Spill
; X64-NEXT: movq %rdx, %r15
-; X64-NEXT: movq %rax, %r14
-; X64-NEXT: movq %r11, %rax
-; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rdx, %rbp
-; X64-NEXT: movq %rax, %r10
-; X64-NEXT: addq %r15, %r10
-; X64-NEXT: adcq $0, %rbp
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9
-; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rax, %r15
-; X64-NEXT: addq %r10, %r15
-; X64-NEXT: adcq %rbp, %rdx
-; X64-NEXT: imulq %r9, %r11
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9
-; X64-NEXT: addq %r13, %r14
-; X64-NEXT: adcq %r12, %r15
-; X64-NEXT: adcq %rdx, %r11
-; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rdx, %r10
-; X64-NEXT: movq %rax, %r12
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rdx, %r13
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: addq %r10, %rbp
-; X64-NEXT: adcq $0, %r13
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: mulq %r10
-; X64-NEXT: addq %rbp, %rax
-; X64-NEXT: adcq %r13, %rdx
-; X64-NEXT: imulq %r10, %rcx
-; X64-NEXT: addq %rdx, %rcx
-; X64-NEXT: addq %r14, %r12
-; X64-NEXT: adcq %r15, %rax
-; X64-NEXT: adcq %r11, %rcx
-; X64-NEXT: imulq %r9, %r8
+; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %rsi, %r14
+; X64-NEXT: movq %rdi, %rbx
+; X64-NEXT: movabsq $17592186044415, %r12 # imm = 0xFFFFFFFFFFF
+; X64-NEXT: andq %r12, %r13
+; X64-NEXT: andq 48(%rbp), %r12
+; X64-NEXT: movq 16(%rbp), %r9
+; X64-NEXT: subq $8, %rsp
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq %rcx, %rsi
+; X64-NEXT: movq %r8, %rdx
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: xorl %r8d, %r8d
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq 24(%rbp)
+; X64-NEXT: callq __multi5 at PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r14, %rsi
+; X64-NEXT: movq %r15, %rdx
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: xorl %r8d, %r8d
+; X64-NEXT: movq 16(%rbp), %r9
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq 24(%rbp)
+; X64-NEXT: callq __multi5 at PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq %r14, %rsi
+; X64-NEXT: movq %r15, %rdx
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: xorl %r8d, %r8d
+; X64-NEXT: movq 32(%rbp), %r15
+; X64-NEXT: movq %r15, %r9
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq $0
+; X64-NEXT: movq 40(%rbp), %r14
+; X64-NEXT: pushq %r14
+; X64-NEXT: callq __multi5 at PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT: imulq {{[0-9]+}}(%rsp), %rdx
-; X64-NEXT: imulq {{[0-9]+}}(%rsp), %rsi
-; X64-NEXT: addq %rdx, %rsi
-; X64-NEXT: addq %r8, %rsi
-; X64-NEXT: addq %rcx, %rsi
-; X64-NEXT: movq %rbx, 8(%rdi)
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT: movq %rcx, (%rdi)
-; X64-NEXT: movq %r12, 16(%rdi)
-; X64-NEXT: movq %rax, 24(%rdi)
-; X64-NEXT: movl %esi, 32(%rdi)
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: xorl %r8d, %r8d
+; X64-NEXT: movq %r15, %r9
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq %r14
+; X64-NEXT: callq __multi5 at PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq %r13, %rsi
+; X64-NEXT: xorl %edx, %edx
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: xorl %r8d, %r8d
+; X64-NEXT: movq 16(%rbp), %r9
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq 24(%rbp)
+; X64-NEXT: callq __multi5 at PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq %r12, %rsi
+; X64-NEXT: xorl %edx, %edx
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: xorl %r8d, %r8d
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq $0
+; X64-NEXT: pushq {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Reload
+; X64-NEXT: callq __multi5 at PLT
+; X64-NEXT: addq $32, %rsp
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT: addq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT: adcq $0, %rax
+; X64-NEXT: addq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT: addq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT: addq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT: addq %rax, %rsi
+; X64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; X64-NEXT: movaps %xmm0, (%rbx)
+; X64-NEXT: movq %rcx, 16(%rbx)
+; X64-NEXT: movq %rdx, 24(%rbx)
+; X64-NEXT: movl %esi, 32(%rbx)
; X64-NEXT: shrq $32, %rsi
; X64-NEXT: andl $4095, %esi # imm = 0xFFF
-; X64-NEXT: movw %si, 36(%rdi)
-; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: movw %si, 36(%rbx)
+; X64-NEXT: movq %rbx, %rax
+; X64-NEXT: leaq -40(%rbp), %rsp
; X64-NEXT: popq %rbx
; X64-NEXT: popq %r12
; X64-NEXT: popq %r13
diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
index 65b602801b365..c179c220fa16a 100644
--- a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -2730,7 +2730,11 @@ define void @ashr_16bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE2-LABEL: lshr_32bytes:
; X64-NO-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbp
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rsp, %rbp
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: subq $96, %rsp
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rcx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
@@ -2738,22 +2742,22 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (,%rsi,8), %eax
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, (%rsp)
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andb $24, %sil
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl %sil, %r9d
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -64(%rsp,%r9), %r10
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -56(%rsp,%r9), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq (%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rsp,%r9), %rdi
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, %r11
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r11
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %sil
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -48(%rsp,%r9), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rsp,%r9), %rbx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%rbx,%rbx), %r8
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r8
@@ -2766,7 +2770,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r10, %rdi
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rbx
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -40(%rsp,%r9), %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rsp,%r9), %r9
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%r9,%r9), %r10
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r10
@@ -2777,11 +2781,17 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, 16(%rdx)
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, (%rdx)
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq -8(%rbp), %rsp
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %rbp
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: retq
;
; X64-HAVE-SHLD-NO-BMI2-SSE2-LABEL: lshr_32bytes:
; X64-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rsp, %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: subq $96, %rsp
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rax
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
@@ -2789,20 +2799,20 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: leal (,%rsi,8), %ecx
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, (%rsp)
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andb $24, %sil
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl %sil, %eax
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -56(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -72(%rsp,%rax), %rdi
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -64(%rsp,%rax), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq (%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rsp,%rax), %r8
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, %r9
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rsi, %r9
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -48(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rsp,%rax), %rax
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rax, %rsi
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r8, %rdi
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx
@@ -2811,10 +2821,16 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, 24(%rdx)
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, (%rdx)
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rbp, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popq %rbp
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retq
;
; X64-NO-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_32bytes:
; X64-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsp, %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: subq $96, %rsp
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rcx
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
@@ -2822,28 +2838,28 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%rsi,8), %eax
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, (%rsp)
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ecx
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andb $24, %sil
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %sil, %esi
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -64(%rsp,%rsi), %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -56(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rsp,%rsi), %r8
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %rdi, %r9
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %al
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%r8,%r8), %r10
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rax, %r10, %r10
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %r10
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, -72(%rsp,%rsi), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, (%rsp,%rsi), %r9
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq %rdi, %rdi
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rax, %rdi, %rdi
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %rdi
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %r8, %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -48(%rsp,%rsi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rsp,%rsi), %rsi
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%rsi,%rsi), %r9
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rax, %r9, %rax
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r8, %rax
@@ -2852,10 +2868,16 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 16(%rdx)
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, (%rdx)
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbp, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %rbp
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retq
;
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_32bytes:
; X64-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsp, %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: subq $96, %rsp
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rax
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
@@ -2863,20 +2885,20 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%rsi,8), %ecx
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, (%rsp)
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andb $24, %sil
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %sil, %eax
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -56(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -72(%rsp,%rax), %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -64(%rsp,%rax), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rsp,%rax), %r8
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, %r9
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %rsi, %r9
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -48(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rsp,%rax), %rax
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %rax, %rsi
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r8, %rdi
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %rax, %rax
@@ -2884,24 +2906,30 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 24(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, (%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbp, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popq %rbp
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retq
;
; X64-NO-SHLD-NO-BMI2-SSE4-LABEL: lshr_32bytes:
; X64-NO-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rsp, %rbp
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: subq $96, %rsp
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%rsi), %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (,%rcx,8), %eax
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, (%rsp)
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andb $24, %cl
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl %cl, %r9d
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -64(%rsp,%r9), %r10
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -56(%rsp,%r9), %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq (%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 8(%rsp,%r9), %r8
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r10
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %esi
@@ -2910,11 +2938,11 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %rdi
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r10, %rdi
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -48(%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 16(%rsp,%r9), %r10
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, %r11
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r11
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -40(%rsp,%r9), %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 24(%rsp,%r9), %r9
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r9,%r9), %rbx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %rbx
@@ -2931,28 +2959,34 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 8(%rdx)
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rbx, 16(%rdx)
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq -8(%rbp), %rsp
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %rbp
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: retq
;
; X64-HAVE-SHLD-NO-BMI2-SSE4-LABEL: lshr_32bytes:
; X64-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rsp, %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: subq $96, %rsp
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%rsi), %eax
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: leal (,%rax,8), %ecx
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, (%rsp)
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andb $24, %al
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl %al, %eax
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -48(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -56(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 24(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 16(%rsp,%rax), %rdi
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, %r8
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rsi, %r8
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -72(%rsp,%rax), %r9
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -64(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq (%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 8(%rsp,%rax), %rax
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, %r10
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rdi, %r10
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rax, %r9
@@ -2962,31 +2996,37 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, 16(%rdx)
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rsi, 24(%rdx)
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rbp, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popq %rbp
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retq
;
; X64-NO-SHLD-HAVE-BMI2-SSE4-LABEL: lshr_32bytes:
; X64-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rsp, %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: subq $96, %rsp
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%rsi), %ecx
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rcx,8), %eax
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, (%rsp)
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %esi
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andb $24, %cl
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %cl, %ecx
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, -72(%rsp,%rcx), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, (%rsp,%rcx), %rdi
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %al
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -64(%rsp,%rcx), %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -56(%rsp,%rcx), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 8(%rsp,%rcx), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 16(%rsp,%rcx), %r9
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%r8,%r8), %r10
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rax, %r10, %r10
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %r10
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r9, %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -48(%rsp,%rcx), %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 24(%rsp,%rcx), %rcx
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%rcx,%rcx), %r11
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rax, %r11, %r11
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %r11
@@ -2999,27 +3039,33 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 8(%rdx)
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r11, 16(%rdx)
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rbp, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %rbp
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retq
;
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: lshr_32bytes:
; X64-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rsp, %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: subq $96, %rsp
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%rsi), %eax
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rax,8), %ecx
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, (%rsp)
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andb $24, %al
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %al, %eax
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -48(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -56(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 24(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 16(%rsp,%rax), %rdi
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, %r8
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rsi, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -72(%rsp,%rax), %r9
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -64(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq (%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 8(%rsp,%rax), %rax
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, %r10
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rdi, %r10
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rax, %r9
@@ -3028,21 +3074,27 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r8, 16(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 24(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rbp, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popq %rbp
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retq
;
; X64-NO-SHLD-NO-BMI2-AVX-LABEL: lshr_32bytes:
; X64-NO-SHLD-NO-BMI2-AVX: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %rbp
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rsp, %rbp
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: subq $96, %rsp
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl (%rsi), %ecx
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leal (,%rcx,8), %eax
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovaps %ymm0, (%rsp)
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: andb $24, %cl
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl %cl, %r9d
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -64(%rsp,%r9), %r10
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -56(%rsp,%r9), %r8
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq (%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 8(%rsp,%r9), %r8
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r10
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %esi
@@ -3051,11 +3103,11 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %rdi
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r10, %rdi
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -48(%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 16(%rsp,%r9), %r10
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, %r11
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r11
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -40(%rsp,%r9), %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 24(%rsp,%r9), %r9
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%r9,%r9), %rbx
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %rbx
@@ -3072,26 +3124,32 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, 8(%rdx)
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rbx, 16(%rdx)
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq -8(%rbp), %rsp
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %rbp
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vzeroupper
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: retq
;
; X64-HAVE-SHLD-NO-BMI2-AVX-LABEL: lshr_32bytes:
; X64-HAVE-SHLD-NO-BMI2-AVX: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rsp, %rbp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: subq $96, %rsp
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl (%rsi), %eax
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: leal (,%rax,8), %ecx
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovaps %ymm0, (%rsp)
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: andb $24, %al
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl %al, %eax
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -48(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -56(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 24(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 16(%rsp,%rax), %rdi
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, %r8
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rsi, %r8
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -72(%rsp,%rax), %r9
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -64(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq (%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 8(%rsp,%rax), %rax
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rax, %r10
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rdi, %r10
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rax, %r9
@@ -3101,60 +3159,72 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r8, 16(%rdx)
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rsi, 24(%rdx)
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rbp, %rsp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: popq %rbp
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vzeroupper
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: retq
;
; X64-NO-SHLD-HAVE-BMI2-AVX-LABEL: lshr_32bytes:
; X64-NO-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rsp, %rbp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: subq $96, %rsp
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %eax
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (,%rax,8), %ecx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %ecx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (,%rcx,8), %eax
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, %esi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: andb $24, %al
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %al, %eax
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, -72(%rsp,%rax), %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %cl
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -64(%rsp,%rax), %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -56(%rsp,%rax), %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %ymm0, (%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %esi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: andb $24, %cl
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %cl, %ecx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, (%rsp,%rcx), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %al
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 8(%rsp,%rcx), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 16(%rsp,%rcx), %r9
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%r8,%r8), %r10
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rax, %r10, %r10
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %r10
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %r9, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -48(%rsp,%rax), %rax
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%rax,%rax), %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 24(%rsp,%rcx), %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%rcx,%rcx), %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rax, %r11, %r11
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %r11
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %r8, %rdi
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: addq %r9, %r9
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r9, %rcx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %rcx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %rax, %rax
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 24(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rax, %r9, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %rcx, %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 8(%rdx)
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r11, 16(%rdx)
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r10, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rbp, %rsp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: popq %rbp
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: retq
;
; X64-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: lshr_32bytes:
; X64-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rsp, %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: subq $96, %rsp
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %eax
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: leal (,%rax,8), %ecx
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %ymm0, (%rsp)
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andb $24, %al
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %al, %eax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -48(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -56(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 24(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 16(%rsp,%rax), %rdi
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, %r8
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rsi, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -72(%rsp,%rax), %r9
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -64(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq (%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 8(%rsp,%rax), %rax
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, %r10
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rdi, %r10
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rax, %r9
@@ -3163,6 +3233,8 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r8, 16(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 24(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rbp, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popq %rbp
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retq
;
@@ -4210,7 +4282,11 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE2-LABEL: lshr_32bytes_dwordOff:
; X64-NO-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbp
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rsp, %rbp
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: subq $96, %rsp
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rcx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
@@ -4219,22 +4295,22 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %eax
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlb $5, %al
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, (%rsp)
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andb $6, %sil
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl %sil, %r9d
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -64(%rsp,%r9,4), %r10
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -56(%rsp,%r9,4), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq (%rsp,%r9,4), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rsp,%r9,4), %rdi
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, %r11
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r11
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %sil
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -48(%rsp,%r9,4), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rsp,%r9,4), %rbx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%rbx,%rbx), %r8
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r8
@@ -4247,7 +4323,7 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r10, %rdi
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rbx
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -40(%rsp,%r9,4), %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rsp,%r9,4), %r9
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%r9,%r9), %r10
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r10
@@ -4258,11 +4334,17 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, 16(%rdx)
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, (%rdx)
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq -8(%rbp), %rsp
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %rbp
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: retq
;
; X64-HAVE-SHLD-NO-BMI2-SSE2-LABEL: lshr_32bytes_dwordOff:
; X64-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rsp, %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: subq $96, %rsp
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rax
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
@@ -4271,20 +4353,20 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shlb $5, %cl
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, (%rsp)
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andb $6, %sil
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl %sil, %eax
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -56(%rsp,%rax,4), %rsi
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -72(%rsp,%rax,4), %rdi
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -64(%rsp,%rax,4), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rsp,%rax,4), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq (%rsp,%rax,4), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rsp,%rax,4), %r8
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, %r9
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rsi, %r9
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -48(%rsp,%rax,4), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rsp,%rax,4), %rax
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rax, %rsi
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r8, %rdi
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rax
@@ -4292,10 +4374,16 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, 24(%rdx)
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, (%rdx)
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rbp, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popq %rbp
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retq
;
; X64-NO-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_32bytes_dwordOff:
; X64-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsp, %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: subq $96, %rsp
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rcx
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
@@ -4304,28 +4392,28 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, %eax
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $5, %al
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, (%rsp)
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ecx
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andb $6, %sil
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %sil, %esi
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -64(%rsp,%rsi,4), %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -56(%rsp,%rsi,4), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rsp,%rsi,4), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rsp,%rsi,4), %r8
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %rdi, %r9
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %al
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%r8,%r8), %r10
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rax, %r10, %r10
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %r10
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, -72(%rsp,%rsi,4), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, (%rsp,%rsi,4), %r9
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq %rdi, %rdi
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rax, %rdi, %rdi
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %rdi
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %r8, %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -48(%rsp,%rsi,4), %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rsp,%rsi,4), %rsi
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%rsi,%rsi), %r9
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rax, %r9, %rax
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r8, %rax
@@ -4334,10 +4422,16 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 16(%rdx)
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, (%rdx)
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbp, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %rbp
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retq
;
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_32bytes_dwordOff:
; X64-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsp, %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: subq $96, %rsp
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rax
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
@@ -4346,20 +4440,20 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, %ecx
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $5, %cl
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, (%rsp)
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andb $6, %sil
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %sil, %eax
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -56(%rsp,%rax,4), %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -72(%rsp,%rax,4), %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -64(%rsp,%rax,4), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rsp,%rax,4), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rsp,%rax,4), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rsp,%rax,4), %r8
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, %r9
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %rsi, %r9
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -48(%rsp,%rax,4), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rsp,%rax,4), %rax
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %rax, %rsi
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r8, %rdi
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %rax, %rax
@@ -4367,25 +4461,31 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 24(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, (%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbp, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popq %rbp
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retq
;
; X64-NO-SHLD-NO-BMI2-SSE4-LABEL: lshr_32bytes_dwordOff:
; X64-NO-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rsp, %rbp
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: subq $96, %rsp
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%rsi), %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, %eax
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlb $5, %al
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, (%rsp)
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andb $6, %cl
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl %cl, %r9d
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -64(%rsp,%r9,4), %r10
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -56(%rsp,%r9,4), %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq (%rsp,%r9,4), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 8(%rsp,%r9,4), %r8
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r10
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %esi
@@ -4394,11 +4494,11 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %rdi
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r10, %rdi
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -48(%rsp,%r9,4), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 16(%rsp,%r9,4), %r10
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, %r11
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r11
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -40(%rsp,%r9,4), %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 24(%rsp,%r9,4), %r9
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r9,%r9), %rbx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %rbx
@@ -4415,29 +4515,35 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 8(%rdx)
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rbx, 16(%rdx)
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq -8(%rbp), %rsp
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %rbp
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: retq
;
; X64-HAVE-SHLD-NO-BMI2-SSE4-LABEL: lshr_32bytes_dwordOff:
; X64-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rsp, %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: subq $96, %rsp
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%rsi), %eax
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shlb $5, %cl
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, (%rsp)
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andb $6, %al
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl %al, %eax
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -48(%rsp,%rax,4), %rsi
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -56(%rsp,%rax,4), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 24(%rsp,%rax,4), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 16(%rsp,%rax,4), %rdi
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, %r8
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rsi, %r8
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -72(%rsp,%rax,4), %r9
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -64(%rsp,%rax,4), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq (%rsp,%rax,4), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 8(%rsp,%rax,4), %rax
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, %r10
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rdi, %r10
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rax, %r9
@@ -4446,32 +4552,38 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, 16(%rdx)
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rsi, 24(%rdx)
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rbp, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popq %rbp
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retq
;
; X64-NO-SHLD-HAVE-BMI2-SSE4-LABEL: lshr_32bytes_dwordOff:
; X64-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rsp, %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: subq $96, %rsp
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%rsi), %ecx
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, %eax
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $5, %al
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, (%rsp)
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %esi
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andb $6, %cl
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %cl, %ecx
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, -72(%rsp,%rcx,4), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, (%rsp,%rcx,4), %rdi
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %al
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -64(%rsp,%rcx,4), %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -56(%rsp,%rcx,4), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 8(%rsp,%rcx,4), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 16(%rsp,%rcx,4), %r9
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%r8,%r8), %r10
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rax, %r10, %r10
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %r10
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r9, %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -48(%rsp,%rcx,4), %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 24(%rsp,%rcx,4), %rcx
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%rcx,%rcx), %r11
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rax, %r11, %r11
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %r11
@@ -4484,28 +4596,34 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 8(%rdx)
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r11, 16(%rdx)
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rbp, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %rbp
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retq
;
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: lshr_32bytes_dwordOff:
; X64-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rsp, %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: subq $96, %rsp
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%rsi), %eax
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ecx
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $5, %cl
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, (%rsp)
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andb $6, %al
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %al, %eax
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -48(%rsp,%rax,4), %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -56(%rsp,%rax,4), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 24(%rsp,%rax,4), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 16(%rsp,%rax,4), %rdi
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, %r8
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rsi, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -72(%rsp,%rax,4), %r9
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -64(%rsp,%rax,4), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq (%rsp,%rax,4), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 8(%rsp,%rax,4), %rax
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, %r10
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rdi, %r10
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rax, %r9
@@ -4514,22 +4632,28 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r8, 16(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 24(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rbp, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popq %rbp
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retq
;
; X64-NO-SHLD-NO-BMI2-AVX-LABEL: lshr_32bytes_dwordOff:
; X64-NO-SHLD-NO-BMI2-AVX: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %rbp
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rsp, %rbp
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: subq $96, %rsp
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl (%rsi), %ecx
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, %eax
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlb $5, %al
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovaps %ymm0, (%rsp)
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: andb $6, %cl
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl %cl, %r9d
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -64(%rsp,%r9,4), %r10
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -56(%rsp,%r9,4), %r8
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq (%rsp,%r9,4), %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 8(%rsp,%r9,4), %r8
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r10
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %esi
@@ -4538,11 +4662,11 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %rdi
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r10, %rdi
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -48(%rsp,%r9,4), %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 16(%rsp,%r9,4), %r10
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, %r11
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r11
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -40(%rsp,%r9,4), %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 24(%rsp,%r9,4), %r9
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%r9,%r9), %rbx
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %rbx
@@ -4559,27 +4683,33 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, 8(%rdx)
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rbx, 16(%rdx)
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq -8(%rbp), %rsp
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %rbp
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vzeroupper
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: retq
;
; X64-HAVE-SHLD-NO-BMI2-AVX-LABEL: lshr_32bytes_dwordOff:
; X64-HAVE-SHLD-NO-BMI2-AVX: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rsp, %rbp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: subq $96, %rsp
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl (%rsi), %eax
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shlb $5, %cl
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovaps %ymm0, (%rsp)
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: andb $6, %al
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl %al, %eax
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -48(%rsp,%rax,4), %rsi
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -56(%rsp,%rax,4), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 24(%rsp,%rax,4), %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 16(%rsp,%rax,4), %rdi
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, %r8
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rsi, %r8
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -72(%rsp,%rax,4), %r9
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -64(%rsp,%rax,4), %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq (%rsp,%rax,4), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 8(%rsp,%rax,4), %rax
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rax, %r10
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rdi, %r10
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rax, %r9
@@ -4588,62 +4718,74 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r8, 16(%rdx)
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rsi, 24(%rdx)
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rbp, %rsp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: popq %rbp
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vzeroupper
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: retq
;
; X64-NO-SHLD-HAVE-BMI2-AVX-LABEL: lshr_32bytes_dwordOff:
; X64-NO-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rsp, %rbp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: subq $96, %rsp
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %eax
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlb $5, %cl
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %ecx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, %eax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlb $5, %al
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, %esi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: andb $6, %al
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %al, %eax
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, -72(%rsp,%rax,4), %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %cl
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -64(%rsp,%rax,4), %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -56(%rsp,%rax,4), %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %ymm0, (%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %esi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: andb $6, %cl
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %cl, %ecx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, (%rsp,%rcx,4), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %al
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 8(%rsp,%rcx,4), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 16(%rsp,%rcx,4), %r9
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%r8,%r8), %r10
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rax, %r10, %r10
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %r10
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %r9, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -48(%rsp,%rax,4), %rax
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%rax,%rax), %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 24(%rsp,%rcx,4), %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%rcx,%rcx), %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rax, %r11, %r11
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %r11
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %r8, %rdi
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: addq %r9, %r9
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r9, %rcx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %rcx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %rax, %rax
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 24(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rax, %r9, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %rcx, %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 8(%rdx)
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r11, 16(%rdx)
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r10, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rbp, %rsp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: popq %rbp
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: retq
;
; X64-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: lshr_32bytes_dwordOff:
; X64-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rsp, %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: subq $96, %rsp
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %eax
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ecx
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shlb $5, %cl
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %ymm0, (%rsp)
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andb $6, %al
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %al, %eax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -48(%rsp,%rax,4), %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -56(%rsp,%rax,4), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 24(%rsp,%rax,4), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 16(%rsp,%rax,4), %rdi
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, %r8
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rsi, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -72(%rsp,%rax,4), %r9
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -64(%rsp,%rax,4), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq (%rsp,%rax,4), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 8(%rsp,%rax,4), %rax
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, %r10
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rdi, %r10
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rax, %r9
@@ -4652,6 +4794,8 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r8, 16(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 24(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rbp, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popq %rbp
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retq
;
@@ -4769,58 +4913,76 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
define void @lshr_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind {
; X64-SSE2-LABEL: lshr_32bytes_qwordOff:
; X64-SSE2: # %bb.0:
+; X64-SSE2-NEXT: pushq %rbp
+; X64-SSE2-NEXT: movq %rsp, %rbp
+; X64-SSE2-NEXT: andq $-32, %rsp
+; X64-SSE2-NEXT: subq $96, %rsp
; X64-SSE2-NEXT: movq (%rdi), %rax
; X64-SSE2-NEXT: movq 8(%rdi), %rcx
; X64-SSE2-NEXT: movq 16(%rdi), %r8
; X64-SSE2-NEXT: movq 24(%rdi), %rdi
; X64-SSE2-NEXT: movzbl (%rsi), %esi
; X64-SSE2-NEXT: xorps %xmm0, %xmm0
-; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movq %rax, (%rsp)
; X64-SSE2-NEXT: andl $3, %esi
-; X64-SSE2-NEXT: movq -72(%rsp,%rsi,8), %rax
-; X64-SSE2-NEXT: movq -64(%rsp,%rsi,8), %rcx
-; X64-SSE2-NEXT: movq -48(%rsp,%rsi,8), %rdi
-; X64-SSE2-NEXT: movq -56(%rsp,%rsi,8), %rsi
+; X64-SSE2-NEXT: movq (%rsp,%rsi,8), %rax
+; X64-SSE2-NEXT: movq 8(%rsp,%rsi,8), %rcx
+; X64-SSE2-NEXT: movq 24(%rsp,%rsi,8), %rdi
+; X64-SSE2-NEXT: movq 16(%rsp,%rsi,8), %rsi
; X64-SSE2-NEXT: movq %rsi, 16(%rdx)
; X64-SSE2-NEXT: movq %rdi, 24(%rdx)
; X64-SSE2-NEXT: movq %rax, (%rdx)
; X64-SSE2-NEXT: movq %rcx, 8(%rdx)
+; X64-SSE2-NEXT: movq %rbp, %rsp
+; X64-SSE2-NEXT: popq %rbp
; X64-SSE2-NEXT: retq
;
; X64-SSE42-LABEL: lshr_32bytes_qwordOff:
; X64-SSE42: # %bb.0:
+; X64-SSE42-NEXT: pushq %rbp
+; X64-SSE42-NEXT: movq %rsp, %rbp
+; X64-SSE42-NEXT: andq $-32, %rsp
+; X64-SSE42-NEXT: subq $96, %rsp
; X64-SSE42-NEXT: movups (%rdi), %xmm0
; X64-SSE42-NEXT: movups 16(%rdi), %xmm1
; X64-SSE42-NEXT: movzbl (%rsi), %eax
; X64-SSE42-NEXT: xorps %xmm2, %xmm2
-; X64-SSE42-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm0, (%rsp)
; X64-SSE42-NEXT: andl $3, %eax
-; X64-SSE42-NEXT: movups -72(%rsp,%rax,8), %xmm0
-; X64-SSE42-NEXT: movups -56(%rsp,%rax,8), %xmm1
+; X64-SSE42-NEXT: movups (%rsp,%rax,8), %xmm0
+; X64-SSE42-NEXT: movups 16(%rsp,%rax,8), %xmm1
; X64-SSE42-NEXT: movups %xmm1, 16(%rdx)
; X64-SSE42-NEXT: movups %xmm0, (%rdx)
+; X64-SSE42-NEXT: movq %rbp, %rsp
+; X64-SSE42-NEXT: popq %rbp
; X64-SSE42-NEXT: retq
;
; X64-AVX-LABEL: lshr_32bytes_qwordOff:
; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: pushq %rbp
+; X64-AVX-NEXT: movq %rsp, %rbp
+; X64-AVX-NEXT: andq $-32, %rsp
+; X64-AVX-NEXT: subq $96, %rsp
; X64-AVX-NEXT: vmovups (%rdi), %ymm0
; X64-AVX-NEXT: movzbl (%rsi), %eax
; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: vmovaps %ymm0, (%rsp)
; X64-AVX-NEXT: andl $3, %eax
-; X64-AVX-NEXT: vmovups -72(%rsp,%rax,8), %xmm0
-; X64-AVX-NEXT: vmovups -56(%rsp,%rax,8), %xmm1
+; X64-AVX-NEXT: vmovups (%rsp,%rax,8), %xmm0
+; X64-AVX-NEXT: vmovups 16(%rsp,%rax,8), %xmm1
; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx)
; X64-AVX-NEXT: vmovups %xmm0, (%rdx)
+; X64-AVX-NEXT: movq %rbp, %rsp
+; X64-AVX-NEXT: popq %rbp
; X64-AVX-NEXT: vzeroupper
; X64-AVX-NEXT: retq
;
@@ -4938,7 +5100,11 @@ define void @lshr_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no
define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE2-LABEL: shl_32bytes:
; X64-NO-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbp
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rsp, %rbp
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: subq $96, %rsp
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rcx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
@@ -4946,17 +5112,17 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (,%rsi,8), %eax
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, (%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andb $24, %sil
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: negb %sil
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movsbq %sil, %r10
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -32(%rsp,%r10), %r8
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -24(%rsp,%r10), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 32(%rsp,%r10), %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 40(%rsp,%r10), %rdi
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, %r11
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r11
@@ -4967,10 +5133,10 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r9
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r11, %r9
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -8(%rsp,%r10), %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 56(%rsp,%r10), %r11
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r11
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -16(%rsp,%r10), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 48(%rsp,%r10), %r10
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, %rbx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %rbx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
@@ -4988,11 +5154,17 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, 16(%rdx)
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, 24(%rdx)
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq -8(%rbp), %rsp
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %rbp
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: retq
;
; X64-HAVE-SHLD-NO-BMI2-SSE2-LABEL: shl_32bytes:
; X64-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rsp, %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: subq $96, %rsp
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rax
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
@@ -5000,20 +5172,20 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: leal (,%rsi,8), %ecx
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, (%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andb $24, %sil
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: negb %sil
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movsbq %sil, %rax
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -24(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -16(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 48(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 56(%rsp,%rax), %rdi
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %rsi, %rdi
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -40(%rsp,%rax), %r8
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -32(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 32(%rsp,%rax), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 40(%rsp,%rax), %rax
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %rax, %rsi
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %r8, %rax
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx
@@ -5022,10 +5194,16 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, 24(%rdx)
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, (%rdx)
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rbp, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popq %rbp
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retq
;
; X64-NO-SHLD-HAVE-BMI2-SSE2-LABEL: shl_32bytes:
; X64-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsp, %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: subq $96, %rsp
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rcx
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
@@ -5033,26 +5211,26 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%rsi,8), %eax
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, (%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ecx
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andb $24, %sil
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: negb %sil
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movsbq %sil, %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -40(%rsp,%rdi), %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -32(%rsp,%rdi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 32(%rsp,%rdi), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 40(%rsp,%rdi), %rsi
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %rsi, %r9
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %al
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r8, %r10
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %r8
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %r8, %r8
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, -16(%rsp,%rdi), %r9
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -24(%rsp,%rdi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, 56(%rsp,%rdi), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 48(%rsp,%rdi), %rdi
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %rdi, %rcx
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %rdi
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %rdi, %rdi
@@ -5064,10 +5242,16 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 16(%rdx)
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, 24(%rdx)
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbp, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %rbp
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retq
;
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: shl_32bytes:
; X64-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsp, %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: subq $96, %rsp
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rax
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
@@ -5075,20 +5259,20 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%rsi,8), %ecx
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, (%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andb $24, %sil
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: negb %sil
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movsbq %sil, %rax
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -24(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -16(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 48(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 56(%rsp,%rax), %rdi
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %rsi, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -40(%rsp,%rax), %r8
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -32(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 32(%rsp,%rax), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 40(%rsp,%rax), %rax
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %rax, %rsi
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %r8, %rax
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r8, %rcx
@@ -5096,28 +5280,34 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, 24(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, (%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbp, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popq %rbp
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retq
;
; X64-NO-SHLD-NO-BMI2-SSE4-LABEL: shl_32bytes:
; X64-NO-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rsp, %rbp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: subq $96, %rsp
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%rsi), %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (,%rcx,8), %eax
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, (%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andb $24, %cl
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: negb %cl
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movsbq %cl, %r8
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -16(%rsp,%r8), %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 56(%rsp,%r8), %r9
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r9
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %esi
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %sil
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -24(%rsp,%r8), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 48(%rsp,%r8), %r10
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, %rdi
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %rdi
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
@@ -5125,8 +5315,8 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r9, %rdi
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r10
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -40(%rsp,%r8), %r9
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -32(%rsp,%r8), %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 32(%rsp,%r8), %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 40(%rsp,%r8), %r8
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, %r11
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %r11
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
@@ -5145,27 +5335,33 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 8(%rdx)
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r11, 16(%rdx)
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rbp, %rsp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %rbp
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: retq
;
; X64-HAVE-SHLD-NO-BMI2-SSE4-LABEL: shl_32bytes:
; X64-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rsp, %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: subq $96, %rsp
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%rsi), %eax
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: leal (,%rax,8), %ecx
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, (%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andb $24, %al
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: negb %al
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movsbq %al, %rax
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -24(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -16(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 48(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 56(%rsp,%rax), %rdi
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %rsi, %rdi
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -40(%rsp,%rax), %r8
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -32(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 32(%rsp,%rax), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 40(%rsp,%rax), %rax
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %rax, %rsi
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, %r9
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r9
@@ -5175,32 +5371,38 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rsi, 16(%rdx)
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, 24(%rdx)
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rbp, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popq %rbp
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retq
;
; X64-NO-SHLD-HAVE-BMI2-SSE4-LABEL: shl_32bytes:
; X64-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rsp, %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: subq $96, %rsp
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%rsi), %esi
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rsi,8), %eax
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, (%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ecx
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andb $24, %sil
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: negb %sil
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movsbq %sil, %rsi
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, 56(%rsp,%rsi), %rdi
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %al
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -24(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 48(%rsp,%rsi), %r8
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r8, %r9
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %r8
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %r8, %r8
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -40(%rsp,%rsi), %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -32(%rsp,%rsi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 32(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 40(%rsp,%rsi), %rsi
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %rsi, %r10
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %rsi
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %rsi, %rsi
@@ -5213,27 +5415,33 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 8(%rdx)
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rsi, 16(%rdx)
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r8, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rbp, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %rbp
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retq
;
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: shl_32bytes:
; X64-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rsp, %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: subq $96, %rsp
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%rsi), %eax
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rax,8), %ecx
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, (%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andb $24, %al
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: negb %al
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movsbq %al, %rax
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -24(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -16(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 48(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 56(%rsp,%rax), %rdi
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %rsi, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -40(%rsp,%rax), %r8
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -32(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 32(%rsp,%rax), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 40(%rsp,%rax), %rax
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %rax, %rsi
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r8, %r9
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $rcx
@@ -5242,25 +5450,31 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rsi, 16(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, 24(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rbp, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popq %rbp
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retq
;
; X64-NO-SHLD-NO-BMI2-AVX-LABEL: shl_32bytes:
; X64-NO-SHLD-NO-BMI2-AVX: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %rbp
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rsp, %rbp
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: subq $96, %rsp
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl (%rsi), %ecx
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leal (,%rcx,8), %eax
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovaps %ymm1, (%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: andb $24, %cl
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: negb %cl
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movsbq %cl, %r8
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -16(%rsp,%r8), %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 56(%rsp,%r8), %r9
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r9
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %esi
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: notb %sil
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -24(%rsp,%r8), %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 48(%rsp,%r8), %r10
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, %rdi
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %rdi
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
@@ -5268,8 +5482,8 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r9, %rdi
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r10
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -40(%rsp,%r8), %r9
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -32(%rsp,%r8), %r8
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 32(%rsp,%r8), %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 40(%rsp,%r8), %r8
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r8, %r11
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %r11
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
@@ -5288,25 +5502,31 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, 8(%rdx)
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r11, 16(%rdx)
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rbp, %rsp
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %rbp
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vzeroupper
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: retq
;
; X64-HAVE-SHLD-NO-BMI2-AVX-LABEL: shl_32bytes:
; X64-HAVE-SHLD-NO-BMI2-AVX: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rsp, %rbp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: subq $96, %rsp
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl (%rsi), %eax
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: leal (,%rax,8), %ecx
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovaps %ymm1, (%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: andb $24, %al
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: negb %al
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movsbq %al, %rax
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -24(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -16(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 48(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 56(%rsp,%rax), %rdi
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldq %cl, %rsi, %rdi
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -40(%rsp,%rax), %r8
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -32(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 32(%rsp,%rax), %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 40(%rsp,%rax), %rax
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldq %cl, %rax, %rsi
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r8, %r9
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r9
@@ -5316,30 +5536,36 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rsi, 16(%rdx)
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, 24(%rdx)
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rbp, %rsp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: popq %rbp
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vzeroupper
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: retq
;
; X64-NO-SHLD-HAVE-BMI2-AVX-LABEL: shl_32bytes:
; X64-NO-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rsp, %rbp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: subq $96, %rsp
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %esi
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (,%rsi,8), %eax
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %ymm1, (%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ecx
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: andb $24, %sil
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: negb %sil
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movsbq %sil, %rsi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, 56(%rsp,%rsi), %rdi
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %al
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -24(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 48(%rsp,%rsi), %r8
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r8, %r9
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrq %r8
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rax, %r8, %r8
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -40(%rsp,%rsi), %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -32(%rsp,%rsi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 32(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 40(%rsp,%rsi), %rsi
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %rsi, %r10
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrq %rsi
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rax, %rsi, %rsi
@@ -5352,25 +5578,31 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 8(%rdx)
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rsi, 16(%rdx)
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r8, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rbp, %rsp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: popq %rbp
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: retq
;
; X64-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: shl_32bytes:
; X64-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rsp, %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: subq $96, %rsp
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %eax
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: leal (,%rax,8), %ecx
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %ymm1, (%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andb $24, %al
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: negb %al
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movsbq %al, %rax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -24(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -16(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 48(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 56(%rsp,%rax), %rdi
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldq %cl, %rsi, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -40(%rsp,%rax), %r8
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -32(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 32(%rsp,%rax), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 40(%rsp,%rax), %rax
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldq %cl, %rax, %rsi
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r8, %r9
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $rcx
@@ -5379,6 +5611,8 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rsi, 16(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, 24(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rbp, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popq %rbp
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retq
;
@@ -6436,7 +6670,11 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE2-LABEL: shl_32bytes_dwordOff:
; X64-NO-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbp
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rsp, %rbp
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: subq $96, %rsp
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rcx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
@@ -6445,18 +6683,18 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %eax
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlb $5, %al
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, (%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlb $2, %sil
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andb $24, %sil
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: negb %sil
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movsbq %sil, %r10
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -32(%rsp,%r10), %r8
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -24(%rsp,%r10), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 32(%rsp,%r10), %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 40(%rsp,%r10), %rdi
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, %r11
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r11
@@ -6467,10 +6705,10 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r9
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r11, %r9
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -8(%rsp,%r10), %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 56(%rsp,%r10), %r11
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r11
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -16(%rsp,%r10), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 48(%rsp,%r10), %r10
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, %rbx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %rbx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
@@ -6488,11 +6726,17 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, 16(%rdx)
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, 24(%rdx)
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq -8(%rbp), %rsp
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %rbp
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: retq
;
; X64-HAVE-SHLD-NO-BMI2-SSE2-LABEL: shl_32bytes_dwordOff:
; X64-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rsp, %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: subq $96, %rsp
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rax
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
@@ -6501,21 +6745,21 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shlb $5, %cl
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, (%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shlb $2, %sil
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andb $24, %sil
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: negb %sil
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movsbq %sil, %rax
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -24(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -16(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 48(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 56(%rsp,%rax), %rdi
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %rsi, %rdi
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -40(%rsp,%rax), %r8
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -32(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 32(%rsp,%rax), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 40(%rsp,%rax), %rax
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %rax, %rsi
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %r8, %rax
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r8
@@ -6523,10 +6767,16 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, 24(%rdx)
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, (%rdx)
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rbp, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popq %rbp
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retq
;
; X64-NO-SHLD-HAVE-BMI2-SSE2-LABEL: shl_32bytes_dwordOff:
; X64-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsp, %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: subq $96, %rsp
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rcx
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
@@ -6535,27 +6785,27 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, %eax
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $5, %al
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, (%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ecx
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $2, %sil
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andb $24, %sil
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: negb %sil
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movsbq %sil, %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -40(%rsp,%rdi), %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -32(%rsp,%rdi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 32(%rsp,%rdi), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 40(%rsp,%rdi), %rsi
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %rsi, %r9
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %al
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r8, %r10
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %r8
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %r8, %r8
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, -16(%rsp,%rdi), %r9
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -24(%rsp,%rdi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, 56(%rsp,%rdi), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 48(%rsp,%rdi), %rdi
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %rdi, %rcx
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %rdi
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %rdi, %rdi
@@ -6567,10 +6817,16 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 16(%rdx)
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, 24(%rdx)
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbp, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %rbp
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retq
;
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: shl_32bytes_dwordOff:
; X64-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsp, %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: subq $96, %rsp
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rax
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
@@ -6579,21 +6835,21 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, %ecx
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $5, %cl
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, (%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $2, %sil
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andb $24, %sil
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: negb %sil
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movsbq %sil, %rax
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -24(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -16(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 48(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 56(%rsp,%rax), %rdi
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %rsi, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -40(%rsp,%rax), %r8
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -32(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 32(%rsp,%rax), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 40(%rsp,%rax), %rax
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %rax, %rsi
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %r8, %rax
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r8, %rcx
@@ -6601,30 +6857,36 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, 24(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, (%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbp, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popq %rbp
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retq
;
; X64-NO-SHLD-NO-BMI2-SSE4-LABEL: shl_32bytes_dwordOff:
; X64-NO-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rsp, %rbp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: subq $96, %rsp
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%rsi), %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, %eax
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlb $5, %al
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, (%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlb $2, %cl
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andb $24, %cl
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: negb %cl
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movsbq %cl, %r8
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -16(%rsp,%r8), %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 56(%rsp,%r8), %r9
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r9
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %esi
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %sil
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -24(%rsp,%r8), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 48(%rsp,%r8), %r10
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, %rdi
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %rdi
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
@@ -6632,8 +6894,8 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r9, %rdi
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r10
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -40(%rsp,%r8), %r9
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -32(%rsp,%r8), %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 32(%rsp,%r8), %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 40(%rsp,%r8), %r8
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, %r11
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %r11
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
@@ -6652,29 +6914,35 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 8(%rdx)
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r11, 16(%rdx)
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rbp, %rsp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %rbp
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: retq
;
; X64-HAVE-SHLD-NO-BMI2-SSE4-LABEL: shl_32bytes_dwordOff:
; X64-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rsp, %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: subq $96, %rsp
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%rsi), %eax
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shlb $5, %cl
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, (%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shlb $2, %al
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andb $24, %al
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: negb %al
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movsbq %al, %rax
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -24(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -16(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 48(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 56(%rsp,%rax), %rdi
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %rsi, %rdi
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -40(%rsp,%rax), %r8
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -32(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 32(%rsp,%rax), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 40(%rsp,%rax), %rax
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %rax, %rsi
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, %r9
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r9
@@ -6683,34 +6951,40 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rsi, 16(%rdx)
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, 24(%rdx)
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rbp, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popq %rbp
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retq
;
; X64-NO-SHLD-HAVE-BMI2-SSE4-LABEL: shl_32bytes_dwordOff:
; X64-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rsp, %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: subq $96, %rsp
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%rsi), %esi
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, %eax
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $5, %al
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, (%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ecx
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $2, %sil
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andb $24, %sil
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: negb %sil
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movsbq %sil, %rsi
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, 56(%rsp,%rsi), %rdi
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %al
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -24(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 48(%rsp,%rsi), %r8
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r8, %r9
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %r8
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %r8, %r8
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -40(%rsp,%rsi), %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -32(%rsp,%rsi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 32(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 40(%rsp,%rsi), %rsi
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %rsi, %r10
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %rsi
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %rsi, %rsi
@@ -6723,29 +6997,35 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 8(%rdx)
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rsi, 16(%rdx)
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r8, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rbp, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %rbp
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retq
;
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: shl_32bytes_dwordOff:
; X64-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rsp, %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: subq $96, %rsp
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%rsi), %eax
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ecx
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $5, %cl
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, (%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $2, %al
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andb $24, %al
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: negb %al
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movsbq %al, %rax
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -24(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -16(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 48(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 56(%rsp,%rax), %rdi
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %rsi, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -40(%rsp,%rax), %r8
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -32(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 32(%rsp,%rax), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 40(%rsp,%rax), %rax
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %rax, %rsi
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r8, %r9
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $rcx
@@ -6754,27 +7034,33 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rsi, 16(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, 24(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rbp, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popq %rbp
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retq
;
; X64-NO-SHLD-NO-BMI2-AVX-LABEL: shl_32bytes_dwordOff:
; X64-NO-SHLD-NO-BMI2-AVX: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %rbp
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rsp, %rbp
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: subq $96, %rsp
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl (%rsi), %ecx
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, %eax
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlb $5, %al
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovaps %ymm1, (%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlb $2, %cl
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: andb $24, %cl
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: negb %cl
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movsbq %cl, %r8
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -16(%rsp,%r8), %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 56(%rsp,%r8), %r9
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r9
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %esi
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: notb %sil
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -24(%rsp,%r8), %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 48(%rsp,%r8), %r10
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, %rdi
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %rdi
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
@@ -6782,8 +7068,8 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r9, %rdi
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r10
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -40(%rsp,%r8), %r9
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -32(%rsp,%r8), %r8
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 32(%rsp,%r8), %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 40(%rsp,%r8), %r8
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r8, %r11
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %r11
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
@@ -6802,27 +7088,33 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, 8(%rdx)
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r11, 16(%rdx)
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rbp, %rsp
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %rbp
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vzeroupper
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: retq
;
; X64-HAVE-SHLD-NO-BMI2-AVX-LABEL: shl_32bytes_dwordOff:
; X64-HAVE-SHLD-NO-BMI2-AVX: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rsp, %rbp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: subq $96, %rsp
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl (%rsi), %eax
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shlb $5, %cl
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovaps %ymm1, (%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shlb $2, %al
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: andb $24, %al
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: negb %al
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movsbq %al, %rax
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -24(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -16(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 48(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 56(%rsp,%rax), %rdi
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldq %cl, %rsi, %rdi
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -40(%rsp,%rax), %r8
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -32(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 32(%rsp,%rax), %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 40(%rsp,%rax), %rax
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldq %cl, %rax, %rsi
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r8, %r9
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r9
@@ -6831,32 +7123,38 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rsi, 16(%rdx)
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, 24(%rdx)
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rbp, %rsp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: popq %rbp
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vzeroupper
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: retq
;
; X64-NO-SHLD-HAVE-BMI2-AVX-LABEL: shl_32bytes_dwordOff:
; X64-NO-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rsp, %rbp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: subq $96, %rsp
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %esi
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, %eax
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlb $5, %al
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %ymm1, (%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ecx
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlb $2, %sil
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: andb $24, %sil
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: negb %sil
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movsbq %sil, %rsi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, 56(%rsp,%rsi), %rdi
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %al
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -24(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 48(%rsp,%rsi), %r8
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r8, %r9
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrq %r8
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rax, %r8, %r8
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -40(%rsp,%rsi), %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -32(%rsp,%rsi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 32(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 40(%rsp,%rsi), %rsi
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %rsi, %r10
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrq %rsi
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rax, %rsi, %rsi
@@ -6869,27 +7167,33 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 8(%rdx)
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rsi, 16(%rdx)
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r8, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rbp, %rsp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: popq %rbp
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: retq
;
; X64-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: shl_32bytes_dwordOff:
; X64-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rsp, %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: subq $96, %rsp
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %eax
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ecx
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shlb $5, %cl
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %ymm1, (%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shlb $2, %al
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andb $24, %al
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: negb %al
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movsbq %al, %rax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -24(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -16(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 48(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 56(%rsp,%rax), %rdi
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldq %cl, %rsi, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -40(%rsp,%rax), %r8
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -32(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 32(%rsp,%rax), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 40(%rsp,%rax), %rax
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldq %cl, %rax, %rsi
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r8, %r9
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $rcx
@@ -6898,6 +7202,8 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rsi, 16(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, 24(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rbp, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popq %rbp
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retq
;
@@ -7024,67 +7330,85 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
define void @shl_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind {
; X64-SSE2-LABEL: shl_32bytes_qwordOff:
; X64-SSE2: # %bb.0:
+; X64-SSE2-NEXT: pushq %rbp
+; X64-SSE2-NEXT: movq %rsp, %rbp
+; X64-SSE2-NEXT: andq $-32, %rsp
+; X64-SSE2-NEXT: subq $96, %rsp
; X64-SSE2-NEXT: movq (%rdi), %rax
; X64-SSE2-NEXT: movq 8(%rdi), %rcx
; X64-SSE2-NEXT: movq 16(%rdi), %r8
; X64-SSE2-NEXT: movq 24(%rdi), %rdi
; X64-SSE2-NEXT: movzbl (%rsi), %esi
; X64-SSE2-NEXT: xorps %xmm0, %xmm0
-; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movaps %xmm0, (%rsp)
+; X64-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: shlb $3, %sil
; X64-SSE2-NEXT: andb $24, %sil
; X64-SSE2-NEXT: negb %sil
; X64-SSE2-NEXT: movsbq %sil, %rax
-; X64-SSE2-NEXT: movq -40(%rsp,%rax), %rcx
-; X64-SSE2-NEXT: movq -32(%rsp,%rax), %rsi
-; X64-SSE2-NEXT: movq -16(%rsp,%rax), %rdi
-; X64-SSE2-NEXT: movq -24(%rsp,%rax), %rax
+; X64-SSE2-NEXT: movq 32(%rsp,%rax), %rcx
+; X64-SSE2-NEXT: movq 40(%rsp,%rax), %rsi
+; X64-SSE2-NEXT: movq 56(%rsp,%rax), %rdi
+; X64-SSE2-NEXT: movq 48(%rsp,%rax), %rax
; X64-SSE2-NEXT: movq %rax, 16(%rdx)
; X64-SSE2-NEXT: movq %rdi, 24(%rdx)
; X64-SSE2-NEXT: movq %rcx, (%rdx)
; X64-SSE2-NEXT: movq %rsi, 8(%rdx)
+; X64-SSE2-NEXT: movq %rbp, %rsp
+; X64-SSE2-NEXT: popq %rbp
; X64-SSE2-NEXT: retq
;
; X64-SSE42-LABEL: shl_32bytes_qwordOff:
; X64-SSE42: # %bb.0:
+; X64-SSE42-NEXT: pushq %rbp
+; X64-SSE42-NEXT: movq %rsp, %rbp
+; X64-SSE42-NEXT: andq $-32, %rsp
+; X64-SSE42-NEXT: subq $96, %rsp
; X64-SSE42-NEXT: movups (%rdi), %xmm0
; X64-SSE42-NEXT: movups 16(%rdi), %xmm1
; X64-SSE42-NEXT: movzbl (%rsi), %eax
; X64-SSE42-NEXT: xorps %xmm2, %xmm2
-; X64-SSE42-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm2, (%rsp)
+; X64-SSE42-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; X64-SSE42-NEXT: shlb $3, %al
; X64-SSE42-NEXT: andb $24, %al
; X64-SSE42-NEXT: negb %al
; X64-SSE42-NEXT: movsbq %al, %rax
-; X64-SSE42-NEXT: movups -40(%rsp,%rax), %xmm0
-; X64-SSE42-NEXT: movups -24(%rsp,%rax), %xmm1
+; X64-SSE42-NEXT: movups 32(%rsp,%rax), %xmm0
+; X64-SSE42-NEXT: movups 48(%rsp,%rax), %xmm1
; X64-SSE42-NEXT: movups %xmm1, 16(%rdx)
; X64-SSE42-NEXT: movups %xmm0, (%rdx)
+; X64-SSE42-NEXT: movq %rbp, %rsp
+; X64-SSE42-NEXT: popq %rbp
; X64-SSE42-NEXT: retq
;
; X64-AVX-LABEL: shl_32bytes_qwordOff:
; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: pushq %rbp
+; X64-AVX-NEXT: movq %rsp, %rbp
+; X64-AVX-NEXT: andq $-32, %rsp
+; X64-AVX-NEXT: subq $96, %rsp
; X64-AVX-NEXT: vmovups (%rdi), %ymm0
; X64-AVX-NEXT: movzbl (%rsi), %eax
; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: vmovaps %ymm1, (%rsp)
+; X64-AVX-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
; X64-AVX-NEXT: shlb $3, %al
; X64-AVX-NEXT: andb $24, %al
; X64-AVX-NEXT: negb %al
; X64-AVX-NEXT: movsbq %al, %rax
-; X64-AVX-NEXT: vmovups -40(%rsp,%rax), %xmm0
-; X64-AVX-NEXT: vmovups -24(%rsp,%rax), %xmm1
+; X64-AVX-NEXT: vmovups 32(%rsp,%rax), %xmm0
+; X64-AVX-NEXT: vmovups 48(%rsp,%rax), %xmm1
; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx)
; X64-AVX-NEXT: vmovups %xmm0, (%rdx)
+; X64-AVX-NEXT: movq %rbp, %rsp
+; X64-AVX-NEXT: popq %rbp
; X64-AVX-NEXT: vzeroupper
; X64-AVX-NEXT: retq
;
@@ -7211,32 +7535,36 @@ define void @shl_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nou
define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE2-LABEL: ashr_32bytes:
; X64-NO-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbp
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rsp, %rbp
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: subq $96, %rsp
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rcx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (,%rsi,8), %eax
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, (%rsp)
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: sarq $63, %rdi
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andb $24, %sil
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl %sil, %r9d
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -64(%rsp,%r9), %r10
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -56(%rsp,%r9), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq (%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rsp,%r9), %rdi
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, %r11
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r11
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %sil
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -48(%rsp,%r9), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rsp,%r9), %rbx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%rbx,%rbx), %r8
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r8
@@ -7249,7 +7577,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r10, %rdi
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rbx
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -40(%rsp,%r9), %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rsp,%r9), %r9
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%r9,%r9), %r10
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r10
@@ -7260,34 +7588,40 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, 16(%rdx)
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, (%rdx)
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq -8(%rbp), %rsp
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %rbp
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: retq
;
; X64-HAVE-SHLD-NO-BMI2-SSE2-LABEL: ashr_32bytes:
; X64-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rsp, %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: subq $96, %rsp
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rax
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: leal (,%rsi,8), %ecx
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, (%rsp)
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: sarq $63, %rdi
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andb $24, %sil
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl %sil, %eax
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -56(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -72(%rsp,%rax), %rdi
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -64(%rsp,%rax), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq (%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rsp,%rax), %r8
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, %r9
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rsi, %r9
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -48(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rsp,%rax), %rax
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rax, %rsi
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r8, %rdi
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx
@@ -7296,41 +7630,47 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, 24(%rdx)
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, (%rdx)
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rbp, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popq %rbp
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retq
;
; X64-NO-SHLD-HAVE-BMI2-SSE2-LABEL: ashr_32bytes:
; X64-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsp, %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: subq $96, %rsp
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rcx
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%rsi,8), %eax
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, (%rsp)
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: sarq $63, %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ecx
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andb $24, %sil
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %sil, %esi
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -64(%rsp,%rsi), %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -56(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rsp,%rsi), %r8
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %rdi, %r9
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %al
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%r8,%r8), %r10
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rax, %r10, %r10
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %r10
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, -72(%rsp,%rsi), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, (%rsp,%rsi), %r9
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq %rdi, %rdi
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rax, %rdi, %rdi
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %rdi
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %r8, %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -48(%rsp,%rsi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rsp,%rsi), %rsi
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%rsi,%rsi), %r9
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rax, %r9, %rax
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r8, %rax
@@ -7339,33 +7679,39 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 16(%rdx)
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, (%rdx)
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbp, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %rbp
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retq
;
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: ashr_32bytes:
; X64-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsp, %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: subq $96, %rsp
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rax
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%rsi,8), %ecx
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, (%rsp)
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: sarq $63, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andb $24, %sil
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %sil, %eax
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -56(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -72(%rsp,%rax), %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -64(%rsp,%rax), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rsp,%rax), %r8
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, %r9
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %rsi, %r9
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -48(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rsp,%rax), %rax
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %rax, %rsi
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r8, %rdi
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: sarxq %rcx, %rax, %rax
@@ -7373,28 +7719,34 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 24(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, (%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbp, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popq %rbp
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retq
;
; X64-NO-SHLD-NO-BMI2-SSE4-LABEL: ashr_32bytes:
; X64-NO-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rsp, %rbp
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: subq $96, %rsp
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 16(%rdi), %rcx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 24(%rdi), %rdi
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%rsi), %esi
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (,%rsi,8), %eax
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, (%rsp)
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: sarq $63, %rdi
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andb $24, %sil
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl %sil, %r9d
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -64(%rsp,%r9), %r10
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -56(%rsp,%r9), %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq (%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 8(%rsp,%r9), %r8
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r10
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %esi
@@ -7403,11 +7755,11 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %rdi
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r10, %rdi
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -48(%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 16(%rsp,%r9), %r10
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, %r11
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r11
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -40(%rsp,%r9), %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 24(%rsp,%r9), %r9
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r9,%r9), %rbx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %rbx
@@ -7424,32 +7776,38 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 8(%rdx)
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rbx, 16(%rdx)
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq -8(%rbp), %rsp
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %rbp
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: retq
;
; X64-HAVE-SHLD-NO-BMI2-SSE4-LABEL: ashr_32bytes:
; X64-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rsp, %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: subq $96, %rsp
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 16(%rdi), %rax
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 24(%rdi), %rdi
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%rsi), %esi
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: leal (,%rsi,8), %ecx
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, (%rsp)
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: sarq $63, %rdi
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andb $24, %sil
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl %sil, %eax
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -48(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -56(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 24(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 16(%rsp,%rax), %rdi
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, %r8
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rsi, %r8
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -72(%rsp,%rax), %r9
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -64(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq (%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 8(%rsp,%rax), %rax
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, %r10
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rdi, %r10
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rax, %r9
@@ -7459,35 +7817,41 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, 16(%rdx)
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rsi, 24(%rdx)
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rbp, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popq %rbp
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retq
;
; X64-NO-SHLD-HAVE-BMI2-SSE4-LABEL: ashr_32bytes:
; X64-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rsp, %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: subq $96, %rsp
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 16(%rdi), %rcx
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 24(%rdi), %rdi
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%rsi), %esi
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rsi,8), %eax
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, (%rsp)
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: sarq $63, %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ecx
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andb $24, %sil
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %sil, %esi
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, -72(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, (%rsp,%rsi), %rdi
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %al
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -64(%rsp,%rsi), %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -56(%rsp,%rsi), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 8(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 16(%rsp,%rsi), %r9
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%r8,%r8), %r10
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rax, %r10, %r10
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %r10
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, %r9, %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -48(%rsp,%rsi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 24(%rsp,%rsi), %rsi
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%rsi,%rsi), %r11
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rax, %r11, %r11
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %r11
@@ -7500,31 +7864,37 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 8(%rdx)
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r11, 16(%rdx)
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rbp, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %rbp
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retq
;
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: ashr_32bytes:
; X64-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rsp, %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: subq $96, %rsp
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 16(%rdi), %rax
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 24(%rdi), %rdi
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%rsi), %esi
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rsi,8), %ecx
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, (%rsp)
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: sarq $63, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andb $24, %sil
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %sil, %eax
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -48(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -56(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 24(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 16(%rsp,%rax), %rdi
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, %r8
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rsi, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -72(%rsp,%rax), %r9
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -64(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq (%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 8(%rsp,%rax), %rax
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, %r10
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rdi, %r10
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rax, %r9
@@ -7533,28 +7903,34 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r8, 16(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 24(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rbp, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popq %rbp
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retq
;
; X64-NO-SHLD-NO-BMI2-AVX-LABEL: ashr_32bytes:
; X64-NO-SHLD-NO-BMI2-AVX: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %rbp
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rsp, %rbp
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: subq $96, %rsp
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %xmm0
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 16(%rdi), %rcx
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 24(%rdi), %rdi
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl (%rsi), %esi
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leal (,%rsi,8), %eax
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm0, (%rsp)
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: sarq $63, %rdi
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: andb $24, %sil
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl %sil, %r9d
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -64(%rsp,%r9), %r10
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -56(%rsp,%r9), %r8
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq (%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 8(%rsp,%r9), %r8
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r10
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %esi
@@ -7563,11 +7939,11 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %rdi
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r10, %rdi
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -48(%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 16(%rsp,%r9), %r10
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, %r11
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r11
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -40(%rsp,%r9), %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 24(%rsp,%r9), %r9
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%r9,%r9), %rbx
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %rbx
@@ -7584,32 +7960,38 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, 8(%rdx)
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rbx, 16(%rdx)
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq -8(%rbp), %rsp
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %rbp
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: retq
;
; X64-HAVE-SHLD-NO-BMI2-AVX-LABEL: ashr_32bytes:
; X64-HAVE-SHLD-NO-BMI2-AVX: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rsp, %rbp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: subq $96, %rsp
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %xmm0
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 16(%rdi), %rax
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 24(%rdi), %rdi
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl (%rsi), %esi
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: leal (,%rsi,8), %ecx
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm0, (%rsp)
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: sarq $63, %rdi
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: andb $24, %sil
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl %sil, %eax
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -48(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -56(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 24(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 16(%rsp,%rax), %rdi
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, %r8
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rsi, %r8
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -72(%rsp,%rax), %r9
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -64(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq (%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 8(%rsp,%rax), %rax
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rax, %r10
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rdi, %r10
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rax, %r9
@@ -7619,35 +8001,41 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r8, 16(%rdx)
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rsi, 24(%rdx)
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rbp, %rsp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: popq %rbp
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: retq
;
; X64-NO-SHLD-HAVE-BMI2-AVX-LABEL: ashr_32bytes:
; X64-NO-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rsp, %rbp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: subq $96, %rsp
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %xmm0
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 16(%rdi), %rcx
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 24(%rdi), %rdi
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %esi
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (,%rsi,8), %eax
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm0, (%rsp)
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: sarq $63, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ecx
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: andb $24, %sil
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %sil, %esi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rcx, -72(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rcx, (%rsp,%rsi), %rdi
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %al
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -64(%rsp,%rsi), %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -56(%rsp,%rsi), %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 8(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 16(%rsp,%rsi), %r9
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%r8,%r8), %r10
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rax, %r10, %r10
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %r10
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rcx, %r9, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -48(%rsp,%rsi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 24(%rsp,%rsi), %rsi
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%rsi,%rsi), %r11
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rax, %r11, %r11
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %r11
@@ -7660,31 +8048,37 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 8(%rdx)
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r11, 16(%rdx)
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r10, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rbp, %rsp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: popq %rbp
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: retq
;
; X64-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: ashr_32bytes:
; X64-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rsp, %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: subq $96, %rsp
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %xmm0
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 16(%rdi), %rax
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 24(%rdi), %rdi
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %esi
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: leal (,%rsi,8), %ecx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm0, (%rsp)
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: sarq $63, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andb $24, %sil
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %sil, %eax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -48(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -56(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 24(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 16(%rsp,%rax), %rdi
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, %r8
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rsi, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -72(%rsp,%rax), %r9
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -64(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq (%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 8(%rsp,%rax), %rax
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, %r10
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rdi, %r10
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rax, %r9
@@ -7693,6 +8087,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r8, 16(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 24(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rbp, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popq %rbp
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retq
;
; X86-NO-SHLD-NO-BMI2-SSE2-LABEL: ashr_32bytes:
@@ -8870,7 +9266,11 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE2-LABEL: ashr_32bytes_dwordOff:
; X64-NO-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbp
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rsp, %rbp
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: subq $96, %rsp
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rcx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
@@ -8878,25 +9278,25 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %eax
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlb $5, %al
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, (%rsp)
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: sarq $63, %rdi
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andb $6, %sil
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl %sil, %r9d
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -64(%rsp,%r9,4), %r10
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -56(%rsp,%r9,4), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq (%rsp,%r9,4), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rsp,%r9,4), %rdi
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, %r11
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r11
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %sil
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -48(%rsp,%r9,4), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rsp,%r9,4), %rbx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%rbx,%rbx), %r8
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r8
@@ -8909,7 +9309,7 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r10, %rdi
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rbx
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -40(%rsp,%r9,4), %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rsp,%r9,4), %r9
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%r9,%r9), %r10
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r10
@@ -8920,11 +9320,17 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, 16(%rdx)
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, (%rdx)
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq -8(%rbp), %rsp
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %rbp
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: retq
;
; X64-HAVE-SHLD-NO-BMI2-SSE2-LABEL: ashr_32bytes_dwordOff:
; X64-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rsp, %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: subq $96, %rsp
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rax
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
@@ -8932,23 +9338,23 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shlb $5, %cl
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, (%rsp)
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: sarq $63, %rdi
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andb $6, %sil
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl %sil, %eax
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -56(%rsp,%rax,4), %rsi
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -72(%rsp,%rax,4), %rdi
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -64(%rsp,%rax,4), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rsp,%rax,4), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq (%rsp,%rax,4), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rsp,%rax,4), %r8
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, %r9
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rsi, %r9
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -48(%rsp,%rax,4), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rsp,%rax,4), %rax
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rax, %rsi
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r8, %rdi
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: sarq %cl, %rax
@@ -8956,10 +9362,16 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, 24(%rdx)
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, (%rdx)
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rbp, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popq %rbp
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retq
;
; X64-NO-SHLD-HAVE-BMI2-SSE2-LABEL: ashr_32bytes_dwordOff:
; X64-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsp, %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: subq $96, %rsp
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rcx
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
@@ -8967,31 +9379,31 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, %eax
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $5, %al
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, (%rsp)
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: sarq $63, %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ecx
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andb $6, %sil
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %sil, %esi
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -64(%rsp,%rsi,4), %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -56(%rsp,%rsi,4), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rsp,%rsi,4), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rsp,%rsi,4), %r8
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %rdi, %r9
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %al
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%r8,%r8), %r10
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rax, %r10, %r10
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %r10
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, -72(%rsp,%rsi,4), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, (%rsp,%rsi,4), %r9
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq %rdi, %rdi
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rax, %rdi, %rdi
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %rdi
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %r8, %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -48(%rsp,%rsi,4), %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rsp,%rsi,4), %rsi
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%rsi,%rsi), %r9
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rax, %r9, %rax
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r8, %rax
@@ -9000,10 +9412,16 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 16(%rdx)
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, (%rdx)
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbp, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %rbp
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retq
;
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: ashr_32bytes_dwordOff:
; X64-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsp, %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: subq $96, %rsp
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rax
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
@@ -9011,23 +9429,23 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, %ecx
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $5, %cl
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, (%rsp)
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: sarq $63, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andb $6, %sil
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %sil, %eax
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -56(%rsp,%rax,4), %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -72(%rsp,%rax,4), %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -64(%rsp,%rax,4), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rsp,%rax,4), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rsp,%rax,4), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rsp,%rax,4), %r8
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, %r9
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %rsi, %r9
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -48(%rsp,%rax,4), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rsp,%rax,4), %rax
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %rax, %rsi
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r8, %rdi
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: sarxq %rcx, %rax, %rax
@@ -9035,29 +9453,35 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 24(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, (%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbp, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popq %rbp
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retq
;
; X64-NO-SHLD-NO-BMI2-SSE4-LABEL: ashr_32bytes_dwordOff:
; X64-NO-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rsp, %rbp
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: subq $96, %rsp
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 16(%rdi), %rcx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 24(%rdi), %rdi
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%rsi), %esi
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %eax
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlb $5, %al
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, (%rsp)
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: sarq $63, %rdi
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andb $6, %sil
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl %sil, %r9d
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -64(%rsp,%r9,4), %r10
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -56(%rsp,%r9,4), %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq (%rsp,%r9,4), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 8(%rsp,%r9,4), %r8
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r10
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %esi
@@ -9066,11 +9490,11 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %rdi
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r10, %rdi
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -48(%rsp,%r9,4), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 16(%rsp,%r9,4), %r10
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, %r11
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r11
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -40(%rsp,%r9,4), %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 24(%rsp,%r9,4), %r9
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r9,%r9), %rbx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %rbx
@@ -9087,33 +9511,39 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 8(%rdx)
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rbx, 16(%rdx)
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq -8(%rbp), %rsp
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %rbp
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: retq
;
; X64-HAVE-SHLD-NO-BMI2-SSE4-LABEL: ashr_32bytes_dwordOff:
; X64-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rsp, %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: subq $96, %rsp
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 16(%rdi), %rax
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 24(%rdi), %rdi
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%rsi), %esi
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shlb $5, %cl
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, (%rsp)
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: sarq $63, %rdi
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andb $6, %sil
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl %sil, %eax
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -48(%rsp,%rax,4), %rsi
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -56(%rsp,%rax,4), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 24(%rsp,%rax,4), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 16(%rsp,%rax,4), %rdi
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, %r8
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rsi, %r8
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -72(%rsp,%rax,4), %r9
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -64(%rsp,%rax,4), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq (%rsp,%rax,4), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 8(%rsp,%rax,4), %rax
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, %r10
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rdi, %r10
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rax, %r9
@@ -9122,36 +9552,42 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, 16(%rdx)
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rsi, 24(%rdx)
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rbp, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popq %rbp
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retq
;
; X64-NO-SHLD-HAVE-BMI2-SSE4-LABEL: ashr_32bytes_dwordOff:
; X64-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rsp, %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: subq $96, %rsp
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 16(%rdi), %rcx
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 24(%rdi), %rdi
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%rsi), %esi
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, %eax
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $5, %al
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, (%rsp)
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: sarq $63, %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ecx
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andb $6, %sil
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %sil, %esi
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, -72(%rsp,%rsi,4), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, (%rsp,%rsi,4), %rdi
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %al
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -64(%rsp,%rsi,4), %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -56(%rsp,%rsi,4), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 8(%rsp,%rsi,4), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 16(%rsp,%rsi,4), %r9
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%r8,%r8), %r10
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rax, %r10, %r10
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %r10
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, %r9, %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -48(%rsp,%rsi,4), %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 24(%rsp,%rsi,4), %rsi
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%rsi,%rsi), %r11
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rax, %r11, %r11
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %r11
@@ -9164,32 +9600,38 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 8(%rdx)
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r11, 16(%rdx)
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rbp, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %rbp
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retq
;
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: ashr_32bytes_dwordOff:
; X64-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rsp, %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: subq $96, %rsp
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 16(%rdi), %rax
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 24(%rdi), %rdi
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%rsi), %esi
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, %ecx
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $5, %cl
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, (%rsp)
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: sarq $63, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andb $6, %sil
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %sil, %eax
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -48(%rsp,%rax,4), %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -56(%rsp,%rax,4), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 24(%rsp,%rax,4), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 16(%rsp,%rax,4), %rdi
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, %r8
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rsi, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -72(%rsp,%rax,4), %r9
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -64(%rsp,%rax,4), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq (%rsp,%rax,4), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 8(%rsp,%rax,4), %rax
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, %r10
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rdi, %r10
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rax, %r9
@@ -9198,29 +9640,35 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r8, 16(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 24(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rbp, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popq %rbp
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retq
;
; X64-NO-SHLD-NO-BMI2-AVX-LABEL: ashr_32bytes_dwordOff:
; X64-NO-SHLD-NO-BMI2-AVX: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %rbp
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rsp, %rbp
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: subq $96, %rsp
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %xmm0
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 16(%rdi), %rcx
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 24(%rdi), %rdi
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl (%rsi), %esi
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %eax
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlb $5, %al
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm0, (%rsp)
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: sarq $63, %rdi
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: andb $6, %sil
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl %sil, %r9d
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -64(%rsp,%r9,4), %r10
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -56(%rsp,%r9,4), %r8
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq (%rsp,%r9,4), %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 8(%rsp,%r9,4), %r8
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r10
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %esi
@@ -9229,11 +9677,11 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %rdi
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r10, %rdi
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -48(%rsp,%r9,4), %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 16(%rsp,%r9,4), %r10
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, %r11
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r11
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -40(%rsp,%r9,4), %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 24(%rsp,%r9,4), %r9
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%r9,%r9), %rbx
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %rbx
@@ -9250,33 +9698,39 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, 8(%rdx)
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rbx, 16(%rdx)
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq -8(%rbp), %rsp
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %rbp
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: retq
;
; X64-HAVE-SHLD-NO-BMI2-AVX-LABEL: ashr_32bytes_dwordOff:
; X64-HAVE-SHLD-NO-BMI2-AVX: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rsp, %rbp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: subq $96, %rsp
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %xmm0
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 16(%rdi), %rax
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 24(%rdi), %rdi
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl (%rsi), %esi
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shlb $5, %cl
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm0, (%rsp)
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: sarq $63, %rdi
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: andb $6, %sil
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl %sil, %eax
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -48(%rsp,%rax,4), %rsi
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -56(%rsp,%rax,4), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 24(%rsp,%rax,4), %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 16(%rsp,%rax,4), %rdi
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, %r8
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rsi, %r8
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -72(%rsp,%rax,4), %r9
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -64(%rsp,%rax,4), %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq (%rsp,%rax,4), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 8(%rsp,%rax,4), %rax
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rax, %r10
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rdi, %r10
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rax, %r9
@@ -9285,36 +9739,42 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r8, 16(%rdx)
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rsi, 24(%rdx)
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rbp, %rsp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: popq %rbp
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: retq
;
; X64-NO-SHLD-HAVE-BMI2-AVX-LABEL: ashr_32bytes_dwordOff:
; X64-NO-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rsp, %rbp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: subq $96, %rsp
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %xmm0
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 16(%rdi), %rcx
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 24(%rdi), %rdi
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %esi
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, %eax
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlb $5, %al
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm0, (%rsp)
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: sarq $63, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ecx
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: andb $6, %sil
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %sil, %esi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rcx, -72(%rsp,%rsi,4), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rcx, (%rsp,%rsi,4), %rdi
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %al
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -64(%rsp,%rsi,4), %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -56(%rsp,%rsi,4), %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 8(%rsp,%rsi,4), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 16(%rsp,%rsi,4), %r9
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%r8,%r8), %r10
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rax, %r10, %r10
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %r10
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rcx, %r9, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -48(%rsp,%rsi,4), %rsi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 24(%rsp,%rsi,4), %rsi
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%rsi,%rsi), %r11
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rax, %r11, %r11
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %r11
@@ -9327,32 +9787,38 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 8(%rdx)
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r11, 16(%rdx)
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r10, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rbp, %rsp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: popq %rbp
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: retq
;
; X64-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: ashr_32bytes_dwordOff:
; X64-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rsp, %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: subq $96, %rsp
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %xmm0
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 16(%rdi), %rax
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 24(%rdi), %rdi
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %esi
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, %ecx
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shlb $5, %cl
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm0, (%rsp)
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: sarq $63, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andb $6, %sil
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %sil, %eax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -48(%rsp,%rax,4), %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -56(%rsp,%rax,4), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 24(%rsp,%rax,4), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 16(%rsp,%rax,4), %rdi
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, %r8
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rsi, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -72(%rsp,%rax,4), %r9
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -64(%rsp,%rax,4), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq (%rsp,%rax,4), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 8(%rsp,%rax,4), %rax
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, %r10
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rdi, %r10
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rax, %r9
@@ -9361,6 +9827,8 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r8, 16(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 24(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rbp, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popq %rbp
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retq
;
; X86-SSE2-LABEL: ashr_32bytes_dwordOff:
@@ -9521,71 +9989,89 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
define void @ashr_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind {
; X64-SSE2-LABEL: ashr_32bytes_qwordOff:
; X64-SSE2: # %bb.0:
+; X64-SSE2-NEXT: pushq %rbp
+; X64-SSE2-NEXT: movq %rsp, %rbp
+; X64-SSE2-NEXT: andq $-32, %rsp
+; X64-SSE2-NEXT: subq $96, %rsp
; X64-SSE2-NEXT: movq (%rdi), %rax
; X64-SSE2-NEXT: movq 8(%rdi), %rcx
; X64-SSE2-NEXT: movq 16(%rdi), %r8
; X64-SSE2-NEXT: movq 24(%rdi), %rdi
; X64-SSE2-NEXT: movzbl (%rsi), %esi
-; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movq %rax, (%rsp)
; X64-SSE2-NEXT: sarq $63, %rdi
-; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: andl $3, %esi
-; X64-SSE2-NEXT: movq -72(%rsp,%rsi,8), %rax
-; X64-SSE2-NEXT: movq -64(%rsp,%rsi,8), %rcx
-; X64-SSE2-NEXT: movq -48(%rsp,%rsi,8), %rdi
-; X64-SSE2-NEXT: movq -56(%rsp,%rsi,8), %rsi
+; X64-SSE2-NEXT: movq (%rsp,%rsi,8), %rax
+; X64-SSE2-NEXT: movq 8(%rsp,%rsi,8), %rcx
+; X64-SSE2-NEXT: movq 24(%rsp,%rsi,8), %rdi
+; X64-SSE2-NEXT: movq 16(%rsp,%rsi,8), %rsi
; X64-SSE2-NEXT: movq %rsi, 16(%rdx)
; X64-SSE2-NEXT: movq %rdi, 24(%rdx)
; X64-SSE2-NEXT: movq %rax, (%rdx)
; X64-SSE2-NEXT: movq %rcx, 8(%rdx)
+; X64-SSE2-NEXT: movq %rbp, %rsp
+; X64-SSE2-NEXT: popq %rbp
; X64-SSE2-NEXT: retq
;
; X64-SSE42-LABEL: ashr_32bytes_qwordOff:
; X64-SSE42: # %bb.0:
+; X64-SSE42-NEXT: pushq %rbp
+; X64-SSE42-NEXT: movq %rsp, %rbp
+; X64-SSE42-NEXT: andq $-32, %rsp
+; X64-SSE42-NEXT: subq $96, %rsp
; X64-SSE42-NEXT: movups (%rdi), %xmm0
; X64-SSE42-NEXT: movq 16(%rdi), %rax
; X64-SSE42-NEXT: movq 24(%rdi), %rcx
; X64-SSE42-NEXT: movzbl (%rsi), %esi
-; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm0, (%rsp)
; X64-SSE42-NEXT: sarq $63, %rcx
-; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
; X64-SSE42-NEXT: andl $3, %esi
-; X64-SSE42-NEXT: movups -72(%rsp,%rsi,8), %xmm0
-; X64-SSE42-NEXT: movups -56(%rsp,%rsi,8), %xmm1
+; X64-SSE42-NEXT: movups (%rsp,%rsi,8), %xmm0
+; X64-SSE42-NEXT: movups 16(%rsp,%rsi,8), %xmm1
; X64-SSE42-NEXT: movups %xmm1, 16(%rdx)
; X64-SSE42-NEXT: movups %xmm0, (%rdx)
+; X64-SSE42-NEXT: movq %rbp, %rsp
+; X64-SSE42-NEXT: popq %rbp
; X64-SSE42-NEXT: retq
;
; X64-AVX-LABEL: ashr_32bytes_qwordOff:
; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: pushq %rbp
+; X64-AVX-NEXT: movq %rsp, %rbp
+; X64-AVX-NEXT: andq $-32, %rsp
+; X64-AVX-NEXT: subq $96, %rsp
; X64-AVX-NEXT: vmovups (%rdi), %xmm0
; X64-AVX-NEXT: movq 16(%rdi), %rax
; X64-AVX-NEXT: movq 24(%rdi), %rcx
; X64-AVX-NEXT: movzbl (%rsi), %esi
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: vmovaps %xmm0, (%rsp)
; X64-AVX-NEXT: sarq $63, %rcx
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
; X64-AVX-NEXT: andl $3, %esi
-; X64-AVX-NEXT: vmovups -72(%rsp,%rsi,8), %xmm0
-; X64-AVX-NEXT: vmovups -56(%rsp,%rsi,8), %xmm1
+; X64-AVX-NEXT: vmovups (%rsp,%rsi,8), %xmm0
+; X64-AVX-NEXT: vmovups 16(%rsp,%rsi,8), %xmm1
; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx)
; X64-AVX-NEXT: vmovups %xmm0, (%rdx)
+; X64-AVX-NEXT: movq %rbp, %rsp
+; X64-AVX-NEXT: popq %rbp
; X64-AVX-NEXT: retq
;
; X86-SSE2-LABEL: ashr_32bytes_qwordOff:
@@ -9746,11 +10232,15 @@ define void @ashr_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no
define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE2-LABEL: lshr_64bytes:
; X64-NO-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbp
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rsp, %rbp
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %r15
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %r14
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %r13
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %r12
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: subq $160, %rsp
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rax
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %rcx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r8
@@ -9761,29 +10251,29 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 56(%rdi), %r14
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%rsi), %edi
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, (%rsp)
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (,%rdi,8), %eax
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andl $56, %eax
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andl $56, %edi
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -128(%rsp,%rdi), %r10
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -120(%rsp,%rdi), %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq (%rsp,%rdi), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rsp,%rdi), %r8
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, %r11
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r11
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %sil
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -112(%rsp,%rdi), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rsp,%rdi), %rbx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%rbx,%rbx), %r9
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r9
@@ -9794,11 +10284,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r8
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r10, %r8
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -104(%rsp,%rdi), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rsp,%rdi), %r10
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, %r15
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r15
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -96(%rsp,%rdi), %r14
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 32(%rsp,%rdi), %r14
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%r14,%r14), %r11
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r11
@@ -9809,11 +10299,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r10
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %rbx, %r10
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -88(%rsp,%rdi), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 40(%rsp,%rdi), %rbx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, %r12
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r12
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -80(%rsp,%rdi), %r13
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 48(%rsp,%rdi), %r13
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%r13,%r13), %r15
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r15
@@ -9826,7 +10316,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r14, %rbx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r13
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -72(%rsp,%rdi), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 56(%rsp,%rdi), %rdi
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%rdi,%rdi), %r14
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r14
@@ -9841,18 +10331,24 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, 24(%rdx)
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, (%rdx)
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq -40(%rbp), %rsp
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %rbx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %r12
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %r13
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %r14
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %r15
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %rbp
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: retq
;
; X64-HAVE-SHLD-NO-BMI2-SSE2-LABEL: lshr_64bytes:
; X64-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rsp, %rbp
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushq %r15
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushq %r14
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: subq $160, %rsp
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rcx
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
@@ -9863,61 +10359,66 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 56(%rdi), %rdi
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%rsi), %eax
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, (%rsp)
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: leal (,%rax,8), %ecx
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andl $56, %ecx
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andl $56, %eax
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -112(%rsp,%rax), %rdi
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -128(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -120(%rsp,%rax), %r9
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq (%rsp,%rax), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rsp,%rax), %r11
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, %r8
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rdi, %r8
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -96(%rsp,%rax), %r10
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -104(%rsp,%rax), %r11
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, %rbx
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r10, %rbx
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r11, %rdi
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -80(%rsp,%rax), %r11
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -88(%rsp,%rax), %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 32(%rsp,%rax), %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rsp,%rax), %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r10, %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rbx, %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 48(%rsp,%rax), %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 40(%rsp,%rax), %r14
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, %r15
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r11, %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rbx, %r15
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r14, %r10
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -72(%rsp,%rax), %rax
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rax, %r11
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r9, %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 56(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rax, %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r11, %rsi
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rax
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, 48(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, 48(%rdx)
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, 56(%rdx)
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, 32(%rdx)
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r15, 40(%rdx)
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, 16(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 24(%rdx)
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rsi, (%rdx)
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: leaq -24(%rbp), %rsp
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popq %rbx
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popq %r14
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popq %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popq %rbp
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retq
;
; X64-NO-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_64bytes:
; X64-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsp, %rbp
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r15
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r14
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r12
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rbx
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: subq $160, %rsp
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rcx
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
@@ -9928,36 +10429,36 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 56(%rdi), %rdi
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%rsi), %eax
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r14, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, (%rsp)
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%rax,8), %ecx
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andl $56, %ecx
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, %esi
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andl $56, %eax
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -120(%rsp,%rax), %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -112(%rsp,%rax), %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rsp,%rax), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rsp,%rax), %r10
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, %r8, %r9
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %cl
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%r10,%r10), %rdi
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %rdi, %rdi
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, -128(%rsp,%rax), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, (%rsp,%rax), %r9
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq %r8, %r8
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r8, %r8
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -104(%rsp,%rax), %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rsp,%rax), %r11
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, %r11, %rbx
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -96(%rsp,%rax), %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 32(%rsp,%rax), %r14
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%r14,%r14), %r9
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r9, %r9
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %rbx, %r9
@@ -9965,9 +10466,9 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq %r11, %r11
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r11, %r11
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r10, %r11
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -88(%rsp,%rax), %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 40(%rsp,%rax), %r10
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, %r10, %rbx
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -80(%rsp,%rax), %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 48(%rsp,%rax), %r15
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%r15,%r15), %r12
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r12, %r12
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %rbx, %r12
@@ -9976,7 +10477,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r10, %r10
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %rbx, %r10
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, %r15, %rbx
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -72(%rsp,%rax), %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 56(%rsp,%rax), %rax
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%rax,%rax), %r14
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r14, %rcx
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %rbx, %rcx
@@ -9989,18 +10490,23 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, 24(%rdx)
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, (%rdx)
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, 8(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq $8, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq -32(%rbp), %rsp
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %rbx
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r12
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r14
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %rbp
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retq
;
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_64bytes:
; X64-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsp, %rbp
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r15
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r14
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: subq $160, %rsp
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rcx
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
@@ -10011,37 +10517,37 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 56(%rdi), %rdi
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%rsi), %eax
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r14, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbx, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, (%rsp)
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%rax,8), %ecx
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andl $56, %ecx
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andl $56, %eax
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -112(%rsp,%rax), %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -128(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -120(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rsp,%rax), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rsp,%rax), %r9
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, %r8
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %rdi, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -96(%rsp,%rax), %r10
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -104(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 32(%rsp,%rax), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rsp,%rax), %r11
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, %rbx
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r10, %rbx
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r11, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -80(%rsp,%rax), %r11
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -88(%rsp,%rax), %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 48(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 40(%rsp,%rax), %r14
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r14, %r15
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r11, %r15
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r14, %r10
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -72(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 56(%rsp,%rax), %rax
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %rax, %r11
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %rax, %rax
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $rcx
@@ -10054,57 +10560,62 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsi, (%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, 8(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 56(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: leaq -24(%rbp), %rsp
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popq %rbx
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r14
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popq %rbp
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retq
;
; X64-NO-SHLD-NO-BMI2-SSE4-LABEL: lshr_64bytes:
; X64-NO-SHLD-NO-BMI2-SSE4: # %bb.0:
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rsp, %rbp
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %r15
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %r14
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %r13
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %r12
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbx
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rax
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: subq $192, %rsp
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 32(%rdi), %xmm2
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 48(%rdi), %xmm3
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl (%rsi), %r8d
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (,%r8,8), %eax
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andl $56, %eax
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andl $56, %r8d
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -128(%rsp,%r8), %r10
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -120(%rsp,%r8), %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 32(%rsp,%r8), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 40(%rsp,%r8), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r10
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %esi
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %sil
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r9,%r9), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: addq %rdi, %rdi
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %rdi
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r10, %rdi
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -104(%rsp,%r8), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 56(%rsp,%r8), %r10
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, %rbx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %rbx
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -96(%rsp,%r8), %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 64(%rsp,%r8), %r12
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r12,%r12), %r11
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r11
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %rbx, %r11
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -112(%rsp,%r8), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 48(%rsp,%r8), %rbx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rbx, %r14
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r14
@@ -10112,12 +10623,12 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r10
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r14, %r10
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -88(%rsp,%r8), %r14
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 72(%rsp,%r8), %r14
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r14, %r13
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r13
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -80(%rsp,%r8), %rbp
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%rbp,%rbp), %r15
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 80(%rsp,%r8), %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r9,%r9), %r15
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r15
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r13, %r15
@@ -10128,13 +10639,14 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r14
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r12, %r14
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %rbp
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -72(%rsp,%r8), %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 88(%rsp,%r8), %r8
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r8,%r8), %r12
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r12
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %rbp, %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r9, %r12
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r9
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: addq %rbx, %rbx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
@@ -10150,7 +10662,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 16(%rdx)
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r11, 24(%rdx)
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, (%rdx)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: addq $8, %rsp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq -40(%rbp), %rsp
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %rbx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %r12
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %r13
@@ -10161,42 +10673,46 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
;
; X64-HAVE-SHLD-NO-BMI2-SSE4-LABEL: lshr_64bytes:
; X64-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rsp, %rbp
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushq %r15
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushq %r14
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: subq $160, %rsp
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 32(%rdi), %xmm2
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 48(%rdi), %xmm3
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl (%rsi), %eax
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, (%rsp)
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: leal (,%rax,8), %ecx
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andl $56, %ecx
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andl $56, %eax
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -96(%rsp,%rax), %rdi
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -104(%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 32(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 24(%rsp,%rax), %r9
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, %rsi
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rdi, %rsi
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -112(%rsp,%rax), %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 16(%rsp,%rax), %r10
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, %r8
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %r9, %r8
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -80(%rsp,%rax), %r9
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -88(%rsp,%rax), %r11
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 48(%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 40(%rsp,%rax), %r11
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r11, %rbx
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %r9, %rbx
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %r11, %rdi
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -72(%rsp,%rax), %r11
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 56(%rsp,%rax), %r11
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %r11, %r9
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -128(%rsp,%rax), %r14
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -120(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq (%rsp,%rax), %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 8(%rsp,%rax), %rax
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, %r15
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %r10, %r15
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rax, %r14
@@ -10210,126 +10726,138 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, 16(%rdx)
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rsi, 24(%rdx)
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r14, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: leaq -24(%rbp), %rsp
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popq %rbx
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popq %r14
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popq %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popq %rbp
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retq
;
; X64-NO-SHLD-HAVE-BMI2-SSE4-LABEL: lshr_64bytes:
; X64-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rsp, %rbp
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r15
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r14
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r13
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r12
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: subq $160, %rsp
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 32(%rdi), %xmm2
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 48(%rdi), %xmm3
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%rsi), %eax
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rax,8), %ecx
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %ecx
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, (%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rax,8), %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, %ecx
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %eax
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, -128(%rsp,%rax), %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %cl
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -120(%rsp,%rax), %r10
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -112(%rsp,%rax), %r9
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%r10,%r10), %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %rdi, %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r8, %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -104(%rsp,%rax), %r11
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r11, %rbx
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -96(%rsp,%rax), %r14
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%r14,%r14), %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r8, %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rbx, %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r9, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, (%rsp,%rax), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %sil
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 8(%rsp,%rax), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 16(%rsp,%rax), %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%r8,%r8), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rsi, %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r9, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 24(%rsp,%rax), %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, %r11, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 32(%rsp,%rax), %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%r14,%r14), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rsi, %r9, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rbx, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, %r10, %rbx
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addq %r11, %r11
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rsi, %r11, %r11
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rbx, %r11
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -88(%rsp,%rax), %rbx
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %rbx, %r15
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -80(%rsp,%rax), %r12
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%r12,%r12), %r13
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r13, %r13
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r15, %r13
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r14, %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 40(%rsp,%rax), %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 48(%rsp,%rax), %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%r15,%r15), %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rsi, %r12, %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, %rbx, %r13
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r13, %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, %r14, %r14
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addq %rbx, %rbx
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %rbx, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rsi, %rbx, %rbx
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r14, %rbx
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r12, %r14
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -72(%rsp,%rax), %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, %r15, %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 56(%rsp,%rax), %rax
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%rax,%rax), %r15
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r15, %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rsi, %r15, %r15
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r14, %r15
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r10, %r10
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addq %r9, %r9
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r9, %rcx
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r10, %rcx
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %rax, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addq %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rsi, %r10, %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r8, %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, %rax, %rax
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 56(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rcx, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rsi, 8(%rdx)
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r15, 48(%rdx)
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rbx, 32(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r13, 40(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r12, 40(%rdx)
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r11, 16(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r8, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, 24(%rdx)
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq -40(%rbp), %rsp
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %rbx
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r12
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r13
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r14
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %rbp
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retq
;
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: lshr_64bytes:
; X64-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rsp, %rbp
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r15
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r14
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: subq $160, %rsp
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 32(%rdi), %xmm2
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 48(%rdi), %xmm3
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%rsi), %eax
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, (%rsp)
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rax,8), %ecx
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %ecx
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %eax
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -96(%rsp,%rax), %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -104(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 32(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 24(%rsp,%rax), %r9
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, %rsi
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rdi, %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -112(%rsp,%rax), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 16(%rsp,%rax), %r10
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, %r8
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %r9, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -80(%rsp,%rax), %r9
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -88(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 48(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 40(%rsp,%rax), %r11
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r11, %rbx
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %r9, %rbx
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %r11, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -72(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 56(%rsp,%rax), %r11
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %r11, %r9
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -128(%rsp,%rax), %r14
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -120(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq (%rsp,%rax), %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 8(%rsp,%rax), %rax
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, %r15
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %r10, %r15
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, %r11, %r10
@@ -10343,51 +10871,56 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rsi, 24(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r14, (%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, 56(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: leaq -24(%rbp), %rsp
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popq %rbx
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r14
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popq %rbp
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retq
;
; X64-NO-SHLD-NO-BMI2-AVX1-LABEL: lshr_64bytes:
; X64-NO-SHLD-NO-BMI2-AVX1: # %bb.0:
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %rbp
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rsp, %rbp
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %r15
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %r14
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %r13
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %r12
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %rbx
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %rax
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: subq $192, %rsp
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups (%rdi), %ymm0
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups 32(%rdi), %ymm1
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl (%rsi), %r9d
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: leal (,%r9,8), %eax
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: andl $56, %eax
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: andl $56, %r9d
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -128(%rsp,%r9), %r10
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -120(%rsp,%r9), %r8
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq 32(%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq 40(%rsp,%r9), %rdi
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r10
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %esi
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: notb %sil
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: leaq (%r8,%r8), %rdi
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: addq %rdi, %rdi
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %rdi
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r10, %rdi
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -104(%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq 56(%rsp,%r9), %r10
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r10, %rbx
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -96(%rsp,%r9), %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq 64(%rsp,%r9), %r12
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: leaq (%r12,%r12), %r11
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r11
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %rbx, %r11
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -112(%rsp,%r9), %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq 48(%rsp,%r9), %rbx
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rbx, %r14
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r14
@@ -10395,12 +10928,12 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r10
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r14, %r10
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -88(%rsp,%r9), %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq 72(%rsp,%r9), %r14
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r14, %r13
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r13
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -80(%rsp,%r9), %rbp
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: leaq (%rbp,%rbp), %r15
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq 80(%rsp,%r9), %r8
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: leaq (%r8,%r8), %r15
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r15
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r13, %r15
@@ -10411,13 +10944,14 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r14
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r12, %r14
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %rbp
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -72(%rsp,%r9), %r9
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq 88(%rsp,%r9), %r9
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: leaq (%r9,%r9), %r12
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r12
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %rbp, %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r8, %r12
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r8
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: addq %rbx, %rbx
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
@@ -10433,7 +10967,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r10, 16(%rdx)
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r11, 24(%rdx)
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rdi, (%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: addq $8, %rsp
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: leaq -40(%rbp), %rsp
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %rbx
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %r12
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %r13
@@ -10445,36 +10979,40 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
;
; X64-HAVE-SHLD-NO-BMI2-AVX1-LABEL: lshr_64bytes:
; X64-HAVE-SHLD-NO-BMI2-AVX1: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rsp, %rbp
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushq %r15
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushq %r14
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushq %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: subq $160, %rsp
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups (%rdi), %ymm0
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups 32(%rdi), %ymm1
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl (%rsi), %eax
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovaps %ymm0, (%rsp)
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: leal (,%rax,8), %ecx
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: andl $56, %ecx
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: andl $56, %eax
-; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -96(%rsp,%rax), %rdi
-; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -104(%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq 32(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq 24(%rsp,%rax), %r9
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r9, %rsi
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdq %cl, %rdi, %rsi
-; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -112(%rsp,%rax), %r10
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq 16(%rsp,%rax), %r10
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r10, %r8
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdq %cl, %r9, %r8
-; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -80(%rsp,%rax), %r9
-; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -88(%rsp,%rax), %r11
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq 48(%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq 40(%rsp,%rax), %r11
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r11, %rbx
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdq %cl, %r9, %rbx
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdq %cl, %r11, %rdi
-; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -72(%rsp,%rax), %r11
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq 56(%rsp,%rax), %r11
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdq %cl, %r11, %r9
-; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -128(%rsp,%rax), %r14
-; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -120(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq (%rsp,%rax), %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq 8(%rsp,%rax), %rax
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rax, %r15
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdq %cl, %r10, %r15
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdq %cl, %rax, %r14
@@ -10488,116 +11026,128 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r8, 16(%rdx)
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rsi, 24(%rdx)
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r14, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: leaq -24(%rbp), %rsp
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popq %rbx
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popq %r14
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popq %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popq %rbp
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vzeroupper
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: retq
;
; X64-NO-SHLD-HAVE-BMI2-AVX1-LABEL: lshr_64bytes:
; X64-NO-SHLD-HAVE-BMI2-AVX1: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rsp, %rbp
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r15
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r14
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r13
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r12
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: subq $160, %rsp
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups (%rdi), %ymm0
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups 32(%rdi), %ymm1
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl (%rsi), %esi
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (,%rsi,8), %eax
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: andl $56, %eax
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovaps %ymm0, (%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: andl $56, %ecx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, %eax
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: andl $56, %esi
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rcx, -128(%rsp,%rsi), %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: notb %al
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -120(%rsp,%rsi), %r10
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -112(%rsp,%rsi), %r9
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leaq (%r10,%r10), %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rax, %rdi, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r8, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -104(%rsp,%rsi), %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rcx, %r11, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -96(%rsp,%rsi), %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leaq (%r14,%r14), %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rax, %r8, %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %rbx, %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rcx, %r9, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rax, (%rsp,%rsi), %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: notb %cl
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq 8(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq 16(%rsp,%rsi), %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leaq (%r8,%r8), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r9, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq 24(%rsp,%rsi), %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rax, %r11, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq 32(%rsp,%rsi), %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leaq (%r14,%r14), %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %r9, %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %rbx, %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rax, %r10, %rbx
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addq %r11, %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rax, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %r11, %r11
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %rbx, %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -88(%rsp,%rsi), %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rcx, %rbx, %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -80(%rsp,%rsi), %r12
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leaq (%r12,%r12), %r13
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rax, %r13, %r13
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r15, %r13
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rcx, %r14, %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq 40(%rsp,%rsi), %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq 48(%rsp,%rsi), %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leaq (%r15,%r15), %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %r12, %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rax, %rbx, %r13
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r13, %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rax, %r14, %r14
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addq %rbx, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rax, %rbx, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %rbx, %rbx
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r14, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rcx, %r12, %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -72(%rsp,%rsi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rax, %r15, %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq 56(%rsp,%rsi), %rsi
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leaq (%rsi,%rsi), %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rax, %r15, %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %r15, %r15
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r14, %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rcx, %r10, %r10
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addq %r9, %r9
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rax, %r9, %rax
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r10, %rax
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rcx, %rsi, %rcx
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rcx, 56(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rax, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addq %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %r10, %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rax, %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r8, %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rax, %rsi, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rax, 56(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rcx, 8(%rdx)
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r15, 48(%rdx)
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rbx, 32(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r13, 40(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r12, 40(%rdx)
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r11, 16(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r8, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r9, 24(%rdx)
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leaq -40(%rbp), %rsp
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popq %rbx
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r12
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r13
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r14
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popq %rbp
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vzeroupper
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: retq
;
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-LABEL: lshr_64bytes:
; X64-HAVE-SHLD-HAVE-BMI2-AVX1: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rsp, %rbp
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r15
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r14
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: subq $160, %rsp
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups (%rdi), %ymm0
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups 32(%rdi), %ymm1
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl (%rsi), %eax
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovaps %ymm0, (%rsp)
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: leal (,%rax,8), %ecx
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: andl $56, %ecx
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: andl $56, %eax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -96(%rsp,%rax), %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -104(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq 32(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq 24(%rsp,%rax), %r9
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r9, %rsi
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdq %cl, %rdi, %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -112(%rsp,%rax), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq 16(%rsp,%rax), %r10
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r10, %r8
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdq %cl, %r9, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -80(%rsp,%rax), %r9
-; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -88(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq 48(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq 40(%rsp,%rax), %r11
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r11, %rbx
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdq %cl, %r9, %rbx
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdq %cl, %r11, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -72(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq 56(%rsp,%rax), %r11
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdq %cl, %r11, %r9
-; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -128(%rsp,%rax), %r14
-; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -120(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq (%rsp,%rax), %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq 8(%rsp,%rax), %rax
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rax, %r15
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdq %cl, %r10, %r15
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rcx, %r11, %r10
@@ -10611,49 +11161,54 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rsi, 24(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r14, (%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r10, 56(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: leaq -24(%rbp), %rsp
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popq %rbx
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r14
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popq %rbp
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vzeroupper
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: retq
;
; X64-NO-SHLD-NO-BMI2-AVX512-LABEL: lshr_64bytes:
; X64-NO-SHLD-NO-BMI2-AVX512: # %bb.0:
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %rbp
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %rsp, %rbp
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %r15
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %r14
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %r13
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %r12
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %rax
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: subq $192, %rsp
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups (%rdi), %zmm0
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl (%rsi), %r9d
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: leal (,%r9,8), %eax
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: andl $56, %eax
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: andl $56, %r9d
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -128(%rsp,%r9), %r10
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -120(%rsp,%r9), %r8
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq 32(%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq 40(%rsp,%r9), %rdi
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r10
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %esi
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: notb %sil
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: leaq (%r8,%r8), %rdi
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: addq %rdi, %rdi
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %rdi
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r10, %rdi
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -104(%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq 56(%rsp,%r9), %r10
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r10, %rbx
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -96(%rsp,%r9), %r12
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq 64(%rsp,%r9), %r12
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: leaq (%r12,%r12), %r11
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r11
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %rbx, %r11
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -112(%rsp,%r9), %rbx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq 48(%rsp,%r9), %rbx
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %rbx, %r14
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r14
@@ -10661,12 +11216,12 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r10
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r14, %r10
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -88(%rsp,%r9), %r14
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq 72(%rsp,%r9), %r14
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r14, %r13
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r13
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -80(%rsp,%r9), %rbp
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: leaq (%rbp,%rbp), %r15
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq 80(%rsp,%r9), %r8
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: leaq (%r8,%r8), %r15
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r15
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r13, %r15
@@ -10677,13 +11232,14 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r14
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r12, %r14
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %rbp
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -72(%rsp,%r9), %r9
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq 88(%rsp,%r9), %r9
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: leaq (%r9,%r9), %r12
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r12
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %rbp, %r12
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r8, %r12
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r8
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: addq %rbx, %rbx
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
@@ -10699,7 +11255,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r10, 16(%rdx)
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r11, 24(%rdx)
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %rdi, (%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: addq $8, %rsp
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: leaq -40(%rbp), %rsp
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %rbx
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %r12
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %r13
@@ -10711,33 +11267,37 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
;
; X64-HAVE-SHLD-NO-BMI2-AVX512-LABEL: lshr_64bytes:
; X64-HAVE-SHLD-NO-BMI2-AVX512: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %rsp, %rbp
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushq %r15
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushq %r14
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushq %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: subq $160, %rsp
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups (%rdi), %zmm0
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl (%rsi), %edi
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm0, (%rsp)
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: leal (,%rdi,8), %ecx
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: andl $56, %ecx
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: andl $56, %edi
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -96(%rsp,%rdi), %rsi
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -104(%rsp,%rdi), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq 32(%rsp,%rdi), %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq 24(%rsp,%rdi), %r9
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r9, %rax
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdq %cl, %rsi, %rax
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -112(%rsp,%rdi), %r10
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq 16(%rsp,%rdi), %r10
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r10, %r8
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdq %cl, %r9, %r8
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -80(%rsp,%rdi), %r9
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -88(%rsp,%rdi), %r11
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq 48(%rsp,%rdi), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq 40(%rsp,%rdi), %r11
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r11, %rbx
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdq %cl, %r9, %rbx
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdq %cl, %r11, %rsi
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -72(%rsp,%rdi), %r11
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq 56(%rsp,%rdi), %r11
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdq %cl, %r11, %r9
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -128(%rsp,%rdi), %r14
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -120(%rsp,%rdi), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq (%rsp,%rdi), %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq 8(%rsp,%rdi), %rdi
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %rdi, %r15
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdq %cl, %r10, %r15
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdq %cl, %rdi, %r14
@@ -10751,110 +11311,122 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r8, 16(%rdx)
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %rax, 24(%rdx)
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r14, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: leaq -24(%rbp), %rsp
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popq %rbx
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popq %r14
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popq %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popq %rbp
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vzeroupper
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: retq
;
; X64-NO-SHLD-HAVE-BMI2-AVX512-LABEL: lshr_64bytes:
; X64-NO-SHLD-HAVE-BMI2-AVX512: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rsp, %rbp
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r15
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r14
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r13
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r12
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: subq $160, %rsp
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups (%rdi), %zmm0
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%rsi), %esi
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (,%rsi,8), %eax
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %eax
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm0, (%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %ecx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, %eax
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %esi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, -128(%rsp,%rsi), %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: notb %al
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -120(%rsp,%rsi), %r10
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -112(%rsp,%rsi), %r9
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leaq (%r10,%r10), %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rax, %rdi, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r8, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -104(%rsp,%rsi), %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, %r11, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -96(%rsp,%rsi), %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leaq (%r14,%r14), %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rax, %r8, %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %rbx, %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, %r9, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, (%rsp,%rsi), %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: notb %cl
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq 8(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq 16(%rsp,%rsi), %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leaq (%r8,%r8), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r9, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq 24(%rsp,%rsi), %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %r11, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq 32(%rsp,%rsi), %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leaq (%r14,%r14), %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %r9, %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %rbx, %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %r10, %rbx
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addq %r11, %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rax, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %r11, %r11
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %rbx, %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -88(%rsp,%rsi), %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, %rbx, %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -80(%rsp,%rsi), %r12
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leaq (%r12,%r12), %r13
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rax, %r13, %r13
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r15, %r13
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, %r14, %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq 40(%rsp,%rsi), %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq 48(%rsp,%rsi), %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leaq (%r15,%r15), %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %r12, %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %rbx, %r13
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r13, %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %r14, %r14
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addq %rbx, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rax, %rbx, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %rbx, %rbx
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r14, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, %r12, %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -72(%rsp,%rsi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %r15, %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq 56(%rsp,%rsi), %rsi
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leaq (%rsi,%rsi), %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rax, %r15, %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %r15, %r15
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r14, %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, %r10, %r10
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addq %r9, %r9
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rax, %r9, %rax
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r10, %rax
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, %rsi, %rcx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rcx, 56(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rax, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addq %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %r10, %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r8, %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %rsi, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rax, 56(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rcx, 8(%rdx)
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r15, 48(%rdx)
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rbx, 32(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r13, 40(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r12, 40(%rdx)
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r11, 16(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r8, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r9, 24(%rdx)
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leaq -40(%rbp), %rsp
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %rbx
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r12
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r13
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r14
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %rbp
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vzeroupper
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: retq
;
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-LABEL: lshr_64bytes:
; X64-HAVE-SHLD-HAVE-BMI2-AVX512: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rsp, %rbp
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r15
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r14
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: subq $160, %rsp
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups (%rdi), %zmm0
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%rsi), %eax
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm0, (%rsp)
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: leal (,%rax,8), %ecx
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %ecx
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %eax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -96(%rsp,%rax), %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -104(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq 32(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq 24(%rsp,%rax), %r9
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r9, %rsi
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdq %cl, %rdi, %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -112(%rsp,%rax), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq 16(%rsp,%rax), %r10
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r10, %r8
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdq %cl, %r9, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -80(%rsp,%rax), %r9
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -88(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq 48(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq 40(%rsp,%rax), %r11
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r11, %rbx
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdq %cl, %r9, %rbx
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdq %cl, %r11, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -72(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq 56(%rsp,%rax), %r11
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdq %cl, %r11, %r9
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -128(%rsp,%rax), %r14
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -120(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq (%rsp,%rax), %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq 8(%rsp,%rax), %rax
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rax, %r15
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdq %cl, %r10, %r15
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, %r11, %r10
@@ -10868,9 +11440,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rsi, 24(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r14, (%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r10, 56(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: leaq -24(%rbp), %rsp
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popq %rbx
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r14
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popq %rbp
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vzeroupper
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: retq
;
@@ -13378,7 +13952,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
define void @lshr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind {
; X64-SSE2-LABEL: lshr_64bytes_qwordOff:
; X64-SSE2: # %bb.0:
+; X64-SSE2-NEXT: pushq %rbp
+; X64-SSE2-NEXT: movq %rsp, %rbp
; X64-SSE2-NEXT: pushq %rbx
+; X64-SSE2-NEXT: andq $-32, %rsp
+; X64-SSE2-NEXT: subq $160, %rsp
; X64-SSE2-NEXT: movq (%rdi), %rax
; X64-SSE2-NEXT: movq 8(%rdi), %rcx
; X64-SSE2-NEXT: movq 16(%rdi), %r8
@@ -13389,27 +13967,27 @@ define void @lshr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no
; X64-SSE2-NEXT: movq 56(%rdi), %rdi
; X64-SSE2-NEXT: movl (%rsi), %esi
; X64-SSE2-NEXT: xorps %xmm0, %xmm0
-; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movq %rbx, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movq %r11, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movq %rax, (%rsp)
; X64-SSE2-NEXT: andl $7, %esi
-; X64-SSE2-NEXT: movq -128(%rsp,%rsi,8), %rax
-; X64-SSE2-NEXT: movq -120(%rsp,%rsi,8), %rcx
-; X64-SSE2-NEXT: movq -104(%rsp,%rsi,8), %rdi
-; X64-SSE2-NEXT: movq -112(%rsp,%rsi,8), %r8
-; X64-SSE2-NEXT: movq -88(%rsp,%rsi,8), %r9
-; X64-SSE2-NEXT: movq -96(%rsp,%rsi,8), %r10
-; X64-SSE2-NEXT: movq -72(%rsp,%rsi,8), %r11
-; X64-SSE2-NEXT: movq -80(%rsp,%rsi,8), %rsi
+; X64-SSE2-NEXT: movq (%rsp,%rsi,8), %rax
+; X64-SSE2-NEXT: movq 8(%rsp,%rsi,8), %rcx
+; X64-SSE2-NEXT: movq 24(%rsp,%rsi,8), %rdi
+; X64-SSE2-NEXT: movq 16(%rsp,%rsi,8), %r8
+; X64-SSE2-NEXT: movq 40(%rsp,%rsi,8), %r9
+; X64-SSE2-NEXT: movq 32(%rsp,%rsi,8), %r10
+; X64-SSE2-NEXT: movq 56(%rsp,%rsi,8), %r11
+; X64-SSE2-NEXT: movq 48(%rsp,%rsi,8), %rsi
; X64-SSE2-NEXT: movq %rsi, 48(%rdx)
; X64-SSE2-NEXT: movq %r11, 56(%rdx)
; X64-SSE2-NEXT: movq %r10, 32(%rdx)
@@ -13418,80 +13996,94 @@ define void @lshr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no
; X64-SSE2-NEXT: movq %rdi, 24(%rdx)
; X64-SSE2-NEXT: movq %rax, (%rdx)
; X64-SSE2-NEXT: movq %rcx, 8(%rdx)
+; X64-SSE2-NEXT: leaq -8(%rbp), %rsp
; X64-SSE2-NEXT: popq %rbx
+; X64-SSE2-NEXT: popq %rbp
; X64-SSE2-NEXT: retq
;
; X64-SSE42-LABEL: lshr_64bytes_qwordOff:
; X64-SSE42: # %bb.0:
-; X64-SSE42-NEXT: pushq %rax
+; X64-SSE42-NEXT: pushq %rbp
+; X64-SSE42-NEXT: movq %rsp, %rbp
+; X64-SSE42-NEXT: andq $-32, %rsp
+; X64-SSE42-NEXT: subq $160, %rsp
; X64-SSE42-NEXT: movups (%rdi), %xmm0
; X64-SSE42-NEXT: movups 16(%rdi), %xmm1
; X64-SSE42-NEXT: movups 32(%rdi), %xmm2
; X64-SSE42-NEXT: movups 48(%rdi), %xmm3
; X64-SSE42-NEXT: movl (%rsi), %eax
; X64-SSE42-NEXT: xorps %xmm4, %xmm4
-; X64-SSE42-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm0, (%rsp)
; X64-SSE42-NEXT: andl $7, %eax
-; X64-SSE42-NEXT: movups -128(%rsp,%rax,8), %xmm0
-; X64-SSE42-NEXT: movups -112(%rsp,%rax,8), %xmm1
-; X64-SSE42-NEXT: movups -96(%rsp,%rax,8), %xmm2
-; X64-SSE42-NEXT: movups -80(%rsp,%rax,8), %xmm3
+; X64-SSE42-NEXT: movups (%rsp,%rax,8), %xmm0
+; X64-SSE42-NEXT: movups 16(%rsp,%rax,8), %xmm1
+; X64-SSE42-NEXT: movups 32(%rsp,%rax,8), %xmm2
+; X64-SSE42-NEXT: movups 48(%rsp,%rax,8), %xmm3
; X64-SSE42-NEXT: movups %xmm3, 48(%rdx)
; X64-SSE42-NEXT: movups %xmm1, 16(%rdx)
; X64-SSE42-NEXT: movups %xmm2, 32(%rdx)
; X64-SSE42-NEXT: movups %xmm0, (%rdx)
-; X64-SSE42-NEXT: popq %rax
+; X64-SSE42-NEXT: movq %rbp, %rsp
+; X64-SSE42-NEXT: popq %rbp
; X64-SSE42-NEXT: retq
;
; X64-AVX1-LABEL: lshr_64bytes_qwordOff:
; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: pushq %rax
+; X64-AVX1-NEXT: pushq %rbp
+; X64-AVX1-NEXT: movq %rsp, %rbp
+; X64-AVX1-NEXT: andq $-32, %rsp
+; X64-AVX1-NEXT: subq $160, %rsp
; X64-AVX1-NEXT: vmovups (%rdi), %ymm0
; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm1
; X64-AVX1-NEXT: movl (%rsi), %eax
; X64-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; X64-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; X64-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; X64-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT: vmovaps %ymm0, (%rsp)
; X64-AVX1-NEXT: andl $7, %eax
-; X64-AVX1-NEXT: vmovups -128(%rsp,%rax,8), %xmm0
-; X64-AVX1-NEXT: vmovups -112(%rsp,%rax,8), %xmm1
-; X64-AVX1-NEXT: vmovups -96(%rsp,%rax,8), %xmm2
-; X64-AVX1-NEXT: vmovups -80(%rsp,%rax,8), %xmm3
+; X64-AVX1-NEXT: vmovups (%rsp,%rax,8), %xmm0
+; X64-AVX1-NEXT: vmovups 16(%rsp,%rax,8), %xmm1
+; X64-AVX1-NEXT: vmovups 32(%rsp,%rax,8), %xmm2
+; X64-AVX1-NEXT: vmovups 48(%rsp,%rax,8), %xmm3
; X64-AVX1-NEXT: vmovups %xmm3, 48(%rdx)
; X64-AVX1-NEXT: vmovups %xmm1, 16(%rdx)
; X64-AVX1-NEXT: vmovups %xmm2, 32(%rdx)
; X64-AVX1-NEXT: vmovups %xmm0, (%rdx)
-; X64-AVX1-NEXT: popq %rax
+; X64-AVX1-NEXT: movq %rbp, %rsp
+; X64-AVX1-NEXT: popq %rbp
; X64-AVX1-NEXT: vzeroupper
; X64-AVX1-NEXT: retq
;
; X64-AVX512-LABEL: lshr_64bytes_qwordOff:
; X64-AVX512: # %bb.0:
-; X64-AVX512-NEXT: pushq %rax
+; X64-AVX512-NEXT: pushq %rbp
+; X64-AVX512-NEXT: movq %rsp, %rbp
+; X64-AVX512-NEXT: andq $-32, %rsp
+; X64-AVX512-NEXT: subq $160, %rsp
; X64-AVX512-NEXT: vmovups (%rdi), %zmm0
; X64-AVX512-NEXT: movl (%rsi), %eax
; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; X64-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%rsp)
+; X64-AVX512-NEXT: vmovups %zmm0, (%rsp)
; X64-AVX512-NEXT: andl $7, %eax
-; X64-AVX512-NEXT: vmovups -128(%rsp,%rax,8), %xmm0
-; X64-AVX512-NEXT: vmovups -112(%rsp,%rax,8), %xmm1
-; X64-AVX512-NEXT: vmovups -96(%rsp,%rax,8), %xmm2
-; X64-AVX512-NEXT: vmovups -80(%rsp,%rax,8), %xmm3
+; X64-AVX512-NEXT: vmovups (%rsp,%rax,8), %xmm0
+; X64-AVX512-NEXT: vmovups 16(%rsp,%rax,8), %xmm1
+; X64-AVX512-NEXT: vmovups 32(%rsp,%rax,8), %xmm2
+; X64-AVX512-NEXT: vmovups 48(%rsp,%rax,8), %xmm3
; X64-AVX512-NEXT: vmovups %xmm3, 48(%rdx)
; X64-AVX512-NEXT: vmovups %xmm1, 16(%rdx)
; X64-AVX512-NEXT: vmovups %xmm2, 32(%rdx)
; X64-AVX512-NEXT: vmovups %xmm0, (%rdx)
-; X64-AVX512-NEXT: popq %rax
+; X64-AVX512-NEXT: movq %rbp, %rsp
+; X64-AVX512-NEXT: popq %rbp
; X64-AVX512-NEXT: vzeroupper
; X64-AVX512-NEXT: retq
;
@@ -13716,11 +14308,15 @@ define void @lshr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no
define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE2-LABEL: shl_64bytes:
; X64-NO-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbp
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rsp, %rbp
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %r15
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %r14
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %r13
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %r12
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: subq $160, %rsp
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rax
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %rcx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r8
@@ -13731,25 +14327,25 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 56(%rdi), %rdi
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%rsi), %esi
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, (%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (,%rsi,8), %eax
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andl $56, %eax
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andl $56, %esi
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: negl %esi
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movslq %esi, %rbx
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -64(%rsp,%rbx), %r8
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -56(%rsp,%rbx), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 64(%rsp,%rbx), %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 72(%rsp,%rbx), %rdi
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, %r10
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r10
@@ -13760,11 +14356,11 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r9
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r10, %r9
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -40(%rsp,%rbx), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 88(%rsp,%rbx), %r10
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, %r14
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r14
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -48(%rsp,%rbx), %r15
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 80(%rsp,%rbx), %r15
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r15, %r11
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %r11
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
@@ -13776,11 +14372,11 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rdi
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r15, %rdi
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -24(%rsp,%rbx), %r14
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 104(%rsp,%rbx), %r14
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, %r12
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r12
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -32(%rsp,%rbx), %r13
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 96(%rsp,%rbx), %r13
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r13, %r15
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %r15
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
@@ -13792,10 +14388,10 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r10
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r13, %r10
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -8(%rsp,%rbx), %r12
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 120(%rsp,%rbx), %r12
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r12
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -16(%rsp,%rbx), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 112(%rsp,%rbx), %rbx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, %r13
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %r13
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
@@ -13817,18 +14413,23 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, 16(%rdx)
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, 24(%rdx)
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq -40(%rbp), %rsp
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %rbx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %r12
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %r13
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %r14
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %r15
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %rbp
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: retq
;
; X64-HAVE-SHLD-NO-BMI2-SSE2-LABEL: shl_64bytes:
; X64-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rsp, %rbp
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushq %r14
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbx
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushq %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: subq $128, %rsp
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rax
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %rcx
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r8
@@ -13839,37 +14440,37 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 56(%rdi), %rdi
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%rsi), %esi
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, (%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: leal (,%rsi,8), %ecx
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andl $56, %ecx
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andl $56, %esi
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: negl %esi
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movslq %esi, %r9
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -48(%rsp,%r9), %rax
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -40(%rsp,%r9), %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 80(%rsp,%r9), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 88(%rsp,%r9), %r10
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, %rsi
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %rax, %rsi
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -64(%rsp,%r9), %r8
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -56(%rsp,%r9), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 64(%rsp,%r9), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 72(%rsp,%r9), %rdi
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %rdi, %rax
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -32(%rsp,%r9), %r11
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -24(%rsp,%r9), %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 96(%rsp,%r9), %r11
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 104(%rsp,%r9), %rbx
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, %r14
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %r11, %r14
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %r10, %r11
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -16(%rsp,%r9), %r10
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -8(%rsp,%r9), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 112(%rsp,%r9), %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 120(%rsp,%r9), %r9
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %r10, %r9
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %rbx, %r10
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %r8, %rdi
@@ -13883,18 +14484,22 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rsi, 24(%rdx)
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, (%rdx)
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, 8(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: addq $8, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: leaq -16(%rbp), %rsp
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popq %rbx
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popq %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popq %rbp
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retq
;
; X64-NO-SHLD-HAVE-BMI2-SSE2-LABEL: shl_64bytes:
; X64-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsp, %rbp
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r15
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r14
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r12
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rbx
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: subq $160, %rsp
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rax
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %rcx
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r8
@@ -13905,35 +14510,35 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 56(%rdi), %rdi
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%rsi), %esi
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, (%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%rsi,8), %eax
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andl $56, %eax
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ecx
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andl $56, %esi
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: negl %esi
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movslq %esi, %rsi
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -64(%rsp,%rsi), %r9
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -56(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 64(%rsp,%rsi), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 72(%rsp,%rsi), %rdi
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %rdi, %r8
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %al
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r9, %r10
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %r9
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %r9, %r9
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r8, %r9
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -40(%rsp,%rsi), %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 88(%rsp,%rsi), %r11
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r11, %rbx
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -48(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 80(%rsp,%rsi), %r8
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r8, %r14
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %r8
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %r8, %r8
@@ -13941,9 +14546,9 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %rdi
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %rdi, %rdi
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r14, %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -24(%rsp,%rsi), %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 104(%rsp,%rsi), %rbx
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %rbx, %r14
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -32(%rsp,%rsi), %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 96(%rsp,%rsi), %r15
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r15, %r12
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %r15
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %r15, %r15
@@ -13951,8 +14556,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %r11
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %r11, %r11
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r12, %r11
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, -8(%rsp,%rsi), %r14
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -16(%rsp,%rsi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, 120(%rsp,%rsi), %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 112(%rsp,%rsi), %rsi
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %rsi, %rcx
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %rsi
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %rsi, %rsi
@@ -13968,18 +14573,22 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, 16(%rdx)
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, 24(%rdx)
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, 8(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq $8, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq -32(%rbp), %rsp
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %rbx
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r12
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r14
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %rbp
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retq
;
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: shl_64bytes:
; X64-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsp, %rbp
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r14
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rbx
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: subq $128, %rsp
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rax
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %rcx
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r8
@@ -13990,37 +14599,37 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 56(%rdi), %rdi
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%rsi), %esi
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, (%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbx, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%rsi,8), %ecx
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andl $56, %ecx
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andl $56, %esi
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: negl %esi
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movslq %esi, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -48(%rsp,%r8), %rax
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -40(%rsp,%r8), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 80(%rsp,%r8), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 88(%rsp,%r8), %r9
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, %rsi
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %rax, %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -64(%rsp,%r8), %r10
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -56(%rsp,%r8), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 64(%rsp,%r8), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 72(%rsp,%r8), %rdi
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %rdi, %rax
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -32(%rsp,%r8), %r11
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -24(%rsp,%r8), %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 96(%rsp,%r8), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 104(%rsp,%r8), %rbx
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbx, %r14
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %r11, %r14
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %r9, %r11
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -16(%rsp,%r8), %r9
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -8(%rsp,%r8), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 112(%rsp,%r8), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 120(%rsp,%r8), %r8
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %r9, %r8
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %rbx, %r9
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %r10, %rdi
@@ -14033,44 +14642,49 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsi, 24(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, (%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, 8(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: addq $8, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: leaq -16(%rbp), %rsp
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popq %rbx
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popq %rbp
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retq
;
; X64-NO-SHLD-NO-BMI2-SSE4-LABEL: shl_64bytes:
; X64-NO-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rsp, %rbp
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %r15
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %r14
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %r13
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %r12
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: subq $160, %rsp
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 32(%rdi), %xmm2
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 48(%rdi), %xmm3
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl (%rsi), %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, (%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (,%rcx,8), %eax
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andl $56, %eax
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andl $56, %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: negl %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movslq %ecx, %r9
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -24(%rsp,%r9), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 104(%rsp,%r9), %rdi
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, %r10
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r10
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %esi
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %sil
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -32(%rsp,%r9), %r11
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 96(%rsp,%r9), %r11
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r11, %r8
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %r8
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
@@ -14078,7 +14692,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r10, %r8
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r11
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -40(%rsp,%r9), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 88(%rsp,%r9), %rbx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rbx, %r10
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %r10
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
@@ -14086,7 +14700,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r11, %r10
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %rbx
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -48(%rsp,%r9), %r15
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 80(%rsp,%r9), %r15
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r15, %r11
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %r11
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
@@ -14094,8 +14708,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %rbx, %r11
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r15
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -64(%rsp,%r9), %r14
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -56(%rsp,%r9), %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 64(%rsp,%r9), %r14
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 72(%rsp,%r9), %r12
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r12, %rbx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %rbx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
@@ -14108,7 +14722,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r15
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r12, %r15
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -16(%rsp,%r9), %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 112(%rsp,%r9), %r12
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r12, %r13
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r13
@@ -14116,7 +14730,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %rdi
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r13, %rdi
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -8(%rsp,%r9), %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 120(%rsp,%r9), %r9
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r9
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %r12
@@ -14133,52 +14747,58 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r11, 24(%rdx)
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 32(%rdx)
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, 40(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq -40(%rbp), %rsp
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %rbx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %r12
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %r13
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %r14
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %r15
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %rbp
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: retq
;
; X64-HAVE-SHLD-NO-BMI2-SSE4-LABEL: shl_64bytes:
; X64-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rsp, %rbp
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushq %r15
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushq %r14
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: subq $160, %rsp
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 32(%rdi), %xmm2
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 48(%rdi), %xmm3
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl (%rsi), %eax
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, (%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: leal (,%rax,8), %ecx
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andl $56, %ecx
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andl $56, %eax
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: negl %eax
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movslq %eax, %r8
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -32(%rsp,%r8), %rax
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -24(%rsp,%r8), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 96(%rsp,%r8), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 104(%rsp,%r8), %r9
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, %rsi
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %rax, %rsi
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -40(%rsp,%r8), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 88(%rsp,%r8), %rdi
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %rdi, %rax
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -48(%rsp,%r8), %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 80(%rsp,%r8), %r10
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %r10, %rdi
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -64(%rsp,%r8), %r11
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -56(%rsp,%r8), %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 64(%rsp,%r8), %r11
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 72(%rsp,%r8), %rbx
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %rbx, %r10
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -16(%rsp,%r8), %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 112(%rsp,%r8), %r14
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r14, %r15
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %r9, %r15
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -8(%rsp,%r8), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 120(%rsp,%r8), %r8
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %r14, %r8
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r11, %r9
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r9
@@ -14192,58 +14812,63 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, 32(%rdx)
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rsi, 40(%rdx)
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: leaq -24(%rbp), %rsp
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popq %rbx
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popq %r14
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popq %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popq %rbp
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retq
;
; X64-NO-SHLD-HAVE-BMI2-SSE4-LABEL: shl_64bytes:
; X64-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rsp, %rbp
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r15
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r14
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r12
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %rbx
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: subq $160, %rsp
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 32(%rdi), %xmm2
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 48(%rdi), %xmm3
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%rsi), %esi
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, (%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rsi,8), %eax
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %eax
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ecx
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %esi
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: negl %esi
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movslq %esi, %rsi
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -24(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 104(%rsp,%rsi), %rdi
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %rdi, %r9
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %al
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -32(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 96(%rsp,%rsi), %r8
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r8, %r10
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %r8
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %r8, %r8
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r9, %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -40(%rsp,%rsi), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 88(%rsp,%rsi), %r9
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r9, %r11
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %r9
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %r9, %r9
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r10, %r9
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -48(%rsp,%rsi), %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 80(%rsp,%rsi), %r10
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r10, %r14
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %r10
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %r10, %r10
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r11, %r10
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -64(%rsp,%rsi), %rbx
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -56(%rsp,%rsi), %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 64(%rsp,%rsi), %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 72(%rsp,%rsi), %r11
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r11, %r15
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %r11
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %r11, %r11
@@ -14252,12 +14877,12 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %rbx
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %rbx, %rbx
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r15, %rbx
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -16(%rsp,%rsi), %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 112(%rsp,%rsi), %r15
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r15, %r12
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %rdi
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %rdi, %rdi
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r12, %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, -8(%rsp,%rsi), %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, 120(%rsp,%rsi), %rcx
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %r15
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %r15, %rax
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rcx, %rax
@@ -14269,52 +14894,57 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, 24(%rdx)
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, 32(%rdx)
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r8, 40(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addq $8, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq -32(%rbp), %rsp
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %rbx
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r12
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r14
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %rbp
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retq
;
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: shl_64bytes:
; X64-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rsp, %rbp
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r15
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r14
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: subq $160, %rsp
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 32(%rdi), %xmm2
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 48(%rdi), %xmm3
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%rsi), %eax
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, (%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rax,8), %ecx
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %ecx
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %eax
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: negl %eax
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movslq %eax, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -32(%rsp,%r8), %rax
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -24(%rsp,%r8), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 96(%rsp,%r8), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 104(%rsp,%r8), %r9
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, %rsi
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %rax, %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -40(%rsp,%r8), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 88(%rsp,%r8), %rdi
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %rdi, %rax
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -48(%rsp,%r8), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 80(%rsp,%r8), %r10
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %r10, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -64(%rsp,%r8), %r11
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -56(%rsp,%r8), %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 64(%rsp,%r8), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 72(%rsp,%r8), %rbx
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %rbx, %r10
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -16(%rsp,%r8), %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 112(%rsp,%r8), %r14
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r14, %r15
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %r9, %r15
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -8(%rsp,%r8), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 120(%rsp,%r8), %r8
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %r14, %r8
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r11, %r9
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $rcx
@@ -14327,38 +14957,44 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 32(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rsi, 40(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: leaq -24(%rbp), %rsp
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popq %rbx
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r14
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popq %rbp
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retq
;
; X64-NO-SHLD-NO-BMI2-AVX1-LABEL: shl_64bytes:
; X64-NO-SHLD-NO-BMI2-AVX1: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %rbp
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rsp, %rbp
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %r15
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %r14
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %r13
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %r12
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: subq $160, %rsp
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups (%rdi), %ymm0
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups 32(%rdi), %ymm1
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl (%rsi), %ecx
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovaps %ymm2, (%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: leal (,%rcx,8), %eax
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: andl $56, %eax
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: andl $56, %ecx
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: negl %ecx
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movslq %ecx, %r9
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -24(%rsp,%r9), %rdi
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq 104(%rsp,%r9), %rdi
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rdi, %r10
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r10
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %esi
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: notb %sil
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -32(%rsp,%r9), %r11
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq 96(%rsp,%r9), %r11
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r11, %r8
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %r8
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
@@ -14366,7 +15002,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r10, %r8
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r11
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -40(%rsp,%r9), %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq 88(%rsp,%r9), %rbx
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rbx, %r10
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %r10
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
@@ -14374,7 +15010,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r11, %r10
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -48(%rsp,%r9), %r15
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq 80(%rsp,%r9), %r15
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r15, %r11
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %r11
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
@@ -14382,8 +15018,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %rbx, %r11
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r15
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -64(%rsp,%r9), %r14
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -56(%rsp,%r9), %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq 64(%rsp,%r9), %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq 72(%rsp,%r9), %r12
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r12, %rbx
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %rbx
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
@@ -14396,7 +15032,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r15
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r12, %r15
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -16(%rsp,%r9), %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq 112(%rsp,%r9), %r12
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r12, %r13
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r13
@@ -14404,7 +15040,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %rdi
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r13, %rdi
-; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -8(%rsp,%r9), %r9
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq 120(%rsp,%r9), %r9
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r9
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %r12
@@ -14421,47 +15057,53 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r11, 24(%rdx)
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r10, 32(%rdx)
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r8, 40(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: leaq -40(%rbp), %rsp
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %rbx
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %r12
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %r13
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %r14
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %r15
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %rbp
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vzeroupper
; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: retq
;
; X64-HAVE-SHLD-NO-BMI2-AVX1-LABEL: shl_64bytes:
; X64-HAVE-SHLD-NO-BMI2-AVX1: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rsp, %rbp
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushq %r15
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushq %r14
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushq %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: subq $160, %rsp
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups (%rdi), %ymm0
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups 32(%rdi), %ymm1
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl (%rsi), %eax
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovaps %ymm2, (%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: leal (,%rax,8), %ecx
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: andl $56, %ecx
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: andl $56, %eax
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: negl %eax
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movslq %eax, %r8
-; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -32(%rsp,%r8), %rax
-; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -24(%rsp,%r8), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq 96(%rsp,%r8), %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq 104(%rsp,%r8), %r9
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r9, %rsi
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldq %cl, %rax, %rsi
-; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -40(%rsp,%r8), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq 88(%rsp,%r8), %rdi
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldq %cl, %rdi, %rax
-; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -48(%rsp,%r8), %r10
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq 80(%rsp,%r8), %r10
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldq %cl, %r10, %rdi
-; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -64(%rsp,%r8), %r11
-; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -56(%rsp,%r8), %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq 64(%rsp,%r8), %r11
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq 72(%rsp,%r8), %rbx
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldq %cl, %rbx, %r10
-; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -16(%rsp,%r8), %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq 112(%rsp,%r8), %r14
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r14, %r15
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldq %cl, %r9, %r15
-; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -8(%rsp,%r8), %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq 120(%rsp,%r8), %r8
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldq %cl, %r14, %r8
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r11, %r9
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r9
@@ -14475,53 +15117,58 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rax, 32(%rdx)
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rsi, 40(%rdx)
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: leaq -24(%rbp), %rsp
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popq %rbx
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popq %r14
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popq %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popq %rbp
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vzeroupper
; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: retq
;
; X64-NO-SHLD-HAVE-BMI2-AVX1-LABEL: shl_64bytes:
; X64-NO-SHLD-HAVE-BMI2-AVX1: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rsp, %rbp
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r15
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r14
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r12
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: subq $160, %rsp
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups (%rdi), %ymm0
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups 32(%rdi), %ymm1
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl (%rsi), %esi
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovaps %ymm2, (%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (,%rsi,8), %eax
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: andl $56, %eax
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, %ecx
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: andl $56, %esi
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: negl %esi
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movslq %esi, %rsi
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -24(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq 104(%rsp,%rsi), %rdi
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %rdi, %r9
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: notb %al
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -32(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq 96(%rsp,%rsi), %r8
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %r8, %r10
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrq %r8
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rax, %r8, %r8
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r9, %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -40(%rsp,%rsi), %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq 88(%rsp,%rsi), %r9
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %r9, %r11
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrq %r9
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rax, %r9, %r9
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r10, %r9
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -48(%rsp,%rsi), %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq 80(%rsp,%rsi), %r10
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %r10, %r14
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrq %r10
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rax, %r10, %r10
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r11, %r10
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -64(%rsp,%rsi), %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -56(%rsp,%rsi), %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq 64(%rsp,%rsi), %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq 72(%rsp,%rsi), %r11
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %r11, %r15
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrq %r11
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rax, %r11, %r11
@@ -14530,12 +15177,12 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrq %rbx
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rax, %rbx, %rbx
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r15, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -16(%rsp,%rsi), %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq 112(%rsp,%rsi), %r15
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %r15, %r12
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrq %rdi
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rax, %rdi, %rdi
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r12, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, -8(%rsp,%rsi), %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, 120(%rsp,%rsi), %rcx
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrq %r15
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rax, %r15, %rax
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %rcx, %rax
@@ -14547,47 +15194,52 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r10, 24(%rdx)
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r9, 32(%rdx)
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r8, 40(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addq $8, %rsp
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leaq -32(%rbp), %rsp
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popq %rbx
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r12
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r14
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popq %rbp
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vzeroupper
; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: retq
;
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-LABEL: shl_64bytes:
; X64-HAVE-SHLD-HAVE-BMI2-AVX1: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rsp, %rbp
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r15
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r14
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: subq $160, %rsp
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups (%rdi), %ymm0
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups 32(%rdi), %ymm1
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl (%rsi), %eax
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovaps %ymm2, (%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: leal (,%rax,8), %ecx
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: andl $56, %ecx
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: andl $56, %eax
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: negl %eax
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movslq %eax, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -32(%rsp,%r8), %rax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -24(%rsp,%r8), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq 96(%rsp,%r8), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq 104(%rsp,%r8), %r9
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r9, %rsi
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldq %cl, %rax, %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -40(%rsp,%r8), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq 88(%rsp,%r8), %rdi
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldq %cl, %rdi, %rax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -48(%rsp,%r8), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq 80(%rsp,%r8), %r10
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldq %cl, %r10, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -64(%rsp,%r8), %r11
-; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -56(%rsp,%r8), %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq 64(%rsp,%r8), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq 72(%rsp,%r8), %rbx
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldq %cl, %rbx, %r10
-; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -16(%rsp,%r8), %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq 112(%rsp,%r8), %r14
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r14, %r15
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldq %cl, %r9, %r15
-; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -8(%rsp,%r8), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq 120(%rsp,%r8), %r8
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldq %cl, %r14, %r8
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %r11, %r9
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: # kill: def $cl killed $cl killed $rcx
@@ -14600,36 +15252,42 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rax, 32(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rsi, 40(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: leaq -24(%rbp), %rsp
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popq %rbx
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r14
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popq %rbp
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vzeroupper
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: retq
;
; X64-NO-SHLD-NO-BMI2-AVX512-LABEL: shl_64bytes:
; X64-NO-SHLD-NO-BMI2-AVX512: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %rbp
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %rsp, %rbp
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %r15
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %r14
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %r13
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %r12
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: subq $160, %rsp
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups (%rdi), %zmm0
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl (%rsi), %ecx
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm1, (%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: leal (,%rcx,8), %eax
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: andl $56, %eax
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: andl $56, %ecx
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: negl %ecx
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movslq %ecx, %r9
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -24(%rsp,%r9), %rdi
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq 104(%rsp,%r9), %rdi
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %rdi, %r10
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r10
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %esi
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: notb %sil
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -32(%rsp,%r9), %r11
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq 96(%rsp,%r9), %r11
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r11, %r8
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %r8
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
@@ -14637,7 +15295,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r10, %r8
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r11
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -40(%rsp,%r9), %rbx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq 88(%rsp,%r9), %rbx
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %rbx, %r10
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %r10
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
@@ -14645,7 +15303,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r11, %r10
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -48(%rsp,%r9), %r15
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq 80(%rsp,%r9), %r15
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r15, %r11
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %r11
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
@@ -14653,8 +15311,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %rbx, %r11
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r15
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -64(%rsp,%r9), %r14
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -56(%rsp,%r9), %r12
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq 64(%rsp,%r9), %r14
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq 72(%rsp,%r9), %r12
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r12, %rbx
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %rbx
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
@@ -14667,7 +15325,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r15
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r12, %r15
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -16(%rsp,%r9), %r12
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq 112(%rsp,%r9), %r12
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r12, %r13
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r13
@@ -14675,7 +15333,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %rdi
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r13, %rdi
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -8(%rsp,%r9), %r9
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq 120(%rsp,%r9), %r9
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r9
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %r12
@@ -14692,44 +15350,50 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r11, 24(%rdx)
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r10, 32(%rdx)
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r8, 40(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: leaq -40(%rbp), %rsp
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %rbx
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %r12
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %r13
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %r14
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %r15
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %rbp
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vzeroupper
; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: retq
;
; X64-HAVE-SHLD-NO-BMI2-AVX512-LABEL: shl_64bytes:
; X64-HAVE-SHLD-NO-BMI2-AVX512: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %rsp, %rbp
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushq %r15
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushq %r14
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushq %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: subq $160, %rsp
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups (%rdi), %zmm0
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl (%rsi), %eax
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm1, (%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: leal (,%rax,8), %ecx
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: andl $56, %ecx
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: andl $56, %eax
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: negl %eax
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movslq %eax, %r8
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -32(%rsp,%r8), %rax
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -24(%rsp,%r8), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq 96(%rsp,%r8), %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq 104(%rsp,%r8), %r9
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r9, %rsi
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldq %cl, %rax, %rsi
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -40(%rsp,%r8), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq 88(%rsp,%r8), %rdi
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldq %cl, %rdi, %rax
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -48(%rsp,%r8), %r10
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq 80(%rsp,%r8), %r10
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldq %cl, %r10, %rdi
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -64(%rsp,%r8), %r11
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -56(%rsp,%r8), %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq 64(%rsp,%r8), %r11
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq 72(%rsp,%r8), %rbx
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldq %cl, %rbx, %r10
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -16(%rsp,%r8), %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq 112(%rsp,%r8), %r14
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r14, %r15
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldq %cl, %r9, %r15
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -8(%rsp,%r8), %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq 120(%rsp,%r8), %r8
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldq %cl, %r14, %r8
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r11, %r9
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r9
@@ -14743,50 +15407,55 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %rax, 32(%rdx)
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %rsi, 40(%rdx)
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: leaq -24(%rbp), %rsp
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popq %rbx
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popq %r14
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popq %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popq %rbp
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vzeroupper
; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: retq
;
; X64-NO-SHLD-HAVE-BMI2-AVX512-LABEL: shl_64bytes:
; X64-NO-SHLD-HAVE-BMI2-AVX512: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rsp, %rbp
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r15
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r14
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r12
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: subq $160, %rsp
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups (%rdi), %zmm0
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%rsi), %esi
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm1, (%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (,%rsi,8), %eax
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %eax
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, %ecx
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %esi
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: negl %esi
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movslq %esi, %rsi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -24(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq 104(%rsp,%rsi), %rdi
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %rdi, %r9
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: notb %al
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -32(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq 96(%rsp,%rsi), %r8
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %r8, %r10
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrq %r8
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %r8, %r8
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r9, %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -40(%rsp,%rsi), %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq 88(%rsp,%rsi), %r9
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %r9, %r11
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrq %r9
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %r9, %r9
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r10, %r9
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -48(%rsp,%rsi), %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq 80(%rsp,%rsi), %r10
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %r10, %r14
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrq %r10
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %r10, %r10
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r11, %r10
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -64(%rsp,%rsi), %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -56(%rsp,%rsi), %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq 64(%rsp,%rsi), %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq 72(%rsp,%rsi), %r11
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %r11, %r15
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrq %r11
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %r11, %r11
@@ -14795,12 +15464,12 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrq %rbx
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %rbx, %rbx
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r15, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -16(%rsp,%rsi), %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq 112(%rsp,%rsi), %r15
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %r15, %r12
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrq %rdi
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %rdi, %rdi
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r12, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, -8(%rsp,%rsi), %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, 120(%rsp,%rsi), %rcx
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrq %r15
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %r15, %rax
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %rcx, %rax
@@ -14812,44 +15481,49 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r10, 24(%rdx)
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r9, 32(%rdx)
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r8, 40(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addq $8, %rsp
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leaq -32(%rbp), %rsp
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %rbx
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r12
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r14
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %rbp
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vzeroupper
; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: retq
;
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-LABEL: shl_64bytes:
; X64-HAVE-SHLD-HAVE-BMI2-AVX512: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rsp, %rbp
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r15
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r14
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: subq $160, %rsp
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups (%rdi), %zmm0
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%rsi), %eax
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm1, (%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: leal (,%rax,8), %ecx
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %ecx
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %eax
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: negl %eax
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movslq %eax, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -32(%rsp,%r8), %rax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -24(%rsp,%r8), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq 96(%rsp,%r8), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq 104(%rsp,%r8), %r9
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r9, %rsi
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldq %cl, %rax, %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -40(%rsp,%r8), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq 88(%rsp,%r8), %rdi
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldq %cl, %rdi, %rax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -48(%rsp,%r8), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq 80(%rsp,%r8), %r10
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldq %cl, %r10, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -64(%rsp,%r8), %r11
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -56(%rsp,%r8), %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq 64(%rsp,%r8), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq 72(%rsp,%r8), %rbx
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldq %cl, %rbx, %r10
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -16(%rsp,%r8), %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq 112(%rsp,%r8), %r14
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r14, %r15
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldq %cl, %r9, %r15
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -8(%rsp,%r8), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq 120(%rsp,%r8), %r8
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldq %cl, %r14, %r8
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %r11, %r9
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $rcx
@@ -14862,9 +15536,11 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rax, 32(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rsi, 40(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: leaq -24(%rbp), %rsp
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popq %rbx
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r14
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popq %rbp
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vzeroupper
; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: retq
;
@@ -17493,7 +18169,11 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
define void @shl_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind {
; X64-SSE2-LABEL: shl_64bytes_qwordOff:
; X64-SSE2: # %bb.0:
+; X64-SSE2-NEXT: pushq %rbp
+; X64-SSE2-NEXT: movq %rsp, %rbp
; X64-SSE2-NEXT: pushq %rbx
+; X64-SSE2-NEXT: andq $-32, %rsp
+; X64-SSE2-NEXT: subq $160, %rsp
; X64-SSE2-NEXT: movq (%rdi), %rax
; X64-SSE2-NEXT: movq 8(%rdi), %rcx
; X64-SSE2-NEXT: movq 16(%rdi), %r8
@@ -17504,30 +18184,30 @@ define void @shl_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nou
; X64-SSE2-NEXT: movq 56(%rdi), %rdi
; X64-SSE2-NEXT: movl (%rsi), %esi
; X64-SSE2-NEXT: xorps %xmm0, %xmm0
-; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movaps %xmm0, (%rsp)
+; X64-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movq %rbx, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movq %r11, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: shll $3, %esi
; X64-SSE2-NEXT: andl $56, %esi
; X64-SSE2-NEXT: negl %esi
; X64-SSE2-NEXT: movslq %esi, %rax
-; X64-SSE2-NEXT: movq -64(%rsp,%rax), %rcx
-; X64-SSE2-NEXT: movq -56(%rsp,%rax), %rsi
-; X64-SSE2-NEXT: movq -40(%rsp,%rax), %rdi
-; X64-SSE2-NEXT: movq -48(%rsp,%rax), %r8
-; X64-SSE2-NEXT: movq -24(%rsp,%rax), %r9
-; X64-SSE2-NEXT: movq -32(%rsp,%rax), %r10
-; X64-SSE2-NEXT: movq -8(%rsp,%rax), %r11
-; X64-SSE2-NEXT: movq -16(%rsp,%rax), %rax
+; X64-SSE2-NEXT: movq 64(%rsp,%rax), %rcx
+; X64-SSE2-NEXT: movq 72(%rsp,%rax), %rsi
+; X64-SSE2-NEXT: movq 88(%rsp,%rax), %rdi
+; X64-SSE2-NEXT: movq 80(%rsp,%rax), %r8
+; X64-SSE2-NEXT: movq 104(%rsp,%rax), %r9
+; X64-SSE2-NEXT: movq 96(%rsp,%rax), %r10
+; X64-SSE2-NEXT: movq 120(%rsp,%rax), %r11
+; X64-SSE2-NEXT: movq 112(%rsp,%rax), %rax
; X64-SSE2-NEXT: movq %rax, 48(%rdx)
; X64-SSE2-NEXT: movq %r11, 56(%rdx)
; X64-SSE2-NEXT: movq %r10, 32(%rdx)
@@ -17536,89 +18216,103 @@ define void @shl_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nou
; X64-SSE2-NEXT: movq %rdi, 24(%rdx)
; X64-SSE2-NEXT: movq %rcx, (%rdx)
; X64-SSE2-NEXT: movq %rsi, 8(%rdx)
+; X64-SSE2-NEXT: leaq -8(%rbp), %rsp
; X64-SSE2-NEXT: popq %rbx
+; X64-SSE2-NEXT: popq %rbp
; X64-SSE2-NEXT: retq
;
; X64-SSE42-LABEL: shl_64bytes_qwordOff:
; X64-SSE42: # %bb.0:
-; X64-SSE42-NEXT: pushq %rax
+; X64-SSE42-NEXT: pushq %rbp
+; X64-SSE42-NEXT: movq %rsp, %rbp
+; X64-SSE42-NEXT: andq $-32, %rsp
+; X64-SSE42-NEXT: subq $160, %rsp
; X64-SSE42-NEXT: movups (%rdi), %xmm0
; X64-SSE42-NEXT: movups 16(%rdi), %xmm1
; X64-SSE42-NEXT: movups 32(%rdi), %xmm2
; X64-SSE42-NEXT: movups 48(%rdi), %xmm3
; X64-SSE42-NEXT: movl (%rsi), %eax
; X64-SSE42-NEXT: xorps %xmm4, %xmm4
-; X64-SSE42-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm4, (%rsp)
+; X64-SSE42-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; X64-SSE42-NEXT: shll $3, %eax
; X64-SSE42-NEXT: andl $56, %eax
; X64-SSE42-NEXT: negl %eax
; X64-SSE42-NEXT: cltq
-; X64-SSE42-NEXT: movups -64(%rsp,%rax), %xmm0
-; X64-SSE42-NEXT: movups -48(%rsp,%rax), %xmm1
-; X64-SSE42-NEXT: movups -32(%rsp,%rax), %xmm2
-; X64-SSE42-NEXT: movups -16(%rsp,%rax), %xmm3
+; X64-SSE42-NEXT: movups 64(%rsp,%rax), %xmm0
+; X64-SSE42-NEXT: movups 80(%rsp,%rax), %xmm1
+; X64-SSE42-NEXT: movups 96(%rsp,%rax), %xmm2
+; X64-SSE42-NEXT: movups 112(%rsp,%rax), %xmm3
; X64-SSE42-NEXT: movups %xmm3, 48(%rdx)
; X64-SSE42-NEXT: movups %xmm1, 16(%rdx)
; X64-SSE42-NEXT: movups %xmm2, 32(%rdx)
; X64-SSE42-NEXT: movups %xmm0, (%rdx)
-; X64-SSE42-NEXT: popq %rax
+; X64-SSE42-NEXT: movq %rbp, %rsp
+; X64-SSE42-NEXT: popq %rbp
; X64-SSE42-NEXT: retq
;
; X64-AVX1-LABEL: shl_64bytes_qwordOff:
; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: pushq %rax
+; X64-AVX1-NEXT: pushq %rbp
+; X64-AVX1-NEXT: movq %rsp, %rbp
+; X64-AVX1-NEXT: andq $-32, %rsp
+; X64-AVX1-NEXT: subq $160, %rsp
; X64-AVX1-NEXT: vmovups (%rdi), %ymm0
; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm1
; X64-AVX1-NEXT: movl (%rsi), %eax
; X64-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; X64-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; X64-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; X64-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; X64-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT: vmovaps %ymm2, {{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT: vmovaps %ymm2, (%rsp)
+; X64-AVX1-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
; X64-AVX1-NEXT: shll $3, %eax
; X64-AVX1-NEXT: andl $56, %eax
; X64-AVX1-NEXT: negl %eax
; X64-AVX1-NEXT: cltq
-; X64-AVX1-NEXT: vmovups -64(%rsp,%rax), %xmm0
-; X64-AVX1-NEXT: vmovups -48(%rsp,%rax), %xmm1
-; X64-AVX1-NEXT: vmovups -32(%rsp,%rax), %xmm2
-; X64-AVX1-NEXT: vmovups -16(%rsp,%rax), %xmm3
+; X64-AVX1-NEXT: vmovups 64(%rsp,%rax), %xmm0
+; X64-AVX1-NEXT: vmovups 80(%rsp,%rax), %xmm1
+; X64-AVX1-NEXT: vmovups 96(%rsp,%rax), %xmm2
+; X64-AVX1-NEXT: vmovups 112(%rsp,%rax), %xmm3
; X64-AVX1-NEXT: vmovups %xmm3, 48(%rdx)
; X64-AVX1-NEXT: vmovups %xmm1, 16(%rdx)
; X64-AVX1-NEXT: vmovups %xmm2, 32(%rdx)
; X64-AVX1-NEXT: vmovups %xmm0, (%rdx)
-; X64-AVX1-NEXT: popq %rax
+; X64-AVX1-NEXT: movq %rbp, %rsp
+; X64-AVX1-NEXT: popq %rbp
; X64-AVX1-NEXT: vzeroupper
; X64-AVX1-NEXT: retq
;
; X64-AVX512-LABEL: shl_64bytes_qwordOff:
; X64-AVX512: # %bb.0:
-; X64-AVX512-NEXT: pushq %rax
+; X64-AVX512-NEXT: pushq %rbp
+; X64-AVX512-NEXT: movq %rsp, %rbp
+; X64-AVX512-NEXT: andq $-32, %rsp
+; X64-AVX512-NEXT: subq $160, %rsp
; X64-AVX512-NEXT: vmovups (%rdi), %zmm0
; X64-AVX512-NEXT: movl (%rsi), %eax
; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; X64-AVX512-NEXT: vmovups %zmm1, (%rsp)
+; X64-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
; X64-AVX512-NEXT: shll $3, %eax
; X64-AVX512-NEXT: andl $56, %eax
; X64-AVX512-NEXT: negl %eax
; X64-AVX512-NEXT: cltq
-; X64-AVX512-NEXT: vmovups -64(%rsp,%rax), %xmm0
-; X64-AVX512-NEXT: vmovups -48(%rsp,%rax), %xmm1
-; X64-AVX512-NEXT: vmovups -32(%rsp,%rax), %xmm2
-; X64-AVX512-NEXT: vmovups -16(%rsp,%rax), %xmm3
+; X64-AVX512-NEXT: vmovups 64(%rsp,%rax), %xmm0
+; X64-AVX512-NEXT: vmovups 80(%rsp,%rax), %xmm1
+; X64-AVX512-NEXT: vmovups 96(%rsp,%rax), %xmm2
+; X64-AVX512-NEXT: vmovups 112(%rsp,%rax), %xmm3
; X64-AVX512-NEXT: vmovups %xmm3, 48(%rdx)
; X64-AVX512-NEXT: vmovups %xmm1, 16(%rdx)
; X64-AVX512-NEXT: vmovups %xmm2, 32(%rdx)
; X64-AVX512-NEXT: vmovups %xmm0, (%rdx)
-; X64-AVX512-NEXT: popq %rax
+; X64-AVX512-NEXT: movq %rbp, %rsp
+; X64-AVX512-NEXT: popq %rbp
; X64-AVX512-NEXT: vzeroupper
; X64-AVX512-NEXT: retq
;
@@ -17859,11 +18553,15 @@ define void @shl_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nou
define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE2-LABEL: ashr_64bytes:
; X64-NO-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbp
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rsp, %rbp
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %r15
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %r14
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %r13
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %r12
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: subq $160, %rsp
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rax
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %rcx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r8
@@ -17873,34 +18571,34 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 48(%rdi), %rbx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 56(%rdi), %r14
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%rsi), %edi
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, (%rsp)
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: sarq $63, %r14
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, {{[0-9]+}}(%rsp)
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (,%rdi,8), %eax
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andl $56, %eax
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andl $56, %edi
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -128(%rsp,%rdi), %r10
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -120(%rsp,%rdi), %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq (%rsp,%rdi), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rsp,%rdi), %r8
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, %r11
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r11
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %sil
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -112(%rsp,%rdi), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rsp,%rdi), %rbx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%rbx,%rbx), %r9
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r9
@@ -17911,11 +18609,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r8
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r10, %r8
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -104(%rsp,%rdi), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rsp,%rdi), %r10
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, %r15
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r15
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -96(%rsp,%rdi), %r14
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 32(%rsp,%rdi), %r14
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%r14,%r14), %r11
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r11
@@ -17926,11 +18624,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r10
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %rbx, %r10
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -88(%rsp,%rdi), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 40(%rsp,%rdi), %rbx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, %r12
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r12
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -80(%rsp,%rdi), %r13
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 48(%rsp,%rdi), %r13
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%r13,%r13), %r15
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r15
@@ -17943,7 +18641,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r14, %rbx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r13
-; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -72(%rsp,%rdi), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 56(%rsp,%rdi), %rdi
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%rdi,%rdi), %r14
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r14
@@ -17958,18 +18656,24 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, 24(%rdx)
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, (%rdx)
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq -40(%rbp), %rsp
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %rbx
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %r12
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %r13
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %r14
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %r15
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %rbp
; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: retq
;
; X64-HAVE-SHLD-NO-BMI2-SSE2-LABEL: ashr_64bytes:
; X64-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rsp, %rbp
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushq %r15
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushq %r14
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: subq $160, %rsp
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rcx
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
@@ -17979,66 +18683,71 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 48(%rdi), %r14
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 56(%rdi), %rdi
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%rsi), %eax
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, (%rsp)
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: sarq $63, %rdi
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: leal (,%rax,8), %ecx
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andl $56, %ecx
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andl $56, %eax
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -112(%rsp,%rax), %rdi
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -128(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -120(%rsp,%rax), %r9
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq (%rsp,%rax), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rsp,%rax), %r11
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, %r8
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rdi, %r8
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -96(%rsp,%rax), %r10
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -104(%rsp,%rax), %r11
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, %rbx
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r10, %rbx
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r11, %rdi
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -80(%rsp,%rax), %r11
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -88(%rsp,%rax), %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 32(%rsp,%rax), %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rsp,%rax), %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r10, %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rbx, %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 48(%rsp,%rax), %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 40(%rsp,%rax), %r14
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, %r15
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r11, %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rbx, %r15
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r14, %r10
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -72(%rsp,%rax), %rax
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rax, %r11
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r9, %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 56(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rax, %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r11, %rsi
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: sarq %cl, %rax
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, 48(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, 48(%rdx)
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, 56(%rdx)
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, 32(%rdx)
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r15, 40(%rdx)
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, 16(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 24(%rdx)
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rsi, (%rdx)
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: leaq -24(%rbp), %rsp
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popq %rbx
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popq %r14
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popq %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popq %rbp
; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retq
;
; X64-NO-SHLD-HAVE-BMI2-SSE2-LABEL: ashr_64bytes:
; X64-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsp, %rbp
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r15
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r14
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r12
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rbx
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: subq $160, %rsp
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rcx
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
@@ -18048,41 +18757,41 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 48(%rdi), %r14
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 56(%rdi), %rdi
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%rsi), %eax
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r14, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, (%rsp)
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: sarq $63, %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%rax,8), %ecx
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andl $56, %ecx
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, %esi
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andl $56, %eax
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -120(%rsp,%rax), %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -112(%rsp,%rax), %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rsp,%rax), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rsp,%rax), %r10
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, %r8, %r9
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %cl
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%r10,%r10), %rdi
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %rdi, %rdi
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, -128(%rsp,%rax), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, (%rsp,%rax), %r9
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq %r8, %r8
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r8, %r8
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -104(%rsp,%rax), %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rsp,%rax), %r11
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, %r11, %rbx
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -96(%rsp,%rax), %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 32(%rsp,%rax), %r14
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%r14,%r14), %r9
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r9, %r9
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %rbx, %r9
@@ -18090,9 +18799,9 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq %r11, %r11
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r11, %r11
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r10, %r11
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -88(%rsp,%rax), %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 40(%rsp,%rax), %r10
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, %r10, %rbx
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -80(%rsp,%rax), %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 48(%rsp,%rax), %r15
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%r15,%r15), %r12
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r12, %r12
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %rbx, %r12
@@ -18101,7 +18810,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r10, %r10
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %rbx, %r10
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, %r15, %rbx
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -72(%rsp,%rax), %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 56(%rsp,%rax), %rax
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%rax,%rax), %r14
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r14, %rcx
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %rbx, %rcx
@@ -18114,18 +18823,23 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, 24(%rdx)
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, (%rdx)
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, 8(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq $8, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq -32(%rbp), %rsp
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %rbx
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r12
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r14
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %rbp
; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retq
;
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: ashr_64bytes:
; X64-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsp, %rbp
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r15
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r14
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: subq $160, %rsp
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rcx
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
@@ -18135,42 +18849,42 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 48(%rdi), %r14
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 56(%rdi), %rdi
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%rsi), %eax
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r14, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbx, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, (%rsp)
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: sarq $63, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%rax,8), %ecx
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andl $56, %ecx
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andl $56, %eax
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -112(%rsp,%rax), %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -128(%rsp,%rax), %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -120(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rsp,%rax), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rsp,%rax), %r9
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, %r8
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %rdi, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -96(%rsp,%rax), %r10
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -104(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 32(%rsp,%rax), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rsp,%rax), %r11
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, %rbx
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r10, %rbx
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r11, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -80(%rsp,%rax), %r11
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -88(%rsp,%rax), %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 48(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 40(%rsp,%rax), %r14
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r14, %r15
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r11, %r15
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r14, %r10
-; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -72(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 56(%rsp,%rax), %rax
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %rax, %r11
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: sarxq %rcx, %rax, %rax
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $rcx
@@ -18183,63 +18897,68 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsi, (%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, 8(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 56(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: leaq -24(%rbp), %rsp
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popq %rbx
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r14
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popq %rbp
; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retq
;
; X64-NO-SHLD-NO-BMI2-SSE4-LABEL: ashr_64bytes:
; X64-NO-SHLD-NO-BMI2-SSE4: # %bb.0:
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rsp, %rbp
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %r15
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %r14
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %r13
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %r12
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbx
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rax
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: subq $192, %rsp
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 32(%rdi), %xmm2
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 48(%rdi), %rax
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 56(%rdi), %rcx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl (%rsi), %edi
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: sarq $63, %rcx
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (,%rdi,8), %eax
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andl $56, %eax
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andl $56, %edi
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -128(%rsp,%rdi), %r10
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -120(%rsp,%rdi), %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 32(%rsp,%rdi), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 40(%rsp,%rdi), %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r10
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %esi
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %sil
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r9,%r9), %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: addq %r8, %r8
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r8
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r10, %r8
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -104(%rsp,%rdi), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 56(%rsp,%rdi), %r10
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, %rbx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %rbx
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -96(%rsp,%rdi), %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 64(%rsp,%rdi), %r12
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r12,%r12), %r11
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r11
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %rbx, %r11
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -112(%rsp,%rdi), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 48(%rsp,%rdi), %rbx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rbx, %r14
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r14
@@ -18247,12 +18966,12 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r10
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r14, %r10
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -88(%rsp,%rdi), %r14
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 72(%rsp,%rdi), %r14
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r14, %r13
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r13
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -80(%rsp,%rdi), %rbp
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%rbp,%rbp), %r15
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 80(%rsp,%rdi), %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r9,%r9), %r15
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r15
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r13, %r15
@@ -18263,13 +18982,14 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r14
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r12, %r14
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %rbp
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -72(%rsp,%rdi), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 88(%rsp,%rdi), %rdi
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%rdi,%rdi), %r12
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r12
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %rbp, %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r9, %r12
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r9
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: addq %rbx, %rbx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
@@ -18285,7 +19005,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 16(%rdx)
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r11, 24(%rdx)
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, (%rdx)
-; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: addq $8, %rsp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq -40(%rbp), %rsp
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %rbx
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %r12
; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %r13
@@ -18296,48 +19016,52 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
;
; X64-HAVE-SHLD-NO-BMI2-SSE4-LABEL: ashr_64bytes:
; X64-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rsp, %rbp
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushq %r15
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushq %r14
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: subq $160, %rsp
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 32(%rdi), %xmm2
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 48(%rdi), %rcx
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 56(%rdi), %rdi
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl (%rsi), %eax
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, (%rsp)
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: sarq $63, %rdi
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: leal (,%rax,8), %ecx
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andl $56, %ecx
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andl $56, %eax
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -96(%rsp,%rax), %rdi
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -104(%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 32(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 24(%rsp,%rax), %r9
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, %rsi
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rdi, %rsi
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -112(%rsp,%rax), %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 16(%rsp,%rax), %r10
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, %r8
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %r9, %r8
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -80(%rsp,%rax), %r9
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -88(%rsp,%rax), %r11
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 48(%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 40(%rsp,%rax), %r11
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r11, %rbx
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %r9, %rbx
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %r11, %rdi
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -72(%rsp,%rax), %r11
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 56(%rsp,%rax), %r11
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %r11, %r9
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -128(%rsp,%rax), %r14
-; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -120(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq (%rsp,%rax), %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 8(%rsp,%rax), %rax
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, %r15
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %r10, %r15
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rax, %r14
@@ -18351,138 +19075,150 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, 16(%rdx)
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rsi, 24(%rdx)
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r14, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: leaq -24(%rbp), %rsp
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popq %rbx
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popq %r14
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popq %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popq %rbp
; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retq
;
; X64-NO-SHLD-HAVE-BMI2-SSE4-LABEL: ashr_64bytes:
; X64-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rsp, %rbp
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r15
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r14
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r13
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r12
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: subq $160, %rsp
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 32(%rdi), %xmm2
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 48(%rdi), %rcx
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 56(%rdi), %rdi
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%rsi), %eax
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, (%rsp)
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: sarq $63, %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rax,8), %ecx
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %ecx
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rax,8), %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, %ecx
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %eax
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, -128(%rsp,%rax), %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %cl
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -120(%rsp,%rax), %r10
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -112(%rsp,%rax), %r9
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%r10,%r10), %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %rdi, %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r8, %rdi
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -104(%rsp,%rax), %r11
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r11, %rbx
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -96(%rsp,%rax), %r14
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%r14,%r14), %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r8, %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rbx, %r8
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r9, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, (%rsp,%rax), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %sil
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 8(%rsp,%rax), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 16(%rsp,%rax), %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%r8,%r8), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rsi, %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r9, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 24(%rsp,%rax), %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, %r11, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 32(%rsp,%rax), %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%r14,%r14), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rsi, %r9, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rbx, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, %r10, %rbx
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addq %r11, %r11
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rsi, %r11, %r11
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rbx, %r11
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -88(%rsp,%rax), %rbx
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %rbx, %r15
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -80(%rsp,%rax), %r12
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%r12,%r12), %r13
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r13, %r13
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r15, %r13
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r14, %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 40(%rsp,%rax), %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 48(%rsp,%rax), %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%r15,%r15), %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rsi, %r12, %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, %rbx, %r13
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r13, %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, %r14, %r14
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addq %rbx, %rbx
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %rbx, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rsi, %rbx, %rbx
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r14, %rbx
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r12, %r14
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -72(%rsp,%rax), %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, %r15, %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 56(%rsp,%rax), %rax
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%rax,%rax), %r15
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r15, %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rsi, %r15, %r15
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r14, %r15
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r10, %r10
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addq %r9, %r9
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r9, %rcx
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r10, %rcx
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: sarxq %rsi, %rax, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addq %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rsi, %r10, %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r8, %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: sarxq %rcx, %rax, %rax
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 56(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rcx, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rsi, 8(%rdx)
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r15, 48(%rdx)
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rbx, 32(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r13, 40(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r12, 40(%rdx)
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r11, 16(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r8, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, 24(%rdx)
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq -40(%rbp), %rsp
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %rbx
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r12
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r13
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r14
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %rbp
; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retq
;
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: ashr_64bytes:
; X64-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rsp, %rbp
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r15
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r14
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: subq $160, %rsp
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 32(%rdi), %xmm2
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 48(%rdi), %rcx
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 56(%rdi), %rdi
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%rsi), %eax
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, (%rsp)
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: sarq $63, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rax,8), %ecx
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %ecx
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %eax
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -96(%rsp,%rax), %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -104(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 32(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 24(%rsp,%rax), %r9
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, %rsi
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rdi, %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -112(%rsp,%rax), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 16(%rsp,%rax), %r10
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, %r8
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %r9, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -80(%rsp,%rax), %r9
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -88(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 48(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 40(%rsp,%rax), %r11
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r11, %rbx
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %r9, %rbx
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %r11, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -72(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 56(%rsp,%rax), %r11
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %r11, %r9
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -128(%rsp,%rax), %r14
-; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -120(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq (%rsp,%rax), %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 8(%rsp,%rax), %rax
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, %r15
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %r10, %r15
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: sarxq %rcx, %r11, %r10
@@ -18496,61 +19232,66 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rsi, 24(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r14, (%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, 56(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: leaq -24(%rbp), %rsp
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popq %rbx
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r14
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popq %rbp
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retq
;
; X64-NO-SHLD-NO-BMI2-AVX-LABEL: ashr_64bytes:
; X64-NO-SHLD-NO-BMI2-AVX: # %bb.0:
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %rbp
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rsp, %rbp
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %r15
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %r14
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %r13
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %r12
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %rbx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %rax
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: subq $192, %rsp
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups 32(%rdi), %xmm1
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 48(%rdi), %rax
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 56(%rdi), %rcx
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl (%rsi), %edi
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: sarq $63, %rcx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leal (,%rdi,8), %eax
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: andl $56, %eax
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: andl $56, %edi
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -128(%rsp,%rdi), %r10
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -120(%rsp,%rdi), %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 32(%rsp,%rdi), %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 40(%rsp,%rdi), %r8
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r10
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %esi
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: notb %sil
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%r9,%r9), %r8
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: addq %r8, %r8
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r8
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r10, %r8
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -104(%rsp,%rdi), %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 56(%rsp,%rdi), %r10
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, %rbx
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -96(%rsp,%rdi), %r12
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 64(%rsp,%rdi), %r12
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%r12,%r12), %r11
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r11
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %rbx, %r11
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -112(%rsp,%rdi), %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 48(%rsp,%rdi), %rbx
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rbx, %r14
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r14
@@ -18558,12 +19299,12 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r10
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r14, %r10
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -88(%rsp,%rdi), %r14
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 72(%rsp,%rdi), %r14
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r14, %r13
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r13
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -80(%rsp,%rdi), %rbp
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%rbp,%rbp), %r15
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 80(%rsp,%rdi), %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%r9,%r9), %r15
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r15
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r13, %r15
@@ -18574,13 +19315,14 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r14
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r12, %r14
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %rbp
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -72(%rsp,%rdi), %rdi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 88(%rsp,%rdi), %rdi
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%rdi,%rdi), %r12
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r12
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %rbp, %r12
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r9, %r12
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r9
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: addq %rbx, %rbx
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
@@ -18596,7 +19338,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, 16(%rdx)
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r11, 24(%rdx)
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r8, (%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: addq $8, %rsp
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq -40(%rbp), %rsp
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %rbx
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %r12
; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %r13
@@ -18608,46 +19350,50 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
;
; X64-HAVE-SHLD-NO-BMI2-AVX-LABEL: ashr_64bytes:
; X64-HAVE-SHLD-NO-BMI2-AVX: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushq %rbp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rsp, %rbp
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushq %r15
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushq %r14
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushq %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: subq $160, %rsp
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups 32(%rdi), %xmm1
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 48(%rdi), %rcx
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 56(%rdi), %rdi
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl (%rsi), %eax
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovaps %ymm0, (%rsp)
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: sarq $63, %rdi
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: leal (,%rax,8), %ecx
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: andl $56, %ecx
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: andl $56, %eax
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -96(%rsp,%rax), %rdi
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -104(%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 32(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 24(%rsp,%rax), %r9
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r9, %rsi
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rdi, %rsi
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -112(%rsp,%rax), %r10
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 16(%rsp,%rax), %r10
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r10, %r8
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %r9, %r8
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -80(%rsp,%rax), %r9
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -88(%rsp,%rax), %r11
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 48(%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 40(%rsp,%rax), %r11
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r11, %rbx
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %r9, %rbx
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %r11, %rdi
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -72(%rsp,%rax), %r11
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 56(%rsp,%rax), %r11
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %r11, %r9
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -128(%rsp,%rax), %r14
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -120(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq (%rsp,%rax), %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 8(%rsp,%rax), %rax
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rax, %r15
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %r10, %r15
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rax, %r14
@@ -18661,136 +19407,148 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r8, 16(%rdx)
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rsi, 24(%rdx)
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r14, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: leaq -24(%rbp), %rsp
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: popq %rbx
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: popq %r14
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: popq %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: popq %rbp
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vzeroupper
; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: retq
;
; X64-NO-SHLD-HAVE-BMI2-AVX-LABEL: ashr_64bytes:
; X64-NO-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushq %rbp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rsp, %rbp
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushq %r15
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushq %r14
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushq %r13
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushq %r12
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushq %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: andq $-32, %rsp
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: subq $160, %rsp
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups 32(%rdi), %xmm1
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 48(%rdi), %rcx
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 56(%rdi), %rdi
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl (%rsi), %eax
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %ymm0, (%rsp)
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: sarq $63, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (,%rax,8), %ecx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: andl $56, %ecx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, %esi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (,%rax,8), %esi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: andl $56, %esi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, %ecx
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: andl $56, %eax
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, -128(%rsp,%rax), %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %cl
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -120(%rsp,%rax), %r10
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -112(%rsp,%rax), %r9
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%r10,%r10), %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %rdi, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %r8, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -104(%rsp,%rax), %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %r11, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -96(%rsp,%rax), %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%r14,%r14), %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r8, %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rbx, %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %r9, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rcx, (%rsp,%rax), %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %sil
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 8(%rsp,%rax), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 16(%rsp,%rax), %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%r8,%r8), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rsi, %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %r9, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 24(%rsp,%rax), %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rcx, %r11, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 32(%rsp,%rax), %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%r14,%r14), %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rsi, %r9, %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rbx, %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rcx, %r10, %rbx
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: addq %r11, %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rsi, %r11, %r11
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rbx, %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -88(%rsp,%rax), %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %rbx, %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -80(%rsp,%rax), %r12
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%r12,%r12), %r13
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r13, %r13
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %r15, %r13
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %r14, %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 40(%rsp,%rax), %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 48(%rsp,%rax), %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%r15,%r15), %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rsi, %r12, %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rcx, %rbx, %r13
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %r13, %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rcx, %r14, %r14
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: addq %rbx, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %rbx, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rsi, %rbx, %rbx
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %r14, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %r12, %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -72(%rsp,%rax), %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rcx, %r15, %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 56(%rsp,%rax), %rax
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%rax,%rax), %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r15, %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rsi, %r15, %r15
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %r14, %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %r10, %r10
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: addq %r9, %r9
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r9, %rcx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %r10, %rcx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: sarxq %rsi, %rax, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: addq %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rsi, %r10, %rsi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rcx, %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %r8, %rsi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: sarxq %rcx, %rax, %rax
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 56(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rsi, 8(%rdx)
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r15, 48(%rdx)
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rbx, 32(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r13, 40(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r12, 40(%rdx)
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r11, 16(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r8, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r9, 24(%rdx)
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq -40(%rbp), %rsp
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: popq %rbx
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: popq %r12
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: popq %r13
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: popq %r14
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: popq %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: popq %rbp
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper
; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: retq
;
; X64-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: ashr_64bytes:
; X64-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushq %rbp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rsp, %rbp
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushq %r15
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushq %r14
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andq $-32, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: subq $160, %rsp
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups 32(%rdi), %xmm1
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 48(%rdi), %rcx
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 56(%rdi), %rdi
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl (%rsi), %eax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %ymm0, (%rsp)
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: sarq $63, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: leal (,%rax,8), %ecx
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andl $56, %ecx
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andl $56, %eax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -96(%rsp,%rax), %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -104(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 32(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 24(%rsp,%rax), %r9
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r9, %rsi
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rdi, %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -112(%rsp,%rax), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 16(%rsp,%rax), %r10
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r10, %r8
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %r9, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -80(%rsp,%rax), %r9
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -88(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 48(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 40(%rsp,%rax), %r11
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r11, %rbx
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %r9, %rbx
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %r11, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -72(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 56(%rsp,%rax), %r11
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %r11, %r9
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -128(%rsp,%rax), %r14
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -120(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq (%rsp,%rax), %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 8(%rsp,%rax), %rax
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, %r15
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %r10, %r15
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: sarxq %rcx, %r11, %r10
@@ -18804,9 +19562,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rsi, 24(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r14, (%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r10, 56(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: leaq -24(%rbp), %rsp
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popq %rbx
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popq %r14
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popq %rbp
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper
; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retq
;
@@ -20977,119 +21737,135 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
define void @ashr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind {
; X64-SSE2-LABEL: ashr_64bytes_qwordOff:
; X64-SSE2: # %bb.0:
+; X64-SSE2-NEXT: pushq %rbp
+; X64-SSE2-NEXT: movq %rsp, %rbp
+; X64-SSE2-NEXT: pushq %r14
; X64-SSE2-NEXT: pushq %rbx
-; X64-SSE2-NEXT: movq (%rdi), %rax
-; X64-SSE2-NEXT: movq 8(%rdi), %rcx
-; X64-SSE2-NEXT: movq 16(%rdi), %r8
-; X64-SSE2-NEXT: movq 24(%rdi), %r9
-; X64-SSE2-NEXT: movq 32(%rdi), %r10
-; X64-SSE2-NEXT: movq 40(%rdi), %r11
-; X64-SSE2-NEXT: movq 48(%rdi), %rbx
+; X64-SSE2-NEXT: andq $-32, %rsp
+; X64-SSE2-NEXT: subq $128, %rsp
+; X64-SSE2-NEXT: movq (%rdi), %rcx
+; X64-SSE2-NEXT: movq 8(%rdi), %r8
+; X64-SSE2-NEXT: movq 16(%rdi), %r9
+; X64-SSE2-NEXT: movq 24(%rdi), %r10
+; X64-SSE2-NEXT: movq 32(%rdi), %r11
+; X64-SSE2-NEXT: movq 40(%rdi), %rbx
+; X64-SSE2-NEXT: movq 48(%rdi), %r14
; X64-SSE2-NEXT: movq 56(%rdi), %rdi
-; X64-SSE2-NEXT: movl (%rsi), %esi
-; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movl (%rsi), %eax
+; X64-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movq %r14, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movq %rbx, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movq %r11, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movq %rcx, (%rsp)
; X64-SSE2-NEXT: sarq $63, %rdi
-; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: andl $7, %esi
-; X64-SSE2-NEXT: movq -128(%rsp,%rsi,8), %rax
-; X64-SSE2-NEXT: movq -120(%rsp,%rsi,8), %rcx
-; X64-SSE2-NEXT: movq -104(%rsp,%rsi,8), %rdi
-; X64-SSE2-NEXT: movq -112(%rsp,%rsi,8), %r8
-; X64-SSE2-NEXT: movq -88(%rsp,%rsi,8), %r9
-; X64-SSE2-NEXT: movq -96(%rsp,%rsi,8), %r10
-; X64-SSE2-NEXT: movq -72(%rsp,%rsi,8), %r11
-; X64-SSE2-NEXT: movq -80(%rsp,%rsi,8), %rsi
-; X64-SSE2-NEXT: movq %rsi, 48(%rdx)
+; X64-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: andl $7, %eax
+; X64-SSE2-NEXT: movq (%rsp,%rax,8), %rcx
+; X64-SSE2-NEXT: movq 8(%rsp,%rax,8), %rsi
+; X64-SSE2-NEXT: movq 24(%rsp,%rax,8), %rdi
+; X64-SSE2-NEXT: movq 16(%rsp,%rax,8), %r8
+; X64-SSE2-NEXT: movq 40(%rsp,%rax,8), %r9
+; X64-SSE2-NEXT: movq 32(%rsp,%rax,8), %r10
+; X64-SSE2-NEXT: movq 56(%rsp,%rax,8), %r11
+; X64-SSE2-NEXT: movq 48(%rsp,%rax,8), %rax
+; X64-SSE2-NEXT: movq %rax, 48(%rdx)
; X64-SSE2-NEXT: movq %r11, 56(%rdx)
; X64-SSE2-NEXT: movq %r10, 32(%rdx)
; X64-SSE2-NEXT: movq %r9, 40(%rdx)
; X64-SSE2-NEXT: movq %r8, 16(%rdx)
; X64-SSE2-NEXT: movq %rdi, 24(%rdx)
-; X64-SSE2-NEXT: movq %rax, (%rdx)
-; X64-SSE2-NEXT: movq %rcx, 8(%rdx)
+; X64-SSE2-NEXT: movq %rcx, (%rdx)
+; X64-SSE2-NEXT: movq %rsi, 8(%rdx)
+; X64-SSE2-NEXT: leaq -16(%rbp), %rsp
; X64-SSE2-NEXT: popq %rbx
+; X64-SSE2-NEXT: popq %r14
+; X64-SSE2-NEXT: popq %rbp
; X64-SSE2-NEXT: retq
;
; X64-SSE42-LABEL: ashr_64bytes_qwordOff:
; X64-SSE42: # %bb.0:
-; X64-SSE42-NEXT: pushq %rax
+; X64-SSE42-NEXT: pushq %rbp
+; X64-SSE42-NEXT: movq %rsp, %rbp
+; X64-SSE42-NEXT: andq $-32, %rsp
+; X64-SSE42-NEXT: subq $160, %rsp
; X64-SSE42-NEXT: movups (%rdi), %xmm0
; X64-SSE42-NEXT: movups 16(%rdi), %xmm1
; X64-SSE42-NEXT: movups 32(%rdi), %xmm2
; X64-SSE42-NEXT: movq 48(%rdi), %rax
; X64-SSE42-NEXT: movq 56(%rdi), %rcx
; X64-SSE42-NEXT: movl (%rsi), %esi
-; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movaps %xmm0, (%rsp)
; X64-SSE42-NEXT: sarq $63, %rcx
-; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-SSE42-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
; X64-SSE42-NEXT: andl $7, %esi
-; X64-SSE42-NEXT: movups -128(%rsp,%rsi,8), %xmm0
-; X64-SSE42-NEXT: movups -112(%rsp,%rsi,8), %xmm1
-; X64-SSE42-NEXT: movups -96(%rsp,%rsi,8), %xmm2
-; X64-SSE42-NEXT: movups -80(%rsp,%rsi,8), %xmm3
+; X64-SSE42-NEXT: movups (%rsp,%rsi,8), %xmm0
+; X64-SSE42-NEXT: movups 16(%rsp,%rsi,8), %xmm1
+; X64-SSE42-NEXT: movups 32(%rsp,%rsi,8), %xmm2
+; X64-SSE42-NEXT: movups 48(%rsp,%rsi,8), %xmm3
; X64-SSE42-NEXT: movups %xmm3, 48(%rdx)
; X64-SSE42-NEXT: movups %xmm1, 16(%rdx)
; X64-SSE42-NEXT: movups %xmm2, 32(%rdx)
; X64-SSE42-NEXT: movups %xmm0, (%rdx)
-; X64-SSE42-NEXT: popq %rax
+; X64-SSE42-NEXT: movq %rbp, %rsp
+; X64-SSE42-NEXT: popq %rbp
; X64-SSE42-NEXT: retq
;
; X64-AVX-LABEL: ashr_64bytes_qwordOff:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: pushq %rax
+; X64-AVX-NEXT: pushq %rbp
+; X64-AVX-NEXT: movq %rsp, %rbp
+; X64-AVX-NEXT: andq $-32, %rsp
+; X64-AVX-NEXT: subq $160, %rsp
; X64-AVX-NEXT: vmovups (%rdi), %ymm0
; X64-AVX-NEXT: vmovups 32(%rdi), %xmm1
; X64-AVX-NEXT: movq 48(%rdi), %rax
; X64-AVX-NEXT: movq 56(%rdi), %rcx
; X64-AVX-NEXT: movl (%rsi), %esi
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: vmovaps %ymm0, (%rsp)
; X64-AVX-NEXT: sarq $63, %rcx
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
; X64-AVX-NEXT: andl $7, %esi
-; X64-AVX-NEXT: vmovups -128(%rsp,%rsi,8), %xmm0
-; X64-AVX-NEXT: vmovups -112(%rsp,%rsi,8), %xmm1
-; X64-AVX-NEXT: vmovups -96(%rsp,%rsi,8), %xmm2
-; X64-AVX-NEXT: vmovups -80(%rsp,%rsi,8), %xmm3
+; X64-AVX-NEXT: vmovups (%rsp,%rsi,8), %xmm0
+; X64-AVX-NEXT: vmovups 16(%rsp,%rsi,8), %xmm1
+; X64-AVX-NEXT: vmovups 32(%rsp,%rsi,8), %xmm2
+; X64-AVX-NEXT: vmovups 48(%rsp,%rsi,8), %xmm3
; X64-AVX-NEXT: vmovups %xmm3, 48(%rdx)
; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx)
; X64-AVX-NEXT: vmovups %xmm2, 32(%rdx)
; X64-AVX-NEXT: vmovups %xmm0, (%rdx)
-; X64-AVX-NEXT: popq %rax
+; X64-AVX-NEXT: movq %rbp, %rsp
+; X64-AVX-NEXT: popq %rbp
; X64-AVX-NEXT: vzeroupper
; X64-AVX-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
index 221a51ed44696..0cbf1b4bbb586 100644
--- a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
@@ -1389,31 +1389,35 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: lshr_32bytes:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsp, %rbp
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: andq $-32, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT: subq $96, %rsp
; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx
; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8
; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %eax
; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, (%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrb $6, %cl
; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%r8,8), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%r8,8), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rsp,%r8,8), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rsp,%r8,8), %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
; X64-NO-BMI2-NO-SHLD-NEXT: andb $63, %sil
; X64-NO-BMI2-NO-SHLD-NEXT: xorb $63, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%r8,8), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rsp,%r8,8), %rbx
; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9
@@ -1426,7 +1430,7 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%r8,8), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 24(%rsp,%r8,8), %r8
; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10
@@ -1437,32 +1441,38 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 16(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq -8(%rbp), %rsp
; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbp
; X64-NO-BMI2-NO-SHLD-NEXT: retq
;
; X64-NO-BMI2-HAVE-SHLD-LABEL: lshr_32bytes:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsp, %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT: andq $-32, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT: subq $96, %rsp
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rdi
; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, (%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrb $6, %al
; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl %al, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax,8), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax,8), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax,8), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rsp,%rax,8), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rsp,%rax,8), %r8
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, %r9
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax,8), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rsp,%rax,8), %rax
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %rsi
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rax
@@ -1470,40 +1480,46 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, 24(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbp, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbp
; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: lshr_32bytes:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsp, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andq $-32, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: subq $96, %rsp
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %sil
; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %sil, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi,8), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi,8), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rsp,%rsi,8), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rsp,%rsi,8), %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $63, %al
; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al
; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r10
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r10, %r10
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -72(%rsp,%rsi,8), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, (%rsp,%rsi,8), %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi,8), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 24(%rsp,%rsi,8), %rsi
; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rsi,%rsi), %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r9, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rax
@@ -1512,31 +1528,37 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbp, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbp
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_32bytes:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsp, %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andq $-32, %rsp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: subq $96, %rsp
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rdi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %ecx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, (%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $6, %al
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax,8), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax,8), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax,8), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rsp,%rax,8), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rsp,%rax,8), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rsp,%rax,8), %r8
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, %r9
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 24(%rsp,%rax,8), %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %rsi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rax, %rax
@@ -1544,6 +1566,8 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 24(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbp, %rsp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbp
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: lshr_32bytes:
@@ -1939,26 +1963,30 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: shl_32bytes:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsp, %rbp
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: andq $-32, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT: subq $96, %rsp
; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx
; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8
; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %eax
; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrb $3, %cl
; X64-NO-BMI2-NO-SHLD-NEXT: andb $24, %cl
; X64-NO-BMI2-NO-SHLD-NEXT: negb %cl
; X64-NO-BMI2-NO-SHLD-NEXT: movsbq %cl, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%r10), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%r10), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 32(%rsp,%r10), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 40(%rsp,%r10), %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r11
@@ -1970,10 +1998,10 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9
; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -8(%rsp,%r10), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 56(%rsp,%r10), %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%r10), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 48(%rsp,%r10), %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, %rbx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %rbx
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
@@ -1991,33 +2019,39 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, 16(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq -8(%rbp), %rsp
; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbp
; X64-NO-BMI2-NO-SHLD-NEXT: retq
;
; X64-NO-BMI2-HAVE-SHLD-LABEL: shl_32bytes:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsp, %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT: andq $-32, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT: subq $96, %rsp
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rdi
; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, (%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrb $3, %al
; X64-NO-BMI2-HAVE-SHLD-NEXT: andb $24, %al
; X64-NO-BMI2-HAVE-SHLD-NEXT: negb %al
; X64-NO-BMI2-HAVE-SHLD-NEXT: movsbq %al, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%rax), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%rax), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 48(%rsp,%rax), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 56(%rsp,%rax), %rdi
; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rsi, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rax), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%rax), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 32(%rsp,%rax), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 40(%rsp,%rax), %rax
; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rax, %rsi
; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r8, %rax
; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %r8
@@ -2025,30 +2059,36 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, 24(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, (%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbp, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbp
; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: shl_32bytes:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsp, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andq $-32, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: subq $96, %rsp
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %sil
; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %sil
; X64-HAVE-BMI2-NO-SHLD-NEXT: negb %sil
; X64-HAVE-BMI2-NO-SHLD-NEXT: movsbq %sil, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rdi), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rdi), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 32(%rsp,%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 40(%rsp,%rdi), %rsi
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rsi, %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $63, %al
; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al
@@ -2056,8 +2096,8 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r8, %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, -16(%rsp,%rdi), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rdi), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, 56(%rsp,%rdi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 48(%rsp,%rdi), %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rdi, %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %rdi
@@ -2069,32 +2109,38 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 24(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbp, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbp
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: shl_32bytes:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsp, %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andq $-32, %rsp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: subq $96, %rsp
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rdi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %ecx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, (%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $3, %al
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andb $24, %al
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: negb %al
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movsbq %al, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%rax), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%rax), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 48(%rsp,%rax), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 56(%rsp,%rax), %rdi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rsi, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%rax), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%rax), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 32(%rsp,%rax), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 40(%rsp,%rax), %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rax, %rsi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r8, %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rcx, %r8, %rcx
@@ -2102,6 +2148,8 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 24(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, (%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbp, %rsp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbp
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: shl_32bytes:
@@ -2512,33 +2560,37 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: ashr_32bytes:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsp, %rbp
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: andq $-32, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT: subq $96, %rsp
; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx
; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8
; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %eax
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, (%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: sarq $63, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrb $6, %cl
; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%r8,8), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%r8,8), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rsp,%r8,8), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rsp,%r8,8), %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
; X64-NO-BMI2-NO-SHLD-NEXT: andb $63, %sil
; X64-NO-BMI2-NO-SHLD-NEXT: xorb $63, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%r8,8), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rsp,%r8,8), %rbx
; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9
@@ -2551,7 +2603,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%r8,8), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 24(%rsp,%r8,8), %r8
; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10
@@ -2562,34 +2614,40 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 16(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq -8(%rbp), %rsp
; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbp
; X64-NO-BMI2-NO-SHLD-NEXT: retq
;
; X64-NO-BMI2-HAVE-SHLD-LABEL: ashr_32bytes:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsp, %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT: andq $-32, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT: subq $96, %rsp
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rdi
; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, (%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq $63, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrb $6, %al
; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl %al, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax,8), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax,8), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax,8), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rsp,%rax,8), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rsp,%rax,8), %r8
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, %r9
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax,8), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rsp,%rax,8), %rax
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %rsi
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi
; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq %cl, %rax
@@ -2597,42 +2655,48 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, 24(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbp, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbp
; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: ashr_32bytes:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsp, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andq $-32, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: subq $96, %rsp
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%rsi), %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: sarq $63, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %sil
; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %sil, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi,8), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi,8), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rsp,%rsi,8), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rsp,%rsi,8), %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $63, %al
; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al
; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r10
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r10, %r10
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -72(%rsp,%rsi,8), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, (%rsp,%rsi,8), %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi,8), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 24(%rsp,%rsi,8), %rsi
; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rsi,%rsi), %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r9, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rax
@@ -2641,33 +2705,39 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbp, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbp
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_32bytes:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsp, %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andq $-32, %rsp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: subq $96, %rsp
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %rdi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl (%rsi), %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, (%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: sarq $63, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $6, %al
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax,8), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax,8), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax,8), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rsp,%rax,8), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rsp,%rax,8), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rsp,%rax,8), %r8
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, %r9
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 24(%rsp,%rax,8), %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %rsi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: sarxq %rcx, %rax, %rax
@@ -2675,6 +2745,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 24(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbp, %rsp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbp
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: ashr_32bytes:
@@ -3094,12 +3166,14 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: lshr_64bytes:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsp, %rbp
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r15
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r14
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r13
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r12
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: andq $-32, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT: subq $192, %rsp
; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax
; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx
; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
@@ -3110,34 +3184,35 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: movq 56(%rdi), %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movl (%rsi), %r8d
; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movl %r8d, %eax
; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %eax
; X64-NO-BMI2-NO-SHLD-NEXT: shrl $3, %r8d
; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%r8), %r11
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%r8), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 32(%rsp,%r8), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 40(%rsp,%r8), %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, %rsi
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rsi
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi
; X64-NO-BMI2-NO-SHLD-NEXT: notl %edi
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%r8), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 48(%rsp,%r8), %r14
; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r14,%r14), %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10
; X64-NO-BMI2-NO-SHLD-NEXT: orq %rsi, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
@@ -3146,11 +3221,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9
; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%r8), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 56(%rsp,%r8), %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, %r12
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%r8), %r15
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 64(%rsp,%r8), %r15
; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %rbx
; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbx
@@ -3161,12 +3236,12 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: orq %r14, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%r8), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 72(%rsp,%r8), %r14
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, %r13
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%r8), %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbp,%rbp), %r12
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 80(%rsp,%r8), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r10,%r10), %r12
; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r12
; X64-NO-BMI2-NO-SHLD-NEXT: orq %r13, %r12
@@ -3177,12 +3252,12 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r14
; X64-NO-BMI2-NO-SHLD-NEXT: orq %r15, %r14
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%r8), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 88(%rsp,%r8), %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rdi,%rdi), %r8
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbp, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %r8
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, 56(%rdx)
@@ -3192,8 +3267,9 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 16(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT: addq $8, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq -40(%rbp), %rsp
; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx
; X64-NO-BMI2-NO-SHLD-NEXT: popq %r12
; X64-NO-BMI2-NO-SHLD-NEXT: popq %r13
@@ -3204,9 +3280,13 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; X64-NO-BMI2-HAVE-SHLD-LABEL: lshr_64bytes:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsp, %rbp
; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r15
; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r14
; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: andq $-32, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT: subq $160, %rsp
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rcx
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9
@@ -3217,62 +3297,67 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %rdi
; X64-NO-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %eax
; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rcx, (%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax
; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rsp,%rax), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rsp,%rax), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rsp,%rax), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, %r8
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 32(%rsp,%rax), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rsp,%rax), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rbx, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 48(%rsp,%rax), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 40(%rsp,%rax), %r14
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rbx, %r15
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r14, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 56(%rsp,%rax), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rsi
; X64-NO-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 48(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, 48(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, 56(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 32(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r15, 40(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, 16(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, 24(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, 24(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq -24(%rbp), %rsp
; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbx
; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r14
; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbp
; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: lshr_64bytes:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsp, %rbp
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r15
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r13
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r12
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andq $-32, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: subq $160, %rsp
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
@@ -3283,82 +3368,87 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 56(%rdi), %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl (%rsi), %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r14, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi
; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %esi
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rsp,%rax), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rsp,%rax), %r11
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %r10d
; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r10d
; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r11,%r11), %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %rdi, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, (%rsp,%rax), %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %sil
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r8, %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 24(%rsp,%rax), %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 32(%rsp,%rax), %r15
; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %r9, %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rbx, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rbx, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r12,%r12), %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %r13, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r11, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r11, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rbx, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rbx, %r11
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r12, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 40(%rsp,%rax), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 48(%rsp,%rax), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r14,%r14), %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %r12, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r12, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rbx, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rbx, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 56(%rsp,%rax), %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rax,%rax), %r15
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r15, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r14, %r14
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rsi
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rax, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 56(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 48(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 32(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 40(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 16(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq -32(%rbp), %rsp
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r14
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbp
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_64bytes:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsp, %rbp
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r15
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r14
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andq $-32, %rsp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: subq $160, %rsp
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rcx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9
@@ -3369,52 +3459,54 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %rdi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %eax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, (%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %ecx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rsp,%rax), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rsp,%rax), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rsp,%rax), %r9
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, %r8
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 32(%rsp,%rax), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 24(%rsp,%rax), %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rbx, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 48(%rsp,%rax), %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 40(%rsp,%rax), %r14
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r14, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rbx, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r14, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 56(%rsp,%rax), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %rbx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %rsi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rax, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 48(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 48(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 56(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 32(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 32(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r15, 40(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 16(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 24(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq -24(%rbp), %rsp
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r14
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbp
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: lshr_64bytes:
@@ -4214,11 +4306,15 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: shl_64bytes:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsp, %rbp
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r15
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r14
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r13
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r12
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: andq $-32, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT: subq $160, %rsp
; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax
; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx
; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r8
@@ -4229,26 +4325,26 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: movq 56(%rdi), %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movl (%rsi), %esi
; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax
; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %eax
; X64-NO-BMI2-NO-SHLD-NEXT: shrl $3, %esi
; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %esi
; X64-NO-BMI2-NO-SHLD-NEXT: negl %esi
; X64-NO-BMI2-NO-SHLD-NEXT: movslq %esi, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rbx), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rbx), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 64(%rsp,%rbx), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 72(%rsp,%rbx), %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10
@@ -4259,11 +4355,11 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9
; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rbx), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 88(%rsp,%rbx), %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, %r14
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rbx), %r15
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 80(%rsp,%rbx), %r15
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r15, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
@@ -4275,11 +4371,11 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: orq %r15, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rbx), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 104(%rsp,%rbx), %r14
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, %r12
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rbx), %r13
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 96(%rsp,%rbx), %r13
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r13, %r15
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r15
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
@@ -4291,10 +4387,10 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10
; X64-NO-BMI2-NO-SHLD-NEXT: orq %r13, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -8(%rsp,%rbx), %r12
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 120(%rsp,%rbx), %r12
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rbx), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 112(%rsp,%rbx), %rbx
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, %r13
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r13
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
@@ -4316,18 +4412,23 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, 16(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 24(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq -40(%rbp), %rsp
; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx
; X64-NO-BMI2-NO-SHLD-NEXT: popq %r12
; X64-NO-BMI2-NO-SHLD-NEXT: popq %r13
; X64-NO-BMI2-NO-SHLD-NEXT: popq %r14
; X64-NO-BMI2-NO-SHLD-NEXT: popq %r15
+; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbp
; X64-NO-BMI2-NO-SHLD-NEXT: retq
;
; X64-NO-BMI2-HAVE-SHLD-LABEL: shl_64bytes:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsp, %rbp
; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r14
; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: andq $-32, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT: subq $128, %rsp
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %rcx
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r8
@@ -4338,38 +4439,38 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %rdi
; X64-NO-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %esi
; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, (%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %esi
; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %esi
; X64-NO-BMI2-HAVE-SHLD-NEXT: negl %esi
; X64-NO-BMI2-HAVE-SHLD-NEXT: movslq %esi, %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%r9), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%r9), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 80(%rsp,%r9), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 88(%rsp,%r9), %r10
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, %rsi
; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rax, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%r9), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%r9), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 64(%rsp,%r9), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 72(%rsp,%r9), %rdi
; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rdi, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%r9), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%r9), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 96(%rsp,%r9), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 104(%rsp,%r9), %rbx
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, %r14
; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r11, %r14
; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r10, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%r9), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%r9), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 112(%rsp,%r9), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 120(%rsp,%r9), %r9
; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r10, %r9
; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rbx, %r10
; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r8, %rdi
@@ -4383,18 +4484,22 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 24(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, (%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, 8(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: addq $8, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq -16(%rbp), %rsp
; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbx
; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbp
; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: shl_64bytes:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsp, %rbp
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r15
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r14
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r12
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andq $-32, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: subq $160, %rsp
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r8
@@ -4405,18 +4510,18 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 56(%rdi), %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl (%rsi), %esi
; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
@@ -4424,17 +4529,17 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %esi
; X64-HAVE-BMI2-NO-SHLD-NEXT: negl %esi
; X64-HAVE-BMI2-NO-SHLD-NEXT: movslq %esi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 64(%rsp,%rsi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 72(%rsp,%rsi), %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rdi, %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r9, %r10
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r9, %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rsi), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 88(%rsp,%rsi), %r11
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r11, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 80(%rsp,%rsi), %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r8, %r14
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r8, %r8
@@ -4442,9 +4547,9 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rsi), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 104(%rsp,%rsi), %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rbx, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rsi), %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 96(%rsp,%rsi), %r15
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r15, %r12
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r15
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r15, %r15
@@ -4452,8 +4557,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r11
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r11, %r11
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r12, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, -8(%rsp,%rsi), %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rsi), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, 120(%rsp,%rsi), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 112(%rsp,%rsi), %rsi
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rsi, %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rsi
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %rsi
@@ -4469,18 +4574,22 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 16(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 24(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq -32(%rbp), %rsp
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r14
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbp
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: shl_64bytes:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsp, %rbp
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r14
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andq $-32, %rsp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: subq $128, %rsp
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %rcx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r8
@@ -4491,38 +4600,38 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %rdi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %esi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, (%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %ecx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %esi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %esi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: negl %esi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movslq %esi, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%r8), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%r8), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 80(%rsp,%r8), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 88(%rsp,%r8), %r9
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, %rsi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rax, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%r8), %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%r8), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 64(%rsp,%r8), %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 72(%rsp,%r8), %rdi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rdi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -32(%rsp,%r8), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -24(%rsp,%r8), %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 96(%rsp,%r8), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 104(%rsp,%r8), %rbx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, %r14
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r11, %r14
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r9, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -16(%rsp,%r8), %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -8(%rsp,%r8), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 112(%rsp,%r8), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 120(%rsp,%r8), %r8
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r9, %r8
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rbx, %r9
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r10, %rdi
@@ -4535,9 +4644,10 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, 24(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, (%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 8(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: addq $8, %rsp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq -16(%rbp), %rsp
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbp
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: shl_64bytes:
@@ -5353,12 +5463,14 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: ashr_64bytes:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsp, %rbp
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r15
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r14
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r13
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r12
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: andq $-32, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT: subq $192, %rsp
; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax
; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx
; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
@@ -5368,39 +5480,40 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: movq 48(%rdi), %r14
; X64-NO-BMI2-NO-SHLD-NEXT: movq 56(%rdi), %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movl (%rsi), %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: sarq $63, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movl %r8d, %eax
; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %eax
; X64-NO-BMI2-NO-SHLD-NEXT: shrl $3, %r8d
; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%r8), %r11
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%r8), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 32(%rsp,%r8), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 40(%rsp,%r8), %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, %rsi
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rsi
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi
; X64-NO-BMI2-NO-SHLD-NEXT: notl %edi
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%r8), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 48(%rsp,%r8), %r14
; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r14,%r14), %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10
; X64-NO-BMI2-NO-SHLD-NEXT: orq %rsi, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
@@ -5409,11 +5522,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9
; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%r8), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 56(%rsp,%r8), %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, %r12
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%r8), %r15
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 64(%rsp,%r8), %r15
; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %rbx
; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbx
@@ -5424,12 +5537,12 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: orq %r14, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%r8), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 72(%rsp,%r8), %r14
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, %r13
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%r8), %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbp,%rbp), %r12
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 80(%rsp,%r8), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r10,%r10), %r12
; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r12
; X64-NO-BMI2-NO-SHLD-NEXT: orq %r13, %r12
@@ -5440,12 +5553,12 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r14
; X64-NO-BMI2-NO-SHLD-NEXT: orq %r15, %r14
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%r8), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 88(%rsp,%r8), %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rdi,%rdi), %r8
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbp, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %r8
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: sarq %cl, %rdi
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, 56(%rdx)
@@ -5455,8 +5568,9 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 16(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT: addq $8, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq -40(%rbp), %rsp
; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx
; X64-NO-BMI2-NO-SHLD-NEXT: popq %r12
; X64-NO-BMI2-NO-SHLD-NEXT: popq %r13
@@ -5467,9 +5581,13 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; X64-NO-BMI2-HAVE-SHLD-LABEL: ashr_64bytes:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsp, %rbp
; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r15
; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r14
; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: andq $-32, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT: subq $160, %rsp
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rcx
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9
@@ -5479,67 +5597,72 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %r14
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %rdi
; X64-NO-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rcx, (%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq $63, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax
; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rsp,%rax), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rsp,%rax), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rsp,%rax), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, %r8
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 32(%rsp,%rax), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rsp,%rax), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rbx, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 48(%rsp,%rax), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 40(%rsp,%rax), %r14
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, %r15
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rbx, %r15
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r14, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 56(%rsp,%rax), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rsi
; X64-NO-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq %cl, %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 48(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, 48(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, 56(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 32(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r15, 40(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, 16(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, 24(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, 24(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq -24(%rbp), %rsp
; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbx
; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r14
; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r15
+; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbp
; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: ashr_64bytes:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsp, %rbp
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r15
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r13
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r12
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andq $-32, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: subq $160, %rsp
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
@@ -5549,87 +5672,92 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 48(%rdi), %r14
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 56(%rdi), %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl (%rsi), %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r14, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: sarq $63, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi
; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %esi
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rsp,%rax), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rsp,%rax), %r11
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %r10d
; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r10d
; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r11,%r11), %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %rdi, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, (%rsp,%rax), %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %sil
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r8, %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 24(%rsp,%rax), %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 32(%rsp,%rax), %r15
; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %r9, %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rbx, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rbx, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r12,%r12), %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %r13, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r11, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r11, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rbx, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rbx, %r11
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r12, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 40(%rsp,%rax), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 48(%rsp,%rax), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r14,%r14), %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %r12, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r12, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rbx, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rbx, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 56(%rsp,%rax), %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rax,%rax), %r15
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r15, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r14, %r14
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rsi
; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rcx, %rax, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 56(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 48(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 32(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 40(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 16(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq -32(%rbp), %rsp
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r14
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbp
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_64bytes:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsp, %rbp
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r15
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r14
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andq $-32, %rsp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: subq $160, %rsp
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rcx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9
@@ -5639,57 +5767,59 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %r14
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %rdi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, (%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: sarq $63, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %ecx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rsp,%rax), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rsp,%rax), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rsp,%rax), %r9
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, %r8
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 32(%rsp,%rax), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 24(%rsp,%rax), %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rbx, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 48(%rsp,%rax), %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 40(%rsp,%rax), %r14
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %r15
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r14, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rbx, %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r14, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 56(%rsp,%rax), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %rbx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %rsi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: sarxq %rcx, %rax, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 48(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 48(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 56(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 32(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 32(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r15, 40(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 16(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 24(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq -24(%rbp), %rsp
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r14
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r15
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbp
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: ashr_64bytes:
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
index fde915247760a..7b68d3bfdff87 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
@@ -964,19 +964,25 @@ define void @load_8byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
; X64: # %bb.0:
+; X64-NEXT: pushq %rbp
+; X64-NEXT: movq %rsp, %rbp
+; X64-NEXT: andq $-32, %rsp
+; X64-NEXT: subq $96, %rsp
; X64-NEXT: movups (%rdi), %xmm0
; X64-NEXT: xorps %xmm1, %xmm1
; X64-NEXT: leal (,%rsi,8), %eax
-; X64-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm0, (%rsp)
; X64-NEXT: shrb $6, %al
; X64-NEXT: movzbl %al, %eax
-; X64-NEXT: leaq -72(%rsp,%rax,8), %rax
+; X64-NEXT: leaq (%rsp,%rax,8), %rax
; X64-NEXT: andl $7, %esi
; X64-NEXT: movzbl (%rsi,%rax), %eax
; X64-NEXT: movb %al, (%rdx)
+; X64-NEXT: movq %rbp, %rsp
+; X64-NEXT: popq %rbp
; X64-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
@@ -1079,19 +1085,23 @@ define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
define void @load_2byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
; X64-NO-BMI2: # %bb.0:
+; X64-NO-BMI2-NEXT: pushq %rbp
+; X64-NO-BMI2-NEXT: movq %rsp, %rbp
+; X64-NO-BMI2-NEXT: andq $-32, %rsp
+; X64-NO-BMI2-NEXT: subq $96, %rsp
; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
; X64-NO-BMI2-NEXT: xorps %xmm1, %xmm1
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
-; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, (%rsp)
; X64-NO-BMI2-NEXT: movl %ecx, %eax
; X64-NO-BMI2-NEXT: shrb $6, %al
; X64-NO-BMI2-NEXT: movzbl %al, %eax
-; X64-NO-BMI2-NEXT: movq -72(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-NEXT: movq (%rsp,%rax,8), %rsi
; X64-NO-BMI2-NEXT: shrq %cl, %rsi
-; X64-NO-BMI2-NEXT: movl -64(%rsp,%rax,8), %eax
+; X64-NO-BMI2-NEXT: movl 8(%rsp,%rax,8), %eax
; X64-NO-BMI2-NEXT: addl %eax, %eax
; X64-NO-BMI2-NEXT: andb $56, %cl
; X64-NO-BMI2-NEXT: notb %cl
@@ -1099,29 +1109,37 @@ define void @load_2byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
; X64-NO-BMI2-NEXT: shlq %cl, %rax
; X64-NO-BMI2-NEXT: orl %esi, %eax
; X64-NO-BMI2-NEXT: movw %ax, (%rdx)
+; X64-NO-BMI2-NEXT: movq %rbp, %rsp
+; X64-NO-BMI2-NEXT: popq %rbp
; X64-NO-BMI2-NEXT: retq
;
; X64-BMI2-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
; X64-BMI2: # %bb.0:
+; X64-BMI2-NEXT: pushq %rbp
+; X64-BMI2-NEXT: movq %rsp, %rbp
+; X64-BMI2-NEXT: andq $-32, %rsp
+; X64-BMI2-NEXT: subq $96, %rsp
; X64-BMI2-NEXT: movups (%rdi), %xmm0
; X64-BMI2-NEXT: xorps %xmm1, %xmm1
; X64-BMI2-NEXT: shll $3, %esi
-; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, (%rsp)
; X64-BMI2-NEXT: movl %esi, %eax
; X64-BMI2-NEXT: shrb $6, %al
; X64-BMI2-NEXT: movzbl %al, %eax
-; X64-BMI2-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-BMI2-NEXT: shrxq %rsi, (%rsp,%rax,8), %rcx
; X64-BMI2-NEXT: # kill: def $sil killed $sil killed $rsi def $rsi
; X64-BMI2-NEXT: andb $56, %sil
; X64-BMI2-NEXT: notb %sil
-; X64-BMI2-NEXT: movl -64(%rsp,%rax,8), %eax
+; X64-BMI2-NEXT: movl 8(%rsp,%rax,8), %eax
; X64-BMI2-NEXT: addl %eax, %eax
; X64-BMI2-NEXT: shlxq %rsi, %rax, %rax
; X64-BMI2-NEXT: orl %eax, %ecx
; X64-BMI2-NEXT: movw %cx, (%rdx)
+; X64-BMI2-NEXT: movq %rbp, %rsp
+; X64-BMI2-NEXT: popq %rbp
; X64-BMI2-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca_with_zero_upper_half:
@@ -1223,19 +1241,23 @@ define void @load_2byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
define void @load_4byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
; X64-NO-BMI2: # %bb.0:
+; X64-NO-BMI2-NEXT: pushq %rbp
+; X64-NO-BMI2-NEXT: movq %rsp, %rbp
+; X64-NO-BMI2-NEXT: andq $-32, %rsp
+; X64-NO-BMI2-NEXT: subq $96, %rsp
; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
; X64-NO-BMI2-NEXT: xorps %xmm1, %xmm1
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
-; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, (%rsp)
; X64-NO-BMI2-NEXT: movl %ecx, %eax
; X64-NO-BMI2-NEXT: shrb $6, %al
; X64-NO-BMI2-NEXT: movzbl %al, %eax
-; X64-NO-BMI2-NEXT: movq -72(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-NEXT: movq (%rsp,%rax,8), %rsi
; X64-NO-BMI2-NEXT: shrq %cl, %rsi
-; X64-NO-BMI2-NEXT: movl -64(%rsp,%rax,8), %eax
+; X64-NO-BMI2-NEXT: movl 8(%rsp,%rax,8), %eax
; X64-NO-BMI2-NEXT: addl %eax, %eax
; X64-NO-BMI2-NEXT: andb $56, %cl
; X64-NO-BMI2-NEXT: notb %cl
@@ -1243,29 +1265,37 @@ define void @load_4byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
; X64-NO-BMI2-NEXT: shlq %cl, %rax
; X64-NO-BMI2-NEXT: orl %esi, %eax
; X64-NO-BMI2-NEXT: movl %eax, (%rdx)
+; X64-NO-BMI2-NEXT: movq %rbp, %rsp
+; X64-NO-BMI2-NEXT: popq %rbp
; X64-NO-BMI2-NEXT: retq
;
; X64-BMI2-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
; X64-BMI2: # %bb.0:
+; X64-BMI2-NEXT: pushq %rbp
+; X64-BMI2-NEXT: movq %rsp, %rbp
+; X64-BMI2-NEXT: andq $-32, %rsp
+; X64-BMI2-NEXT: subq $96, %rsp
; X64-BMI2-NEXT: movups (%rdi), %xmm0
; X64-BMI2-NEXT: xorps %xmm1, %xmm1
; X64-BMI2-NEXT: shll $3, %esi
-; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, (%rsp)
; X64-BMI2-NEXT: movl %esi, %eax
; X64-BMI2-NEXT: shrb $6, %al
; X64-BMI2-NEXT: movzbl %al, %eax
-; X64-BMI2-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-BMI2-NEXT: shrxq %rsi, (%rsp,%rax,8), %rcx
; X64-BMI2-NEXT: # kill: def $sil killed $sil killed $rsi def $rsi
; X64-BMI2-NEXT: andb $56, %sil
; X64-BMI2-NEXT: notb %sil
-; X64-BMI2-NEXT: movl -64(%rsp,%rax,8), %eax
+; X64-BMI2-NEXT: movl 8(%rsp,%rax,8), %eax
; X64-BMI2-NEXT: addl %eax, %eax
; X64-BMI2-NEXT: shlxq %rsi, %rax, %rax
; X64-BMI2-NEXT: orl %eax, %ecx
; X64-BMI2-NEXT: movl %ecx, (%rdx)
+; X64-BMI2-NEXT: movq %rbp, %rsp
+; X64-BMI2-NEXT: popq %rbp
; X64-BMI2-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca_with_zero_upper_half:
@@ -1367,18 +1397,22 @@ define void @load_4byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsp, %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT: andq $-32, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT: subq $96, %rsp
; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
; X64-NO-BMI2-NO-SHLD-NEXT: shrb $6, %al
; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax,8), %rsi
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rax,8), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rsp,%rax,8), %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rsp,%rax,8), %rax
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rsi
; X64-NO-BMI2-NO-SHLD-NEXT: notb %cl
; X64-NO-BMI2-NO-SHLD-NEXT: addq %rax, %rax
@@ -1386,46 +1420,60 @@ define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rax
; X64-NO-BMI2-NO-SHLD-NEXT: orq %rsi, %rax
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbp, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbp
; X64-NO-BMI2-NO-SHLD-NEXT: retq
;
; X64-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
; X64-SHLD: # %bb.0:
+; X64-SHLD-NEXT: pushq %rbp
+; X64-SHLD-NEXT: movq %rsp, %rbp
+; X64-SHLD-NEXT: andq $-32, %rsp
+; X64-SHLD-NEXT: subq $96, %rsp
; X64-SHLD-NEXT: movups (%rdi), %xmm0
; X64-SHLD-NEXT: xorps %xmm1, %xmm1
; X64-SHLD-NEXT: leal (,%rsi,8), %ecx
-; X64-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm0, (%rsp)
; X64-SHLD-NEXT: movl %ecx, %eax
; X64-SHLD-NEXT: shrb $6, %al
; X64-SHLD-NEXT: movzbl %al, %eax
-; X64-SHLD-NEXT: movq -72(%rsp,%rax,8), %rsi
-; X64-SHLD-NEXT: movq -64(%rsp,%rax,8), %rax
+; X64-SHLD-NEXT: movq (%rsp,%rax,8), %rsi
+; X64-SHLD-NEXT: movq 8(%rsp,%rax,8), %rax
; X64-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-SHLD-NEXT: shrdq %cl, %rax, %rsi
; X64-SHLD-NEXT: movq %rsi, (%rdx)
+; X64-SHLD-NEXT: movq %rbp, %rsp
+; X64-SHLD-NEXT: popq %rbp
; X64-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsp, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andq $-32, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: subq $96, %rsp
; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %al
; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %al, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, (%rsp,%rax,8), %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rsp,%rax,8), %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbp, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbp
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca_with_zero_upper_half:
@@ -1566,18 +1614,22 @@ define void @load_8byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsp, %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT: andq $-32, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT: subq $96, %rsp
; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrb $6, %cl
; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %edi
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rdi,8), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rdi,8), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rsp,%rdi,8), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rsp,%rdi,8), %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r8
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
@@ -1588,34 +1640,40 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i
; X64-NO-BMI2-NO-SHLD-NEXT: orq %r8, %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rdi,8), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rsp,%rdi,8), %rax
; X64-NO-BMI2-NO-SHLD-NEXT: addq %rax, %rax
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rax
; X64-NO-BMI2-NO-SHLD-NEXT: orq %r9, %rax
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, 8(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbp, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbp
; X64-NO-BMI2-NO-SHLD-NEXT: retq
;
; X64-NO-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsp, %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT: andq $-32, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT: subq $96, %rsp
; X64-NO-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm1, %xmm1
; X64-NO-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, (%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrb $6, %cl
; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl %cl, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rsi,8), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rsi,8), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rsp,%rsi,8), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rsp,%rsi,8), %r8
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, %r9
; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r9
; X64-NO-BMI2-HAVE-SHLD-NEXT: notb %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rsi,8), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rsp,%rsi,8), %rsi
; X64-NO-BMI2-HAVE-SHLD-NEXT: addq %rsi, %rsi
; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rsi
; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r9, %rsi
@@ -1623,25 +1681,31 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbp, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbp
; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsp, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andq $-32, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: subq $96, %rsp
; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm1, %xmm1
; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %cl
; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -72(%rsp,%rcx,8), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, (%rsp,%rcx,8), %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rcx,8), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rcx,8), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rsp,%rcx,8), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rsp,%rcx,8), %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r9, %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %r9
@@ -1651,27 +1715,33 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 8(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbp, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbp
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsp, %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andq $-32, %rsp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: subq $96, %rsp
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm1, %xmm1
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, (%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $6, %al
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax,8), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax,8), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rsp,%rax,8), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rsp,%rax,8), %rdi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rdi, %r8
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %r9d
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notb %r9b
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rsp,%rax,8), %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: addq %rax, %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r9, %rax, %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r8, %rax
@@ -1679,6 +1749,8 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %rsi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbp, %rsp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbp
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
@@ -1866,59 +1938,67 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i
define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
; X64-NO-BMI2: # %bb.0:
-; X64-NO-BMI2-NEXT: pushq %rax
+; X64-NO-BMI2-NEXT: pushq %rbp
+; X64-NO-BMI2-NEXT: movq %rsp, %rbp
+; X64-NO-BMI2-NEXT: andq $-32, %rsp
+; X64-NO-BMI2-NEXT: subq $160, %rsp
; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm1
; X64-NO-BMI2-NEXT: xorps %xmm2, %xmm2
-; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, (%rsp)
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT: andl $56, %ecx
; X64-NO-BMI2-NEXT: andl $56, %esi
-; X64-NO-BMI2-NEXT: movq -128(%rsp,%rsi), %rax
+; X64-NO-BMI2-NEXT: movq (%rsp,%rsi), %rax
; X64-NO-BMI2-NEXT: shrq %cl, %rax
-; X64-NO-BMI2-NEXT: movl -120(%rsp,%rsi), %esi
+; X64-NO-BMI2-NEXT: movl 8(%rsp,%rsi), %esi
; X64-NO-BMI2-NEXT: addl %esi, %esi
; X64-NO-BMI2-NEXT: notl %ecx
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT: shlq %cl, %rsi
; X64-NO-BMI2-NEXT: orl %eax, %esi
; X64-NO-BMI2-NEXT: movb %sil, (%rdx)
-; X64-NO-BMI2-NEXT: popq %rax
+; X64-NO-BMI2-NEXT: movq %rbp, %rsp
+; X64-NO-BMI2-NEXT: popq %rbp
; X64-NO-BMI2-NEXT: retq
;
; X64-BMI2-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
; X64-BMI2: # %bb.0:
-; X64-BMI2-NEXT: pushq %rax
+; X64-BMI2-NEXT: pushq %rbp
+; X64-BMI2-NEXT: movq %rsp, %rbp
+; X64-BMI2-NEXT: andq $-32, %rsp
+; X64-BMI2-NEXT: subq $160, %rsp
; X64-BMI2-NEXT: movups (%rdi), %xmm0
; X64-BMI2-NEXT: movups 16(%rdi), %xmm1
; X64-BMI2-NEXT: xorps %xmm2, %xmm2
-; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, (%rsp)
; X64-BMI2-NEXT: leal (,%rsi,8), %eax
; X64-BMI2-NEXT: andl $56, %eax
; X64-BMI2-NEXT: movl %eax, %ecx
; X64-BMI2-NEXT: notl %eax
; X64-BMI2-NEXT: andl $56, %esi
-; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %edi
+; X64-BMI2-NEXT: movl 8(%rsp,%rsi), %edi
; X64-BMI2-NEXT: addl %edi, %edi
; X64-BMI2-NEXT: shlxq %rax, %rdi, %rax
-; X64-BMI2-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rcx
+; X64-BMI2-NEXT: shrxq %rcx, (%rsp,%rsi), %rcx
; X64-BMI2-NEXT: orl %eax, %ecx
; X64-BMI2-NEXT: movb %cl, (%rdx)
-; X64-BMI2-NEXT: popq %rax
+; X64-BMI2-NEXT: movq %rbp, %rsp
+; X64-BMI2-NEXT: popq %rbp
; X64-BMI2-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
@@ -2030,59 +2110,67 @@ define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
; X64-NO-BMI2: # %bb.0:
-; X64-NO-BMI2-NEXT: pushq %rax
+; X64-NO-BMI2-NEXT: pushq %rbp
+; X64-NO-BMI2-NEXT: movq %rsp, %rbp
+; X64-NO-BMI2-NEXT: andq $-32, %rsp
+; X64-NO-BMI2-NEXT: subq $160, %rsp
; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm1
; X64-NO-BMI2-NEXT: xorps %xmm2, %xmm2
-; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, (%rsp)
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT: andl $56, %ecx
; X64-NO-BMI2-NEXT: andl $56, %esi
-; X64-NO-BMI2-NEXT: movq -128(%rsp,%rsi), %rax
+; X64-NO-BMI2-NEXT: movq (%rsp,%rsi), %rax
; X64-NO-BMI2-NEXT: shrq %cl, %rax
-; X64-NO-BMI2-NEXT: movl -120(%rsp,%rsi), %esi
+; X64-NO-BMI2-NEXT: movl 8(%rsp,%rsi), %esi
; X64-NO-BMI2-NEXT: addl %esi, %esi
; X64-NO-BMI2-NEXT: notl %ecx
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT: shlq %cl, %rsi
; X64-NO-BMI2-NEXT: orl %eax, %esi
; X64-NO-BMI2-NEXT: movw %si, (%rdx)
-; X64-NO-BMI2-NEXT: popq %rax
+; X64-NO-BMI2-NEXT: movq %rbp, %rsp
+; X64-NO-BMI2-NEXT: popq %rbp
; X64-NO-BMI2-NEXT: retq
;
; X64-BMI2-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
; X64-BMI2: # %bb.0:
-; X64-BMI2-NEXT: pushq %rax
+; X64-BMI2-NEXT: pushq %rbp
+; X64-BMI2-NEXT: movq %rsp, %rbp
+; X64-BMI2-NEXT: andq $-32, %rsp
+; X64-BMI2-NEXT: subq $160, %rsp
; X64-BMI2-NEXT: movups (%rdi), %xmm0
; X64-BMI2-NEXT: movups 16(%rdi), %xmm1
; X64-BMI2-NEXT: xorps %xmm2, %xmm2
-; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, (%rsp)
; X64-BMI2-NEXT: leal (,%rsi,8), %eax
; X64-BMI2-NEXT: andl $56, %eax
; X64-BMI2-NEXT: movl %eax, %ecx
; X64-BMI2-NEXT: notl %eax
; X64-BMI2-NEXT: andl $56, %esi
-; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %edi
+; X64-BMI2-NEXT: movl 8(%rsp,%rsi), %edi
; X64-BMI2-NEXT: addl %edi, %edi
; X64-BMI2-NEXT: shlxq %rax, %rdi, %rax
-; X64-BMI2-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rcx
+; X64-BMI2-NEXT: shrxq %rcx, (%rsp,%rsi), %rcx
; X64-BMI2-NEXT: orl %eax, %ecx
; X64-BMI2-NEXT: movw %cx, (%rdx)
-; X64-BMI2-NEXT: popq %rax
+; X64-BMI2-NEXT: movq %rbp, %rsp
+; X64-BMI2-NEXT: popq %rbp
; X64-BMI2-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_64byte_alloca_with_zero_upper_half:
@@ -2193,59 +2281,67 @@ define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
; X64-NO-BMI2: # %bb.0:
-; X64-NO-BMI2-NEXT: pushq %rax
+; X64-NO-BMI2-NEXT: pushq %rbp
+; X64-NO-BMI2-NEXT: movq %rsp, %rbp
+; X64-NO-BMI2-NEXT: andq $-32, %rsp
+; X64-NO-BMI2-NEXT: subq $160, %rsp
; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm1
; X64-NO-BMI2-NEXT: xorps %xmm2, %xmm2
-; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, (%rsp)
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT: andl $56, %ecx
; X64-NO-BMI2-NEXT: andl $56, %esi
-; X64-NO-BMI2-NEXT: movq -128(%rsp,%rsi), %rax
+; X64-NO-BMI2-NEXT: movq (%rsp,%rsi), %rax
; X64-NO-BMI2-NEXT: shrq %cl, %rax
-; X64-NO-BMI2-NEXT: movl -120(%rsp,%rsi), %esi
+; X64-NO-BMI2-NEXT: movl 8(%rsp,%rsi), %esi
; X64-NO-BMI2-NEXT: addl %esi, %esi
; X64-NO-BMI2-NEXT: notl %ecx
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT: shlq %cl, %rsi
; X64-NO-BMI2-NEXT: orl %eax, %esi
; X64-NO-BMI2-NEXT: movl %esi, (%rdx)
-; X64-NO-BMI2-NEXT: popq %rax
+; X64-NO-BMI2-NEXT: movq %rbp, %rsp
+; X64-NO-BMI2-NEXT: popq %rbp
; X64-NO-BMI2-NEXT: retq
;
; X64-BMI2-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
; X64-BMI2: # %bb.0:
-; X64-BMI2-NEXT: pushq %rax
+; X64-BMI2-NEXT: pushq %rbp
+; X64-BMI2-NEXT: movq %rsp, %rbp
+; X64-BMI2-NEXT: andq $-32, %rsp
+; X64-BMI2-NEXT: subq $160, %rsp
; X64-BMI2-NEXT: movups (%rdi), %xmm0
; X64-BMI2-NEXT: movups 16(%rdi), %xmm1
; X64-BMI2-NEXT: xorps %xmm2, %xmm2
-; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, (%rsp)
; X64-BMI2-NEXT: leal (,%rsi,8), %eax
; X64-BMI2-NEXT: andl $56, %eax
; X64-BMI2-NEXT: movl %eax, %ecx
; X64-BMI2-NEXT: notl %eax
; X64-BMI2-NEXT: andl $56, %esi
-; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %edi
+; X64-BMI2-NEXT: movl 8(%rsp,%rsi), %edi
; X64-BMI2-NEXT: addl %edi, %edi
; X64-BMI2-NEXT: shlxq %rax, %rdi, %rax
-; X64-BMI2-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rcx
+; X64-BMI2-NEXT: shrxq %rcx, (%rsp,%rsi), %rcx
; X64-BMI2-NEXT: orl %eax, %ecx
; X64-BMI2-NEXT: movl %ecx, (%rdx)
-; X64-BMI2-NEXT: popq %rax
+; X64-BMI2-NEXT: movq %rbp, %rsp
+; X64-BMI2-NEXT: popq %rbp
; X64-BMI2-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_64byte_alloca_with_zero_upper_half:
@@ -2356,22 +2452,25 @@ define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsp, %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT: andq $-32, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT: subq $160, %rsp
; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%rsi), %rax
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rsp,%rsi), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rsp,%rsi), %rsi
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rax
; X64-NO-BMI2-NO-SHLD-NEXT: notb %cl
; X64-NO-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi
@@ -2379,57 +2478,66 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rsi
; X64-NO-BMI2-NO-SHLD-NEXT: orq %rax, %rsi
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsi, (%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT: popq %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbp, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbp
; X64-NO-BMI2-NO-SHLD-NEXT: retq
;
; X64-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
; X64-SHLD: # %bb.0:
-; X64-SHLD-NEXT: pushq %rax
+; X64-SHLD-NEXT: pushq %rbp
+; X64-SHLD-NEXT: movq %rsp, %rbp
+; X64-SHLD-NEXT: andq $-32, %rsp
+; X64-SHLD-NEXT: subq $160, %rsp
; X64-SHLD-NEXT: movups (%rdi), %xmm0
; X64-SHLD-NEXT: movups 16(%rdi), %xmm1
; X64-SHLD-NEXT: xorps %xmm2, %xmm2
-; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm0, (%rsp)
; X64-SHLD-NEXT: leal (,%rsi,8), %ecx
; X64-SHLD-NEXT: andl $56, %esi
-; X64-SHLD-NEXT: movq -128(%rsp,%rsi), %rax
-; X64-SHLD-NEXT: movq -120(%rsp,%rsi), %rsi
+; X64-SHLD-NEXT: movq (%rsp,%rsi), %rax
+; X64-SHLD-NEXT: movq 8(%rsp,%rsi), %rsi
; X64-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-SHLD-NEXT: shrdq %cl, %rsi, %rax
; X64-SHLD-NEXT: movq %rax, (%rdx)
-; X64-SHLD-NEXT: popq %rax
+; X64-SHLD-NEXT: movq %rbp, %rsp
+; X64-SHLD-NEXT: popq %rbp
; X64-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsp, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andq $-32, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: subq $160, %rsp
; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -128(%rsp,%rsi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, (%rsp,%rsi), %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %al
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rsp,%rsi), %rsi
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbp, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbp
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
@@ -2582,24 +2690,27 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsp, %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT: andq $-32, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT: subq $160, %rsp
; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi
; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %edi
; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%rsi), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rsp,%rsi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rsp,%rsi), %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r8
; X64-NO-BMI2-NO-SHLD-NEXT: notb %cl
@@ -2609,41 +2720,45 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9
; X64-NO-BMI2-NO-SHLD-NEXT: notl %eax
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rsi), %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rsp,%rsi), %rsi
; X64-NO-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rsi
; X64-NO-BMI2-NO-SHLD-NEXT: orq %r9, %rsi
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsi, 8(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, (%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT: popq %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbp, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbp
; X64-NO-BMI2-NO-SHLD-NEXT: retq
;
; X64-NO-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsp, %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT: andq $-32, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT: subq $160, %rsp
; X64-NO-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1
; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, (%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %eax
; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edi
; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %edi
; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rsi), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rsi), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rsp,%rsi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rsp,%rsi), %r9
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, %r10
; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r10
; X64-NO-BMI2-HAVE-SHLD-NEXT: notl %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rsi), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rsp,%rsi), %rsi
; X64-NO-BMI2-HAVE-SHLD-NEXT: addq %rsi, %rsi
; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rsi
@@ -2652,32 +2767,36 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %r8
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, (%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 8(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbp, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbp
; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsp, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andq $-32, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: subq $160, %rsp
; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %ecx
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi
; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, -128(%rsp,%rsi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, (%rsp,%rsi), %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rsi), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rsp,%rsi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rsp,%rsi), %rsi
; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r10
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r10, %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rcx
@@ -2688,40 +2807,45 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %rsi
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 8(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbp, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbp
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsp, %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andq $-32, %rsp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: subq $160, %rsp
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, (%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %ecx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notl %eax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rsp,%rsi), %rdi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: addq %rdi, %rdi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rax, %rdi, %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rsi), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rsi), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rsp,%rsi), %rsi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %r8
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %rax, %r8
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rdi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbp, %rsp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbp
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_64byte_alloca_with_zero_upper_half:
@@ -2916,26 +3040,29 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsp, %rbp
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r14
; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: andq $-32, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT: subq $128, %rsp
; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi
; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %edi
; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%rsi), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rsp,%rsi), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rsp,%rsi), %r11
; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %r8d
@@ -2948,71 +3075,75 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11
; X64-NO-BMI2-NO-SHLD-NEXT: notl %eax
; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rsi), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r10,%r10), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rsp,%rsi), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rsi), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 24(%rsp,%rsi), %r11
; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r11,%r11), %r14
; X64-NO-BMI2-NO-SHLD-NEXT: movl %r8d, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbx, %r14
; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rsi), %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 32(%rsp,%rsi), %rsi
; X64-NO-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rsi
; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %rsi
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsi, 24(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, 16(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 8(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT: addq $8, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT: leaq -16(%rbp), %rsp
; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx
; X64-NO-BMI2-NO-SHLD-NEXT: popq %r14
+; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbp
; X64-NO-BMI2-NO-SHLD-NEXT: retq
;
; X64-NO-BMI2-HAVE-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsp, %rbp
; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r14
; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT: andq $-32, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT: subq $128, %rsp
; X64-NO-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1
; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, (%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %edi
; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %eax
; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %eax
; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rsi), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rsi), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rsp,%rsi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rsp,%rsi), %r9
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, %r10
; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r10
; X64-NO-BMI2-HAVE-SHLD-NEXT: notl %edi
; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %edi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rsi), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rsp,%rsi), %r11
; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq (%r11,%r11), %rbx
; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rbx
; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r10, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rsi), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rsp,%rsi), %r10
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, %r14
; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r14
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rsi), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 32(%rsp,%rsi), %rsi
; X64-NO-BMI2-HAVE-SHLD-NEXT: addq %rsi, %rsi
; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rsi
@@ -3024,90 +3155,101 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, (%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 24(%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, 8(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: addq $8, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT: leaq -16(%rbp), %rsp
; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbx
; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbp
; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsp, %rbp
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andq $-32, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: subq $160, %rsp
; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %edi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, %ecx
; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, -128(%rsp,%rsi), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rsi), %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r11, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, %r9, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, (%rsp,%rsi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %dil
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rsp,%rsi), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rsp,%rsi), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r10,%r10), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rdi, %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r10, %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r10,%r10), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, %r10, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rsi), %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r10,%r10), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rbx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, %r10, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rsi), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r11,%r11), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r10, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 24(%rsp,%rsi), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r11,%r11), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rdi, %rbx, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 32(%rsp,%rsi), %rsi
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 24(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq -8(%rbp), %rsp
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbp
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsp, %rbp
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andq $-32, %rsp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: subq $160, %rsp
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, (%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %eax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %ecx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rsi), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rsi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rsp,%rsi), %r8
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r8, %r9
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notl %eax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rsi), %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rsp,%rsi), %r10
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq (%r10,%r10), %r11
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rax, %r11, %r11
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r9, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rsi), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 24(%rsp,%rsi), %r9
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r9, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rsi), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 32(%rsp,%rsi), %rsi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: addq %rsi, %rsi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rax, %rsi, %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %rbx, %rax
@@ -3118,7 +3260,9 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 24(%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leaq -8(%rbp), %rsp
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbp
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
index bed8e5806380c..1fe9a148c2bee 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
@@ -1193,20 +1193,26 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-LABEL: load_1byte_chunk_of_32byte_alloca:
; X64: # %bb.0:
+; X64-NEXT: pushq %rbp
+; X64-NEXT: movq %rsp, %rbp
+; X64-NEXT: andq $-32, %rsp
+; X64-NEXT: subq $96, %rsp
; X64-NEXT: movups (%rdi), %xmm0
; X64-NEXT: movups 16(%rdi), %xmm1
; X64-NEXT: leal (,%rsi,8), %eax
; X64-NEXT: xorps %xmm2, %xmm2
-; X64-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm0, (%rsp)
; X64-NEXT: shrb $6, %al
; X64-NEXT: movzbl %al, %eax
-; X64-NEXT: leaq -72(%rsp,%rax,8), %rax
+; X64-NEXT: leaq (%rsp,%rax,8), %rax
; X64-NEXT: andl $7, %esi
; X64-NEXT: movzbl (%rsi,%rax), %eax
; X64-NEXT: movb %al, (%rdx)
+; X64-NEXT: movq %rbp, %rsp
+; X64-NEXT: popq %rbp
; X64-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca:
@@ -1310,20 +1316,24 @@ define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
define void @load_2byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_2byte_chunk_of_32byte_alloca:
; X64-NO-BMI2: # %bb.0:
+; X64-NO-BMI2-NEXT: pushq %rbp
+; X64-NO-BMI2-NEXT: movq %rsp, %rbp
+; X64-NO-BMI2-NEXT: andq $-32, %rsp
+; X64-NO-BMI2-NEXT: subq $96, %rsp
; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm1
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT: xorps %xmm2, %xmm2
-; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, (%rsp)
; X64-NO-BMI2-NEXT: movl %ecx, %eax
; X64-NO-BMI2-NEXT: shrb $6, %al
; X64-NO-BMI2-NEXT: movzbl %al, %eax
-; X64-NO-BMI2-NEXT: movq -72(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-NEXT: movq (%rsp,%rax,8), %rsi
; X64-NO-BMI2-NEXT: shrq %cl, %rsi
-; X64-NO-BMI2-NEXT: movl -64(%rsp,%rax,8), %eax
+; X64-NO-BMI2-NEXT: movl 8(%rsp,%rax,8), %eax
; X64-NO-BMI2-NEXT: addl %eax, %eax
; X64-NO-BMI2-NEXT: andb $56, %cl
; X64-NO-BMI2-NEXT: notb %cl
@@ -1331,30 +1341,38 @@ define void @load_2byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-NO-BMI2-NEXT: shlq %cl, %rax
; X64-NO-BMI2-NEXT: orl %esi, %eax
; X64-NO-BMI2-NEXT: movw %ax, (%rdx)
+; X64-NO-BMI2-NEXT: movq %rbp, %rsp
+; X64-NO-BMI2-NEXT: popq %rbp
; X64-NO-BMI2-NEXT: retq
;
; X64-BMI2-LABEL: load_2byte_chunk_of_32byte_alloca:
; X64-BMI2: # %bb.0:
+; X64-BMI2-NEXT: pushq %rbp
+; X64-BMI2-NEXT: movq %rsp, %rbp
+; X64-BMI2-NEXT: andq $-32, %rsp
+; X64-BMI2-NEXT: subq $96, %rsp
; X64-BMI2-NEXT: movups (%rdi), %xmm0
; X64-BMI2-NEXT: movups 16(%rdi), %xmm1
; X64-BMI2-NEXT: shll $3, %esi
; X64-BMI2-NEXT: xorps %xmm2, %xmm2
-; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, (%rsp)
; X64-BMI2-NEXT: movl %esi, %eax
; X64-BMI2-NEXT: shrb $6, %al
; X64-BMI2-NEXT: movzbl %al, %eax
-; X64-BMI2-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-BMI2-NEXT: shrxq %rsi, (%rsp,%rax,8), %rcx
; X64-BMI2-NEXT: # kill: def $sil killed $sil killed $rsi def $rsi
; X64-BMI2-NEXT: andb $56, %sil
; X64-BMI2-NEXT: notb %sil
-; X64-BMI2-NEXT: movl -64(%rsp,%rax,8), %eax
+; X64-BMI2-NEXT: movl 8(%rsp,%rax,8), %eax
; X64-BMI2-NEXT: addl %eax, %eax
; X64-BMI2-NEXT: shlxq %rsi, %rax, %rax
; X64-BMI2-NEXT: orl %eax, %ecx
; X64-BMI2-NEXT: movw %cx, (%rdx)
+; X64-BMI2-NEXT: movq %rbp, %rsp
+; X64-BMI2-NEXT: popq %rbp
; X64-BMI2-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_32byte_alloca:
@@ -1457,20 +1475,24 @@ define void @load_2byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
define void @load_4byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_4byte_chunk_of_32byte_alloca:
; X64-NO-BMI2: # %bb.0:
+; X64-NO-BMI2-NEXT: pushq %rbp
+; X64-NO-BMI2-NEXT: movq %rsp, %rbp
+; X64-NO-BMI2-NEXT: andq $-32, %rsp
+; X64-NO-BMI2-NEXT: subq $96, %rsp
; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0
; X64-NO-BMI2-NEXT: movups 16(%rdi), %xmm1
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
; X64-NO-BMI2-NEXT: xorps %xmm2, %xmm2
-; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NEXT: movaps %xmm0, (%rsp)
; X64-NO-BMI2-NEXT: movl %ecx, %eax
; X64-NO-BMI2-NEXT: shrb $6, %al
; X64-NO-BMI2-NEXT: movzbl %al, %eax
-; X64-NO-BMI2-NEXT: movq -72(%rsp,%rax,8), %rsi
+; X64-NO-BMI2-NEXT: movq (%rsp,%rax,8), %rsi
; X64-NO-BMI2-NEXT: shrq %cl, %rsi
-; X64-NO-BMI2-NEXT: movl -64(%rsp,%rax,8), %eax
+; X64-NO-BMI2-NEXT: movl 8(%rsp,%rax,8), %eax
; X64-NO-BMI2-NEXT: addl %eax, %eax
; X64-NO-BMI2-NEXT: andb $56, %cl
; X64-NO-BMI2-NEXT: notb %cl
@@ -1478,30 +1500,38 @@ define void @load_4byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-NO-BMI2-NEXT: shlq %cl, %rax
; X64-NO-BMI2-NEXT: orl %esi, %eax
; X64-NO-BMI2-NEXT: movl %eax, (%rdx)
+; X64-NO-BMI2-NEXT: movq %rbp, %rsp
+; X64-NO-BMI2-NEXT: popq %rbp
; X64-NO-BMI2-NEXT: retq
;
; X64-BMI2-LABEL: load_4byte_chunk_of_32byte_alloca:
; X64-BMI2: # %bb.0:
+; X64-BMI2-NEXT: pushq %rbp
+; X64-BMI2-NEXT: movq %rsp, %rbp
+; X64-BMI2-NEXT: andq $-32, %rsp
+; X64-BMI2-NEXT: subq $96, %rsp
; X64-BMI2-NEXT: movups (%rdi), %xmm0
; X64-BMI2-NEXT: movups 16(%rdi), %xmm1
; X64-BMI2-NEXT: shll $3, %esi
; X64-BMI2-NEXT: xorps %xmm2, %xmm2
-; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-BMI2-NEXT: movaps %xmm0, (%rsp)
; X64-BMI2-NEXT: movl %esi, %eax
; X64-BMI2-NEXT: shrb $6, %al
; X64-BMI2-NEXT: movzbl %al, %eax
-; X64-BMI2-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-BMI2-NEXT: shrxq %rsi, (%rsp,%rax,8), %rcx
; X64-BMI2-NEXT: # kill: def $sil killed $sil killed $rsi def $rsi
; X64-BMI2-NEXT: andb $56, %sil
; X64-BMI2-NEXT: notb %sil
-; X64-BMI2-NEXT: movl -64(%rsp,%rax,8), %eax
+; X64-BMI2-NEXT: movl 8(%rsp,%rax,8), %eax
; X64-BMI2-NEXT: addl %eax, %eax
; X64-BMI2-NEXT: shlxq %rsi, %rax, %rax
; X64-BMI2-NEXT: orl %eax, %ecx
; X64-BMI2-NEXT: movl %ecx, (%rdx)
+; X64-BMI2-NEXT: movq %rbp, %rsp
+; X64-BMI2-NEXT: popq %rbp
; X64-BMI2-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_32byte_alloca:
@@ -1604,19 +1634,23 @@ define void @load_4byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsp, %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT: andq $-32, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT: subq $96, %rsp
; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
; X64-NO-BMI2-NO-SHLD-NEXT: shrb $6, %al
; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax,8), %rsi
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rax,8), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rsp,%rax,8), %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rsp,%rax,8), %rax
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rsi
; X64-NO-BMI2-NO-SHLD-NEXT: notb %cl
; X64-NO-BMI2-NO-SHLD-NEXT: addq %rax, %rax
@@ -1624,48 +1658,62 @@ define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rax
; X64-NO-BMI2-NO-SHLD-NEXT: orq %rsi, %rax
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbp, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbp
; X64-NO-BMI2-NO-SHLD-NEXT: retq
;
; X64-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
; X64-SHLD: # %bb.0:
+; X64-SHLD-NEXT: pushq %rbp
+; X64-SHLD-NEXT: movq %rsp, %rbp
+; X64-SHLD-NEXT: andq $-32, %rsp
+; X64-SHLD-NEXT: subq $96, %rsp
; X64-SHLD-NEXT: movups (%rdi), %xmm0
; X64-SHLD-NEXT: movups 16(%rdi), %xmm1
; X64-SHLD-NEXT: leal (,%rsi,8), %ecx
; X64-SHLD-NEXT: xorps %xmm2, %xmm2
-; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-SHLD-NEXT: movaps %xmm0, (%rsp)
; X64-SHLD-NEXT: movl %ecx, %eax
; X64-SHLD-NEXT: shrb $6, %al
; X64-SHLD-NEXT: movzbl %al, %eax
-; X64-SHLD-NEXT: movq -72(%rsp,%rax,8), %rsi
-; X64-SHLD-NEXT: movq -64(%rsp,%rax,8), %rax
+; X64-SHLD-NEXT: movq (%rsp,%rax,8), %rsi
+; X64-SHLD-NEXT: movq 8(%rsp,%rax,8), %rax
; X64-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-SHLD-NEXT: shrdq %cl, %rax, %rsi
; X64-SHLD-NEXT: movq %rsi, (%rdx)
+; X64-SHLD-NEXT: movq %rbp, %rsp
+; X64-SHLD-NEXT: popq %rbp
; X64-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsp, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andq $-32, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: subq $96, %rsp
; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %al
; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %al, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, (%rsp,%rax,8), %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rsp,%rax,8), %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbp, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbp
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_32byte_alloca:
@@ -1807,19 +1855,23 @@ define void @load_8byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
+; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsp, %rbp
+; X64-NO-BMI2-NO-SHLD-NEXT: andq $-32, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT: subq $96, %rsp
; X64-NO-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax
; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%rsp)
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrb $6, %cl
; X64-NO-BMI2-NO-SHLD-NEXT: movzbl %cl, %edi
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rdi,8), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rdi,8), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rsp,%rdi,8), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rsp,%rdi,8), %r9
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r8
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi
@@ -1830,35 +1882,41 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst
; X64-NO-BMI2-NO-SHLD-NEXT: orq %r8, %r10
; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rdi,8), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rsp,%rdi,8), %rax
; X64-NO-BMI2-NO-SHLD-NEXT: addq %rax, %rax
; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rax
; X64-NO-BMI2-NO-SHLD-NEXT: orq %r9, %rax
; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, 8(%rdx)
; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbp, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbp
; X64-NO-BMI2-NO-SHLD-NEXT: retq
;
; X64-NO-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
+; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsp, %rbp
+; X64-NO-BMI2-HAVE-SHLD-NEXT: andq $-32, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT: subq $96, %rsp
; X64-NO-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1
; X64-NO-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %eax
; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, (%rsp)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrb $6, %cl
; X64-NO-BMI2-HAVE-SHLD-NEXT: movzbl %cl, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rsi,8), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rsi,8), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rsp,%rsi,8), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rsp,%rsi,8), %r8
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, %r9
; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r9
; X64-NO-BMI2-HAVE-SHLD-NEXT: notb %cl
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rsi,8), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rsp,%rsi,8), %rsi
; X64-NO-BMI2-HAVE-SHLD-NEXT: addq %rsi, %rsi
; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rsi
; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r9, %rsi
@@ -1866,26 +1924,32 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst
; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx)
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbp, %rsp
+; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbp
; X64-NO-BMI2-HAVE-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsp, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andq $-32, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: subq $96, %rsp
; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %cl
; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -72(%rsp,%rcx,8), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, (%rsp,%rcx,8), %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rcx,8), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rcx,8), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rsp,%rcx,8), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rsp,%rcx,8), %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r9, %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %r9
@@ -1895,28 +1959,34 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 8(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbp, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbp
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsp, %rbp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andq $-32, %rsp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: subq $96, %rsp
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movups 16(%rdi), %xmm1
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm2, %xmm2
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, (%rsp)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrb $6, %al
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movzbl %al, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax,8), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rax,8), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rsp,%rax,8), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rsp,%rax,8), %rdi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rdi, %r8
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %r9d
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notb %r9b
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -56(%rsp,%rax,8), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rsp,%rax,8), %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: addq %rax, %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %r9, %rax, %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %r8, %rax
@@ -1924,6 +1994,8 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %rsi
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx)
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbp, %rsp
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbp
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: retq
;
; X86-NO-BMI2-NO-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
diff --git a/llvm/test/Transforms/ExpandIRInsts/X86/expand-large-fp-convert-fptosi129.ll b/llvm/test/Transforms/ExpandIRInsts/X86/expand-large-fp-convert-fptosi129.ll
index 23c80f4bb523d..326562a425086 100644
--- a/llvm/test/Transforms/ExpandIRInsts/X86/expand-large-fp-convert-fptosi129.ll
+++ b/llvm/test/Transforms/ExpandIRInsts/X86/expand-large-fp-convert-fptosi129.ll
@@ -4,8 +4,7 @@
define i129 @halftosi129(half %a) {
; CHECK-LABEL: @halftosi129(
-; CHECK-NEXT: [[TMP1:%.*]] = fptosi half [[A:%.*]] to i32
-; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i129
+; CHECK-NEXT: [[TMP2:%.*]] = fptosi half [[A:%.*]] to i129
; CHECK-NEXT: ret i129 [[TMP2]]
;
%conv = fptosi half %a to i129
@@ -14,35 +13,7 @@ define i129 @halftosi129(half %a) {
define i129 @floattosi129(float %a) {
; CHECK-LABEL: @floattosi129(
-; CHECK-NEXT: fp-to-i-entry:
-; CHECK-NEXT: [[A:%.*]] = freeze float [[A1:%.*]]
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[A]] to i32
-; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[TMP0]], -1
-; CHECK-NEXT: [[SIGN:%.*]] = select i1 [[TMP2]], i129 1, i129 -1
-; CHECK-NEXT: [[TMP5:%.*]] = lshr i32 [[TMP0]], 23
-; CHECK-NEXT: [[BIASED_EXP:%.*]] = and i32 [[TMP5]], 255
-; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP0]], 8388607
-; CHECK-NEXT: [[SIGNIFICAND1:%.*]] = or i32 [[TMP3]], 8388608
-; CHECK-NEXT: [[EXP_IS_NEGATIVE:%.*]] = icmp ult i32 [[BIASED_EXP]], 127
-; CHECK-NEXT: br i1 [[EXP_IS_NEGATIVE]], label [[FP_TO_I_CLEANUP:%.*]], label [[FP_TO_I_IF_CHECK_SATURATE:%.*]]
-; CHECK: fp-to-i-if-check.exp.size:
-; CHECK-NEXT: [[EXP_SMALLER_MANTISSA_WIDTH:%.*]] = icmp ult i32 [[BIASED_EXP]], 150
-; CHECK-NEXT: br i1 [[EXP_SMALLER_MANTISSA_WIDTH]], label [[FP_TO_I_IF_EXP_SMALL:%.*]], label [[FP_TO_I_IF_EXP_LARGE:%.*]]
-; CHECK: fp-to-i-if-exp.small:
-; CHECK-NEXT: [[TMP14:%.*]] = sub i32 150, [[BIASED_EXP]]
-; CHECK-NEXT: [[TMP7:%.*]] = lshr i32 [[SIGNIFICAND1]], [[TMP14]]
-; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i129
-; CHECK-NEXT: [[TMP9:%.*]] = mul i129 [[TMP8]], [[SIGN]]
-; CHECK-NEXT: br label [[FP_TO_I_CLEANUP]]
-; CHECK: fp-to-i-if-exp.large:
-; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[BIASED_EXP]], -150
-; CHECK-NEXT: [[SIGNIFICAND:%.*]] = zext i32 [[SIGNIFICAND1]] to i129
-; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP15]] to i129
-; CHECK-NEXT: [[TMP11:%.*]] = shl i129 [[SIGNIFICAND]], [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = mul i129 [[TMP11]], [[SIGN]]
-; CHECK-NEXT: br label [[FP_TO_I_CLEANUP]]
-; CHECK: fp-to-i-cleanup:
-; CHECK-NEXT: [[TMP13:%.*]] = phi i129 [ [[TMP9]], [[FP_TO_I_IF_EXP_SMALL]] ], [ [[TMP12]], [[FP_TO_I_IF_EXP_LARGE]] ], [ 0, [[FP_TO_I_ENTRY:%.*]] ]
+; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[A:%.*]] to i129
; CHECK-NEXT: ret i129 [[TMP13]]
;
%conv = fptosi float %a to i129
@@ -51,35 +22,7 @@ define i129 @floattosi129(float %a) {
define i129 @doubletosi129(double %a) {
; CHECK-LABEL: @doubletosi129(
-; CHECK-NEXT: fp-to-i-entry:
-; CHECK-NEXT: [[A:%.*]] = freeze double [[A1:%.*]]
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast double [[A]] to i64
-; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt i64 [[TMP0]], -1
-; CHECK-NEXT: [[SIGN:%.*]] = select i1 [[TMP2]], i129 1, i129 -1
-; CHECK-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP0]], 52
-; CHECK-NEXT: [[BIASED_EXP:%.*]] = and i64 [[TMP5]], 2047
-; CHECK-NEXT: [[TMP3:%.*]] = and i64 [[TMP0]], 4503599627370495
-; CHECK-NEXT: [[SIGNIFICAND1:%.*]] = or i64 [[TMP3]], 4503599627370496
-; CHECK-NEXT: [[EXP_IS_NEGATIVE:%.*]] = icmp ult i64 [[BIASED_EXP]], 1023
-; CHECK-NEXT: br i1 [[EXP_IS_NEGATIVE]], label [[FP_TO_I_CLEANUP:%.*]], label [[FP_TO_I_IF_CHECK_SATURATE:%.*]]
-; CHECK: fp-to-i-if-check.exp.size:
-; CHECK-NEXT: [[EXP_SMALLER_MANTISSA_WIDTH:%.*]] = icmp ult i64 [[BIASED_EXP]], 1075
-; CHECK-NEXT: br i1 [[EXP_SMALLER_MANTISSA_WIDTH]], label [[FP_TO_I_IF_EXP_SMALL:%.*]], label [[FP_TO_I_IF_EXP_LARGE:%.*]]
-; CHECK: fp-to-i-if-exp.small:
-; CHECK-NEXT: [[TMP14:%.*]] = sub i64 1075, [[BIASED_EXP]]
-; CHECK-NEXT: [[TMP7:%.*]] = lshr i64 [[SIGNIFICAND1]], [[TMP14]]
-; CHECK-NEXT: [[TMP8:%.*]] = zext i64 [[TMP7]] to i129
-; CHECK-NEXT: [[TMP9:%.*]] = mul i129 [[TMP8]], [[SIGN]]
-; CHECK-NEXT: br label [[FP_TO_I_CLEANUP]]
-; CHECK: fp-to-i-if-exp.large:
-; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[BIASED_EXP]], -1075
-; CHECK-NEXT: [[SIGNIFICAND:%.*]] = zext i64 [[SIGNIFICAND1]] to i129
-; CHECK-NEXT: [[TMP10:%.*]] = zext i64 [[TMP15]] to i129
-; CHECK-NEXT: [[TMP11:%.*]] = shl i129 [[SIGNIFICAND]], [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = mul i129 [[TMP11]], [[SIGN]]
-; CHECK-NEXT: br label [[FP_TO_I_CLEANUP]]
-; CHECK: fp-to-i-cleanup:
-; CHECK-NEXT: [[TMP13:%.*]] = phi i129 [ [[TMP9]], [[FP_TO_I_IF_EXP_SMALL]] ], [ [[TMP12]], [[FP_TO_I_IF_EXP_LARGE]] ], [ 0, [[FP_TO_I_ENTRY:%.*]] ]
+; CHECK-NEXT: [[TMP13:%.*]] = fptosi double [[A:%.*]] to i129
; CHECK-NEXT: ret i129 [[TMP13]]
;
%conv = fptosi double %a to i129
@@ -88,36 +31,7 @@ define i129 @doubletosi129(double %a) {
define i129 @x86_fp80tosi129(x86_fp80 %a) {
; CHECK-LABEL: @x86_fp80tosi129(
-; CHECK-NEXT: fp-to-i-entry:
-; CHECK-NEXT: [[A:%.*]] = freeze x86_fp80 [[A1:%.*]]
-; CHECK-NEXT: [[TMP0:%.*]] = fpext x86_fp80 [[A]] to fp128
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast fp128 [[TMP0]] to i128
-; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt i128 [[TMP1]], -1
-; CHECK-NEXT: [[SIGN:%.*]] = select i1 [[TMP3]], i129 1, i129 -1
-; CHECK-NEXT: [[TMP6:%.*]] = lshr i128 [[TMP1]], 112
-; CHECK-NEXT: [[BIASED_EXP:%.*]] = and i128 [[TMP6]], 32767
-; CHECK-NEXT: [[TMP4:%.*]] = and i128 [[TMP1]], 5192296858534827628530496329220095
-; CHECK-NEXT: [[SIGNIFICAND1:%.*]] = or i128 [[TMP4]], 5192296858534827628530496329220096
-; CHECK-NEXT: [[EXP_IS_NEGATIVE:%.*]] = icmp ult i128 [[BIASED_EXP]], 16383
-; CHECK-NEXT: br i1 [[EXP_IS_NEGATIVE]], label [[FP_TO_I_CLEANUP:%.*]], label [[FP_TO_I_IF_CHECK_SATURATE:%.*]]
-; CHECK: fp-to-i-if-check.exp.size:
-; CHECK-NEXT: [[EXP_SMALLER_MANTISSA_WIDTH:%.*]] = icmp ult i128 [[BIASED_EXP]], 16495
-; CHECK-NEXT: br i1 [[EXP_SMALLER_MANTISSA_WIDTH]], label [[FP_TO_I_IF_EXP_SMALL:%.*]], label [[FP_TO_I_IF_EXP_LARGE:%.*]]
-; CHECK: fp-to-i-if-exp.small:
-; CHECK-NEXT: [[TMP15:%.*]] = sub i128 16495, [[BIASED_EXP]]
-; CHECK-NEXT: [[TMP8:%.*]] = lshr i128 [[SIGNIFICAND1]], [[TMP15]]
-; CHECK-NEXT: [[TMP9:%.*]] = zext i128 [[TMP8]] to i129
-; CHECK-NEXT: [[TMP10:%.*]] = mul i129 [[TMP9]], [[SIGN]]
-; CHECK-NEXT: br label [[FP_TO_I_CLEANUP]]
-; CHECK: fp-to-i-if-exp.large:
-; CHECK-NEXT: [[TMP16:%.*]] = add i128 [[BIASED_EXP]], -16495
-; CHECK-NEXT: [[SIGNIFICAND:%.*]] = zext i128 [[SIGNIFICAND1]] to i129
-; CHECK-NEXT: [[TMP11:%.*]] = zext i128 [[TMP16]] to i129
-; CHECK-NEXT: [[TMP12:%.*]] = shl i129 [[SIGNIFICAND]], [[TMP11]]
-; CHECK-NEXT: [[TMP13:%.*]] = mul i129 [[TMP12]], [[SIGN]]
-; CHECK-NEXT: br label [[FP_TO_I_CLEANUP]]
-; CHECK: fp-to-i-cleanup:
-; CHECK-NEXT: [[TMP14:%.*]] = phi i129 [ [[TMP10]], [[FP_TO_I_IF_EXP_SMALL]] ], [ [[TMP13]], [[FP_TO_I_IF_EXP_LARGE]] ], [ 0, [[FP_TO_I_ENTRY:%.*]] ]
+; CHECK-NEXT: [[TMP14:%.*]] = fptosi x86_fp80 [[A:%.*]] to i129
; CHECK-NEXT: ret i129 [[TMP14]]
;
%conv = fptosi x86_fp80 %a to i129
@@ -126,35 +40,7 @@ define i129 @x86_fp80tosi129(x86_fp80 %a) {
define i129 @fp128tosi129(fp128 %a) {
; CHECK-LABEL: @fp128tosi129(
-; CHECK-NEXT: fp-to-i-entry:
-; CHECK-NEXT: [[A:%.*]] = freeze fp128 [[A1:%.*]]
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast fp128 [[A]] to i128
-; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt i128 [[TMP0]], -1
-; CHECK-NEXT: [[SIGN:%.*]] = select i1 [[TMP2]], i129 1, i129 -1
-; CHECK-NEXT: [[TMP5:%.*]] = lshr i128 [[TMP0]], 112
-; CHECK-NEXT: [[BIASED_EXP:%.*]] = and i128 [[TMP5]], 32767
-; CHECK-NEXT: [[TMP3:%.*]] = and i128 [[TMP0]], 5192296858534827628530496329220095
-; CHECK-NEXT: [[SIGNIFICAND1:%.*]] = or i128 [[TMP3]], 5192296858534827628530496329220096
-; CHECK-NEXT: [[EXP_IS_NEGATIVE:%.*]] = icmp ult i128 [[BIASED_EXP]], 16383
-; CHECK-NEXT: br i1 [[EXP_IS_NEGATIVE]], label [[FP_TO_I_CLEANUP:%.*]], label [[FP_TO_I_IF_CHECK_SATURATE:%.*]]
-; CHECK: fp-to-i-if-check.exp.size:
-; CHECK-NEXT: [[EXP_SMALLER_MANTISSA_WIDTH:%.*]] = icmp ult i128 [[BIASED_EXP]], 16495
-; CHECK-NEXT: br i1 [[EXP_SMALLER_MANTISSA_WIDTH]], label [[FP_TO_I_IF_EXP_SMALL:%.*]], label [[FP_TO_I_IF_EXP_LARGE:%.*]]
-; CHECK: fp-to-i-if-exp.small:
-; CHECK-NEXT: [[TMP14:%.*]] = sub i128 16495, [[BIASED_EXP]]
-; CHECK-NEXT: [[TMP7:%.*]] = lshr i128 [[SIGNIFICAND1]], [[TMP14]]
-; CHECK-NEXT: [[TMP8:%.*]] = zext i128 [[TMP7]] to i129
-; CHECK-NEXT: [[TMP9:%.*]] = mul i129 [[TMP8]], [[SIGN]]
-; CHECK-NEXT: br label [[FP_TO_I_CLEANUP]]
-; CHECK: fp-to-i-if-exp.large:
-; CHECK-NEXT: [[TMP15:%.*]] = add i128 [[BIASED_EXP]], -16495
-; CHECK-NEXT: [[SIGNIFICAND:%.*]] = zext i128 [[SIGNIFICAND1]] to i129
-; CHECK-NEXT: [[TMP10:%.*]] = zext i128 [[TMP15]] to i129
-; CHECK-NEXT: [[TMP11:%.*]] = shl i129 [[SIGNIFICAND]], [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = mul i129 [[TMP11]], [[SIGN]]
-; CHECK-NEXT: br label [[FP_TO_I_CLEANUP]]
-; CHECK: fp-to-i-cleanup:
-; CHECK-NEXT: [[TMP13:%.*]] = phi i129 [ [[TMP9]], [[FP_TO_I_IF_EXP_SMALL]] ], [ [[TMP12]], [[FP_TO_I_IF_EXP_LARGE]] ], [ 0, [[FP_TO_I_ENTRY:%.*]] ]
+; CHECK-NEXT: [[TMP13:%.*]] = fptosi fp128 [[A:%.*]] to i129
; CHECK-NEXT: ret i129 [[TMP13]]
;
%conv = fptosi fp128 %a to i129
@@ -163,67 +49,7 @@ define i129 @fp128tosi129(fp128 %a) {
define <2 x i129> @floattosi129v2(<2 x float> %a) {
; CHECK-LABEL: @floattosi129v2(
-; CHECK-NEXT: fp-to-i-entryfp-to-i-entry:
-; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x float> [[A:%.*]], i64 0
-; CHECK-NEXT: [[TMP2:%.*]] = freeze float [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[TMP2]] to i32
-; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt i32 [[TMP1]], -1
-; CHECK-NEXT: [[SIGN7:%.*]] = select i1 [[TMP3]], i129 1, i129 -1
-; CHECK-NEXT: [[TMP6:%.*]] = lshr i32 [[TMP1]], 23
-; CHECK-NEXT: [[BIASED_EXP8:%.*]] = and i32 [[TMP6]], 255
-; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP1]], 8388607
-; CHECK-NEXT: [[SIGNIFICAND10:%.*]] = or i32 [[TMP4]], 8388608
-; CHECK-NEXT: [[EXP_IS_NEGATIVE10:%.*]] = icmp ult i32 [[BIASED_EXP8]], 127
-; CHECK-NEXT: br i1 [[EXP_IS_NEGATIVE10]], label [[FP_TO_I_CLEANUP1:%.*]], label [[FP_TO_I_IF_CHECK_SATURATE2:%.*]]
-; CHECK: fp-to-i-if-check.exp.size2:
-; CHECK-NEXT: [[EXP_SMALLER_MANTISSA_WIDTH12:%.*]] = icmp ult i32 [[BIASED_EXP8]], 150
-; CHECK-NEXT: br i1 [[EXP_SMALLER_MANTISSA_WIDTH12]], label [[FP_TO_I_IF_EXP_SMALL5:%.*]], label [[FP_TO_I_IF_EXP_LARGE6:%.*]]
-; CHECK: fp-to-i-if-exp.small3:
-; CHECK-NEXT: [[TMP18:%.*]] = sub i32 150, [[BIASED_EXP8]]
-; CHECK-NEXT: [[TMP8:%.*]] = lshr i32 [[SIGNIFICAND10]], [[TMP18]]
-; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i129
-; CHECK-NEXT: [[TMP10:%.*]] = mul i129 [[TMP9]], [[SIGN7]]
-; CHECK-NEXT: br label [[FP_TO_I_CLEANUP1]]
-; CHECK: fp-to-i-if-exp.large4:
-; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[BIASED_EXP8]], -150
-; CHECK-NEXT: [[SIGNIFICAND9:%.*]] = zext i32 [[SIGNIFICAND10]] to i129
-; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP20]] to i129
-; CHECK-NEXT: [[TMP12:%.*]] = shl i129 [[SIGNIFICAND9]], [[TMP11]]
-; CHECK-NEXT: [[TMP13:%.*]] = mul i129 [[TMP12]], [[SIGN7]]
-; CHECK-NEXT: br label [[FP_TO_I_CLEANUP1]]
-; CHECK: fp-to-i-cleanup1:
-; CHECK-NEXT: [[TMP14:%.*]] = phi i129 [ [[TMP10]], [[FP_TO_I_IF_EXP_SMALL5]] ], [ [[TMP13]], [[FP_TO_I_IF_EXP_LARGE6]] ], [ 0, [[FP_TO_I_ENTRYFP_TO_I_ENTRY:%.*]] ]
-; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i129> poison, i129 [[TMP14]], i64 0
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[A]], i64 1
-; CHECK-NEXT: [[TMP35:%.*]] = freeze float [[TMP16]]
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast float [[TMP35]] to i32
-; CHECK-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], -1
-; CHECK-NEXT: [[SIGN:%.*]] = select i1 [[TMP19]], i129 1, i129 -1
-; CHECK-NEXT: [[TMP21:%.*]] = lshr i32 [[TMP17]], 23
-; CHECK-NEXT: [[BIASED_EXP:%.*]] = and i32 [[TMP21]], 255
-; CHECK-NEXT: [[TMP22:%.*]] = and i32 [[TMP17]], 8388607
-; CHECK-NEXT: [[SIGNIFICAND1:%.*]] = or i32 [[TMP22]], 8388608
-; CHECK-NEXT: [[EXP_IS_NEGATIVE:%.*]] = icmp ult i32 [[BIASED_EXP]], 127
-; CHECK-NEXT: br i1 [[EXP_IS_NEGATIVE]], label [[FP_TO_I_CLEANUP:%.*]], label [[FP_TO_I_IF_CHECK_SATURATE:%.*]]
-; CHECK: fp-to-i-if-check.exp.size:
-; CHECK-NEXT: [[EXP_SMALLER_MANTISSA_WIDTH:%.*]] = icmp ult i32 [[BIASED_EXP]], 150
-; CHECK-NEXT: br i1 [[EXP_SMALLER_MANTISSA_WIDTH]], label [[FP_TO_I_IF_EXP_SMALL:%.*]], label [[FP_TO_I_IF_EXP_LARGE:%.*]]
-; CHECK: fp-to-i-if-exp.small:
-; CHECK-NEXT: [[TMP32:%.*]] = sub i32 150, [[BIASED_EXP]]
-; CHECK-NEXT: [[TMP33:%.*]] = lshr i32 [[SIGNIFICAND1]], [[TMP32]]
-; CHECK-NEXT: [[TMP25:%.*]] = zext i32 [[TMP33]] to i129
-; CHECK-NEXT: [[TMP26:%.*]] = mul i129 [[TMP25]], [[SIGN]]
-; CHECK-NEXT: br label [[FP_TO_I_CLEANUP]]
-; CHECK: fp-to-i-if-exp.large:
-; CHECK-NEXT: [[TMP34:%.*]] = add i32 [[BIASED_EXP]], -150
-; CHECK-NEXT: [[SIGNIFICAND:%.*]] = zext i32 [[SIGNIFICAND1]] to i129
-; CHECK-NEXT: [[TMP27:%.*]] = zext i32 [[TMP34]] to i129
-; CHECK-NEXT: [[TMP28:%.*]] = shl i129 [[SIGNIFICAND]], [[TMP27]]
-; CHECK-NEXT: [[TMP29:%.*]] = mul i129 [[TMP28]], [[SIGN]]
-; CHECK-NEXT: br label [[FP_TO_I_CLEANUP]]
-; CHECK: fp-to-i-cleanup:
-; CHECK-NEXT: [[TMP30:%.*]] = phi i129 [ [[TMP26]], [[FP_TO_I_IF_EXP_SMALL]] ], [ [[TMP29]], [[FP_TO_I_IF_EXP_LARGE]] ], [ 0, [[FP_TO_I_CLEANUP1]] ]
-; CHECK-NEXT: [[TMP31:%.*]] = insertelement <2 x i129> [[TMP15]], i129 [[TMP30]], i64 1
+; CHECK-NEXT: [[TMP31:%.*]] = fptosi <2 x float> [[A:%.*]] to <2 x i129>
; CHECK-NEXT: ret <2 x i129> [[TMP31]]
;
%conv = fptosi <2 x float> %a to <2 x i129>
diff --git a/llvm/test/Transforms/ExpandIRInsts/X86/expand-large-fp-convert-fptoui129.ll b/llvm/test/Transforms/ExpandIRInsts/X86/expand-large-fp-convert-fptoui129.ll
index 864f13fe61624..7f2ea43498de0 100644
--- a/llvm/test/Transforms/ExpandIRInsts/X86/expand-large-fp-convert-fptoui129.ll
+++ b/llvm/test/Transforms/ExpandIRInsts/X86/expand-large-fp-convert-fptoui129.ll
@@ -4,8 +4,7 @@
define i129 @halftoui129(half %a) {
; CHECK-LABEL: @halftoui129(
-; CHECK-NEXT: [[TMP1:%.*]] = fptoui half [[A:%.*]] to i32
-; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i129
+; CHECK-NEXT: [[TMP2:%.*]] = fptoui half [[A:%.*]] to i129
; CHECK-NEXT: ret i129 [[TMP2]]
;
%conv = fptoui half %a to i129
@@ -14,31 +13,7 @@ define i129 @halftoui129(half %a) {
define i129 @floattoui129(float %a) {
; CHECK-LABEL: @floattoui129(
-; CHECK-NEXT: fp-to-i-entry:
-; CHECK-NEXT: [[A:%.*]] = freeze float [[A1:%.*]]
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[A]] to i32
-; CHECK-NEXT: [[TMP5:%.*]] = lshr i32 [[TMP0]], 23
-; CHECK-NEXT: [[BIASED_EXP:%.*]] = and i32 [[TMP5]], 255
-; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP0]], 8388607
-; CHECK-NEXT: [[SIGNIFICAND1:%.*]] = or i32 [[TMP3]], 8388608
-; CHECK-NEXT: [[EXP_IS_NEGATIVE:%.*]] = icmp ult i32 [[BIASED_EXP]], 127
-; CHECK-NEXT: br i1 [[EXP_IS_NEGATIVE]], label [[FP_TO_I_CLEANUP:%.*]], label [[FP_TO_I_IF_CHECK_SATURATE:%.*]]
-; CHECK: fp-to-i-if-check.exp.size:
-; CHECK-NEXT: [[EXP_SMALLER_MANTISSA_WIDTH:%.*]] = icmp ult i32 [[BIASED_EXP]], 150
-; CHECK-NEXT: br i1 [[EXP_SMALLER_MANTISSA_WIDTH]], label [[FP_TO_I_IF_EXP_SMALL:%.*]], label [[FP_TO_I_IF_EXP_LARGE:%.*]]
-; CHECK: fp-to-i-if-exp.small:
-; CHECK-NEXT: [[TMP14:%.*]] = sub i32 150, [[BIASED_EXP]]
-; CHECK-NEXT: [[TMP7:%.*]] = lshr i32 [[SIGNIFICAND1]], [[TMP14]]
-; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i129
-; CHECK-NEXT: br label [[FP_TO_I_CLEANUP]]
-; CHECK: fp-to-i-if-exp.large:
-; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[BIASED_EXP]], -150
-; CHECK-NEXT: [[SIGNIFICAND:%.*]] = zext i32 [[SIGNIFICAND1]] to i129
-; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP15]] to i129
-; CHECK-NEXT: [[TMP11:%.*]] = shl i129 [[SIGNIFICAND]], [[TMP10]]
-; CHECK-NEXT: br label [[FP_TO_I_CLEANUP]]
-; CHECK: fp-to-i-cleanup:
-; CHECK-NEXT: [[TMP13:%.*]] = phi i129 [ [[TMP8]], [[FP_TO_I_IF_EXP_SMALL]] ], [ [[TMP11]], [[FP_TO_I_IF_EXP_LARGE]] ], [ 0, [[FP_TO_I_ENTRY:%.*]] ]
+; CHECK-NEXT: [[TMP13:%.*]] = fptoui float [[A:%.*]] to i129
; CHECK-NEXT: ret i129 [[TMP13]]
;
%conv = fptoui float %a to i129
@@ -47,31 +22,7 @@ define i129 @floattoui129(float %a) {
define i129 @doubletoui129(double %a) {
; CHECK-LABEL: @doubletoui129(
-; CHECK-NEXT: fp-to-i-entry:
-; CHECK-NEXT: [[A:%.*]] = freeze double [[A1:%.*]]
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast double [[A]] to i64
-; CHECK-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP0]], 52
-; CHECK-NEXT: [[BIASED_EXP:%.*]] = and i64 [[TMP5]], 2047
-; CHECK-NEXT: [[TMP3:%.*]] = and i64 [[TMP0]], 4503599627370495
-; CHECK-NEXT: [[SIGNIFICAND1:%.*]] = or i64 [[TMP3]], 4503599627370496
-; CHECK-NEXT: [[EXP_IS_NEGATIVE:%.*]] = icmp ult i64 [[BIASED_EXP]], 1023
-; CHECK-NEXT: br i1 [[EXP_IS_NEGATIVE]], label [[FP_TO_I_CLEANUP:%.*]], label [[FP_TO_I_IF_CHECK_SATURATE:%.*]]
-; CHECK: fp-to-i-if-check.exp.size:
-; CHECK-NEXT: [[EXP_SMALLER_MANTISSA_WIDTH:%.*]] = icmp ult i64 [[BIASED_EXP]], 1075
-; CHECK-NEXT: br i1 [[EXP_SMALLER_MANTISSA_WIDTH]], label [[FP_TO_I_IF_EXP_SMALL:%.*]], label [[FP_TO_I_IF_EXP_LARGE:%.*]]
-; CHECK: fp-to-i-if-exp.small:
-; CHECK-NEXT: [[TMP14:%.*]] = sub i64 1075, [[BIASED_EXP]]
-; CHECK-NEXT: [[TMP7:%.*]] = lshr i64 [[SIGNIFICAND1]], [[TMP14]]
-; CHECK-NEXT: [[TMP8:%.*]] = zext i64 [[TMP7]] to i129
-; CHECK-NEXT: br label [[FP_TO_I_CLEANUP]]
-; CHECK: fp-to-i-if-exp.large:
-; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[BIASED_EXP]], -1075
-; CHECK-NEXT: [[SIGNIFICAND:%.*]] = zext i64 [[SIGNIFICAND1]] to i129
-; CHECK-NEXT: [[TMP10:%.*]] = zext i64 [[TMP15]] to i129
-; CHECK-NEXT: [[TMP11:%.*]] = shl i129 [[SIGNIFICAND]], [[TMP10]]
-; CHECK-NEXT: br label [[FP_TO_I_CLEANUP]]
-; CHECK: fp-to-i-cleanup:
-; CHECK-NEXT: [[TMP13:%.*]] = phi i129 [ [[TMP8]], [[FP_TO_I_IF_EXP_SMALL]] ], [ [[TMP11]], [[FP_TO_I_IF_EXP_LARGE]] ], [ 0, [[FP_TO_I_ENTRY:%.*]] ]
+; CHECK-NEXT: [[TMP13:%.*]] = fptoui double [[A:%.*]] to i129
; CHECK-NEXT: ret i129 [[TMP13]]
;
%conv = fptoui double %a to i129
@@ -80,32 +31,7 @@ define i129 @doubletoui129(double %a) {
define i129 @x86_fp80toui129(x86_fp80 %a) {
; CHECK-LABEL: @x86_fp80toui129(
-; CHECK-NEXT: fp-to-i-entry:
-; CHECK-NEXT: [[A:%.*]] = freeze x86_fp80 [[A1:%.*]]
-; CHECK-NEXT: [[TMP0:%.*]] = fpext x86_fp80 [[A]] to fp128
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast fp128 [[TMP0]] to i128
-; CHECK-NEXT: [[TMP6:%.*]] = lshr i128 [[TMP1]], 112
-; CHECK-NEXT: [[BIASED_EXP:%.*]] = and i128 [[TMP6]], 32767
-; CHECK-NEXT: [[TMP4:%.*]] = and i128 [[TMP1]], 5192296858534827628530496329220095
-; CHECK-NEXT: [[SIGNIFICAND1:%.*]] = or i128 [[TMP4]], 5192296858534827628530496329220096
-; CHECK-NEXT: [[EXP_IS_NEGATIVE:%.*]] = icmp ult i128 [[BIASED_EXP]], 16383
-; CHECK-NEXT: br i1 [[EXP_IS_NEGATIVE]], label [[FP_TO_I_CLEANUP:%.*]], label [[FP_TO_I_IF_CHECK_SATURATE:%.*]]
-; CHECK: fp-to-i-if-check.exp.size:
-; CHECK-NEXT: [[EXP_SMALLER_MANTISSA_WIDTH:%.*]] = icmp ult i128 [[BIASED_EXP]], 16495
-; CHECK-NEXT: br i1 [[EXP_SMALLER_MANTISSA_WIDTH]], label [[FP_TO_I_IF_EXP_SMALL:%.*]], label [[FP_TO_I_IF_EXP_LARGE:%.*]]
-; CHECK: fp-to-i-if-exp.small:
-; CHECK-NEXT: [[TMP15:%.*]] = sub i128 16495, [[BIASED_EXP]]
-; CHECK-NEXT: [[TMP8:%.*]] = lshr i128 [[SIGNIFICAND1]], [[TMP15]]
-; CHECK-NEXT: [[TMP9:%.*]] = zext i128 [[TMP8]] to i129
-; CHECK-NEXT: br label [[FP_TO_I_CLEANUP]]
-; CHECK: fp-to-i-if-exp.large:
-; CHECK-NEXT: [[TMP16:%.*]] = add i128 [[BIASED_EXP]], -16495
-; CHECK-NEXT: [[SIGNIFICAND:%.*]] = zext i128 [[SIGNIFICAND1]] to i129
-; CHECK-NEXT: [[TMP11:%.*]] = zext i128 [[TMP16]] to i129
-; CHECK-NEXT: [[TMP12:%.*]] = shl i129 [[SIGNIFICAND]], [[TMP11]]
-; CHECK-NEXT: br label [[FP_TO_I_CLEANUP]]
-; CHECK: fp-to-i-cleanup:
-; CHECK-NEXT: [[TMP14:%.*]] = phi i129 [ [[TMP9]], [[FP_TO_I_IF_EXP_SMALL]] ], [ [[TMP12]], [[FP_TO_I_IF_EXP_LARGE]] ], [ 0, [[FP_TO_I_ENTRY:%.*]] ]
+; CHECK-NEXT: [[TMP14:%.*]] = fptoui x86_fp80 [[A:%.*]] to i129
; CHECK-NEXT: ret i129 [[TMP14]]
;
%conv = fptoui x86_fp80 %a to i129
@@ -114,31 +40,7 @@ define i129 @x86_fp80toui129(x86_fp80 %a) {
define i129 @fp128toui129(fp128 %a) {
; CHECK-LABEL: @fp128toui129(
-; CHECK-NEXT: fp-to-i-entry:
-; CHECK-NEXT: [[A:%.*]] = freeze fp128 [[A1:%.*]]
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast fp128 [[A]] to i128
-; CHECK-NEXT: [[TMP5:%.*]] = lshr i128 [[TMP0]], 112
-; CHECK-NEXT: [[BIASED_EXP:%.*]] = and i128 [[TMP5]], 32767
-; CHECK-NEXT: [[TMP3:%.*]] = and i128 [[TMP0]], 5192296858534827628530496329220095
-; CHECK-NEXT: [[SIGNIFICAND1:%.*]] = or i128 [[TMP3]], 5192296858534827628530496329220096
-; CHECK-NEXT: [[EXP_IS_NEGATIVE:%.*]] = icmp ult i128 [[BIASED_EXP]], 16383
-; CHECK-NEXT: br i1 [[EXP_IS_NEGATIVE]], label [[FP_TO_I_CLEANUP:%.*]], label [[FP_TO_I_IF_CHECK_SATURATE:%.*]]
-; CHECK: fp-to-i-if-check.exp.size:
-; CHECK-NEXT: [[EXP_SMALLER_MANTISSA_WIDTH:%.*]] = icmp ult i128 [[BIASED_EXP]], 16495
-; CHECK-NEXT: br i1 [[EXP_SMALLER_MANTISSA_WIDTH]], label [[FP_TO_I_IF_EXP_SMALL:%.*]], label [[FP_TO_I_IF_EXP_LARGE:%.*]]
-; CHECK: fp-to-i-if-exp.small:
-; CHECK-NEXT: [[TMP14:%.*]] = sub i128 16495, [[BIASED_EXP]]
-; CHECK-NEXT: [[TMP7:%.*]] = lshr i128 [[SIGNIFICAND1]], [[TMP14]]
-; CHECK-NEXT: [[TMP8:%.*]] = zext i128 [[TMP7]] to i129
-; CHECK-NEXT: br label [[FP_TO_I_CLEANUP]]
-; CHECK: fp-to-i-if-exp.large:
-; CHECK-NEXT: [[TMP15:%.*]] = add i128 [[BIASED_EXP]], -16495
-; CHECK-NEXT: [[SIGNIFICAND:%.*]] = zext i128 [[SIGNIFICAND1]] to i129
-; CHECK-NEXT: [[TMP10:%.*]] = zext i128 [[TMP15]] to i129
-; CHECK-NEXT: [[TMP11:%.*]] = shl i129 [[SIGNIFICAND]], [[TMP10]]
-; CHECK-NEXT: br label [[FP_TO_I_CLEANUP]]
-; CHECK: fp-to-i-cleanup:
-; CHECK-NEXT: [[TMP13:%.*]] = phi i129 [ [[TMP8]], [[FP_TO_I_IF_EXP_SMALL]] ], [ [[TMP11]], [[FP_TO_I_IF_EXP_LARGE]] ], [ 0, [[FP_TO_I_ENTRY:%.*]] ]
+; CHECK-NEXT: [[TMP13:%.*]] = fptoui fp128 [[A:%.*]] to i129
; CHECK-NEXT: ret i129 [[TMP13]]
;
%conv = fptoui fp128 %a to i129
@@ -147,59 +49,7 @@ define i129 @fp128toui129(fp128 %a) {
define <2 x i129> @floattoui129v2(<2 x float> %a) {
; CHECK-LABEL: @floattoui129v2(
-; CHECK-NEXT: fp-to-i-entryfp-to-i-entry:
-; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x float> [[A:%.*]], i64 0
-; CHECK-NEXT: [[TMP2:%.*]] = freeze float [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[TMP2]] to i32
-; CHECK-NEXT: [[TMP6:%.*]] = lshr i32 [[TMP1]], 23
-; CHECK-NEXT: [[BIASED_EXP8:%.*]] = and i32 [[TMP6]], 255
-; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP1]], 8388607
-; CHECK-NEXT: [[SIGNIFICAND10:%.*]] = or i32 [[TMP4]], 8388608
-; CHECK-NEXT: [[EXP_IS_NEGATIVE10:%.*]] = icmp ult i32 [[BIASED_EXP8]], 127
-; CHECK-NEXT: br i1 [[EXP_IS_NEGATIVE10]], label [[FP_TO_I_CLEANUP1:%.*]], label [[FP_TO_I_IF_CHECK_SATURATE2:%.*]]
-; CHECK: fp-to-i-if-check.exp.size2:
-; CHECK-NEXT: [[EXP_SMALLER_MANTISSA_WIDTH12:%.*]] = icmp ult i32 [[BIASED_EXP8]], 150
-; CHECK-NEXT: br i1 [[EXP_SMALLER_MANTISSA_WIDTH12]], label [[FP_TO_I_IF_EXP_SMALL5:%.*]], label [[FP_TO_I_IF_EXP_LARGE6:%.*]]
-; CHECK: fp-to-i-if-exp.small3:
-; CHECK-NEXT: [[TMP18:%.*]] = sub i32 150, [[BIASED_EXP8]]
-; CHECK-NEXT: [[TMP8:%.*]] = lshr i32 [[SIGNIFICAND10]], [[TMP18]]
-; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i129
-; CHECK-NEXT: br label [[FP_TO_I_CLEANUP1]]
-; CHECK: fp-to-i-if-exp.large4:
-; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[BIASED_EXP8]], -150
-; CHECK-NEXT: [[SIGNIFICAND9:%.*]] = zext i32 [[SIGNIFICAND10]] to i129
-; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP20]] to i129
-; CHECK-NEXT: [[TMP12:%.*]] = shl i129 [[SIGNIFICAND9]], [[TMP11]]
-; CHECK-NEXT: br label [[FP_TO_I_CLEANUP1]]
-; CHECK: fp-to-i-cleanup1:
-; CHECK-NEXT: [[TMP14:%.*]] = phi i129 [ [[TMP9]], [[FP_TO_I_IF_EXP_SMALL5]] ], [ [[TMP12]], [[FP_TO_I_IF_EXP_LARGE6]] ], [ 0, [[FP_TO_I_ENTRYFP_TO_I_ENTRY:%.*]] ]
-; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i129> poison, i129 [[TMP14]], i64 0
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[A]], i64 1
-; CHECK-NEXT: [[TMP35:%.*]] = freeze float [[TMP16]]
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast float [[TMP35]] to i32
-; CHECK-NEXT: [[TMP21:%.*]] = lshr i32 [[TMP17]], 23
-; CHECK-NEXT: [[BIASED_EXP:%.*]] = and i32 [[TMP21]], 255
-; CHECK-NEXT: [[TMP22:%.*]] = and i32 [[TMP17]], 8388607
-; CHECK-NEXT: [[SIGNIFICAND1:%.*]] = or i32 [[TMP22]], 8388608
-; CHECK-NEXT: [[EXP_IS_NEGATIVE:%.*]] = icmp ult i32 [[BIASED_EXP]], 127
-; CHECK-NEXT: br i1 [[EXP_IS_NEGATIVE]], label [[FP_TO_I_CLEANUP:%.*]], label [[FP_TO_I_IF_CHECK_SATURATE:%.*]]
-; CHECK: fp-to-i-if-check.exp.size:
-; CHECK-NEXT: [[EXP_SMALLER_MANTISSA_WIDTH:%.*]] = icmp ult i32 [[BIASED_EXP]], 150
-; CHECK-NEXT: br i1 [[EXP_SMALLER_MANTISSA_WIDTH]], label [[FP_TO_I_IF_EXP_SMALL:%.*]], label [[FP_TO_I_IF_EXP_LARGE:%.*]]
-; CHECK: fp-to-i-if-exp.small:
-; CHECK-NEXT: [[TMP32:%.*]] = sub i32 150, [[BIASED_EXP]]
-; CHECK-NEXT: [[TMP33:%.*]] = lshr i32 [[SIGNIFICAND1]], [[TMP32]]
-; CHECK-NEXT: [[TMP25:%.*]] = zext i32 [[TMP33]] to i129
-; CHECK-NEXT: br label [[FP_TO_I_CLEANUP]]
-; CHECK: fp-to-i-if-exp.large:
-; CHECK-NEXT: [[TMP34:%.*]] = add i32 [[BIASED_EXP]], -150
-; CHECK-NEXT: [[SIGNIFICAND:%.*]] = zext i32 [[SIGNIFICAND1]] to i129
-; CHECK-NEXT: [[TMP27:%.*]] = zext i32 [[TMP34]] to i129
-; CHECK-NEXT: [[TMP28:%.*]] = shl i129 [[SIGNIFICAND]], [[TMP27]]
-; CHECK-NEXT: br label [[FP_TO_I_CLEANUP]]
-; CHECK: fp-to-i-cleanup:
-; CHECK-NEXT: [[TMP30:%.*]] = phi i129 [ [[TMP25]], [[FP_TO_I_IF_EXP_SMALL]] ], [ [[TMP28]], [[FP_TO_I_IF_EXP_LARGE]] ], [ 0, [[FP_TO_I_CLEANUP1]] ]
-; CHECK-NEXT: [[TMP31:%.*]] = insertelement <2 x i129> [[TMP15]], i129 [[TMP30]], i64 1
+; CHECK-NEXT: [[TMP31:%.*]] = fptoui <2 x float> [[A:%.*]] to <2 x i129>
; CHECK-NEXT: ret <2 x i129> [[TMP31]]
;
%conv = fptoui <2 x float> %a to <2 x i129>
diff --git a/llvm/test/Transforms/ExpandIRInsts/X86/expand-large-fp-convert-si129tofp.ll b/llvm/test/Transforms/ExpandIRInsts/X86/expand-large-fp-convert-si129tofp.ll
index a3677bafb4449..ec8628688489c 100644
--- a/llvm/test/Transforms/ExpandIRInsts/X86/expand-large-fp-convert-si129tofp.ll
+++ b/llvm/test/Transforms/ExpandIRInsts/X86/expand-large-fp-convert-si129tofp.ll
@@ -4,84 +4,7 @@
define half @si129tohalf(i129 %a) {
; CHECK-LABEL: @si129tohalf(
-; CHECK-NEXT: itofp-entry:
-; CHECK-NEXT: [[A:%.*]] = freeze i129 [[A1:%.*]]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i129 [[A]], 0
-; CHECK-NEXT: br i1 [[TMP0]], label [[ITOFP_RETURN:%.*]], label [[ITOFP_IF_END:%.*]]
-; CHECK: itofp-if-end:
-; CHECK-NEXT: [[TMP1:%.*]] = ashr i129 [[A]], 128
-; CHECK-NEXT: [[TMP2:%.*]] = xor i129 [[TMP1]], [[A]]
-; CHECK-NEXT: [[TMP3:%.*]] = sub i129 [[TMP2]], [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP3]], i1 true)
-; CHECK-NEXT: [[TMP5:%.*]] = trunc i129 [[TMP4]] to i32
-; CHECK-NEXT: [[TMP6:%.*]] = sub i32 129, [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = sub i32 128, [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], 24
-; CHECK-NEXT: br i1 [[TMP8]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
-; CHECK: itofp-if-then4:
-; CHECK-NEXT: switch i32 [[TMP6]], label [[ITOFP_SW_DEFAULT:%.*]] [
-; CHECK-NEXT: i32 25, label [[ITOFP_SW_BB:%.*]]
-; CHECK-NEXT: i32 26, label [[ITOFP_SW_EPILOG:%.*]]
-; CHECK-NEXT: ]
-; CHECK: itofp-sw-bb:
-; CHECK-NEXT: [[TMP9:%.*]] = shl i129 [[TMP3]], 1
-; CHECK-NEXT: br label [[ITOFP_SW_EPILOG]]
-; CHECK: itofp-sw-default:
-; CHECK-NEXT: [[TMP10:%.*]] = sub i32 103, [[TMP5]]
-; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i129
-; CHECK-NEXT: [[TMP12:%.*]] = lshr i129 [[TMP3]], [[TMP11]]
-; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP5]], 26
-; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP13]] to i129
-; CHECK-NEXT: [[TMP15:%.*]] = lshr i129 -1, [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = and i129 [[TMP15]], [[TMP3]]
-; CHECK-NEXT: [[TMP17:%.*]] = icmp ne i129 [[TMP16]], 0
-; CHECK-NEXT: [[TMP18:%.*]] = zext i1 [[TMP17]] to i129
-; CHECK-NEXT: [[TMP19:%.*]] = or i129 [[TMP12]], [[TMP18]]
-; CHECK-NEXT: br label [[ITOFP_SW_EPILOG]]
-; CHECK: itofp-sw-epilog:
-; CHECK-NEXT: [[TMP20:%.*]] = phi i129 [ [[TMP19]], [[ITOFP_SW_DEFAULT]] ], [ [[TMP3]], [[ITOFP_IF_THEN4]] ], [ [[TMP9]], [[ITOFP_SW_BB]] ]
-; CHECK-NEXT: [[TMP21:%.*]] = trunc i129 [[TMP20]] to i32
-; CHECK-NEXT: [[TMP22:%.*]] = lshr i32 [[TMP21]], 2
-; CHECK-NEXT: [[TMP23:%.*]] = and i32 [[TMP22]], 1
-; CHECK-NEXT: [[TMP24:%.*]] = zext i32 [[TMP23]] to i129
-; CHECK-NEXT: [[TMP25:%.*]] = or i129 [[TMP20]], [[TMP24]]
-; CHECK-NEXT: [[TMP26:%.*]] = add i129 [[TMP25]], 1
-; CHECK-NEXT: [[TMP27:%.*]] = ashr i129 [[TMP26]], 2
-; CHECK-NEXT: [[A3:%.*]] = and i129 [[TMP26]], 67108864
-; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i129 [[A3]], 0
-; CHECK-NEXT: [[TMP29:%.*]] = trunc i129 [[TMP27]] to i32
-; CHECK-NEXT: [[TMP30:%.*]] = lshr i129 [[TMP27]], 32
-; CHECK-NEXT: [[TMP31:%.*]] = trunc i129 [[TMP30]] to i32
-; CHECK-NEXT: br i1 [[TMP28]], label [[ITOFP_IF_END26:%.*]], label [[ITOFP_IF_THEN20:%.*]]
-; CHECK: itofp-if-then20:
-; CHECK-NEXT: [[TMP32:%.*]] = ashr i129 [[TMP26]], 3
-; CHECK-NEXT: [[TMP33:%.*]] = trunc i129 [[TMP32]] to i32
-; CHECK-NEXT: [[TMP34:%.*]] = lshr i129 [[TMP32]], 32
-; CHECK-NEXT: [[TMP35:%.*]] = trunc i129 [[TMP34]] to i32
-; CHECK-NEXT: br label [[ITOFP_IF_END26]]
-; CHECK: itofp-if-else:
-; CHECK-NEXT: [[TMP36:%.*]] = add i32 [[TMP5]], -105
-; CHECK-NEXT: [[TMP37:%.*]] = zext i32 [[TMP36]] to i129
-; CHECK-NEXT: [[TMP38:%.*]] = shl i129 [[TMP3]], [[TMP37]]
-; CHECK-NEXT: [[TMP39:%.*]] = trunc i129 [[TMP38]] to i32
-; CHECK-NEXT: [[TMP40:%.*]] = lshr i129 [[TMP38]], 32
-; CHECK-NEXT: [[TMP41:%.*]] = trunc i129 [[TMP40]] to i32
-; CHECK-NEXT: br label [[ITOFP_IF_END26]]
-; CHECK: itofp-if-end26:
-; CHECK-NEXT: [[TMP42:%.*]] = phi i32 [ [[TMP33]], [[ITOFP_IF_THEN20]] ], [ [[TMP29]], [[ITOFP_SW_EPILOG]] ], [ [[TMP39]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT: [[TMP43:%.*]] = phi i32 [ [[TMP6]], [[ITOFP_IF_THEN20]] ], [ [[TMP7]], [[ITOFP_SW_EPILOG]] ], [ [[TMP7]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT: [[TMP44:%.*]] = trunc i129 [[TMP1]] to i32
-; CHECK-NEXT: [[TMP45:%.*]] = and i32 [[TMP44]], -2147483648
-; CHECK-NEXT: [[TMP46:%.*]] = shl i32 [[TMP43]], 23
-; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP46]], 1065353216
-; CHECK-NEXT: [[TMP48:%.*]] = and i32 [[TMP42]], 8388607
-; CHECK-NEXT: [[TMP49:%.*]] = or i32 [[TMP48]], [[TMP45]]
-; CHECK-NEXT: [[TMP50:%.*]] = or i32 [[TMP49]], [[TMP47]]
-; CHECK-NEXT: [[TMP51:%.*]] = bitcast i32 [[TMP50]] to float
-; CHECK-NEXT: [[TMP52:%.*]] = fptrunc float [[TMP51]] to half
-; CHECK-NEXT: br label [[ITOFP_RETURN]]
-; CHECK: itofp-return:
-; CHECK-NEXT: [[TMP53:%.*]] = phi half [ [[TMP52]], [[ITOFP_IF_END26]] ], [ 0xH0000, [[ITOFP_ENTRY:%.*]] ]
+; CHECK-NEXT: [[TMP53:%.*]] = sitofp i129 [[A:%.*]] to half
; CHECK-NEXT: ret half [[TMP53]]
;
%conv = sitofp i129 %a to half
@@ -90,83 +13,7 @@ define half @si129tohalf(i129 %a) {
define float @si129tofloat(i129 %a) {
; CHECK-LABEL: @si129tofloat(
-; CHECK-NEXT: itofp-entry:
-; CHECK-NEXT: [[A:%.*]] = freeze i129 [[A1:%.*]]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i129 [[A]], 0
-; CHECK-NEXT: br i1 [[TMP0]], label [[ITOFP_RETURN:%.*]], label [[ITOFP_IF_END:%.*]]
-; CHECK: itofp-if-end:
-; CHECK-NEXT: [[TMP1:%.*]] = ashr i129 [[A]], 128
-; CHECK-NEXT: [[TMP2:%.*]] = xor i129 [[TMP1]], [[A]]
-; CHECK-NEXT: [[TMP3:%.*]] = sub i129 [[TMP2]], [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP3]], i1 true)
-; CHECK-NEXT: [[TMP5:%.*]] = trunc i129 [[TMP4]] to i32
-; CHECK-NEXT: [[TMP6:%.*]] = sub i32 129, [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = sub i32 128, [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], 24
-; CHECK-NEXT: br i1 [[TMP8]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
-; CHECK: itofp-if-then4:
-; CHECK-NEXT: switch i32 [[TMP6]], label [[ITOFP_SW_DEFAULT:%.*]] [
-; CHECK-NEXT: i32 25, label [[ITOFP_SW_BB:%.*]]
-; CHECK-NEXT: i32 26, label [[ITOFP_SW_EPILOG:%.*]]
-; CHECK-NEXT: ]
-; CHECK: itofp-sw-bb:
-; CHECK-NEXT: [[TMP9:%.*]] = shl i129 [[TMP3]], 1
-; CHECK-NEXT: br label [[ITOFP_SW_EPILOG]]
-; CHECK: itofp-sw-default:
-; CHECK-NEXT: [[TMP10:%.*]] = sub i32 103, [[TMP5]]
-; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i129
-; CHECK-NEXT: [[TMP12:%.*]] = lshr i129 [[TMP3]], [[TMP11]]
-; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP5]], 26
-; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP13]] to i129
-; CHECK-NEXT: [[TMP15:%.*]] = lshr i129 -1, [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = and i129 [[TMP15]], [[TMP3]]
-; CHECK-NEXT: [[TMP17:%.*]] = icmp ne i129 [[TMP16]], 0
-; CHECK-NEXT: [[TMP18:%.*]] = zext i1 [[TMP17]] to i129
-; CHECK-NEXT: [[TMP19:%.*]] = or i129 [[TMP12]], [[TMP18]]
-; CHECK-NEXT: br label [[ITOFP_SW_EPILOG]]
-; CHECK: itofp-sw-epilog:
-; CHECK-NEXT: [[TMP20:%.*]] = phi i129 [ [[TMP19]], [[ITOFP_SW_DEFAULT]] ], [ [[TMP3]], [[ITOFP_IF_THEN4]] ], [ [[TMP9]], [[ITOFP_SW_BB]] ]
-; CHECK-NEXT: [[TMP21:%.*]] = trunc i129 [[TMP20]] to i32
-; CHECK-NEXT: [[TMP22:%.*]] = lshr i32 [[TMP21]], 2
-; CHECK-NEXT: [[TMP23:%.*]] = and i32 [[TMP22]], 1
-; CHECK-NEXT: [[TMP24:%.*]] = zext i32 [[TMP23]] to i129
-; CHECK-NEXT: [[TMP25:%.*]] = or i129 [[TMP20]], [[TMP24]]
-; CHECK-NEXT: [[TMP26:%.*]] = add i129 [[TMP25]], 1
-; CHECK-NEXT: [[TMP27:%.*]] = ashr i129 [[TMP26]], 2
-; CHECK-NEXT: [[A3:%.*]] = and i129 [[TMP26]], 67108864
-; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i129 [[A3]], 0
-; CHECK-NEXT: [[TMP29:%.*]] = trunc i129 [[TMP27]] to i32
-; CHECK-NEXT: [[TMP30:%.*]] = lshr i129 [[TMP27]], 32
-; CHECK-NEXT: [[TMP31:%.*]] = trunc i129 [[TMP30]] to i32
-; CHECK-NEXT: br i1 [[TMP28]], label [[ITOFP_IF_END26:%.*]], label [[ITOFP_IF_THEN20:%.*]]
-; CHECK: itofp-if-then20:
-; CHECK-NEXT: [[TMP32:%.*]] = ashr i129 [[TMP26]], 3
-; CHECK-NEXT: [[TMP33:%.*]] = trunc i129 [[TMP32]] to i32
-; CHECK-NEXT: [[TMP34:%.*]] = lshr i129 [[TMP32]], 32
-; CHECK-NEXT: [[TMP35:%.*]] = trunc i129 [[TMP34]] to i32
-; CHECK-NEXT: br label [[ITOFP_IF_END26]]
-; CHECK: itofp-if-else:
-; CHECK-NEXT: [[TMP36:%.*]] = add i32 [[TMP5]], -105
-; CHECK-NEXT: [[TMP37:%.*]] = zext i32 [[TMP36]] to i129
-; CHECK-NEXT: [[TMP38:%.*]] = shl i129 [[TMP3]], [[TMP37]]
-; CHECK-NEXT: [[TMP39:%.*]] = trunc i129 [[TMP38]] to i32
-; CHECK-NEXT: [[TMP40:%.*]] = lshr i129 [[TMP38]], 32
-; CHECK-NEXT: [[TMP41:%.*]] = trunc i129 [[TMP40]] to i32
-; CHECK-NEXT: br label [[ITOFP_IF_END26]]
-; CHECK: itofp-if-end26:
-; CHECK-NEXT: [[TMP42:%.*]] = phi i32 [ [[TMP33]], [[ITOFP_IF_THEN20]] ], [ [[TMP29]], [[ITOFP_SW_EPILOG]] ], [ [[TMP39]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT: [[TMP43:%.*]] = phi i32 [ [[TMP6]], [[ITOFP_IF_THEN20]] ], [ [[TMP7]], [[ITOFP_SW_EPILOG]] ], [ [[TMP7]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT: [[TMP44:%.*]] = trunc i129 [[TMP1]] to i32
-; CHECK-NEXT: [[TMP45:%.*]] = and i32 [[TMP44]], -2147483648
-; CHECK-NEXT: [[TMP46:%.*]] = shl i32 [[TMP43]], 23
-; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP46]], 1065353216
-; CHECK-NEXT: [[TMP48:%.*]] = and i32 [[TMP42]], 8388607
-; CHECK-NEXT: [[TMP49:%.*]] = or i32 [[TMP48]], [[TMP45]]
-; CHECK-NEXT: [[TMP50:%.*]] = or i32 [[TMP49]], [[TMP47]]
-; CHECK-NEXT: [[TMP51:%.*]] = bitcast i32 [[TMP50]] to float
-; CHECK-NEXT: br label [[ITOFP_RETURN]]
-; CHECK: itofp-return:
-; CHECK-NEXT: [[TMP52:%.*]] = phi float [ [[TMP51]], [[ITOFP_IF_END26]] ], [ 0.000000e+00, [[ITOFP_ENTRY:%.*]] ]
+; CHECK-NEXT: [[TMP52:%.*]] = sitofp i129 [[A:%.*]] to float
; CHECK-NEXT: ret float [[TMP52]]
;
%conv = sitofp i129 %a to float
@@ -175,88 +22,7 @@ define float @si129tofloat(i129 %a) {
define double @si129todouble(i129 %a) {
; CHECK-LABEL: @si129todouble(
-; CHECK-NEXT: itofp-entry:
-; CHECK-NEXT: [[A:%.*]] = freeze i129 [[A1:%.*]]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i129 [[A]], 0
-; CHECK-NEXT: br i1 [[TMP0]], label [[ITOFP_RETURN:%.*]], label [[ITOFP_IF_END:%.*]]
-; CHECK: itofp-if-end:
-; CHECK-NEXT: [[TMP1:%.*]] = ashr i129 [[A]], 128
-; CHECK-NEXT: [[TMP2:%.*]] = xor i129 [[TMP1]], [[A]]
-; CHECK-NEXT: [[TMP3:%.*]] = sub i129 [[TMP2]], [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP3]], i1 true)
-; CHECK-NEXT: [[TMP5:%.*]] = trunc i129 [[TMP4]] to i32
-; CHECK-NEXT: [[TMP6:%.*]] = sub i32 129, [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = sub i32 128, [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], 53
-; CHECK-NEXT: br i1 [[TMP8]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
-; CHECK: itofp-if-then4:
-; CHECK-NEXT: switch i32 [[TMP6]], label [[ITOFP_SW_DEFAULT:%.*]] [
-; CHECK-NEXT: i32 54, label [[ITOFP_SW_BB:%.*]]
-; CHECK-NEXT: i32 55, label [[ITOFP_SW_EPILOG:%.*]]
-; CHECK-NEXT: ]
-; CHECK: itofp-sw-bb:
-; CHECK-NEXT: [[TMP9:%.*]] = shl i129 [[TMP3]], 1
-; CHECK-NEXT: br label [[ITOFP_SW_EPILOG]]
-; CHECK: itofp-sw-default:
-; CHECK-NEXT: [[TMP10:%.*]] = sub i32 74, [[TMP5]]
-; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i129
-; CHECK-NEXT: [[TMP12:%.*]] = lshr i129 [[TMP3]], [[TMP11]]
-; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP5]], 55
-; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP13]] to i129
-; CHECK-NEXT: [[TMP15:%.*]] = lshr i129 -1, [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = and i129 [[TMP15]], [[TMP3]]
-; CHECK-NEXT: [[TMP17:%.*]] = icmp ne i129 [[TMP16]], 0
-; CHECK-NEXT: [[TMP18:%.*]] = zext i1 [[TMP17]] to i129
-; CHECK-NEXT: [[TMP19:%.*]] = or i129 [[TMP12]], [[TMP18]]
-; CHECK-NEXT: br label [[ITOFP_SW_EPILOG]]
-; CHECK: itofp-sw-epilog:
-; CHECK-NEXT: [[TMP20:%.*]] = phi i129 [ [[TMP19]], [[ITOFP_SW_DEFAULT]] ], [ [[TMP3]], [[ITOFP_IF_THEN4]] ], [ [[TMP9]], [[ITOFP_SW_BB]] ]
-; CHECK-NEXT: [[TMP21:%.*]] = trunc i129 [[TMP20]] to i32
-; CHECK-NEXT: [[TMP22:%.*]] = lshr i32 [[TMP21]], 2
-; CHECK-NEXT: [[TMP23:%.*]] = and i32 [[TMP22]], 1
-; CHECK-NEXT: [[TMP24:%.*]] = zext i32 [[TMP23]] to i129
-; CHECK-NEXT: [[TMP25:%.*]] = or i129 [[TMP20]], [[TMP24]]
-; CHECK-NEXT: [[TMP26:%.*]] = add i129 [[TMP25]], 1
-; CHECK-NEXT: [[TMP27:%.*]] = ashr i129 [[TMP26]], 2
-; CHECK-NEXT: [[A3:%.*]] = and i129 [[TMP26]], 36028797018963968
-; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i129 [[A3]], 0
-; CHECK-NEXT: [[TMP29:%.*]] = trunc i129 [[TMP27]] to i64
-; CHECK-NEXT: [[TMP30:%.*]] = lshr i129 [[TMP27]], 32
-; CHECK-NEXT: [[TMP31:%.*]] = trunc i129 [[TMP30]] to i32
-; CHECK-NEXT: br i1 [[TMP28]], label [[ITOFP_IF_END26:%.*]], label [[ITOFP_IF_THEN20:%.*]]
-; CHECK: itofp-if-then20:
-; CHECK-NEXT: [[TMP32:%.*]] = ashr i129 [[TMP26]], 3
-; CHECK-NEXT: [[TMP33:%.*]] = trunc i129 [[TMP32]] to i64
-; CHECK-NEXT: [[TMP34:%.*]] = lshr i129 [[TMP32]], 32
-; CHECK-NEXT: [[TMP35:%.*]] = trunc i129 [[TMP34]] to i32
-; CHECK-NEXT: br label [[ITOFP_IF_END26]]
-; CHECK: itofp-if-else:
-; CHECK-NEXT: [[TMP36:%.*]] = add i32 [[TMP5]], -76
-; CHECK-NEXT: [[TMP37:%.*]] = zext i32 [[TMP36]] to i129
-; CHECK-NEXT: [[TMP38:%.*]] = shl i129 [[TMP3]], [[TMP37]]
-; CHECK-NEXT: [[TMP39:%.*]] = trunc i129 [[TMP38]] to i64
-; CHECK-NEXT: [[TMP40:%.*]] = lshr i129 [[TMP38]], 32
-; CHECK-NEXT: [[TMP41:%.*]] = trunc i129 [[TMP40]] to i32
-; CHECK-NEXT: br label [[ITOFP_IF_END26]]
-; CHECK: itofp-if-end26:
-; CHECK-NEXT: [[TMP42:%.*]] = phi i64 [ [[TMP33]], [[ITOFP_IF_THEN20]] ], [ [[TMP29]], [[ITOFP_SW_EPILOG]] ], [ [[TMP39]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT: [[TMP43:%.*]] = phi i32 [ [[TMP35]], [[ITOFP_IF_THEN20]] ], [ [[TMP31]], [[ITOFP_SW_EPILOG]] ], [ [[TMP41]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT: [[TMP44:%.*]] = phi i32 [ [[TMP6]], [[ITOFP_IF_THEN20]] ], [ [[TMP7]], [[ITOFP_SW_EPILOG]] ], [ [[TMP7]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT: [[TMP45:%.*]] = trunc i129 [[TMP1]] to i32
-; CHECK-NEXT: [[TMP46:%.*]] = and i32 [[TMP45]], -2147483648
-; CHECK-NEXT: [[TMP47:%.*]] = shl i32 [[TMP44]], 20
-; CHECK-NEXT: [[TMP48:%.*]] = add i32 [[TMP47]], 1072693248
-; CHECK-NEXT: [[TMP49:%.*]] = and i32 [[TMP43]], 1048575
-; CHECK-NEXT: [[TMP50:%.*]] = or i32 [[TMP49]], [[TMP46]]
-; CHECK-NEXT: [[TMP51:%.*]] = or i32 [[TMP50]], [[TMP48]]
-; CHECK-NEXT: [[TMP52:%.*]] = zext i32 [[TMP51]] to i64
-; CHECK-NEXT: [[TMP53:%.*]] = shl i64 [[TMP52]], 32
-; CHECK-NEXT: [[TMP54:%.*]] = and i64 [[TMP42]], 4294967295
-; CHECK-NEXT: [[TMP55:%.*]] = or i64 [[TMP53]], [[TMP54]]
-; CHECK-NEXT: [[TMP56:%.*]] = bitcast i64 [[TMP55]] to double
-; CHECK-NEXT: br label [[ITOFP_RETURN]]
-; CHECK: itofp-return:
-; CHECK-NEXT: [[TMP57:%.*]] = phi double [ [[TMP56]], [[ITOFP_IF_END26]] ], [ 0.000000e+00, [[ITOFP_ENTRY:%.*]] ]
+; CHECK-NEXT: [[TMP57:%.*]] = sitofp i129 [[A:%.*]] to double
; CHECK-NEXT: ret double [[TMP57]]
;
%conv = sitofp i129 %a to double
@@ -265,83 +31,7 @@ define double @si129todouble(i129 %a) {
define x86_fp80 @si129tox86_fp80(i129 %a) {
; CHECK-LABEL: @si129tox86_fp80(
-; CHECK-NEXT: itofp-entry:
-; CHECK-NEXT: [[A:%.*]] = freeze i129 [[A1:%.*]]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i129 [[A]], 0
-; CHECK-NEXT: br i1 [[TMP0]], label [[ITOFP_RETURN:%.*]], label [[ITOFP_IF_END:%.*]]
-; CHECK: itofp-if-end:
-; CHECK-NEXT: [[TMP1:%.*]] = ashr i129 [[A]], 128
-; CHECK-NEXT: [[TMP2:%.*]] = xor i129 [[TMP1]], [[A]]
-; CHECK-NEXT: [[TMP3:%.*]] = sub i129 [[TMP2]], [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP3]], i1 true)
-; CHECK-NEXT: [[TMP5:%.*]] = trunc i129 [[TMP4]] to i32
-; CHECK-NEXT: [[TMP6:%.*]] = sub i129 129, [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = sub i129 128, [[TMP4]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt i129 [[TMP6]], 113
-; CHECK-NEXT: br i1 [[TMP8]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
-; CHECK: itofp-if-then4:
-; CHECK-NEXT: switch i129 [[TMP6]], label [[ITOFP_SW_DEFAULT:%.*]] [
-; CHECK-NEXT: i129 114, label [[ITOFP_SW_BB:%.*]]
-; CHECK-NEXT: i129 115, label [[ITOFP_SW_EPILOG:%.*]]
-; CHECK-NEXT: ]
-; CHECK: itofp-sw-bb:
-; CHECK-NEXT: [[TMP9:%.*]] = shl i129 [[TMP3]], 1
-; CHECK-NEXT: br label [[ITOFP_SW_EPILOG]]
-; CHECK: itofp-sw-default:
-; CHECK-NEXT: [[TMP10:%.*]] = sub i129 14, [[TMP4]]
-; CHECK-NEXT: [[TMP11:%.*]] = lshr i129 [[TMP3]], [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = add i129 [[TMP4]], 115
-; CHECK-NEXT: [[TMP13:%.*]] = lshr i129 -1, [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = and i129 [[TMP13]], [[TMP3]]
-; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i129 [[TMP14]], 0
-; CHECK-NEXT: [[TMP16:%.*]] = zext i1 [[TMP15]] to i129
-; CHECK-NEXT: [[TMP17:%.*]] = or i129 [[TMP11]], [[TMP16]]
-; CHECK-NEXT: br label [[ITOFP_SW_EPILOG]]
-; CHECK: itofp-sw-epilog:
-; CHECK-NEXT: [[TMP18:%.*]] = phi i129 [ [[TMP17]], [[ITOFP_SW_DEFAULT]] ], [ [[TMP3]], [[ITOFP_IF_THEN4]] ], [ [[TMP9]], [[ITOFP_SW_BB]] ]
-; CHECK-NEXT: [[TMP19:%.*]] = trunc i129 [[TMP18]] to i32
-; CHECK-NEXT: [[TMP20:%.*]] = lshr i32 [[TMP19]], 2
-; CHECK-NEXT: [[TMP21:%.*]] = and i32 [[TMP20]], 1
-; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP21]] to i129
-; CHECK-NEXT: [[TMP23:%.*]] = or i129 [[TMP18]], [[TMP22]]
-; CHECK-NEXT: [[TMP24:%.*]] = add i129 [[TMP23]], 1
-; CHECK-NEXT: [[TMP25:%.*]] = ashr i129 [[TMP24]], 2
-; CHECK-NEXT: [[A3:%.*]] = and i129 [[TMP24]], 41538374868278621028243970633760768
-; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i129 [[A3]], 0
-; CHECK-NEXT: [[TMP27:%.*]] = trunc i129 [[TMP25]] to i128
-; CHECK-NEXT: [[TMP28:%.*]] = lshr i129 [[TMP25]], 32
-; CHECK-NEXT: [[TMP29:%.*]] = trunc i129 [[TMP7]] to i64
-; CHECK-NEXT: br i1 [[TMP26]], label [[ITOFP_IF_END26:%.*]], label [[ITOFP_IF_THEN20:%.*]]
-; CHECK: itofp-if-then20:
-; CHECK-NEXT: [[TMP30:%.*]] = ashr i129 [[TMP24]], 3
-; CHECK-NEXT: [[TMP31:%.*]] = trunc i129 [[TMP30]] to i128
-; CHECK-NEXT: [[TMP32:%.*]] = lshr i129 [[TMP30]], 32
-; CHECK-NEXT: [[TMP33:%.*]] = trunc i129 [[TMP6]] to i64
-; CHECK-NEXT: br label [[ITOFP_IF_END26]]
-; CHECK: itofp-if-else:
-; CHECK-NEXT: [[TMP34:%.*]] = add i129 [[TMP4]], -16
-; CHECK-NEXT: [[TMP35:%.*]] = shl i129 [[TMP3]], [[TMP34]]
-; CHECK-NEXT: [[TMP36:%.*]] = trunc i129 [[TMP35]] to i128
-; CHECK-NEXT: [[TMP37:%.*]] = lshr i129 [[TMP35]], 32
-; CHECK-NEXT: [[TMP38:%.*]] = trunc i129 [[TMP7]] to i64
-; CHECK-NEXT: br label [[ITOFP_IF_END26]]
-; CHECK: itofp-if-end26:
-; CHECK-NEXT: [[TMP39:%.*]] = phi i128 [ [[TMP31]], [[ITOFP_IF_THEN20]] ], [ [[TMP27]], [[ITOFP_SW_EPILOG]] ], [ [[TMP36]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT: [[TMP40:%.*]] = phi i64 [ [[TMP33]], [[ITOFP_IF_THEN20]] ], [ [[TMP29]], [[ITOFP_SW_EPILOG]] ], [ [[TMP38]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT: [[AND29:%.*]] = and i129 [[TMP1]], 9223372036854775808
-; CHECK-NEXT: [[TMP41:%.*]] = shl i64 [[TMP40]], 48
-; CHECK-NEXT: [[TMP42:%.*]] = add i64 [[TMP41]], 4611404543450677248
-; CHECK-NEXT: [[TMP43:%.*]] = zext i64 [[TMP42]] to i128
-; CHECK-NEXT: [[TMP44:%.*]] = trunc i129 [[AND29]] to i128
-; CHECK-NEXT: [[TMP45:%.*]] = or i128 [[TMP44]], [[TMP43]]
-; CHECK-NEXT: [[TMP46:%.*]] = shl i128 [[TMP45]], 64
-; CHECK-NEXT: [[TMP47:%.*]] = and i128 [[TMP39]], 5192296858534827628530496329220095
-; CHECK-NEXT: [[TMP48:%.*]] = or i128 [[TMP46]], [[TMP47]]
-; CHECK-NEXT: [[TMP49:%.*]] = bitcast i128 [[TMP48]] to fp128
-; CHECK-NEXT: [[TMP50:%.*]] = fptrunc fp128 [[TMP49]] to x86_fp80
-; CHECK-NEXT: br label [[ITOFP_RETURN]]
-; CHECK: itofp-return:
-; CHECK-NEXT: [[TMP51:%.*]] = phi x86_fp80 [ [[TMP50]], [[ITOFP_IF_END26]] ], [ 0xK00000000000000000000, [[ITOFP_ENTRY:%.*]] ]
+; CHECK-NEXT: [[TMP51:%.*]] = sitofp i129 [[A:%.*]] to x86_fp80
; CHECK-NEXT: ret x86_fp80 [[TMP51]]
;
%conv = sitofp i129 %a to x86_fp80
@@ -350,82 +40,7 @@ define x86_fp80 @si129tox86_fp80(i129 %a) {
define fp128 @si129tofp128(i129 %a) {
; CHECK-LABEL: @si129tofp128(
-; CHECK-NEXT: itofp-entry:
-; CHECK-NEXT: [[A:%.*]] = freeze i129 [[A1:%.*]]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i129 [[A]], 0
-; CHECK-NEXT: br i1 [[TMP0]], label [[ITOFP_RETURN:%.*]], label [[ITOFP_IF_END:%.*]]
-; CHECK: itofp-if-end:
-; CHECK-NEXT: [[TMP1:%.*]] = ashr i129 [[A]], 128
-; CHECK-NEXT: [[TMP2:%.*]] = xor i129 [[TMP1]], [[A]]
-; CHECK-NEXT: [[TMP3:%.*]] = sub i129 [[TMP2]], [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP3]], i1 true)
-; CHECK-NEXT: [[TMP5:%.*]] = trunc i129 [[TMP4]] to i32
-; CHECK-NEXT: [[TMP6:%.*]] = sub i129 129, [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = sub i129 128, [[TMP4]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt i129 [[TMP6]], 113
-; CHECK-NEXT: br i1 [[TMP8]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
-; CHECK: itofp-if-then4:
-; CHECK-NEXT: switch i129 [[TMP6]], label [[ITOFP_SW_DEFAULT:%.*]] [
-; CHECK-NEXT: i129 114, label [[ITOFP_SW_BB:%.*]]
-; CHECK-NEXT: i129 115, label [[ITOFP_SW_EPILOG:%.*]]
-; CHECK-NEXT: ]
-; CHECK: itofp-sw-bb:
-; CHECK-NEXT: [[TMP9:%.*]] = shl i129 [[TMP3]], 1
-; CHECK-NEXT: br label [[ITOFP_SW_EPILOG]]
-; CHECK: itofp-sw-default:
-; CHECK-NEXT: [[TMP10:%.*]] = sub i129 14, [[TMP4]]
-; CHECK-NEXT: [[TMP11:%.*]] = lshr i129 [[TMP3]], [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = add i129 [[TMP4]], 115
-; CHECK-NEXT: [[TMP13:%.*]] = lshr i129 -1, [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = and i129 [[TMP13]], [[TMP3]]
-; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i129 [[TMP14]], 0
-; CHECK-NEXT: [[TMP16:%.*]] = zext i1 [[TMP15]] to i129
-; CHECK-NEXT: [[TMP17:%.*]] = or i129 [[TMP11]], [[TMP16]]
-; CHECK-NEXT: br label [[ITOFP_SW_EPILOG]]
-; CHECK: itofp-sw-epilog:
-; CHECK-NEXT: [[TMP18:%.*]] = phi i129 [ [[TMP17]], [[ITOFP_SW_DEFAULT]] ], [ [[TMP3]], [[ITOFP_IF_THEN4]] ], [ [[TMP9]], [[ITOFP_SW_BB]] ]
-; CHECK-NEXT: [[TMP19:%.*]] = trunc i129 [[TMP18]] to i32
-; CHECK-NEXT: [[TMP20:%.*]] = lshr i32 [[TMP19]], 2
-; CHECK-NEXT: [[TMP21:%.*]] = and i32 [[TMP20]], 1
-; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP21]] to i129
-; CHECK-NEXT: [[TMP23:%.*]] = or i129 [[TMP18]], [[TMP22]]
-; CHECK-NEXT: [[TMP24:%.*]] = add i129 [[TMP23]], 1
-; CHECK-NEXT: [[TMP25:%.*]] = ashr i129 [[TMP24]], 2
-; CHECK-NEXT: [[A3:%.*]] = and i129 [[TMP24]], 41538374868278621028243970633760768
-; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i129 [[A3]], 0
-; CHECK-NEXT: [[TMP27:%.*]] = trunc i129 [[TMP25]] to i128
-; CHECK-NEXT: [[TMP28:%.*]] = lshr i129 [[TMP25]], 32
-; CHECK-NEXT: [[TMP29:%.*]] = trunc i129 [[TMP7]] to i64
-; CHECK-NEXT: br i1 [[TMP26]], label [[ITOFP_IF_END26:%.*]], label [[ITOFP_IF_THEN20:%.*]]
-; CHECK: itofp-if-then20:
-; CHECK-NEXT: [[TMP30:%.*]] = ashr i129 [[TMP24]], 3
-; CHECK-NEXT: [[TMP31:%.*]] = trunc i129 [[TMP30]] to i128
-; CHECK-NEXT: [[TMP32:%.*]] = lshr i129 [[TMP30]], 32
-; CHECK-NEXT: [[TMP33:%.*]] = trunc i129 [[TMP6]] to i64
-; CHECK-NEXT: br label [[ITOFP_IF_END26]]
-; CHECK: itofp-if-else:
-; CHECK-NEXT: [[TMP34:%.*]] = add i129 [[TMP4]], -16
-; CHECK-NEXT: [[TMP35:%.*]] = shl i129 [[TMP3]], [[TMP34]]
-; CHECK-NEXT: [[TMP36:%.*]] = trunc i129 [[TMP35]] to i128
-; CHECK-NEXT: [[TMP37:%.*]] = lshr i129 [[TMP35]], 32
-; CHECK-NEXT: [[TMP38:%.*]] = trunc i129 [[TMP7]] to i64
-; CHECK-NEXT: br label [[ITOFP_IF_END26]]
-; CHECK: itofp-if-end26:
-; CHECK-NEXT: [[TMP39:%.*]] = phi i128 [ [[TMP31]], [[ITOFP_IF_THEN20]] ], [ [[TMP27]], [[ITOFP_SW_EPILOG]] ], [ [[TMP36]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT: [[TMP40:%.*]] = phi i64 [ [[TMP33]], [[ITOFP_IF_THEN20]] ], [ [[TMP29]], [[ITOFP_SW_EPILOG]] ], [ [[TMP38]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT: [[AND29:%.*]] = and i129 [[TMP1]], 9223372036854775808
-; CHECK-NEXT: [[TMP41:%.*]] = shl i64 [[TMP40]], 48
-; CHECK-NEXT: [[TMP42:%.*]] = add i64 [[TMP41]], 4611404543450677248
-; CHECK-NEXT: [[TMP43:%.*]] = zext i64 [[TMP42]] to i128
-; CHECK-NEXT: [[TMP44:%.*]] = trunc i129 [[AND29]] to i128
-; CHECK-NEXT: [[TMP45:%.*]] = or i128 [[TMP44]], [[TMP43]]
-; CHECK-NEXT: [[TMP46:%.*]] = shl i128 [[TMP45]], 64
-; CHECK-NEXT: [[TMP47:%.*]] = and i128 [[TMP39]], 5192296858534827628530496329220095
-; CHECK-NEXT: [[TMP48:%.*]] = or i128 [[TMP46]], [[TMP47]]
-; CHECK-NEXT: [[TMP49:%.*]] = bitcast i128 [[TMP48]] to fp128
-; CHECK-NEXT: br label [[ITOFP_RETURN]]
-; CHECK: itofp-return:
-; CHECK-NEXT: [[TMP50:%.*]] = phi fp128 [ [[TMP49]], [[ITOFP_IF_END26]] ], [ 0xL00000000000000000000000000000000, [[ITOFP_ENTRY:%.*]] ]
+; CHECK-NEXT: [[TMP50:%.*]] = sitofp i129 [[A:%.*]] to fp128
; CHECK-NEXT: ret fp128 [[TMP50]]
;
%conv = sitofp i129 %a to fp128
@@ -434,163 +49,7 @@ define fp128 @si129tofp128(i129 %a) {
define <2 x float> @si129tofloatv2(<2 x i129> %a) {
; CHECK-LABEL: @si129tofloatv2(
-; CHECK-NEXT: itofp-entryitofp-entry:
-; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i129> [[A:%.*]], i64 0
-; CHECK-NEXT: [[TMP110:%.*]] = freeze i129 [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i129 [[TMP110]], 0
-; CHECK-NEXT: br i1 [[TMP1]], label [[ITOFP_RETURN1:%.*]], label [[ITOFP_IF_END2:%.*]]
-; CHECK: itofp-if-end2:
-; CHECK-NEXT: [[TMP2:%.*]] = ashr i129 [[TMP110]], 128
-; CHECK-NEXT: [[TMP3:%.*]] = xor i129 [[TMP2]], [[TMP110]]
-; CHECK-NEXT: [[TMP4:%.*]] = sub i129 [[TMP3]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP4]], i1 true)
-; CHECK-NEXT: [[TMP6:%.*]] = trunc i129 [[TMP5]] to i32
-; CHECK-NEXT: [[TMP7:%.*]] = sub i32 129, [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = sub i32 128, [[TMP6]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp sgt i32 [[TMP7]], 24
-; CHECK-NEXT: br i1 [[TMP9]], label [[ITOFP_IF_THEN43:%.*]], label [[ITOFP_IF_ELSE8:%.*]]
-; CHECK: itofp-if-then43:
-; CHECK-NEXT: switch i32 [[TMP7]], label [[ITOFP_SW_DEFAULT5:%.*]] [
-; CHECK-NEXT: i32 25, label [[ITOFP_SW_BB4:%.*]]
-; CHECK-NEXT: i32 26, label [[ITOFP_SW_EPILOG6:%.*]]
-; CHECK-NEXT: ]
-; CHECK: itofp-sw-bb4:
-; CHECK-NEXT: [[TMP10:%.*]] = shl i129 [[TMP4]], 1
-; CHECK-NEXT: br label [[ITOFP_SW_EPILOG6]]
-; CHECK: itofp-sw-default5:
-; CHECK-NEXT: [[TMP11:%.*]] = sub i32 103, [[TMP6]]
-; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP11]] to i129
-; CHECK-NEXT: [[TMP13:%.*]] = lshr i129 [[TMP4]], [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP6]], 26
-; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i129
-; CHECK-NEXT: [[TMP16:%.*]] = lshr i129 -1, [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = and i129 [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[TMP18:%.*]] = icmp ne i129 [[TMP17]], 0
-; CHECK-NEXT: [[TMP19:%.*]] = zext i1 [[TMP18]] to i129
-; CHECK-NEXT: [[TMP20:%.*]] = or i129 [[TMP13]], [[TMP19]]
-; CHECK-NEXT: br label [[ITOFP_SW_EPILOG6]]
-; CHECK: itofp-sw-epilog6:
-; CHECK-NEXT: [[TMP21:%.*]] = phi i129 [ [[TMP20]], [[ITOFP_SW_DEFAULT5]] ], [ [[TMP4]], [[ITOFP_IF_THEN43]] ], [ [[TMP10]], [[ITOFP_SW_BB4]] ]
-; CHECK-NEXT: [[TMP22:%.*]] = trunc i129 [[TMP21]] to i32
-; CHECK-NEXT: [[TMP23:%.*]] = lshr i32 [[TMP22]], 2
-; CHECK-NEXT: [[TMP24:%.*]] = and i32 [[TMP23]], 1
-; CHECK-NEXT: [[TMP25:%.*]] = zext i32 [[TMP24]] to i129
-; CHECK-NEXT: [[TMP26:%.*]] = or i129 [[TMP21]], [[TMP25]]
-; CHECK-NEXT: [[TMP27:%.*]] = add i129 [[TMP26]], 1
-; CHECK-NEXT: [[TMP28:%.*]] = ashr i129 [[TMP27]], 2
-; CHECK-NEXT: [[A310:%.*]] = and i129 [[TMP27]], 67108864
-; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i129 [[A310]], 0
-; CHECK-NEXT: [[TMP30:%.*]] = trunc i129 [[TMP28]] to i32
-; CHECK-NEXT: [[TMP31:%.*]] = lshr i129 [[TMP28]], 32
-; CHECK-NEXT: [[TMP32:%.*]] = trunc i129 [[TMP31]] to i32
-; CHECK-NEXT: br i1 [[TMP29]], label [[ITOFP_IF_END269:%.*]], label [[ITOFP_IF_THEN207:%.*]]
-; CHECK: itofp-if-then207:
-; CHECK-NEXT: [[TMP33:%.*]] = ashr i129 [[TMP27]], 3
-; CHECK-NEXT: [[TMP34:%.*]] = trunc i129 [[TMP33]] to i32
-; CHECK-NEXT: [[TMP35:%.*]] = lshr i129 [[TMP33]], 32
-; CHECK-NEXT: [[TMP36:%.*]] = trunc i129 [[TMP35]] to i32
-; CHECK-NEXT: br label [[ITOFP_IF_END269]]
-; CHECK: itofp-if-else8:
-; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP6]], -105
-; CHECK-NEXT: [[TMP38:%.*]] = zext i32 [[TMP37]] to i129
-; CHECK-NEXT: [[TMP39:%.*]] = shl i129 [[TMP4]], [[TMP38]]
-; CHECK-NEXT: [[TMP40:%.*]] = trunc i129 [[TMP39]] to i32
-; CHECK-NEXT: [[TMP41:%.*]] = lshr i129 [[TMP39]], 32
-; CHECK-NEXT: [[TMP42:%.*]] = trunc i129 [[TMP41]] to i32
-; CHECK-NEXT: br label [[ITOFP_IF_END269]]
-; CHECK: itofp-if-end269:
-; CHECK-NEXT: [[TMP43:%.*]] = phi i32 [ [[TMP34]], [[ITOFP_IF_THEN207]] ], [ [[TMP30]], [[ITOFP_SW_EPILOG6]] ], [ [[TMP40]], [[ITOFP_IF_ELSE8]] ]
-; CHECK-NEXT: [[TMP44:%.*]] = phi i32 [ [[TMP7]], [[ITOFP_IF_THEN207]] ], [ [[TMP8]], [[ITOFP_SW_EPILOG6]] ], [ [[TMP8]], [[ITOFP_IF_ELSE8]] ]
-; CHECK-NEXT: [[TMP45:%.*]] = trunc i129 [[TMP2]] to i32
-; CHECK-NEXT: [[TMP46:%.*]] = and i32 [[TMP45]], -2147483648
-; CHECK-NEXT: [[TMP47:%.*]] = shl i32 [[TMP44]], 23
-; CHECK-NEXT: [[TMP48:%.*]] = add i32 [[TMP47]], 1065353216
-; CHECK-NEXT: [[TMP49:%.*]] = and i32 [[TMP43]], 8388607
-; CHECK-NEXT: [[TMP50:%.*]] = or i32 [[TMP49]], [[TMP46]]
-; CHECK-NEXT: [[TMP51:%.*]] = or i32 [[TMP50]], [[TMP48]]
-; CHECK-NEXT: [[TMP52:%.*]] = bitcast i32 [[TMP51]] to float
-; CHECK-NEXT: br label [[ITOFP_RETURN1]]
-; CHECK: itofp-return1:
-; CHECK-NEXT: [[TMP53:%.*]] = phi float [ [[TMP52]], [[ITOFP_IF_END269]] ], [ 0.000000e+00, [[ITOFP_ENTRYITOFP_ENTRY:%.*]] ]
-; CHECK-NEXT: [[TMP54:%.*]] = insertelement <2 x float> poison, float [[TMP53]], i64 0
-; CHECK-NEXT: [[TMP55:%.*]] = extractelement <2 x i129> [[A]], i64 1
-; CHECK-NEXT: [[TMP111:%.*]] = freeze i129 [[TMP55]]
-; CHECK-NEXT: [[TMP56:%.*]] = icmp eq i129 [[TMP111]], 0
-; CHECK-NEXT: br i1 [[TMP56]], label [[ITOFP_RETURN:%.*]], label [[ITOFP_IF_END:%.*]]
-; CHECK: itofp-if-end:
-; CHECK-NEXT: [[TMP57:%.*]] = ashr i129 [[TMP111]], 128
-; CHECK-NEXT: [[TMP58:%.*]] = xor i129 [[TMP57]], [[TMP111]]
-; CHECK-NEXT: [[TMP59:%.*]] = sub i129 [[TMP58]], [[TMP57]]
-; CHECK-NEXT: [[TMP60:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP59]], i1 true)
-; CHECK-NEXT: [[TMP61:%.*]] = trunc i129 [[TMP60]] to i32
-; CHECK-NEXT: [[TMP62:%.*]] = sub i32 129, [[TMP61]]
-; CHECK-NEXT: [[TMP63:%.*]] = sub i32 128, [[TMP61]]
-; CHECK-NEXT: [[TMP64:%.*]] = icmp sgt i32 [[TMP62]], 24
-; CHECK-NEXT: br i1 [[TMP64]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
-; CHECK: itofp-if-then4:
-; CHECK-NEXT: switch i32 [[TMP62]], label [[ITOFP_SW_DEFAULT:%.*]] [
-; CHECK-NEXT: i32 25, label [[ITOFP_SW_BB:%.*]]
-; CHECK-NEXT: i32 26, label [[ITOFP_SW_EPILOG:%.*]]
-; CHECK-NEXT: ]
-; CHECK: itofp-sw-bb:
-; CHECK-NEXT: [[TMP65:%.*]] = shl i129 [[TMP59]], 1
-; CHECK-NEXT: br label [[ITOFP_SW_EPILOG]]
-; CHECK: itofp-sw-default:
-; CHECK-NEXT: [[TMP66:%.*]] = sub i32 103, [[TMP61]]
-; CHECK-NEXT: [[TMP67:%.*]] = zext i32 [[TMP66]] to i129
-; CHECK-NEXT: [[TMP68:%.*]] = lshr i129 [[TMP59]], [[TMP67]]
-; CHECK-NEXT: [[TMP69:%.*]] = add i32 [[TMP61]], 26
-; CHECK-NEXT: [[TMP70:%.*]] = zext i32 [[TMP69]] to i129
-; CHECK-NEXT: [[TMP71:%.*]] = lshr i129 -1, [[TMP70]]
-; CHECK-NEXT: [[TMP72:%.*]] = and i129 [[TMP71]], [[TMP59]]
-; CHECK-NEXT: [[TMP73:%.*]] = icmp ne i129 [[TMP72]], 0
-; CHECK-NEXT: [[TMP74:%.*]] = zext i1 [[TMP73]] to i129
-; CHECK-NEXT: [[TMP75:%.*]] = or i129 [[TMP68]], [[TMP74]]
-; CHECK-NEXT: br label [[ITOFP_SW_EPILOG]]
-; CHECK: itofp-sw-epilog:
-; CHECK-NEXT: [[TMP76:%.*]] = phi i129 [ [[TMP75]], [[ITOFP_SW_DEFAULT]] ], [ [[TMP59]], [[ITOFP_IF_THEN4]] ], [ [[TMP65]], [[ITOFP_SW_BB]] ]
-; CHECK-NEXT: [[TMP77:%.*]] = trunc i129 [[TMP76]] to i32
-; CHECK-NEXT: [[TMP78:%.*]] = lshr i32 [[TMP77]], 2
-; CHECK-NEXT: [[TMP79:%.*]] = and i32 [[TMP78]], 1
-; CHECK-NEXT: [[TMP80:%.*]] = zext i32 [[TMP79]] to i129
-; CHECK-NEXT: [[TMP81:%.*]] = or i129 [[TMP76]], [[TMP80]]
-; CHECK-NEXT: [[TMP82:%.*]] = add i129 [[TMP81]], 1
-; CHECK-NEXT: [[TMP83:%.*]] = ashr i129 [[TMP82]], 2
-; CHECK-NEXT: [[A3:%.*]] = and i129 [[TMP82]], 67108864
-; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i129 [[A3]], 0
-; CHECK-NEXT: [[TMP85:%.*]] = trunc i129 [[TMP83]] to i32
-; CHECK-NEXT: [[TMP86:%.*]] = lshr i129 [[TMP83]], 32
-; CHECK-NEXT: [[TMP87:%.*]] = trunc i129 [[TMP86]] to i32
-; CHECK-NEXT: br i1 [[TMP84]], label [[ITOFP_IF_END26:%.*]], label [[ITOFP_IF_THEN20:%.*]]
-; CHECK: itofp-if-then20:
-; CHECK-NEXT: [[TMP88:%.*]] = ashr i129 [[TMP82]], 3
-; CHECK-NEXT: [[TMP89:%.*]] = trunc i129 [[TMP88]] to i32
-; CHECK-NEXT: [[TMP90:%.*]] = lshr i129 [[TMP88]], 32
-; CHECK-NEXT: [[TMP91:%.*]] = trunc i129 [[TMP90]] to i32
-; CHECK-NEXT: br label [[ITOFP_IF_END26]]
-; CHECK: itofp-if-else:
-; CHECK-NEXT: [[TMP92:%.*]] = add i32 [[TMP61]], -105
-; CHECK-NEXT: [[TMP93:%.*]] = zext i32 [[TMP92]] to i129
-; CHECK-NEXT: [[TMP94:%.*]] = shl i129 [[TMP59]], [[TMP93]]
-; CHECK-NEXT: [[TMP95:%.*]] = trunc i129 [[TMP94]] to i32
-; CHECK-NEXT: [[TMP96:%.*]] = lshr i129 [[TMP94]], 32
-; CHECK-NEXT: [[TMP97:%.*]] = trunc i129 [[TMP96]] to i32
-; CHECK-NEXT: br label [[ITOFP_IF_END26]]
-; CHECK: itofp-if-end26:
-; CHECK-NEXT: [[TMP98:%.*]] = phi i32 [ [[TMP89]], [[ITOFP_IF_THEN20]] ], [ [[TMP85]], [[ITOFP_SW_EPILOG]] ], [ [[TMP95]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT: [[TMP99:%.*]] = phi i32 [ [[TMP62]], [[ITOFP_IF_THEN20]] ], [ [[TMP63]], [[ITOFP_SW_EPILOG]] ], [ [[TMP63]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT: [[TMP100:%.*]] = trunc i129 [[TMP57]] to i32
-; CHECK-NEXT: [[TMP101:%.*]] = and i32 [[TMP100]], -2147483648
-; CHECK-NEXT: [[TMP102:%.*]] = shl i32 [[TMP99]], 23
-; CHECK-NEXT: [[TMP103:%.*]] = add i32 [[TMP102]], 1065353216
-; CHECK-NEXT: [[TMP104:%.*]] = and i32 [[TMP98]], 8388607
-; CHECK-NEXT: [[TMP105:%.*]] = or i32 [[TMP104]], [[TMP101]]
-; CHECK-NEXT: [[TMP106:%.*]] = or i32 [[TMP105]], [[TMP103]]
-; CHECK-NEXT: [[TMP107:%.*]] = bitcast i32 [[TMP106]] to float
-; CHECK-NEXT: br label [[ITOFP_RETURN]]
-; CHECK: itofp-return:
-; CHECK-NEXT: [[TMP108:%.*]] = phi float [ [[TMP107]], [[ITOFP_IF_END26]] ], [ 0.000000e+00, [[ITOFP_RETURN1]] ]
-; CHECK-NEXT: [[TMP109:%.*]] = insertelement <2 x float> [[TMP54]], float [[TMP108]], i64 1
+; CHECK-NEXT: [[TMP109:%.*]] = sitofp <2 x i129> [[A:%.*]] to <2 x float>
; CHECK-NEXT: ret <2 x float> [[TMP109]]
;
%conv = sitofp <2 x i129> %a to <2 x float>
diff --git a/llvm/test/Transforms/ExpandIRInsts/X86/expand-large-fp-convert-ui129tofp.ll b/llvm/test/Transforms/ExpandIRInsts/X86/expand-large-fp-convert-ui129tofp.ll
index eed61b7c53989..ea161746e49c6 100644
--- a/llvm/test/Transforms/ExpandIRInsts/X86/expand-large-fp-convert-ui129tofp.ll
+++ b/llvm/test/Transforms/ExpandIRInsts/X86/expand-large-fp-convert-ui129tofp.ll
@@ -4,84 +4,7 @@
define half @ui129tohalf(i129 %a) {
; CHECK-LABEL: @ui129tohalf(
-; CHECK-NEXT: itofp-entry:
-; CHECK-NEXT: [[A:%.*]] = freeze i129 [[A1:%.*]]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i129 [[A]], 0
-; CHECK-NEXT: br i1 [[TMP0]], label [[ITOFP_RETURN:%.*]], label [[ITOFP_IF_END:%.*]]
-; CHECK: itofp-if-end:
-; CHECK-NEXT: [[TMP1:%.*]] = ashr i129 [[A]], 128
-; CHECK-NEXT: [[TMP2:%.*]] = xor i129 [[TMP1]], [[A]]
-; CHECK-NEXT: [[TMP3:%.*]] = sub i129 [[TMP2]], [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = call i129 @llvm.ctlz.i129(i129 [[A]], i1 true)
-; CHECK-NEXT: [[TMP5:%.*]] = trunc i129 [[TMP4]] to i32
-; CHECK-NEXT: [[TMP6:%.*]] = sub i32 129, [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = sub i32 128, [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], 24
-; CHECK-NEXT: br i1 [[TMP8]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
-; CHECK: itofp-if-then4:
-; CHECK-NEXT: switch i32 [[TMP6]], label [[ITOFP_SW_DEFAULT:%.*]] [
-; CHECK-NEXT: i32 25, label [[ITOFP_SW_BB:%.*]]
-; CHECK-NEXT: i32 26, label [[ITOFP_SW_EPILOG:%.*]]
-; CHECK-NEXT: ]
-; CHECK: itofp-sw-bb:
-; CHECK-NEXT: [[TMP9:%.*]] = shl i129 [[A]], 1
-; CHECK-NEXT: br label [[ITOFP_SW_EPILOG]]
-; CHECK: itofp-sw-default:
-; CHECK-NEXT: [[TMP10:%.*]] = sub i32 103, [[TMP5]]
-; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i129
-; CHECK-NEXT: [[TMP12:%.*]] = lshr i129 [[A]], [[TMP11]]
-; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP5]], 26
-; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP13]] to i129
-; CHECK-NEXT: [[TMP15:%.*]] = lshr i129 -1, [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = and i129 [[TMP15]], [[A]]
-; CHECK-NEXT: [[TMP17:%.*]] = icmp ne i129 [[TMP16]], 0
-; CHECK-NEXT: [[TMP18:%.*]] = zext i1 [[TMP17]] to i129
-; CHECK-NEXT: [[TMP19:%.*]] = or i129 [[TMP12]], [[TMP18]]
-; CHECK-NEXT: br label [[ITOFP_SW_EPILOG]]
-; CHECK: itofp-sw-epilog:
-; CHECK-NEXT: [[TMP20:%.*]] = phi i129 [ [[TMP19]], [[ITOFP_SW_DEFAULT]] ], [ [[A]], [[ITOFP_IF_THEN4]] ], [ [[TMP9]], [[ITOFP_SW_BB]] ]
-; CHECK-NEXT: [[TMP21:%.*]] = trunc i129 [[TMP20]] to i32
-; CHECK-NEXT: [[TMP22:%.*]] = lshr i32 [[TMP21]], 2
-; CHECK-NEXT: [[TMP23:%.*]] = and i32 [[TMP22]], 1
-; CHECK-NEXT: [[TMP24:%.*]] = zext i32 [[TMP23]] to i129
-; CHECK-NEXT: [[TMP25:%.*]] = or i129 [[TMP20]], [[TMP24]]
-; CHECK-NEXT: [[TMP26:%.*]] = add i129 [[TMP25]], 1
-; CHECK-NEXT: [[TMP27:%.*]] = lshr i129 [[TMP26]], 2
-; CHECK-NEXT: [[A3:%.*]] = and i129 [[TMP26]], 67108864
-; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i129 [[A3]], 0
-; CHECK-NEXT: [[TMP29:%.*]] = trunc i129 [[TMP27]] to i32
-; CHECK-NEXT: [[TMP30:%.*]] = lshr i129 [[TMP27]], 32
-; CHECK-NEXT: [[TMP31:%.*]] = trunc i129 [[TMP30]] to i32
-; CHECK-NEXT: br i1 [[TMP28]], label [[ITOFP_IF_END26:%.*]], label [[ITOFP_IF_THEN20:%.*]]
-; CHECK: itofp-if-then20:
-; CHECK-NEXT: [[TMP32:%.*]] = lshr i129 [[TMP26]], 3
-; CHECK-NEXT: [[TMP33:%.*]] = trunc i129 [[TMP32]] to i32
-; CHECK-NEXT: [[TMP34:%.*]] = lshr i129 [[TMP32]], 32
-; CHECK-NEXT: [[TMP35:%.*]] = trunc i129 [[TMP34]] to i32
-; CHECK-NEXT: br label [[ITOFP_IF_END26]]
-; CHECK: itofp-if-else:
-; CHECK-NEXT: [[TMP36:%.*]] = add i32 [[TMP5]], -105
-; CHECK-NEXT: [[TMP37:%.*]] = zext i32 [[TMP36]] to i129
-; CHECK-NEXT: [[TMP38:%.*]] = shl i129 [[A]], [[TMP37]]
-; CHECK-NEXT: [[TMP39:%.*]] = trunc i129 [[TMP38]] to i32
-; CHECK-NEXT: [[TMP40:%.*]] = lshr i129 [[TMP38]], 32
-; CHECK-NEXT: [[TMP41:%.*]] = trunc i129 [[TMP40]] to i32
-; CHECK-NEXT: br label [[ITOFP_IF_END26]]
-; CHECK: itofp-if-end26:
-; CHECK-NEXT: [[TMP42:%.*]] = phi i32 [ [[TMP33]], [[ITOFP_IF_THEN20]] ], [ [[TMP29]], [[ITOFP_SW_EPILOG]] ], [ [[TMP39]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT: [[TMP43:%.*]] = phi i32 [ [[TMP6]], [[ITOFP_IF_THEN20]] ], [ [[TMP7]], [[ITOFP_SW_EPILOG]] ], [ [[TMP7]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT: [[TMP44:%.*]] = trunc i129 [[TMP1]] to i32
-; CHECK-NEXT: [[TMP45:%.*]] = and i32 [[TMP44]], -2147483648
-; CHECK-NEXT: [[TMP46:%.*]] = shl i32 [[TMP43]], 23
-; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP46]], 1065353216
-; CHECK-NEXT: [[TMP48:%.*]] = and i32 [[TMP42]], 8388607
-; CHECK-NEXT: [[TMP49:%.*]] = or i32 [[TMP48]], [[TMP45]]
-; CHECK-NEXT: [[TMP50:%.*]] = or i32 [[TMP48]], [[TMP47]]
-; CHECK-NEXT: [[TMP51:%.*]] = bitcast i32 [[TMP50]] to float
-; CHECK-NEXT: [[TMP52:%.*]] = fptrunc float [[TMP51]] to half
-; CHECK-NEXT: br label [[ITOFP_RETURN]]
-; CHECK: itofp-return:
-; CHECK-NEXT: [[TMP53:%.*]] = phi half [ [[TMP52]], [[ITOFP_IF_END26]] ], [ 0xH0000, [[ITOFP_ENTRY:%.*]] ]
+; CHECK-NEXT: [[TMP53:%.*]] = uitofp i129 [[A:%.*]] to half
; CHECK-NEXT: ret half [[TMP53]]
;
%conv = uitofp i129 %a to half
@@ -90,83 +13,7 @@ define half @ui129tohalf(i129 %a) {
define float @ui129tofloat(i129 %a) {
; CHECK-LABEL: @ui129tofloat(
-; CHECK-NEXT: itofp-entry:
-; CHECK-NEXT: [[A:%.*]] = freeze i129 [[A1:%.*]]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i129 [[A]], 0
-; CHECK-NEXT: br i1 [[TMP0]], label [[ITOFP_RETURN:%.*]], label [[ITOFP_IF_END:%.*]]
-; CHECK: itofp-if-end:
-; CHECK-NEXT: [[TMP1:%.*]] = ashr i129 [[A]], 128
-; CHECK-NEXT: [[TMP2:%.*]] = xor i129 [[TMP1]], [[A]]
-; CHECK-NEXT: [[TMP3:%.*]] = sub i129 [[TMP2]], [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = call i129 @llvm.ctlz.i129(i129 [[A]], i1 true)
-; CHECK-NEXT: [[TMP5:%.*]] = trunc i129 [[TMP4]] to i32
-; CHECK-NEXT: [[TMP6:%.*]] = sub i32 129, [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = sub i32 128, [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], 24
-; CHECK-NEXT: br i1 [[TMP8]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
-; CHECK: itofp-if-then4:
-; CHECK-NEXT: switch i32 [[TMP6]], label [[ITOFP_SW_DEFAULT:%.*]] [
-; CHECK-NEXT: i32 25, label [[ITOFP_SW_BB:%.*]]
-; CHECK-NEXT: i32 26, label [[ITOFP_SW_EPILOG:%.*]]
-; CHECK-NEXT: ]
-; CHECK: itofp-sw-bb:
-; CHECK-NEXT: [[TMP9:%.*]] = shl i129 [[A]], 1
-; CHECK-NEXT: br label [[ITOFP_SW_EPILOG]]
-; CHECK: itofp-sw-default:
-; CHECK-NEXT: [[TMP10:%.*]] = sub i32 103, [[TMP5]]
-; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i129
-; CHECK-NEXT: [[TMP12:%.*]] = lshr i129 [[A]], [[TMP11]]
-; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP5]], 26
-; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP13]] to i129
-; CHECK-NEXT: [[TMP15:%.*]] = lshr i129 -1, [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = and i129 [[TMP15]], [[A]]
-; CHECK-NEXT: [[TMP17:%.*]] = icmp ne i129 [[TMP16]], 0
-; CHECK-NEXT: [[TMP18:%.*]] = zext i1 [[TMP17]] to i129
-; CHECK-NEXT: [[TMP19:%.*]] = or i129 [[TMP12]], [[TMP18]]
-; CHECK-NEXT: br label [[ITOFP_SW_EPILOG]]
-; CHECK: itofp-sw-epilog:
-; CHECK-NEXT: [[TMP20:%.*]] = phi i129 [ [[TMP19]], [[ITOFP_SW_DEFAULT]] ], [ [[A]], [[ITOFP_IF_THEN4]] ], [ [[TMP9]], [[ITOFP_SW_BB]] ]
-; CHECK-NEXT: [[TMP21:%.*]] = trunc i129 [[TMP20]] to i32
-; CHECK-NEXT: [[TMP22:%.*]] = lshr i32 [[TMP21]], 2
-; CHECK-NEXT: [[TMP23:%.*]] = and i32 [[TMP22]], 1
-; CHECK-NEXT: [[TMP24:%.*]] = zext i32 [[TMP23]] to i129
-; CHECK-NEXT: [[TMP25:%.*]] = or i129 [[TMP20]], [[TMP24]]
-; CHECK-NEXT: [[TMP26:%.*]] = add i129 [[TMP25]], 1
-; CHECK-NEXT: [[TMP27:%.*]] = lshr i129 [[TMP26]], 2
-; CHECK-NEXT: [[A3:%.*]] = and i129 [[TMP26]], 67108864
-; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i129 [[A3]], 0
-; CHECK-NEXT: [[TMP29:%.*]] = trunc i129 [[TMP27]] to i32
-; CHECK-NEXT: [[TMP30:%.*]] = lshr i129 [[TMP27]], 32
-; CHECK-NEXT: [[TMP31:%.*]] = trunc i129 [[TMP30]] to i32
-; CHECK-NEXT: br i1 [[TMP28]], label [[ITOFP_IF_END26:%.*]], label [[ITOFP_IF_THEN20:%.*]]
-; CHECK: itofp-if-then20:
-; CHECK-NEXT: [[TMP32:%.*]] = lshr i129 [[TMP26]], 3
-; CHECK-NEXT: [[TMP33:%.*]] = trunc i129 [[TMP32]] to i32
-; CHECK-NEXT: [[TMP34:%.*]] = lshr i129 [[TMP32]], 32
-; CHECK-NEXT: [[TMP35:%.*]] = trunc i129 [[TMP34]] to i32
-; CHECK-NEXT: br label [[ITOFP_IF_END26]]
-; CHECK: itofp-if-else:
-; CHECK-NEXT: [[TMP36:%.*]] = add i32 [[TMP5]], -105
-; CHECK-NEXT: [[TMP37:%.*]] = zext i32 [[TMP36]] to i129
-; CHECK-NEXT: [[TMP38:%.*]] = shl i129 [[A]], [[TMP37]]
-; CHECK-NEXT: [[TMP39:%.*]] = trunc i129 [[TMP38]] to i32
-; CHECK-NEXT: [[TMP40:%.*]] = lshr i129 [[TMP38]], 32
-; CHECK-NEXT: [[TMP41:%.*]] = trunc i129 [[TMP40]] to i32
-; CHECK-NEXT: br label [[ITOFP_IF_END26]]
-; CHECK: itofp-if-end26:
-; CHECK-NEXT: [[TMP42:%.*]] = phi i32 [ [[TMP33]], [[ITOFP_IF_THEN20]] ], [ [[TMP29]], [[ITOFP_SW_EPILOG]] ], [ [[TMP39]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT: [[TMP43:%.*]] = phi i32 [ [[TMP6]], [[ITOFP_IF_THEN20]] ], [ [[TMP7]], [[ITOFP_SW_EPILOG]] ], [ [[TMP7]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT: [[TMP44:%.*]] = trunc i129 [[TMP1]] to i32
-; CHECK-NEXT: [[TMP45:%.*]] = and i32 [[TMP44]], -2147483648
-; CHECK-NEXT: [[TMP46:%.*]] = shl i32 [[TMP43]], 23
-; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP46]], 1065353216
-; CHECK-NEXT: [[TMP48:%.*]] = and i32 [[TMP42]], 8388607
-; CHECK-NEXT: [[TMP49:%.*]] = or i32 [[TMP48]], [[TMP45]]
-; CHECK-NEXT: [[TMP50:%.*]] = or i32 [[TMP48]], [[TMP47]]
-; CHECK-NEXT: [[TMP51:%.*]] = bitcast i32 [[TMP50]] to float
-; CHECK-NEXT: br label [[ITOFP_RETURN]]
-; CHECK: itofp-return:
-; CHECK-NEXT: [[TMP52:%.*]] = phi float [ [[TMP51]], [[ITOFP_IF_END26]] ], [ 0.000000e+00, [[ITOFP_ENTRY:%.*]] ]
+; CHECK-NEXT: [[TMP52:%.*]] = uitofp i129 [[A:%.*]] to float
; CHECK-NEXT: ret float [[TMP52]]
;
%conv = uitofp i129 %a to float
@@ -175,88 +22,7 @@ define float @ui129tofloat(i129 %a) {
define double @ui129todouble(i129 %a) {
; CHECK-LABEL: @ui129todouble(
-; CHECK-NEXT: itofp-entry:
-; CHECK-NEXT: [[A:%.*]] = freeze i129 [[A1:%.*]]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i129 [[A]], 0
-; CHECK-NEXT: br i1 [[TMP0]], label [[ITOFP_RETURN:%.*]], label [[ITOFP_IF_END:%.*]]
-; CHECK: itofp-if-end:
-; CHECK-NEXT: [[TMP1:%.*]] = ashr i129 [[A]], 128
-; CHECK-NEXT: [[TMP2:%.*]] = xor i129 [[TMP1]], [[A]]
-; CHECK-NEXT: [[TMP3:%.*]] = sub i129 [[TMP2]], [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = call i129 @llvm.ctlz.i129(i129 [[A]], i1 true)
-; CHECK-NEXT: [[TMP5:%.*]] = trunc i129 [[TMP4]] to i32
-; CHECK-NEXT: [[TMP6:%.*]] = sub i32 129, [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = sub i32 128, [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], 53
-; CHECK-NEXT: br i1 [[TMP8]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
-; CHECK: itofp-if-then4:
-; CHECK-NEXT: switch i32 [[TMP6]], label [[ITOFP_SW_DEFAULT:%.*]] [
-; CHECK-NEXT: i32 54, label [[ITOFP_SW_BB:%.*]]
-; CHECK-NEXT: i32 55, label [[ITOFP_SW_EPILOG:%.*]]
-; CHECK-NEXT: ]
-; CHECK: itofp-sw-bb:
-; CHECK-NEXT: [[TMP9:%.*]] = shl i129 [[A]], 1
-; CHECK-NEXT: br label [[ITOFP_SW_EPILOG]]
-; CHECK: itofp-sw-default:
-; CHECK-NEXT: [[TMP10:%.*]] = sub i32 74, [[TMP5]]
-; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i129
-; CHECK-NEXT: [[TMP12:%.*]] = lshr i129 [[A]], [[TMP11]]
-; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP5]], 55
-; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP13]] to i129
-; CHECK-NEXT: [[TMP15:%.*]] = lshr i129 -1, [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = and i129 [[TMP15]], [[A]]
-; CHECK-NEXT: [[TMP17:%.*]] = icmp ne i129 [[TMP16]], 0
-; CHECK-NEXT: [[TMP18:%.*]] = zext i1 [[TMP17]] to i129
-; CHECK-NEXT: [[TMP19:%.*]] = or i129 [[TMP12]], [[TMP18]]
-; CHECK-NEXT: br label [[ITOFP_SW_EPILOG]]
-; CHECK: itofp-sw-epilog:
-; CHECK-NEXT: [[TMP20:%.*]] = phi i129 [ [[TMP19]], [[ITOFP_SW_DEFAULT]] ], [ [[A]], [[ITOFP_IF_THEN4]] ], [ [[TMP9]], [[ITOFP_SW_BB]] ]
-; CHECK-NEXT: [[TMP21:%.*]] = trunc i129 [[TMP20]] to i32
-; CHECK-NEXT: [[TMP22:%.*]] = lshr i32 [[TMP21]], 2
-; CHECK-NEXT: [[TMP23:%.*]] = and i32 [[TMP22]], 1
-; CHECK-NEXT: [[TMP24:%.*]] = zext i32 [[TMP23]] to i129
-; CHECK-NEXT: [[TMP25:%.*]] = or i129 [[TMP20]], [[TMP24]]
-; CHECK-NEXT: [[TMP26:%.*]] = add i129 [[TMP25]], 1
-; CHECK-NEXT: [[TMP27:%.*]] = lshr i129 [[TMP26]], 2
-; CHECK-NEXT: [[A3:%.*]] = and i129 [[TMP26]], 36028797018963968
-; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i129 [[A3]], 0
-; CHECK-NEXT: [[TMP29:%.*]] = trunc i129 [[TMP27]] to i64
-; CHECK-NEXT: [[TMP30:%.*]] = lshr i129 [[TMP27]], 32
-; CHECK-NEXT: [[TMP31:%.*]] = trunc i129 [[TMP30]] to i32
-; CHECK-NEXT: br i1 [[TMP28]], label [[ITOFP_IF_END26:%.*]], label [[ITOFP_IF_THEN20:%.*]]
-; CHECK: itofp-if-then20:
-; CHECK-NEXT: [[TMP32:%.*]] = lshr i129 [[TMP26]], 3
-; CHECK-NEXT: [[TMP33:%.*]] = trunc i129 [[TMP32]] to i64
-; CHECK-NEXT: [[TMP34:%.*]] = lshr i129 [[TMP32]], 32
-; CHECK-NEXT: [[TMP35:%.*]] = trunc i129 [[TMP34]] to i32
-; CHECK-NEXT: br label [[ITOFP_IF_END26]]
-; CHECK: itofp-if-else:
-; CHECK-NEXT: [[TMP36:%.*]] = add i32 [[TMP5]], -76
-; CHECK-NEXT: [[TMP37:%.*]] = zext i32 [[TMP36]] to i129
-; CHECK-NEXT: [[TMP38:%.*]] = shl i129 [[A]], [[TMP37]]
-; CHECK-NEXT: [[TMP39:%.*]] = trunc i129 [[TMP38]] to i64
-; CHECK-NEXT: [[TMP40:%.*]] = lshr i129 [[TMP38]], 32
-; CHECK-NEXT: [[TMP41:%.*]] = trunc i129 [[TMP40]] to i32
-; CHECK-NEXT: br label [[ITOFP_IF_END26]]
-; CHECK: itofp-if-end26:
-; CHECK-NEXT: [[TMP42:%.*]] = phi i64 [ [[TMP33]], [[ITOFP_IF_THEN20]] ], [ [[TMP29]], [[ITOFP_SW_EPILOG]] ], [ [[TMP39]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT: [[TMP43:%.*]] = phi i32 [ [[TMP35]], [[ITOFP_IF_THEN20]] ], [ [[TMP31]], [[ITOFP_SW_EPILOG]] ], [ [[TMP41]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT: [[TMP44:%.*]] = phi i32 [ [[TMP6]], [[ITOFP_IF_THEN20]] ], [ [[TMP7]], [[ITOFP_SW_EPILOG]] ], [ [[TMP7]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT: [[TMP45:%.*]] = trunc i129 [[TMP1]] to i32
-; CHECK-NEXT: [[TMP46:%.*]] = and i32 [[TMP45]], -2147483648
-; CHECK-NEXT: [[TMP47:%.*]] = shl i32 [[TMP44]], 20
-; CHECK-NEXT: [[TMP48:%.*]] = add i32 [[TMP47]], 1072693248
-; CHECK-NEXT: [[TMP49:%.*]] = and i32 [[TMP43]], 1048575
-; CHECK-NEXT: [[TMP50:%.*]] = or i32 [[TMP49]], [[TMP46]]
-; CHECK-NEXT: [[TMP51:%.*]] = or i32 [[TMP49]], [[TMP48]]
-; CHECK-NEXT: [[TMP52:%.*]] = zext i32 [[TMP51]] to i64
-; CHECK-NEXT: [[TMP53:%.*]] = shl i64 [[TMP52]], 32
-; CHECK-NEXT: [[TMP54:%.*]] = and i64 [[TMP42]], 4294967295
-; CHECK-NEXT: [[TMP55:%.*]] = or i64 [[TMP53]], [[TMP54]]
-; CHECK-NEXT: [[TMP56:%.*]] = bitcast i64 [[TMP55]] to double
-; CHECK-NEXT: br label [[ITOFP_RETURN]]
-; CHECK: itofp-return:
-; CHECK-NEXT: [[TMP57:%.*]] = phi double [ [[TMP56]], [[ITOFP_IF_END26]] ], [ 0.000000e+00, [[ITOFP_ENTRY:%.*]] ]
+; CHECK-NEXT: [[TMP57:%.*]] = uitofp i129 [[A:%.*]] to double
; CHECK-NEXT: ret double [[TMP57]]
;
%conv = uitofp i129 %a to double
@@ -265,83 +31,7 @@ define double @ui129todouble(i129 %a) {
define x86_fp80 @ui129tox86_fp80(i129 %a) {
; CHECK-LABEL: @ui129tox86_fp80(
-; CHECK-NEXT: itofp-entry:
-; CHECK-NEXT: [[A:%.*]] = freeze i129 [[A1:%.*]]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i129 [[A]], 0
-; CHECK-NEXT: br i1 [[TMP0]], label [[ITOFP_RETURN:%.*]], label [[ITOFP_IF_END:%.*]]
-; CHECK: itofp-if-end:
-; CHECK-NEXT: [[TMP1:%.*]] = ashr i129 [[A]], 128
-; CHECK-NEXT: [[TMP2:%.*]] = xor i129 [[TMP1]], [[A]]
-; CHECK-NEXT: [[TMP3:%.*]] = sub i129 [[TMP2]], [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = call i129 @llvm.ctlz.i129(i129 [[A]], i1 true)
-; CHECK-NEXT: [[TMP5:%.*]] = trunc i129 [[TMP4]] to i32
-; CHECK-NEXT: [[TMP6:%.*]] = sub i129 129, [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = sub i129 128, [[TMP4]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt i129 [[TMP6]], 113
-; CHECK-NEXT: br i1 [[TMP8]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
-; CHECK: itofp-if-then4:
-; CHECK-NEXT: switch i129 [[TMP6]], label [[ITOFP_SW_DEFAULT:%.*]] [
-; CHECK-NEXT: i129 114, label [[ITOFP_SW_BB:%.*]]
-; CHECK-NEXT: i129 115, label [[ITOFP_SW_EPILOG:%.*]]
-; CHECK-NEXT: ]
-; CHECK: itofp-sw-bb:
-; CHECK-NEXT: [[TMP9:%.*]] = shl i129 [[A]], 1
-; CHECK-NEXT: br label [[ITOFP_SW_EPILOG]]
-; CHECK: itofp-sw-default:
-; CHECK-NEXT: [[TMP10:%.*]] = sub i129 14, [[TMP4]]
-; CHECK-NEXT: [[TMP11:%.*]] = lshr i129 [[A]], [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = add i129 [[TMP4]], 115
-; CHECK-NEXT: [[TMP13:%.*]] = lshr i129 -1, [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = and i129 [[TMP13]], [[A]]
-; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i129 [[TMP14]], 0
-; CHECK-NEXT: [[TMP16:%.*]] = zext i1 [[TMP15]] to i129
-; CHECK-NEXT: [[TMP17:%.*]] = or i129 [[TMP11]], [[TMP16]]
-; CHECK-NEXT: br label [[ITOFP_SW_EPILOG]]
-; CHECK: itofp-sw-epilog:
-; CHECK-NEXT: [[TMP18:%.*]] = phi i129 [ [[TMP17]], [[ITOFP_SW_DEFAULT]] ], [ [[A]], [[ITOFP_IF_THEN4]] ], [ [[TMP9]], [[ITOFP_SW_BB]] ]
-; CHECK-NEXT: [[TMP19:%.*]] = trunc i129 [[TMP18]] to i32
-; CHECK-NEXT: [[TMP20:%.*]] = lshr i32 [[TMP19]], 2
-; CHECK-NEXT: [[TMP21:%.*]] = and i32 [[TMP20]], 1
-; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP21]] to i129
-; CHECK-NEXT: [[TMP23:%.*]] = or i129 [[TMP18]], [[TMP22]]
-; CHECK-NEXT: [[TMP24:%.*]] = add i129 [[TMP23]], 1
-; CHECK-NEXT: [[TMP25:%.*]] = lshr i129 [[TMP24]], 2
-; CHECK-NEXT: [[A3:%.*]] = and i129 [[TMP24]], 41538374868278621028243970633760768
-; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i129 [[A3]], 0
-; CHECK-NEXT: [[TMP27:%.*]] = trunc i129 [[TMP25]] to i128
-; CHECK-NEXT: [[TMP28:%.*]] = lshr i129 [[TMP25]], 32
-; CHECK-NEXT: [[TMP29:%.*]] = trunc i129 [[TMP7]] to i64
-; CHECK-NEXT: br i1 [[TMP26]], label [[ITOFP_IF_END26:%.*]], label [[ITOFP_IF_THEN20:%.*]]
-; CHECK: itofp-if-then20:
-; CHECK-NEXT: [[TMP30:%.*]] = lshr i129 [[TMP24]], 3
-; CHECK-NEXT: [[TMP31:%.*]] = trunc i129 [[TMP30]] to i128
-; CHECK-NEXT: [[TMP32:%.*]] = lshr i129 [[TMP30]], 32
-; CHECK-NEXT: [[TMP33:%.*]] = trunc i129 [[TMP6]] to i64
-; CHECK-NEXT: br label [[ITOFP_IF_END26]]
-; CHECK: itofp-if-else:
-; CHECK-NEXT: [[TMP34:%.*]] = add i129 [[TMP4]], -16
-; CHECK-NEXT: [[TMP35:%.*]] = shl i129 [[A]], [[TMP34]]
-; CHECK-NEXT: [[TMP36:%.*]] = trunc i129 [[TMP35]] to i128
-; CHECK-NEXT: [[TMP37:%.*]] = lshr i129 [[TMP35]], 32
-; CHECK-NEXT: [[TMP38:%.*]] = trunc i129 [[TMP7]] to i64
-; CHECK-NEXT: br label [[ITOFP_IF_END26]]
-; CHECK: itofp-if-end26:
-; CHECK-NEXT: [[TMP39:%.*]] = phi i128 [ [[TMP31]], [[ITOFP_IF_THEN20]] ], [ [[TMP27]], [[ITOFP_SW_EPILOG]] ], [ [[TMP36]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT: [[TMP40:%.*]] = phi i64 [ [[TMP33]], [[ITOFP_IF_THEN20]] ], [ [[TMP29]], [[ITOFP_SW_EPILOG]] ], [ [[TMP38]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT: [[AND29:%.*]] = and i129 [[TMP1]], 9223372036854775808
-; CHECK-NEXT: [[TMP41:%.*]] = shl i64 [[TMP40]], 48
-; CHECK-NEXT: [[TMP42:%.*]] = add i64 [[TMP41]], 4611404543450677248
-; CHECK-NEXT: [[TMP43:%.*]] = zext i64 [[TMP42]] to i128
-; CHECK-NEXT: [[TMP44:%.*]] = trunc i129 [[AND29]] to i128
-; CHECK-NEXT: [[TMP45:%.*]] = or i128 [[TMP44]], [[TMP43]]
-; CHECK-NEXT: [[TMP46:%.*]] = shl i128 [[TMP45]], 64
-; CHECK-NEXT: [[TMP47:%.*]] = and i128 [[TMP39]], 5192296858534827628530496329220095
-; CHECK-NEXT: [[TMP48:%.*]] = or i128 [[TMP46]], [[TMP47]]
-; CHECK-NEXT: [[TMP49:%.*]] = bitcast i128 [[TMP48]] to fp128
-; CHECK-NEXT: [[TMP50:%.*]] = fptrunc fp128 [[TMP49]] to x86_fp80
-; CHECK-NEXT: br label [[ITOFP_RETURN]]
-; CHECK: itofp-return:
-; CHECK-NEXT: [[TMP51:%.*]] = phi x86_fp80 [ [[TMP50]], [[ITOFP_IF_END26]] ], [ 0xK00000000000000000000, [[ITOFP_ENTRY:%.*]] ]
+; CHECK-NEXT: [[TMP51:%.*]] = uitofp i129 [[A:%.*]] to x86_fp80
; CHECK-NEXT: ret x86_fp80 [[TMP51]]
;
%conv = uitofp i129 %a to x86_fp80
@@ -350,82 +40,7 @@ define x86_fp80 @ui129tox86_fp80(i129 %a) {
define fp128 @ui129tofp128(i129 %a) {
; CHECK-LABEL: @ui129tofp128(
-; CHECK-NEXT: itofp-entry:
-; CHECK-NEXT: [[A:%.*]] = freeze i129 [[A1:%.*]]
-; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i129 [[A]], 0
-; CHECK-NEXT: br i1 [[TMP0]], label [[ITOFP_RETURN:%.*]], label [[ITOFP_IF_END:%.*]]
-; CHECK: itofp-if-end:
-; CHECK-NEXT: [[TMP1:%.*]] = ashr i129 [[A]], 128
-; CHECK-NEXT: [[TMP2:%.*]] = xor i129 [[TMP1]], [[A]]
-; CHECK-NEXT: [[TMP3:%.*]] = sub i129 [[TMP2]], [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = call i129 @llvm.ctlz.i129(i129 [[A]], i1 true)
-; CHECK-NEXT: [[TMP5:%.*]] = trunc i129 [[TMP4]] to i32
-; CHECK-NEXT: [[TMP6:%.*]] = sub i129 129, [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = sub i129 128, [[TMP4]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt i129 [[TMP6]], 113
-; CHECK-NEXT: br i1 [[TMP8]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
-; CHECK: itofp-if-then4:
-; CHECK-NEXT: switch i129 [[TMP6]], label [[ITOFP_SW_DEFAULT:%.*]] [
-; CHECK-NEXT: i129 114, label [[ITOFP_SW_BB:%.*]]
-; CHECK-NEXT: i129 115, label [[ITOFP_SW_EPILOG:%.*]]
-; CHECK-NEXT: ]
-; CHECK: itofp-sw-bb:
-; CHECK-NEXT: [[TMP9:%.*]] = shl i129 [[A]], 1
-; CHECK-NEXT: br label [[ITOFP_SW_EPILOG]]
-; CHECK: itofp-sw-default:
-; CHECK-NEXT: [[TMP10:%.*]] = sub i129 14, [[TMP4]]
-; CHECK-NEXT: [[TMP11:%.*]] = lshr i129 [[A]], [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = add i129 [[TMP4]], 115
-; CHECK-NEXT: [[TMP13:%.*]] = lshr i129 -1, [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = and i129 [[TMP13]], [[A]]
-; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i129 [[TMP14]], 0
-; CHECK-NEXT: [[TMP16:%.*]] = zext i1 [[TMP15]] to i129
-; CHECK-NEXT: [[TMP17:%.*]] = or i129 [[TMP11]], [[TMP16]]
-; CHECK-NEXT: br label [[ITOFP_SW_EPILOG]]
-; CHECK: itofp-sw-epilog:
-; CHECK-NEXT: [[TMP18:%.*]] = phi i129 [ [[TMP17]], [[ITOFP_SW_DEFAULT]] ], [ [[A]], [[ITOFP_IF_THEN4]] ], [ [[TMP9]], [[ITOFP_SW_BB]] ]
-; CHECK-NEXT: [[TMP19:%.*]] = trunc i129 [[TMP18]] to i32
-; CHECK-NEXT: [[TMP20:%.*]] = lshr i32 [[TMP19]], 2
-; CHECK-NEXT: [[TMP21:%.*]] = and i32 [[TMP20]], 1
-; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP21]] to i129
-; CHECK-NEXT: [[TMP23:%.*]] = or i129 [[TMP18]], [[TMP22]]
-; CHECK-NEXT: [[TMP24:%.*]] = add i129 [[TMP23]], 1
-; CHECK-NEXT: [[TMP25:%.*]] = lshr i129 [[TMP24]], 2
-; CHECK-NEXT: [[A3:%.*]] = and i129 [[TMP24]], 41538374868278621028243970633760768
-; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i129 [[A3]], 0
-; CHECK-NEXT: [[TMP27:%.*]] = trunc i129 [[TMP25]] to i128
-; CHECK-NEXT: [[TMP28:%.*]] = lshr i129 [[TMP25]], 32
-; CHECK-NEXT: [[TMP29:%.*]] = trunc i129 [[TMP7]] to i64
-; CHECK-NEXT: br i1 [[TMP26]], label [[ITOFP_IF_END26:%.*]], label [[ITOFP_IF_THEN20:%.*]]
-; CHECK: itofp-if-then20:
-; CHECK-NEXT: [[TMP30:%.*]] = lshr i129 [[TMP24]], 3
-; CHECK-NEXT: [[TMP31:%.*]] = trunc i129 [[TMP30]] to i128
-; CHECK-NEXT: [[TMP32:%.*]] = lshr i129 [[TMP30]], 32
-; CHECK-NEXT: [[TMP33:%.*]] = trunc i129 [[TMP6]] to i64
-; CHECK-NEXT: br label [[ITOFP_IF_END26]]
-; CHECK: itofp-if-else:
-; CHECK-NEXT: [[TMP34:%.*]] = add i129 [[TMP4]], -16
-; CHECK-NEXT: [[TMP35:%.*]] = shl i129 [[A]], [[TMP34]]
-; CHECK-NEXT: [[TMP36:%.*]] = trunc i129 [[TMP35]] to i128
-; CHECK-NEXT: [[TMP37:%.*]] = lshr i129 [[TMP35]], 32
-; CHECK-NEXT: [[TMP38:%.*]] = trunc i129 [[TMP7]] to i64
-; CHECK-NEXT: br label [[ITOFP_IF_END26]]
-; CHECK: itofp-if-end26:
-; CHECK-NEXT: [[TMP39:%.*]] = phi i128 [ [[TMP31]], [[ITOFP_IF_THEN20]] ], [ [[TMP27]], [[ITOFP_SW_EPILOG]] ], [ [[TMP36]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT: [[TMP40:%.*]] = phi i64 [ [[TMP33]], [[ITOFP_IF_THEN20]] ], [ [[TMP29]], [[ITOFP_SW_EPILOG]] ], [ [[TMP38]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT: [[AND29:%.*]] = and i129 [[TMP1]], 9223372036854775808
-; CHECK-NEXT: [[TMP41:%.*]] = shl i64 [[TMP40]], 48
-; CHECK-NEXT: [[TMP42:%.*]] = add i64 [[TMP41]], 4611404543450677248
-; CHECK-NEXT: [[TMP43:%.*]] = zext i64 [[TMP42]] to i128
-; CHECK-NEXT: [[TMP44:%.*]] = trunc i129 [[AND29]] to i128
-; CHECK-NEXT: [[TMP45:%.*]] = or i128 [[TMP44]], [[TMP43]]
-; CHECK-NEXT: [[TMP46:%.*]] = shl i128 [[TMP45]], 64
-; CHECK-NEXT: [[TMP47:%.*]] = and i128 [[TMP39]], 5192296858534827628530496329220095
-; CHECK-NEXT: [[TMP48:%.*]] = or i128 [[TMP46]], [[TMP47]]
-; CHECK-NEXT: [[TMP49:%.*]] = bitcast i128 [[TMP48]] to fp128
-; CHECK-NEXT: br label [[ITOFP_RETURN]]
-; CHECK: itofp-return:
-; CHECK-NEXT: [[TMP50:%.*]] = phi fp128 [ [[TMP49]], [[ITOFP_IF_END26]] ], [ 0xL00000000000000000000000000000000, [[ITOFP_ENTRY:%.*]] ]
+; CHECK-NEXT: [[TMP50:%.*]] = uitofp i129 [[A:%.*]] to fp128
; CHECK-NEXT: ret fp128 [[TMP50]]
;
%conv = uitofp i129 %a to fp128
@@ -434,163 +49,7 @@ define fp128 @ui129tofp128(i129 %a) {
define <2 x float> @ui129tofloatv2(<2 x i129> %a) {
; CHECK-LABEL: @ui129tofloatv2(
-; CHECK-NEXT: itofp-entryitofp-entry:
-; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i129> [[A:%.*]], i64 0
-; CHECK-NEXT: [[TMP10:%.*]] = freeze i129 [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i129 [[TMP10]], 0
-; CHECK-NEXT: br i1 [[TMP1]], label [[ITOFP_RETURN1:%.*]], label [[ITOFP_IF_END2:%.*]]
-; CHECK: itofp-if-end2:
-; CHECK-NEXT: [[TMP2:%.*]] = ashr i129 [[TMP10]], 128
-; CHECK-NEXT: [[TMP3:%.*]] = xor i129 [[TMP2]], [[TMP10]]
-; CHECK-NEXT: [[TMP4:%.*]] = sub i129 [[TMP3]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP10]], i1 true)
-; CHECK-NEXT: [[TMP6:%.*]] = trunc i129 [[TMP5]] to i32
-; CHECK-NEXT: [[TMP7:%.*]] = sub i32 129, [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = sub i32 128, [[TMP6]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp sgt i32 [[TMP7]], 24
-; CHECK-NEXT: br i1 [[TMP9]], label [[ITOFP_IF_THEN43:%.*]], label [[ITOFP_IF_ELSE8:%.*]]
-; CHECK: itofp-if-then43:
-; CHECK-NEXT: switch i32 [[TMP7]], label [[ITOFP_SW_DEFAULT5:%.*]] [
-; CHECK-NEXT: i32 25, label [[ITOFP_SW_BB4:%.*]]
-; CHECK-NEXT: i32 26, label [[ITOFP_SW_EPILOG6:%.*]]
-; CHECK-NEXT: ]
-; CHECK: itofp-sw-bb4:
-; CHECK-NEXT: [[TMP65:%.*]] = shl i129 [[TMP10]], 1
-; CHECK-NEXT: br label [[ITOFP_SW_EPILOG6]]
-; CHECK: itofp-sw-default5:
-; CHECK-NEXT: [[TMP11:%.*]] = sub i32 103, [[TMP6]]
-; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP11]] to i129
-; CHECK-NEXT: [[TMP13:%.*]] = lshr i129 [[TMP10]], [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP6]], 26
-; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i129
-; CHECK-NEXT: [[TMP16:%.*]] = lshr i129 -1, [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = and i129 [[TMP16]], [[TMP10]]
-; CHECK-NEXT: [[TMP18:%.*]] = icmp ne i129 [[TMP17]], 0
-; CHECK-NEXT: [[TMP19:%.*]] = zext i1 [[TMP18]] to i129
-; CHECK-NEXT: [[TMP20:%.*]] = or i129 [[TMP13]], [[TMP19]]
-; CHECK-NEXT: br label [[ITOFP_SW_EPILOG6]]
-; CHECK: itofp-sw-epilog6:
-; CHECK-NEXT: [[TMP21:%.*]] = phi i129 [ [[TMP20]], [[ITOFP_SW_DEFAULT5]] ], [ [[TMP10]], [[ITOFP_IF_THEN43]] ], [ [[TMP65]], [[ITOFP_SW_BB4]] ]
-; CHECK-NEXT: [[TMP22:%.*]] = trunc i129 [[TMP21]] to i32
-; CHECK-NEXT: [[TMP23:%.*]] = lshr i32 [[TMP22]], 2
-; CHECK-NEXT: [[TMP24:%.*]] = and i32 [[TMP23]], 1
-; CHECK-NEXT: [[TMP25:%.*]] = zext i32 [[TMP24]] to i129
-; CHECK-NEXT: [[TMP26:%.*]] = or i129 [[TMP21]], [[TMP25]]
-; CHECK-NEXT: [[TMP27:%.*]] = add i129 [[TMP26]], 1
-; CHECK-NEXT: [[TMP28:%.*]] = lshr i129 [[TMP27]], 2
-; CHECK-NEXT: [[A310:%.*]] = and i129 [[TMP27]], 67108864
-; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i129 [[A310]], 0
-; CHECK-NEXT: [[TMP30:%.*]] = trunc i129 [[TMP28]] to i32
-; CHECK-NEXT: [[TMP31:%.*]] = lshr i129 [[TMP28]], 32
-; CHECK-NEXT: [[TMP32:%.*]] = trunc i129 [[TMP31]] to i32
-; CHECK-NEXT: br i1 [[TMP29]], label [[ITOFP_IF_END269:%.*]], label [[ITOFP_IF_THEN207:%.*]]
-; CHECK: itofp-if-then207:
-; CHECK-NEXT: [[TMP33:%.*]] = lshr i129 [[TMP27]], 3
-; CHECK-NEXT: [[TMP34:%.*]] = trunc i129 [[TMP33]] to i32
-; CHECK-NEXT: [[TMP35:%.*]] = lshr i129 [[TMP33]], 32
-; CHECK-NEXT: [[TMP36:%.*]] = trunc i129 [[TMP35]] to i32
-; CHECK-NEXT: br label [[ITOFP_IF_END269]]
-; CHECK: itofp-if-else8:
-; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP6]], -105
-; CHECK-NEXT: [[TMP38:%.*]] = zext i32 [[TMP37]] to i129
-; CHECK-NEXT: [[TMP39:%.*]] = shl i129 [[TMP10]], [[TMP38]]
-; CHECK-NEXT: [[TMP40:%.*]] = trunc i129 [[TMP39]] to i32
-; CHECK-NEXT: [[TMP41:%.*]] = lshr i129 [[TMP39]], 32
-; CHECK-NEXT: [[TMP42:%.*]] = trunc i129 [[TMP41]] to i32
-; CHECK-NEXT: br label [[ITOFP_IF_END269]]
-; CHECK: itofp-if-end269:
-; CHECK-NEXT: [[TMP43:%.*]] = phi i32 [ [[TMP34]], [[ITOFP_IF_THEN207]] ], [ [[TMP30]], [[ITOFP_SW_EPILOG6]] ], [ [[TMP40]], [[ITOFP_IF_ELSE8]] ]
-; CHECK-NEXT: [[TMP44:%.*]] = phi i32 [ [[TMP7]], [[ITOFP_IF_THEN207]] ], [ [[TMP8]], [[ITOFP_SW_EPILOG6]] ], [ [[TMP8]], [[ITOFP_IF_ELSE8]] ]
-; CHECK-NEXT: [[TMP45:%.*]] = trunc i129 [[TMP2]] to i32
-; CHECK-NEXT: [[TMP46:%.*]] = and i32 [[TMP45]], -2147483648
-; CHECK-NEXT: [[TMP47:%.*]] = shl i32 [[TMP44]], 23
-; CHECK-NEXT: [[TMP48:%.*]] = add i32 [[TMP47]], 1065353216
-; CHECK-NEXT: [[TMP49:%.*]] = and i32 [[TMP43]], 8388607
-; CHECK-NEXT: [[TMP50:%.*]] = or i32 [[TMP49]], [[TMP46]]
-; CHECK-NEXT: [[TMP51:%.*]] = or i32 [[TMP49]], [[TMP48]]
-; CHECK-NEXT: [[TMP52:%.*]] = bitcast i32 [[TMP51]] to float
-; CHECK-NEXT: br label [[ITOFP_RETURN1]]
-; CHECK: itofp-return1:
-; CHECK-NEXT: [[TMP53:%.*]] = phi float [ [[TMP52]], [[ITOFP_IF_END269]] ], [ 0.000000e+00, [[ITOFP_ENTRYITOFP_ENTRY:%.*]] ]
-; CHECK-NEXT: [[TMP54:%.*]] = insertelement <2 x float> poison, float [[TMP53]], i64 0
-; CHECK-NEXT: [[TMP55:%.*]] = extractelement <2 x i129> [[A]], i64 1
-; CHECK-NEXT: [[TMP110:%.*]] = freeze i129 [[TMP55]]
-; CHECK-NEXT: [[TMP56:%.*]] = icmp eq i129 [[TMP110]], 0
-; CHECK-NEXT: br i1 [[TMP56]], label [[ITOFP_RETURN:%.*]], label [[ITOFP_IF_END:%.*]]
-; CHECK: itofp-if-end:
-; CHECK-NEXT: [[TMP57:%.*]] = ashr i129 [[TMP110]], 128
-; CHECK-NEXT: [[TMP58:%.*]] = xor i129 [[TMP57]], [[TMP110]]
-; CHECK-NEXT: [[TMP59:%.*]] = sub i129 [[TMP58]], [[TMP57]]
-; CHECK-NEXT: [[TMP60:%.*]] = call i129 @llvm.ctlz.i129(i129 [[TMP110]], i1 true)
-; CHECK-NEXT: [[TMP61:%.*]] = trunc i129 [[TMP60]] to i32
-; CHECK-NEXT: [[TMP62:%.*]] = sub i32 129, [[TMP61]]
-; CHECK-NEXT: [[TMP63:%.*]] = sub i32 128, [[TMP61]]
-; CHECK-NEXT: [[TMP64:%.*]] = icmp sgt i32 [[TMP62]], 24
-; CHECK-NEXT: br i1 [[TMP64]], label [[ITOFP_IF_THEN4:%.*]], label [[ITOFP_IF_ELSE:%.*]]
-; CHECK: itofp-if-then4:
-; CHECK-NEXT: switch i32 [[TMP62]], label [[ITOFP_SW_DEFAULT:%.*]] [
-; CHECK-NEXT: i32 25, label [[ITOFP_SW_BB:%.*]]
-; CHECK-NEXT: i32 26, label [[ITOFP_SW_EPILOG:%.*]]
-; CHECK-NEXT: ]
-; CHECK: itofp-sw-bb:
-; CHECK-NEXT: [[TMP111:%.*]] = shl i129 [[TMP110]], 1
-; CHECK-NEXT: br label [[ITOFP_SW_EPILOG]]
-; CHECK: itofp-sw-default:
-; CHECK-NEXT: [[TMP66:%.*]] = sub i32 103, [[TMP61]]
-; CHECK-NEXT: [[TMP67:%.*]] = zext i32 [[TMP66]] to i129
-; CHECK-NEXT: [[TMP68:%.*]] = lshr i129 [[TMP110]], [[TMP67]]
-; CHECK-NEXT: [[TMP69:%.*]] = add i32 [[TMP61]], 26
-; CHECK-NEXT: [[TMP70:%.*]] = zext i32 [[TMP69]] to i129
-; CHECK-NEXT: [[TMP71:%.*]] = lshr i129 -1, [[TMP70]]
-; CHECK-NEXT: [[TMP72:%.*]] = and i129 [[TMP71]], [[TMP110]]
-; CHECK-NEXT: [[TMP73:%.*]] = icmp ne i129 [[TMP72]], 0
-; CHECK-NEXT: [[TMP74:%.*]] = zext i1 [[TMP73]] to i129
-; CHECK-NEXT: [[TMP75:%.*]] = or i129 [[TMP68]], [[TMP74]]
-; CHECK-NEXT: br label [[ITOFP_SW_EPILOG]]
-; CHECK: itofp-sw-epilog:
-; CHECK-NEXT: [[TMP76:%.*]] = phi i129 [ [[TMP75]], [[ITOFP_SW_DEFAULT]] ], [ [[TMP110]], [[ITOFP_IF_THEN4]] ], [ [[TMP111]], [[ITOFP_SW_BB]] ]
-; CHECK-NEXT: [[TMP77:%.*]] = trunc i129 [[TMP76]] to i32
-; CHECK-NEXT: [[TMP78:%.*]] = lshr i32 [[TMP77]], 2
-; CHECK-NEXT: [[TMP79:%.*]] = and i32 [[TMP78]], 1
-; CHECK-NEXT: [[TMP80:%.*]] = zext i32 [[TMP79]] to i129
-; CHECK-NEXT: [[TMP81:%.*]] = or i129 [[TMP76]], [[TMP80]]
-; CHECK-NEXT: [[TMP82:%.*]] = add i129 [[TMP81]], 1
-; CHECK-NEXT: [[TMP83:%.*]] = lshr i129 [[TMP82]], 2
-; CHECK-NEXT: [[A3:%.*]] = and i129 [[TMP82]], 67108864
-; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i129 [[A3]], 0
-; CHECK-NEXT: [[TMP85:%.*]] = trunc i129 [[TMP83]] to i32
-; CHECK-NEXT: [[TMP86:%.*]] = lshr i129 [[TMP83]], 32
-; CHECK-NEXT: [[TMP87:%.*]] = trunc i129 [[TMP86]] to i32
-; CHECK-NEXT: br i1 [[TMP84]], label [[ITOFP_IF_END26:%.*]], label [[ITOFP_IF_THEN20:%.*]]
-; CHECK: itofp-if-then20:
-; CHECK-NEXT: [[TMP88:%.*]] = lshr i129 [[TMP82]], 3
-; CHECK-NEXT: [[TMP89:%.*]] = trunc i129 [[TMP88]] to i32
-; CHECK-NEXT: [[TMP90:%.*]] = lshr i129 [[TMP88]], 32
-; CHECK-NEXT: [[TMP91:%.*]] = trunc i129 [[TMP90]] to i32
-; CHECK-NEXT: br label [[ITOFP_IF_END26]]
-; CHECK: itofp-if-else:
-; CHECK-NEXT: [[TMP92:%.*]] = add i32 [[TMP61]], -105
-; CHECK-NEXT: [[TMP93:%.*]] = zext i32 [[TMP92]] to i129
-; CHECK-NEXT: [[TMP94:%.*]] = shl i129 [[TMP110]], [[TMP93]]
-; CHECK-NEXT: [[TMP95:%.*]] = trunc i129 [[TMP94]] to i32
-; CHECK-NEXT: [[TMP96:%.*]] = lshr i129 [[TMP94]], 32
-; CHECK-NEXT: [[TMP97:%.*]] = trunc i129 [[TMP96]] to i32
-; CHECK-NEXT: br label [[ITOFP_IF_END26]]
-; CHECK: itofp-if-end26:
-; CHECK-NEXT: [[TMP98:%.*]] = phi i32 [ [[TMP89]], [[ITOFP_IF_THEN20]] ], [ [[TMP85]], [[ITOFP_SW_EPILOG]] ], [ [[TMP95]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT: [[TMP99:%.*]] = phi i32 [ [[TMP62]], [[ITOFP_IF_THEN20]] ], [ [[TMP63]], [[ITOFP_SW_EPILOG]] ], [ [[TMP63]], [[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT: [[TMP100:%.*]] = trunc i129 [[TMP57]] to i32
-; CHECK-NEXT: [[TMP101:%.*]] = and i32 [[TMP100]], -2147483648
-; CHECK-NEXT: [[TMP102:%.*]] = shl i32 [[TMP99]], 23
-; CHECK-NEXT: [[TMP103:%.*]] = add i32 [[TMP102]], 1065353216
-; CHECK-NEXT: [[TMP104:%.*]] = and i32 [[TMP98]], 8388607
-; CHECK-NEXT: [[TMP105:%.*]] = or i32 [[TMP104]], [[TMP101]]
-; CHECK-NEXT: [[TMP106:%.*]] = or i32 [[TMP104]], [[TMP103]]
-; CHECK-NEXT: [[TMP107:%.*]] = bitcast i32 [[TMP106]] to float
-; CHECK-NEXT: br label [[ITOFP_RETURN]]
-; CHECK: itofp-return:
-; CHECK-NEXT: [[TMP108:%.*]] = phi float [ [[TMP107]], [[ITOFP_IF_END26]] ], [ 0.000000e+00, [[ITOFP_RETURN1]] ]
-; CHECK-NEXT: [[TMP109:%.*]] = insertelement <2 x float> [[TMP54]], float [[TMP108]], i64 1
+; CHECK-NEXT: [[TMP109:%.*]] = uitofp <2 x i129> [[A:%.*]] to <2 x float>
; CHECK-NEXT: ret <2 x float> [[TMP109]]
;
%conv = uitofp <2 x i129> %a to <2 x float>
diff --git a/llvm/test/Transforms/ExpandIRInsts/X86/expand-large-fp-optnone.ll b/llvm/test/Transforms/ExpandIRInsts/X86/expand-large-fp-optnone.ll
index a0e00be2a94ff..87a0260c5c2f7 100644
--- a/llvm/test/Transforms/ExpandIRInsts/X86/expand-large-fp-optnone.ll
+++ b/llvm/test/Transforms/ExpandIRInsts/X86/expand-large-fp-optnone.ll
@@ -8,88 +8,8 @@
define double @main(i224 %0) #0 {
; CHECK-LABEL: define double @main(
; CHECK-SAME: i224 [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[ENTRYITOFP_ENTRY:.*]]:
-; CHECK-NEXT: [[TMP59:%.*]] = freeze i224 [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i224 [[TMP59]], 0
-; CHECK-NEXT: br i1 [[TMP1]], label %[[ITOFP_RETURN:.*]], label %[[ITOFP_IF_END:.*]]
-; CHECK: [[ITOFP_IF_END]]:
-; CHECK-NEXT: [[TMP2:%.*]] = ashr i224 [[TMP59]], 223
-; CHECK-NEXT: [[TMP3:%.*]] = xor i224 [[TMP2]], [[TMP59]]
-; CHECK-NEXT: [[TMP4:%.*]] = sub i224 [[TMP3]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = call i224 @llvm.ctlz.i224(i224 [[TMP4]], i1 true)
-; CHECK-NEXT: [[TMP6:%.*]] = trunc i224 [[TMP5]] to i32
-; CHECK-NEXT: [[TMP7:%.*]] = sub i32 224, [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = sub i32 223, [[TMP6]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp sgt i32 [[TMP7]], 53
-; CHECK-NEXT: br i1 [[TMP9]], label %[[ITOFP_IF_THEN4:.*]], label %[[ITOFP_IF_ELSE:.*]]
-; CHECK: [[ITOFP_IF_THEN4]]:
-; CHECK-NEXT: switch i32 [[TMP7]], label %[[ITOFP_SW_DEFAULT:.*]] [
-; CHECK-NEXT: i32 54, label %[[ITOFP_SW_BB:.*]]
-; CHECK-NEXT: i32 55, label %[[ITOFP_SW_EPILOG:.*]]
-; CHECK-NEXT: ]
-; CHECK: [[ITOFP_SW_BB]]:
-; CHECK-NEXT: [[TMP10:%.*]] = shl i224 [[TMP4]], 1
-; CHECK-NEXT: br label %[[ITOFP_SW_EPILOG]]
-; CHECK: [[ITOFP_SW_DEFAULT]]:
-; CHECK-NEXT: [[TMP11:%.*]] = sub i32 169, [[TMP6]]
-; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP11]] to i224
-; CHECK-NEXT: [[TMP13:%.*]] = lshr i224 [[TMP4]], [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP6]], 55
-; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i224
-; CHECK-NEXT: [[TMP16:%.*]] = lshr i224 -1, [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = and i224 [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[TMP18:%.*]] = icmp ne i224 [[TMP17]], 0
-; CHECK-NEXT: [[TMP19:%.*]] = zext i1 [[TMP18]] to i224
-; CHECK-NEXT: [[TMP20:%.*]] = or i224 [[TMP13]], [[TMP19]]
-; CHECK-NEXT: br label %[[ITOFP_SW_EPILOG]]
-; CHECK: [[ITOFP_SW_EPILOG]]:
-; CHECK-NEXT: [[TMP21:%.*]] = phi i224 [ [[TMP20]], %[[ITOFP_SW_DEFAULT]] ], [ [[TMP4]], %[[ITOFP_IF_THEN4]] ], [ [[TMP10]], %[[ITOFP_SW_BB]] ]
-; CHECK-NEXT: [[TMP22:%.*]] = trunc i224 [[TMP21]] to i32
-; CHECK-NEXT: [[TMP23:%.*]] = lshr i32 [[TMP22]], 2
-; CHECK-NEXT: [[TMP24:%.*]] = and i32 [[TMP23]], 1
-; CHECK-NEXT: [[TMP25:%.*]] = zext i32 [[TMP24]] to i224
-; CHECK-NEXT: [[TMP26:%.*]] = or i224 [[TMP21]], [[TMP25]]
-; CHECK-NEXT: [[TMP27:%.*]] = add i224 [[TMP26]], 1
-; CHECK-NEXT: [[TMP28:%.*]] = ashr i224 [[TMP27]], 2
-; CHECK-NEXT: [[A3:%.*]] = and i224 [[TMP27]], 36028797018963968
-; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i224 [[A3]], 0
-; CHECK-NEXT: [[TMP30:%.*]] = trunc i224 [[TMP28]] to i64
-; CHECK-NEXT: [[TMP31:%.*]] = lshr i224 [[TMP28]], 32
-; CHECK-NEXT: [[TMP32:%.*]] = trunc i224 [[TMP31]] to i32
-; CHECK-NEXT: br i1 [[TMP29]], label %[[ITOFP_IF_END26:.*]], label %[[ITOFP_IF_THEN20:.*]]
-; CHECK: [[ITOFP_IF_THEN20]]:
-; CHECK-NEXT: [[TMP33:%.*]] = ashr i224 [[TMP27]], 3
-; CHECK-NEXT: [[TMP34:%.*]] = trunc i224 [[TMP33]] to i64
-; CHECK-NEXT: [[TMP35:%.*]] = lshr i224 [[TMP33]], 32
-; CHECK-NEXT: [[TMP36:%.*]] = trunc i224 [[TMP35]] to i32
-; CHECK-NEXT: br label %[[ITOFP_IF_END26]]
-; CHECK: [[ITOFP_IF_ELSE]]:
-; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP6]], -171
-; CHECK-NEXT: [[TMP38:%.*]] = zext i32 [[TMP37]] to i224
-; CHECK-NEXT: [[TMP39:%.*]] = shl i224 [[TMP4]], [[TMP38]]
-; CHECK-NEXT: [[TMP40:%.*]] = trunc i224 [[TMP39]] to i64
-; CHECK-NEXT: [[TMP41:%.*]] = lshr i224 [[TMP39]], 32
-; CHECK-NEXT: [[TMP42:%.*]] = trunc i224 [[TMP41]] to i32
-; CHECK-NEXT: br label %[[ITOFP_IF_END26]]
-; CHECK: [[ITOFP_IF_END26]]:
-; CHECK-NEXT: [[TMP43:%.*]] = phi i64 [ [[TMP34]], %[[ITOFP_IF_THEN20]] ], [ [[TMP30]], %[[ITOFP_SW_EPILOG]] ], [ [[TMP40]], %[[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT: [[TMP44:%.*]] = phi i32 [ [[TMP36]], %[[ITOFP_IF_THEN20]] ], [ [[TMP32]], %[[ITOFP_SW_EPILOG]] ], [ [[TMP42]], %[[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT: [[TMP45:%.*]] = phi i32 [ [[TMP7]], %[[ITOFP_IF_THEN20]] ], [ [[TMP8]], %[[ITOFP_SW_EPILOG]] ], [ [[TMP8]], %[[ITOFP_IF_ELSE]] ]
-; CHECK-NEXT: [[TMP46:%.*]] = trunc i224 [[TMP2]] to i32
-; CHECK-NEXT: [[TMP47:%.*]] = and i32 [[TMP46]], -2147483648
-; CHECK-NEXT: [[TMP48:%.*]] = shl i32 [[TMP45]], 20
-; CHECK-NEXT: [[TMP49:%.*]] = add i32 [[TMP48]], 1072693248
-; CHECK-NEXT: [[TMP50:%.*]] = and i32 [[TMP44]], 1048575
-; CHECK-NEXT: [[TMP51:%.*]] = or i32 [[TMP50]], [[TMP47]]
-; CHECK-NEXT: [[TMP52:%.*]] = or i32 [[TMP51]], [[TMP49]]
-; CHECK-NEXT: [[TMP53:%.*]] = zext i32 [[TMP52]] to i64
-; CHECK-NEXT: [[TMP54:%.*]] = shl i64 [[TMP53]], 32
-; CHECK-NEXT: [[TMP55:%.*]] = and i64 [[TMP43]], 4294967295
-; CHECK-NEXT: [[TMP56:%.*]] = or i64 [[TMP54]], [[TMP55]]
-; CHECK-NEXT: [[TMP57:%.*]] = bitcast i64 [[TMP56]] to double
-; CHECK-NEXT: br label %[[ITOFP_RETURN]]
-; CHECK: [[ITOFP_RETURN]]:
-; CHECK-NEXT: [[TMP58:%.*]] = phi double [ [[TMP57]], %[[ITOFP_IF_END26]] ], [ 0.000000e+00, %[[ENTRYITOFP_ENTRY]] ]
+; CHECK-NEXT: [[ITOFP_RETURN:.*:]]
+; CHECK-NEXT: [[TMP58:%.*]] = sitofp i224 [[TMP0]] to double
; CHECK-NEXT: ret double [[TMP58]]
;
entry:
diff --git a/llvm/test/Transforms/ExpandIRInsts/X86/sdiv129.ll b/llvm/test/Transforms/ExpandIRInsts/X86/sdiv129.ll
index 751bdbade15d9..182073312504f 100644
--- a/llvm/test/Transforms/ExpandIRInsts/X86/sdiv129.ll
+++ b/llvm/test/Transforms/ExpandIRInsts/X86/sdiv129.ll
@@ -5,7 +5,7 @@
define void @sdiv129(ptr %ptr, ptr %out) nounwind !prof !0 {
; CHECK-LABEL: @sdiv129(
; CHECK-NEXT: _udiv-special-cases:
-; CHECK-NEXT: [[A:%.*]] = load i129, ptr [[PTR:%.*]], align 16
+; CHECK-NEXT: [[A:%.*]] = load i129, ptr [[PTR:%.*]], align 32
; CHECK-NEXT: [[TMP0:%.*]] = freeze i129 [[A]]
; CHECK-NEXT: [[TMP1:%.*]] = freeze i129 3
; CHECK-NEXT: [[TMP2:%.*]] = ashr i129 [[TMP0]], 128
@@ -67,7 +67,7 @@ define void @sdiv129(ptr %ptr, ptr %out) nounwind !prof !0 {
; CHECK-NEXT: [[TMP48:%.*]] = phi i129 [ [[TMP25]], [[UDIV_LOOP_EXIT]] ], [ [[TMP20]], [[_UDIV_SPECIAL_CASES:%.*]] ]
; CHECK-NEXT: [[TMP49:%.*]] = xor i129 [[TMP48]], [[TMP8]]
; CHECK-NEXT: [[TMP50:%.*]] = sub i129 [[TMP49]], [[TMP8]]
-; CHECK-NEXT: store i129 [[TMP50]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT: store i129 [[TMP50]], ptr [[OUT:%.*]], align 32
; CHECK-NEXT: ret void
;
%a = load i129, ptr %ptr
diff --git a/llvm/test/Transforms/ExpandIRInsts/X86/srem129.ll b/llvm/test/Transforms/ExpandIRInsts/X86/srem129.ll
index 45491ccda2b19..6257ac24355a3 100644
--- a/llvm/test/Transforms/ExpandIRInsts/X86/srem129.ll
+++ b/llvm/test/Transforms/ExpandIRInsts/X86/srem129.ll
@@ -5,7 +5,7 @@
define void @test(ptr %ptr, ptr %out) nounwind !prof !0 {
; CHECK-LABEL: @test(
; CHECK-NEXT: _udiv-special-cases:
-; CHECK-NEXT: [[A:%.*]] = load i129, ptr [[PTR:%.*]], align 16
+; CHECK-NEXT: [[A:%.*]] = load i129, ptr [[PTR:%.*]], align 32
; CHECK-NEXT: [[TMP0:%.*]] = freeze i129 [[A]]
; CHECK-NEXT: [[TMP1:%.*]] = freeze i129 3
; CHECK-NEXT: [[TMP2:%.*]] = ashr i129 [[TMP0]], 128
@@ -70,7 +70,7 @@ define void @test(ptr %ptr, ptr %out) nounwind !prof !0 {
; CHECK-NEXT: [[TMP51:%.*]] = sub i129 [[TMP8]], [[TMP50]]
; CHECK-NEXT: [[TMP52:%.*]] = xor i129 [[TMP51]], [[TMP2]]
; CHECK-NEXT: [[TMP53:%.*]] = sub i129 [[TMP52]], [[TMP2]]
-; CHECK-NEXT: store i129 [[TMP53]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT: store i129 [[TMP53]], ptr [[OUT:%.*]], align 32
; CHECK-NEXT: ret void
;
%a = load i129, ptr %ptr
diff --git a/llvm/test/Transforms/ExpandIRInsts/X86/udiv129.ll b/llvm/test/Transforms/ExpandIRInsts/X86/udiv129.ll
index 6ad696ae446fd..9f96313d29891 100644
--- a/llvm/test/Transforms/ExpandIRInsts/X86/udiv129.ll
+++ b/llvm/test/Transforms/ExpandIRInsts/X86/udiv129.ll
@@ -5,7 +5,7 @@
define void @test(ptr %ptr, ptr %out) nounwind !prof !0 {
; CHECK-LABEL: @test(
; CHECK-NEXT: _udiv-special-cases:
-; CHECK-NEXT: [[A:%.*]] = load i129, ptr [[PTR:%.*]], align 16
+; CHECK-NEXT: [[A:%.*]] = load i129, ptr [[PTR:%.*]], align 32
; CHECK-NEXT: [[TMP0:%.*]] = freeze i129 3
; CHECK-NEXT: [[TMP1:%.*]] = freeze i129 [[A]]
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i129 [[TMP0]], 0
@@ -56,7 +56,7 @@ define void @test(ptr %ptr, ptr %out) nounwind !prof !0 {
; CHECK-NEXT: br i1 [[TMP38]], label [[UDIV_LOOP_EXIT]], label [[UDIV_PREHEADER]], !prof [[PROF1]]
; CHECK: udiv-end:
; CHECK-NEXT: [[TMP39:%.*]] = phi i129 [ [[TMP16]], [[UDIV_LOOP_EXIT]] ], [ [[TMP11]], [[_UDIV_SPECIAL_CASES:%.*]] ]
-; CHECK-NEXT: store i129 [[TMP39]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT: store i129 [[TMP39]], ptr [[OUT:%.*]], align 32
; CHECK-NEXT: ret void
;
%a = load i129, ptr %ptr
diff --git a/llvm/test/Transforms/ExpandIRInsts/X86/urem129.ll b/llvm/test/Transforms/ExpandIRInsts/X86/urem129.ll
index a4c4ac2cba329..a20ffd2575d33 100644
--- a/llvm/test/Transforms/ExpandIRInsts/X86/urem129.ll
+++ b/llvm/test/Transforms/ExpandIRInsts/X86/urem129.ll
@@ -5,7 +5,7 @@
define void @test(ptr %ptr, ptr %out) nounwind !prof !0 {
; CHECK-LABEL: @test(
; CHECK-NEXT: _udiv-special-cases:
-; CHECK-NEXT: [[A:%.*]] = load i129, ptr [[PTR:%.*]], align 16
+; CHECK-NEXT: [[A:%.*]] = load i129, ptr [[PTR:%.*]], align 32
; CHECK-NEXT: [[TMP0:%.*]] = freeze i129 [[A]]
; CHECK-NEXT: [[TMP1:%.*]] = freeze i129 3
; CHECK-NEXT: [[TMP2:%.*]] = freeze i129 [[TMP1]]
@@ -60,7 +60,7 @@ define void @test(ptr %ptr, ptr %out) nounwind !prof !0 {
; CHECK-NEXT: [[TMP41:%.*]] = phi i129 [ [[TMP18]], [[UDIV_LOOP_EXIT]] ], [ [[TMP13]], [[_UDIV_SPECIAL_CASES:%.*]] ]
; CHECK-NEXT: [[TMP42:%.*]] = mul i129 [[TMP1]], [[TMP41]]
; CHECK-NEXT: [[TMP43:%.*]] = sub i129 [[TMP0]], [[TMP42]]
-; CHECK-NEXT: store i129 [[TMP43]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT: store i129 [[TMP43]], ptr [[OUT:%.*]], align 32
; CHECK-NEXT: ret void
;
%a = load i129, ptr %ptr
>From 4829f992c80a837aca96389223a259a51cfe0823 Mon Sep 17 00:00:00 2001
From: Xavier Roche <xavier.roche at algolia.com>
Date: Tue, 24 Feb 2026 21:39:47 +0100
Subject: [PATCH 07/17] [compiler-rt] Add __int256 builtins
Add 37 new runtime builtins for 256-bit integer operations, conditional
on CRT_HAS_256BIT (requires 64-bit pointers + compiler __int256 support):
- Arithmetic: multi5 (multiply), divmodoi4/udivmodoi4, div/mod/udiv/umod
- Shifts: ashloi3, ashroi3, lshroi3
- Comparisons: cmpoi2, ucmpoi2
- Bit operations: clzoi2, ctzoi2, ffsoi2, popcountoi2, parityoi2
- Overflow-checked: addvoi3, subvoi3, mulvoi3, absvoi2, negvoi2, negoi2
- Float conversions: fix/fixuns/float/floatun for sf/df/tf/xf
Extends int_types.h (oi_int = 256-bit), int_lib.h, fp_fixint_impl.inc,
int_to_fp.h, int_to_fp_impl.inc with generic 256-bit support.
CMakeLists.txt conditionally compiles on 64-bit targets.
Assisted-by: Claude (Anthropic)
Co-Authored-By: Claude Opus 4.6 <noreply at anthropic.com>
---
compiler-rt/lib/builtins/CMakeLists.txt | 40 ++++++
compiler-rt/lib/builtins/absvoi2.c | 29 ++++
compiler-rt/lib/builtins/addvoi3.c | 33 +++++
compiler-rt/lib/builtins/ashloi3.c | 39 ++++++
compiler-rt/lib/builtins/ashroi3.c | 40 ++++++
compiler-rt/lib/builtins/clzoi2.c | 29 ++++
compiler-rt/lib/builtins/cmpoi2.c | 37 +++++
compiler-rt/lib/builtins/ctzoi2.c | 29 ++++
compiler-rt/lib/builtins/divmodoi4.c | 32 +++++
compiler-rt/lib/builtins/divoi3.c | 26 ++++
compiler-rt/lib/builtins/ffsoi2.c | 31 +++++
compiler-rt/lib/builtins/fixdfoi.c | 21 +++
compiler-rt/lib/builtins/fixsfoi.c | 21 +++
compiler-rt/lib/builtins/fixtfoi.c | 18 +++
compiler-rt/lib/builtins/fixunsdfoi.c | 17 +++
compiler-rt/lib/builtins/fixunssfoi.c | 17 +++
compiler-rt/lib/builtins/fixunstfoi.c | 17 +++
compiler-rt/lib/builtins/fixunsxfoi.c | 44 ++++++
compiler-rt/lib/builtins/fixxfoi.c | 46 ++++++
compiler-rt/lib/builtins/floatoidf.c | 23 +++
compiler-rt/lib/builtins/floatoisf.c | 23 +++
compiler-rt/lib/builtins/floatoitf.c | 26 ++++
compiler-rt/lib/builtins/floatoixf.c | 73 ++++++++++
compiler-rt/lib/builtins/floatunoidf.c | 23 +++
compiler-rt/lib/builtins/floatunoisf.c | 23 +++
compiler-rt/lib/builtins/floatunoitf.c | 26 ++++
compiler-rt/lib/builtins/floatunoixf.c | 70 ++++++++++
compiler-rt/lib/builtins/fp_fixint_impl.inc | 4 +-
compiler-rt/lib/builtins/int_lib.h | 6 +
compiler-rt/lib/builtins/int_to_fp.h | 10 ++
compiler-rt/lib/builtins/int_to_fp_impl.inc | 6 +
compiler-rt/lib/builtins/int_types.h | 50 +++++++
compiler-rt/lib/builtins/lshroi3.c | 38 +++++
compiler-rt/lib/builtins/modoi3.c | 26 ++++
compiler-rt/lib/builtins/muloi5.c | 29 ++++
compiler-rt/lib/builtins/multi5.c | 51 +++++++
compiler-rt/lib/builtins/mulvoi3.c | 27 ++++
compiler-rt/lib/builtins/negoi2.c | 25 ++++
compiler-rt/lib/builtins/negvoi2.c | 28 ++++
compiler-rt/lib/builtins/parityoi2.c | 36 +++++
compiler-rt/lib/builtins/popcountoi2.c | 27 ++++
compiler-rt/lib/builtins/subvoi3.c | 33 +++++
compiler-rt/lib/builtins/ucmpoi2.c | 37 +++++
compiler-rt/lib/builtins/udivmodoi4.c | 147 ++++++++++++++++++++
compiler-rt/lib/builtins/udivoi3.c | 23 +++
compiler-rt/lib/builtins/umodoi3.c | 25 ++++
46 files changed, 1479 insertions(+), 2 deletions(-)
create mode 100644 compiler-rt/lib/builtins/absvoi2.c
create mode 100644 compiler-rt/lib/builtins/addvoi3.c
create mode 100644 compiler-rt/lib/builtins/ashloi3.c
create mode 100644 compiler-rt/lib/builtins/ashroi3.c
create mode 100644 compiler-rt/lib/builtins/clzoi2.c
create mode 100644 compiler-rt/lib/builtins/cmpoi2.c
create mode 100644 compiler-rt/lib/builtins/ctzoi2.c
create mode 100644 compiler-rt/lib/builtins/divmodoi4.c
create mode 100644 compiler-rt/lib/builtins/divoi3.c
create mode 100644 compiler-rt/lib/builtins/ffsoi2.c
create mode 100644 compiler-rt/lib/builtins/fixdfoi.c
create mode 100644 compiler-rt/lib/builtins/fixsfoi.c
create mode 100644 compiler-rt/lib/builtins/fixtfoi.c
create mode 100644 compiler-rt/lib/builtins/fixunsdfoi.c
create mode 100644 compiler-rt/lib/builtins/fixunssfoi.c
create mode 100644 compiler-rt/lib/builtins/fixunstfoi.c
create mode 100644 compiler-rt/lib/builtins/fixunsxfoi.c
create mode 100644 compiler-rt/lib/builtins/fixxfoi.c
create mode 100644 compiler-rt/lib/builtins/floatoidf.c
create mode 100644 compiler-rt/lib/builtins/floatoisf.c
create mode 100644 compiler-rt/lib/builtins/floatoitf.c
create mode 100644 compiler-rt/lib/builtins/floatoixf.c
create mode 100644 compiler-rt/lib/builtins/floatunoidf.c
create mode 100644 compiler-rt/lib/builtins/floatunoisf.c
create mode 100644 compiler-rt/lib/builtins/floatunoitf.c
create mode 100644 compiler-rt/lib/builtins/floatunoixf.c
create mode 100644 compiler-rt/lib/builtins/lshroi3.c
create mode 100644 compiler-rt/lib/builtins/modoi3.c
create mode 100644 compiler-rt/lib/builtins/muloi5.c
create mode 100644 compiler-rt/lib/builtins/multi5.c
create mode 100644 compiler-rt/lib/builtins/mulvoi3.c
create mode 100644 compiler-rt/lib/builtins/negoi2.c
create mode 100644 compiler-rt/lib/builtins/negvoi2.c
create mode 100644 compiler-rt/lib/builtins/parityoi2.c
create mode 100644 compiler-rt/lib/builtins/popcountoi2.c
create mode 100644 compiler-rt/lib/builtins/subvoi3.c
create mode 100644 compiler-rt/lib/builtins/ucmpoi2.c
create mode 100644 compiler-rt/lib/builtins/udivmodoi4.c
create mode 100644 compiler-rt/lib/builtins/udivoi3.c
create mode 100644 compiler-rt/lib/builtins/umodoi3.c
diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt
index f43ef4743ff97..d1b19fd375fb3 100644
--- a/compiler-rt/lib/builtins/CMakeLists.txt
+++ b/compiler-rt/lib/builtins/CMakeLists.txt
@@ -72,74 +72,94 @@ include_directories(../../../third-party/siphash/include)
set(GENERIC_SOURCES
absvdi2.c
absvsi2.c
+ absvoi2.c
absvti2.c
adddf3.c
addsf3.c
addvdi3.c
addvsi3.c
+ addvoi3.c
addvti3.c
apple_versioning.c
ashldi3.c
+ ashloi3.c
ashlti3.c
ashrdi3.c
+ ashroi3.c
ashrti3.c
bswapdi2.c
bswapsi2.c
clzdi2.c
clzsi2.c
+ clzoi2.c
clzti2.c
cmpdi2.c
+ cmpoi2.c
cmpti2.c
comparedf2.c
comparesf2.c
ctzdi2.c
ctzsi2.c
+ ctzoi2.c
ctzti2.c
divdc3.c
divdf3.c
divdi3.c
divmoddi4.c
divmodsi4.c
+ divmodoi4.c
divmodti4.c
divsc3.c
divsf3.c
divsi3.c
+ divoi3.c
divti3.c
extendsfdf2.c
extendhfsf2.c
extendhfdf2.c
ffsdi2.c
ffssi2.c
+ ffsoi2.c
ffsti2.c
fixdfdi.c
fixdfsi.c
+ fixdfoi.c
fixdfti.c
fixsfdi.c
+ fixsfoi.c
fixsfsi.c
fixsfti.c
fixunsdfdi.c
+ fixunsdfoi.c
fixunsdfsi.c
fixunsdfti.c
fixunssfdi.c
+ fixunssfoi.c
fixunssfsi.c
fixunssfti.c
floatdidf.c
floatdisf.c
floatsidf.c
floatsisf.c
+ floatoidf.c
+ floatoisf.c
floattidf.c
floattisf.c
floatundidf.c
floatundisf.c
floatunsidf.c
floatunsisf.c
+ floatunoidf.c
+ floatunoisf.c
floatuntidf.c
floatuntisf.c
fp_mode.c
int_util.c
lshrdi3.c
+ lshroi3.c
lshrti3.c
moddi3.c
+ modoi3.c
modsi3.c
modti3.c
muldc3.c
@@ -147,25 +167,32 @@ set(GENERIC_SOURCES
muldi3.c
mulodi4.c
mulosi4.c
+ muloi5.c
muloti4.c
mulsc3.c
mulsf3.c
multi3.c
+ multi5.c
mulvdi3.c
+ mulvoi3.c
mulvsi3.c
mulvti3.c
negdf2.c
negdi2.c
negsf2.c
+ negoi2.c
negti2.c
negvdi2.c
+ negvoi2.c
negvsi2.c
negvti2.c
os_version_check.c
paritydi2.c
paritysi2.c
+ parityoi2.c
parityti2.c
popcountdi2.c
+ popcountoi2.c
popcountsi2.c
popcountti2.c
powidf2.c
@@ -174,20 +201,25 @@ set(GENERIC_SOURCES
subsf3.c
subvdi3.c
subvsi3.c
+ subvoi3.c
subvti3.c
trampoline_setup.c
truncdfhf2.c
truncdfsf2.c
truncsfhf2.c
ucmpdi2.c
+ ucmpoi2.c
ucmpti2.c
udivdi3.c
udivmoddi4.c
+ udivmodoi4.c
udivmodsi4.c
udivmodti4.c
+ udivoi3.c
udivsi3.c
udivti3.c
umoddi3.c
+ umodoi3.c
umodsi3.c
umodti3.c
)
@@ -211,14 +243,18 @@ set(GENERIC_TF_SOURCES
extendsftf2.c
fixtfdi.c
fixtfsi.c
+ fixtfoi.c
fixtfti.c
fixunstfdi.c
+ fixunstfoi.c
fixunstfsi.c
fixunstfti.c
floatditf.c
+ floatoitf.c
floatsitf.c
floattitf.c
floatunditf.c
+ floatunoitf.c
floatunsitf.c
floatuntitf.c
multc3.c
@@ -306,13 +342,17 @@ set(x86_80_BIT_SOURCES
extendhfxf2.c
extendxftf2.c
fixxfdi.c
+ fixxfoi.c
fixxfti.c
fixunsxfdi.c
+ fixunsxfoi.c
fixunsxfsi.c
fixunsxfti.c
floatdixf.c
+ floatoixf.c
floattixf.c
floatundixf.c
+ floatunoixf.c
floatuntixf.c
mulxc3.c
powixf2.c
diff --git a/compiler-rt/lib/builtins/absvoi2.c b/compiler-rt/lib/builtins/absvoi2.c
new file mode 100644
index 0000000000000..1cc6c8a47731b
--- /dev/null
+++ b/compiler-rt/lib/builtins/absvoi2.c
@@ -0,0 +1,29 @@
+//===-- absvoi2.c - Implement __absvoi2 -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __absvoi2 for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: absolute value
+
+// Effects: aborts if abs(x) < 0
+
+COMPILER_RT_ABI oi_int __absvoi2(oi_int a) {
+ const int N = (int)(sizeof(oi_int) * CHAR_BIT);
+ if (a == (oi_int)((ou_int)1 << (N - 1)))
+ compilerrt_abort();
+ const oi_int s = a >> (N - 1);
+ return (a ^ s) - s;
+}
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/addvoi3.c b/compiler-rt/lib/builtins/addvoi3.c
new file mode 100644
index 0000000000000..8000c6b9104a5
--- /dev/null
+++ b/compiler-rt/lib/builtins/addvoi3.c
@@ -0,0 +1,33 @@
+//===-- addvoi3.c - Implement __addvoi3 -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __addvoi3 for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: a + b
+
+// Effects: aborts if a + b overflows
+
+COMPILER_RT_ABI oi_int __addvoi3(oi_int a, oi_int b) {
+ oi_int s = (ou_int)a + (ou_int)b;
+ if (b >= 0) {
+ if (s < a)
+ compilerrt_abort();
+ } else {
+ if (s >= a)
+ compilerrt_abort();
+ }
+ return s;
+}
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/ashloi3.c b/compiler-rt/lib/builtins/ashloi3.c
new file mode 100644
index 0000000000000..9d81628403ab7
--- /dev/null
+++ b/compiler-rt/lib/builtins/ashloi3.c
@@ -0,0 +1,39 @@
+//===-- ashloi3.c - Implement __ashloi3 -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __ashloi3 for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: a << b
+
+// Precondition: 0 <= b < bits_in_oword
+
+COMPILER_RT_ABI oi_int __ashloi3(oi_int a, int b) {
+ const int bits_in_tword = (int)(sizeof(ti_int) * CHAR_BIT);
+ owords input;
+ owords result;
+ input.all = a;
+ if (b & bits_in_tword) /* bits_in_tword <= b < bits_in_oword */ {
+ result.s.low = 0;
+ result.s.high = input.s.low << (b - bits_in_tword);
+ } else /* 0 <= b < bits_in_tword */ {
+ if (b == 0)
+ return a;
+ result.s.low = input.s.low << b;
+ result.s.high =
+ ((tu_int)input.s.high << b) | (input.s.low >> (bits_in_tword - b));
+ }
+ return result.all;
+}
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/ashroi3.c b/compiler-rt/lib/builtins/ashroi3.c
new file mode 100644
index 0000000000000..35b583d47f7cb
--- /dev/null
+++ b/compiler-rt/lib/builtins/ashroi3.c
@@ -0,0 +1,40 @@
+//===-- ashroi3.c - Implement __ashroi3 -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __ashroi3 for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: arithmetic a >> b
+
+// Precondition: 0 <= b < bits_in_oword
+
+COMPILER_RT_ABI oi_int __ashroi3(oi_int a, int b) {
+ const int bits_in_tword = (int)(sizeof(ti_int) * CHAR_BIT);
+ owords input;
+ owords result;
+ input.all = a;
+ if (b & bits_in_tword) /* bits_in_tword <= b < bits_in_oword */ {
+ // result.s.high = input.s.high < 0 ? -1 : 0
+ result.s.high = input.s.high >> (bits_in_tword - 1);
+ result.s.low = input.s.high >> (b - bits_in_tword);
+ } else /* 0 <= b < bits_in_tword */ {
+ if (b == 0)
+ return a;
+ result.s.high = input.s.high >> b;
+ result.s.low =
+ ((tu_int)input.s.high << (bits_in_tword - b)) | (input.s.low >> b);
+ }
+ return result.all;
+}
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/clzoi2.c b/compiler-rt/lib/builtins/clzoi2.c
new file mode 100644
index 0000000000000..11a61d1034129
--- /dev/null
+++ b/compiler-rt/lib/builtins/clzoi2.c
@@ -0,0 +1,29 @@
+//===-- clzoi2.c - Implement __clzoi2 -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __clzoi2 for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: the number of leading 0-bits
+
+// Precondition: a != 0
+
+COMPILER_RT_ABI int __clzoi2(oi_int a) {
+ owords x;
+ x.all = a;
+ const ti_int f = -(x.s.high == 0);
+ return __clzti2((x.s.high & ~f) | (x.s.low & f)) +
+ ((si_int)f & ((si_int)(sizeof(ti_int) * CHAR_BIT)));
+}
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/cmpoi2.c b/compiler-rt/lib/builtins/cmpoi2.c
new file mode 100644
index 0000000000000..ba16733eda25c
--- /dev/null
+++ b/compiler-rt/lib/builtins/cmpoi2.c
@@ -0,0 +1,37 @@
+//===-- cmpoi2.c - Implement __cmpoi2 -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __cmpoi2 for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: if (a < b) returns 0
+// if (a == b) returns 1
+// if (a > b) returns 2
+
+COMPILER_RT_ABI si_int __cmpoi2(oi_int a, oi_int b) {
+ owords x;
+ x.all = a;
+ owords y;
+ y.all = b;
+ if (x.s.high < y.s.high)
+ return 0;
+ if (x.s.high > y.s.high)
+ return 2;
+ if (x.s.low < y.s.low)
+ return 0;
+ if (x.s.low > y.s.low)
+ return 2;
+ return 1;
+}
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/ctzoi2.c b/compiler-rt/lib/builtins/ctzoi2.c
new file mode 100644
index 0000000000000..b477aa01b31a8
--- /dev/null
+++ b/compiler-rt/lib/builtins/ctzoi2.c
@@ -0,0 +1,29 @@
+//===-- ctzoi2.c - Implement __ctzoi2 -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __ctzoi2 for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: the number of trailing 0-bits
+
+// Precondition: a != 0
+
+COMPILER_RT_ABI int __ctzoi2(oi_int a) {
+ owords x;
+ x.all = a;
+ if (x.s.low != 0)
+ return __ctzti2(x.s.low);
+ return __ctzti2(x.s.high) + (int)(sizeof(ti_int) * CHAR_BIT);
+}
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/divmodoi4.c b/compiler-rt/lib/builtins/divmodoi4.c
new file mode 100644
index 0000000000000..450dcaecf0720
--- /dev/null
+++ b/compiler-rt/lib/builtins/divmodoi4.c
@@ -0,0 +1,32 @@
+//===-- divmodoi4.c - Implement __divmodoi4 -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __divmodoi4 for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: a / b, *rem = a % b
+
+COMPILER_RT_ABI oi_int __divmodoi4(oi_int a, oi_int b, oi_int *rem) {
+ const int bits_in_oword_m1 = (int)(sizeof(oi_int) * CHAR_BIT) - 1;
+ oi_int s_a = a >> bits_in_oword_m1; // s_a = a < 0 ? -1 : 0
+ oi_int s_b = b >> bits_in_oword_m1; // s_b = b < 0 ? -1 : 0
+ a = (ou_int)(a ^ s_a) - s_a; // negate if s_a == -1
+ b = (ou_int)(b ^ s_b) - s_b; // negate if s_b == -1
+ s_b ^= s_a; // sign of quotient
+ ou_int r;
+ oi_int q = (__udivmodoi4(a, b, &r) ^ s_b) - s_b; // negate if s_b == -1
+ *rem = (r ^ s_a) - s_a; // negate if s_a == -1
+ return q;
+}
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/divoi3.c b/compiler-rt/lib/builtins/divoi3.c
new file mode 100644
index 0000000000000..2132b578cdecd
--- /dev/null
+++ b/compiler-rt/lib/builtins/divoi3.c
@@ -0,0 +1,26 @@
+//===-- divoi3.c - Implement __divoi3 -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __divoi3 for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: a / b
+
+#define fixint_t oi_int
+#define fixuint_t ou_int
+#define COMPUTE_UDIV(a, b) __udivmodoi4((a), (b), (ou_int *)0)
+#include "int_div_impl.inc"
+
+COMPILER_RT_ABI oi_int __divoi3(oi_int a, oi_int b) { return __divXi3(a, b); }
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/ffsoi2.c b/compiler-rt/lib/builtins/ffsoi2.c
new file mode 100644
index 0000000000000..2378ee305ec6a
--- /dev/null
+++ b/compiler-rt/lib/builtins/ffsoi2.c
@@ -0,0 +1,31 @@
+//===-- ffsoi2.c - Implement __ffsoi2 -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __ffsoi2 for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: the index of the least significant 1-bit in a, or
+// the value zero if a is zero. The least significant bit is index one.
+
+COMPILER_RT_ABI int __ffsoi2(oi_int a) {
+ owords x;
+ x.all = a;
+ if (x.s.low == 0) {
+ if (x.s.high == 0)
+ return 0;
+ return __ctzti2(x.s.high) + (1 + sizeof(ti_int) * CHAR_BIT);
+ }
+ return __ctzti2(x.s.low) + 1;
+}
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/fixdfoi.c b/compiler-rt/lib/builtins/fixdfoi.c
new file mode 100644
index 0000000000000..cfe45b63cb520
--- /dev/null
+++ b/compiler-rt/lib/builtins/fixdfoi.c
@@ -0,0 +1,21 @@
+//===-- fixdfoi.c - Implement __fixdfoi -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+#define DOUBLE_PRECISION
+#include "fp_lib.h"
+
+typedef oi_int fixint_t;
+typedef ou_int fixuint_t;
+#include "fp_fixint_impl.inc"
+
+COMPILER_RT_ABI oi_int __fixdfoi(fp_t a) { return __fixint(a); }
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/fixsfoi.c b/compiler-rt/lib/builtins/fixsfoi.c
new file mode 100644
index 0000000000000..2c67dee2bb206
--- /dev/null
+++ b/compiler-rt/lib/builtins/fixsfoi.c
@@ -0,0 +1,21 @@
+//===-- fixsfoi.c - Implement __fixsfoi -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+#define SINGLE_PRECISION
+#include "fp_lib.h"
+
+typedef oi_int fixint_t;
+typedef ou_int fixuint_t;
+#include "fp_fixint_impl.inc"
+
+COMPILER_RT_ABI oi_int __fixsfoi(fp_t a) { return __fixint(a); }
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/fixtfoi.c b/compiler-rt/lib/builtins/fixtfoi.c
new file mode 100644
index 0000000000000..7edab043c4c40
--- /dev/null
+++ b/compiler-rt/lib/builtins/fixtfoi.c
@@ -0,0 +1,18 @@
+//===-- fixtfoi.c - Implement __fixtfoi -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#define QUAD_PRECISION
+#include "fp_lib.h"
+
+#if defined(CRT_HAS_TF_MODE) && defined(CRT_HAS_256BIT)
+typedef oi_int fixint_t;
+typedef ou_int fixuint_t;
+#include "fp_fixint_impl.inc"
+
+COMPILER_RT_ABI oi_int __fixtfoi(fp_t a) { return __fixint(a); }
+#endif
diff --git a/compiler-rt/lib/builtins/fixunsdfoi.c b/compiler-rt/lib/builtins/fixunsdfoi.c
new file mode 100644
index 0000000000000..86066cd83e674
--- /dev/null
+++ b/compiler-rt/lib/builtins/fixunsdfoi.c
@@ -0,0 +1,17 @@
+//===-- fixunsdfoi.c - Implement __fixunsdfoi -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#define DOUBLE_PRECISION
+#include "fp_lib.h"
+
+#if defined(CRT_HAS_256BIT)
+typedef ou_int fixuint_t;
+#include "fp_fixuint_impl.inc"
+
+COMPILER_RT_ABI ou_int __fixunsdfoi(fp_t a) { return __fixuint(a); }
+#endif
diff --git a/compiler-rt/lib/builtins/fixunssfoi.c b/compiler-rt/lib/builtins/fixunssfoi.c
new file mode 100644
index 0000000000000..069dc584ea18b
--- /dev/null
+++ b/compiler-rt/lib/builtins/fixunssfoi.c
@@ -0,0 +1,17 @@
+//===-- fixunssfoi.c - Implement __fixunssfoi -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#define SINGLE_PRECISION
+#include "fp_lib.h"
+
+#if defined(CRT_HAS_256BIT)
+typedef ou_int fixuint_t;
+#include "fp_fixuint_impl.inc"
+
+COMPILER_RT_ABI ou_int __fixunssfoi(fp_t a) { return __fixuint(a); }
+#endif
diff --git a/compiler-rt/lib/builtins/fixunstfoi.c b/compiler-rt/lib/builtins/fixunstfoi.c
new file mode 100644
index 0000000000000..00c9aff080a70
--- /dev/null
+++ b/compiler-rt/lib/builtins/fixunstfoi.c
@@ -0,0 +1,17 @@
+//===-- fixunstfoi.c - Implement __fixunstfoi -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#define QUAD_PRECISION
+#include "fp_lib.h"
+
+#if defined(CRT_HAS_TF_MODE) && defined(CRT_HAS_256BIT)
+typedef ou_int fixuint_t;
+#include "fp_fixuint_impl.inc"
+
+COMPILER_RT_ABI ou_int __fixunstfoi(fp_t a) { return __fixuint(a); }
+#endif
diff --git a/compiler-rt/lib/builtins/fixunsxfoi.c b/compiler-rt/lib/builtins/fixunsxfoi.c
new file mode 100644
index 0000000000000..2297f9d3dc335
--- /dev/null
+++ b/compiler-rt/lib/builtins/fixunsxfoi.c
@@ -0,0 +1,44 @@
+//===-- fixunsxfoi.c - Implement __fixunsxfoi -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __fixunsxfoi for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: convert a to an unsigned 256-bit integer, rounding toward zero.
+// Negative values all become zero.
+
+// Assumptions: long double is an Intel 80-bit floating point type padded with
+// 6 bytes; ou_int is a 256-bit integral type; the value in the long double is
+// representable in ou_int or is negative.
+
+// gggg gggg gggg gggg gggg gggg gggg gggg | gggg gggg gggg gggg seee eeee eeee
+// eeee | 1mmm mmmm mmmm mmmm mmmm mmmm mmmm mmmm | mmmm mmmm mmmm mmmm mmmm
+// mmmm mmmm mmmm
+
+COMPILER_RT_ABI ou_int __fixunsxfoi(xf_float a) {
+ xf_bits fb;
+ fb.f = a;
+ int e = (fb.u.high.s.low & 0x00007FFF) - 16383;
+ if (e < 0 || (fb.u.high.s.low & 0x00008000))
+ return 0;
+ if ((unsigned)e > sizeof(ou_int) * CHAR_BIT)
+ return ~(ou_int)0;
+ ou_int r = fb.u.low.all;
+ if (e > 63)
+ r <<= (e - 63);
+ else
+ r >>= (63 - e);
+ return r;
+}
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/fixxfoi.c b/compiler-rt/lib/builtins/fixxfoi.c
new file mode 100644
index 0000000000000..3a1003728be92
--- /dev/null
+++ b/compiler-rt/lib/builtins/fixxfoi.c
@@ -0,0 +1,46 @@
+//===-- fixxfoi.c - Implement __fixxfoi -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __fixxfoi for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: convert a to a signed 256-bit integer, rounding toward zero.
+
+// Assumptions: long double is an Intel 80-bit floating point type padded with
+// 6 bytes; oi_int is a 256-bit integral type; the value in the long double is
+// representable in oi_int.
+
+// gggg gggg gggg gggg gggg gggg gggg gggg | gggg gggg gggg gggg seee eeee eeee
+// eeee | 1mmm mmmm mmmm mmmm mmmm mmmm mmmm mmmm | mmmm mmmm mmmm mmmm mmmm
+// mmmm mmmm mmmm
+
+COMPILER_RT_ABI oi_int __fixxfoi(xf_float a) {
+ const oi_int oi_max = (oi_int)((~(ou_int)0) / 2);
+ const oi_int oi_min = -oi_max - 1;
+ xf_bits fb;
+ fb.f = a;
+ int e = (fb.u.high.s.low & 0x00007FFF) - 16383;
+ if (e < 0)
+ return 0;
+ oi_int s = -(si_int)((fb.u.high.s.low & 0x00008000) >> 15);
+ oi_int r = fb.u.low.all;
+ if ((unsigned)e >= sizeof(oi_int) * CHAR_BIT)
+ return a > 0 ? oi_max : oi_min;
+ if (e > 63)
+ r <<= (e - 63);
+ else
+ r >>= (63 - e);
+ return (r ^ s) - s;
+}
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/floatoidf.c b/compiler-rt/lib/builtins/floatoidf.c
new file mode 100644
index 0000000000000..89cc399b061fe
--- /dev/null
+++ b/compiler-rt/lib/builtins/floatoidf.c
@@ -0,0 +1,23 @@
+//===-- floatoidf.c - Implement __floatoidf -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __floatoidf for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+#define SRC_I256
+#define DST_DOUBLE
+#include "int_to_fp_impl.inc"
+
+COMPILER_RT_ABI double __floatoidf(oi_int a) { return __floatXiYf__(a); }
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/floatoisf.c b/compiler-rt/lib/builtins/floatoisf.c
new file mode 100644
index 0000000000000..3efaa1ebbeec4
--- /dev/null
+++ b/compiler-rt/lib/builtins/floatoisf.c
@@ -0,0 +1,23 @@
+//===-- floatoisf.c - Implement __floatoisf -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __floatoisf for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+#define SRC_I256
+#define DST_SINGLE
+#include "int_to_fp_impl.inc"
+
+COMPILER_RT_ABI float __floatoisf(oi_int a) { return __floatXiYf__(a); }
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/floatoitf.c b/compiler-rt/lib/builtins/floatoitf.c
new file mode 100644
index 0000000000000..5ca149d73c966
--- /dev/null
+++ b/compiler-rt/lib/builtins/floatoitf.c
@@ -0,0 +1,26 @@
+//===-- floatoitf.c - int256 -> quad-precision conversion ---------*- C -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements oi_int to quad-precision conversion for the
+// compiler-rt library in the IEEE-754 default round-to-nearest, ties-to-even
+// mode.
+//
+//===----------------------------------------------------------------------===//
+
+#define QUAD_PRECISION
+#include "fp_lib.h"
+#include "int_lib.h"
+
+#if defined(CRT_HAS_TF_MODE) && defined(CRT_HAS_256BIT)
+#define SRC_I256
+#define DST_QUAD
+#include "int_to_fp_impl.inc"
+
+COMPILER_RT_ABI fp_t __floatoitf(oi_int a) { return __floatXiYf__(a); }
+
+#endif
diff --git a/compiler-rt/lib/builtins/floatoixf.c b/compiler-rt/lib/builtins/floatoixf.c
new file mode 100644
index 0000000000000..253a89847c401
--- /dev/null
+++ b/compiler-rt/lib/builtins/floatoixf.c
@@ -0,0 +1,73 @@
+//===-- floatoixf.c - Implement __floatoixf -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __floatoixf for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: convert a to a long double, rounding toward even.
+
+// Assumptions: long double is an IEEE 80-bit floating point type padded to
+// 128 bits; oi_int is a 256-bit integral type.
+
+// gggg gggg gggg gggg gggg gggg gggg gggg | gggg gggg gggg gggg seee eeee eeee
+// eeee | 1mmm mmmm mmmm mmmm mmmm mmmm mmmm mmmm | mmmm mmmm mmmm mmmm mmmm
+// mmmm mmmm mmmm
+
+COMPILER_RT_ABI xf_float __floatoixf(oi_int a) {
+ if (a == 0)
+ return 0.0;
+ const unsigned N = sizeof(oi_int) * CHAR_BIT;
+ const oi_int s = a >> (N - 1);
+ a = (a ^ s) - s;
+ int sd = N - __clzoi2(a); // number of significant digits
+ int e = sd - 1; // exponent
+ if (sd > LDBL_MANT_DIG) {
+ // start: 0000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQxxxxxxxxxxxxxxxxxx
+ // finish: 000000000000000000000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQR
+ // 12345678901234567890123456
+ // 1 = msb 1 bit
+ // P = bit LDBL_MANT_DIG-1 bits to the right of 1
+ // Q = bit LDBL_MANT_DIG bits to the right of 1
+ // R = "or" of all bits to the right of Q
+ switch (sd) {
+ case LDBL_MANT_DIG + 1:
+ a <<= 1;
+ break;
+ case LDBL_MANT_DIG + 2:
+ break;
+ default:
+ a = ((ou_int)a >> (sd - (LDBL_MANT_DIG + 2))) |
+ ((a & ((ou_int)(-1) >> ((N + LDBL_MANT_DIG + 2) - sd))) != 0);
+ };
+ // finish:
+ a |= (a & 4) != 0; // Or P into R
+ ++a; // round - this step may add a significant bit
+ a >>= 2; // dump Q and R
+ // a is now rounded to LDBL_MANT_DIG or LDBL_MANT_DIG+1 bits
+ if (a & ((ou_int)1 << LDBL_MANT_DIG)) {
+ a >>= 1;
+ ++e;
+ }
+ // a is now rounded to LDBL_MANT_DIG bits
+ } else {
+ a <<= (LDBL_MANT_DIG - sd);
+ // a is now rounded to LDBL_MANT_DIG bits
+ }
+ xf_bits fb;
+ fb.u.high.s.low = ((su_int)s & 0x8000) | // sign
+ (e + 16383); // exponent
+ fb.u.low.all = (du_int)a; // mantissa
+ return fb.f;
+}
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/floatunoidf.c b/compiler-rt/lib/builtins/floatunoidf.c
new file mode 100644
index 0000000000000..22eb74d08bfd8
--- /dev/null
+++ b/compiler-rt/lib/builtins/floatunoidf.c
@@ -0,0 +1,23 @@
+//===-- floatunoidf.c - Implement __floatunoidf ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __floatunoidf for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+#define SRC_U256
+#define DST_DOUBLE
+#include "int_to_fp_impl.inc"
+
+COMPILER_RT_ABI double __floatunoidf(ou_int a) { return __floatXiYf__(a); }
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/floatunoisf.c b/compiler-rt/lib/builtins/floatunoisf.c
new file mode 100644
index 0000000000000..e0d13f4baee2d
--- /dev/null
+++ b/compiler-rt/lib/builtins/floatunoisf.c
@@ -0,0 +1,23 @@
+//===-- floatunoisf.c - Implement __floatunoisf ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __floatunoisf for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+#define SRC_U256
+#define DST_SINGLE
+#include "int_to_fp_impl.inc"
+
+COMPILER_RT_ABI float __floatunoisf(ou_int a) { return __floatXiYf__(a); }
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/floatunoitf.c b/compiler-rt/lib/builtins/floatunoitf.c
new file mode 100644
index 0000000000000..d4a8de96b517e
--- /dev/null
+++ b/compiler-rt/lib/builtins/floatunoitf.c
@@ -0,0 +1,26 @@
+//===-- floatunoitf.c - uint256 -> quad-precision conversion ------*- C -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements ou_int to quad-precision conversion for the
+// compiler-rt library in the IEEE-754 default round-to-nearest, ties-to-even
+// mode.
+//
+//===----------------------------------------------------------------------===//
+
+#define QUAD_PRECISION
+#include "fp_lib.h"
+#include "int_lib.h"
+
+#if defined(CRT_HAS_TF_MODE) && defined(CRT_HAS_256BIT)
+#define SRC_U256
+#define DST_QUAD
+#include "int_to_fp_impl.inc"
+
+COMPILER_RT_ABI fp_t __floatunoitf(ou_int a) { return __floatXiYf__(a); }
+
+#endif
diff --git a/compiler-rt/lib/builtins/floatunoixf.c b/compiler-rt/lib/builtins/floatunoixf.c
new file mode 100644
index 0000000000000..49b15ca7e242c
--- /dev/null
+++ b/compiler-rt/lib/builtins/floatunoixf.c
@@ -0,0 +1,70 @@
+//===-- floatunoixf.c - Implement __floatunoixf ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __floatunoixf for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: convert a to a long double, rounding to nearest, ties to even.
+
+// Assumption: long double is an IEEE 80-bit extended-precision type padded to
+// 128 bits; ou_int is a 256-bit integral type
+
+// gggg gggg gggg gggg gggg gggg gggg gggg | gggg gggg gggg gggg seee eeee eeee
+// eeee | 1mmm mmmm mmmm mmmm mmmm mmmm mmmm mmmm | mmmm mmmm mmmm mmmm mmmm
+// mmmm mmmm mmmm
+
+COMPILER_RT_ABI xf_float __floatunoixf(ou_int a) {
+  if (a == 0)
+    return 0.0;
+  const unsigned N = sizeof(ou_int) * CHAR_BIT;
+  int sd = N - __clzoi2(a); // number of significant digits
+  int e = sd - 1;           // exponent
+  if (sd > LDBL_MANT_DIG) {
+    // More significant bits than the 64-bit significand can hold: round to
+    // nearest, ties to even, using guard (P), round (Q) and sticky (R) bits.
+    // start:  0000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQxxxxxxxxxxxxxxxxxx
+    // finish: 000000000000000000000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQR
+    // 1 = msb 1 bit
+    // P = bit LDBL_MANT_DIG-1 bits to the right of 1
+    // Q = bit LDBL_MANT_DIG bits to the right of 1
+    // R = "or" of all bits to the right of Q
+    switch (sd) {
+    case LDBL_MANT_DIG + 1:
+      // Exactly one extra bit: shift left so P lands in the Q position.
+      a <<= 1;
+      break;
+    case LDBL_MANT_DIG + 2:
+      // Already in the desired position.
+      break;
+    default:
+      // Shift down to LDBL_MANT_DIG+2 bits, folding every discarded bit
+      // into the sticky bit R so the rounding decision still sees them.
+      a = (a >> (sd - (LDBL_MANT_DIG + 2))) |
+          ((a & ((ou_int)(-1) >> ((N + LDBL_MANT_DIG + 2) - sd))) != 0);
+    };
+    // finish:
+    a |= (a & 4) != 0; // Or P into R
+    ++a;               // round - this step may add a significant bit
+    a >>= 2;           // dump Q and R
+    // a is now rounded to LDBL_MANT_DIG or LDBL_MANT_DIG+1 bits
+    if (a & ((ou_int)1 << LDBL_MANT_DIG)) {
+      // Rounding carried into a new high bit; renormalize.
+      a >>= 1;
+      ++e;
+    }
+    // a is now rounded to LDBL_MANT_DIG bits
+  } else {
+    a <<= (LDBL_MANT_DIG - sd);
+    // a is now rounded to LDBL_MANT_DIG bits
+  }
+  xf_bits fb;
+  fb.u.high.s.low = (e + 16383); // exponent, biased for the 80-bit format
+  fb.u.low.all = (du_int)a;      // mantissa (explicit integer bit included)
+  return fb.f;
+}
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/fp_fixint_impl.inc b/compiler-rt/lib/builtins/fp_fixint_impl.inc
index 2f2f77ce781ae..245b29b7ba7ab 100644
--- a/compiler-rt/lib/builtins/fp_fixint_impl.inc
+++ b/compiler-rt/lib/builtins/fp_fixint_impl.inc
@@ -27,8 +27,8 @@ static __inline fixint_t __fixint(fp_t a) {
if (exponent < 0)
return 0;
- // If the value is too large for the integer type, saturate.
- if ((unsigned)exponent >= sizeof(fixint_t) * CHAR_BIT)
+ // If the value is too large for the integer type, or is inf/NaN, saturate.
+ if ((unsigned)exponent >= sizeof(fixint_t) * CHAR_BIT || aAbs >= infRep)
return sign == 1 ? fixint_max : fixint_min;
// If 0 <= exponent < significandBits, right shift to get the result.
diff --git a/compiler-rt/lib/builtins/int_lib.h b/compiler-rt/lib/builtins/int_lib.h
index 943430de259d8..79271308a448b 100644
--- a/compiler-rt/lib/builtins/int_lib.h
+++ b/compiler-rt/lib/builtins/int_lib.h
@@ -112,9 +112,15 @@ COMPILER_RT_ABI su_int __udivmodsi4(su_int a, su_int b, su_int *rem);
COMPILER_RT_ABI du_int __udivmoddi4(du_int a, du_int b, du_int *rem);
#ifdef CRT_HAS_128BIT
COMPILER_RT_ABI int __clzti2(ti_int a);
+COMPILER_RT_ABI int __ctzti2(ti_int a);
COMPILER_RT_ABI tu_int __udivmodti4(tu_int a, tu_int b, tu_int *rem);
#endif
+#ifdef CRT_HAS_256BIT
+COMPILER_RT_ABI int __clzoi2(oi_int a);
+COMPILER_RT_ABI ou_int __udivmodoi4(ou_int a, ou_int b, ou_int *rem);
+#endif
+
// Definitions for builtins unavailable on MSVC
#if defined(_MSC_VER) && !defined(__clang__)
#include <intrin.h>
diff --git a/compiler-rt/lib/builtins/int_to_fp.h b/compiler-rt/lib/builtins/int_to_fp.h
index 2c1218f1e89c4..3393b2c0f4aaa 100644
--- a/compiler-rt/lib/builtins/int_to_fp.h
+++ b/compiler-rt/lib/builtins/int_to_fp.h
@@ -36,6 +36,16 @@ typedef __uint128_t src_t;
typedef __uint128_t usrc_t;
static __inline int clzSrcT(usrc_t x) { return __clzti2(x); }
+#elif defined SRC_I256
+typedef __int256_t src_t;
+typedef __uint256_t usrc_t;
+static __inline int clzSrcT(usrc_t x) { return __clzoi2(x); }
+
+#elif defined SRC_U256
+typedef __uint256_t src_t;
+typedef __uint256_t usrc_t;
+static __inline int clzSrcT(usrc_t x) { return __clzoi2(x); }
+
#else
#error Source should be a handled integer type.
#endif
diff --git a/compiler-rt/lib/builtins/int_to_fp_impl.inc b/compiler-rt/lib/builtins/int_to_fp_impl.inc
index 11736ed7aafc8..91eb668de9eb7 100644
--- a/compiler-rt/lib/builtins/int_to_fp_impl.inc
+++ b/compiler-rt/lib/builtins/int_to_fp_impl.inc
@@ -63,7 +63,13 @@ static __inline dst_t __floatXiYf__(src_t a) {
const dst_rep_t dstSignMask = DST_REP_C(1) << (dstBits - 1);
const int dstExpBits = dstBits - dstSigBits - 1;
const int dstExpBias = (1 << (dstExpBits - 1)) - 1;
+ const int dstExpMax = (1 << dstExpBits) - 1;
const dst_rep_t dstSignificandMask = (DST_REP_C(1) << dstSigBits) - 1;
+ // If the exponent exceeds the destination's range, return infinity.
+ if (e + dstExpBias >= dstExpMax) {
+ return dstFromRep(((dst_rep_t)s & dstSignMask) |
+ ((dst_rep_t)dstExpMax << dstSigBits));
+ }
// Combine sign, exponent, and mantissa.
const dst_rep_t result = ((dst_rep_t)s & dstSignMask) |
((dst_rep_t)(e + dstExpBias) << dstSigBits) |
diff --git a/compiler-rt/lib/builtins/int_types.h b/compiler-rt/lib/builtins/int_types.h
index 7c7f8cb64aa9a..6420ef0f65e84 100644
--- a/compiler-rt/lib/builtins/int_types.h
+++ b/compiler-rt/lib/builtins/int_types.h
@@ -121,6 +121,56 @@ static __inline tu_int make_tu(du_int h, du_int l) {
#endif // CRT_HAS_128BIT
+#if defined(__SIZEOF_INT256__)
+#define CRT_HAS_256BIT
+#endif
+
+#ifdef CRT_HAS_256BIT
+// 256-bit integer types; the "oi"/"ou" prefix extends the di (64-bit) and
+// ti (128-bit) naming used throughout this header (presumably after GCC's
+// OImode -- TODO confirm the intended mode letter).
+typedef __int256_t oi_int;
+typedef __uint256_t ou_int;
+
+// Signed 256-bit value viewed as two 128-bit limbs (endian-aware layout;
+// only the high limb carries the sign).
+typedef union {
+  oi_int all;
+  struct {
+#if _YUGA_LITTLE_ENDIAN
+    tu_int low;
+    ti_int high;
+#else
+    ti_int high;
+    tu_int low;
+#endif // _YUGA_LITTLE_ENDIAN
+  } s;
+} owords;
+
+// Unsigned 256-bit value viewed as two unsigned 128-bit limbs.
+typedef union {
+  ou_int all;
+  struct {
+#if _YUGA_LITTLE_ENDIAN
+    tu_int low;
+    tu_int high;
+#else
+    tu_int high;
+    tu_int low;
+#endif // _YUGA_LITTLE_ENDIAN
+  } s;
+} uowords;
+
+// Builds a signed 256-bit value from two 128-bit halves (mirrors make_ti).
+static __inline oi_int make_oi(ti_int h, ti_int l) {
+  owords r;
+  r.s.high = (tu_int)h;
+  r.s.low = (tu_int)l;
+  return r.all;
+}
+
+// Builds an unsigned 256-bit value from two 128-bit halves (mirrors make_tu).
+static __inline ou_int make_ou(tu_int h, tu_int l) {
+  uowords r;
+  r.s.high = h;
+  r.s.low = l;
+  return r.all;
+}
+
+#endif // CRT_HAS_256BIT
+
// FreeBSD's boot environment does not support using floating-point and poisons
// the float and double keywords.
#if defined(__FreeBSD__) && defined(_STANDALONE)
diff --git a/compiler-rt/lib/builtins/lshroi3.c b/compiler-rt/lib/builtins/lshroi3.c
new file mode 100644
index 0000000000000..d4e4920bda0a1
--- /dev/null
+++ b/compiler-rt/lib/builtins/lshroi3.c
@@ -0,0 +1,38 @@
+//===-- lshroi3.c - Implement __lshroi3 -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __lshroi3 for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: logical a >> b
+
+// Precondition: 0 <= b < bits_in_oword
+
+COMPILER_RT_ABI oi_int __lshroi3(oi_int a, int b) {
+  const int bits_in_tword = (int)(sizeof(ti_int) * CHAR_BIT);
+  // View the value through the unsigned union so the per-limb shifts below
+  // are logical (zero-filling), regardless of the sign of a.
+  uowords input;
+  uowords result;
+  input.all = a;
+  if (b & bits_in_tword) /* bits_in_tword <= b < bits_in_oword */ {
+    // The low limb is shifted out entirely; the high limb supplies the rest.
+    result.s.high = 0;
+    result.s.low = input.s.high >> (b - bits_in_tword);
+  } else /* 0 <= b < bits_in_tword */ {
+    // Early return avoids the undefined (bits_in_tword - b) == full-width
+    // shift in the merge expression below.
+    if (b == 0)
+      return a;
+    result.s.high = input.s.high >> b;
+    result.s.low = (input.s.high << (bits_in_tword - b)) | (input.s.low >> b);
+  }
+  return result.all;
+}
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/modoi3.c b/compiler-rt/lib/builtins/modoi3.c
new file mode 100644
index 0000000000000..117a419019a36
--- /dev/null
+++ b/compiler-rt/lib/builtins/modoi3.c
@@ -0,0 +1,26 @@
+//===-- modoi3.c - Implement __modoi3 -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __modoi3 for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: a % b
+
+// Instantiate the shared signed-division template (int_div_impl.inc) over the
+// 256-bit types; the unsigned divide/modulo work is done by __udivmodoi4.
+#define fixint_t oi_int
+#define fixuint_t ou_int
+#define ASSIGN_UMOD(res, a, b) __udivmodoi4((a), (b), &(res))
+#include "int_div_impl.inc"
+
+COMPILER_RT_ABI oi_int __modoi3(oi_int a, oi_int b) { return __modXi3(a, b); }
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/muloi5.c b/compiler-rt/lib/builtins/muloi5.c
new file mode 100644
index 0000000000000..60f3831aac959
--- /dev/null
+++ b/compiler-rt/lib/builtins/muloi5.c
@@ -0,0 +1,29 @@
+//===-- muloi5.c - Implement __muloi5 -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __muloi5 for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: a * b
+
+// Effects: sets *overflow to 1 if a * b overflows
+
+// Instantiate the shared overflow-reporting multiply template
+// (int_mulo_impl.inc) over the 256-bit types.
+#define fixint_t oi_int
+#define fixuint_t ou_int
+#include "int_mulo_impl.inc"
+
+COMPILER_RT_ABI oi_int __muloi5(oi_int a, oi_int b, int *overflow) {
+  return __muloXi4(a, b, overflow);
+}
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/multi5.c b/compiler-rt/lib/builtins/multi5.c
new file mode 100644
index 0000000000000..9172895b7ebf7
--- /dev/null
+++ b/compiler-rt/lib/builtins/multi5.c
@@ -0,0 +1,51 @@
+//===-- multi5.c - Implement __multi5 -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __multi5 for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: the full 256-bit product of the 128-bit operands a and b.
+// This is a widening multiply (128 x 128 -> 256): schoolbook multiplication
+// on 64-bit half-limbs with carries accumulated in t, mirroring the static
+// __mulddi3 helper used by __multi3.
+static oi_int __multti3(tu_int a, tu_int b) {
+  owords r;
+  const int bits_in_tword_2 = (int)(sizeof(ti_int) * CHAR_BIT) / 2;
+  const tu_int lower_mask = (tu_int)~0 >> bits_in_tword_2;
+  // low(a) * low(b)
+  r.s.low = (a & lower_mask) * (b & lower_mask);
+  tu_int t = (tu_int)r.s.low >> bits_in_tword_2;
+  r.s.low &= lower_mask;
+  // + high(a) * low(b), carried into the upper half of r.s.low and r.s.high.
+  t += (a >> bits_in_tword_2) * (b & lower_mask);
+  r.s.low += (t & lower_mask) << bits_in_tword_2;
+  r.s.high = t >> bits_in_tword_2;
+  t = (tu_int)r.s.low >> bits_in_tword_2;
+  r.s.low &= lower_mask;
+  // + low(a) * high(b)
+  t += (b >> bits_in_tword_2) * (a & lower_mask);
+  r.s.low += (t & lower_mask) << bits_in_tword_2;
+  r.s.high += t >> bits_in_tword_2;
+  // + high(a) * high(b), which lands entirely in the high limb.
+  r.s.high += (a >> bits_in_tword_2) * (b >> bits_in_tword_2);
+  return r.all;
+}
+
+// Returns: a * b (truncated to 256 bits)
+
+COMPILER_RT_ABI oi_int __multi5(oi_int a, oi_int b) {
+  owords x;
+  x.all = a;
+  owords y;
+  y.all = b;
+  owords r;
+  // low*low produces the only full-width partial product; the two cross
+  // terms affect just the high limb, and high*high would only touch bits
+  // >= 256, so it is dropped.
+  r.all = __multti3(x.s.low, y.s.low);
+  r.s.high += x.s.high * y.s.low + x.s.low * y.s.high;
+  return r.all;
+}
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/mulvoi3.c b/compiler-rt/lib/builtins/mulvoi3.c
new file mode 100644
index 0000000000000..1ec46d45e7eff
--- /dev/null
+++ b/compiler-rt/lib/builtins/mulvoi3.c
@@ -0,0 +1,27 @@
+//===-- mulvoi3.c - Implement __mulvoi3 -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __mulvoi3 for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: a * b
+
+// Effects: aborts if a * b overflows
+
+// Instantiate the shared trapping-multiply template (int_mulv_impl.inc)
+// over the 256-bit types.
+#define fixint_t oi_int
+#define fixuint_t ou_int
+#include "int_mulv_impl.inc"
+
+COMPILER_RT_ABI oi_int __mulvoi3(oi_int a, oi_int b) { return __mulvXi3(a, b); }
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/negoi2.c b/compiler-rt/lib/builtins/negoi2.c
new file mode 100644
index 0000000000000..ae46825fd7416
--- /dev/null
+++ b/compiler-rt/lib/builtins/negoi2.c
@@ -0,0 +1,25 @@
+//===-- negoi2.c - Implement __negoi2 -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __negoi2 for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: -a
+
+COMPILER_RT_ABI oi_int __negoi2(oi_int a) {
+  // Negate via the two's-complement identity in the unsigned domain, so the
+  // operation is well defined even for the minimum value. This routine only
+  // exists for API compatibility; any sane compiler expands it inline.
+  const ou_int negated = ~(ou_int)a + 1;
+  return negated;
+}
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/negvoi2.c b/compiler-rt/lib/builtins/negvoi2.c
new file mode 100644
index 0000000000000..07d29b6480a68
--- /dev/null
+++ b/compiler-rt/lib/builtins/negvoi2.c
@@ -0,0 +1,28 @@
+//===-- negvoi2.c - Implement __negvoi2 -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __negvoi2 for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: -a
+
+// Effects: aborts if -a overflows
+
+COMPILER_RT_ABI oi_int __negvoi2(oi_int a) {
+  // The only 256-bit value whose negation overflows is OI_MIN (-2^255),
+  // built here by shifting an unsigned 1 into the sign-bit position.
+  const oi_int MIN = (ou_int)1 << ((int)(sizeof(oi_int) * CHAR_BIT) - 1);
+  if (a == MIN)
+    compilerrt_abort();
+  return -a;
+}
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/parityoi2.c b/compiler-rt/lib/builtins/parityoi2.c
new file mode 100644
index 0000000000000..88ca0791a8a98
--- /dev/null
+++ b/compiler-rt/lib/builtins/parityoi2.c
@@ -0,0 +1,36 @@
+//===-- parityoi2.c - Implement __parityoi2 -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __parityoi2 for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: 1 if number of bits is odd else returns 0
+
+COMPILER_RT_ABI int __parityoi2(oi_int a) {
+  owords x;
+  x.all = a;
+  // Parity is preserved by XOR-folding: fold the two 128-bit halves...
+  tu_int x2 = x.s.high ^ x.s.low;
+  // ...then the two 64-bit halves of the 128-bit result...
+  dwords x3;
+  utwords t;
+  t.all = x2;
+  x3.all = t.s.high ^ t.s.low;
+  // ...then down to 32 bits, and finally to 4 bits.
+  su_int x4 = x3.s.high ^ x3.s.low;
+  x4 ^= x4 >> 16;
+  x4 ^= x4 >> 8;
+  x4 ^= x4 >> 4;
+  // 0x6996 is a 16-entry lookup table of the parities of the nibble values.
+  return (0x6996 >> (x4 & 0xF)) & 1;
+}
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/popcountoi2.c b/compiler-rt/lib/builtins/popcountoi2.c
new file mode 100644
index 0000000000000..2052c3b26c04b
--- /dev/null
+++ b/compiler-rt/lib/builtins/popcountoi2.c
@@ -0,0 +1,27 @@
+//===-- popcountoi2.c - Implement __popcountoi2 ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __popcountoi2 for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Declared here rather than relying on int_lib.h, which only exposes a
+// subset of the 128-bit builtins.
+COMPILER_RT_ABI int __popcountti2(ti_int a);
+
+// Returns: count of 1 bits
+
+COMPILER_RT_ABI int __popcountoi2(oi_int a) {
+  // Popcount is additive over disjoint bit ranges, so sum the counts of the
+  // two 128-bit halves.
+  uowords x;
+  x.all = (ou_int)a;
+  return __popcountti2(x.s.low) + __popcountti2(x.s.high);
+}
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/subvoi3.c b/compiler-rt/lib/builtins/subvoi3.c
new file mode 100644
index 0000000000000..b4c64c0d9dcf5
--- /dev/null
+++ b/compiler-rt/lib/builtins/subvoi3.c
@@ -0,0 +1,33 @@
+//===-- subvoi3.c - Implement __subvoi3 -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __subvoi3 for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: a - b
+
+// Effects: aborts if a - b overflows
+
+COMPILER_RT_ABI oi_int __subvoi3(oi_int a, oi_int b) {
+  // Subtract in the unsigned domain so the operation itself cannot trigger
+  // signed-overflow undefined behavior.
+  oi_int s = (ou_int)a - (ou_int)b;
+  // Signed overflow occurred iff the result moved in the "wrong" direction
+  // relative to a for the sign of b.
+  if (b >= 0) {
+    if (s > a)
+      compilerrt_abort();
+  } else {
+    if (s <= a)
+      compilerrt_abort();
+  }
+  return s;
+}
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/ucmpoi2.c b/compiler-rt/lib/builtins/ucmpoi2.c
new file mode 100644
index 0000000000000..cbfbe23fe0836
--- /dev/null
+++ b/compiler-rt/lib/builtins/ucmpoi2.c
@@ -0,0 +1,37 @@
+//===-- ucmpoi2.c - Implement __ucmpoi2 -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __ucmpoi2 for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: if (a < b) returns 0
+// if (a == b) returns 1
+// if (a > b) returns 2
+
+COMPILER_RT_ABI si_int __ucmpoi2(ou_int a, ou_int b) {
+  uowords x;
+  uowords y;
+  x.all = a;
+  y.all = b;
+  // Compare the high 128-bit limbs first; only when they are equal does the
+  // ordering depend on the low limbs.
+  if (x.s.high != y.s.high)
+    return x.s.high < y.s.high ? 0 : 2;
+  if (x.s.low != y.s.low)
+    return x.s.low < y.s.low ? 0 : 2;
+  return 1;
+}
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/udivmodoi4.c b/compiler-rt/lib/builtins/udivmodoi4.c
new file mode 100644
index 0000000000000..9ae441a27b745
--- /dev/null
+++ b/compiler-rt/lib/builtins/udivmodoi4.c
@@ -0,0 +1,147 @@
+//===-- udivmodoi4.c - Implement __udivmodoi4 -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __udivmodoi4 for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Divides the 256-bit value u1:u0 by the 128-bit divisor v using Knuth's
+// Algorithm D with two 64-bit quotient digits. The quotient must fit in 128
+// bits (the caller guarantees u1 < v). The remainder is stored in *r.
+// Adapted from the 128/64 algorithm in udivmodti4.c.
+UNUSED
+static inline tu_int udiv256by128to128default(tu_int u1, tu_int u0, tu_int v,
+                                              tu_int *r) {
+  const unsigned n_utword_bits = sizeof(tu_int) * CHAR_BIT;
+  const tu_int b = (tu_int)1 << (n_utword_bits / 2); // Number base (64 bits)
+  tu_int un1, un0;          // Norm. dividend LSD's
+  tu_int vn1, vn0;          // Norm. divisor digits
+  tu_int q1, q0;            // Quotient digits
+  tu_int un128, un21, un10; // Dividend digit pairs
+  tu_int rhat;              // A remainder
+  si_int s;                 // Shift amount for normalization
+
+  s = __clzti2(v);
+  if (s > 0) {
+    // Normalize the divisor so its most significant bit is set; shift the
+    // dividend by the same amount (the quotient is unchanged).
+    v = v << s;
+    un128 = (u1 << s) | (u0 >> (n_utword_bits - s));
+    un10 = u0 << s;
+  } else {
+    // Avoid undefined behavior of (u0 >> 128).
+    un128 = u1;
+    un10 = u0;
+  }
+
+  // Break divisor up into two 64-bit digits.
+  vn1 = v >> (n_utword_bits / 2);
+  vn0 = v & (((tu_int)1 << (n_utword_bits / 2)) - 1);
+
+  // Break right half of dividend into two digits.
+  un1 = un10 >> (n_utword_bits / 2);
+  un0 = un10 & (((tu_int)1 << (n_utword_bits / 2)) - 1);
+
+  // Compute the first quotient digit, q1.
+  q1 = un128 / vn1;
+  rhat = un128 - q1 * vn1;
+
+  // q1 has at most error 2. No more than 2 iterations.
+  while (q1 >= b || q1 * vn0 > b * rhat + un1) {
+    q1 = q1 - 1;
+    rhat = rhat + vn1;
+    if (rhat >= b)
+      break;
+  }
+
+  un21 = un128 * b + un1 - q1 * v;
+
+  // Compute the second quotient digit.
+  q0 = un21 / vn1;
+  rhat = un21 - q0 * vn1;
+
+  // q0 has at most error 2. No more than 2 iterations.
+  while (q0 >= b || q0 * vn0 > b * rhat + un0) {
+    q0 = q0 - 1;
+    rhat = rhat + vn1;
+    if (rhat >= b)
+      break;
+  }
+
+  // Undo the normalization shift when extracting the remainder.
+  *r = (un21 * b + un0 - q0 * v) >> s;
+  return q1 * b + q0;
+}
+
+// Dispatch wrapper around the portable implementation; kept as a separate
+// function so a target-optimized variant can be slotted in later (this
+// mirrors udivmodti4.c and is why the default above is marked UNUSED).
+static inline tu_int udiv256by128to128(tu_int u1, tu_int u0, tu_int v,
+                                       tu_int *r) {
+  return udiv256by128to128default(u1, u0, v, r);
+}
+
+// Effects: if rem != 0, *rem = a % b
+// Returns: a / b
+
+COMPILER_RT_ABI ou_int __udivmodoi4(ou_int a, ou_int b, ou_int *rem) {
+  const unsigned n_uoword_bits = sizeof(ou_int) * CHAR_BIT;
+  uowords dividend;
+  dividend.all = a;
+  uowords divisor;
+  divisor.all = b;
+  uowords quotient;
+  uowords remainder;
+  // divisor > dividend: quotient is 0 and the remainder is the dividend.
+  if (divisor.all > dividend.all) {
+    if (rem)
+      *rem = dividend.all;
+    return 0;
+  }
+  // When the divisor fits in 128 bits, we can use an optimized path.
+  if (divisor.s.high == 0) {
+    remainder.s.high = 0;
+    if (dividend.s.high < divisor.s.low) {
+      // The result fits in 128 bits.
+      quotient.s.low = udiv256by128to128(dividend.s.high, dividend.s.low,
+                                         divisor.s.low, &remainder.s.low);
+      quotient.s.high = 0;
+    } else {
+      // First, divide with the high part to get the remainder in
+      // dividend.s.high. After that dividend.s.high < divisor.s.low.
+      quotient.s.high = dividend.s.high / divisor.s.low;
+      dividend.s.high = dividend.s.high % divisor.s.low;
+      quotient.s.low = udiv256by128to128(dividend.s.high, dividend.s.low,
+                                         divisor.s.low, &remainder.s.low);
+    }
+    if (rem)
+      *rem = remainder.all;
+    return quotient.all;
+  }
+  // Full 256-bit divisor: classic shift-and-subtract long division.
+  // 0 <= shift <= 127.
+  si_int shift = __clzti2(divisor.s.high) - __clzti2(dividend.s.high);
+  divisor.all <<= shift;
+  quotient.s.high = 0;
+  quotient.s.low = 0;
+  for (; shift >= 0; --shift) {
+    quotient.s.low <<= 1;
+    // Branch free version of.
+    // if (dividend.all >= divisor.all)
+    // {
+    //   dividend.all -= divisor.all;
+    //   carry = 1;
+    // }
+    // The arithmetic right shift smears the sign bit, so s is all-ones when
+    // dividend >= divisor and all-zeros otherwise.
+    const oi_int s =
+        (oi_int)(divisor.all - dividend.all - 1) >> (n_uoword_bits - 1);
+    quotient.s.low |= s & 1;
+    dividend.all -= divisor.all & s;
+    divisor.all >>= 1;
+  }
+  if (rem)
+    *rem = dividend.all;
+  return quotient.all;
+}
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/udivoi3.c b/compiler-rt/lib/builtins/udivoi3.c
new file mode 100644
index 0000000000000..a4f489c9c7f77
--- /dev/null
+++ b/compiler-rt/lib/builtins/udivoi3.c
@@ -0,0 +1,23 @@
+//===-- udivoi3.c - Implement __udivoi3 -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __udivoi3 for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: a / b
+
+COMPILER_RT_ABI ou_int __udivoi3(ou_int a, ou_int b) {
+  // Quotient only; a null rem pointer tells __udivmodoi4 to skip the
+  // remainder store.
+  return __udivmodoi4(a, b, 0);
+}
+
+#endif // CRT_HAS_256BIT
diff --git a/compiler-rt/lib/builtins/umodoi3.c b/compiler-rt/lib/builtins/umodoi3.c
new file mode 100644
index 0000000000000..3598777e1a78b
--- /dev/null
+++ b/compiler-rt/lib/builtins/umodoi3.c
@@ -0,0 +1,25 @@
+//===-- umodoi3.c - Implement __umodoi3 -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __umodoi3 for the compiler_rt library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "int_lib.h"
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: a % b
+
+COMPILER_RT_ABI ou_int __umodoi3(ou_int a, ou_int b) {
+  // The combined divide/modulo helper computes both results in one pass;
+  // the quotient is discarded and only the remainder is kept.
+  ou_int remainder;
+  (void)__udivmodoi4(a, b, &remainder);
+  return remainder;
+}
+
+#endif // CRT_HAS_256BIT
>From d441ec5cf91245b60fecc953890c39b9f15c2410 Mon Sep 17 00:00:00 2001
From: Xavier Roche <xavier.roche at algolia.com>
Date: Tue, 24 Feb 2026 21:40:03 +0100
Subject: [PATCH 08/17] [compiler-rt][test] Add __int256 builtin tests
Add 40 unit tests for all __int256 builtins, covering:
- Full-width big-number arithmetic (multiply, div/mod, divmod)
- Shifts (logical/arithmetic left/right) with various shift amounts
- Bit operations (clz, ctz, ffs, popcount, parity)
- Overflow-checked operations (add, sub, mul, abs, neg)
- Float conversions (fix/fixuns/float/floatun for sf/df/tf/xf)
- Signed/unsigned comparisons
Tests use CRT_HAS_256BIT guard: compile on 32-bit but print "skipped".
Each test covers boundary cases (0, 1, max, min, powers of 2).
Update lit.cfg.py to provide has_int256 feature flag.
Assisted-by: Claude (Anthropic)
Co-Authored-By: Claude Opus 4.6 <noreply at anthropic.com>
---
compiler-rt/test/builtins/Unit/absvoi2_test.c | 67 +++++
compiler-rt/test/builtins/Unit/addvoi3_test.c | 78 +++++
compiler-rt/test/builtins/Unit/ashloi3_test.c | 122 ++++++++
compiler-rt/test/builtins/Unit/ashroi3_test.c | 86 ++++++
compiler-rt/test/builtins/Unit/clzoi2_test.c | 78 +++++
compiler-rt/test/builtins/Unit/cmpoi2_test.c | 93 ++++++
compiler-rt/test/builtins/Unit/ctzoi2_test.c | 83 ++++++
.../test/builtins/Unit/divmodoi4_test.c | 97 +++++++
compiler-rt/test/builtins/Unit/divoi3_test.c | 97 +++++++
compiler-rt/test/builtins/Unit/ffsoi2_test.c | 86 ++++++
compiler-rt/test/builtins/Unit/fixdfoi_test.c | 93 ++++++
compiler-rt/test/builtins/Unit/fixsfoi_test.c | 98 +++++++
compiler-rt/test/builtins/Unit/fixtfoi_test.c | 47 +++
.../test/builtins/Unit/fixunsdfoi_test.c | 47 +++
.../test/builtins/Unit/fixunssfoi_test.c | 47 +++
.../test/builtins/Unit/fixunstfoi_test.c | 43 +++
.../test/builtins/Unit/fixunsxfoi_test.c | 149 ++++++++++
compiler-rt/test/builtins/Unit/fixxfoi_test.c | 144 ++++++++++
.../test/builtins/Unit/floatoidf_test.c | 89 ++++++
.../test/builtins/Unit/floatoisf_test.c | 77 +++++
.../test/builtins/Unit/floatoitf_test.c | 45 +++
.../test/builtins/Unit/floatoixf_test.c | 114 ++++++++
.../test/builtins/Unit/floatunoidf_test.c | 43 +++
.../test/builtins/Unit/floatunoisf_test.c | 41 +++
.../test/builtins/Unit/floatunoitf_test.c | 43 +++
.../test/builtins/Unit/floatunoixf_test.c | 123 ++++++++
compiler-rt/test/builtins/Unit/lit.cfg.py | 11 +
compiler-rt/test/builtins/Unit/lshroi3_test.c | 101 +++++++
compiler-rt/test/builtins/Unit/modoi3_test.c | 82 ++++++
compiler-rt/test/builtins/Unit/muloi5_test.c | 164 +++++++++++
compiler-rt/test/builtins/Unit/multi5_test.c | 174 +++++++++++
compiler-rt/test/builtins/Unit/mulvoi3_test.c | 119 ++++++++
compiler-rt/test/builtins/Unit/negoi2_test.c | 69 +++++
compiler-rt/test/builtins/Unit/negvoi2_test.c | 59 ++++
.../test/builtins/Unit/parityoi2_test.c | 83 ++++++
.../test/builtins/Unit/popcountoi2_test.c | 86 ++++++
compiler-rt/test/builtins/Unit/subvoi3_test.c | 81 ++++++
compiler-rt/test/builtins/Unit/ucmpoi2_test.c | 89 ++++++
.../test/builtins/Unit/udivmodoi4_test.c | 272 ++++++++++++++++++
compiler-rt/test/builtins/Unit/udivoi3_test.c | 92 ++++++
compiler-rt/test/builtins/Unit/umodoi3_test.c | 80 ++++++
41 files changed, 3692 insertions(+)
create mode 100644 compiler-rt/test/builtins/Unit/absvoi2_test.c
create mode 100644 compiler-rt/test/builtins/Unit/addvoi3_test.c
create mode 100644 compiler-rt/test/builtins/Unit/ashloi3_test.c
create mode 100644 compiler-rt/test/builtins/Unit/ashroi3_test.c
create mode 100644 compiler-rt/test/builtins/Unit/clzoi2_test.c
create mode 100644 compiler-rt/test/builtins/Unit/cmpoi2_test.c
create mode 100644 compiler-rt/test/builtins/Unit/ctzoi2_test.c
create mode 100644 compiler-rt/test/builtins/Unit/divmodoi4_test.c
create mode 100644 compiler-rt/test/builtins/Unit/divoi3_test.c
create mode 100644 compiler-rt/test/builtins/Unit/ffsoi2_test.c
create mode 100644 compiler-rt/test/builtins/Unit/fixdfoi_test.c
create mode 100644 compiler-rt/test/builtins/Unit/fixsfoi_test.c
create mode 100644 compiler-rt/test/builtins/Unit/fixtfoi_test.c
create mode 100644 compiler-rt/test/builtins/Unit/fixunsdfoi_test.c
create mode 100644 compiler-rt/test/builtins/Unit/fixunssfoi_test.c
create mode 100644 compiler-rt/test/builtins/Unit/fixunstfoi_test.c
create mode 100644 compiler-rt/test/builtins/Unit/fixunsxfoi_test.c
create mode 100644 compiler-rt/test/builtins/Unit/fixxfoi_test.c
create mode 100644 compiler-rt/test/builtins/Unit/floatoidf_test.c
create mode 100644 compiler-rt/test/builtins/Unit/floatoisf_test.c
create mode 100644 compiler-rt/test/builtins/Unit/floatoitf_test.c
create mode 100644 compiler-rt/test/builtins/Unit/floatoixf_test.c
create mode 100644 compiler-rt/test/builtins/Unit/floatunoidf_test.c
create mode 100644 compiler-rt/test/builtins/Unit/floatunoisf_test.c
create mode 100644 compiler-rt/test/builtins/Unit/floatunoitf_test.c
create mode 100644 compiler-rt/test/builtins/Unit/floatunoixf_test.c
create mode 100644 compiler-rt/test/builtins/Unit/lshroi3_test.c
create mode 100644 compiler-rt/test/builtins/Unit/modoi3_test.c
create mode 100644 compiler-rt/test/builtins/Unit/muloi5_test.c
create mode 100644 compiler-rt/test/builtins/Unit/multi5_test.c
create mode 100644 compiler-rt/test/builtins/Unit/mulvoi3_test.c
create mode 100644 compiler-rt/test/builtins/Unit/negoi2_test.c
create mode 100644 compiler-rt/test/builtins/Unit/negvoi2_test.c
create mode 100644 compiler-rt/test/builtins/Unit/parityoi2_test.c
create mode 100644 compiler-rt/test/builtins/Unit/popcountoi2_test.c
create mode 100644 compiler-rt/test/builtins/Unit/subvoi3_test.c
create mode 100644 compiler-rt/test/builtins/Unit/ucmpoi2_test.c
create mode 100644 compiler-rt/test/builtins/Unit/udivmodoi4_test.c
create mode 100644 compiler-rt/test/builtins/Unit/udivoi3_test.c
create mode 100644 compiler-rt/test/builtins/Unit/umodoi3_test.c
diff --git a/compiler-rt/test/builtins/Unit/absvoi2_test.c b/compiler-rt/test/builtins/Unit/absvoi2_test.c
new file mode 100644
index 0000000000000..f26526f0054f9
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/absvoi2_test.c
@@ -0,0 +1,67 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_absvoi2
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI oi_int __absvoi2(oi_int a);
+
+int test__absvoi2(oi_int a, oi_int expected) {
+ oi_int x = __absvoi2(a);
+ if (x != expected) {
+ printf("error in __absvoi2\n");
+ return 1;
+ }
+ return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+ if (test__absvoi2((oi_int)0, (oi_int)0))
+ return 1;
+ if (test__absvoi2((oi_int)1, (oi_int)1))
+ return 1;
+ if (test__absvoi2((oi_int)-1, (oi_int)1))
+ return 1;
+ if (test__absvoi2((oi_int)42, (oi_int)42))
+ return 1;
+ if (test__absvoi2((oi_int)-42, (oi_int)42))
+ return 1;
+ // Large positive value (already positive, no change)
+ {
+ oi_int big = make_oi(make_ti(0, 1), make_ti(0, 0));
+ if (test__absvoi2(big, big))
+ return 1;
+ }
+ // Large negative value
+ if (test__absvoi2(make_oi(make_ti(-1, -1), make_ti(0, 0)),
+ make_oi(make_ti(0, 1), make_ti(0, 0))))
+ return 1;
+ // MAX (already positive)
+ {
+ oi_int MAX = make_oi(make_ti(0x7FFFFFFFFFFFFFFFLL, -1), make_ti(-1, -1));
+ if (test__absvoi2(MAX, MAX))
+ return 1;
+ }
+ // Note: MIN would abort, so we don't test it.
+ // Full-width big-number test (all 4 limbs populated).
+ // Expected value verified by Python arbitrary-precision arithmetic.
+ // C is negative signed; abs(C) = |C|
+ if (test__absvoi2(
+ make_oi(make_ti(0xDDDDEEEEFFFF0000LL, 0x1111222233334444ULL),
+ make_ti(0x5555666677778888ULL, 0x9999AAAABBBBCCCCULL)),
+ make_oi(make_ti(0x222211110000FFFFLL, 0xEEEEDDDDCCCCBBBBULL),
+ make_ti(0xAAAA999988887777ULL, 0x6666555544443334ULL))))
+ return 1;
+#else
+ printf("skipped\n");
+#endif
+ return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/addvoi3_test.c b/compiler-rt/test/builtins/Unit/addvoi3_test.c
new file mode 100644
index 0000000000000..6cc2732cf63bd
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/addvoi3_test.c
@@ -0,0 +1,78 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_addvoi3
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI oi_int __addvoi3(oi_int a, oi_int b);
+
+int test__addvoi3(oi_int a, oi_int b, oi_int expected) {
+ oi_int x = __addvoi3(a, b);
+ if (x != expected) {
+ printf("error in __addvoi3\n");
+ return 1;
+ }
+ return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+ if (test__addvoi3((oi_int)0, (oi_int)0, (oi_int)0))
+ return 1;
+ if (test__addvoi3((oi_int)1, (oi_int)1, (oi_int)2))
+ return 1;
+ if (test__addvoi3((oi_int)-1, (oi_int)1, (oi_int)0))
+ return 1;
+ if (test__addvoi3((oi_int)100, (oi_int)200, (oi_int)300))
+ return 1;
+ // Large values in low half (carry across 64-bit boundary)
+ if (test__addvoi3(make_oi(make_ti(0, 0), make_ti(0, 0xFFFFFFFFFFFFFFFFULL)),
+ make_oi(make_ti(0, 0), make_ti(0, 1)),
+ make_oi(make_ti(0, 0), make_ti(1, 0))))
+ return 1;
+ // Carry across 128-bit boundary (low half to high half)
+ if (test__addvoi3(make_oi(make_ti(0, 0), make_ti(-1, -1)),
+ make_oi(make_ti(0, 0), make_ti(0, 1)),
+ make_oi(make_ti(0, 1), make_ti(0, 0))))
+ return 1;
+ // Negative + negative
+ if (test__addvoi3((oi_int)-100, (oi_int)-200, (oi_int)-300))
+ return 1;
+ // Large positive values
+ if (test__addvoi3(make_oi(make_ti(0, 1), make_ti(0, 0)),
+ make_oi(make_ti(0, 2), make_ti(0, 0)),
+ make_oi(make_ti(0, 3), make_ti(0, 0))))
+ return 1;
+ // Identity: x + 0
+ {
+ oi_int big = make_oi(make_ti(0x1234, 0x5678), make_ti(0x9ABC, 0xDEF0));
+ if (test__addvoi3(big, (oi_int)0, big))
+ return 1;
+ }
+ // Additive inverse
+ if (test__addvoi3(make_oi(make_ti(0, 1), make_ti(0, 0)),
+ make_oi(make_ti(-1, -1), make_ti(0, 0)), (oi_int)0))
+ return 1;
+ // Full-width big-number test (all 4 limbs populated).
+ // Expected value verified by Python arbitrary-precision arithmetic.
+ // A(signed) + B(signed) = 0xBBBBDDDE...99981111
+ if (test__addvoi3(
+ make_oi(make_ti(0xAAAABBBBCCCCDDDDLL, 0xEEEEFFFF11112222ULL),
+ make_ti(0x3333444455556666ULL, 0x7777888899990000ULL)),
+ make_oi(make_ti(0x1111222233334444LL, 0x5555666677778888ULL),
+ make_ti(0x9999AAAABBBBCCCCULL, 0xDDDDEEEEFFFF1111ULL)),
+ make_oi(make_ti(0xBBBBDDDE00002222LL, 0x444466658888AAAAULL),
+ make_ti(0xCCCCEEEF11113333ULL, 0x5555777799981111ULL))))
+ return 1;
+#else
+ printf("skipped\n");
+#endif
+ return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/ashloi3_test.c b/compiler-rt/test/builtins/Unit/ashloi3_test.c
new file mode 100644
index 0000000000000..62f13f21e1941
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/ashloi3_test.c
@@ -0,0 +1,122 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_ashloi3
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+// Returns: a << b
+
+// Precondition: 0 <= b < bits_in_oword
+
+COMPILER_RT_ABI oi_int __ashloi3(oi_int a, int b);
+
+int test__ashloi3(oi_int a, int b, oi_int expected) {
+ oi_int x = __ashloi3(a, b);
+ if (x != expected) {
+ owords xt;
+ xt.all = x;
+ owords expectedt;
+ expectedt.all = expected;
+ printf("error in __ashloi3: shift by %d\n", b);
+ printf(" got: 0x%.16llX%.16llX%.16llX%.16llX\n",
+ (unsigned long long)((tu_int)xt.s.high >> 64),
+ (unsigned long long)xt.s.high,
+ (unsigned long long)((tu_int)xt.s.low >> 64),
+ (unsigned long long)xt.s.low);
+ printf(" expected: 0x%.16llX%.16llX%.16llX%.16llX\n",
+ (unsigned long long)((tu_int)expectedt.s.high >> 64),
+ (unsigned long long)expectedt.s.high,
+ (unsigned long long)((tu_int)expectedt.s.low >> 64),
+ (unsigned long long)expectedt.s.low);
+ }
+ return x != expected;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+ // Shift by 0 (identity)
+ if (test__ashloi3(
+ make_oi(make_ti(0xFEDCBA9876543215LL, 0xFEDCBA9876543215LL),
+ make_ti(0xFEDCBA9876543215LL, 0xFEDCBA9876543215LL)),
+ 0,
+ make_oi(make_ti(0xFEDCBA9876543215LL, 0xFEDCBA9876543215LL),
+ make_ti(0xFEDCBA9876543215LL, 0xFEDCBA9876543215LL))))
+ return 1;
+ // Shift by small amounts
+ if (test__ashloi3((ou_int)1, 1, (ou_int)2))
+ return 1;
+ if (test__ashloi3((ou_int)1, 2, (ou_int)4))
+ return 1;
+ if (test__ashloi3((ou_int)1, 4, (ou_int)16))
+ return 1;
+ // Shift by 63 (within first 64-bit word)
+ if (test__ashloi3((ou_int)1, 63,
+ make_oi(make_ti(0, 0), make_ti(0, 0x8000000000000000ULL))))
+ return 1;
+ // Shift by 64 (crosses into second 64-bit word)
+ if (test__ashloi3((ou_int)1, 64, make_oi(make_ti(0, 0), make_ti(1, 0))))
+ return 1;
+ // Shift by 127 (top of low 128-bit half)
+ if (test__ashloi3((ou_int)1, 127,
+ make_oi(make_ti(0, 0), make_ti(0x8000000000000000LL, 0))))
+ return 1;
+ // Shift by 128 (crosses into high 128-bit half)
+ if (test__ashloi3((ou_int)1, 128, make_oi(make_ti(0, 1), make_ti(0, 0))))
+ return 1;
+ // Shift by 129
+ if (test__ashloi3((ou_int)1, 129, make_oi(make_ti(0, 2), make_ti(0, 0))))
+ return 1;
+ // Shift by 191
+ if (test__ashloi3((ou_int)1, 191,
+ make_oi(make_ti(0, 0x8000000000000000ULL), make_ti(0, 0))))
+ return 1;
+ // Shift by 192
+ if (test__ashloi3((ou_int)1, 192, make_oi(make_ti(1, 0), make_ti(0, 0))))
+ return 1;
+ // Shift by 255 (MSB)
+ if (test__ashloi3((ou_int)1, 255,
+ make_oi(make_ti(0x8000000000000000LL, 0), make_ti(0, 0))))
+ return 1;
+ // Multi-bit value shift by 64
+ if (test__ashloi3((ou_int)0xFFFFFFFFFFFFFFFFULL, 64,
+ make_oi(make_ti(0, 0), make_ti(0xFFFFFFFFFFFFFFFFULL, 0))))
+ return 1;
+ // Multi-bit value shift by 128
+ if (test__ashloi3((ou_int)0xFFFFFFFFFFFFFFFFULL, 128,
+ make_oi(make_ti(0, 0xFFFFFFFFFFFFFFFFULL), make_ti(0, 0))))
+ return 1;
+ // Multi-bit value shift by 192
+ if (test__ashloi3((ou_int)0xFFFFFFFFFFFFFFFFULL, 192,
+ make_oi(make_ti(0xFFFFFFFFFFFFFFFFULL, 0), make_ti(0, 0))))
+ return 1;
+ // Small shift of a multi-bit value (stays within the low 64-bit word)
+ if (test__ashloi3(make_oi(make_ti(0, 0), make_ti(0, 0xABCDLL)), 4,
+ make_oi(make_ti(0, 0), make_ti(0, 0xABCD0LL))))
+ return 1;
+ // Shift that spans both halves
+ if (test__ashloi3(make_oi(make_ti(0, 0), make_ti(0x8000000000000000LL,
+ 0x0000000000000001LL)),
+ 1,
+ make_oi(make_ti(0, 1), make_ti(0, 0x0000000000000002LL))))
+ return 1;
+ // Full-width big-number test (all 4 limbs populated, shift crosses 64-bit boundary).
+ // Expected value verified by Python arbitrary-precision arithmetic.
+ if (test__ashloi3(
+ make_oi(make_ti(0xAAAABBBBCCCCDDDDLL, 0xEEEEFFFF11112222ULL),
+ make_ti(0x3333444455556666ULL, 0x7777888899990000ULL)),
+ 73,
+ make_oi(make_ti(0xDDFFFE2222444466LL, 0x668888AAAACCCCEEULL),
+ make_ti(0xEF11113332000000ULL, 0x0000000000000000ULL))))
+ return 1;
+#else
+ printf("skipped\n");
+#endif
+ return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/ashroi3_test.c b/compiler-rt/test/builtins/Unit/ashroi3_test.c
new file mode 100644
index 0000000000000..a48d3c160edfd
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/ashroi3_test.c
@@ -0,0 +1,86 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_ashroi3
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI oi_int __ashroi3(oi_int a, int b);
+
+int test__ashroi3(oi_int a, int b, oi_int expected) {
+ oi_int x = __ashroi3(a, b);
+ if (x != expected) {
+ printf("error in __ashroi3: shift by %d\n", b);
+ return 1;
+ }
+ return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+ // Shift by 0
+ if (test__ashroi3((oi_int)1, 0, (oi_int)1))
+ return 1;
+ // Shift positive by small amounts
+ if (test__ashroi3((oi_int)2, 1, (oi_int)1))
+ return 1;
+ if (test__ashroi3((oi_int)4, 2, (oi_int)1))
+ return 1;
+ // Shift negative by 1 (sign extension)
+ if (test__ashroi3((oi_int)-2, 1, (oi_int)-1))
+ return 1;
+ // Shift -1 by any amount stays -1 (sign extension)
+ if (test__ashroi3((oi_int)-1, 1, (oi_int)-1))
+ return 1;
+ if (test__ashroi3((oi_int)-1, 64, (oi_int)-1))
+ return 1;
+ if (test__ashroi3((oi_int)-1, 128, (oi_int)-1))
+ return 1;
+ if (test__ashroi3((oi_int)-1, 255, (oi_int)-1))
+ return 1;
+ // Shift by 64 (within low half)
+ if (test__ashroi3(make_oi(make_ti(0, 0), make_ti(0xABCD000000000000LL, 0)),
+ 64,
+ make_oi(make_ti(0, 0), make_ti(0, 0xABCD000000000000ULL))))
+ return 1;
+ // Shift by 128 (crosses half boundary, positive)
+ if (test__ashroi3(make_oi(make_ti(0, 1), make_ti(0, 0)), 128, (oi_int)1))
+ return 1;
+ // Shift by 128 (negative, sign extends)
+ if (test__ashroi3(make_oi(make_ti(0x8000000000000000LL, 0), make_ti(0, 0)),
+ 128,
+ make_oi(make_ti(-1, -1), make_ti(0x8000000000000000LL, 0))))
+ return 1;
+ // Shift by 192
+ if (test__ashroi3(make_oi(make_ti(0x0000ABCD00000000LL, 0), make_ti(0, 0)),
+ 192, (oi_int)0x0000ABCD00000000LL))
+ return 1;
+ // Shift MSB-only by 255
+ if (test__ashroi3(make_oi(make_ti(0x8000000000000000LL, 0), make_ti(0, 0)),
+ 255, (oi_int)-1))
+ return 1;
+ // Shift MAX positive by 255
+ if (test__ashroi3(make_oi(make_ti(0x7FFFFFFFFFFFFFFFLL, -1), make_ti(-1, -1)),
+ 255, (oi_int)0))
+ return 1;
+ // Full-width big-number test (negative value, shift crosses 64-bit boundary).
+ // A is negative in signed interpretation; arithmetic shift sign-extends.
+ // Expected value verified by Python arbitrary-precision arithmetic.
+ if (test__ashroi3(
+ make_oi(make_ti(0xAAAABBBBCCCCDDDDLL, 0xEEEEFFFF11112222ULL),
+ make_ti(0x3333444455556666ULL, 0x7777888899990000ULL)),
+ 73,
+ make_oi(make_ti(0xFFFFFFFFFFFFFFFFLL, 0xFFD5555DDDE6666EULL),
+ make_ti(0xEEF7777FFF888891ULL, 0x111999A2222AAAB3ULL))))
+ return 1;
+#else
+ printf("skipped\n");
+#endif
+ return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/clzoi2_test.c b/compiler-rt/test/builtins/Unit/clzoi2_test.c
new file mode 100644
index 0000000000000..9b58e848b19db
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/clzoi2_test.c
@@ -0,0 +1,78 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_clzoi2
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI int __clzoi2(oi_int a);
+
+int test__clzoi2(oi_int a, int expected) {
+ int x = __clzoi2(a);
+ if (x != expected) {
+ printf("error in __clzoi2: expected %d, got %d\n", expected, x);
+ return 1;
+ }
+ return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+ // Single bit in MSB position
+ if (test__clzoi2(make_oi(make_ti(0x8000000000000000LL, 0), make_ti(0, 0)), 0))
+ return 1;
+ // Single bit in high half, lower position (bit 128)
+ if (test__clzoi2(make_oi(make_ti(0, 1), make_ti(0, 0)), 127))
+ return 1;
+ // Single bit at position 127 (MSB of low half)
+ if (test__clzoi2(make_oi(make_ti(0, 0), make_ti(0x8000000000000000LL, 0)),
+ 128))
+ return 1;
+ // 1
+ if (test__clzoi2((oi_int)1, 255))
+ return 1;
+ // All ones
+ if (test__clzoi2((oi_int)(ou_int)-1, 0))
+ return 1;
+ // Value in high word only
+ if (test__clzoi2(make_oi(make_ti(0, 0xFFLL), make_ti(0, 0)), 120))
+ return 1;
+ // Bit at position 64 (second 64-bit word)
+ if (test__clzoi2(make_oi(make_ti(0, 0), make_ti(1, 0)), 191))
+ return 1;
+ // Bit at position 192
+ if (test__clzoi2(make_oi(make_ti(1, 0), make_ti(0, 0)), 63))
+ return 1;
+ // 0xFF in low word only
+ if (test__clzoi2((oi_int)0xFF, 248))
+ return 1;
+ // Single bit at position 191
+ if (test__clzoi2(make_oi(make_ti(0, 0x8000000000000000ULL), make_ti(0, 0)),
+ 64))
+ return 1;
+ // Power of 2 at position 200
+ if (test__clzoi2(make_oi(make_ti(0x100LL, 0), make_ti(0, 0)), 55))
+ return 1;
+ // Full-width big-number tests.
+ // Expected values verified by Python arbitrary-precision arithmetic.
+ if (test__clzoi2(
+ make_oi(make_ti(0xAAAABBBBCCCCDDDDLL, 0xEEEEFFFF11112222ULL),
+ make_ti(0x3333444455556666ULL, 0x7777888899990000ULL)),
+ 0))
+ return 1;
+ if (test__clzoi2(
+ make_oi(make_ti(0x1111222233334444LL, 0x5555666677778888ULL),
+ make_ti(0x9999AAAABBBBCCCCULL, 0xDDDDEEEEFFFF1111ULL)),
+ 3))
+ return 1;
+#else
+ printf("skipped\n");
+#endif
+ return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/cmpoi2_test.c b/compiler-rt/test/builtins/Unit/cmpoi2_test.c
new file mode 100644
index 0000000000000..56682d84f7ba2
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/cmpoi2_test.c
@@ -0,0 +1,93 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_cmpoi2
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI si_int __cmpoi2(oi_int a, oi_int b);
+
+int test__cmpoi2(oi_int a, oi_int b, si_int expected) {
+ si_int x = __cmpoi2(a, b);
+ if (x != expected) {
+ printf("error in __cmpoi2: expected %d, got %d\n", expected, x);
+ return 1;
+ }
+ return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+ // Equal
+ if (test__cmpoi2((oi_int)0, (oi_int)0, 1))
+ return 1;
+ if (test__cmpoi2((oi_int)1, (oi_int)1, 1))
+ return 1;
+ if (test__cmpoi2((oi_int)-1, (oi_int)-1, 1))
+ return 1;
+ // Less than
+ if (test__cmpoi2((oi_int)0, (oi_int)1, 0))
+ return 1;
+ if (test__cmpoi2((oi_int)-1, (oi_int)0, 0))
+ return 1;
+ // Greater than
+ if (test__cmpoi2((oi_int)1, (oi_int)0, 2))
+ return 1;
+ if (test__cmpoi2((oi_int)0, (oi_int)-1, 2))
+ return 1;
+ // Large values: high half > low half
+ if (test__cmpoi2(make_oi(make_ti(0, 1), make_ti(0, 0)),
+ make_oi(make_ti(0, 0), make_ti(-1, -1)), 2))
+ return 1;
+ // Large equal values
+ {
+ oi_int big = make_oi(make_ti(0x1234, 0x5678), make_ti(0x9ABC, 0xDEF0));
+ if (test__cmpoi2(big, big, 1))
+ return 1;
+ }
+ // MAX > 0
+ if (test__cmpoi2(make_oi(make_ti(0x7FFFFFFFFFFFFFFFLL, -1), make_ti(-1, -1)),
+ (oi_int)0, 2))
+ return 1;
+ // MIN < 0
+ if (test__cmpoi2(make_oi(make_ti(0x8000000000000000LL, 0), make_ti(0, 0)),
+ (oi_int)0, 0))
+ return 1;
+ // MIN < MAX
+ if (test__cmpoi2(make_oi(make_ti(0x8000000000000000LL, 0), make_ti(0, 0)),
+ make_oi(make_ti(0x7FFFFFFFFFFFFFFFLL, -1), make_ti(-1, -1)),
+ 0))
+ return 1;
+ // Differ only in low half
+ if (test__cmpoi2(make_oi(make_ti(0, 1), make_ti(0, 1)),
+ make_oi(make_ti(0, 1), make_ti(0, 2)), 0))
+ return 1;
+ if (test__cmpoi2(make_oi(make_ti(0, 1), make_ti(0, 2)),
+ make_oi(make_ti(0, 1), make_ti(0, 1)), 2))
+ return 1;
+ // Negative values: -1 > -2
+ if (test__cmpoi2((oi_int)-1, (oi_int)-2, 2))
+ return 1;
+ if (test__cmpoi2((oi_int)-2, (oi_int)-1, 0))
+ return 1;
+ // Full-width big-number test (all 4 limbs populated).
+ // A is negative signed, B is positive signed, so A < B.
+ // Expected value verified by Python arbitrary-precision arithmetic.
+ if (test__cmpoi2(
+ make_oi(make_ti(0xAAAABBBBCCCCDDDDLL, 0xEEEEFFFF11112222ULL),
+ make_ti(0x3333444455556666ULL, 0x7777888899990000ULL)),
+ make_oi(make_ti(0x1111222233334444LL, 0x5555666677778888ULL),
+ make_ti(0x9999AAAABBBBCCCCULL, 0xDDDDEEEEFFFF1111ULL)),
+ 0))
+ return 1;
+#else
+ printf("skipped\n");
+#endif
+ return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/ctzoi2_test.c b/compiler-rt/test/builtins/Unit/ctzoi2_test.c
new file mode 100644
index 0000000000000..4a891e8b9320b
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/ctzoi2_test.c
@@ -0,0 +1,83 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_ctzoi2
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI int __ctzoi2(oi_int a);
+
+int test__ctzoi2(oi_int a, int expected) {
+ int x = __ctzoi2(a);
+ if (x != expected) {
+ printf("error in __ctzoi2: expected %d, got %d\n", expected, x);
+ return 1;
+ }
+ return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+ // 1
+ if (test__ctzoi2((oi_int)1, 0))
+ return 1;
+ // 2
+ if (test__ctzoi2((oi_int)2, 1))
+ return 1;
+ // Bit at position 63
+ if (test__ctzoi2(make_oi(make_ti(0, 0), make_ti(0, 0x8000000000000000ULL)),
+ 63))
+ return 1;
+ // Bit at position 64
+ if (test__ctzoi2(make_oi(make_ti(0, 0), make_ti(1, 0)), 64))
+ return 1;
+ // Bit at position 127
+ if (test__ctzoi2(make_oi(make_ti(0, 0), make_ti(0x8000000000000000LL, 0)),
+ 127))
+ return 1;
+ // Bit at position 128
+ if (test__ctzoi2(make_oi(make_ti(0, 1), make_ti(0, 0)), 128))
+ return 1;
+ // Bit at position 191
+ if (test__ctzoi2(make_oi(make_ti(0, 0x8000000000000000ULL), make_ti(0, 0)),
+ 191))
+ return 1;
+ // Bit at position 192
+ if (test__ctzoi2(make_oi(make_ti(1, 0), make_ti(0, 0)), 192))
+ return 1;
+ // Bit at position 255 (MSB)
+ if (test__ctzoi2(make_oi(make_ti(0x8000000000000000LL, 0), make_ti(0, 0)),
+ 255))
+ return 1;
+ // All ones
+ if (test__ctzoi2((oi_int)(ou_int)-1, 0))
+ return 1;
+ // Multiple bits, lowest is position 8
+ if (test__ctzoi2((oi_int)0xFF00, 8))
+ return 1;
+ // Bits in both halves, lowest in low half
+ if (test__ctzoi2(make_oi(make_ti(0, 1), make_ti(0, 0x100)), 8))
+ return 1;
+ // Full-width big-number tests.
+ // Expected values verified by Python arbitrary-precision arithmetic.
+ if (test__ctzoi2(
+ make_oi(make_ti(0xAAAABBBBCCCCDDDDLL, 0xEEEEFFFF11112222ULL),
+ make_ti(0x3333444455556666ULL, 0x7777888899990000ULL)),
+ 16))
+ return 1;
+ if (test__ctzoi2(
+ make_oi(make_ti(0x1111222233334444LL, 0x5555666677778888ULL),
+ make_ti(0x9999AAAABBBBCCCCULL, 0xDDDDEEEEFFFF1111ULL)),
+ 0))
+ return 1;
+#else
+ printf("skipped\n");
+#endif
+ return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/divmodoi4_test.c b/compiler-rt/test/builtins/Unit/divmodoi4_test.c
new file mode 100644
index 0000000000000..c9526a33eee30
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/divmodoi4_test.c
@@ -0,0 +1,97 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_divmodoi4
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI oi_int __divmodoi4(oi_int a, oi_int b, oi_int *rem);
+
+int test__divmodoi4(oi_int a, oi_int b, oi_int expected_q, oi_int expected_r) {
+ oi_int r;
+ oi_int q = __divmodoi4(a, b, &r);
+ if (q != expected_q || r != expected_r) {
+ printf("error in __divmodoi4\n");
+ return 1;
+ }
+ return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+ if (test__divmodoi4((oi_int)0, (oi_int)1, (oi_int)0, (oi_int)0))
+ return 1;
+ if (test__divmodoi4((oi_int)10, (oi_int)3, (oi_int)3, (oi_int)1))
+ return 1;
+ if (test__divmodoi4((oi_int)-10, (oi_int)3, (oi_int)-3, (oi_int)-1))
+ return 1;
+ if (test__divmodoi4((oi_int)10, (oi_int)-3, (oi_int)-3, (oi_int)1))
+ return 1;
+ if (test__divmodoi4((oi_int)-10, (oi_int)-3, (oi_int)3, (oi_int)-1))
+ return 1;
+ if (test__divmodoi4((oi_int)100, (oi_int)7, (oi_int)14, (oi_int)2))
+ return 1;
+ // Exact division
+ if (test__divmodoi4((oi_int)42, (oi_int)42, (oi_int)1, (oi_int)0))
+ return 1;
+ // Dividend smaller than divisor
+ if (test__divmodoi4((oi_int)3, (oi_int)10, (oi_int)0, (oi_int)3))
+ return 1;
+ // (1 << 128) / 2
+ if (test__divmodoi4(make_oi(make_ti(0, 1), make_ti(0, 0)), (oi_int)2,
+ make_oi(make_ti(0, 0), make_ti(0x8000000000000000LL, 0)),
+ (oi_int)0))
+ return 1;
+ // (1 << 128) / 3 with remainder
+ if (test__divmodoi4(make_oi(make_ti(0, 1), make_ti(0, 0)), (oi_int)3,
+ make_oi(make_ti(0, 0), make_ti(0x5555555555555555LL,
+ 0x5555555555555555ULL)),
+ (oi_int)1))
+ return 1;
+ // Negative large / positive small
+ if (test__divmodoi4(
+ make_oi(make_ti(-1, -1), make_ti(0, 0)), (oi_int)2,
+ make_oi(make_ti(-1, -1), make_ti(0x8000000000000000LL, 0)),
+ (oi_int)0))
+ return 1;
+ // Positive large / negative small
+ if (test__divmodoi4(
+ make_oi(make_ti(0, 1), make_ti(0, 0)), (oi_int)-2,
+ make_oi(make_ti(-1, -1), make_ti(0x8000000000000000LL, 0)),
+ (oi_int)0))
+ return 1;
+ // Large / large (same value)
+ {
+ oi_int big = make_oi(make_ti(0, 0x100), make_ti(0, 0));
+ if (test__divmodoi4(big, big, (oi_int)1, (oi_int)0))
+ return 1;
+ }
+ // Cross-half boundary value
+ if (test__divmodoi4(make_oi(make_ti(0, 1), make_ti(0, 5)), (oi_int)4,
+ make_oi(make_ti(0, 0), make_ti(0x4000000000000000LL, 1)),
+ (oi_int)1))
+ return 1;
+ // Full-width big-number test (all 4 limbs populated).
+ // A(signed) divmod B(signed): q = -4, r verified by Python: q*b + r == a.
+ // Expected values verified by Python arbitrary-precision arithmetic.
+ if (test__divmodoi4(
+ make_oi(make_ti(0xAAAABBBBCCCCDDDDLL, 0xEEEEFFFF11112222ULL),
+ make_ti(0x3333444455556666ULL, 0x7777888899990000ULL)),
+ make_oi(make_ti(0x1111222233334444LL, 0x5555666677778888ULL),
+ make_ti(0x9999AAAABBBBCCCCULL, 0xDDDDEEEEFFFF1111ULL)),
+ make_oi(make_ti(0xFFFFFFFFFFFFFFFFLL, 0xFFFFFFFFFFFFFFFFULL),
+ make_ti(0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFCULL)),
+ make_oi(make_ti(0xEEEF44449999EEEFLL, 0x44449998EEEF4444ULL),
+ make_ti(0x9999EEEF44449999ULL, 0xEEEF444499954444ULL))))
+ return 1;
+#else
+ printf("skipped\n");
+#endif
+ return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/divoi3_test.c b/compiler-rt/test/builtins/Unit/divoi3_test.c
new file mode 100644
index 0000000000000..00b8b65496eb2
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/divoi3_test.c
@@ -0,0 +1,97 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_divoi3
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI oi_int __divoi3(oi_int a, oi_int b);
+
+int test__divoi3(oi_int a, oi_int b, oi_int expected) {
+ oi_int x = __divoi3(a, b);
+ if (x != expected) {
+ printf("error in __divoi3\n");
+ return 1;
+ }
+ return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+ if (test__divoi3((oi_int)0, (oi_int)1, (oi_int)0))
+ return 1;
+ if (test__divoi3((oi_int)10, (oi_int)3, (oi_int)3))
+ return 1;
+ if (test__divoi3((oi_int)-10, (oi_int)3, (oi_int)-3))
+ return 1;
+ if (test__divoi3((oi_int)10, (oi_int)-3, (oi_int)-3))
+ return 1;
+ if (test__divoi3((oi_int)-10, (oi_int)-3, (oi_int)3))
+ return 1;
+ if (test__divoi3((oi_int)1, (oi_int)1, (oi_int)1))
+ return 1;
+ if (test__divoi3((oi_int)100, (oi_int)10, (oi_int)10))
+ return 1;
+ // Large dividend in high half / small divisor
+ // (1 << 128) / 2 = (1 << 127)
+ if (test__divoi3(make_oi(make_ti(0, 1), make_ti(0, 0)), (oi_int)2,
+ make_oi(make_ti(0, 0), make_ti(0x8000000000000000LL, 0))))
+ return 1;
+ // (1 << 128) / 3
+ if (test__divoi3(make_oi(make_ti(0, 1), make_ti(0, 0)), (oi_int)3,
+ make_oi(make_ti(0, 0), make_ti(0x5555555555555555LL,
+ 0x5555555555555555ULL))))
+ return 1;
+ // Negative large dividend
+ // -(1 << 128) / 2 = -(1 << 127)
+ if (test__divoi3(make_oi(make_ti(-1, -1), make_ti(0, 0)), (oi_int)2,
+ make_oi(make_ti(-1, -1), make_ti(0x8000000000000000LL, 0))))
+ return 1;
+ // Large / large (same value)
+ {
+ oi_int big = make_oi(make_ti(0, 0x100), make_ti(0, 0));
+ if (test__divoi3(big, big, (oi_int)1))
+ return 1;
+ }
+ // Large / large (double)
+ {
+ oi_int big = make_oi(make_ti(0, 0x100), make_ti(0, 0));
+ oi_int dbl = make_oi(make_ti(0, 0x200), make_ti(0, 0));
+ if (test__divoi3(dbl, big, (oi_int)2))
+ return 1;
+ }
+ // Dividend smaller than divisor
+ if (test__divoi3((oi_int)3, (oi_int)10, (oi_int)0))
+ return 1;
+ // Large negative / large negative
+ {
+ oi_int neg = make_oi(make_ti(-1, -2), make_ti(0, 0));
+ if (test__divoi3(neg, neg, (oi_int)1))
+ return 1;
+ }
+ // Cross-half boundary: value spans both halves
+ if (test__divoi3(make_oi(make_ti(0, 1), make_ti(0, 4)), (oi_int)4,
+ make_oi(make_ti(0, 0), make_ti(0x4000000000000000LL, 1))))
+ return 1;
+ // Full-width big-number test (all 4 limbs populated).
+ // A(signed) / B(signed) = -4 (truncation toward zero).
+ // Expected value verified by Python arbitrary-precision arithmetic.
+ if (test__divoi3(
+ make_oi(make_ti(0xAAAABBBBCCCCDDDDLL, 0xEEEEFFFF11112222ULL),
+ make_ti(0x3333444455556666ULL, 0x7777888899990000ULL)),
+ make_oi(make_ti(0x1111222233334444LL, 0x5555666677778888ULL),
+ make_ti(0x9999AAAABBBBCCCCULL, 0xDDDDEEEEFFFF1111ULL)),
+ make_oi(make_ti(0xFFFFFFFFFFFFFFFFLL, 0xFFFFFFFFFFFFFFFFULL),
+ make_ti(0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFCULL))))
+ return 1;
+#else
+ printf("skipped\n");
+#endif
+ return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/ffsoi2_test.c b/compiler-rt/test/builtins/Unit/ffsoi2_test.c
new file mode 100644
index 0000000000000..30bbfdd3c489e
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/ffsoi2_test.c
@@ -0,0 +1,86 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_ffsoi2
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI int __ffsoi2(oi_int a);
+
+int test__ffsoi2(oi_int a, int expected) {
+ int x = __ffsoi2(a);
+ if (x != expected) {
+ printf("error in __ffsoi2: expected %d, got %d\n", expected, x);
+ return 1;
+ }
+ return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+ // Zero
+ if (test__ffsoi2((oi_int)0, 0))
+ return 1;
+ // 1 (bit 0 set)
+ if (test__ffsoi2((oi_int)1, 1))
+ return 1;
+ // 2 (bit 1 set)
+ if (test__ffsoi2((oi_int)2, 2))
+ return 1;
+ // Bit 63 set
+ if (test__ffsoi2(make_oi(make_ti(0, 0), make_ti(0, 0x8000000000000000ULL)),
+ 64))
+ return 1;
+ // Bit 64 set
+ if (test__ffsoi2(make_oi(make_ti(0, 0), make_ti(1, 0)), 65))
+ return 1;
+ // Bit 127 set
+ if (test__ffsoi2(make_oi(make_ti(0, 0), make_ti(0x8000000000000000LL, 0)),
+ 128))
+ return 1;
+ // Bit 128 set
+ if (test__ffsoi2(make_oi(make_ti(0, 1), make_ti(0, 0)), 129))
+ return 1;
+ // Bit 191 set
+ if (test__ffsoi2(make_oi(make_ti(0, 0x8000000000000000ULL), make_ti(0, 0)),
+ 192))
+ return 1;
+ // Bit 192 set
+ if (test__ffsoi2(make_oi(make_ti(1, 0), make_ti(0, 0)), 193))
+ return 1;
+ // Bit 255 set (MSB)
+ if (test__ffsoi2(make_oi(make_ti(0x8000000000000000LL, 0), make_ti(0, 0)),
+ 256))
+ return 1;
+ // All ones
+ if (test__ffsoi2((oi_int)(ou_int)-1, 1))
+ return 1;
+ // Multiple bits, lowest is bit 8
+ if (test__ffsoi2((oi_int)0xFF00, 9))
+ return 1;
+ // Bits in both halves, lowest in low half
+ if (test__ffsoi2(make_oi(make_ti(0, 1), make_ti(0, 0x100)), 9))
+ return 1;
+ // Full-width big-number tests.
+ // Expected values verified by Python arbitrary-precision arithmetic.
+ if (test__ffsoi2(
+ make_oi(make_ti(0xAAAABBBBCCCCDDDDLL, 0xEEEEFFFF11112222ULL),
+ make_ti(0x3333444455556666ULL, 0x7777888899990000ULL)),
+ 17))
+ return 1;
+ if (test__ffsoi2(
+ make_oi(make_ti(0x1111222233334444LL, 0x5555666677778888ULL),
+ make_ti(0x9999AAAABBBBCCCCULL, 0xDDDDEEEEFFFF1111ULL)),
+ 1))
+ return 1;
+#else
+ printf("skipped\n");
+#endif
+ return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/fixdfoi_test.c b/compiler-rt/test/builtins/Unit/fixdfoi_test.c
new file mode 100644
index 0000000000000..29d57ee18a690
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/fixdfoi_test.c
@@ -0,0 +1,93 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_fixdfoi
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI oi_int __fixdfoi(double a);
+
+int test__fixdfoi(double a, oi_int expected) {
+ oi_int x = __fixdfoi(a);
+ if (x != expected) {
+ printf("error in __fixdfoi(%f)\n", a);
+ return 1;
+ }
+ return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+ if (test__fixdfoi(0.0, (oi_int)0))
+ return 1;
+ if (test__fixdfoi(1.0, (oi_int)1))
+ return 1;
+ if (test__fixdfoi(-1.0, (oi_int)-1))
+ return 1;
+ if (test__fixdfoi(42.0, (oi_int)42))
+ return 1;
+ if (test__fixdfoi(-42.0, (oi_int)-42))
+ return 1;
+ if (test__fixdfoi(1e18, (oi_int)1000000000000000000LL))
+ return 1;
+ if (test__fixdfoi(0.5, (oi_int)0))
+ return 1;
+ if (test__fixdfoi(-0.5, (oi_int)0))
+ return 1;
+ if (test__fixdfoi(1.5, (oi_int)1))
+ return 1;
+ if (test__fixdfoi(-1.5, (oi_int)-1))
+ return 1;
+ if (test__fixdfoi(100.0, (oi_int)100))
+ return 1;
+ if (test__fixdfoi(-100.0, (oi_int)-100))
+ return 1;
+ // Rounding toward zero
+ if (test__fixdfoi(0.99, (oi_int)0))
+ return 1;
+ if (test__fixdfoi(1.99, (oi_int)1))
+ return 1;
+ if (test__fixdfoi(-0.99, (oi_int)0))
+ return 1;
+ if (test__fixdfoi(-1.99, (oi_int)-1))
+ return 1;
+ if (test__fixdfoi(2.01, (oi_int)2))
+ return 1;
+ // Double mantissa boundary: 52 bits (53 with implicit 1)
+ // 0x1.FFFFFFFFFFFFFp+62 = max double < 2^63
+ if (test__fixdfoi(0x1.FFFFFFFFFFFFFp+62, (oi_int)0x7FFFFFFFFFFFFC00LL))
+ return 1;
+ if (test__fixdfoi(-0x1.FFFFFFFFFFFFFp+62, -(oi_int)0x7FFFFFFFFFFFFC00LL))
+ return 1;
+ // Exact powers of 2 in the 64+ bit range
+ if (test__fixdfoi(0x1.0p+64, (oi_int)1 << 64))
+ return 1;
+ if (test__fixdfoi(0x1.0p+127, (oi_int)1 << 127))
+ return 1;
+ if (test__fixdfoi(0x1.0p+200, (oi_int)1 << 200))
+ return 1;
+ // Negative large
+ if (test__fixdfoi(-0x1.0p+127, -((oi_int)1 << 127)))
+ return 1;
+ // Values at the double mantissa limit (52-bit precision):
+ // 0x1.FFFFFFFFFFFFFp+126 -- max double in ~127-bit range
+ if (test__fixdfoi(0x1.FFFFFFFFFFFFFp+126,
+ make_oi(make_ti(0, 0), make_ti(0x7FFFFFFFFFFFFC00LL, 0))))
+ return 1;
+ if (test__fixdfoi(-0x1.FFFFFFFFFFFFFp+126,
+ make_oi(make_ti(-1, -1), make_ti(0x8000000000000400LL, 0))))
+ return 1;
+ // Specific hex value (from 128-bit reference test)
+ if (test__fixdfoi(0x1.1A3CFE870496Ep+57, (oi_int)0x023479FD0E092DC0LL))
+ return 1;
+#else
+ printf("skipped\n");
+#endif
+ return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/fixsfoi_test.c b/compiler-rt/test/builtins/Unit/fixsfoi_test.c
new file mode 100644
index 0000000000000..860fa926b373d
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/fixsfoi_test.c
@@ -0,0 +1,98 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_fixsfoi
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI oi_int __fixsfoi(float a);
+
+int test__fixsfoi(float a, oi_int expected) {
+ oi_int x = __fixsfoi(a);
+ if (x != expected) {
+ printf("error in __fixsfoi(%f)\n", a);
+ return 1;
+ }
+ return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+ if (test__fixsfoi(0.0f, (oi_int)0))
+ return 1;
+ if (test__fixsfoi(1.0f, (oi_int)1))
+ return 1;
+ if (test__fixsfoi(-1.0f, (oi_int)-1))
+ return 1;
+ if (test__fixsfoi(42.0f, (oi_int)42))
+ return 1;
+ if (test__fixsfoi(-42.0f, (oi_int)-42))
+ return 1;
+ if (test__fixsfoi(0.5f, (oi_int)0))
+ return 1;
+ if (test__fixsfoi(1.5f, (oi_int)1))
+ return 1;
+ if (test__fixsfoi(-0.5f, (oi_int)0))
+ return 1;
+ if (test__fixsfoi(-1.5f, (oi_int)-1))
+ return 1;
+ if (test__fixsfoi(100.0f, (oi_int)100))
+ return 1;
+ if (test__fixsfoi(-100.0f, (oi_int)-100))
+ return 1;
+ if (test__fixsfoi(1e6f, (oi_int)1000000))
+ return 1;
+ // Rounding toward zero for fractional parts
+ if (test__fixsfoi(0.99f, (oi_int)0))
+ return 1;
+ if (test__fixsfoi(1.99f, (oi_int)1))
+ return 1;
+ if (test__fixsfoi(-0.99f, (oi_int)0))
+ return 1;
+ if (test__fixsfoi(-1.99f, (oi_int)-1))
+ return 1;
+ if (test__fixsfoi(2.0f, (oi_int)2))
+ return 1;
+ if (test__fixsfoi(2.01f, (oi_int)2))
+ return 1;
+ if (test__fixsfoi(-2.0f, (oi_int)-2))
+ return 1;
+ // Precision boundary: float has 23 mantissa bits
+ // 0x1.FFFFFEp+62 = max float < 2^63, mantissa fully used
+ if (test__fixsfoi(0x1.FFFFFEp+62F, (oi_int)0x7FFFFF8000000000LL))
+ return 1;
+ if (test__fixsfoi(-0x1.FFFFFEp+62F, -(oi_int)0x7FFFFF8000000000LL))
+ return 1;
+ // Large float that needs >64 bits to represent
+ // 0x1.0p+64 = 2^64 = 18446744073709551616
+ if (test__fixsfoi(0x1.0p+64F, (oi_int)1 << 64))
+ return 1;
+ // 0x1.0p+127 = 2^127
+ if (test__fixsfoi(0x1.0p+127F, (oi_int)1 << 127))
+ return 1;
+ // Largest finite float: 0x1.FFFFFEp+127 = (2^24 - 1) * 2^104
+ // This fits in oi_int (it's only ~128 bits).
+ if (test__fixsfoi(0x1.FFFFFEp+127F, (oi_int)0xFFFFFF << 104))
+ return 1;
+ // Negative large
+ if (test__fixsfoi(-0x1.0p+127F, -((oi_int)1 << 127)))
+ return 1;
+ // Infinity should saturate to max
+ if (test__fixsfoi(__builtin_inff(), make_oi(make_ti(0x7FFFFFFFFFFFFFFFLL, -1),
+ make_ti(-1, -1))))
+ return 1;
+ // Negative infinity should saturate to min
+ if (test__fixsfoi(-__builtin_inff(),
+ make_oi(make_ti(0x8000000000000000LL, 0), make_ti(0, 0))))
+ return 1;
+#else
+ printf("skipped\n");
+#endif
+ return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/fixtfoi_test.c b/compiler-rt/test/builtins/Unit/fixtfoi_test.c
new file mode 100644
index 0000000000000..08879a61d19b3
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/fixtfoi_test.c
@@ -0,0 +1,47 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_fixtfoi
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#if defined(CRT_HAS_256BIT) && __LDBL_MANT_DIG__ == 113
+
+COMPILER_RT_ABI oi_int __fixtfoi(long double a);
+
+int test__fixtfoi(long double a, oi_int expected) {
+ oi_int x = __fixtfoi(a);
+ if (x != expected) {
+ printf("error in __fixtfoi\n");
+ return 1;
+ }
+ return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#if defined(CRT_HAS_256BIT) && __LDBL_MANT_DIG__ == 113
+ if (test__fixtfoi(0.0L, (oi_int)0))
+ return 1;
+ if (test__fixtfoi(1.0L, (oi_int)1))
+ return 1;
+ if (test__fixtfoi(-1.0L, (oi_int)-1))
+ return 1;
+ if (test__fixtfoi(42.0L, (oi_int)42))
+ return 1;
+ if (test__fixtfoi(-42.0L, (oi_int)-42))
+ return 1;
+ if (test__fixtfoi(0.5L, (oi_int)0))
+ return 1;
+ if (test__fixtfoi(1.5L, (oi_int)1))
+ return 1;
+ if (test__fixtfoi(-0.5L, (oi_int)0))
+ return 1;
+#else
+ printf("skipped\n");
+#endif
+ return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/fixunsdfoi_test.c b/compiler-rt/test/builtins/Unit/fixunsdfoi_test.c
new file mode 100644
index 0000000000000..01a3a363fc000
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/fixunsdfoi_test.c
@@ -0,0 +1,47 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_fixunsdfoi
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI ou_int __fixunsdfoi(double a);
+
+int test__fixunsdfoi(double a, ou_int expected) {
+ ou_int x = __fixunsdfoi(a);
+ if (x != expected) {
+ printf("error in __fixunsdfoi(%f)\n", a);
+ return 1;
+ }
+ return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+ if (test__fixunsdfoi(0.0, (ou_int)0))
+ return 1;
+ if (test__fixunsdfoi(1.0, (ou_int)1))
+ return 1;
+ if (test__fixunsdfoi(42.0, (ou_int)42))
+ return 1;
+ if (test__fixunsdfoi(1e18, (ou_int)1000000000000000000ULL))
+ return 1;
+ if (test__fixunsdfoi(-1.0, (ou_int)0))
+ return 1;
+ if (test__fixunsdfoi(0.5, (ou_int)0))
+ return 1;
+ if (test__fixunsdfoi(1.5, (ou_int)1))
+ return 1;
+ if (test__fixunsdfoi(100.0, (ou_int)100))
+ return 1;
+#else
+ printf("skipped\n");
+#endif
+ return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/fixunssfoi_test.c b/compiler-rt/test/builtins/Unit/fixunssfoi_test.c
new file mode 100644
index 0000000000000..57cea91d4191c
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/fixunssfoi_test.c
@@ -0,0 +1,47 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_fixunssfoi
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI ou_int __fixunssfoi(float a);
+
+int test__fixunssfoi(float a, ou_int expected) {
+ ou_int x = __fixunssfoi(a);
+ if (x != expected) {
+ printf("error in __fixunssfoi(%f)\n", a);
+ return 1;
+ }
+ return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+ if (test__fixunssfoi(0.0f, (ou_int)0))
+ return 1;
+ if (test__fixunssfoi(1.0f, (ou_int)1))
+ return 1;
+ if (test__fixunssfoi(42.0f, (ou_int)42))
+ return 1;
+ if (test__fixunssfoi(-1.0f, (ou_int)0))
+ return 1;
+ if (test__fixunssfoi(0.5f, (ou_int)0))
+ return 1;
+ if (test__fixunssfoi(1.5f, (ou_int)1))
+ return 1;
+ if (test__fixunssfoi(100.0f, (ou_int)100))
+ return 1;
+ if (test__fixunssfoi(1e6f, (ou_int)1000000))
+ return 1;
+#else
+ printf("skipped\n");
+#endif
+ return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/fixunstfoi_test.c b/compiler-rt/test/builtins/Unit/fixunstfoi_test.c
new file mode 100644
index 0000000000000..b28859d0d4064
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/fixunstfoi_test.c
@@ -0,0 +1,43 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_fixunstfoi
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#if defined(CRT_HAS_256BIT) && __LDBL_MANT_DIG__ == 113
+
+COMPILER_RT_ABI ou_int __fixunstfoi(long double a);
+
+int test__fixunstfoi(long double a, ou_int expected) {
+ ou_int x = __fixunstfoi(a);
+ if (x != expected) {
+ printf("error in __fixunstfoi\n");
+ return 1;
+ }
+ return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#if defined(CRT_HAS_256BIT) && __LDBL_MANT_DIG__ == 113
+ if (test__fixunstfoi(0.0L, (ou_int)0))
+ return 1;
+ if (test__fixunstfoi(1.0L, (ou_int)1))
+ return 1;
+ if (test__fixunstfoi(42.0L, (ou_int)42))
+ return 1;
+ if (test__fixunstfoi(0.5L, (ou_int)0))
+ return 1;
+ if (test__fixunstfoi(1.5L, (ou_int)1))
+ return 1;
+ if (test__fixunstfoi(1000000.0L, (ou_int)1000000))
+ return 1;
+#else
+ printf("skipped\n");
+#endif
+ return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/fixunsxfoi_test.c b/compiler-rt/test/builtins/Unit/fixunsxfoi_test.c
new file mode 100644
index 0000000000000..c906bca167ced
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/fixunsxfoi_test.c
@@ -0,0 +1,149 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_fixunsxfoi
+// REQUIRES: x86-target-arch
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#if defined(CRT_HAS_256BIT) && HAS_80_BIT_LONG_DOUBLE
+
+// Returns: convert a to an unsigned 256-bit integer, rounding toward zero.
+// Negative values all become zero.
+
+// Assumption: long double is an intel 80 bit floating point type padded with 6
+// bytes
+// ou_int is a 256 bit integral type
+// value in long double is representable in ou_int or is negative
+// (no range checking performed)
+
+// gggg gggg gggg gggg gggg gggg gggg gggg | gggg gggg gggg gggg seee eeee eeee
+// eeee | 1mmm mmmm mmmm mmmm mmmm mmmm mmmm mmmm | mmmm mmmm mmmm mmmm mmmm
+// mmmm mmmm mmmm
+
+COMPILER_RT_ABI ou_int __fixunsxfoi(long double a);
+
+int test__fixunsxfoi(long double a, ou_int expected) {
+ ou_int x = __fixunsxfoi(a);
+ if (x != expected) {
+ printf("error in __fixunsxfoi(%LA)\n", a);
+ return 1;
+ }
+ return 0;
+}
+
+char assumption_1[sizeof(ou_int) == 2 * sizeof(tu_int)] = {0};
+char assumption_2[sizeof(long double) * CHAR_BIT == 128] = {0};
+
+#endif
+
+int main() {
+#if defined(CRT_HAS_256BIT) && HAS_80_BIT_LONG_DOUBLE
+ if (test__fixunsxfoi(0.0, 0))
+ return 1;
+
+ if (test__fixunsxfoi(0.5, 0))
+ return 1;
+ if (test__fixunsxfoi(0.99, 0))
+ return 1;
+ if (test__fixunsxfoi(1.0, 1))
+ return 1;
+ if (test__fixunsxfoi(1.5, 1))
+ return 1;
+ if (test__fixunsxfoi(1.99, 1))
+ return 1;
+ if (test__fixunsxfoi(2.0, 2))
+ return 1;
+ if (test__fixunsxfoi(2.01, 2))
+ return 1;
+ if (test__fixunsxfoi(-0.5, 0))
+ return 1;
+ if (test__fixunsxfoi(-0.99, 0))
+ return 1;
+ if (test__fixunsxfoi(-1.0, 0))
+ return 1;
+ if (test__fixunsxfoi(-1.5, 0))
+ return 1;
+ if (test__fixunsxfoi(-1.99, 0))
+ return 1;
+ if (test__fixunsxfoi(-2.0, 0))
+ return 1;
+ if (test__fixunsxfoi(-2.01, 0))
+ return 1;
+
+ // Float precision boundary tests
+ if (test__fixunsxfoi(0x1.FFFFFEp+62, 0x7FFFFF8000000000LL))
+ return 1;
+ if (test__fixunsxfoi(0x1.FFFFFCp+62, 0x7FFFFF0000000000LL))
+ return 1;
+
+ if (test__fixunsxfoi(-0x1.FFFFFEp+62, 0))
+ return 1;
+ if (test__fixunsxfoi(-0x1.FFFFFCp+62, 0))
+ return 1;
+
+ // Double precision boundary tests
+ if (test__fixunsxfoi(0x1.FFFFFFFFFFFFFp+62, 0x7FFFFFFFFFFFFC00LL))
+ return 1;
+ if (test__fixunsxfoi(0x1.FFFFFFFFFFFFEp+62, 0x7FFFFFFFFFFFF800LL))
+ return 1;
+
+ if (test__fixunsxfoi(-0x1.FFFFFFFFFFFFFp+62, 0))
+ return 1;
+ if (test__fixunsxfoi(-0x1.FFFFFFFFFFFFEp+62, 0))
+ return 1;
+
+ // Long double (80-bit) full precision tests near 64-bit boundary
+ if (test__fixunsxfoi(0x1.FFFFFFFFFFFFFFFEp+63L, 0xFFFFFFFFFFFFFFFFLL))
+ return 1;
+ if (test__fixunsxfoi(0x1.0000000000000002p+63L, 0x8000000000000001LL))
+ return 1;
+ if (test__fixunsxfoi(0x1.0000000000000000p+63L, 0x8000000000000000LL))
+ return 1;
+ if (test__fixunsxfoi(0x1.FFFFFFFFFFFFFFFCp+62L, 0x7FFFFFFFFFFFFFFFLL))
+ return 1;
+ if (test__fixunsxfoi(0x1.FFFFFFFFFFFFFFF8p+62L, 0x7FFFFFFFFFFFFFFELL))
+ return 1;
+
+ if (test__fixunsxfoi(-0x1.0000000000000000p+63L, 0))
+ return 1;
+ if (test__fixunsxfoi(-0x1.FFFFFFFFFFFFFFFCp+62L, 0))
+ return 1;
+ if (test__fixunsxfoi(-0x1.FFFFFFFFFFFFFFF8p+62L, 0))
+ return 1;
+
+ // Tests at 128-bit boundary
+ if (test__fixunsxfoi(
+ 0x1.FFFFFFFFFFFFFFFEp+127L,
+ make_oi(make_ti(0, 0), make_ti(0xFFFFFFFFFFFFFFFFLL, 0))))
+ return 1;
+ if (test__fixunsxfoi(
+ 0x1.0000000000000002p+127L,
+ make_oi(make_ti(0, 0), make_ti(0x8000000000000001LL, 0))))
+ return 1;
+ if (test__fixunsxfoi(
+ 0x1.0000000000000000p+127L,
+ make_oi(make_ti(0, 0), make_ti(0x8000000000000000LL, 0))))
+ return 1;
+ if (test__fixunsxfoi(
+ 0x1.FFFFFFFFFFFFFFFCp+126L,
+ make_oi(make_ti(0, 0), make_ti(0x7FFFFFFFFFFFFFFFLL, 0))))
+ return 1;
+
+ // Tests beyond 128-bit boundary
+ // 2^200
+ if (test__fixunsxfoi(0x1.0p+200L, (ou_int)1 << 200))
+ return 1;
+
+ // Near 256-bit boundary
+ if (test__fixunsxfoi(
+ 0x1.FFFFFFFFFFFFFFFEp+255L,
+ make_oi(make_ti(0xFFFFFFFFFFFFFFFFLL, 0x0000000000000000LL),
+ make_ti(0, 0))))
+ return 1;
+
+#else
+ printf("skipped\n");
+#endif
+ return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/fixxfoi_test.c b/compiler-rt/test/builtins/Unit/fixxfoi_test.c
new file mode 100644
index 0000000000000..78c59ffe3243b
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/fixxfoi_test.c
@@ -0,0 +1,144 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_fixxfoi
+// REQUIRES: x86-target-arch
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#if defined(CRT_HAS_256BIT) && HAS_80_BIT_LONG_DOUBLE
+
+// Returns: convert a to a signed 256-bit integer, rounding toward zero.
+
+// Assumption: long double is an intel 80 bit floating point type padded with 6
+// bytes
+// oi_int is a 256 bit integral type
+// value in long double is representable in oi_int (no range
+// checking performed)
+
+// gggg gggg gggg gggg gggg gggg gggg gggg | gggg gggg gggg gggg seee eeee eeee
+// eeee | 1mmm mmmm mmmm mmmm mmmm mmmm mmmm mmmm | mmmm mmmm mmmm mmmm mmmm
+// mmmm mmmm mmmm
+
+COMPILER_RT_ABI oi_int __fixxfoi(long double a);
+
+int test__fixxfoi(long double a, oi_int expected) {
+ oi_int x = __fixxfoi(a);
+ if (x != expected) {
+ printf("error in __fixxfoi(%LA)\n", a);
+ return 1;
+ }
+ return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+char assumption_2[sizeof(long double) * CHAR_BIT == 128] = {0};
+
+#endif
+
+int main() {
+#if defined(CRT_HAS_256BIT) && HAS_80_BIT_LONG_DOUBLE
+ if (test__fixxfoi(0.0, 0))
+ return 1;
+
+ if (test__fixxfoi(0.5, 0))
+ return 1;
+ if (test__fixxfoi(0.99, 0))
+ return 1;
+ if (test__fixxfoi(1.0, 1))
+ return 1;
+ if (test__fixxfoi(1.5, 1))
+ return 1;
+ if (test__fixxfoi(1.99, 1))
+ return 1;
+ if (test__fixxfoi(2.0, 2))
+ return 1;
+ if (test__fixxfoi(2.01, 2))
+ return 1;
+ if (test__fixxfoi(-0.5, 0))
+ return 1;
+ if (test__fixxfoi(-0.99, 0))
+ return 1;
+ if (test__fixxfoi(-1.0, -1))
+ return 1;
+ if (test__fixxfoi(-1.5, -1))
+ return 1;
+ if (test__fixxfoi(-1.99, -1))
+ return 1;
+ if (test__fixxfoi(-2.0, -2))
+ return 1;
+ if (test__fixxfoi(-2.01, -2))
+ return 1;
+
+ // Float precision boundary tests (from 128-bit reference)
+ if (test__fixxfoi(0x1.FFFFFEp+62, 0x7FFFFF8000000000LL))
+ return 1;
+ if (test__fixxfoi(0x1.FFFFFCp+62, 0x7FFFFF0000000000LL))
+ return 1;
+
+ if (test__fixxfoi(-0x1.FFFFFEp+62, -(oi_int)0x7FFFFF8000000000LL))
+ return 1;
+ if (test__fixxfoi(-0x1.FFFFFCp+62, -(oi_int)0x7FFFFF0000000000LL))
+ return 1;
+
+ // Double precision boundary tests
+ if (test__fixxfoi(0x1.FFFFFFFFFFFFFp+62, 0x7FFFFFFFFFFFFC00LL))
+ return 1;
+ if (test__fixxfoi(0x1.FFFFFFFFFFFFEp+62, 0x7FFFFFFFFFFFF800LL))
+ return 1;
+
+ if (test__fixxfoi(-0x1.FFFFFFFFFFFFFp+62, -(oi_int)0x7FFFFFFFFFFFFC00LL))
+ return 1;
+ if (test__fixxfoi(-0x1.FFFFFFFFFFFFEp+62, -(oi_int)0x7FFFFFFFFFFFF800LL))
+ return 1;
+
+ // Long double (80-bit) full precision tests
+ if (test__fixxfoi(0x1.FFFFFFFFFFFFFFFCp+62L, 0x7FFFFFFFFFFFFFFFLL))
+ return 1;
+ if (test__fixxfoi(0x1.FFFFFFFFFFFFFFF8p+62L, 0x7FFFFFFFFFFFFFFELL))
+ return 1;
+
+ if (test__fixxfoi(-0x1.0000000000000000p+63L, -(oi_int)0x8000000000000000LL))
+ return 1;
+ if (test__fixxfoi(-0x1.FFFFFFFFFFFFFFFCp+62L, -(oi_int)0x7FFFFFFFFFFFFFFFLL))
+ return 1;
+ if (test__fixxfoi(-0x1.FFFFFFFFFFFFFFF8p+62L, -(oi_int)0x7FFFFFFFFFFFFFFELL))
+ return 1;
+
+ // Tests at 128-bit boundary (same as ti tests, but still fits in oi)
+ if (test__fixxfoi(0x1.FFFFFFFFFFFFFFFEp+126L,
+ make_oi(make_ti(0, 0), make_ti(0x7FFFFFFFFFFFFFFFLL,
+ 0x8000000000000000LL))))
+ return 1;
+ if (test__fixxfoi(0x1.FFFFFFFFFFFFFFFCp+126L,
+ make_oi(make_ti(0, 0), make_ti(0x7FFFFFFFFFFFFFFFLL, 0))))
+ return 1;
+
+ if (test__fixxfoi(-0x1.0000000000000000p+127L,
+ -make_oi(make_ti(0, 0), make_ti(0x8000000000000000LL, 0))))
+ return 1;
+
+ // Tests beyond 128-bit boundary: values needing >128 bits
+ // 2^200
+ if (test__fixxfoi(0x1.0p+200L, (oi_int)1 << 200))
+ return 1;
+ if (test__fixxfoi(-0x1.0p+200L, -((oi_int)1 << 200)))
+ return 1;
+
+ // Value near 256-bit boundary
+ // 0x1.FFFFFFFFFFFFFFFEp+254L is the largest xf value that fits in oi_int
+ if (test__fixxfoi(0x1.FFFFFFFFFFFFFFFEp+254L,
+ make_oi(make_ti(0x7FFFFFFFFFFFFFFFLL, 0x8000000000000000LL),
+ make_ti(0, 0))))
+ return 1;
+ if (test__fixxfoi(
+ -0x1.FFFFFFFFFFFFFFFEp+254L,
+ -make_oi(make_ti(0x7FFFFFFFFFFFFFFFLL, 0x8000000000000000LL),
+ make_ti(0, 0))))
+ return 1;
+
+#else
+ printf("skipped\n");
+#endif
+ return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/floatoidf_test.c b/compiler-rt/test/builtins/Unit/floatoidf_test.c
new file mode 100644
index 0000000000000..7b147e5ce69a3
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/floatoidf_test.c
@@ -0,0 +1,89 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_floatoidf
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI double __floatoidf(oi_int a);
+
+int test__floatoidf(oi_int a, double expected) {
+ double x = __floatoidf(a);
+ if (x != expected) {
+ printf("error in __floatoidf: got %f, expected %f\n", x, expected);
+ return 1;
+ }
+ return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+ if (test__floatoidf((oi_int)0, 0.0))
+ return 1;
+ if (test__floatoidf((oi_int)1, 1.0))
+ return 1;
+ if (test__floatoidf((oi_int)-1, -1.0))
+ return 1;
+ if (test__floatoidf((oi_int)42, 42.0))
+ return 1;
+ if (test__floatoidf((oi_int)-42, -42.0))
+ return 1;
+ if (test__floatoidf((oi_int)1000000, 1e6))
+ return 1;
+ if (test__floatoidf((oi_int)-1000000, -1e6))
+ return 1;
+ if (test__floatoidf((oi_int)100, 100.0))
+ return 1;
+ if (test__floatoidf((oi_int)20, 20.0))
+ return 1;
+ if (test__floatoidf((oi_int)-20, -20.0))
+ return 1;
+ // Double mantissa boundary: 52 bits (53 with implicit 1)
+ // 2^53 = 9007199254740992, exactly representable
+ if (test__floatoidf((oi_int)9007199254740992LL, 9007199254740992.0))
+ return 1;
+ // 2^53 + 1: NOT exactly representable, rounds to 2^53
+ if (test__floatoidf((oi_int)9007199254740993LL, 9007199254740992.0))
+ return 1;
+ // 2^53 + 2: exactly representable
+ if (test__floatoidf((oi_int)9007199254740994LL, 9007199254740994.0))
+ return 1;
+ // Specific values from 128-bit reference tests
+ if (test__floatoidf((oi_int)0x7FFFFF8000000000LL, 0x1.FFFFFEp+62))
+ return 1;
+ if (test__floatoidf((oi_int)0x7FFFFFFFFFFFF800LL, 0x1.FFFFFFFFFFFFEp+62))
+ return 1;
+ // Large values spanning >64 bits
+ if (test__floatoidf((oi_int)1 << 64, 0x1.0p+64))
+ return 1;
+ if (test__floatoidf((oi_int)1 << 127, 0x1.0p+127))
+ return 1;
+ if (test__floatoidf(-((oi_int)1 << 127), -0x1.0p+127))
+ return 1;
+ // Very large value: 2^200
+ if (test__floatoidf((oi_int)1 << 200, 0x1.0p+200))
+ return 1;
+ // Values with high-half mantissa bits:
+ // make_oi(make_ti(0x7FFFFF8000000000, 0), make_ti(0, 0))
+ // = 0x7FFFFF8000000000 << 128, leading 1 at bit 254
+ if (test__floatoidf(make_oi(make_ti(0x7FFFFF8000000000LL, 0), make_ti(0, 0)),
+ 0x1.FFFFFEp+254))
+ return 1;
+ // Negative large
+ if (test__floatoidf(make_oi(make_ti(0x8000008000000000LL, 0), make_ti(0, 0)),
+ -0x1.FFFFFEp+254))
+ return 1;
+ // Specific hex value (adapted from 128-bit reference)
+ if (test__floatoidf((oi_int)0x023479FD0E092DC0LL, 0x1.1A3CFE870496Ep+57))
+ return 1;
+#else
+ printf("skipped\n");
+#endif
+ return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/floatoisf_test.c b/compiler-rt/test/builtins/Unit/floatoisf_test.c
new file mode 100644
index 0000000000000..ea33058344892
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/floatoisf_test.c
@@ -0,0 +1,77 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_floatoisf
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI float __floatoisf(oi_int a);
+
+int test__floatoisf(oi_int a, float expected) {
+ float x = __floatoisf(a);
+ if (x != expected) {
+ printf("error in __floatoisf: got %f, expected %f\n", x, expected);
+ return 1;
+ }
+ return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+ if (test__floatoisf((oi_int)0, 0.0f))
+ return 1;
+ if (test__floatoisf((oi_int)1, 1.0f))
+ return 1;
+ if (test__floatoisf((oi_int)-1, -1.0f))
+ return 1;
+ if (test__floatoisf((oi_int)42, 42.0f))
+ return 1;
+ if (test__floatoisf((oi_int)-42, -42.0f))
+ return 1;
+ if (test__floatoisf((oi_int)100, 100.0f))
+ return 1;
+ if (test__floatoisf((oi_int)-100, -100.0f))
+ return 1;
+ if (test__floatoisf((oi_int)1000000, 1e6f))
+ return 1;
+ if (test__floatoisf((oi_int)-1000000, -1e6f))
+ return 1;
+ if (test__floatoisf((oi_int)20, 20.0f))
+ return 1;
+ if (test__floatoisf((oi_int)-20, -20.0f))
+ return 1;
+ // Precision boundary: float has 23 mantissa bits (24 with implicit 1)
+ // 2^24 = 16777216, exactly representable
+ if (test__floatoisf((oi_int)16777216, 16777216.0f))
+ return 1;
+ // 2^24 + 1 = 16777217: NOT exactly representable in float,
+ // rounds to 16777216.0f
+ if (test__floatoisf((oi_int)16777217, 16777216.0f))
+ return 1;
+ // 2^24 + 2 = 16777218: exactly representable (even, rounds-to-even)
+ if (test__floatoisf((oi_int)16777218, 16777218.0f))
+ return 1;
+ // Values at the mantissa boundary:
+ // 0x7FFFFF8000000000 = mantissa all-ones shifted to bit 62
+ if (test__floatoisf((oi_int)0x7FFFFF8000000000LL, 0x1.FFFFFEp+62F))
+ return 1;
+ // Large 256-bit value: 2^127
+ if (test__floatoisf((oi_int)1 << 127, 0x1.0p+127F))
+ return 1;
+ // Large negative
+ if (test__floatoisf(-((oi_int)1 << 127), -0x1.0p+127F))
+ return 1;
+ // Value > 128 bits: 2^200 exceeds float range, should return +inf
+ if (test__floatoisf((oi_int)1 << 200, __builtin_inff()))
+ return 1;
+#else
+ printf("skipped\n");
+#endif
+ return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/floatoitf_test.c b/compiler-rt/test/builtins/Unit/floatoitf_test.c
new file mode 100644
index 0000000000000..db95716158f02
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/floatoitf_test.c
@@ -0,0 +1,45 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_floatoitf
+// REQUIRES: int256
+
+#define QUAD_PRECISION
+#include "fp_lib.h"
+#include "int_lib.h"
+#include <stdio.h>
+
+#if defined(CRT_HAS_TF_MODE) && defined(CRT_HAS_256BIT)
+
+COMPILER_RT_ABI fp_t __floatoitf(oi_int a);
+
+int test__floatoitf(oi_int a, fp_t expected) {
+ fp_t x = __floatoitf(a);
+ if (x != expected) {
+ printf("error in __floatoitf\n");
+ return 1;
+ }
+ return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#if defined(CRT_HAS_TF_MODE) && defined(CRT_HAS_256BIT)
+ if (test__floatoitf((oi_int)0, TF_C(0.0)))
+ return 1;
+ if (test__floatoitf((oi_int)1, TF_C(1.0)))
+ return 1;
+ if (test__floatoitf((oi_int)-1, TF_C(-1.0)))
+ return 1;
+ if (test__floatoitf((oi_int)42, TF_C(42.0)))
+ return 1;
+ if (test__floatoitf((oi_int)-42, TF_C(-42.0)))
+ return 1;
+ if (test__floatoitf((oi_int)1000000, TF_C(1e6)))
+ return 1;
+#else
+ printf("skipped\n");
+#endif
+ return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/floatoixf_test.c b/compiler-rt/test/builtins/Unit/floatoixf_test.c
new file mode 100644
index 0000000000000..f6dde67a9f9fe
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/floatoixf_test.c
@@ -0,0 +1,114 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_floatoixf
+// REQUIRES: x86-target-arch
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <float.h>
+#include <stdio.h>
+
+#if defined(CRT_HAS_256BIT) && HAS_80_BIT_LONG_DOUBLE
+
+// Returns: convert a to a long double, rounding toward even.
+
+// Assumption: long double is an IEEE 80 bit floating point type padded to 128
+// bits
+// oi_int is a 256 bit integral type
+
+// gggg gggg gggg gggg gggg gggg gggg gggg | gggg gggg gggg gggg seee eeee eeee
+// eeee | 1mmm mmmm mmmm mmmm mmmm mmmm mmmm mmmm | mmmm mmmm mmmm mmmm mmmm
+// mmmm mmmm mmmm
+
+COMPILER_RT_ABI long double __floatoixf(oi_int a);
+
+int test__floatoixf(oi_int a, long double expected) {
+ long double x = __floatoixf(a);
+ if (x != expected) {
+ printf("error in __floatoixf = %LA, expected %LA\n", x, expected);
+ return 1;
+ }
+ return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+char assumption_2[sizeof(long double) * CHAR_BIT == 128] = {0};
+
+#endif
+
+int main() {
+#if defined(CRT_HAS_256BIT) && HAS_80_BIT_LONG_DOUBLE
+ if (test__floatoixf(0, 0.0))
+ return 1;
+
+ if (test__floatoixf(1, 1.0))
+ return 1;
+ if (test__floatoixf(2, 2.0))
+ return 1;
+ if (test__floatoixf(20, 20.0))
+ return 1;
+ if (test__floatoixf(-1, -1.0))
+ return 1;
+ if (test__floatoixf(-2, -2.0))
+ return 1;
+ if (test__floatoixf(-20, -20.0))
+ return 1;
+
+ // Precision boundary tests (from 128-bit reference)
+ if (test__floatoixf(0x7FFFFF8000000000LL, 0x1.FFFFFEp+62))
+ return 1;
+ if (test__floatoixf(0x7FFFFFFFFFFFF800LL, 0x1.FFFFFFFFFFFFEp+62))
+ return 1;
+ if (test__floatoixf(0x7FFFFF0000000000LL, 0x1.FFFFFCp+62))
+ return 1;
+ if (test__floatoixf(0x7FFFFFFFFFFFF000LL, 0x1.FFFFFFFFFFFFCp+62))
+ return 1;
+
+ // Full long double precision (64-bit mantissa)
+ if (test__floatoixf(0x7FFFFFFFFFFFFFFFLL, 0xF.FFFFFFFFFFFFFFEp+59L))
+ return 1;
+ if (test__floatoixf(0x023479FD0E092DC0LL, 0x1.1A3CFE870496Ep+57))
+ return 1;
+ if (test__floatoixf(0x023479FD0E092DA1LL, 0x1.1A3CFE870496D08p+57L))
+ return 1;
+
+ // Values spanning >64 bits (128-bit range, in oi_int)
+ if (test__floatoixf(make_oi(make_ti(0, 0), make_ti(0x023479FD0E092DC0LL, 0)),
+ 0x1.1A3CFE870496Ep+121L))
+ return 1;
+
+ // Negative values
+ if (test__floatoixf(make_oi(make_ti(0xFFFFFFFFFFFFFFFFLL, -1),
+ make_ti(0x8000008000000000LL, 0)),
+ -0x1.FFFFFEp+126))
+ return 1;
+ if (test__floatoixf(make_oi(make_ti(0xFFFFFFFFFFFFFFFFLL, -1),
+ make_ti(0x8000000000000000LL, 0)),
+ -0x1.000000p+127))
+ return 1;
+ if (test__floatoixf(make_oi(make_ti(0xFFFFFFFFFFFFFFFFLL, -1),
+ make_ti(0x8000000000000001LL, 0)),
+ -0x1.FFFFFFFFFFFFFFFCp+126L))
+ return 1;
+
+ // Values beyond 128-bit range: high half set
+ if (test__floatoixf(make_oi(make_ti(0, 1), make_ti(0, 0)), 0x1.0p+128L))
+ return 1;
+ // 2^200
+ if (test__floatoixf((oi_int)1 << 200, 0x1.0p+200L))
+ return 1;
+
+ // Large 256-bit value near max
+ if (test__floatoixf(make_oi(make_ti(0x023479FD0E092DC0LL, 0), make_ti(0, 0)),
+ 0x1.1A3CFE870496Ep+249L))
+ return 1;
+
+ // Max unsigned 64-bit in lower half
+ if (test__floatoixf(make_oi(make_ti(0, 0), make_ti(0, 0xFFFFFFFFFFFFFFFFLL)),
+ 0x1.FFFFFFFFFFFFFFFEp+63L))
+ return 1;
+
+#else
+ printf("skipped\n");
+#endif
+ return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/floatunoidf_test.c b/compiler-rt/test/builtins/Unit/floatunoidf_test.c
new file mode 100644
index 0000000000000..b822b611bcccc
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/floatunoidf_test.c
@@ -0,0 +1,43 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_floatunoidf
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI double __floatunoidf(ou_int a);
+
+int test__floatunoidf(ou_int a, double expected) {
+ double x = __floatunoidf(a);
+ if (x != expected) {
+ printf("error in __floatunoidf: got %f, expected %f\n", x, expected);
+ return 1;
+ }
+ return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+ if (test__floatunoidf((ou_int)0, 0.0))
+ return 1;
+ if (test__floatunoidf((ou_int)1, 1.0))
+ return 1;
+ if (test__floatunoidf((ou_int)42, 42.0))
+ return 1;
+ if (test__floatunoidf((ou_int)1000000, 1e6))
+ return 1;
+ if (test__floatunoidf((ou_int)1000000000000000000ULL, 1e18))
+ return 1;
+ if (test__floatunoidf((ou_int)100, 100.0))
+ return 1;
+#else
+ printf("skipped\n");
+#endif
+ return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/floatunoisf_test.c b/compiler-rt/test/builtins/Unit/floatunoisf_test.c
new file mode 100644
index 0000000000000..6be53202d0e26
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/floatunoisf_test.c
@@ -0,0 +1,41 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_floatunoisf
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI float __floatunoisf(ou_int a);
+
+int test__floatunoisf(ou_int a, float expected) {
+ float x = __floatunoisf(a);
+ if (x != expected) {
+ printf("error in __floatunoisf: got %f, expected %f\n", x, expected);
+ return 1;
+ }
+ return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+ if (test__floatunoisf((ou_int)0, 0.0f))
+ return 1;
+ if (test__floatunoisf((ou_int)1, 1.0f))
+ return 1;
+ if (test__floatunoisf((ou_int)42, 42.0f))
+ return 1;
+ if (test__floatunoisf((ou_int)100, 100.0f))
+ return 1;
+ if (test__floatunoisf((ou_int)1000000, 1e6f))
+ return 1;
+#else
+ printf("skipped\n");
+#endif
+ return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/floatunoitf_test.c b/compiler-rt/test/builtins/Unit/floatunoitf_test.c
new file mode 100644
index 0000000000000..0407235b2edcb
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/floatunoitf_test.c
@@ -0,0 +1,43 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_floatunoitf
+// REQUIRES: int256
+
+#define QUAD_PRECISION
+#include "fp_lib.h"
+#include "int_lib.h"
+#include <stdio.h>
+
+#if defined(CRT_HAS_TF_MODE) && defined(CRT_HAS_256BIT)
+
+COMPILER_RT_ABI fp_t __floatunoitf(ou_int a);
+
+int test__floatunoitf(ou_int a, fp_t expected) {
+ fp_t x = __floatunoitf(a);
+ if (x != expected) {
+ printf("error in __floatunoitf\n");
+ return 1;
+ }
+ return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#if defined(CRT_HAS_TF_MODE) && defined(CRT_HAS_256BIT)
+ if (test__floatunoitf((ou_int)0, TF_C(0.0)))
+ return 1;
+ if (test__floatunoitf((ou_int)1, TF_C(1.0)))
+ return 1;
+ if (test__floatunoitf((ou_int)42, TF_C(42.0)))
+ return 1;
+ if (test__floatunoitf((ou_int)1000000, TF_C(1e6)))
+ return 1;
+ if (test__floatunoitf((ou_int)1000000000000000000ULL, TF_C(1e18)))
+ return 1;
+#else
+ printf("skipped\n");
+#endif
+ return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/floatunoixf_test.c b/compiler-rt/test/builtins/Unit/floatunoixf_test.c
new file mode 100644
index 0000000000000..402330b3525bf
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/floatunoixf_test.c
@@ -0,0 +1,123 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_floatunoixf
+// REQUIRES: x86-target-arch
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <float.h>
+#include <stdio.h>
+
+#if defined(CRT_HAS_256BIT) && HAS_80_BIT_LONG_DOUBLE
+
+// Returns: convert a to a long double, rounding toward even.
+
+// Assumption: long double is an IEEE 80-bit floating point type padded to 128
+// bits
+// ou_int is a 256-bit integral type
+
+// gggg gggg gggg gggg gggg gggg gggg gggg | gggg gggg gggg gggg seee eeee eeee
+// eeee | 1mmm mmmm mmmm mmmm mmmm mmmm mmmm mmmm | mmmm mmmm mmmm mmmm mmmm
+// mmmm mmmm mmmm
+
+COMPILER_RT_ABI long double __floatunoixf(ou_int a);
+
+int test__floatunoixf(ou_int a, long double expected) {
+ long double x = __floatunoixf(a);
+ if (x != expected) {
+ printf("error in __floatunoixf = %LA, expected %LA\n", x, expected);
+ return 1;
+ }
+ return 0;
+}
+
+char assumption_1[sizeof(ou_int) == 2 * sizeof(tu_int)] = {0};
+char assumption_2[sizeof(long double) * CHAR_BIT == 128] = {0};
+
+#endif
+
+int main() {
+#if defined(CRT_HAS_256BIT) && HAS_80_BIT_LONG_DOUBLE
+ if (test__floatunoixf(0, 0.0))
+ return 1;
+
+ if (test__floatunoixf(1, 1.0))
+ return 1;
+ if (test__floatunoixf(2, 2.0))
+ return 1;
+ if (test__floatunoixf(20, 20.0))
+ return 1;
+
+ // Precision boundary tests
+ if (test__floatunoixf(0x7FFFFF8000000000ULL, 0x1.FFFFFEp+62))
+ return 1;
+ if (test__floatunoixf(0x7FFFFFFFFFFFF800ULL, 0x1.FFFFFFFFFFFFEp+62))
+ return 1;
+ if (test__floatunoixf(0x7FFFFF0000000000ULL, 0x1.FFFFFCp+62))
+ return 1;
+ if (test__floatunoixf(0x7FFFFFFFFFFFF000ULL, 0x1.FFFFFFFFFFFFCp+62))
+ return 1;
+ if (test__floatunoixf(0x7FFFFFFFFFFFFFFFULL, 0xF.FFFFFFFFFFFFFFEp+59L))
+ return 1;
+ if (test__floatunoixf(0xFFFFFFFFFFFFFFFEULL, 0xF.FFFFFFFFFFFFFFEp+60L))
+ return 1;
+ if (test__floatunoixf(0xFFFFFFFFFFFFFFFFULL, 0xF.FFFFFFFFFFFFFFFp+60L))
+ return 1;
+
+ // Specific hex value tests
+ if (test__floatunoixf(0x8000008000000000ULL, 0x8.000008p+60))
+ return 1;
+ if (test__floatunoixf(0x8000000000000800ULL, 0x8.0000000000008p+60))
+ return 1;
+ if (test__floatunoixf(0x8000000000000000ULL, 0x8p+60))
+ return 1;
+ if (test__floatunoixf(0x8000000000000001ULL, 0x8.000000000000001p+60L))
+ return 1;
+
+ if (test__floatunoixf(0x0007FB72E8000000LL, 0x1.FEDCBAp+50))
+ return 1;
+ if (test__floatunoixf(0x023479FD0E092DC0LL, 0x1.1A3CFE870496Ep+57))
+ return 1;
+ if (test__floatunoixf(0x023479FD0E092DA1LL, 0x1.1A3CFE870496D08p+57L))
+ return 1;
+
+ // Values spanning >64 bits (128-bit range, in ou_int)
+ if (test__floatunoixf(
+ make_oi(make_ti(0, 0), make_ti(0x023479FD0E092DC0LL, 0)),
+ 0x1.1A3CFE870496Ep+121L))
+ return 1;
+
+ // Max unsigned 128-bit value in lower half
+ if (test__floatunoixf(make_oi(make_ti(0, 0), make_ti(0xFFFFFFFFFFFFFFFFLL,
+ 0xFFFFFFFFFFFFFFFFLL)),
+ 0x1.0000000000000000p+128L))
+ return 1;
+ if (test__floatunoixf(
+ make_oi(make_ti(0, 0), make_ti(0xFFFFFFFFFFFFFFFFLL, 0)),
+ 0x1.FFFFFFFFFFFFFFFEp+127L))
+ return 1;
+
+ // Values beyond 128-bit range: high half set
+ if (test__floatunoixf(make_oi(make_ti(0, 1), make_ti(0, 0)), 0x1.0p+128L))
+ return 1;
+ // 2^200
+ if (test__floatunoixf((ou_int)1 << 200, 0x1.0p+200L))
+ return 1;
+
+ // Large 256-bit value near max
+ if (test__floatunoixf(
+ make_oi(make_ti(0x023479FD0E092DC0LL, 0), make_ti(0, 0)),
+ 0x1.1A3CFE870496Ep+249L))
+ return 1;
+
+ // Max 256-bit unsigned value
+ if (test__floatunoixf(
+ make_oi(make_ti(0xFFFFFFFFFFFFFFFFLL, 0xFFFFFFFFFFFFFFFFLL),
+ make_ti(0xFFFFFFFFFFFFFFFFLL, 0xFFFFFFFFFFFFFFFFLL)),
+ 0x1.0000000000000000p+256L))
+ return 1;
+
+#else
+ printf("skipped\n");
+#endif
+ return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/lit.cfg.py b/compiler-rt/test/builtins/Unit/lit.cfg.py
index 59da054848f3c..48e07ec43e8ff 100644
--- a/compiler-rt/test/builtins/Unit/lit.cfg.py
+++ b/compiler-rt/test/builtins/Unit/lit.cfg.py
@@ -165,6 +165,17 @@ def get_libgcc_file_name():
if not builtins_is_msvc:
config.available_features.add("int128")
+# Check whether the target compiler supports __int256. NOTE(review): the probe below writes to "/dev/null", which is not Windows-portable -- consider os.devnull.
+import subprocess
+
+int256_check = subprocess.run(
+ [config.clang.strip(), "-x", "c", "-c", "-o", "/dev/null", "-"],
+ input=b"__int256_t x;",
+ capture_output=True,
+)
+if int256_check.returncode == 0:
+ config.available_features.add("int256")
+
clang_wrapper = ""
diff --git a/compiler-rt/test/builtins/Unit/lshroi3_test.c b/compiler-rt/test/builtins/Unit/lshroi3_test.c
new file mode 100644
index 0000000000000..fde52dda538ad
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/lshroi3_test.c
@@ -0,0 +1,101 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_lshroi3
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI oi_int __lshroi3(oi_int a, int b);
+
+int test__lshroi3(oi_int a, int b, oi_int expected) {
+ oi_int x = __lshroi3(a, b);
+ if (x != expected) {
+ printf("error in __lshroi3: shift by %d\n", b);
+ return 1;
+ }
+ return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+ // Shift by 0
+ if (test__lshroi3((oi_int)1, 0, (oi_int)1))
+ return 1;
+ // Shift by 1
+ if (test__lshroi3((oi_int)2, 1, (oi_int)1))
+ return 1;
+ // Logical shift negative by 1 (no sign extension)
+ if (test__lshroi3(
+ (oi_int)-1, 1,
+ make_oi(make_ti(0x7FFFFFFFFFFFFFFFLL, -1), make_ti(-1, -1))))
+ return 1;
+ // Shift by 63
+ if (test__lshroi3(make_oi(make_ti(0, 0), make_ti(0, 0x8000000000000000ULL)),
+ 63, (oi_int)1))
+ return 1;
+ // Shift by 64
+ if (test__lshroi3(make_oi(make_ti(0, 0), make_ti(1, 0)), 64, (oi_int)1))
+ return 1;
+ // Shift by 127
+ if (test__lshroi3(make_oi(make_ti(0, 0), make_ti(0x8000000000000000LL, 0)),
+ 127, (oi_int)1))
+ return 1;
+ // Shift by 128
+ if (test__lshroi3(make_oi(make_ti(0, 1), make_ti(0, 0)), 128, (oi_int)1))
+ return 1;
+ // Shift by 129
+ if (test__lshroi3(make_oi(make_ti(0, 2), make_ti(0, 0)), 129, (oi_int)1))
+ return 1;
+ // Shift by 191
+ if (test__lshroi3(make_oi(make_ti(0, 0x8000000000000000ULL), make_ti(0, 0)),
+ 191, (oi_int)1))
+ return 1;
+ // Shift by 192
+ if (test__lshroi3(make_oi(make_ti(0xABCDLL, 0), make_ti(0, 0)), 192,
+ (oi_int)0xABCDLL))
+ return 1;
+ // Shift all-ones by 255
+ if (test__lshroi3((oi_int)(ou_int)-1, 255, (oi_int)1))
+ return 1;
+ // Multi-bit value shift by 64
+ if (test__lshroi3(make_oi(make_ti(0, 0), make_ti(0xFFFFFFFFFFFFFFFFULL, 0)),
+ 64,
+ make_oi(make_ti(0, 0), make_ti(0, 0xFFFFFFFFFFFFFFFFULL))))
+ return 1;
+ // Multi-bit value shift by 128
+ if (test__lshroi3(make_oi(make_ti(0, 0xFFFFFFFFFFFFFFFFULL), make_ti(0, 0)),
+ 128,
+ make_oi(make_ti(0, 0), make_ti(0, 0xFFFFFFFFFFFFFFFFULL))))
+ return 1;
+ // Shift that spans both halves
+ if (test__lshroi3(make_oi(make_ti(0, 1), make_ti(0, 0)), 1,
+ make_oi(make_ti(0, 0), make_ti(0x8000000000000000LL, 0))))
+ return 1;
+ // Full value shift by 0 (identity)
+ if (test__lshroi3(
+ make_oi(make_ti(0xFEDCBA9876543215LL, 0xFEDCBA9876543215LL),
+ make_ti(0xFEDCBA9876543215LL, 0xFEDCBA9876543215LL)),
+ 0,
+ make_oi(make_ti(0xFEDCBA9876543215LL, 0xFEDCBA9876543215LL),
+ make_ti(0xFEDCBA9876543215LL, 0xFEDCBA9876543215LL))))
+ return 1;
+ // Full-width big-number test (all 4 limbs populated, shift crosses 64-bit boundary).
+ // Expected value verified by Python arbitrary-precision arithmetic.
+ if (test__lshroi3(
+ make_oi(make_ti(0xAAAABBBBCCCCDDDDLL, 0xEEEEFFFF11112222ULL),
+ make_ti(0x3333444455556666ULL, 0x7777888899990000ULL)),
+ 73,
+ make_oi(make_ti(0x0000000000000000LL, 0x0055555DDDE6666EULL),
+ make_ti(0xEEF7777FFF888891ULL, 0x111999A2222AAAB3ULL))))
+ return 1;
+#else
+ printf("skipped\n");
+#endif
+ return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/modoi3_test.c b/compiler-rt/test/builtins/Unit/modoi3_test.c
new file mode 100644
index 0000000000000..f7969cbafa407
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/modoi3_test.c
@@ -0,0 +1,82 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_modoi3
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI oi_int __modoi3(oi_int a, oi_int b);
+
+int test__modoi3(oi_int a, oi_int b, oi_int expected) {
+ oi_int x = __modoi3(a, b);
+ if (x != expected) {
+ printf("error in __modoi3\n");
+ return 1;
+ }
+ return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+ if (test__modoi3((oi_int)0, (oi_int)1, (oi_int)0))
+ return 1;
+ if (test__modoi3((oi_int)10, (oi_int)3, (oi_int)1))
+ return 1;
+ if (test__modoi3((oi_int)-10, (oi_int)3, (oi_int)-1))
+ return 1;
+ if (test__modoi3((oi_int)10, (oi_int)-3, (oi_int)1))
+ return 1;
+ if (test__modoi3((oi_int)-10, (oi_int)-3, (oi_int)-1))
+ return 1;
+ if (test__modoi3((oi_int)100, (oi_int)7, (oi_int)2))
+ return 1;
+ // Exact division has zero remainder
+ if (test__modoi3((oi_int)42, (oi_int)42, (oi_int)0))
+ return 1;
+ // Dividend smaller than divisor
+ if (test__modoi3((oi_int)3, (oi_int)10, (oi_int)3))
+ return 1;
+ if (test__modoi3((oi_int)-3, (oi_int)10, (oi_int)-3))
+ return 1;
+ // Large value in high half mod small
+ // (1 << 128) % 3 = 1 (since 2^128 mod 3 = 1)
+ if (test__modoi3(make_oi(make_ti(0, 1), make_ti(0, 0)), (oi_int)3, (oi_int)1))
+ return 1;
+ // (1 << 128) % 2 = 0
+ if (test__modoi3(make_oi(make_ti(0, 1), make_ti(0, 0)), (oi_int)2, (oi_int)0))
+ return 1;
+ // Negative large value mod
+ if (test__modoi3(make_oi(make_ti(-1, -1), make_ti(0, 0)), (oi_int)3,
+ (oi_int)-1))
+ return 1;
+ // Cross-half boundary value mod small
+ if (test__modoi3(make_oi(make_ti(0, 1), make_ti(0, 5)), (oi_int)4, (oi_int)1))
+ return 1;
+ // Large mod large (same value)
+ {
+ oi_int big = make_oi(make_ti(0, 0x100), make_ti(0, 0));
+ if (test__modoi3(big, big, (oi_int)0))
+ return 1;
+ }
+ // Full-width big-number test (all 4 limbs populated).
+ // A(signed) % B(signed), verified by Python: q*b + r == a.
+ // Expected value verified by Python arbitrary-precision arithmetic.
+ if (test__modoi3(
+ make_oi(make_ti(0xAAAABBBBCCCCDDDDLL, 0xEEEEFFFF11112222ULL),
+ make_ti(0x3333444455556666ULL, 0x7777888899990000ULL)),
+ make_oi(make_ti(0x1111222233334444LL, 0x5555666677778888ULL),
+ make_ti(0x9999AAAABBBBCCCCULL, 0xDDDDEEEEFFFF1111ULL)),
+ make_oi(make_ti(0xEEEF44449999EEEFLL, 0x44449998EEEF4444ULL),
+ make_ti(0x9999EEEF44449999ULL, 0xEEEF444499954444ULL))))
+ return 1;
+#else
+ printf("skipped\n");
+#endif
+ return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/muloi5_test.c b/compiler-rt/test/builtins/Unit/muloi5_test.c
new file mode 100644
index 0000000000000..9f42881b43d28
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/muloi5_test.c
@@ -0,0 +1,164 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_muloi5
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI oi_int __muloi5(oi_int a, oi_int b, int *overflow);
+
+int test__muloi5(oi_int a, oi_int b, oi_int expected, int expected_overflow) {
+ int overflow;
+ oi_int x = __muloi5(a, b, &overflow);
+ if (overflow != expected_overflow || (!expected_overflow && x != expected)) {
+ printf("error in __muloi5: overflow=%d (expected %d)\n", overflow,
+ expected_overflow);
+ return 1;
+ }
+ return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+ if (test__muloi5((oi_int)0, (oi_int)0, (oi_int)0, 0))
+ return 1;
+ if (test__muloi5((oi_int)1, (oi_int)1, (oi_int)1, 0))
+ return 1;
+ if (test__muloi5((oi_int)2, (oi_int)3, (oi_int)6, 0))
+ return 1;
+ if (test__muloi5((oi_int)-1, (oi_int)1, (oi_int)-1, 0))
+ return 1;
+ if (test__muloi5((oi_int)-1, (oi_int)-1, (oi_int)1, 0))
+ return 1;
+ // Large * 0 = 0, no overflow
+ if (test__muloi5(make_oi(make_ti(0x7FFFFFFFFFFFFFFFLL, -1), make_ti(-1, -1)),
+ (oi_int)0, (oi_int)0, 0))
+ return 1;
+ // 0 * large = 0, no overflow
+ if (test__muloi5((oi_int)0,
+ make_oi(make_ti(0x7FFFFFFFFFFFFFFFLL, -1), make_ti(-1, -1)),
+ (oi_int)0, 0))
+ return 1;
+ // Cross-half multiplication without overflow
+ // (1 << 64) * (1 << 64) = (1 << 128)
+ if (test__muloi5(make_oi(make_ti(0, 0), make_ti(1, 0)),
+ make_oi(make_ti(0, 0), make_ti(1, 0)),
+ make_oi(make_ti(0, 1), make_ti(0, 0)), 0))
+ return 1;
+ // (1 << 127) * 2 = (1 << 128), no overflow
+ if (test__muloi5(make_oi(make_ti(0, 0), make_ti(0x8000000000000000LL, 0)),
+ (oi_int)2, make_oi(make_ti(0, 1), make_ti(0, 0)), 0))
+ return 1;
+ // MAX * 1 = MAX, no overflow
+ {
+ oi_int MAX = make_oi(make_ti(0x7FFFFFFFFFFFFFFFLL, -1), make_ti(-1, -1));
+ if (test__muloi5(MAX, (oi_int)1, MAX, 0))
+ return 1;
+ }
+ // MAX * 2 overflows
+ {
+ oi_int MAX = make_oi(make_ti(0x7FFFFFFFFFFFFFFFLL, -1), make_ti(-1, -1));
+ if (test__muloi5(MAX, (oi_int)2, (oi_int)0, 1))
+ return 1;
+ }
+ // MIN * -1 overflows
+ {
+ oi_int MIN = make_oi(make_ti(0x8000000000000000LL, 0), make_ti(0, 0));
+ if (test__muloi5(MIN, (oi_int)-1, (oi_int)0, 1))
+ return 1;
+ }
+ // MIN * 1 = MIN, no overflow
+ {
+ oi_int MIN = make_oi(make_ti(0x8000000000000000LL, 0), make_ti(0, 0));
+ if (test__muloi5(MIN, (oi_int)1, MIN, 0))
+ return 1;
+ }
+ // (1 << 128) * (1 << 128) overflows (result would be 1 << 256)
+ if (test__muloi5(make_oi(make_ti(0, 1), make_ti(0, 0)),
+ make_oi(make_ti(0, 1), make_ti(0, 0)), (oi_int)0, 1))
+ return 1;
+ // Negative * negative = positive, no overflow
+ if (test__muloi5((oi_int)-100, (oi_int)-200, (oi_int)20000, 0))
+ return 1;
+ // === Near-overflow boundary tests ===
+ {
+ oi_int MAX = make_oi(make_ti(0x7FFFFFFFFFFFFFFFLL, -1), make_ti(-1, -1));
+ oi_int MIN = make_oi(make_ti(0x8000000000000000LL, 0), make_ti(0, 0));
+ // MAX / 2 * 2 = MAX - 1 (since MAX is odd), no overflow
+ oi_int half_max = MAX >> 1; // = (MAX-1)/2
+ if (test__muloi5(
+ half_max, (oi_int)2,
+ make_oi(make_ti(0x7FFFFFFFFFFFFFFFLL, -1), make_ti(-1, -2)), 0))
+ return 1;
+ // (MAX/2 + 1) * 2 = MAX + 1, overflows
+ if (test__muloi5(half_max + 1, (oi_int)2, (oi_int)0, 1))
+ return 1;
+ // MIN / 2 * 2 = MIN, no overflow
+ oi_int half_min = MIN >> 1; // = MIN/2
+ if (test__muloi5(half_min, (oi_int)2, MIN, 0))
+ return 1;
+ // (MIN/2 - 1) * 2 = MIN - 2, overflows
+ if (test__muloi5(half_min - 1, (oi_int)2, (oi_int)0, 1))
+ return 1;
+ // MAX * -1 = -MAX (= MIN + 1), no overflow
+ if (test__muloi5(MAX, (oi_int)-1,
+ make_oi(make_ti(0x8000000000000000LL, 0), make_ti(0, 1)),
+ 0))
+ return 1;
+ // MIN * 2 overflows
+ if (test__muloi5(MIN, (oi_int)2, (oi_int)0, 1))
+ return 1;
+ // MAX * -2 overflows
+ if (test__muloi5(MAX, (oi_int)-2, (oi_int)0, 1))
+ return 1;
+ // (1 << 127) * (1 << 127) = (1 << 254), no overflow
+ if (test__muloi5(make_oi(make_ti(0, 0), make_ti(0x8000000000000000LL, 0)),
+ make_oi(make_ti(0, 0), make_ti(0x8000000000000000LL, 0)),
+ make_oi(make_ti(0x4000000000000000LL, 0), make_ti(0, 0)),
+ 0))
+ return 1;
+ // (1 << 128) * (1 << 126) = (1 << 254), no overflow
+ if (test__muloi5(make_oi(make_ti(0, 1), make_ti(0, 0)),
+ make_oi(make_ti(0, 0), make_ti(0x4000000000000000LL, 0)),
+ make_oi(make_ti(0x4000000000000000LL, 0), make_ti(0, 0)),
+ 0))
+ return 1;
+ // (1 << 128) * (1 << 127) = (1 << 255) overflows (== MIN as unsigned,
+ // but as signed this is negative and the operands are both positive)
+ if (test__muloi5(make_oi(make_ti(0, 1), make_ti(0, 0)),
+ make_oi(make_ti(0, 0), make_ti(0x8000000000000000LL, 0)),
+ (oi_int)0, 1))
+ return 1;
+ }
+ // === Commutativity check ===
+ {
+ int ov1, ov2;
+ oi_int a = make_oi(make_ti(0x12345678LL, 0), make_ti(0, 0xABCDEF01ULL));
+ oi_int b = make_oi(make_ti(0, 0), make_ti(0, 0xFEDCBA98ULL));
+ oi_int r1 = __muloi5(a, b, &ov1);
+ oi_int r2 = __muloi5(b, a, &ov2);
+ if (r1 != r2 || ov1 != ov2)
+ return 1;
+ }
+ // Full-width big-number multiplication (fits in 255 bits, no overflow).
+ // Expected value verified by Python arbitrary-precision arithmetic.
+ if (test__muloi5(
+ make_oi(make_ti(0x0000000000000000LL, 0x0000000000000000ULL),
+ make_ti(0x7766554433221100ULL, 0xFFEEDDCCBBAA9988ULL)),
+ make_oi(make_ti(0x0000000000000000LL, 0x0000000000000000ULL),
+ make_ti(0x0000000000000002ULL, 0x1111111111111111ULL)),
+ make_oi(make_ti(0x0000000000000000LL, 0xF6C26BF3589BBCBDULL),
+ make_ti(0xC4B3A291806F5E4CULL, 0x3334579D048E3A08ULL)),
+ 0))
+ return 1;
+#else
+ printf("skipped\n");
+#endif
+ return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/multi5_test.c b/compiler-rt/test/builtins/Unit/multi5_test.c
new file mode 100644
index 0000000000000..f0c594c18d203
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/multi5_test.c
@@ -0,0 +1,174 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_multi5
+// REQUIRES: int256
+//
+// Tests for 256-bit multiplication (__multi5). The 128-bit equivalent
+// (multi3_test.c) has ~125 lines of hand-picked cases; this test matches that
+// approach and adds cases specifically targeting 256-bit partial product carry
+// propagation (4 x 128-bit partial products), commutativity, and squaring
+// identities.
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI oi_int __multi5(oi_int a, oi_int b);
+
+int test__multi5(oi_int a, oi_int b, oi_int expected) {
+ oi_int x = __multi5(a, b);
+ if (x != expected) {
+ printf("error in __multi5\n");
+ return 1;
+ }
+ return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+ // 0 * 0
+ if (test__multi5((oi_int)0, (oi_int)0, (oi_int)0))
+ return 1;
+ // 1 * 1
+ if (test__multi5((oi_int)1, (oi_int)1, (oi_int)1))
+ return 1;
+ // 2 * 3
+ if (test__multi5((oi_int)2, (oi_int)3, (oi_int)6))
+ return 1;
+ // -1 * 1
+ if (test__multi5((oi_int)-1, (oi_int)1, (oi_int)-1))
+ return 1;
+ // -1 * -1
+ if (test__multi5((oi_int)-1, (oi_int)-1, (oi_int)1))
+ return 1;
+ // Large * 0
+ if (test__multi5(make_oi(make_ti(0xFFFF, 0xFFFF), make_ti(0xFFFF, 0xFFFF)),
+ (oi_int)0, (oi_int)0))
+ return 1;
+ // 0 * large
+ if (test__multi5((oi_int)0,
+ make_oi(make_ti(0xFFFF, 0xFFFF), make_ti(0xFFFF, 0xFFFF)),
+ (oi_int)0))
+ return 1;
+ // 0x10000 * 0x10000 = 0x100000000
+ if (test__multi5((oi_int)0x10000, (oi_int)0x10000, (oi_int)0x100000000LL))
+ return 1;
+ // Large value multiplication within low half
+ if (test__multi5(make_oi(make_ti(0, 0), make_ti(0, 0x100000000LL)),
+ make_oi(make_ti(0, 0), make_ti(0, 0x100000000LL)),
+ make_oi(make_ti(0, 0), make_ti(1, 0))))
+ return 1;
+ // Cross-half multiplication: low_half * small -> result in high half
+ // (1 << 64) * (1 << 64) = (1 << 128)
+ if (test__multi5(make_oi(make_ti(0, 0), make_ti(1, 0)),
+ make_oi(make_ti(0, 0), make_ti(1, 0)),
+ make_oi(make_ti(0, 1), make_ti(0, 0))))
+ return 1;
+ // (1 << 127) * 2 = (1 << 128)
+ if (test__multi5(make_oi(make_ti(0, 0), make_ti(0x8000000000000000LL, 0)),
+ (oi_int)2, make_oi(make_ti(0, 1), make_ti(0, 0))))
+ return 1;
+ // Negative * positive with cross-half result
+ // -(1 << 64) * (1 << 64) = -(1 << 128)
+ if (test__multi5(make_oi(make_ti(-1, -1), make_ti(-1, 0)),
+ make_oi(make_ti(0, 0), make_ti(1, 0)),
+ make_oi(make_ti(-1, -1), make_ti(0, 0))))
+ return 1;
+ // Large * 1 = identity
+ {
+ oi_int big = make_oi(make_ti(0x1234, 0x5678), make_ti(0x9ABC, 0xDEF0));
+ if (test__multi5(big, (oi_int)1, big))
+ return 1;
+ }
+ // Large * -1 = negation
+ if (test__multi5(make_oi(make_ti(0, 1), make_ti(0, 0)), (oi_int)-1,
+ make_oi(make_ti(-1, -1), make_ti(0, 0))))
+ return 1;
+ // High half * small
+ if (test__multi5(make_oi(make_ti(0, 1), make_ti(0, 0)), (oi_int)3,
+ make_oi(make_ti(0, 3), make_ti(0, 0))))
+ return 1;
+ // Commutativity
+ if (test__multi5((oi_int)3, make_oi(make_ti(0, 1), make_ti(0, 0)),
+ make_oi(make_ti(0, 3), make_ti(0, 0))))
+ return 1;
+ // (2^64 - 1) * (2^64 - 1) = 2^128 - 2^65 + 1
+ // Exercises partial product carry propagation across 64-bit boundaries.
+ if (test__multi5(make_oi(make_ti(0, 0), make_ti(0, -1)),
+ make_oi(make_ti(0, 0), make_ti(0, -1)),
+ make_oi(make_ti(0, 0), make_ti(0xFFFFFFFFFFFFFFFELL,
+ 0x0000000000000001LL))))
+ return 1;
+ // (2^128 - 1) * 3 = 3 * 2^128 - 3
+ // Cross-half multiplication with borrow from low half.
+ if (test__multi5(make_oi(make_ti(0, 0), make_ti(-1, -1)), (oi_int)3,
+ make_oi(make_ti(0, 2), make_ti(-1, -3))))
+ return 1;
+ // Power-of-2 multiplication: (1 << 200) * (1 << 40) = (1 << 240)
+ if (test__multi5(make_oi(make_ti(0x100, 0), make_ti(0, 0)),
+ make_oi(make_ti(0, 0), make_ti(0, 1LL << 40)),
+ make_oi(make_ti(0x1000000000000LL, 0), make_ti(0, 0))))
+ return 1;
+ // (2^64 + 1) * 3 = 3 * 2^64 + 3 -- small cross-word carry
+ if (test__multi5(make_oi(make_ti(0, 0), make_ti(1, 1)), (oi_int)3,
+ make_oi(make_ti(0, 0), make_ti(3, 3))))
+ return 1;
+ // (2^128 + 1) * (2^128 - 1) = 2^256 - 1 (wraps to -1 in signed)
+ if (test__multi5(make_oi(make_ti(0, 1), make_ti(0, 1)),
+ make_oi(make_ti(0, 0), make_ti(-1, -1)), (oi_int)-1))
+ return 1;
+ // All-ones * all-ones = 1 (in modular arithmetic, (-1)*(-1) = 1)
+ if (test__multi5((oi_int)-1, (oi_int)-1, (oi_int)1))
+ return 1;
+ // === Large * large where all 4 partial products contribute ===
+ // a = (2^192 + 2^64 + 1), b = (2^192 + 2^64 + 1)
+ // a^2 = 2^384 + 2*2^256 + 2*2^192 + 2^128 + 2*2^64 + 1
+ // Mod 2^256: 2^193 + 2^128 + 2^65 + 1 (2^384 and 2*2^256 overflow away)
+ {
+ oi_int a = make_oi(make_ti(1, 0), make_ti(1, 1));
+ oi_int expected = make_oi(make_ti(2, 1), make_ti(2, 1));
+ if (test__multi5(a, a, expected))
+ return 1;
+ }
+ // Verify a * b == b * a for all partial product combinations
+ // a has bits set in all 4 64-bit words, b likewise
+ {
+ oi_int a = make_oi(make_ti(0xAAAAAAAA, 0xBBBBBBBB),
+ make_ti(0xCCCCCCCC, 0xDDDDDDDD));
+ oi_int b = make_oi(make_ti(0x11111111, 0x22222222),
+ make_ti(0x33333333, 0x44444444));
+ oi_int r1 = __multi5(a, b);
+ oi_int r2 = __multi5(b, a);
+ if (r1 != r2)
+ return 1;
+ // Also verify (a * b) / b == a (division is separately tested)
+ }
+ // Squaring: (2^128 - 1)^2 = 2^256 - 2^129 + 1
+ // Mod 2^256: -2^129 + 1 = -(2^129) + 1
+ {
+ oi_int a = make_oi(make_ti(0, 0), make_ti(-1, -1)); // 2^128 - 1
+ // Expected: 2^256 - 2^129 + 1 mod 2^256
+ // = 0xFFFF...FFFE 0000...0000 0000...0001
+ oi_int expected = make_oi(make_ti(-1, -2), make_ti(0, 1));
+ if (test__multi5(a, a, expected))
+ return 1;
+ }
+ // Full-width big-number test (all 4 limbs populated).
+ // Expected value verified by Python arbitrary-precision arithmetic.
+ if (test__multi5(
+ make_oi(make_ti(0xAAAABBBBCCCCDDDDLL, 0xEEEEFFFF11112222ULL),
+ make_ti(0x3333444455556666ULL, 0x7777888899990000ULL)),
+ make_oi(make_ti(0x1111222233334444LL, 0x5555666677778888ULL),
+ make_ti(0x9999AAAABBBBCCCCULL, 0xDDDDEEEEFFFF1111ULL)),
+ make_oi(make_ti(0x0B609752EEEECDEFLL, 0xF01311110ECA71C7ULL),
+ make_ti(0x06D389ABB60B47ADULL, 0xFA4F89AC5C290000ULL))))
+ return 1;
+#else
+ printf("skipped\n");
+#endif
+ return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/mulvoi3_test.c b/compiler-rt/test/builtins/Unit/mulvoi3_test.c
new file mode 100644
index 0000000000000..c9771c88f24d6
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/mulvoi3_test.c
@@ -0,0 +1,119 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_mulvoi3
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI oi_int __mulvoi3(oi_int a, oi_int b);
+
+int test__mulvoi3(oi_int a, oi_int b, oi_int expected) {
+ oi_int x = __mulvoi3(a, b);
+ if (x != expected) {
+ printf("error in __mulvoi3\n");
+ return 1;
+ }
+ return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+ if (test__mulvoi3((oi_int)0, (oi_int)0, (oi_int)0))
+ return 1;
+ if (test__mulvoi3((oi_int)1, (oi_int)1, (oi_int)1))
+ return 1;
+ if (test__mulvoi3((oi_int)2, (oi_int)3, (oi_int)6))
+ return 1;
+ if (test__mulvoi3((oi_int)-1, (oi_int)1, (oi_int)-1))
+ return 1;
+ if (test__mulvoi3((oi_int)-1, (oi_int)-1, (oi_int)1))
+ return 1;
+ if (test__mulvoi3((oi_int)0x10000, (oi_int)0x10000, (oi_int)0x100000000LL))
+ return 1;
+ // Large * 0 = 0
+ if (test__mulvoi3(make_oi(make_ti(0x7FFFFFFFFFFFFFFFLL, -1), make_ti(-1, -1)),
+ (oi_int)0, (oi_int)0))
+ return 1;
+ // Cross-half multiplication: (1 << 64) * (1 << 64) = (1 << 128)
+ if (test__mulvoi3(make_oi(make_ti(0, 0), make_ti(1, 0)),
+ make_oi(make_ti(0, 0), make_ti(1, 0)),
+ make_oi(make_ti(0, 1), make_ti(0, 0))))
+ return 1;
+ // Negative * positive
+ if (test__mulvoi3((oi_int)-100, (oi_int)200, (oi_int)-20000))
+ return 1;
+ // Negative * negative
+ if (test__mulvoi3((oi_int)-100, (oi_int)-200, (oi_int)20000))
+ return 1;
+ // Large * 1 = identity
+ {
+ oi_int big = make_oi(make_ti(0x1234, 0x5678), make_ti(0x9ABC, 0xDEF0));
+ if (test__mulvoi3(big, (oi_int)1, big))
+ return 1;
+ }
+ // Note: overflow cases would abort, so we don't test them.
+ // Instead, we test the maximum non-overflowing products.
+
+ // MAX * 1 = MAX
+ {
+ oi_int MAX = make_oi(make_ti(0x7FFFFFFFFFFFFFFFLL, -1), make_ti(-1, -1));
+ if (test__mulvoi3(MAX, (oi_int)1, MAX))
+ return 1;
+ }
+ // MAX * -1 = -MAX (= MIN + 1)
+ {
+ oi_int MAX = make_oi(make_ti(0x7FFFFFFFFFFFFFFFLL, -1), make_ti(-1, -1));
+ oi_int NEG_MAX = make_oi(make_ti(0x8000000000000000LL, 0), make_ti(0, 1));
+ if (test__mulvoi3(MAX, (oi_int)-1, NEG_MAX))
+ return 1;
+ }
+ // MIN * 1 = MIN
+ {
+ oi_int MIN = make_oi(make_ti(0x8000000000000000LL, 0), make_ti(0, 0));
+ if (test__mulvoi3(MIN, (oi_int)1, MIN))
+ return 1;
+ }
+ // (MAX/2) * 2 = MAX - 1 (MAX is odd)
+ {
+ oi_int MAX = make_oi(make_ti(0x7FFFFFFFFFFFFFFFLL, -1), make_ti(-1, -1));
+ oi_int half = MAX >> 1;
+ oi_int expected =
+ make_oi(make_ti(0x7FFFFFFFFFFFFFFFLL, -1), make_ti(-1, -2));
+ if (test__mulvoi3(half, (oi_int)2, expected))
+ return 1;
+ }
+ // (1 << 127) * (1 << 127) = (1 << 254), near MAX but not overflow
+ if (test__mulvoi3(make_oi(make_ti(0, 0), make_ti(0x8000000000000000LL, 0)),
+ make_oi(make_ti(0, 0), make_ti(0x8000000000000000LL, 0)),
+ make_oi(make_ti(0x4000000000000000LL, 0), make_ti(0, 0))))
+ return 1;
+ // Commutativity
+ if (test__mulvoi3((oi_int)17, (oi_int)19, (oi_int)323))
+ return 1;
+ if (test__mulvoi3((oi_int)19, (oi_int)17, (oi_int)323))
+ return 1;
+ // Large negative * negative = positive
+ if (test__mulvoi3(make_oi(make_ti(-1, -1), make_ti(-1, -100)), (oi_int)-1,
+ make_oi(make_ti(0, 0), make_ti(0, 100))))
+ return 1;
+ // Full-width big-number multiplication (fits in 255 bits, no overflow).
+ // Expected value verified by Python arbitrary-precision arithmetic.
+ if (test__mulvoi3(
+ make_oi(make_ti(0x0000000000000000LL, 0x0000000000000000ULL),
+ make_ti(0x7766554433221100ULL, 0xFFEEDDCCBBAA9988ULL)),
+ make_oi(make_ti(0x0000000000000000LL, 0x0000000000000000ULL),
+ make_ti(0x0000000000000002ULL, 0x1111111111111111ULL)),
+ make_oi(make_ti(0x0000000000000000LL, 0xF6C26BF3589BBCBDULL),
+ make_ti(0xC4B3A291806F5E4CULL, 0x3334579D048E3A08ULL))))
+ return 1;
+#else
+ printf("skipped\n");
+#endif
+ return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/negoi2_test.c b/compiler-rt/test/builtins/Unit/negoi2_test.c
new file mode 100644
index 0000000000000..c51d7db210d9c
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/negoi2_test.c
@@ -0,0 +1,69 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_negoi2
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI oi_int __negoi2(oi_int a);
+
+int test__negoi2(oi_int a, oi_int expected) {
+ oi_int x = __negoi2(a);
+ if (x != expected) {
+ printf("error in __negoi2\n");
+ return 1;
+ }
+ return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+ if (test__negoi2((oi_int)0, (oi_int)0))
+ return 1;
+ if (test__negoi2((oi_int)1, (oi_int)-1))
+ return 1;
+ if (test__negoi2((oi_int)-1, (oi_int)1))
+ return 1;
+ if (test__negoi2((oi_int)42, (oi_int)-42))
+ return 1;
+ if (test__negoi2((oi_int)-42, (oi_int)42))
+ return 1;
+ // Large value in high half
+ if (test__negoi2(make_oi(make_ti(0, 1), make_ti(0, 0)),
+ make_oi(make_ti(-1, -1), make_ti(0, 0))))
+ return 1;
+ // Negate back
+ if (test__negoi2(make_oi(make_ti(-1, -1), make_ti(0, 0)),
+ make_oi(make_ti(0, 1), make_ti(0, 0))))
+ return 1;
+ // MAX
+ if (test__negoi2(make_oi(make_ti(0x7FFFFFFFFFFFFFFFLL, -1), make_ti(-1, -1)),
+ make_oi(make_ti(0x8000000000000000LL, 0), make_ti(0, 1))))
+ return 1;
+ // Value with bits in low half only
+ if (test__negoi2(make_oi(make_ti(0, 0), make_ti(0, 1)),
+ make_oi(make_ti(-1, -1), make_ti(-1, -1))))
+ return 1;
+ // Value spanning both halves
+ if (test__negoi2(make_oi(make_ti(0, 1), make_ti(0, 1)),
+ make_oi(make_ti(-1, -2), make_ti(-1, -1))))
+ return 1;
+ // Full-width big-number test (all 4 limbs populated).
+ // Expected value verified by Python arbitrary-precision arithmetic.
+ if (test__negoi2(
+ make_oi(make_ti(0xAAAABBBBCCCCDDDDLL, 0xEEEEFFFF11112222ULL),
+ make_ti(0x3333444455556666ULL, 0x7777888899990000ULL)),
+ make_oi(make_ti(0x5555444433332222LL, 0x11110000EEEEDDDDULL),
+ make_ti(0xCCCCBBBBAAAA9999ULL, 0x8888777766670000ULL))))
+ return 1;
+#else
+ printf("skipped\n");
+#endif
+ return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/negvoi2_test.c b/compiler-rt/test/builtins/Unit/negvoi2_test.c
new file mode 100644
index 0000000000000..6e11f6a855284
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/negvoi2_test.c
@@ -0,0 +1,59 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_negvoi2
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI oi_int __negvoi2(oi_int a);
+
+int test__negvoi2(oi_int a, oi_int expected) {
+ oi_int x = __negvoi2(a);
+ if (x != expected) {
+ printf("error in __negvoi2\n");
+ return 1;
+ }
+ return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+ if (test__negvoi2((oi_int)0, (oi_int)0))
+ return 1;
+ if (test__negvoi2((oi_int)1, (oi_int)-1))
+ return 1;
+ if (test__negvoi2((oi_int)-1, (oi_int)1))
+ return 1;
+ if (test__negvoi2((oi_int)42, (oi_int)-42))
+ return 1;
+ if (test__negvoi2((oi_int)-42, (oi_int)42))
+ return 1;
+ // Large value in high half
+ if (test__negvoi2(make_oi(make_ti(0, 1), make_ti(0, 0)),
+ make_oi(make_ti(-1, -1), make_ti(0, 0))))
+ return 1;
+ // MAX
+ if (test__negvoi2(make_oi(make_ti(0x7FFFFFFFFFFFFFFFLL, -1), make_ti(-1, -1)),
+ make_oi(make_ti(0x8000000000000000LL, 0), make_ti(0, 1))))
+ return 1;
+ // Note: MIN would abort, so we don't test it.
+ // Full-width big-number test (all 4 limbs populated).
+ // Expected value verified by Python arbitrary-precision arithmetic.
+ // C is negative signed; -C = |C|
+ if (test__negvoi2(
+ make_oi(make_ti(0xDDDDEEEEFFFF0000LL, 0x1111222233334444ULL),
+ make_ti(0x5555666677778888ULL, 0x9999AAAABBBBCCCCULL)),
+ make_oi(make_ti(0x222211110000FFFFLL, 0xEEEEDDDDCCCCBBBBULL),
+ make_ti(0xAAAA999988887777ULL, 0x6666555544443334ULL))))
+ return 1;
+#else
+ printf("skipped\n");
+#endif
+ return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/parityoi2_test.c b/compiler-rt/test/builtins/Unit/parityoi2_test.c
new file mode 100644
index 0000000000000..fd8094091240d
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/parityoi2_test.c
@@ -0,0 +1,83 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_parityoi2
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI int __parityoi2(oi_int a);
+
+int test__parityoi2(oi_int a, int expected) {
+ int x = __parityoi2(a);
+ if (x != expected) {
+ printf("error in __parityoi2: expected %d, got %d\n", expected, x);
+ return 1;
+ }
+ return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+ // Zero (even parity)
+ if (test__parityoi2((oi_int)0, 0))
+ return 1;
+ // One (odd parity)
+ if (test__parityoi2((oi_int)1, 1))
+ return 1;
+ // Two bits set (even parity)
+ if (test__parityoi2((oi_int)3, 0))
+ return 1;
+ // Three bits set (odd parity)
+ if (test__parityoi2((oi_int)7, 1))
+ return 1;
+ // All ones = 256 bits set (even parity)
+ if (test__parityoi2((oi_int)(ou_int)-1, 0))
+ return 1;
+ // One bit in high half (odd parity)
+ if (test__parityoi2(make_oi(make_ti(0, 1), make_ti(0, 0)), 1))
+ return 1;
+ // One bit in each half (even parity)
+ if (test__parityoi2(make_oi(make_ti(0, 1), make_ti(0, 1)), 0))
+ return 1;
+ // High half all ones (128 bits = even), low half zero
+ if (test__parityoi2(make_oi(make_ti(-1, -1), make_ti(0, 0)), 0))
+ return 1;
+ // 0xFF (8 bits = even parity)
+ if (test__parityoi2((oi_int)0xFF, 0))
+ return 1;
+ // 0x7F (7 bits = odd parity)
+ if (test__parityoi2((oi_int)0x7F, 1))
+ return 1;
+ // MSB only (odd parity)
+ if (test__parityoi2(make_oi(make_ti(0x8000000000000000LL, 0), make_ti(0, 0)),
+ 1))
+ return 1;
+ // One bit in each 64-bit word (4 bits = even parity)
+ if (test__parityoi2(make_oi(make_ti(1, 1), make_ti(1, 1)), 0))
+ return 1;
+ // Three bits across multiple words (odd parity)
+ if (test__parityoi2(make_oi(make_ti(1, 1), make_ti(1, 0)), 1))
+ return 1;
+ // Full-width big-number tests.
+ // Expected values verified by Python arbitrary-precision arithmetic.
+ if (test__parityoi2(
+ make_oi(make_ti(0xAAAABBBBCCCCDDDDLL, 0xEEEEFFFF11112222ULL),
+ make_ti(0x3333444455556666ULL, 0x7777888899990000ULL)),
+ 0))
+ return 1;
+ if (test__parityoi2(
+ make_oi(make_ti(0x1111222233334444LL, 0x5555666677778888ULL),
+ make_ti(0x9999AAAABBBBCCCCULL, 0xDDDDEEEEFFFF1111ULL)),
+ 0))
+ return 1;
+#else
+ printf("skipped\n");
+#endif
+ return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/popcountoi2_test.c b/compiler-rt/test/builtins/Unit/popcountoi2_test.c
new file mode 100644
index 0000000000000..87b4237a0821d
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/popcountoi2_test.c
@@ -0,0 +1,86 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_popcountoi2
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI int __popcountoi2(oi_int a);
+
+int test__popcountoi2(oi_int a, int expected) {
+ int x = __popcountoi2(a);
+ if (x != expected) {
+ printf("error in __popcountoi2: expected %d, got %d\n", expected, x);
+ return 1;
+ }
+ return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+ // Zero
+ if (test__popcountoi2((oi_int)0, 0))
+ return 1;
+ // One
+ if (test__popcountoi2((oi_int)1, 1))
+ return 1;
+ // All ones (256 bits)
+ if (test__popcountoi2((oi_int)(ou_int)-1, 256))
+ return 1;
+ // 0xFF (8 bits set)
+ if (test__popcountoi2((oi_int)0xFF, 8))
+ return 1;
+ // One bit in each 128-bit half
+ if (test__popcountoi2(make_oi(make_ti(0, 1), make_ti(0, 1)), 2))
+ return 1;
+ // 0xFF in high half only
+ if (test__popcountoi2(make_oi(make_ti(0, 0xFF), make_ti(0, 0)), 8))
+ return 1;
+ // Alternating bits (0xAA...AA) = 128 bits set
+ if (test__popcountoi2(
+ make_oi(make_ti(0xAAAAAAAAAAAAAAAALL, 0xAAAAAAAAAAAAAAAALL),
+ make_ti(0xAAAAAAAAAAAAAAAALL, 0xAAAAAAAAAAAAAAAALL)),
+ 128))
+ return 1;
+ // Alternating bits (0x55...55) = 128 bits set
+ if (test__popcountoi2(
+ make_oi(make_ti(0x5555555555555555LL, 0x5555555555555555LL),
+ make_ti(0x5555555555555555LL, 0x5555555555555555LL)),
+ 128))
+ return 1;
+ // One bit in each 64-bit word (4 bits total)
+ if (test__popcountoi2(make_oi(make_ti(1, 1), make_ti(1, 1)), 4))
+ return 1;
+ // High half all ones, low half zero = 128
+ if (test__popcountoi2(make_oi(make_ti(-1, -1), make_ti(0, 0)), 128))
+ return 1;
+ // Low half all ones, high half zero = 128
+ if (test__popcountoi2(make_oi(make_ti(0, 0), make_ti(-1, -1)), 128))
+ return 1;
+ // Single high bit = 1
+ if (test__popcountoi2(
+ make_oi(make_ti(0x8000000000000000LL, 0), make_ti(0, 0)), 1))
+ return 1;
+ // Full-width big-number tests.
+ // Expected values verified by Python arbitrary-precision arithmetic.
+ if (test__popcountoi2(
+ make_oi(make_ti(0xAAAABBBBCCCCDDDDLL, 0xEEEEFFFF11112222ULL),
+ make_ti(0x3333444455556666ULL, 0x7777888899990000ULL)),
+ 128))
+ return 1;
+ if (test__popcountoi2(
+ make_oi(make_ti(0x1111222233334444LL, 0x5555666677778888ULL),
+ make_ti(0x9999AAAABBBBCCCCULL, 0xDDDDEEEEFFFF1111ULL)),
+ 132))
+ return 1;
+#else
+ printf("skipped\n");
+#endif
+ return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/subvoi3_test.c b/compiler-rt/test/builtins/Unit/subvoi3_test.c
new file mode 100644
index 0000000000000..65969571454a3
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/subvoi3_test.c
@@ -0,0 +1,81 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_subvoi3
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI oi_int __subvoi3(oi_int a, oi_int b);
+
+int test__subvoi3(oi_int a, oi_int b, oi_int expected) {
+ oi_int x = __subvoi3(a, b);
+ if (x != expected) {
+ printf("error in __subvoi3\n");
+ return 1;
+ }
+ return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+ if (test__subvoi3((oi_int)0, (oi_int)0, (oi_int)0))
+ return 1;
+ if (test__subvoi3((oi_int)2, (oi_int)1, (oi_int)1))
+ return 1;
+ if (test__subvoi3((oi_int)0, (oi_int)1, (oi_int)-1))
+ return 1;
+ if (test__subvoi3((oi_int)300, (oi_int)200, (oi_int)100))
+ return 1;
+ // Negative result
+ if (test__subvoi3((oi_int)100, (oi_int)200, (oi_int)-100))
+ return 1;
+ // Negative - positive
+ if (test__subvoi3((oi_int)-100, (oi_int)200, (oi_int)-300))
+ return 1;
+ // Negative - negative
+ if (test__subvoi3((oi_int)-100, (oi_int)-200, (oi_int)100))
+ return 1;
+ // Borrow across 128-bit boundary (high half to low half)
+ if (test__subvoi3(make_oi(make_ti(0, 1), make_ti(0, 0)),
+ make_oi(make_ti(0, 0), make_ti(0, 1)),
+ make_oi(make_ti(0, 0), make_ti(-1, -1))))
+ return 1;
+ // Large values
+ if (test__subvoi3(make_oi(make_ti(0, 3), make_ti(0, 0)),
+ make_oi(make_ti(0, 1), make_ti(0, 0)),
+ make_oi(make_ti(0, 2), make_ti(0, 0))))
+ return 1;
+ // x - x = 0
+ {
+ oi_int big = make_oi(make_ti(0x1234, 0x5678), make_ti(0x9ABC, 0xDEF0));
+ if (test__subvoi3(big, big, (oi_int)0))
+ return 1;
+ }
+ // x - 0 = x
+ {
+ oi_int big = make_oi(make_ti(0x1234, 0x5678), make_ti(0x9ABC, 0xDEF0));
+ if (test__subvoi3(big, (oi_int)0, big))
+ return 1;
+ }
+ // Full-width big-number test (all 4 limbs populated).
+ // Expected value verified by Python arbitrary-precision arithmetic.
+ // B(signed) - A(signed) = 0x66666666...66661111
+ if (test__subvoi3(
+ make_oi(make_ti(0x1111222233334444LL, 0x5555666677778888ULL),
+ make_ti(0x9999AAAABBBBCCCCULL, 0xDDDDEEEEFFFF1111ULL)),
+ make_oi(make_ti(0xAAAABBBBCCCCDDDDLL, 0xEEEEFFFF11112222ULL),
+ make_ti(0x3333444455556666ULL, 0x7777888899990000ULL)),
+ make_oi(make_ti(0x6666666666666666LL, 0x6666666766666666ULL),
+ make_ti(0x6666666666666666ULL, 0x6666666666661111ULL))))
+ return 1;
+#else
+ printf("skipped\n");
+#endif
+ return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/ucmpoi2_test.c b/compiler-rt/test/builtins/Unit/ucmpoi2_test.c
new file mode 100644
index 0000000000000..5881f3b8c01e9
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/ucmpoi2_test.c
@@ -0,0 +1,89 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_ucmpoi2
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI si_int __ucmpoi2(ou_int a, ou_int b);
+
+int test__ucmpoi2(ou_int a, ou_int b, si_int expected) {
+ si_int x = __ucmpoi2(a, b);
+ if (x != expected) {
+ printf("error in __ucmpoi2: expected %d, got %d\n", expected, x);
+ return 1;
+ }
+ return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+ // Equal
+ if (test__ucmpoi2((ou_int)0, (ou_int)0, 1))
+ return 1;
+ if (test__ucmpoi2((ou_int)1, (ou_int)1, 1))
+ return 1;
+ // Less than
+ if (test__ucmpoi2((ou_int)0, (ou_int)1, 0))
+ return 1;
+ // Greater than
+ if (test__ucmpoi2((ou_int)1, (ou_int)0, 2))
+ return 1;
+ // All-ones is maximum unsigned
+ if (test__ucmpoi2((ou_int)-1, (ou_int)0, 2))
+ return 1;
+ if (test__ucmpoi2((ou_int)0, (ou_int)-1, 0))
+ return 1;
+ // High half comparison
+ if (test__ucmpoi2(make_ou(make_tu(0, 1), make_tu(0, 0)),
+ make_ou(make_tu(0, 0), make_tu(-1, -1)), 2))
+ return 1;
+ // Large equal values
+ {
+ ou_int big = make_ou(make_tu(0x1234, 0x5678), make_tu(0x9ABC, 0xDEF0));
+ if (test__ucmpoi2(big, big, 1))
+ return 1;
+ }
+ // MAX > 0 (repeats the all-ones check above for symmetry with the MIN cases)
+ if (test__ucmpoi2((ou_int)-1, (ou_int)0, 2))
+ return 1;
+ // Differ only in low half
+ if (test__ucmpoi2(make_ou(make_tu(0, 1), make_tu(0, 1)),
+ make_ou(make_tu(0, 1), make_tu(0, 2)), 0))
+ return 1;
+ if (test__ucmpoi2(make_ou(make_tu(0, 1), make_tu(0, 2)),
+ make_ou(make_tu(0, 1), make_tu(0, 1)), 2))
+ return 1;
+ // Differ only in highest 64-bit word
+ if (test__ucmpoi2(make_ou(make_tu(1, 0), make_tu(0, 0)),
+ make_ou(make_tu(2, 0), make_tu(0, 0)), 0))
+ return 1;
+ if (test__ucmpoi2(make_ou(make_tu(2, 0), make_tu(0, 0)),
+ make_ou(make_tu(1, 0), make_tu(0, 0)), 2))
+ return 1;
+ // Adjacent values
+ if (test__ucmpoi2((ou_int)100, (ou_int)101, 0))
+ return 1;
+ if (test__ucmpoi2((ou_int)101, (ou_int)100, 2))
+ return 1;
+ // Full-width big-number test (all 4 limbs populated).
+ // A > B unsigned.
+ // Expected value verified by Python arbitrary-precision arithmetic.
+ if (test__ucmpoi2(
+ make_ou(make_tu(0xAAAABBBBCCCCDDDDLL, 0xEEEEFFFF11112222ULL),
+ make_tu(0x3333444455556666ULL, 0x7777888899990000ULL)),
+ make_ou(make_tu(0x1111222233334444LL, 0x5555666677778888ULL),
+ make_tu(0x9999AAAABBBBCCCCULL, 0xDDDDEEEEFFFF1111ULL)),
+ 2))
+ return 1;
+#else
+ printf("skipped\n");
+#endif
+ return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/udivmodoi4_test.c b/compiler-rt/test/builtins/Unit/udivmodoi4_test.c
new file mode 100644
index 0000000000000..f7482dae78fd9
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/udivmodoi4_test.c
@@ -0,0 +1,272 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_udivmodoi4
+// REQUIRES: int256
+//
+// Testing strategy: The 128-bit equivalent (udivmodti4_test.c) uses a 65K-line
+// auto-generated exhaustive test vector file from the initial compiler-rt
+// import (no generator script available). Instead of replicating that approach
+// for 256-bit, this test uses:
+// 1. Hand-picked edge cases covering both code paths in the Knuth algorithm
+// (Path 1: divisor fits in 128 bits, Path 2: divisor spans both halves)
+// 2. A 100-iteration pseudo-random invariant checker that verifies
+// q * b + r == a and r < b for diverse LCG-generated inputs
+// 3. A divisor size sweep from 1-bit to 255-bit divisors
+// This catches the same class of bugs as exhaustive enumeration while being
+// maintainable and readable.
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI ou_int __udivmodoi4(ou_int a, ou_int b, ou_int *rem);
+
+int test__udivmodoi4(ou_int a, ou_int b, ou_int expected_q, ou_int expected_r) {
+ ou_int r;
+ ou_int q = __udivmodoi4(a, b, &r);
+ if (q != expected_q || r != expected_r) {
+ printf("error in __udivmodoi4\n");
+ return 1;
+ }
+ return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+ // 0 / 1
+ if (test__udivmodoi4((ou_int)0, (ou_int)1, (ou_int)0, (ou_int)0))
+ return 1;
+ // 1 / 1
+ if (test__udivmodoi4((ou_int)1, (ou_int)1, (ou_int)1, (ou_int)0))
+ return 1;
+ // 10 / 3
+ if (test__udivmodoi4((ou_int)10, (ou_int)3, (ou_int)3, (ou_int)1))
+ return 1;
+ // 100 / 7
+ if (test__udivmodoi4((ou_int)100, (ou_int)7, (ou_int)14, (ou_int)2))
+ return 1;
+ // Large value / small
+ if (test__udivmodoi4(
+ make_ou(make_tu(0, 0), make_tu(1, 0)), (ou_int)2,
+ make_ou(make_tu(0, 0), make_tu(0, 0x8000000000000000ULL)), (ou_int)0))
+ return 1;
+ // Dividend < divisor
+ if (test__udivmodoi4((ou_int)3, (ou_int)10, (ou_int)0, (ou_int)3))
+ return 1;
+ // Equal
+ if (test__udivmodoi4((ou_int)42, (ou_int)42, (ou_int)1, (ou_int)0))
+ return 1;
+ // Large divisor (both halves)
+ {
+ ou_int big = make_ou(make_tu(0, 1), make_tu(0, 0));
+ if (test__udivmodoi4(big, big, (ou_int)1, (ou_int)0))
+ return 1;
+ }
+ // (1 << 128) / 3 = 0x5555...5555 (filling the low 128-bit half), remainder 1
+ if (test__udivmodoi4(make_ou(make_tu(0, 1), make_tu(0, 0)), (ou_int)3,
+ make_ou(make_tu(0, 0), make_tu(0x5555555555555555ULL,
+ 0x5555555555555555ULL)),
+ (ou_int)1))
+ return 1;
+ // All-ones / 2 = 0x7FFF...FFFF remainder 1
+ if (test__udivmodoi4(
+ (ou_int)-1, (ou_int)2,
+ make_ou(make_tu(0x7FFFFFFFFFFFFFFFULL, -1), make_tu(-1, -1)),
+ (ou_int)1))
+ return 1;
+ // Cross-half boundary: value spans both halves
+ if (test__udivmodoi4(
+ make_ou(make_tu(0, 1), make_tu(0, 5)), (ou_int)4,
+ make_ou(make_tu(0, 0), make_tu(0x4000000000000000ULL, 1)), (ou_int)1))
+ return 1;
+ // Large / large (double)
+ {
+ ou_int big = make_ou(make_tu(0, 0x100), make_tu(0, 0));
+ ou_int dbl = make_ou(make_tu(0, 0x200), make_tu(0, 0));
+ if (test__udivmodoi4(dbl, big, (ou_int)2, (ou_int)0))
+ return 1;
+ }
+ // Very large divisor in high half
+ {
+ ou_int big = make_ou(make_tu(1, 0), make_tu(0, 0));
+ if (test__udivmodoi4(big, big, (ou_int)1, (ou_int)0))
+ return 1;
+ }
+ // Large value with remainder
+ {
+ ou_int big = make_ou(make_tu(0, 0x100), make_tu(0, 7));
+ ou_int div = make_ou(make_tu(0, 0x100), make_tu(0, 0));
+ if (test__udivmodoi4(big, div, (ou_int)1, (ou_int)7))
+ return 1;
+ }
+ // Division by power of 2 vs equivalent shift: (1 << 192) / (1 << 64)
+ // = (1 << 128). Path 1: divisor.s.high == 0.
+ if (test__udivmodoi4(make_ou(make_tu(1, 0), make_tu(0, 0)),
+ make_ou(make_tu(0, 0), make_tu(1, 0)),
+ make_ou(make_tu(0, 1), make_tu(0, 0)), (ou_int)0))
+ return 1;
+ // Path 1: Large dividend / medium 128-bit divisor.
+ // (2^192 + 2^64) / (2^64) = 2^128 + 1, remainder 0.
+ if (test__udivmodoi4(make_ou(make_tu(1, 0), make_tu(1, 0)),
+ make_ou(make_tu(0, 0), make_tu(1, 0)),
+ make_ou(make_tu(0, 1), make_tu(0, 1)), (ou_int)0))
+ return 1;
+ // Path 1: dividend.s.high >= divisor.s.low (needs two-step division).
+ // (3 * 2^128) / (2^128 - 1) = 3, remainder 3.
+ if (test__udivmodoi4(make_ou(make_tu(0, 3), make_tu(0, 0)),
+ make_ou(make_tu(0, 0), make_tu(-1, -1)), (ou_int)3,
+ (ou_int)3))
+ return 1;
+ // Path 2: Both halves set in divisor. Bit-by-bit division.
+ // (2^256 - 1) / (2^128 + 1) = 2^128 - 1, remainder 0.
+ if (test__udivmodoi4((ou_int)-1, make_ou(make_tu(0, 1), make_tu(0, 1)),
+ make_ou(make_tu(0, 0), make_tu(-1, -1)), (ou_int)0))
+ return 1;
+ // Path 2: Large 256-bit divisor with remainder.
+ // (2^255) / (2^254 + 1): quotient = 1, remainder = 2^254 - 1.
+ {
+ ou_int dividend = make_ou(make_tu(0x8000000000000000ULL, 0), make_tu(0, 0));
+ ou_int divisor = make_ou(make_tu(0x4000000000000000ULL, 0), make_tu(0, 1));
+ ou_int exp_q = (ou_int)1;
+ ou_int exp_r = make_ou(make_tu(0x3FFFFFFFFFFFFFFFULL, -1), make_tu(-1, -1));
+ if (test__udivmodoi4(dividend, divisor, exp_q, exp_r))
+ return 1;
+ }
+ // Verify q * b + r == a invariant for a non-trivial case.
+ // a = 0xDEADBEEF12345678 (repeated), b = 0xCAFEBABE (high half zero: Path 1).
+ {
+ ou_int a = make_ou(make_tu(0xDEADBEEF12345678ULL, 0xDEADBEEF12345678ULL),
+ make_tu(0xDEADBEEF12345678ULL, 0xDEADBEEF12345678ULL));
+ ou_int b = (ou_int)0xCAFEBABEULL;
+ ou_int r;
+ ou_int q = __udivmodoi4(a, b, &r);
+ if (q * b + r != a)
+ return 1;
+ // Remainder must be less than divisor.
+ if (r >= b)
+ return 1;
+ }
+ // Verify q * b + r == a for a large divisor spanning both halves.
+ {
+ ou_int a = make_ou(make_tu(0xAAAAAAAAAAAAAAAAULL, 0xBBBBBBBBBBBBBBBBULL),
+ make_tu(0xCCCCCCCCCCCCCCCCULL, 0xDDDDDDDDDDDDDDDDULL));
+ ou_int b = make_ou(make_tu(0, 0x1234567890ABCDEFULL),
+ make_tu(0xFEDCBA0987654321ULL, 0x1111111111111111ULL));
+ ou_int r;
+ ou_int q = __udivmodoi4(a, b, &r);
+ if (q * b + r != a)
+ return 1;
+ if (r >= b)
+ return 1;
+ }
+ // Full-width big-number test (all 4 limbs populated).
+ // A / B (unsigned): q = 9, r verified by Python: q*b + r == a.
+ // Expected values verified by Python arbitrary-precision arithmetic.
+ if (test__udivmodoi4(
+ make_ou(make_tu(0xAAAABBBBCCCCDDDDULL, 0xEEEEFFFF11112222ULL),
+ make_tu(0x3333444455556666ULL, 0x7777888899990000ULL)),
+ make_ou(make_tu(0x1111222233334444ULL, 0x5555666677778888ULL),
+ make_tu(0x9999AAAABBBBCCCCULL, 0xDDDDEEEEFFFF1111ULL)),
+ (ou_int)9,
+ make_ou(make_tu(0x11108887FFFF7776ULL, 0xEEEE6664DDDD5554ULL),
+ make_tu(0xCCCC4443BBBB3332ULL, 0xAAAA222199A16667ULL))))
+ return 1;
+ // === Pseudo-random invariant checker ===
+ // Generate ~100 test vectors using a simple LCG and verify q * b + r == a
+ // and r < b for each. This catches systematic bugs in the Knuth algorithm
+ // that hand-picked cases might miss.
+ {
+ // LCG parameters (Numerical Recipes)
+ unsigned long long seed = 0xDEADBEEFCAFEBABEULL;
+ int failures = 0;
+ for (int i = 0; i < 100; ++i) {
+ // Generate pseudo-random a and b using LCG
+ seed = seed * 6364136223846793005ULL + 1442695040888963407ULL;
+ unsigned long long w0 = seed;
+ seed = seed * 6364136223846793005ULL + 1442695040888963407ULL;
+ unsigned long long w1 = seed;
+ seed = seed * 6364136223846793005ULL + 1442695040888963407ULL;
+ unsigned long long w2 = seed;
+ seed = seed * 6364136223846793005ULL + 1442695040888963407ULL;
+ unsigned long long w3 = seed;
+ ou_int a = make_ou(make_tu(w3, w2), make_tu(w1, w0));
+
+ seed = seed * 6364136223846793005ULL + 1442695040888963407ULL;
+ unsigned long long d0 = seed;
+ seed = seed * 6364136223846793005ULL + 1442695040888963407ULL;
+ unsigned long long d1 = seed;
+ seed = seed * 6364136223846793005ULL + 1442695040888963407ULL;
+ unsigned long long d2 = seed;
+ seed = seed * 6364136223846793005ULL + 1442695040888963407ULL;
+ unsigned long long d3 = seed;
+ ou_int b = make_ou(make_tu(d3, d2), make_tu(d1, d0));
+ if (b == 0)
+ b = 1;
+
+ ou_int r;
+ ou_int q = __udivmodoi4(a, b, &r);
+ // Invariant: q * b + r == a
+ if (q * b + r != a) {
+ printf("invariant failure at i=%d: q * b + r != a\n", i);
+ failures++;
+ }
+ // Invariant: r < b
+ if (r >= b) {
+ printf("invariant failure at i=%d: r >= b\n", i);
+ failures++;
+ }
+ }
+ if (failures)
+ return 1;
+ }
+ // === Divisor size sweep ===
+ // Test with divisors of varying sizes: 1 bit, 32 bits, 64 bits,
+ // 128 bits, 192 bits, 255 bits. This exercises both Path 1
+ // (divisor.high == 0) and Path 2 (both halves).
+ {
+ ou_int dividend = (ou_int)-1; // max value
+ ou_int r;
+ ou_int q;
+ // 1-bit divisor
+ q = __udivmodoi4(dividend, (ou_int)1, &r);
+ if (q != dividend || r != 0)
+ return 1;
+ // 32-bit divisor
+ q = __udivmodoi4(dividend, (ou_int)0xFFFFFFFFULL, &r);
+ if (q * (ou_int)0xFFFFFFFFULL + r != dividend)
+ return 1;
+ // 64-bit divisor
+ q = __udivmodoi4(dividend, (ou_int)0xFFFFFFFFFFFFFFFFULL, &r);
+ if (q * (ou_int)0xFFFFFFFFFFFFFFFFULL + r != dividend)
+ return 1;
+ // 128-bit divisor (all ones in low half)
+ {
+ ou_int d128 = make_ou(make_tu(0, 0), make_tu(-1, -1));
+ q = __udivmodoi4(dividend, d128, &r);
+ if (q * d128 + r != dividend)
+ return 1;
+ }
+ // 192-bit divisor
+ {
+ ou_int d192 = make_ou(make_tu(0, -1), make_tu(-1, -1));
+ q = __udivmodoi4(dividend, d192, &r);
+ if (q * d192 + r != dividend)
+ return 1;
+ }
+ // 255-bit divisor (max >> 1)
+ {
+ ou_int d255 = (ou_int)-1 >> 1;
+ q = __udivmodoi4(dividend, d255, &r);
+ if (q * d255 + r != dividend)
+ return 1;
+ }
+ }
+#else
+ printf("skipped\n");
+#endif
+ return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/udivoi3_test.c b/compiler-rt/test/builtins/Unit/udivoi3_test.c
new file mode 100644
index 0000000000000..ffe90f5175611
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/udivoi3_test.c
@@ -0,0 +1,92 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_udivoi3
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI ou_int __udivoi3(ou_int a, ou_int b);
+
+int test__udivoi3(ou_int a, ou_int b, ou_int expected) {
+ ou_int x = __udivoi3(a, b);
+ if (x != expected) {
+ printf("error in __udivoi3\n");
+ return 1;
+ }
+ return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+ if (test__udivoi3((ou_int)0, (ou_int)1, (ou_int)0))
+ return 1;
+ if (test__udivoi3((ou_int)1, (ou_int)1, (ou_int)1))
+ return 1;
+ if (test__udivoi3((ou_int)10, (ou_int)3, (ou_int)3))
+ return 1;
+ if (test__udivoi3((ou_int)100, (ou_int)7, (ou_int)14))
+ return 1;
+ if (test__udivoi3((ou_int)42, (ou_int)42, (ou_int)1))
+ return 1;
+ // Dividend < divisor
+ if (test__udivoi3((ou_int)3, (ou_int)10, (ou_int)0))
+ return 1;
+ // Large value in high half / small
+ // (1 << 128) / 2 = (1 << 127)
+ if (test__udivoi3(make_ou(make_tu(0, 1), make_tu(0, 0)), (ou_int)2,
+ make_ou(make_tu(0, 0), make_tu(0x8000000000000000ULL, 0))))
+ return 1;
+ // (1 << 128) / 3
+ if (test__udivoi3(make_ou(make_tu(0, 1), make_tu(0, 0)), (ou_int)3,
+ make_ou(make_tu(0, 0), make_tu(0x5555555555555555ULL,
+ 0x5555555555555555ULL))))
+ return 1;
+ // Large / large (same value)
+ {
+ ou_int big = make_ou(make_tu(0, 0x100), make_tu(0, 0));
+ if (test__udivoi3(big, big, (ou_int)1))
+ return 1;
+ }
+ // Large / large (double)
+ {
+ ou_int big = make_ou(make_tu(0, 0x100), make_tu(0, 0));
+ ou_int dbl = make_ou(make_tu(0, 0x200), make_tu(0, 0));
+ if (test__udivoi3(dbl, big, (ou_int)2))
+ return 1;
+ }
+ // All-ones / 2
+ if (test__udivoi3(
+ (ou_int)-1, (ou_int)2,
+ make_ou(make_tu(0x7FFFFFFFFFFFFFFFULL, -1), make_tu(-1, -1))))
+ return 1;
+ // Cross-half boundary value / small
+ if (test__udivoi3(make_ou(make_tu(0, 1), make_tu(0, 4)), (ou_int)4,
+ make_ou(make_tu(0, 0), make_tu(0x4000000000000000ULL, 1))))
+ return 1;
+ // Very large divisor in high half
+ {
+ ou_int big = make_ou(make_tu(1, 0), make_tu(0, 0));
+ if (test__udivoi3(big, big, (ou_int)1))
+ return 1;
+ }
+ // Full-width big-number test (all 4 limbs populated).
+ // A / B (unsigned) = 9.
+ // Expected value verified by Python arbitrary-precision arithmetic.
+ if (test__udivoi3(
+ make_ou(make_tu(0xAAAABBBBCCCCDDDDULL, 0xEEEEFFFF11112222ULL),
+ make_tu(0x3333444455556666ULL, 0x7777888899990000ULL)),
+ make_ou(make_tu(0x1111222233334444ULL, 0x5555666677778888ULL),
+ make_tu(0x9999AAAABBBBCCCCULL, 0xDDDDEEEEFFFF1111ULL)),
+ (ou_int)9))
+ return 1;
+#else
+ printf("skipped\n");
+#endif
+ return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/umodoi3_test.c b/compiler-rt/test/builtins/Unit/umodoi3_test.c
new file mode 100644
index 0000000000000..db8397bc6081c
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/umodoi3_test.c
@@ -0,0 +1,80 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_umodoi3
+// REQUIRES: int256
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#ifdef CRT_HAS_256BIT
+
+COMPILER_RT_ABI ou_int __umodoi3(ou_int a, ou_int b);
+
+int test__umodoi3(ou_int a, ou_int b, ou_int expected) {
+ ou_int x = __umodoi3(a, b);
+ if (x != expected) {
+ printf("error in __umodoi3\n");
+ return 1;
+ }
+ return 0;
+}
+
+char assumption_1[sizeof(oi_int) == 2 * sizeof(ti_int)] = {0};
+
+#endif
+
+int main() {
+#ifdef CRT_HAS_256BIT
+ if (test__umodoi3((ou_int)0, (ou_int)1, (ou_int)0))
+ return 1;
+ if (test__umodoi3((ou_int)10, (ou_int)3, (ou_int)1))
+ return 1;
+ if (test__umodoi3((ou_int)100, (ou_int)7, (ou_int)2))
+ return 1;
+ if (test__umodoi3((ou_int)42, (ou_int)42, (ou_int)0))
+ return 1;
+ if (test__umodoi3((ou_int)3, (ou_int)10, (ou_int)3))
+ return 1;
+ // (1 << 128) % 2 = 0
+ if (test__umodoi3(make_ou(make_tu(0, 1), make_tu(0, 0)), (ou_int)2,
+ (ou_int)0))
+ return 1;
+ // (1 << 128) % 3 = 1
+ if (test__umodoi3(make_ou(make_tu(0, 1), make_tu(0, 0)), (ou_int)3,
+ (ou_int)1))
+ return 1;
+ // All-ones % 2 = 1
+ if (test__umodoi3((ou_int)-1, (ou_int)2, (ou_int)1))
+ return 1;
+ // Cross-half boundary value mod small
+ if (test__umodoi3(make_ou(make_tu(0, 1), make_tu(0, 5)), (ou_int)4,
+ (ou_int)1))
+ return 1;
+ // Large mod large (same value)
+ {
+ ou_int big = make_ou(make_tu(0, 0x100), make_tu(0, 0));
+ if (test__umodoi3(big, big, (ou_int)0))
+ return 1;
+ }
+ // Large mod large (double)
+ {
+ ou_int big = make_ou(make_tu(0, 0x100), make_tu(0, 0));
+ ou_int dbl = make_ou(make_tu(0, 0x200), make_tu(0, 0));
+ if (test__umodoi3(dbl, big, (ou_int)0))
+ return 1;
+ }
+ // Full-width big-number test (all 4 limbs populated).
+ // A % B (unsigned), verified by Python: q*b + r == a.
+ // Expected value verified by Python arbitrary-precision arithmetic.
+ if (test__umodoi3(
+ make_ou(make_tu(0xAAAABBBBCCCCDDDDULL, 0xEEEEFFFF11112222ULL),
+ make_tu(0x3333444455556666ULL, 0x7777888899990000ULL)),
+ make_ou(make_tu(0x1111222233334444ULL, 0x5555666677778888ULL),
+ make_tu(0x9999AAAABBBBCCCCULL, 0xDDDDEEEEFFFF1111ULL)),
+ make_ou(make_tu(0x11108887FFFF7776ULL, 0xEEEE6664DDDD5554ULL),
+ make_tu(0xCCCC4443BBBB3332ULL, 0xAAAA222199A16667ULL))))
+ return 1;
+#else
+ printf("skipped\n");
+#endif
+ return 0;
+}
>From e7a72f8342d67c4a3ad069108dde284938b58bcb Mon Sep 17 00:00:00 2001
From: Xavier Roche <xavier.roche at algolia.com>
Date: Tue, 24 Feb 2026 21:40:12 +0100
Subject: [PATCH 09/17] [compiler-rt] Add __int256 UBSan support
Extend UndefinedBehaviorSanitizer to handle 256-bit integers:
- ubsan_value.h: Add 256-bit case to getIntegerBitWidth/Value
- ubsan_value.cpp: Support 256-bit integer rendering in diagnostics
- ubsan_diag.cpp: Handle 256-bit integer formatting
- Test: signed overflow detection with __int256 values
Assisted-by: Claude (Anthropic)
Co-Authored-By: Claude Opus 4.6 <noreply at anthropic.com>
---
compiler-rt/lib/ubsan/ubsan_diag.cpp | 8 +++-
compiler-rt/lib/ubsan/ubsan_value.cpp | 15 +++++++
compiler-rt/lib/ubsan/ubsan_value.h | 13 +++++-
.../TestCases/Integer/int256-overflow.cpp | 45 +++++++++++++++++++
4 files changed, 79 insertions(+), 2 deletions(-)
create mode 100644 compiler-rt/test/ubsan/TestCases/Integer/int256-overflow.cpp
diff --git a/compiler-rt/lib/ubsan/ubsan_diag.cpp b/compiler-rt/lib/ubsan/ubsan_diag.cpp
index 2146ed3c27287..cdb2b6b489622 100644
--- a/compiler-rt/lib/ubsan/ubsan_diag.cpp
+++ b/compiler-rt/lib/ubsan/ubsan_diag.cpp
@@ -133,7 +133,13 @@ Diag &Diag::operator<<(const Value &V) {
/// Hexadecimal printing for numbers too large for Printf to handle directly.
static void RenderHex(InternalScopedString *Buffer, UIntMax Val) {
-#if HAVE_INT128_T
+#if HAVE_INT256_T
+ Buffer->AppendF("0x%08x%08x%08x%08x%08x%08x%08x%08x",
+ (unsigned int)(Val >> 224), (unsigned int)(Val >> 192),
+ (unsigned int)(Val >> 160), (unsigned int)(Val >> 128),
+ (unsigned int)(Val >> 96), (unsigned int)(Val >> 64),
+ (unsigned int)(Val >> 32), (unsigned int)(Val));
+#elif HAVE_INT128_T
Buffer->AppendF("0x%08x%08x%08x%08x", (unsigned int)(Val >> 96),
(unsigned int)(Val >> 64), (unsigned int)(Val >> 32),
(unsigned int)(Val));
diff --git a/compiler-rt/lib/ubsan/ubsan_value.cpp b/compiler-rt/lib/ubsan/ubsan_value.cpp
index 6e88ebaf34d4b..64ec0cc374a3e 100644
--- a/compiler-rt/lib/ubsan/ubsan_value.cpp
+++ b/compiler-rt/lib/ubsan/ubsan_value.cpp
@@ -85,6 +85,14 @@ SIntMax Value::getSIntValue() const {
#else
if (getType().getIntegerBitWidth() == 128)
UNREACHABLE("libclang_rt.ubsan was built without __int128 support");
+#endif
+#if HAVE_INT256_T
+ if (getType().getIntegerBitWidth() == 256)
+ return SIntMax(UIntMax(*reinterpret_cast<s256 *>(Val)) << ExtraBits) >>
+ ExtraBits;
+#else
+ if (getType().getIntegerBitWidth() == 256)
+ UNREACHABLE("libclang_rt.ubsan was built without __int256 support");
#endif
UNREACHABLE("unexpected bit width");
}
@@ -101,6 +109,13 @@ UIntMax Value::getUIntValue() const {
#else
if (getType().getIntegerBitWidth() == 128)
UNREACHABLE("libclang_rt.ubsan was built without __int128 support");
+#endif
+#if HAVE_INT256_T
+ if (getType().getIntegerBitWidth() == 256)
+ return *reinterpret_cast<u256 *>(Val);
+#else
+ if (getType().getIntegerBitWidth() == 256)
+ UNREACHABLE("libclang_rt.ubsan was built without __int256 support");
#endif
UNREACHABLE("unexpected bit width");
}
diff --git a/compiler-rt/lib/ubsan/ubsan_value.h b/compiler-rt/lib/ubsan/ubsan_value.h
index ee523cf5ddda5..9e3699d47f29b 100644
--- a/compiler-rt/lib/ubsan/ubsan_value.h
+++ b/compiler-rt/lib/ubsan/ubsan_value.h
@@ -25,10 +25,21 @@ __extension__ typedef unsigned __int128 u128;
#define HAVE_INT128_T 0
#endif
+#if __SIZEOF_INT256__
+__extension__ typedef __int256_t s256;
+__extension__ typedef __uint256_t u256;
+#define HAVE_INT256_T 1
+#else
+#define HAVE_INT256_T 0
+#endif
+
namespace __ubsan {
/// \brief Largest integer types we support.
-#if HAVE_INT128_T
+#if HAVE_INT256_T
+typedef s256 SIntMax;
+typedef u256 UIntMax;
+#elif HAVE_INT128_T
typedef s128 SIntMax;
typedef u128 UIntMax;
#else
diff --git a/compiler-rt/test/ubsan/TestCases/Integer/int256-overflow.cpp b/compiler-rt/test/ubsan/TestCases/Integer/int256-overflow.cpp
new file mode 100644
index 0000000000000..a87bbd56dd41d
--- /dev/null
+++ b/compiler-rt/test/ubsan/TestCases/Integer/int256-overflow.cpp
@@ -0,0 +1,45 @@
+// REQUIRES: int256
+//
+// RUN: %clangxx -DADD_I256 -fsanitize=signed-integer-overflow %s -o %t1 && %run %t1 2>&1 | FileCheck %s --check-prefix=CHECK-ADD_I256
+// RUN: %clangxx -DSUB_I256 -fsanitize=signed-integer-overflow %s -o %t2 && %run %t2 2>&1 | FileCheck %s --check-prefix=CHECK-SUB_I256
+// RUN: %clangxx -DNEG_I256 -fsanitize=signed-integer-overflow %s -o %t3 && %run %t3 2>&1 | FileCheck %s --check-prefix=CHECK-NEG_I256
+//
+// Test UBSan detection of signed integer overflow for __int256_t.
+
+#include <stdint.h>
+
+int main() {
+#ifdef ADD_I256
+# if defined(__SIZEOF_INT256__)
+ // Overflow: 2^254 + 2^254 = 2^255, which exceeds __int256_t max (2^255 - 1)
+ (void)((__int256_t(1) << 254) + (__int256_t(1) << 254));
+# else
+ // Fallback message for platforms without __int256
+ __builtin_printf("__int256 not supported\n");
+# endif
+ // CHECK-ADD_I256: {{0x[0-9a-f]+ \+ 0x[0-9a-f]+ cannot be represented in type '__int256_t'|__int256 not supported}}
+#endif
+
+#ifdef SUB_I256
+# if defined(__SIZEOF_INT256__)
+ // Overflow: min - 1
+ __int256_t min_val = (__int256_t)1
+ << 255; // This is the minimum (negative) value
+ (void)(min_val - 1);
+# else
+ __builtin_printf("__int256 not supported\n");
+# endif
+ // CHECK-SUB_I256: {{0x[0-9a-f]+ - 1 cannot be represented in type '__int256_t'|__int256 not supported}}
+#endif
+
+#ifdef NEG_I256
+# if defined(__SIZEOF_INT256__)
+ // Overflow: -min = -(-2^255) overflows because max is 2^255 - 1
+ __int256_t min_val = (__int256_t)1 << 255;
+ (void)(-min_val);
+# else
+ __builtin_printf("__int256 not supported\n");
+# endif
+ // CHECK-NEG_I256: {{negation of -?0x[0-9a-f]+ cannot be represented in type '__int256_t'|__int256 not supported}}
+#endif
+}
>From 307e69a423384b8a347ebd77fa0f3d8eb171bf07 Mon Sep 17 00:00:00 2001
From: Xavier Roche <xavier.roche at algolia.com>
Date: Tue, 24 Feb 2026 21:40:37 +0100
Subject: [PATCH 10/17] [libc++] Add __int256/__uint256 support
Wire __int256 through libc++ infrastructure (guarded by _LIBCPP_HAS_INT256):
- Type traits: is_integral, make_signed/unsigned, promote, integer_traits,
make_32_64_128_or_256_bit, convert_to_integral
- numeric_limits<__int256_t/__uint256_t> specialization
- <charconv>: to_chars/from_chars tables and traits for 256-bit
- <format>: format_arg, format_arg_store, formatter_integer support
- <algorithm>: radix_sort key type support
- <ranges>: iota_view difference type
- <bit>: byteswap specialization
- <random>: is_valid integer trait extension
- Module map: add make_32_64_128_or_256_bit.h header
Assisted-by: Claude (Anthropic)
Co-Authored-By: Claude Opus 4.6 <noreply at anthropic.com>
---
libcxx/include/CMakeLists.txt | 2 +-
libcxx/include/__algorithm/radix_sort.h | 7 ++
libcxx/include/__bit/byteswap.h | 5 +
libcxx/include/__charconv/tables.h | 110 ++++++++++++++++++
libcxx/include/__charconv/to_chars_base_10.h | 61 ++++++++++
libcxx/include/__charconv/to_chars_integral.h | 24 +++-
libcxx/include/__charconv/traits.h | 24 ++++
libcxx/include/__config | 6 +
libcxx/include/__format/format_arg.h | 71 ++++++++++-
libcxx/include/__format/format_arg_store.h | 8 ++
libcxx/include/__format/format_functions.h | 14 +++
libcxx/include/__format/formatter_integer.h | 20 +++-
libcxx/include/__random/is_valid.h | 7 ++
libcxx/include/__ranges/iota_view.h | 7 +-
libcxx/include/__type_traits/integer_traits.h | 8 ++
libcxx/include/__type_traits/is_integral.h | 4 +
..._128_bit.h => make_32_64_128_or_256_bit.h} | 18 +--
libcxx/include/__type_traits/make_signed.h | 8 ++
libcxx/include/__type_traits/make_unsigned.h | 8 ++
libcxx/include/__type_traits/promote.h | 4 +
.../include/__utility/convert_to_integral.h | 6 +
libcxx/include/limits | 5 +-
libcxx/include/module.modulemap.in | 2 +-
23 files changed, 411 insertions(+), 18 deletions(-)
rename libcxx/include/__type_traits/{make_32_64_or_128_bit.h => make_32_64_128_or_256_bit.h} (70%)
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 5cdf29b94e3eb..3eb049b59dbf2 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -892,7 +892,7 @@ set(files
__type_traits/is_volatile.h
__type_traits/is_within_lifetime.h
__type_traits/lazy.h
- __type_traits/make_32_64_or_128_bit.h
+ __type_traits/make_32_64_128_or_256_bit.h
__type_traits/make_const_lvalue_ref.h
__type_traits/make_signed.h
__type_traits/make_transparent.h
diff --git a/libcxx/include/__algorithm/radix_sort.h b/libcxx/include/__algorithm/radix_sort.h
index 5549a69f5e220..f40adee79906a 100644
--- a/libcxx/include/__algorithm/radix_sort.h
+++ b/libcxx/include/__algorithm/radix_sort.h
@@ -334,6 +334,13 @@ struct __unsigned_integer_of_size<16> {
};
# endif
+# if _LIBCPP_HAS_INT256
+template <>
+struct __unsigned_integer_of_size<32> {
+ using type _LIBCPP_NODEBUG = unsigned __int256;
+};
+# endif
+
template <size_t _Size>
using __unsigned_integer_of_size_t _LIBCPP_NODEBUG = typename __unsigned_integer_of_size<_Size>::type;
diff --git a/libcxx/include/__bit/byteswap.h b/libcxx/include/__bit/byteswap.h
index 7ce7e069b4142..326ff5dbb9cf5 100644
--- a/libcxx/include/__bit/byteswap.h
+++ b/libcxx/include/__bit/byteswap.h
@@ -41,6 +41,11 @@ template <integral _Tp>
static_cast<_Tp>(byteswap(static_cast<uint64_t>(__val >> 64)));
# endif // __has_builtin(__builtin_bswap128)
# endif // _LIBCPP_HAS_INT128
+# if _LIBCPP_HAS_INT256
+ } else if constexpr (sizeof(_Tp) == 32) {
+ return (static_cast<_Tp>(byteswap(static_cast<__uint128_t>(__val))) << 128) |
+ static_cast<_Tp>(byteswap(static_cast<__uint128_t>(__val >> 128)));
+# endif // _LIBCPP_HAS_INT256
} else {
static_assert(sizeof(_Tp) == 0, "byteswap is unimplemented for integral types of this size");
}
diff --git a/libcxx/include/__charconv/tables.h b/libcxx/include/__charconv/tables.h
index b8c6fd8af0a0f..4e4aa8b4ef68c 100644
--- a/libcxx/include/__charconv/tables.h
+++ b/libcxx/include/__charconv/tables.h
@@ -138,6 +138,116 @@ inline _LIBCPP_CONSTEXPR const __uint128_t __pow10_128[40] = {
(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) * 10};
# endif
+#if _LIBCPP_HAS_INT256
+inline _LIBCPP_CONSTEXPR const int __pow10_256_offset = 0;
+inline _LIBCPP_CONSTEXPR const __uint256_t __pow10_256[78] = {
+ UINT64_C(0),
+ UINT64_C(10),
+ UINT64_C(100),
+ UINT64_C(1000),
+ UINT64_C(10000),
+ UINT64_C(100000),
+ UINT64_C(1000000),
+ UINT64_C(10000000),
+ UINT64_C(100000000),
+ UINT64_C(1000000000),
+ UINT64_C(10000000000),
+ UINT64_C(100000000000),
+ UINT64_C(1000000000000),
+ UINT64_C(10000000000000),
+ UINT64_C(100000000000000),
+ UINT64_C(1000000000000000),
+ UINT64_C(10000000000000000),
+ UINT64_C(100000000000000000),
+ UINT64_C(1000000000000000000),
+ UINT64_C(10000000000000000000),
+ __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10),
+ __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(100),
+ __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(1000),
+ __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000),
+ __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(100000),
+ __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(1000000),
+ __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000),
+ __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(100000000),
+ __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(1000000000),
+ __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000),
+ __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(100000000000),
+ __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(1000000000000),
+ __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000),
+ __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(100000000000000),
+ __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(1000000000000000),
+ __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000),
+ __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(100000000000000000),
+ __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(1000000000000000000),
+ __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000),
+ __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) * 10,
+ __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) * UINT64_C(100),
+ __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) * UINT64_C(1000),
+ __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) * UINT64_C(10000),
+ __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) * UINT64_C(100000),
+ __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) * UINT64_C(1000000),
+ __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) * UINT64_C(10000000),
+ __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) * UINT64_C(100000000),
+ __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) * UINT64_C(1000000000),
+ __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) * UINT64_C(10000000000),
+ __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) * UINT64_C(100000000000),
+ __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) * UINT64_C(1000000000000),
+ __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+ UINT64_C(10000000000000),
+ __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+ UINT64_C(100000000000000),
+ __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+ UINT64_C(1000000000000000),
+ __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+ UINT64_C(10000000000000000),
+ __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+ UINT64_C(100000000000000000),
+ __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+ UINT64_C(1000000000000000000),
+ __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+ UINT64_C(10000000000000000000),
+ __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+ __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10),
+ __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+ __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(100),
+ __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+ __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(1000),
+ __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+ __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000),
+ __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+ __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(100000),
+ __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+ __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(1000000),
+ __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+ __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000),
+ __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+ __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(100000000),
+ __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+ __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(1000000000),
+ __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+ __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000),
+ __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+ __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(100000000000),
+ __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+ __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(1000000000000),
+ __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+ __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000),
+ __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+ __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(100000000000000),
+ __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+ __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(1000000000000000),
+ __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+ __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000),
+ __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+ __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(100000000000000000),
+ __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+ __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(1000000000000000000),
+ __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+ __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000),
+ __uint256_t(__uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000)) *
+ __uint128_t(UINT64_C(10000000000000000000)) * UINT64_C(10000000000000000000) * 10};
+#endif
+
inline _LIBCPP_CONSTEXPR const char __digits_base_10[200] = {
// clang-format off
'0', '0', '0', '1', '0', '2', '0', '3', '0', '4', '0', '5', '0', '6', '0', '7', '0', '8', '0', '9',
diff --git a/libcxx/include/__charconv/to_chars_base_10.h b/libcxx/include/__charconv/to_chars_base_10.h
index d90952ea71f35..15888ac0573d2 100644
--- a/libcxx/include/__charconv/to_chars_base_10.h
+++ b/libcxx/include/__charconv/to_chars_base_10.h
@@ -175,6 +175,67 @@ __base_10_u128(char* __buffer, __uint128_t __value) _NOEXCEPT {
return __buffer;
}
# endif
+
+#if _LIBCPP_HAS_INT256
+/// \returns 10^\a __exp
+///
+/// \pre \a __exp is in the range [0, 77]
+_LIBCPP_CONSTEXPR_SINCE_CXX23 _LIBCPP_HIDE_FROM_ABI inline __uint256_t __pow_10_256(int __exp) _NOEXCEPT {
+ _LIBCPP_ASSERT_INTERNAL(__exp >= __pow10_256_offset, "Index out of bounds");
+ return __pow10_256[__exp - __pow10_256_offset];
+}
+
+_LIBCPP_CONSTEXPR_SINCE_CXX23 _LIBCPP_HIDE_FROM_ABI inline char*
+__base_10_u256(char* __buffer, __uint256_t __value) _NOEXCEPT {
+ _LIBCPP_ASSERT_INTERNAL(
+ __value > numeric_limits<__uint128_t>::max(), "The optimizations for this algorithm fail when this isn't true.");
+
+ // Maximum unsigned values:
+ // 128 bit 340'282'366'920'938'463'463'374'607'431'768'211'455 (39 digits)
+ // 256 bit 115'792'089'237'316'195'423'570'985'008'687'907'853'
+ // 269'984'665'640'564'039'457'584'007'913'129'639'935 (78 digits)
+ //
+ // Strategy: divide into chunks of 19 digits (10^19 fits in uint64_t).
+ // A 256-bit number has at most 78 digits = 4 chunks of 19 + 2 leading digits.
+ // We peel off 19-digit chunks from the bottom using 256-bit division by 10^19.
+
+ __uint256_t __p19 = __pow_10_256(19);
+
+ // Equivalently, that is five chunks in total: four full 19-digit chunks
+ // plus a top chunk holding the remaining (at most 2) digits.
+ uint64_t __c0 = static_cast<uint64_t>(__value % __p19);
+ __value /= __p19;
+ uint64_t __c1 = static_cast<uint64_t>(__value % __p19);
+ __value /= __p19;
+ uint64_t __c2 = static_cast<uint64_t>(__value % __p19);
+ __value /= __p19;
+ uint64_t __c3 = static_cast<uint64_t>(__value % __p19);
+ __value /= __p19;
+ uint64_t __c4 = static_cast<uint64_t>(__value); // at most 2 digits
+
+ // Emit 19-digit zero-padded chunk: [9 digits] + [10 digits]
+ auto __emit_padded = [&](uint64_t __c) {
+ __buffer = __itoa::__append9(__buffer, static_cast<uint32_t>(__c / 10000000000));
+ __buffer = __itoa::__append10(__buffer, __c % 10000000000);
+ };
+
+ // Find the first non-zero chunk and emit it with variable width.
+ if (__c4) {
+ __buffer = __base_10_u64(__buffer, __c4);
+ __emit_padded(__c3);
+ __emit_padded(__c2);
+ } else if (__c3) {
+ __buffer = __base_10_u64(__buffer, __c3);
+ __emit_padded(__c2);
+ } else {
+ __buffer = __base_10_u64(__buffer, __c2);
+ }
+ __emit_padded(__c1);
+ __emit_padded(__c0);
+
+ return __buffer;
+}
+#endif
} // namespace __itoa
_LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__charconv/to_chars_integral.h b/libcxx/include/__charconv/to_chars_integral.h
index 6d425139260b6..66680b2f1be4d 100644
--- a/libcxx/include/__charconv/to_chars_integral.h
+++ b/libcxx/include/__charconv/to_chars_integral.h
@@ -25,7 +25,7 @@
#include <__type_traits/is_integral.h>
#include <__type_traits/is_same.h>
#include <__type_traits/is_signed.h>
-#include <__type_traits/make_32_64_or_128_bit.h>
+#include <__type_traits/make_32_64_128_or_256_bit.h>
#include <__type_traits/make_unsigned.h>
#include <__utility/unreachable.h>
#include <cstdint>
@@ -89,6 +89,24 @@ __to_chars_itoa(char* __first, char* __last, __uint128_t __value, false_type) {
}
# endif
+#if _LIBCPP_HAS_INT256
+template <>
+inline _LIBCPP_CONSTEXPR_SINCE_CXX23
+_LIBCPP_HIDE_FROM_ABI __to_chars_result __to_chars_itoa(char* __first, char* __last, __uint256_t __value, false_type) {
+ // When the value fits in 128 bits, use the 128-bit code path.
+ if (__value <= numeric_limits<__uint128_t>::max())
+ return __to_chars_itoa(__first, __last, static_cast<__uint128_t>(__value), false_type());
+
+ using __tx = __itoa::__traits<__uint256_t>;
+ auto __diff = __last - __first;
+
+ if (__tx::digits <= __diff || __tx::__width(__value) <= __diff)
+ return {__tx::__convert(__first, __value), errc(0)};
+ else
+ return {__last, errc::value_too_large};
+}
+#endif
+
template <class _Tp, __enable_if_t<!is_signed<_Tp>::value, int> = 0>
inline _LIBCPP_CONSTEXPR_SINCE_CXX23 _LIBCPP_HIDE_FROM_ABI __to_chars_result
__to_chars_integral(char* __first, char* __last, _Tp __value, int __base);
@@ -321,7 +339,7 @@ to_chars_result to_chars(char*, char*, bool, int = 10) = delete;
template <typename _Tp, __enable_if_t<is_integral<_Tp>::value, int> = 0>
inline _LIBCPP_CONSTEXPR_SINCE_CXX23 _LIBCPP_HIDE_FROM_ABI to_chars_result
to_chars(char* __first, char* __last, _Tp __value) {
- using _Type = __make_32_64_or_128_bit_t<_Tp>;
+ using _Type = __make_32_64_128_or_256_bit_t<_Tp>;
static_assert(!is_same<_Type, void>::value, "unsupported integral type used in to_chars");
return std::__to_chars_itoa(__first, __last, static_cast<_Type>(__value), is_signed<_Tp>());
}
@@ -331,7 +349,7 @@ inline _LIBCPP_CONSTEXPR_SINCE_CXX23 _LIBCPP_HIDE_FROM_ABI to_chars_result
to_chars(char* __first, char* __last, _Tp __value, int __base) {
_LIBCPP_ASSERT_UNCATEGORIZED(2 <= __base && __base <= 36, "base not in [2, 36]");
- using _Type = __make_32_64_or_128_bit_t<_Tp>;
+ using _Type = __make_32_64_128_or_256_bit_t<_Tp>;
return std::__to_chars_integral(__first, __last, static_cast<_Type>(__value), __base);
}
diff --git a/libcxx/include/__charconv/traits.h b/libcxx/include/__charconv/traits.h
index b8c840d1ebe32..46691ae70db89 100644
--- a/libcxx/include/__charconv/traits.h
+++ b/libcxx/include/__charconv/traits.h
@@ -113,6 +113,30 @@ struct _LIBCPP_HIDDEN __traits_base<_Tp, __enable_if_t<sizeof(_Tp) == sizeof(__u
};
# endif
+#if _LIBCPP_HAS_INT256
+template <typename _Tp>
+struct _LIBCPP_HIDDEN __traits_base<_Tp, __enable_if_t<sizeof(_Tp) == sizeof(__uint256_t)> > {
+ using type = __uint256_t;
+
+ static _LIBCPP_CONSTEXPR_SINCE_CXX23 _LIBCPP_HIDE_FROM_ABI int __width(_Tp __v) {
+ _LIBCPP_ASSERT_INTERNAL(
+ __v > numeric_limits<__uint128_t>::max(), "The optimizations for this algorithm fail when this isn't true.");
+ // There's always a bit set in the upper 128-bits.
+ auto __t = (256 - std::__countl_zero(static_cast<__uint128_t>(__v >> 128))) * 1233 >> 12;
+ _LIBCPP_ASSERT_INTERNAL(__t >= __itoa::__pow10_256_offset, "Index out of bounds");
+ return __t - (__v < __itoa::__pow10_256[__t - __itoa::__pow10_256_offset]) + 1;
+ }
+
+ static _LIBCPP_CONSTEXPR_SINCE_CXX23 _LIBCPP_HIDE_FROM_ABI char* __convert(char* __p, _Tp __v) {
+ return __itoa::__base_10_u256(__p, __v);
+ }
+
+ static _LIBCPP_CONSTEXPR_SINCE_CXX23 _LIBCPP_HIDE_FROM_ABI decltype(__pow10_256)& __pow() {
+ return __itoa::__pow10_256;
+ }
+};
+#endif
+
template <typename _Tp, typename _Up>
_LIBCPP_HIDE_FROM_ABI bool _LIBCPP_CONSTEXPR_SINCE_CXX23 __mul_overflowed(_Tp __a, _Up __b, _Tp& __r) {
static_assert(is_unsigned<_Tp>::value);
diff --git a/libcxx/include/__config b/libcxx/include/__config
index 9cb98bbb59341..ab91ec999d2aa 100644
--- a/libcxx/include/__config
+++ b/libcxx/include/__config
@@ -268,6 +268,12 @@ typedef __char32_t char32_t;
# define _LIBCPP_HAS_INT128 1
# endif
+# if !defined(__SIZEOF_INT256__) || defined(_MSC_VER)
+# define _LIBCPP_HAS_INT256 0
+# else
+# define _LIBCPP_HAS_INT256 1
+# endif
+
# ifdef _LIBCPP_CXX03_LANG
# define _LIBCPP_DECLARE_STRONG_ENUM(x) \
struct _LIBCPP_EXPORTED_FROM_ABI x { \
diff --git a/libcxx/include/__format/format_arg.h b/libcxx/include/__format/format_arg.h
index 19794f0f084ce..3996185029080 100644
--- a/libcxx/include/__format/format_arg.h
+++ b/libcxx/include/__format/format_arg.h
@@ -42,8 +42,8 @@ _LIBCPP_BEGIN_NAMESPACE_STD
namespace __format {
/// The type stored in @ref basic_format_arg.
///
-/// @note The 128-bit types are unconditionally in the list to avoid the values
-/// of the enums to depend on the availability of 128-bit integers.
+/// @note The 128-bit and 256-bit types are unconditionally in the list to
+/// prevent the enum values from depending on the availability of extended integers.
///
/// @note The value is stored as a 5-bit value in the __packed_arg_t_bits. This
/// limits the maximum number of elements to 32.
@@ -65,9 +65,11 @@ enum class __arg_t : uint8_t {
__int,
__long_long,
__i128, // extension
+ __i256, // extension
__unsigned,
__unsigned_long_long,
__u128, // extension
+ __u256, // extension
__float,
__double,
__long_double,
@@ -118,6 +120,12 @@ _LIBCPP_HIDE_FROM_ABI decltype(auto) __visit_format_arg(_Visitor&& __vis, basic_
return std::invoke(std::forward<_Visitor>(__vis), __arg.__value_.__i128_);
# else
__libcpp_unreachable();
+# endif
+ case __format::__arg_t::__i256:
+# if _LIBCPP_HAS_INT256
+ return std::invoke(std::forward<_Visitor>(__vis), __arg.__value_.__i256_);
+# else
+ __libcpp_unreachable();
# endif
case __format::__arg_t::__unsigned:
return std::invoke(std::forward<_Visitor>(__vis), __arg.__value_.__unsigned_);
@@ -128,6 +136,12 @@ _LIBCPP_HIDE_FROM_ABI decltype(auto) __visit_format_arg(_Visitor&& __vis, basic_
return std::invoke(std::forward<_Visitor>(__vis), __arg.__value_.__u128_);
# else
__libcpp_unreachable();
+# endif
+ case __format::__arg_t::__u256:
+# if _LIBCPP_HAS_INT256
+ return std::invoke(std::forward<_Visitor>(__vis), __arg.__value_.__u256_);
+# else
+ __libcpp_unreachable();
# endif
case __format::__arg_t::__float:
return std::invoke(std::forward<_Visitor>(__vis), __arg.__value_.__float_);
@@ -169,6 +183,12 @@ _LIBCPP_HIDE_FROM_ABI _Rp __visit_format_arg(_Visitor&& __vis, basic_format_arg<
return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), __arg.__value_.__i128_);
# else
__libcpp_unreachable();
+# endif
+ case __format::__arg_t::__i256:
+# if _LIBCPP_HAS_INT256
+ return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), __arg.__value_.__i256_);
+# else
+ __libcpp_unreachable();
# endif
case __format::__arg_t::__unsigned:
return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), __arg.__value_.__unsigned_);
@@ -179,6 +199,12 @@ _LIBCPP_HIDE_FROM_ABI _Rp __visit_format_arg(_Visitor&& __vis, basic_format_arg<
return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), __arg.__value_.__u128_);
# else
__libcpp_unreachable();
+# endif
+ case __format::__arg_t::__u256:
+# if _LIBCPP_HAS_INT256
+ return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), __arg.__value_.__u256_);
+# else
+ __libcpp_unreachable();
# endif
case __format::__arg_t::__float:
return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), __arg.__value_.__float_);
@@ -241,6 +267,10 @@ class __basic_format_arg_value {
# if _LIBCPP_HAS_INT128
__int128_t __i128_;
__uint128_t __u128_;
+# endif
+# if _LIBCPP_HAS_INT256
+ __int256_t __i256_;
+ __uint256_t __u256_;
# endif
float __float_;
double __double_;
@@ -265,6 +295,10 @@ class __basic_format_arg_value {
# if _LIBCPP_HAS_INT128
_LIBCPP_HIDE_FROM_ABI __basic_format_arg_value(__int128_t __value) noexcept : __i128_(__value) {}
_LIBCPP_HIDE_FROM_ABI __basic_format_arg_value(__uint128_t __value) noexcept : __u128_(__value) {}
+# endif
+# if _LIBCPP_HAS_INT256
+ _LIBCPP_HIDE_FROM_ABI __basic_format_arg_value(__int256_t __value) noexcept : __i256_(__value) {}
+ _LIBCPP_HIDE_FROM_ABI __basic_format_arg_value(__uint256_t __value) noexcept : __u256_(__value) {}
# endif
_LIBCPP_HIDE_FROM_ABI __basic_format_arg_value(float __value) noexcept : __float_(__value) {}
_LIBCPP_HIDE_FROM_ABI __basic_format_arg_value(double __value) noexcept : __double_(__value) {}
@@ -302,6 +336,17 @@ class _LIBCPP_NO_SPECIALIZATIONS basic_format_arg {
typename __basic_format_arg_value<_Context>::__handle __h{__arg.__value_.__u128_};
return std::invoke(std::forward<_Visitor>(__vis), typename basic_format_arg<_Context>::handle{__h});
}
+# endif
+# if _LIBCPP_HAS_INT256
+ case __format::__arg_t::__i256: {
+ typename __basic_format_arg_value<_Context>::__handle __h{__arg.__value_.__i256_};
+ return std::invoke(std::forward<_Visitor>(__vis), typename basic_format_arg<_Context>::handle{__h});
+ }
+
+ case __format::__arg_t::__u256: {
+ typename __basic_format_arg_value<_Context>::__handle __h{__arg.__value_.__u256_};
+ return std::invoke(std::forward<_Visitor>(__vis), typename basic_format_arg<_Context>::handle{__h});
+ }
# endif
default:
return std::__visit_format_arg(std::forward<_Visitor>(__vis), __arg);
@@ -323,6 +368,17 @@ class _LIBCPP_NO_SPECIALIZATIONS basic_format_arg {
typename __basic_format_arg_value<_Context>::__handle __h{__arg.__value_.__u128_};
return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), typename basic_format_arg<_Context>::handle{__h});
}
+# endif
+# if _LIBCPP_HAS_INT256
+ case __format::__arg_t::__i256: {
+ typename __basic_format_arg_value<_Context>::__handle __h{__arg.__value_.__i256_};
+ return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), typename basic_format_arg<_Context>::handle{__h});
+ }
+
+ case __format::__arg_t::__u256: {
+ typename __basic_format_arg_value<_Context>::__handle __h{__arg.__value_.__u256_};
+ return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), typename basic_format_arg<_Context>::handle{__h});
+ }
# endif
default:
return std::__visit_format_arg<_Rp>(std::forward<_Visitor>(__vis), __arg);
@@ -385,6 +441,17 @@ visit_format_arg(_Visitor&& __vis, basic_format_arg<_Context> __arg) {
return std::invoke(std::forward<_Visitor>(__vis), typename basic_format_arg<_Context>::handle{__h});
}
# endif // _LIBCPP_HAS_INT128
+# if _LIBCPP_HAS_INT256
+ case __format::__arg_t::__i256: {
+ typename __basic_format_arg_value<_Context>::__handle __h{__arg.__value_.__i256_};
+ return std::invoke(std::forward<_Visitor>(__vis), typename basic_format_arg<_Context>::handle{__h});
+ }
+
+ case __format::__arg_t::__u256: {
+ typename __basic_format_arg_value<_Context>::__handle __h{__arg.__value_.__u256_};
+ return std::invoke(std::forward<_Visitor>(__vis), typename basic_format_arg<_Context>::handle{__h});
+ }
+# endif // _LIBCPP_HAS_INT256
default:
return std::__visit_format_arg(std::forward<_Visitor>(__vis), __arg);
}
diff --git a/libcxx/include/__format/format_arg_store.h b/libcxx/include/__format/format_arg_store.h
index fbb4cad21b232..cab2b9736564b 100644
--- a/libcxx/include/__format/format_arg_store.h
+++ b/libcxx/include/__format/format_arg_store.h
@@ -74,6 +74,10 @@ consteval __arg_t __determine_arg_t() {
# if _LIBCPP_HAS_INT128
else if constexpr (sizeof(_Tp) == sizeof(__int128_t))
return __arg_t::__i128;
+# endif
+# if _LIBCPP_HAS_INT256
+ else if constexpr (sizeof(_Tp) == sizeof(__int256_t))
+ return __arg_t::__i256;
# endif
else
static_assert(sizeof(_Tp) == 0, "an unsupported signed integer was used");
@@ -89,6 +93,10 @@ consteval __arg_t __determine_arg_t() {
# if _LIBCPP_HAS_INT128
else if constexpr (sizeof(_Tp) == sizeof(__uint128_t))
return __arg_t::__u128;
+# endif
+# if _LIBCPP_HAS_INT256
+ else if constexpr (sizeof(_Tp) == sizeof(__uint256_t))
+ return __arg_t::__u256;
# endif
else
static_assert(sizeof(_Tp) == 0, "an unsupported unsigned integer was used");
diff --git a/libcxx/include/__format/format_functions.h b/libcxx/include/__format/format_functions.h
index 873265bc17c24..cf79838b50c2b 100644
--- a/libcxx/include/__format/format_functions.h
+++ b/libcxx/include/__format/format_functions.h
@@ -212,6 +212,13 @@ _LIBCPP_HIDE_FROM_ABI constexpr void __compile_time_visit_format_arg(
return __format::__compile_time_validate_argument<_CharT, __int128_t>(__parse_ctx, __ctx);
# else
std::__throw_format_error("Invalid argument");
+# endif
+ return;
+ case __arg_t::__i256:
+# if _LIBCPP_HAS_INT256
+ return __format::__compile_time_validate_argument<_CharT, __int256_t>(__parse_ctx, __ctx);
+# else
+ std::__throw_format_error("Invalid argument");
# endif
return;
case __arg_t::__unsigned:
@@ -223,6 +230,13 @@ _LIBCPP_HIDE_FROM_ABI constexpr void __compile_time_visit_format_arg(
return __format::__compile_time_validate_argument<_CharT, __uint128_t>(__parse_ctx, __ctx);
# else
std::__throw_format_error("Invalid argument");
+# endif
+ return;
+ case __arg_t::__u256:
+# if _LIBCPP_HAS_INT256
+ return __format::__compile_time_validate_argument<_CharT, __uint256_t>(__parse_ctx, __ctx);
+# else
+ std::__throw_format_error("Invalid argument");
# endif
return;
case __arg_t::__float:
diff --git a/libcxx/include/__format/formatter_integer.h b/libcxx/include/__format/formatter_integer.h
index cf186c64e3d0f..b39194195fe2d 100644
--- a/libcxx/include/__format/formatter_integer.h
+++ b/libcxx/include/__format/formatter_integer.h
@@ -19,7 +19,7 @@
#include <__format/formatter_output.h>
#include <__format/parser_std_format_spec.h>
#include <__type_traits/is_void.h>
-#include <__type_traits/make_32_64_or_128_bit.h>
+#include <__type_traits/make_32_64_128_or_256_bit.h>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
# pragma GCC system_header
@@ -46,7 +46,7 @@ struct __formatter_integer {
if (__specs.__std_.__type_ == __format_spec::__type::__char)
return __formatter::__format_char(__value, __ctx.out(), __specs);
- using _Type = __make_32_64_or_128_bit_t<_Tp>;
+ using _Type = __make_32_64_128_or_256_bit_t<_Tp>;
static_assert(!is_void<_Type>::value, "unsupported integral type used in __formatter_integer::__format");
// Reduce the number of instantiation of the integer formatter
@@ -71,6 +71,10 @@ struct formatter<long long, _CharT> : public __formatter_integer<_CharT> {};
template <__fmt_char_type _CharT>
struct formatter<__int128_t, _CharT> : public __formatter_integer<_CharT> {};
# endif
+# if _LIBCPP_HAS_INT256
+template <__fmt_char_type _CharT>
+struct formatter<__int256_t, _CharT> : public __formatter_integer<_CharT> {};
+# endif
// Unsigned integral types.
template <__fmt_char_type _CharT>
@@ -87,6 +91,10 @@ struct formatter<unsigned long long, _CharT> : public __formatter_integer<_CharT
template <__fmt_char_type _CharT>
struct formatter<__uint128_t, _CharT> : public __formatter_integer<_CharT> {};
# endif
+# if _LIBCPP_HAS_INT256
+template <__fmt_char_type _CharT>
+struct formatter<__uint256_t, _CharT> : public __formatter_integer<_CharT> {};
+# endif
# if _LIBCPP_STD_VER >= 23
template <>
@@ -103,6 +111,10 @@ inline constexpr bool enable_nonlocking_formatter_optimization<long long> = true
template <>
inline constexpr bool enable_nonlocking_formatter_optimization<__int128_t> = true;
# endif
+# if _LIBCPP_HAS_INT256
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<__int256_t> = true;
+# endif
template <>
inline constexpr bool enable_nonlocking_formatter_optimization<unsigned char> = true;
@@ -118,6 +130,10 @@ inline constexpr bool enable_nonlocking_formatter_optimization<unsigned long lon
template <>
inline constexpr bool enable_nonlocking_formatter_optimization<__uint128_t> = true;
# endif
+# if _LIBCPP_HAS_INT256
+template <>
+inline constexpr bool enable_nonlocking_formatter_optimization<__uint256_t> = true;
+# endif
# endif // _LIBCPP_STD_VER >= 23
#endif // _LIBCPP_STD_VER >= 20
diff --git a/libcxx/include/__random/is_valid.h b/libcxx/include/__random/is_valid.h
index f6679b3fdc427..af8be6ddc5ad6 100644
--- a/libcxx/include/__random/is_valid.h
+++ b/libcxx/include/__random/is_valid.h
@@ -73,6 +73,13 @@ template <>
struct __libcpp_random_is_valid_inttype<__uint128_t> : true_type {}; // extension
#endif // _LIBCPP_HAS_INT128
+#if _LIBCPP_HAS_INT256
+template <>
+struct __libcpp_random_is_valid_inttype<__int256_t> : true_type {}; // extension
+template <>
+struct __libcpp_random_is_valid_inttype<__uint256_t> : true_type {}; // extension
+#endif // _LIBCPP_HAS_INT256
+
// [rand.req.urng]/3:
// A class G meets the uniform random bit generator requirements if G models
// uniform_random_bit_generator, invoke_result_t<G&> is an unsigned integer type,
diff --git a/libcxx/include/__ranges/iota_view.h b/libcxx/include/__ranges/iota_view.h
index 6b2576ec6b23d..83a14883c485d 100644
--- a/libcxx/include/__ranges/iota_view.h
+++ b/libcxx/include/__ranges/iota_view.h
@@ -60,7 +60,12 @@ struct __get_wider_signed {
return type_identity<long>{};
else if constexpr (sizeof(_Int) < sizeof(long long))
return type_identity<long long>{};
-# if _LIBCPP_HAS_INT128
+# if _LIBCPP_HAS_INT256
+ else if constexpr (sizeof(_Int) < sizeof(__int128))
+ return type_identity<__int128>{};
+ else if constexpr (sizeof(_Int) <= sizeof(__int256))
+ return type_identity<__int256>{};
+# elif _LIBCPP_HAS_INT128
else if constexpr (sizeof(_Int) <= sizeof(__int128))
return type_identity<__int128>{};
# else
diff --git a/libcxx/include/__type_traits/integer_traits.h b/libcxx/include/__type_traits/integer_traits.h
index fad502c44e301..c0d1937e2935e 100644
--- a/libcxx/include/__type_traits/integer_traits.h
+++ b/libcxx/include/__type_traits/integer_traits.h
@@ -35,6 +35,10 @@ inline const bool __is_signed_integer_v<signed long long> = true;
template <>
inline const bool __is_signed_integer_v<__int128_t> = true;
#endif
+#if _LIBCPP_HAS_INT256
+template <>
+inline const bool __is_signed_integer_v<__int256_t> = true;
+#endif
// This trait is to determine whether a type is an /unsigned integer type/
// See [basic.fundamental]/p2
@@ -54,6 +58,10 @@ inline const bool __is_unsigned_integer_v<unsigned long long> = true;
template <>
inline const bool __is_unsigned_integer_v<__uint128_t> = true;
#endif
+#if _LIBCPP_HAS_INT256
+template <>
+inline const bool __is_unsigned_integer_v<__uint256_t> = true;
+#endif
#if _LIBCPP_STD_VER >= 20
template <class _Tp>
diff --git a/libcxx/include/__type_traits/is_integral.h b/libcxx/include/__type_traits/is_integral.h
index 5a340965f0384..8f982842d54bb 100644
--- a/libcxx/include/__type_traits/is_integral.h
+++ b/libcxx/include/__type_traits/is_integral.h
@@ -57,6 +57,10 @@ template <> struct __libcpp_is_integral<unsigned long long> { enum { va
template <> struct __libcpp_is_integral<__int128_t> { enum { value = 1 }; };
template <> struct __libcpp_is_integral<__uint128_t> { enum { value = 1 }; };
#endif
+#if _LIBCPP_HAS_INT256
+template <> struct __libcpp_is_integral<__int256_t> { enum { value = 1 }; };
+template <> struct __libcpp_is_integral<__uint256_t> { enum { value = 1 }; };
+#endif
// clang-format on
template <class _Tp>
diff --git a/libcxx/include/__type_traits/make_32_64_or_128_bit.h b/libcxx/include/__type_traits/make_32_64_128_or_256_bit.h
similarity index 70%
rename from libcxx/include/__type_traits/make_32_64_or_128_bit.h
rename to libcxx/include/__type_traits/make_32_64_128_or_256_bit.h
index 7016209ec9c0a..f4de69017033c 100644
--- a/libcxx/include/__type_traits/make_32_64_or_128_bit.h
+++ b/libcxx/include/__type_traits/make_32_64_128_or_256_bit.h
@@ -6,8 +6,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef _LIBCPP___TYPE_TRAITS_MAKE_32_64_OR_128_BIT_H
-#define _LIBCPP___TYPE_TRAITS_MAKE_32_64_OR_128_BIT_H
+#ifndef _LIBCPP___TYPE_TRAITS_MAKE_32_64_128_OR_256_BIT_H
+#define _LIBCPP___TYPE_TRAITS_MAKE_32_64_128_OR_256_BIT_H
#include <__config>
#include <__type_traits/conditional.h>
@@ -23,19 +23,23 @@
_LIBCPP_BEGIN_NAMESPACE_STD
-/// Helper to promote an integral to smallest 32, 64, or 128 bit representation.
+/// Helper to promote an integral to smallest 32, 64, 128, or 256 bit representation.
///
-/// The restriction is the same as the integral version of to_char.
+/// The restriction is the same as the integral version of to_chars.
template <class _Tp>
#if _LIBCPP_STD_VER >= 20
requires(is_signed_v<_Tp> || is_unsigned_v<_Tp> || is_same_v<_Tp, char>)
#endif
// clang-format off
-using __make_32_64_or_128_bit_t _LIBCPP_NODEBUG =
+using __make_32_64_128_or_256_bit_t _LIBCPP_NODEBUG =
__copy_unsigned_t<_Tp,
__conditional_t<sizeof(_Tp) <= sizeof(int32_t), int32_t,
__conditional_t<sizeof(_Tp) <= sizeof(int64_t), int64_t,
-#if _LIBCPP_HAS_INT128
+#if _LIBCPP_HAS_INT256
+ __conditional_t<sizeof(_Tp) <= sizeof(__int128_t), __int128_t,
+ __conditional_t<sizeof(_Tp) <= sizeof(__int256_t), __int256_t,
+ /* else */ void> >
+#elif _LIBCPP_HAS_INT128
__conditional_t<sizeof(_Tp) <= sizeof(__int128_t), __int128_t,
/* else */ void>
#else
@@ -46,4 +50,4 @@ using __make_32_64_or_128_bit_t _LIBCPP_NODEBUG =
_LIBCPP_END_NAMESPACE_STD
-#endif // _LIBCPP___TYPE_TRAITS_MAKE_32_64_OR_128_BIT_H
+#endif // _LIBCPP___TYPE_TRAITS_MAKE_32_64_128_OR_256_BIT_H
diff --git a/libcxx/include/__type_traits/make_signed.h b/libcxx/include/__type_traits/make_signed.h
index dff23d880dc30..794de061403bd 100644
--- a/libcxx/include/__type_traits/make_signed.h
+++ b/libcxx/include/__type_traits/make_signed.h
@@ -37,6 +37,10 @@ using __signed_types =
# if _LIBCPP_HAS_INT128
,
__int128_t
+# endif
+# if _LIBCPP_HAS_INT256
+ ,
+ __int256_t
# endif
>;
@@ -62,6 +66,10 @@ template <> struct __make_signed<unsigned long long, true> {typedef long long ty
template <> struct __make_signed<__int128_t, true> {typedef __int128_t type;};
template <> struct __make_signed<__uint128_t, true> {typedef __int128_t type;};
# endif
+# if _LIBCPP_HAS_INT256
+template <> struct __make_signed<__int256_t, true> {typedef __int256_t type;};
+template <> struct __make_signed<__uint256_t, true> {typedef __int256_t type;};
+# endif
// clang-format on
template <class _Tp>
diff --git a/libcxx/include/__type_traits/make_unsigned.h b/libcxx/include/__type_traits/make_unsigned.h
index a83baa658e294..a8bb71bfc8314 100644
--- a/libcxx/include/__type_traits/make_unsigned.h
+++ b/libcxx/include/__type_traits/make_unsigned.h
@@ -39,6 +39,10 @@ using __unsigned_types =
# if _LIBCPP_HAS_INT128
,
__uint128_t
+# endif
+# if _LIBCPP_HAS_INT256
+ ,
+ __uint256_t
# endif
>;
@@ -64,6 +68,10 @@ template <> struct __make_unsigned<unsigned long long, true> {typedef unsigned l
template <> struct __make_unsigned<__int128_t, true> {typedef __uint128_t type;};
template <> struct __make_unsigned<__uint128_t, true> {typedef __uint128_t type;};
# endif
+# if _LIBCPP_HAS_INT256
+template <> struct __make_unsigned<__int256_t, true> {typedef __uint256_t type;};
+template <> struct __make_unsigned<__uint256_t, true> {typedef __uint256_t type;};
+# endif
// clang-format on
template <class _Tp>
diff --git a/libcxx/include/__type_traits/promote.h b/libcxx/include/__type_traits/promote.h
index 96b4903032b18..94a535a3d0d04 100644
--- a/libcxx/include/__type_traits/promote.h
+++ b/libcxx/include/__type_traits/promote.h
@@ -31,6 +31,10 @@ double __promote_impl(unsigned long long);
double __promote_impl(__int128_t);
double __promote_impl(__uint128_t);
#endif
+#if _LIBCPP_HAS_INT256
+double __promote_impl(__int256_t);
+double __promote_impl(__uint256_t);
+#endif
double __promote_impl(double);
long double __promote_impl(long double);
diff --git a/libcxx/include/__utility/convert_to_integral.h b/libcxx/include/__utility/convert_to_integral.h
index c8149b7744984..802f9e1c5a0ec 100644
--- a/libcxx/include/__utility/convert_to_integral.h
+++ b/libcxx/include/__utility/convert_to_integral.h
@@ -48,6 +48,12 @@ inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR __int128_t __convert_to_integral(
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR __uint128_t __convert_to_integral(__uint128_t __val) { return __val; }
#endif
+#if _LIBCPP_HAS_INT256
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR __int256_t __convert_to_integral(__int256_t __val) { return __val; }
+
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR __uint256_t __convert_to_integral(__uint256_t __val) { return __val; }
+#endif
+
template <class _Tp, bool = is_enum<_Tp>::value>
struct __sfinae_underlying_type {
using type = __underlying_type_t<_Tp>;
diff --git a/libcxx/include/limits b/libcxx/include/limits
index ff40d2051d06f..0e34bb49220ea 100644
--- a/libcxx/include/limits
+++ b/libcxx/include/limits
@@ -186,7 +186,10 @@ protected:
static _LIBCPP_CONSTEXPR const bool is_signed = type(-1) < type(0);
static _LIBCPP_CONSTEXPR const int digits = static_cast<int>(sizeof(type) * __CHAR_BIT__ - is_signed);
- static _LIBCPP_CONSTEXPR const int digits10 = digits * 3 / 10;
+ // floor(digits * log10(2)); 301/1000 approximates log10(2) = 0.30103...
+ // more accurately than 3/10 = 0.3, which under-counts at 256+ bits.
+ // Exact for every power-of-two width through 2048 (and the matching signed
+ // width-1 digit counts), covering all supported types; the first mismatch is
+ // at 196 bits (58 vs the exact 59), far from any instantiated width.
+ static _LIBCPP_CONSTEXPR const int digits10 = digits * 301 / 1000;
static _LIBCPP_CONSTEXPR const int max_digits10 = 0;
static _LIBCPP_CONSTEXPR const type __min = is_signed ? _Tp(_Tp(1) << digits) : 0;
static _LIBCPP_CONSTEXPR const type __max = is_signed ? type(type(~0) ^ __min) : type(~0);
diff --git a/libcxx/include/module.modulemap.in b/libcxx/include/module.modulemap.in
index 9012ed18cbd79..9a779c7c28f94 100644
--- a/libcxx/include/module.modulemap.in
+++ b/libcxx/include/module.modulemap.in
@@ -343,7 +343,7 @@ module std_core [system] {
}
module is_within_lifetime { header "__type_traits/is_within_lifetime.h" }
module lazy { header "__type_traits/lazy.h" }
- module make_32_64_or_128_bit { header "__type_traits/make_32_64_or_128_bit.h" }
+ module make_32_64_128_or_256_bit { header "__type_traits/make_32_64_128_or_256_bit.h" }
module make_const_lvalue_ref { header "__type_traits/make_const_lvalue_ref.h" }
module make_signed { header "__type_traits/make_signed.h" }
module make_transparent { header "__type_traits/make_transparent.h" }
>From d8baaf3f131d46005e5ccf51515d89e8531115ef Mon Sep 17 00:00:00 2001
From: Xavier Roche <xavier.roche at algolia.com>
Date: Tue, 24 Feb 2026 21:40:53 +0100
Subject: [PATCH 11/17] [libc++][test] Add __int256 tests
Add libc++ test coverage for __int256 support:
- Type traits: is_integral, is_signed/unsigned, make_signed/unsigned,
integer concepts (__libcpp_integer, __libcpp_signed/unsigned_integer),
is_always_bitcastable
- numeric_limits: all members (min, max, digits, digits10, etc.)
- charconv: to_chars/from_chars for various bases
- format: format_arg type enum, integer formatting
- bit operations: byteswap, countl_zero, countr_zero, popcount
- Containers: hash<__int256_t> and hash<__uint256_t>
- convert_to_integral
- Update test_macros.h with TEST_HAS_NO_INT256 feature macro
Assisted-by: Claude (Anthropic)
Co-Authored-By: Claude Opus 4.6 <noreply at anthropic.com>
---
.../__libcpp_integer.compile.pass.cpp | 6 +
.../__libcpp_signed_integer.compile.pass.cpp | 6 +
...__libcpp_unsigned_integer.compile.pass.cpp | 6 +
.../containers/unord/hash/int256.pass.cpp | 136 ++++++++++
.../numerics/bit.ops/int256.byteswap.pass.cpp | 56 ++++
.../libcxx/numerics/bit.ops/int256.pass.cpp | 99 ++++++++
.../charconv/int256.from_chars.pass.cpp | 240 ++++++++++++++++++
.../libcxx/numerics/charconv/int256.pass.cpp | 87 +++++++
.../numerics/numeric.limits/int256.pass.cpp | 211 +++++++++++++++
.../type_traits/convert_to_integral.pass.cpp | 4 +
.../test/libcxx/type_traits/int256.pass.cpp | 93 +++++++
.../is_always_bitcastable.compile.pass.cpp | 5 +
.../format.arg/arg_t.compile.pass.cpp | 24 +-
.../format/format.arguments/int256.pass.cpp | 119 +++++++++
.../format.context/types.compile.pass.cpp | 6 +
libcxx/test/support/test_macros.h | 8 +
16 files changed, 1096 insertions(+), 10 deletions(-)
create mode 100644 libcxx/test/libcxx/containers/unord/hash/int256.pass.cpp
create mode 100644 libcxx/test/libcxx/numerics/bit.ops/int256.byteswap.pass.cpp
create mode 100644 libcxx/test/libcxx/numerics/bit.ops/int256.pass.cpp
create mode 100644 libcxx/test/libcxx/numerics/charconv/int256.from_chars.pass.cpp
create mode 100644 libcxx/test/libcxx/numerics/charconv/int256.pass.cpp
create mode 100644 libcxx/test/libcxx/numerics/numeric.limits/int256.pass.cpp
create mode 100644 libcxx/test/libcxx/type_traits/int256.pass.cpp
create mode 100644 libcxx/test/libcxx/utilities/format/format.arguments/int256.pass.cpp
diff --git a/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_integer.compile.pass.cpp b/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_integer.compile.pass.cpp
index 4958a258137a1..b0380e3d9368f 100644
--- a/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_integer.compile.pass.cpp
+++ b/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_integer.compile.pass.cpp
@@ -33,6 +33,9 @@ static_assert(std::__signed_or_unsigned_integer<unsigned short int>);
#if _LIBCPP_HAS_INT128
static_assert(std::__signed_or_unsigned_integer<__uint128_t>);
#endif
+#if _LIBCPP_HAS_INT256
+static_assert(std::__signed_or_unsigned_integer<__uint256_t>);
+#endif
// Signed
static_assert(std::__signed_or_unsigned_integer<signed char>);
static_assert(std::__signed_or_unsigned_integer<short int>);
@@ -43,6 +46,9 @@ static_assert(std::__signed_or_unsigned_integer<short int>);
#if _LIBCPP_HAS_INT128
static_assert(std::__signed_or_unsigned_integer<__int128_t>);
#endif
+#if _LIBCPP_HAS_INT256
+static_assert(std::__signed_or_unsigned_integer<__int256_t>);
+#endif
// Non-integer
static_assert(!std::__signed_or_unsigned_integer<bool>);
static_assert(!std::__signed_or_unsigned_integer<char>);
diff --git a/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_signed_integer.compile.pass.cpp b/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_signed_integer.compile.pass.cpp
index 3fa342685770c..8296b1abe4658 100644
--- a/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_signed_integer.compile.pass.cpp
+++ b/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_signed_integer.compile.pass.cpp
@@ -33,6 +33,9 @@ static_assert(!std::__signed_integer<unsigned short int>);
#if _LIBCPP_HAS_INT128
static_assert(!std::__signed_integer<__uint128_t>);
#endif
+#if _LIBCPP_HAS_INT256
+static_assert(!std::__signed_integer<__uint256_t>);
+#endif
// Signed
static_assert(std::__signed_integer<signed char>);
static_assert(std::__signed_integer<short int>);
@@ -43,6 +46,9 @@ static_assert(std::__signed_integer<short int>);
#if _LIBCPP_HAS_INT128
static_assert(std::__signed_integer<__int128_t>);
#endif
+#if _LIBCPP_HAS_INT256
+static_assert(std::__signed_integer<__int256_t>);
+#endif
// Non-integer
static_assert(!std::__signed_integer<bool>);
static_assert(!std::__signed_integer<char>);
diff --git a/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_unsigned_integer.compile.pass.cpp b/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_unsigned_integer.compile.pass.cpp
index ff60f32319171..48dcceb1f8924 100644
--- a/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_unsigned_integer.compile.pass.cpp
+++ b/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_unsigned_integer.compile.pass.cpp
@@ -33,6 +33,9 @@ static_assert(std::__unsigned_integer<unsigned short int>);
#if _LIBCPP_HAS_INT128
static_assert(std::__unsigned_integer<__uint128_t>);
#endif
+#if _LIBCPP_HAS_INT256
+static_assert(std::__unsigned_integer<__uint256_t>);
+#endif
// Signed
static_assert(!std::__unsigned_integer<signed char>);
static_assert(!std::__unsigned_integer<short int>);
@@ -43,6 +46,9 @@ static_assert(!std::__unsigned_integer<short int>);
#if _LIBCPP_HAS_INT128
static_assert(!std::__unsigned_integer<__int128_t>);
#endif
+#if _LIBCPP_HAS_INT256
+static_assert(!std::__unsigned_integer<__int256_t>);
+#endif
// Non-integer
static_assert(!std::__unsigned_integer<bool>);
static_assert(!std::__unsigned_integer<char>);
diff --git a/libcxx/test/libcxx/containers/unord/hash/int256.pass.cpp b/libcxx/test/libcxx/containers/unord/hash/int256.pass.cpp
new file mode 100644
index 0000000000000..8c0a039535eb6
--- /dev/null
+++ b/libcxx/test/libcxx/containers/unord/hash/int256.pass.cpp
@@ -0,0 +1,136 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03
+
+// Test std::hash specialization for __int256_t / __uint256_t.
+//
+// The generic __hash_impl dispatches to __scalar_hash<_Tp, N> where
+// N = sizeof(_Tp) / sizeof(size_t). For __int256_t on 64-bit platforms,
+// N = 32/8 = 4, using __scalar_hash<_Tp, 4> which hashes via __hash_memory.
+
+#include <functional>
+#include <unordered_map>
+#include <unordered_set>
+
+#include "test_macros.h"
+
+#ifdef TEST_HAS_NO_INT256
+int main(int, char**) { return 0; }
+#else
+
+int main(int, char**) {
+ std::hash<__int256_t> h_s;
+ std::hash<__uint256_t> h_u;
+
+ // --- Basic consistency: same input always gives same output ---
+ {
+ __int256_t a = 42;
+ if (h_s(a) != h_s(a))
+ return 1;
+
+ __uint256_t b = 42;
+ if (h_u(b) != h_u(b))
+ return 2;
+ }
+
+ // --- Different values should (usually) give different hashes ---
+ {
+ __uint256_t a = 0;
+ __uint256_t b = 1;
+ __uint256_t c = (__uint256_t)1 << 128;
+ __uint256_t d = (__uint256_t)1 << 255;
+
+ // We can't guarantee different hashes for all pairs (pigeonhole),
+ // but for these carefully chosen values it's astronomically unlikely
+ // that all four hash to the same value.
+ size_t ha = h_u(a);
+ size_t hb = h_u(b);
+ size_t hc = h_u(c);
+ size_t hd = h_u(d);
+
+ // At least 2 of the 4 hashes should be distinct
+ int distinct = 1;
+ if (hb != ha)
+ ++distinct;
+ if (hc != ha && hc != hb)
+ ++distinct;
+ if (hd != ha && hd != hb && hd != hc)
+ ++distinct;
+ if (distinct < 2)
+ return 3;
+ }
+
+ // --- Zero and negative values ---
+ {
+ __int256_t zero = 0;
+ __int256_t neg = -1;
+ // Hash of 0 and -1 should differ (very high probability)
+ if (h_s(zero) == h_s(neg)) {
+ // Allow this in theory, but verify the hash function is callable
+ (void)h_s(zero);
+ }
+ }
+
+ // --- Large values near max ---
+ {
+ __uint256_t max_val = ~(__uint256_t)0;
+ __uint256_t max_minus_1 = max_val - 1;
+ // These should produce valid hash values (no crash)
+ size_t h1 = h_u(max_val);
+ size_t h2 = h_u(max_minus_1);
+ (void)h1;
+ (void)h2;
+ }
+
+ // --- std::unordered_set with __uint256_t keys ---
+ {
+ std::unordered_set<__uint256_t> s;
+ s.insert(0);
+ s.insert(1);
+ s.insert((__uint256_t)1 << 128);
+ s.insert(~(__uint256_t)0);
+
+ if (s.size() != 4)
+ return 4;
+ if (s.count(0) != 1)
+ return 5;
+ if (s.count(1) != 1)
+ return 6;
+ if (s.count(2) != 0)
+ return 7;
+ }
+
+ // --- std::unordered_map with __int256_t keys ---
+ {
+ std::unordered_map<__int256_t, int> m;
+ m[0] = 10;
+ m[-1] = 20;
+ m[(__int256_t)1 << 200] = 30;
+
+ if (m.size() != 3)
+ return 8;
+ if (m[0] != 10)
+ return 9;
+ if (m[-1] != 20)
+ return 10;
+ }
+
+ // --- Signed and unsigned hash independence ---
+ // hash<__int256_t>(42) and hash<__uint256_t>(42) may or may not be equal
+ // (implementation defined), but both must be callable
+ {
+ __int256_t sv = 42;
+ __uint256_t uv = 42;
+ (void)h_s(sv);
+ (void)h_u(uv);
+ }
+
+ return 0;
+}
+#endif
diff --git a/libcxx/test/libcxx/numerics/bit.ops/int256.byteswap.pass.cpp b/libcxx/test/libcxx/numerics/bit.ops/int256.byteswap.pass.cpp
new file mode 100644
index 0000000000000..fc6e5957bf83d
--- /dev/null
+++ b/libcxx/test/libcxx/numerics/bit.ops/int256.byteswap.pass.cpp
@@ -0,0 +1,56 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// Test std::byteswap with __int256_t and __uint256_t
+
+#include <bit>
+#include <cassert>
+
+#include "test_macros.h"
+
+#ifdef TEST_HAS_NO_INT256
+int main(int, char**) { return 0; }
+#else
+
+constexpr __uint256_t
+make256(unsigned long long h3, unsigned long long h2, unsigned long long h1, unsigned long long h0) {
+ __uint256_t v = (__uint256_t)h3;
+ v = (v << 64) | (__uint256_t)h2;
+ v = (v << 64) | (__uint256_t)h1;
+ v = (v << 64) | (__uint256_t)h0;
+ return v;
+}
+
+// Constexpr tests
+static_assert(std::byteswap((__uint256_t)0) == (__uint256_t)0);
+static_assert(std::byteswap(~(__uint256_t)0) == ~(__uint256_t)0);
+
+// Known pattern: bytes 01 02 03 ... 20 reversed: 20 1F 1E ... 01
+static_assert(std::byteswap(make256(0x0102030405060708, 0x090A0B0C0D0E0F10, 0x1112131415161718, 0x191A1B1C1D1E1F20)) ==
+ make256(0x201F1E1D1C1B1A19, 0x1817161514131211, 0x100F0E0D0C0B0A09, 0x0807060504030201));
+
+// Double byteswap is identity
+static_assert(std::byteswap(std::byteswap(make256(0xDEADBEEF, 0xCAFEBABE, 0x12345678, 0x9ABCDEF0))) ==
+ make256(0xDEADBEEF, 0xCAFEBABE, 0x12345678, 0x9ABCDEF0));
+
+// Signed byteswap compiles
+static_assert(std::byteswap((__int256_t)0) == (__int256_t)0);
+
+int main(int, char**) {
+ // Runtime verification
+ __uint256_t val = make256(0x0102030405060708, 0x090A0B0C0D0E0F10, 0x1112131415161718, 0x191A1B1C1D1E1F20);
+ __uint256_t swapped = std::byteswap(val);
+ __uint256_t expected = make256(0x201F1E1D1C1B1A19, 0x1817161514131211, 0x100F0E0D0C0B0A09, 0x0807060504030201);
+ assert(swapped == expected);
+ assert(std::byteswap(swapped) == val);
+
+ return 0;
+}
+#endif
diff --git a/libcxx/test/libcxx/numerics/bit.ops/int256.pass.cpp b/libcxx/test/libcxx/numerics/bit.ops/int256.pass.cpp
new file mode 100644
index 0000000000000..f4a7de8378bea
--- /dev/null
+++ b/libcxx/test/libcxx/numerics/bit.ops/int256.pass.cpp
@@ -0,0 +1,99 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+// Test <bit> operations with __uint256_t (Tier 2 -- key for Hamming distance)
+
+#include <bit>
+#include <limits>
+#include <type_traits>
+
+#include "test_macros.h"
+
+#ifdef TEST_HAS_NO_INT256
+int main(int, char**) { return 0; }
+#else
+
+// std::popcount -- the core operation for Hamming distance in neural search
+static_assert(std::popcount((__uint256_t)0) == 0);
+static_assert(std::popcount((__uint256_t)1) == 1);
+static_assert(std::popcount((__uint256_t)0xFF) == 8);
+static_assert(std::popcount((__uint256_t)0xFFFFFFFFFFFFFFFF) == 64);
+
+// std::countl_zero
+static_assert(std::countl_zero((__uint256_t)0) == 256);
+static_assert(std::countl_zero((__uint256_t)1) == 255);
+
+// std::countr_zero
+static_assert(std::countr_zero((__uint256_t)0) == 256);
+static_assert(std::countr_zero((__uint256_t)1) == 0);
+static_assert(std::countr_zero((__uint256_t)2) == 1);
+
+// std::countl_one
+static_assert(std::countl_one((__uint256_t)0) == 0);
+
+// std::countr_one
+static_assert(std::countr_one((__uint256_t)0) == 0);
+static_assert(std::countr_one((__uint256_t)1) == 1);
+static_assert(std::countr_one((__uint256_t)0xFF) == 8);
+
+// std::has_single_bit
+static_assert(std::has_single_bit((__uint256_t)1));
+static_assert(std::has_single_bit((__uint256_t)2));
+static_assert(std::has_single_bit((__uint256_t)4));
+static_assert(!std::has_single_bit((__uint256_t)3));
+static_assert(!std::has_single_bit((__uint256_t)0));
+
+// std::bit_width
+static_assert(std::bit_width((__uint256_t)0) == 0);
+static_assert(std::bit_width((__uint256_t)1) == 1);
+static_assert(std::bit_width((__uint256_t)2) == 2);
+static_assert(std::bit_width((__uint256_t)255) == 8);
+
+// std::rotl / std::rotr
+static_assert(std::rotl((__uint256_t)1, 1) == 2);
+static_assert(std::rotl((__uint256_t)1, 64) == ((__uint256_t)1 << 64));
+static_assert(std::rotl((__uint256_t)1, 255) == ((__uint256_t)1 << 255));
+static_assert(std::rotr((__uint256_t)2, 1) == 1);
+static_assert(std::rotr((__uint256_t)1, 1) == ((__uint256_t)1 << 255));
+static_assert(std::rotl(std::rotr((__uint256_t)0xFF, 4), 4) == 0xFF);
+
+// std::bit_ceil
+static_assert(std::bit_ceil((__uint256_t)0) == 1);
+static_assert(std::bit_ceil((__uint256_t)1) == 1);
+static_assert(std::bit_ceil((__uint256_t)2) == 2);
+static_assert(std::bit_ceil((__uint256_t)3) == 4);
+static_assert(std::bit_ceil((__uint256_t)255) == 256);
+
+// std::bit_floor
+static_assert(std::bit_floor((__uint256_t)0) == 0);
+static_assert(std::bit_floor((__uint256_t)1) == 1);
+static_assert(std::bit_floor((__uint256_t)2) == 2);
+static_assert(std::bit_floor((__uint256_t)3) == 2);
+static_assert(std::bit_floor((__uint256_t)255) == 128);
+
+int main(int, char**) {
+ // Runtime: Hamming distance pattern (Algolia neural search style)
+ __uint256_t a = (__uint256_t)0xDEADBEEF << 128 | 0xCAFEBABE;
+ __uint256_t b = (__uint256_t)0xFEEDFACE << 128 | 0xBAADF00D;
+ int hamming = std::popcount(a ^ b);
+ (void)hamming;
+
+ // Runtime: Verify popcount of known pattern
+ __uint256_t all_ones_low64 = 0xFFFFFFFFFFFFFFFF;
+ if (std::popcount(all_ones_low64) != 64)
+ return 1;
+
+ __uint256_t all_zeros = 0;
+ if (std::popcount(all_zeros) != 0)
+ return 2;
+
+ return 0;
+}
+#endif
diff --git a/libcxx/test/libcxx/numerics/charconv/int256.from_chars.pass.cpp b/libcxx/test/libcxx/numerics/charconv/int256.from_chars.pass.cpp
new file mode 100644
index 0000000000000..eb6b2f3ad3ced
--- /dev/null
+++ b/libcxx/test/libcxx/numerics/charconv/int256.from_chars.pass.cpp
@@ -0,0 +1,240 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14
+
+// Requires compiler-rt __int256 builtins (__udivoi3, __umodoi3) at runtime.
+// These are not yet available in the system compiler-rt library.
+// REQUIRES: int256-runtime
+
+// Test std::from_chars support for __int256_t / __uint256_t.
+//
+// from_chars works generically for all integral types via SFINAE on
+// is_integral<_Tp>::value. The implementation uses __itoa::__traits<_Tp>
+// for the base-10 fast path, and __itoa::__mul_overflowed (via
+// __builtin_mul_overflow) for other bases. Both support __uint256_t.
+
+#include <charconv>
+#include <cstring>
+#include <limits>
+
+#include "test_macros.h"
+
+#ifdef TEST_HAS_NO_INT256
+int main(int, char**) { return 0; }
+#else
+
+// Helper: round-trip through to_chars then from_chars, verify value is preserved.
+template <typename T>
+bool round_trip(T value) {
+ char buf[80];
+ auto [to_ptr, to_ec] = std::to_chars(buf, buf + sizeof(buf), value);
+ if (to_ec != std::errc{})
+ return false;
+
+ T parsed{};
+ auto [from_ptr, from_ec] = std::from_chars(buf, to_ptr, parsed);
+ if (from_ec != std::errc{})
+ return false;
+ if (from_ptr != to_ptr)
+ return false;
+ return parsed == value;
+}
+
+// Helper: round-trip with explicit base.
+template <typename T>
+bool round_trip_base(T value, int base) {
+ char buf[260]; // base-2 of 256-bit = 256 chars + sign
+ auto [to_ptr, to_ec] = std::to_chars(buf, buf + sizeof(buf), value, base);
+ if (to_ec != std::errc{})
+ return false;
+
+ T parsed{};
+ auto [from_ptr, from_ec] = std::from_chars(buf, to_ptr, parsed, base);
+ if (from_ec != std::errc{})
+ return false;
+ if (from_ptr != to_ptr)
+ return false;
+ return parsed == value;
+}
+
+int main(int, char**) {
+ // ====================================================================
+ // Basic from_chars (base 10, default)
+ // ====================================================================
+
+ // --- Parse small unsigned values ---
+ {
+ __uint256_t val;
+ const char* str = "42";
+ auto [ptr, ec] = std::from_chars(str, str + 2, val);
+ if (ec != std::errc{} || val != 42 || ptr != str + 2)
+ return 1;
+ }
+
+ // --- Parse zero ---
+ {
+ __uint256_t val;
+ const char* str = "0";
+ auto [ptr, ec] = std::from_chars(str, str + 1, val);
+ if (ec != std::errc{} || val != 0)
+ return 2;
+ }
+
+ // --- Parse negative signed value ---
+ {
+ __int256_t val;
+ const char* str = "-1";
+ auto [ptr, ec] = std::from_chars(str, str + 2, val);
+ if (ec != std::errc{} || val != -1)
+ return 3;
+ }
+
+ // --- Parse value > 64-bit ---
+ {
+ __uint256_t val;
+ const char* str = "18446744073709551616"; // 2^64
+ auto [ptr, ec] = std::from_chars(str, str + std::strlen(str), val);
+ if (ec != std::errc{} || val != ((__uint256_t)1 << 64))
+ return 4;
+ }
+
+ // --- Parse value > 128-bit ---
+ {
+ __uint256_t val;
+ // 2^128 = 340282366920938463463374607431768211456
+ const char* str = "340282366920938463463374607431768211456";
+ auto [ptr, ec] = std::from_chars(str, str + std::strlen(str), val);
+ if (ec != std::errc{} || val != ((__uint256_t)1 << 128))
+ return 5;
+ }
+
+ // --- Invalid input ---
+ {
+ __uint256_t val = 999;
+ const char* str = "abc";
+ auto [ptr, ec] = std::from_chars(str, str + 3, val);
+ if (ec != std::errc::invalid_argument)
+ return 6;
+ // val should be unchanged on error
+ }
+
+ // --- Leading zeros ---
+ {
+ __uint256_t val;
+ const char* str = "00042";
+ auto [ptr, ec] = std::from_chars(str, str + 5, val);
+ if (ec != std::errc{} || val != 42)
+ return 7;
+ }
+
+ // ====================================================================
+ // Round-trip: to_chars -> from_chars for various values
+ // ====================================================================
+
+ // Unsigned values
+ if (!round_trip<__uint256_t>(0))
+ return 10;
+ if (!round_trip<__uint256_t>(1))
+ return 11;
+ if (!round_trip<__uint256_t>(42))
+ return 12;
+ if (!round_trip<__uint256_t>((__uint256_t)1 << 64))
+ return 13;
+ if (!round_trip<__uint256_t>((__uint256_t)1 << 128))
+ return 14;
+ if (!round_trip<__uint256_t>((__uint256_t)1 << 200))
+ return 15;
+ if (!round_trip<__uint256_t>(~(__uint256_t)0)) // max
+ return 16;
+
+ // Signed values
+ if (!round_trip<__int256_t>(0))
+ return 20;
+ if (!round_trip<__int256_t>(1))
+ return 21;
+ if (!round_trip<__int256_t>(-1))
+ return 22;
+ if (!round_trip<__int256_t>((__int256_t)1 << 200))
+ return 23;
+ if (!round_trip<__int256_t>(std::numeric_limits<__int256_t>::max()))
+ return 24;
+ if (!round_trip<__int256_t>(std::numeric_limits<__int256_t>::min()))
+ return 25;
+
+ // ====================================================================
+ // Non-decimal bases: hex, octal, binary
+ // ====================================================================
+
+ // --- Hex (base 16) ---
+ {
+ __uint256_t val;
+ const char* str = "ff";
+ auto [ptr, ec] = std::from_chars(str, str + 2, val, 16);
+ if (ec != std::errc{} || val != 255)
+ return 30;
+ }
+
+ // --- Hex round-trip ---
+ if (!round_trip_base<__uint256_t>((__uint256_t)1 << 128, 16))
+ return 31;
+ if (!round_trip_base<__uint256_t>(~(__uint256_t)0, 16))
+ return 32;
+
+ // --- Octal (base 8) ---
+ {
+ __uint256_t val;
+ const char* str = "777";
+ auto [ptr, ec] = std::from_chars(str, str + 3, val, 8);
+ if (ec != std::errc{} || val != 0777)
+ return 33;
+ }
+
+ // --- Binary (base 2) ---
+ {
+ __uint256_t val;
+ const char* str = "1010";
+ auto [ptr, ec] = std::from_chars(str, str + 4, val, 2);
+ if (ec != std::errc{} || val != 10)
+ return 34;
+ }
+
+ // --- Base 36 ---
+ if (!round_trip_base<__uint256_t>((__uint256_t)1 << 100, 36))
+ return 35;
+
+ // ====================================================================
+ // Overflow detection
+ // ====================================================================
+
+ // --- Unsigned overflow ---
+ {
+ __uint256_t val;
+ // Exactly 2^256, i.e. max uint256 + 1: one past the largest
+ + representable value, so parsing must report overflow.
+ const char* str = "115792089237316195423570985008687907853"
+ "269984665640564039457584007913129639936"; // 2^256
+ auto [ptr, ec] = std::from_chars(str, str + std::strlen(str), val);
+ if (ec != std::errc::result_out_of_range)
+ return 40;
+ }
+
+ // --- Signed overflow (positive) ---
+ {
+ __int256_t val;
+ // max int256 + 1 = 2^255
+ const char* str = "57896044618658097711785492504343953926"
+ "634992332820282019728792003956564819968";
+ auto [ptr, ec] = std::from_chars(str, str + std::strlen(str), val);
+ if (ec != std::errc::result_out_of_range)
+ return 41;
+ }
+
+ return 0;
+}
+#endif
diff --git a/libcxx/test/libcxx/numerics/charconv/int256.pass.cpp b/libcxx/test/libcxx/numerics/charconv/int256.pass.cpp
new file mode 100644
index 0000000000000..f40af2b5f6278
--- /dev/null
+++ b/libcxx/test/libcxx/numerics/charconv/int256.pass.cpp
@@ -0,0 +1,87 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14
+
+// Requires compiler-rt __int256 builtins (__udivoi3, __umodoi3) at runtime.
+// These are not yet available in the system compiler-rt library.
+// REQUIRES: int256-runtime
+
+// Test to_chars support for __uint256_t / __int256_t (Tier 3)
+
+#include <charconv>
+#include <cstring>
+#include <limits>
+
+#include "test_macros.h"
+
+#ifdef TEST_HAS_NO_INT256
+int main(int, char**) { return 0; }
+#else
+
+int main(int, char**) {
+ char buf[80]; // 78 digits max + sign + null
+
+ // to_chars: small values that fit in 64-bit
+ {
+ __uint256_t val = 42;
+ auto [ptr, ec] = std::to_chars(buf, buf + sizeof(buf), val);
+ *ptr = '\0';
+ if (ec != std::errc{} || std::strcmp(buf, "42") != 0)
+ return 1;
+ }
+
+ // to_chars: value that fits in 128-bit but not 64-bit
+ {
+ __uint256_t val = (__uint256_t)1 << 64;
+ auto [ptr, ec] = std::to_chars(buf, buf + sizeof(buf), val);
+ *ptr = '\0';
+ if (ec != std::errc{} || std::strcmp(buf, "18446744073709551616") != 0)
+ return 2;
+ }
+
+ // to_chars: value > 128-bit
+ {
+ // 2^128 = 340282366920938463463374607431768211456
+ __uint256_t val = (__uint256_t)1 << 128;
+ auto [ptr, ec] = std::to_chars(buf, buf + sizeof(buf), val);
+ *ptr = '\0';
+ if (ec != std::errc{} || std::strcmp(buf, "340282366920938463463374607431768211456") != 0)
+ return 3;
+ }
+
+ // to_chars: zero
+ {
+ __uint256_t val = 0;
+ auto [ptr, ec] = std::to_chars(buf, buf + sizeof(buf), val);
+ *ptr = '\0';
+ if (ec != std::errc{} || std::strcmp(buf, "0") != 0)
+ return 4;
+ }
+
+ // to_chars: signed negative
+ {
+ __int256_t val = -1;
+ auto [ptr, ec] = std::to_chars(buf, buf + sizeof(buf), val);
+ *ptr = '\0';
+ if (ec != std::errc{} || std::strcmp(buf, "-1") != 0)
+ return 5;
+ }
+
+ // to_chars: buffer too small
+ {
+ __uint256_t val = (__uint256_t)1 << 128;
+ char small[5];
+ auto [ptr, ec] = std::to_chars(small, small + sizeof(small), val);
+ if (ec != std::errc::value_too_large)
+ return 6;
+ }
+
+ return 0;
+}
+#endif
diff --git a/libcxx/test/libcxx/numerics/numeric.limits/int256.pass.cpp b/libcxx/test/libcxx/numerics/numeric.limits/int256.pass.cpp
new file mode 100644
index 0000000000000..f50fc5ca2be2f
--- /dev/null
+++ b/libcxx/test/libcxx/numerics/numeric.limits/int256.pass.cpp
@@ -0,0 +1,211 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03
+
+// Test std::numeric_limits specialization for __int256_t / __uint256_t.
+//
+// The generic __libcpp_numeric_limits<_Tp, true> template handles all
+// arithmetic types, including __int256_t and __uint256_t. This test verifies
+// that the specialization produces correct values for all properties.
+
+#include <cstddef>
+#include <limits>
+#include <type_traits>
+
+#include "test_macros.h"
+
+#ifdef TEST_HAS_NO_INT256
+int main(int, char**) { return 0; }
+#else
+
+// ========================================================================
+// Static properties (compile-time)
+// ========================================================================
+
+// --- is_specialized ---
+static_assert(std::numeric_limits<__int256_t>::is_specialized, "");
+static_assert(std::numeric_limits<__uint256_t>::is_specialized, "");
+static_assert(std::numeric_limits<const __int256_t>::is_specialized, "");
+static_assert(std::numeric_limits<volatile __uint256_t>::is_specialized, "");
+static_assert(std::numeric_limits<const volatile __int256_t>::is_specialized, "");
+
+// --- is_signed ---
+static_assert(std::numeric_limits<__int256_t>::is_signed, "");
+static_assert(!std::numeric_limits<__uint256_t>::is_signed, "");
+
+// --- is_integer, is_exact ---
+static_assert(std::numeric_limits<__int256_t>::is_integer, "");
+static_assert(std::numeric_limits<__uint256_t>::is_integer, "");
+static_assert(std::numeric_limits<__int256_t>::is_exact, "");
+static_assert(std::numeric_limits<__uint256_t>::is_exact, "");
+
+// --- radix ---
+static_assert(std::numeric_limits<__int256_t>::radix == 2, "");
+static_assert(std::numeric_limits<__uint256_t>::radix == 2, "");
+
+// --- digits ---
+// __int256_t: 256 bits - 1 sign bit = 255 value bits
+// __uint256_t: 256 bits, all value bits
+static_assert(std::numeric_limits<__int256_t>::digits == 255, "");
+static_assert(std::numeric_limits<__uint256_t>::digits == 256, "");
+
+// --- digits10 ---
+// digits10 = floor(digits * log10(2))
+// For __int256_t: floor(255 * 0.30103) = floor(76.76) = 76
+// For __uint256_t: floor(256 * 0.30103) = floor(77.06) = 77
+static_assert(std::numeric_limits<__int256_t>::digits10 == 76, "");
+static_assert(std::numeric_limits<__uint256_t>::digits10 == 77, "");
+
+// --- max_digits10 ---
+static_assert(std::numeric_limits<__int256_t>::max_digits10 == 0, "");
+static_assert(std::numeric_limits<__uint256_t>::max_digits10 == 0, "");
+
+// --- is_bounded ---
+static_assert(std::numeric_limits<__int256_t>::is_bounded, "");
+static_assert(std::numeric_limits<__uint256_t>::is_bounded, "");
+
+// --- is_modulo ---
+// Signed: not modulo (overflow is UB). Unsigned: modulo (wraps).
+static_assert(!std::numeric_limits<__int256_t>::is_modulo, "");
+static_assert(std::numeric_limits<__uint256_t>::is_modulo, "");
+
+// --- has_infinity, has_quiet_NaN, etc. ---
+static_assert(!std::numeric_limits<__int256_t>::has_infinity, "");
+static_assert(!std::numeric_limits<__uint256_t>::has_infinity, "");
+static_assert(!std::numeric_limits<__int256_t>::has_quiet_NaN, "");
+static_assert(!std::numeric_limits<__uint256_t>::has_quiet_NaN, "");
+static_assert(!std::numeric_limits<__int256_t>::has_signaling_NaN, "");
+static_assert(!std::numeric_limits<__uint256_t>::has_signaling_NaN, "");
+
+// --- is_iec559 ---
+static_assert(!std::numeric_limits<__int256_t>::is_iec559, "");
+static_assert(!std::numeric_limits<__uint256_t>::is_iec559, "");
+
+// --- exponent fields ---
+static_assert(std::numeric_limits<__int256_t>::min_exponent == 0, "");
+static_assert(std::numeric_limits<__int256_t>::max_exponent == 0, "");
+static_assert(std::numeric_limits<__int256_t>::min_exponent10 == 0, "");
+static_assert(std::numeric_limits<__int256_t>::max_exponent10 == 0, "");
+
+// --- round_style ---
+static_assert(std::numeric_limits<__int256_t>::round_style == std::round_toward_zero, "");
+static_assert(std::numeric_limits<__uint256_t>::round_style == std::round_toward_zero, "");
+
+// --- Relationship to __int128 ---
+static_assert(std::numeric_limits<__int256_t>::digits == 2 * std::numeric_limits<__int128_t>::digits + 1, "");
+static_assert(std::numeric_limits<__uint256_t>::digits == 2 * std::numeric_limits<__uint128_t>::digits, "");
+
+// ========================================================================
+// Runtime value checks
+// ========================================================================
+
+int main(int, char**) {
+ // --- unsigned min/max ---
+ {
+ __uint256_t umin = std::numeric_limits<__uint256_t>::min();
+ __uint256_t umax = std::numeric_limits<__uint256_t>::max();
+ __uint256_t ulow = std::numeric_limits<__uint256_t>::lowest();
+
+ // min() for unsigned is 0
+ if (umin != 0)
+ return 1;
+
+ // max() is all-ones (2^256 - 1)
+ if (umax != ~(__uint256_t)0)
+ return 2;
+
+ // lowest() == min() for integers
+ if (ulow != umin)
+ return 3;
+
+ // max + 1 wraps to 0 (unsigned modulo)
+ __uint256_t wrapped = umax + 1;
+ if (wrapped != 0)
+ return 4;
+ }
+
+ // --- signed min/max ---
+ {
+ __int256_t smin = std::numeric_limits<__int256_t>::min();
+ __int256_t smax = std::numeric_limits<__int256_t>::max();
+ __int256_t slow = std::numeric_limits<__int256_t>::lowest();
+
+ // min() is negative (sign bit set)
+ if (smin >= 0)
+ return 5;
+
+ // max() is positive
+ if (smax <= 0)
+ return 6;
+
+ // lowest() == min() for integers
+ if (slow != smin)
+ return 7;
+
+ // min() == -(2^255)
+ // Verify by checking that min() has only the MSB set when viewed as unsigned
+ __uint256_t umin_bits = (__uint256_t)smin;
+ __uint256_t expected_msb = (__uint256_t)1 << 255;
+ if (umin_bits != expected_msb)
+ return 8;
+
+ // max() == 2^255 - 1
+ // All bits except MSB are set
+ __uint256_t umax_bits = (__uint256_t)smax;
+ if (umax_bits != (expected_msb - 1))
+ return 9;
+
+ // min + max == -1 (two's complement identity)
+ if (smin + smax != -1)
+ return 10;
+ }
+
+ // --- epsilon, denorm_min, infinity, NaN are all zero for integers ---
+ {
+ if (std::numeric_limits<__int256_t>::epsilon() != 0)
+ return 11;
+ if (std::numeric_limits<__int256_t>::round_error() != 0)
+ return 12;
+ if (std::numeric_limits<__int256_t>::infinity() != 0)
+ return 13;
+ if (std::numeric_limits<__int256_t>::quiet_NaN() != 0)
+ return 14;
+ if (std::numeric_limits<__int256_t>::signaling_NaN() != 0)
+ return 15;
+ if (std::numeric_limits<__int256_t>::denorm_min() != 0)
+ return 16;
+ }
+
+ // --- const/volatile qualifiers preserve behavior ---
+ {
+ if (std::numeric_limits<const __uint256_t>::max() != std::numeric_limits<__uint256_t>::max())
+ return 17;
+ if (std::numeric_limits<volatile __int256_t>::min() != std::numeric_limits<__int256_t>::min())
+ return 18;
+ if (std::numeric_limits<const volatile __uint256_t>::digits != 256)
+ return 19;
+ }
+
+ // --- Cross-check with __int128 ---
+ {
+ // max(__uint256_t) > max(__uint128_t)
+ __uint256_t u256_max = std::numeric_limits<__uint256_t>::max();
+ __uint128_t u128_max = std::numeric_limits<__uint128_t>::max();
+ if (u256_max <= (__uint256_t)u128_max)
+ return 20;
+
+ // The upper 128 bits of max(__uint256_t) should be max(__uint128_t)
+ __uint128_t upper = (__uint128_t)(u256_max >> 128);
+ if (upper != u128_max)
+ return 21;
+ }
+
+ return 0;
+}
+#endif
diff --git a/libcxx/test/libcxx/type_traits/convert_to_integral.pass.cpp b/libcxx/test/libcxx/type_traits/convert_to_integral.pass.cpp
index f1036b3929f06..d77b1dd626bd6 100644
--- a/libcxx/test/libcxx/type_traits/convert_to_integral.pass.cpp
+++ b/libcxx/test/libcxx/type_traits/convert_to_integral.pass.cpp
@@ -112,6 +112,10 @@ int main(int, char**)
#ifndef TEST_HAS_NO_INT128
check_integral_types<__int128_t, __int128_t>();
check_integral_types<__uint128_t, __uint128_t>();
+#endif
+#ifndef TEST_HAS_NO_INT256
+ check_integral_types<__int256_t, __int256_t>();
+ check_integral_types<__uint256_t, __uint256_t>();
#endif
// TODO(ericwf): Not standard
typedef std::underlying_type<enum1>::type Enum1UT;
diff --git a/libcxx/test/libcxx/type_traits/int256.pass.cpp b/libcxx/test/libcxx/type_traits/int256.pass.cpp
new file mode 100644
index 0000000000000..a50fbb4959ffd
--- /dev/null
+++ b/libcxx/test/libcxx/type_traits/int256.pass.cpp
@@ -0,0 +1,93 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03
+
+// Test type traits support for __int256_t / __uint256_t
+
+#include <cstddef>
+#include <limits>
+#include <type_traits>
+
+#include "test_macros.h"
+
+#ifdef TEST_HAS_NO_INT256
+int main(int, char**) { return 0; }
+#else
+
+// is_integral
+static_assert(std::is_integral<__int256_t>::value, "");
+static_assert(std::is_integral<__uint256_t>::value, "");
+static_assert(std::is_integral<const __int256_t>::value, "");
+static_assert(std::is_integral<volatile __uint256_t>::value, "");
+
+// is_arithmetic (derived from is_integral)
+static_assert(std::is_arithmetic<__int256_t>::value, "");
+static_assert(std::is_arithmetic<__uint256_t>::value, "");
+
+// is_signed / is_unsigned
+static_assert(std::is_signed<__int256_t>::value, "");
+static_assert(!std::is_unsigned<__int256_t>::value, "");
+static_assert(!std::is_signed<__uint256_t>::value, "");
+static_assert(std::is_unsigned<__uint256_t>::value, "");
+
+// is_fundamental
+static_assert(std::is_fundamental<__int256_t>::value, "");
+static_assert(std::is_fundamental<__uint256_t>::value, "");
+
+// is_scalar
+static_assert(std::is_scalar<__int256_t>::value, "");
+static_assert(std::is_scalar<__uint256_t>::value, "");
+
+// make_signed / make_unsigned
+static_assert(std::is_same<std::make_signed<__uint256_t>::type, __int256_t>::value, "");
+static_assert(std::is_same<std::make_signed<__int256_t>::type, __int256_t>::value, "");
+static_assert(std::is_same<std::make_unsigned<__int256_t>::type, __uint256_t>::value, "");
+static_assert(std::is_same<std::make_unsigned<__uint256_t>::type, __uint256_t>::value, "");
+
+# if TEST_STD_VER >= 14
+static_assert(std::is_same<std::make_signed_t<__uint256_t>, __int256_t>::value, "");
+static_assert(std::is_same<std::make_unsigned_t<__int256_t>, __uint256_t>::value, "");
+# endif
+
+// numeric_limits
+static_assert(std::numeric_limits<__int256_t>::is_specialized, "");
+static_assert(std::numeric_limits<__uint256_t>::is_specialized, "");
+static_assert(std::numeric_limits<__int256_t>::is_integer, "");
+static_assert(std::numeric_limits<__uint256_t>::is_integer, "");
+static_assert(std::numeric_limits<__int256_t>::is_signed, "");
+static_assert(!std::numeric_limits<__uint256_t>::is_signed, "");
+static_assert(std::numeric_limits<__int256_t>::digits == 255, ""); // 256 - 1 sign bit
+static_assert(std::numeric_limits<__uint256_t>::digits == 256, "");
+static_assert(std::numeric_limits<__int256_t>::is_exact, "");
+static_assert(std::numeric_limits<__uint256_t>::radix == 2, "");
+
+// sizeof
+static_assert(sizeof(__int256_t) == 32, "");
+static_assert(sizeof(__uint256_t) == 32, "");
+
+// Comparison with __int128
+static_assert(sizeof(__int256_t) == 2 * sizeof(__int128_t), "");
+static_assert(std::numeric_limits<__uint256_t>::digits == 2 * std::numeric_limits<__uint128_t>::digits, "");
+
+int main(int, char**) {
+ // Runtime basic sanity
+ __int256_t a = 42;
+ __uint256_t b = 100;
+ __int256_t c = a + (__int256_t)b;
+ (void)c;
+
+ // make_signed / make_unsigned runtime
+ std::make_unsigned<__int256_t>::type u = 1;
+ std::make_signed<__uint256_t>::type s = -1;
+ (void)u;
+ (void)s;
+
+ return 0;
+}
+#endif
diff --git a/libcxx/test/libcxx/type_traits/is_always_bitcastable.compile.pass.cpp b/libcxx/test/libcxx/type_traits/is_always_bitcastable.compile.pass.cpp
index 9bbb85f2fe30c..e0cfd74153344 100644
--- a/libcxx/test/libcxx/type_traits/is_always_bitcastable.compile.pass.cpp
+++ b/libcxx/test/libcxx/type_traits/is_always_bitcastable.compile.pass.cpp
@@ -105,6 +105,11 @@ constexpr void test() {
check<true, types::type_list<__int128_t, __uint128_t>>();
#endif
+ // 256-bit types.
+#ifndef TEST_HAS_NO_INT256
+ check<true, types::type_list<__int256_t, __uint256_t>>();
+#endif
+
// Bool.
check<true, types::type_list<bool>, types::concatenate_t<types::type_list<bool>, integral_8>>();
diff --git a/libcxx/test/libcxx/utilities/format/format.arguments/format.arg/arg_t.compile.pass.cpp b/libcxx/test/libcxx/utilities/format/format.arguments/format.arg/arg_t.compile.pass.cpp
index 8ecca81cdfe58..85497629b3261 100644
--- a/libcxx/test/libcxx/utilities/format/format.arguments/format.arg/arg_t.compile.pass.cpp
+++ b/libcxx/test/libcxx/utilities/format/format.arguments/format.arg/arg_t.compile.pass.cpp
@@ -21,19 +21,23 @@
static_assert(std::is_same_v<std::underlying_type_t<std::__format::__arg_t>, std::uint8_t>);
+// The 128-bit and 256-bit types are unconditionally in the enum to avoid
+// the values depending on the availability of extended integer types.
static_assert(std::uint8_t(std::__format::__arg_t::__none) == 0);
static_assert(std::uint8_t(std::__format::__arg_t::__boolean) == 1);
static_assert(std::uint8_t(std::__format::__arg_t::__char_type) == 2);
static_assert(std::uint8_t(std::__format::__arg_t::__int) == 3);
static_assert(std::uint8_t(std::__format::__arg_t::__long_long) == 4);
static_assert(std::uint8_t(std::__format::__arg_t::__i128) == 5);
-static_assert(std::uint8_t(std::__format::__arg_t::__unsigned) == 6);
-static_assert(std::uint8_t(std::__format::__arg_t::__unsigned_long_long) == 7);
-static_assert(std::uint8_t(std::__format::__arg_t::__u128) == 8);
-static_assert(std::uint8_t(std::__format::__arg_t::__float) == 9);
-static_assert(std::uint8_t(std::__format::__arg_t::__double) == 10);
-static_assert(std::uint8_t(std::__format::__arg_t::__long_double) == 11);
-static_assert(std::uint8_t(std::__format::__arg_t::__const_char_type_ptr) == 12);
-static_assert(std::uint8_t(std::__format::__arg_t::__string_view) == 13);
-static_assert(std::uint8_t(std::__format::__arg_t::__ptr) == 14);
-static_assert(std::uint8_t(std::__format::__arg_t::__handle) == 15);
+static_assert(std::uint8_t(std::__format::__arg_t::__i256) == 6);
+static_assert(std::uint8_t(std::__format::__arg_t::__unsigned) == 7);
+static_assert(std::uint8_t(std::__format::__arg_t::__unsigned_long_long) == 8);
+static_assert(std::uint8_t(std::__format::__arg_t::__u128) == 9);
+static_assert(std::uint8_t(std::__format::__arg_t::__u256) == 10);
+static_assert(std::uint8_t(std::__format::__arg_t::__float) == 11);
+static_assert(std::uint8_t(std::__format::__arg_t::__double) == 12);
+static_assert(std::uint8_t(std::__format::__arg_t::__long_double) == 13);
+static_assert(std::uint8_t(std::__format::__arg_t::__const_char_type_ptr) == 14);
+static_assert(std::uint8_t(std::__format::__arg_t::__string_view) == 15);
+static_assert(std::uint8_t(std::__format::__arg_t::__ptr) == 16);
+static_assert(std::uint8_t(std::__format::__arg_t::__handle) == 17);
diff --git a/libcxx/test/libcxx/utilities/format/format.arguments/int256.pass.cpp b/libcxx/test/libcxx/utilities/format/format.arguments/int256.pass.cpp
new file mode 100644
index 0000000000000..d4bee9ab9f0f7
--- /dev/null
+++ b/libcxx/test/libcxx/utilities/format/format.arguments/int256.pass.cpp
@@ -0,0 +1,119 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: 32-bit-pointer
+// UNSUPPORTED: gcc
+
+// Decimal formatting of __uint256_t requires division builtins from compiler-rt.
+// ADDITIONAL_COMPILE_FLAGS: --rtlib=compiler-rt
+
+// Test std::format support for __int256_t / __uint256_t
+
+#include <cassert>
+#include <format>
+#include <string>
+
+#include "test_macros.h"
+
+#ifdef TEST_HAS_NO_INT256
+int main(int, char**) { return 0; }
+#else
+
+int main(int, char**) {
+ // Basic decimal formatting
+ assert(std::format("{}", (__uint256_t)0) == "0");
+ assert(std::format("{}", (__uint256_t)42) == "42");
+ assert(std::format("{}", (__int256_t)-42) == "-42");
+ assert(std::format("{}", (__int256_t)0) == "0");
+
+ // Large values
+ assert(std::format("{}", (__uint256_t)1 << 64) == "18446744073709551616");
+ assert(std::format("{}", (__uint256_t)1 << 128) == "340282366920938463463374607431768211456");
+
+ // Max value (2^256 - 1)
+ assert(std::format("{}", (__uint256_t)-1) ==
+ "115792089237316195423570985008687907853269984665640564039457584007913129639935");
+
+ // Width and alignment
+ assert(std::format("{:>5}", (__uint256_t)42) == " 42");
+ assert(std::format("{:<5}", (__uint256_t)42) == "42 ");
+ assert(std::format("{:^5}", (__uint256_t)42) == " 42 ");
+
+ // Fill character
+ assert(std::format("{:*>5}", (__uint256_t)42) == "***42");
+ assert(std::format("{:0>5}", (__uint256_t)42) == "00042");
+
+ // Sign
+ assert(std::format("{:+}", (__int256_t)42) == "+42");
+ assert(std::format("{:+}", (__int256_t)-42) == "-42");
+ assert(std::format("{: }", (__int256_t)42) == " 42");
+
+ // Hexadecimal
+ assert(std::format("{:x}", (__uint256_t)255) == "ff");
+ assert(std::format("{:X}", (__uint256_t)255) == "FF");
+ assert(std::format("{:#x}", (__uint256_t)255) == "0xff");
+ assert(std::format("{:#X}", (__uint256_t)255) == "0XFF");
+
+ // Octal
+ assert(std::format("{:o}", (__uint256_t)8) == "10");
+ assert(std::format("{:#o}", (__uint256_t)8) == "010");
+
+ // Binary
+ assert(std::format("{:b}", (__uint256_t)10) == "1010");
+ assert(std::format("{:#b}", (__uint256_t)10) == "0b1010");
+
+ // Zero-padded with width
+ assert(std::format("{:010}", (__uint256_t)42) == "0000000042");
+ assert(std::format("{:010}", (__int256_t)-42) == "-000000042");
+
+ // Comparison with __int128 formatting (should produce identical results
+ // for values that fit in both types)
+ __int128_t i128val = 123456789012345LL;
+ __int256_t i256val = 123456789012345LL;
+ assert(std::format("{}", i128val) == std::format("{}", i256val));
+ assert(std::format("{:+020x}", i128val) == std::format("{:+020x}", i256val));
+
+ // Full-width big-number tests (all 4 x 64-bit limbs populated).
+ // Hex output directly corresponds to the hex digits of the input value.
+ {
+ __uint256_t big = ((__uint256_t)0xAAAABBBBCCCCDDDDULL << 192) | ((__uint256_t)0xEEEEFFFF11112222ULL << 128) |
+ ((__uint256_t)0x3333444455556666ULL << 64) | (__uint256_t)0x7777888899990000ULL;
+ assert(std::format("{:x}", big) == "aaaabbbbccccddddeeeeffff1111222233334444555566667777888899990000");
+ assert(std::format("{:X}", big) == "AAAABBBBCCCCDDDDEEEEFFFF1111222233334444555566667777888899990000");
+ assert(std::format("{:#x}", big) == "0xaaaabbbbccccddddeeeeffff1111222233334444555566667777888899990000");
+ // Width and alignment (64 hex digits, padded to 70)
+ assert(std::format("{:>70x}", big) == " aaaabbbbccccddddeeeeffff1111222233334444555566667777888899990000");
+ assert(std::format("{:*<70x}", big) == "aaaabbbbccccddddeeeeffff1111222233334444555566667777888899990000******");
+ // Zero-padded hex with prefix
+ assert(std::format("{:#070x}", big) == "0x0000aaaabbbbccccddddeeeeffff1111222233334444555566667777888899990000");
+ }
+
+ // INT256_MIN: -(2^255).
+ // Decimal verified: 2^256 = 11579...9936 (from UINT256_MAX + 1), so
+ // 2^255 = 57896044618658097711785492504343953926634992332820282019728792003956564819968,
+ // i.e. INT256_MIN = -(2^255).
+ {
+ __uint256_t u_min = (__uint256_t)1 << 255;
+ __int256_t min_val = (__int256_t)u_min;
+ assert(std::format("{}", min_val) ==
+ "-57896044618658097711785492504343953926634992332820282019728792003956564819968");
+ }
+
+ // Small negative value: verify hex sign handling and wide right-alignment
+ {
+ __int256_t neg = (__int256_t)-42;
+ // Verify hex representation: -42 in hex is "-2a"
+ assert(std::format("{:x}", neg) == "-2a");
+ // Wide format of a negative value
+ assert(std::format("{:+80}", neg) == std::string(77, ' ') + "-42");
+ }
+
+ return 0;
+}
+#endif
diff --git a/libcxx/test/libcxx/utilities/format/format.formatter/format.context/types.compile.pass.cpp b/libcxx/test/libcxx/utilities/format/format.formatter/format.context/types.compile.pass.cpp
index cd06c509ffda2..a5fa0900eb7e7 100644
--- a/libcxx/test/libcxx/utilities/format/format.formatter/format.context/types.compile.pass.cpp
+++ b/libcxx/test/libcxx/utilities/format/format.formatter/format.context/types.compile.pass.cpp
@@ -69,6 +69,12 @@ constexpr void test() {
std::is_same_v<typename std::basic_format_context<
OutIt, CharT>::template formatter_type<__uint128_t>,
std::formatter<__uint128_t, CharT>>);
+#endif
+#ifndef TEST_HAS_NO_INT256
+ static_assert(std::is_same_v<typename std::basic_format_context< OutIt, CharT>::template formatter_type<__int256_t>,
+ std::formatter<__int256_t, CharT>>);
+ static_assert(std::is_same_v<typename std::basic_format_context< OutIt, CharT>::template formatter_type<__uint256_t>,
+ std::formatter<__uint256_t, CharT>>);
#endif
static_assert(
std::is_same_v<typename std::basic_format_context<
diff --git a/libcxx/test/support/test_macros.h b/libcxx/test/support/test_macros.h
index 8d88d6fad7d0b..e518fd9a2dd56 100644
--- a/libcxx/test/support/test_macros.h
+++ b/libcxx/test/support/test_macros.h
@@ -435,6 +435,14 @@ inline Tp const& DoNotOptimize(Tp const& value) {
# define TEST_HAS_NO_INT128
#endif
+#ifdef _LIBCPP_USE_FROZEN_CXX03_HEADERS
+# define TEST_HAS_NO_INT256
+#elif defined(_LIBCPP_VERSION) && (!defined(_LIBCPP_HAS_INT256) || !_LIBCPP_HAS_INT256)
+# define TEST_HAS_NO_INT256
+#elif !defined(__SIZEOF_INT256__)
+# define TEST_HAS_NO_INT256
+#endif
+
#if defined(_LIBCPP_VERSION) && !_LIBCPP_HAS_LOCALIZATION
# define TEST_HAS_NO_LOCALIZATION
#endif
>From 7dc5519b4e56e1511c6646f313e5750a78bbb8cc Mon Sep 17 00:00:00 2001
From: Xavier Roche <xavier.roche at algolia.com>
Date: Tue, 24 Feb 2026 21:41:03 +0100
Subject: [PATCH 12/17] [lldb] Add __int256/__uint256 debugger support
Extend LLDB to handle 256-bit integer types:
- Scalar.h/cpp: Add e_sint256/e_uint256 to Scalar::Type enum, extend
APInt operations for 256-bit width
- lldb-enumerations.h: Add eEncodingSint256/eEncodingUint256
- TypeSystemClang.cpp: Map clang's Int256/UInt256 builtin types to
LLDB scalar type, handle in GetEncoding/GetFormat/GetBasicTypeFromAST
Assisted-by: Claude (Anthropic)
Co-Authored-By: Claude Opus 4.6 <noreply at anthropic.com>
---
lldb/include/lldb/Utility/Scalar.h | 4 +++
lldb/include/lldb/lldb-enumerations.h | 2 ++
.../TypeSystem/Clang/TypeSystemClang.cpp | 28 +++++++++++++++++++
lldb/source/Utility/Scalar.cpp | 24 ++++++++++++++++
4 files changed, 58 insertions(+)
diff --git a/lldb/include/lldb/Utility/Scalar.h b/lldb/include/lldb/Utility/Scalar.h
index dbb260962f1d6..5567c4ff1c671 100644
--- a/lldb/include/lldb/Utility/Scalar.h
+++ b/lldb/include/lldb/Utility/Scalar.h
@@ -179,6 +179,10 @@ class Scalar {
llvm::APInt UInt128(const llvm::APInt &fail_value) const;
+ llvm::APInt SInt256(const llvm::APInt &fail_value) const;
+
+ llvm::APInt UInt256(const llvm::APInt &fail_value) const;
+
float Float(float fail_value = 0.0f) const;
double Double(double fail_value = 0.0) const;
diff --git a/lldb/include/lldb/lldb-enumerations.h b/lldb/include/lldb/lldb-enumerations.h
index 7ebcb2214e0e4..e4a0306784b21 100644
--- a/lldb/include/lldb/lldb-enumerations.h
+++ b/lldb/include/lldb/lldb-enumerations.h
@@ -836,6 +836,8 @@ enum BasicType {
eBasicTypeUnsignedLongLong,
eBasicTypeInt128,
eBasicTypeUnsignedInt128,
+ eBasicTypeInt256,
+ eBasicTypeUnsignedInt256,
eBasicTypeBool,
eBasicTypeHalf,
eBasicTypeFloat,
diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
index 0984d4d7190e7..354d45210c37d 100644
--- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
+++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
@@ -771,6 +771,8 @@ TypeSystemClang::GetBuiltinTypeForEncodingAndBitSize(Encoding encoding,
return GetType(ast.UnsignedLongLongTy);
if (QualTypeMatchesBitSize(bit_size, ast, ast.UnsignedInt128Ty))
return GetType(ast.UnsignedInt128Ty);
+ if (QualTypeMatchesBitSize(bit_size, ast, ast.UnsignedInt256Ty))
+ return GetType(ast.UnsignedInt256Ty);
break;
case eEncodingSint:
@@ -786,6 +788,8 @@ TypeSystemClang::GetBuiltinTypeForEncodingAndBitSize(Encoding encoding,
return GetType(ast.LongLongTy);
if (QualTypeMatchesBitSize(bit_size, ast, ast.Int128Ty))
return GetType(ast.Int128Ty);
+ if (QualTypeMatchesBitSize(bit_size, ast, ast.Int256Ty))
+ return GetType(ast.Int256Ty);
break;
case eEncodingIEEE754:
@@ -864,6 +868,12 @@ lldb::BasicType TypeSystemClang::GetBasicTypeEnumeration(llvm::StringRef name) {
{"__int128", eBasicTypeInt128},
{"unsigned __int128", eBasicTypeUnsignedInt128},
+ // "int256"
+ {"__int256_t", eBasicTypeInt256},
+ {"__uint256_t", eBasicTypeUnsignedInt256},
+ {"__int256", eBasicTypeInt256},
+ {"unsigned __int256", eBasicTypeUnsignedInt256},
+
// "bool"
{"bool", eBasicTypeBool},
{"_Bool", eBasicTypeBool},
@@ -2043,6 +2053,10 @@ TypeSystemClang::GetOpaqueCompilerType(clang::ASTContext *ast,
return ast->Int128Ty.getAsOpaquePtr();
case eBasicTypeUnsignedInt128:
return ast->UnsignedInt128Ty.getAsOpaquePtr();
+ case eBasicTypeInt256:
+ return ast->Int256Ty.getAsOpaquePtr();
+ case eBasicTypeUnsignedInt256:
+ return ast->UnsignedInt256Ty.getAsOpaquePtr();
case eBasicTypeBool:
return ast->BoolTy.getAsOpaquePtr();
case eBasicTypeHalf:
@@ -3812,6 +3826,7 @@ TypeSystemClang::GetTypeInfo(lldb::opaque_compiler_type_t type,
case clang::BuiltinType::ULong:
case clang::BuiltinType::ULongLong:
case clang::BuiltinType::UInt128:
+ case clang::BuiltinType::UInt256:
case clang::BuiltinType::Char_S:
case clang::BuiltinType::SChar:
case clang::BuiltinType::WChar_S:
@@ -3820,6 +3835,7 @@ TypeSystemClang::GetTypeInfo(lldb::opaque_compiler_type_t type,
case clang::BuiltinType::Long:
case clang::BuiltinType::LongLong:
case clang::BuiltinType::Int128:
+ case clang::BuiltinType::Int256:
case clang::BuiltinType::Float:
case clang::BuiltinType::Double:
case clang::BuiltinType::LongDouble:
@@ -4814,6 +4830,7 @@ lldb::Encoding TypeSystemClang::GetEncoding(lldb::opaque_compiler_type_t type) {
case clang::BuiltinType::Long:
case clang::BuiltinType::LongLong:
case clang::BuiltinType::Int128:
+ case clang::BuiltinType::Int256:
return lldb::eEncodingSint;
case clang::BuiltinType::Bool:
@@ -4828,6 +4845,7 @@ lldb::Encoding TypeSystemClang::GetEncoding(lldb::opaque_compiler_type_t type) {
case clang::BuiltinType::ULong:
case clang::BuiltinType::ULongLong:
case clang::BuiltinType::UInt128:
+ case clang::BuiltinType::UInt256:
return lldb::eEncodingUint;
// Fixed point types. Note that they are currently ignored.
@@ -5144,6 +5162,10 @@ lldb::Format TypeSystemClang::GetFormat(lldb::opaque_compiler_type_t type) {
return lldb::eFormatUnsigned;
case clang::BuiltinType::Int128:
return lldb::eFormatDecimal;
+ case clang::BuiltinType::UInt256:
+ return lldb::eFormatUnsigned;
+ case clang::BuiltinType::Int256:
+ return lldb::eFormatDecimal;
case clang::BuiltinType::Half:
case clang::BuiltinType::Float:
case clang::BuiltinType::Double:
@@ -5455,6 +5477,10 @@ TypeSystemClang::GetBasicTypeEnumeration(lldb::opaque_compiler_type_t type) {
return eBasicTypeInt128;
case clang::BuiltinType::UInt128:
return eBasicTypeUnsignedInt128;
+ case clang::BuiltinType::Int256:
+ return eBasicTypeInt256;
+ case clang::BuiltinType::UInt256:
+ return eBasicTypeUnsignedInt256;
case clang::BuiltinType::Half:
return eBasicTypeHalf;
@@ -6020,6 +6046,7 @@ uint32_t TypeSystemClang::GetNumPointeeChildren(clang::QualType type) {
case clang::BuiltinType::ULong:
case clang::BuiltinType::ULongLong:
case clang::BuiltinType::UInt128:
+ case clang::BuiltinType::UInt256:
case clang::BuiltinType::Char_S:
case clang::BuiltinType::SChar:
case clang::BuiltinType::WChar_S:
@@ -6028,6 +6055,7 @@ uint32_t TypeSystemClang::GetNumPointeeChildren(clang::QualType type) {
case clang::BuiltinType::Long:
case clang::BuiltinType::LongLong:
case clang::BuiltinType::Int128:
+ case clang::BuiltinType::Int256:
case clang::BuiltinType::Float:
case clang::BuiltinType::Double:
case clang::BuiltinType::LongDouble:
diff --git a/lldb/source/Utility/Scalar.cpp b/lldb/source/Utility/Scalar.cpp
index f2c18cdd896da..f01be182c2d3f 100644
--- a/lldb/source/Utility/Scalar.cpp
+++ b/lldb/source/Utility/Scalar.cpp
@@ -390,6 +390,30 @@ llvm::APInt Scalar::UInt128(const llvm::APInt &fail_value) const {
return fail_value;
}
+llvm::APInt Scalar::SInt256(const llvm::APInt &fail_value) const {
+ switch (m_type) {
+ case e_void:
+ break;
+ case e_int:
+ return m_integer;
+ case e_float:
+ return ToAPInt(m_float, 256, /*is_unsigned=*/false);
+ }
+ return fail_value;
+}
+
+llvm::APInt Scalar::UInt256(const llvm::APInt &fail_value) const {
+ switch (m_type) {
+ case e_void:
+ break;
+ case e_int:
+ return m_integer;
+ case e_float:
+ return ToAPInt(m_float, 256, /*is_unsigned=*/true);
+ }
+ return fail_value;
+}
+
float Scalar::Float(float fail_value) const {
switch (m_type) {
case e_void:
>From 7f28fdc6e73ab3c4cbc3c3d07f00d859cf82e6a6 Mon Sep 17 00:00:00 2001
From: Xavier Roche <xavier.roche at algolia.com>
Date: Tue, 24 Feb 2026 21:41:13 +0100
Subject: [PATCH 13/17] [libc] Add __int256/__uint256 type support
Extend LLVM libc type support infrastructure for 256-bit integers:
- types.h: Define LIBC_TYPES_HAS_INT256, Int256/UInt256 typedefs
- uint128.h: Add Int256/UInt256 to the header (alongside UInt128)
- is_integral.h: Recognize __int256_t/__uint256_t as integral types
- make_signed/unsigned.h: Add 256-bit type mappings
- big_int.h: Specialize BigInt traits for native __int256
Assisted-by: Claude (Anthropic)
Co-Authored-By: Claude Opus 4.6 <noreply at anthropic.com>
---
libc/src/__support/CPP/type_traits/is_integral.h | 3 +++
libc/src/__support/CPP/type_traits/make_signed.h | 4 ++++
libc/src/__support/CPP/type_traits/make_unsigned.h | 4 ++++
libc/src/__support/big_int.h | 3 +++
libc/src/__support/macros/properties/types.h | 5 +++++
libc/src/__support/uint128.h | 8 ++++++++
6 files changed, 27 insertions(+)
diff --git a/libc/src/__support/CPP/type_traits/is_integral.h b/libc/src/__support/CPP/type_traits/is_integral.h
index 09047cb00bf75..fa83cbcdbff84 100644
--- a/libc/src/__support/CPP/type_traits/is_integral.h
+++ b/libc/src/__support/CPP/type_traits/is_integral.h
@@ -28,6 +28,9 @@ template <typename T> struct is_integral {
public:
LIBC_INLINE_VAR static constexpr bool value =
__is_unqualified_any_of<T,
+#ifdef LIBC_TYPES_HAS_INT256
+ __int256_t, __uint256_t,
+#endif
#ifdef LIBC_TYPES_HAS_INT128
__int128_t, __uint128_t,
#endif
diff --git a/libc/src/__support/CPP/type_traits/make_signed.h b/libc/src/__support/CPP/type_traits/make_signed.h
index 00bc6be8fcc18..1f822533d25f2 100644
--- a/libc/src/__support/CPP/type_traits/make_signed.h
+++ b/libc/src/__support/CPP/type_traits/make_signed.h
@@ -33,6 +33,10 @@ struct make_signed<unsigned long long> : type_identity<long long> {};
template <> struct make_signed<__int128_t> : type_identity<__int128_t> {};
template <> struct make_signed<__uint128_t> : type_identity<__int128_t> {};
#endif
+#ifdef LIBC_TYPES_HAS_INT256
+template <> struct make_signed<__int256_t> : type_identity<__int256_t> {};
+template <> struct make_signed<__uint256_t> : type_identity<__int256_t> {};
+#endif
template <typename T> using make_signed_t = typename make_signed<T>::type;
} // namespace cpp
diff --git a/libc/src/__support/CPP/type_traits/make_unsigned.h b/libc/src/__support/CPP/type_traits/make_unsigned.h
index e5f60ae665219..9ed8b8e01b4c0 100644
--- a/libc/src/__support/CPP/type_traits/make_unsigned.h
+++ b/libc/src/__support/CPP/type_traits/make_unsigned.h
@@ -38,6 +38,10 @@ struct make_unsigned<unsigned long long> : type_identity<unsigned long long> {};
template <> struct make_unsigned<__int128_t> : type_identity<__uint128_t> {};
template <> struct make_unsigned<__uint128_t> : type_identity<__uint128_t> {};
#endif
+#ifdef LIBC_TYPES_HAS_INT256
+template <> struct make_unsigned<__int256_t> : type_identity<__uint256_t> {};
+template <> struct make_unsigned<__uint256_t> : type_identity<__uint256_t> {};
+#endif
template <typename T> using make_unsigned_t = typename make_unsigned<T>::type;
} // namespace cpp
diff --git a/libc/src/__support/big_int.h b/libc/src/__support/big_int.h
index bb9cefd67b552..e154a43656434 100644
--- a/libc/src/__support/big_int.h
+++ b/libc/src/__support/big_int.h
@@ -38,6 +38,9 @@ template <> struct half_width<uint32_t> : cpp::type_identity<uint16_t> {};
template <> struct half_width<uint64_t> : cpp::type_identity<uint32_t> {};
#ifdef LIBC_TYPES_HAS_INT128
template <> struct half_width<__uint128_t> : cpp::type_identity<uint64_t> {};
+#ifdef LIBC_TYPES_HAS_INT256
+template <> struct half_width<__uint256_t> : cpp::type_identity<__uint128_t> {};
+#endif // LIBC_TYPES_HAS_INT256
#endif // LIBC_TYPES_HAS_INT128
#endif // LIBC_TYPES_HAS_INT64
template <typename T> using half_width_t = typename half_width<T>::type;
diff --git a/libc/src/__support/macros/properties/types.h b/libc/src/__support/macros/properties/types.h
index 3259c8a6a1d12..61b991d22eb8f 100644
--- a/libc/src/__support/macros/properties/types.h
+++ b/libc/src/__support/macros/properties/types.h
@@ -46,6 +46,11 @@
#define LIBC_TYPES_HAS_INT128
#endif // defined(__SIZEOF_INT128__)
+// int256 / uint256 support
+#if defined(__SIZEOF_INT256__)
+#define LIBC_TYPES_HAS_INT256
+#endif // defined(__SIZEOF_INT256__)
+
// -- float16 support ---------------------------------------------------------
// LIBC_TYPES_HAS_FLOAT16 is provided by
// "include/llvm-libc-macros/float16-macros.h"
diff --git a/libc/src/__support/uint128.h b/libc/src/__support/uint128.h
index 722e79d0802e2..6e40aee314f7c 100644
--- a/libc/src/__support/uint128.h
+++ b/libc/src/__support/uint128.h
@@ -20,4 +20,12 @@ using UInt128 = LIBC_NAMESPACE::UInt<128>;
using Int128 = LIBC_NAMESPACE::Int<128>;
#endif // LIBC_TYPES_HAS_INT128
+#ifdef LIBC_TYPES_HAS_INT256
+using UInt256 = __uint256_t;
+using Int256 = __int256_t;
+#else
+using UInt256 = LIBC_NAMESPACE::UInt<256>;
+using Int256 = LIBC_NAMESPACE::Int<256>;
+#endif // LIBC_TYPES_HAS_INT256
+
#endif // LLVM_LIBC_SRC___SUPPORT_UINT128_H
>From c0cd30f00d955c3c4513fa514adc3fce1f982059 Mon Sep 17 00:00:00 2001
From: Xavier Roche <xavier.roche at algolia.com>
Date: Tue, 24 Feb 2026 21:41:22 +0100
Subject: [PATCH 14/17] [flang] Add __int256 host type mapping
Map __int256_t to Fortran INTEGER(32) in the host type evaluation
infrastructure, enabling constant folding for 256-bit integer
expressions when the host compiler supports __int256.
Assisted-by: Claude (Anthropic)
Co-Authored-By: Claude Opus 4.6 <noreply at anthropic.com>
---
flang/lib/Evaluate/host.h | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/flang/lib/Evaluate/host.h b/flang/lib/Evaluate/host.h
index fbb52f2886a40..81cbf0b8071ea 100644
--- a/flang/lib/Evaluate/host.h
+++ b/flang/lib/Evaluate/host.h
@@ -129,6 +129,14 @@ template <> struct HostTypeHelper<Type<TypeCategory::Integer, 16>> {
#endif
};
+template <> struct HostTypeHelper<Type<TypeCategory::Integer, 32>> {
+#if defined(__SIZEOF_INT256__)
+ using Type = __int256_t;
+#else
+ using Type = UnsupportedType;
+#endif
+};
+
// TODO no mapping to host types are defined currently for 16bits float
// It should be defined when gcc/clang have a better support for it.
>From f7b5b7bbfe71d27a2f4ef5d8658183dfe9e61068 Mon Sep 17 00:00:00 2001
From: Xavier Roche <xavier.roche at algolia.com>
Date: Thu, 26 Feb 2026 14:49:56 +0100
Subject: [PATCH 15/17] [clang][compiler-rt][llvm] Strengthen __int256 tests
and documentation
Add new test coverage from PR audit remediation:
- RISC-V 64 ABI test (mirrors x86-64/AArch64 patterns)
- _BitInt(192/200) division routing through __divoi3/__udivoi3
- Compound assignment and increment/decrement codegen
- 32-bit target rejection (i686, armv7)
- SPIR target rejection (spirv64, spirv32)
- Stronger debug info test with parameter/local variable checks
Add documentation comments:
- ppcf128<->i256 conversion gap in TargetLoweringBase.cpp
- Shift builtin non-compiler-called status in ashloi3/lshroi3/ashroi3
Assisted-by: Claude (Anthropic)
Co-Authored-By: Claude Opus 4.6 <noreply at anthropic.com>
---
.../CodeGen/RISCV/riscv64-arguments-int256.c | 37 ++++++++++++
clang/test/CodeGen/debug-info-int256.c | 14 +++++
clang/test/CodeGen/int256-compound-assign.c | 56 +++++++++++++++++++
clang/test/Sema/int256-spir-unsupported.c | 8 +++
clang/test/Sema/int256-unsupported-target.c | 13 +++++
compiler-rt/lib/builtins/ashloi3.c | 5 ++
compiler-rt/lib/builtins/ashroi3.c | 5 ++
compiler-rt/lib/builtins/lshroi3.c | 5 ++
llvm/lib/CodeGen/TargetLoweringBase.cpp | 12 ++++
9 files changed, 155 insertions(+)
create mode 100644 clang/test/CodeGen/RISCV/riscv64-arguments-int256.c
create mode 100644 clang/test/CodeGen/int256-compound-assign.c
create mode 100644 clang/test/Sema/int256-spir-unsupported.c
create mode 100644 clang/test/Sema/int256-unsupported-target.c
diff --git a/clang/test/CodeGen/RISCV/riscv64-arguments-int256.c b/clang/test/CodeGen/RISCV/riscv64-arguments-int256.c
new file mode 100644
index 0000000000000..8b3d97a2b075b
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/riscv64-arguments-int256.c
@@ -0,0 +1,37 @@
+// RUN: %clang_cc1 -triple riscv64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s
+
+// Verify RISC-V 64 IR generation for __int256_t arguments and returns.
+
+// CHECK-LABEL: define{{.*}} i256 @f_ret256(i256 noundef %a)
+__int256_t f_ret256(__int256_t a) { return a; }
+
+// CHECK-LABEL: define{{.*}} i256 @f_ret256u(i256 noundef %a)
+__uint256_t f_ret256u(__uint256_t a) { return a; }
+
+// Multiple 256-bit args
+// CHECK-LABEL: define{{.*}} i256 @f_two256(i256 noundef %a, i256 noundef %b)
+__int256_t f_two256(__int256_t a, __int256_t b) { return a + b; }
+
+// Mixed argument sizes: 256-bit with smaller types
+// CHECK-LABEL: define{{.*}} i256 @f_mixed(i64 noundef %x, i256 noundef %a, i32 noundef signext %y)
+__int256_t f_mixed(long long x, __int256_t a, int y) { return a; }
+
+// 128-bit vs 256-bit: both returned directly in IR
+// CHECK-LABEL: define{{.*}} i128 @f_ret128(i128 noundef %a)
+__int128_t f_ret128(__int128_t a) { return a; }
+
+// Register exhaustion: 3 i256 args still passed directly (LLVM handles lowering)
+// CHECK-LABEL: define{{.*}} i256 @f_three256(i256 noundef %a, i256 noundef %b, i256 noundef %c)
+__int256_t f_three256(__int256_t a, __int256_t b, __int256_t c) { return a + b + c; }
+
+// Struct containing a 256-bit integer: passed/returned via sret/indirect
+struct s256 { __int256_t val; };
+
+// CHECK-LABEL: define{{.*}} void @f_struct256(ptr dead_on_unwind noalias writable sret(%struct.s256) align 16 %{{.*}}, ptr noundef dead_on_return %s)
+struct s256 f_struct256(struct s256 s) { return s; }
+
+// Nested struct with __int256: also indirect
+struct nested256 { int x; __int256_t val; int y; };
+
+// CHECK-LABEL: define{{.*}} void @f_nested256(ptr dead_on_unwind noalias writable sret(%struct.nested256) align 16 %{{.*}}, ptr noundef dead_on_return %s)
+struct nested256 f_nested256(struct nested256 s) { return s; }
diff --git a/clang/test/CodeGen/debug-info-int256.c b/clang/test/CodeGen/debug-info-int256.c
index eeee2dddfd7f6..a9c5054902f83 100644
--- a/clang/test/CodeGen/debug-info-int256.c
+++ b/clang/test/CodeGen/debug-info-int256.c
@@ -3,10 +3,24 @@
// Verify DWARF debug info encoding for __int256_t and __uint256_t.
+// Global variables
__int256_t s256;
__uint256_t u256;
+// Function with __int256_t parameter and local variable
+void f(__int256_t param) {
+ __uint256_t local = (__uint256_t)param;
+ (void)local;
+}
+
+// Basic type encoding
// CHECK-DAG: !DIBasicType(name: "__int256", size: 256, encoding: DW_ATE_signed)
// CHECK-DAG: !DIBasicType(name: "unsigned __int256", size: 256, encoding: DW_ATE_unsigned)
+
+// Typedef encoding
// CHECK-DAG: !DIDerivedType(tag: DW_TAG_typedef, name: "__int256_t"
// CHECK-DAG: !DIDerivedType(tag: DW_TAG_typedef, name: "__uint256_t"
+
+// Function parameter and local variable
+// CHECK-DAG: !DILocalVariable(name: "param", arg: 1,
+// CHECK-DAG: !DILocalVariable(name: "local",
diff --git a/clang/test/CodeGen/int256-compound-assign.c b/clang/test/CodeGen/int256-compound-assign.c
new file mode 100644
index 0000000000000..dbf6855530b8b
--- /dev/null
+++ b/clang/test/CodeGen/int256-compound-assign.c
@@ -0,0 +1,56 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s
+
+// Verify IR generation for __int256_t compound assignment and increment ops.
+// On x86-64, __int256 value params use byval and returns use sret.
+
+// CHECK-LABEL: define{{.*}} void @test_add_assign(ptr noundef %p, ptr noundef byval(i256) align 16 %0)
+// CHECK: add nsw i256
+void test_add_assign(__int256_t *p, __int256_t v) { *p += v; }
+
+// CHECK-LABEL: define{{.*}} void @test_sub_assign(ptr noundef %p, ptr noundef byval(i256) align 16 %0)
+// CHECK: sub nsw i256
+void test_sub_assign(__int256_t *p, __int256_t v) { *p -= v; }
+
+// CHECK-LABEL: define{{.*}} void @test_mul_assign(ptr noundef %p, ptr noundef byval(i256) align 16 %0)
+// CHECK: mul nsw i256
+void test_mul_assign(__int256_t *p, __int256_t v) { *p *= v; }
+
+// CHECK-LABEL: define{{.*}} void @test_shl_assign(ptr noundef %p, i32 noundef %n)
+// CHECK: shl i256
+void test_shl_assign(__int256_t *p, int n) { *p <<= n; }
+
+// CHECK-LABEL: define{{.*}} void @test_shr_assign(ptr noundef %p, i32 noundef %n)
+// CHECK: ashr i256
+void test_shr_assign(__int256_t *p, int n) { *p >>= n; }
+
+// CHECK-LABEL: define{{.*}} void @test_ushr_assign(ptr noundef %p, i32 noundef %n)
+// CHECK: lshr i256
+void test_ushr_assign(__uint256_t *p, int n) { *p >>= n; }
+
+// CHECK-LABEL: define{{.*}} void @test_and_assign(ptr noundef %p, ptr noundef byval(i256) align 16 %0)
+// CHECK: and i256
+void test_and_assign(__int256_t *p, __int256_t v) { *p &= v; }
+
+// CHECK-LABEL: define{{.*}} void @test_or_assign(ptr noundef %p, ptr noundef byval(i256) align 16 %0)
+// CHECK: or i256
+void test_or_assign(__int256_t *p, __int256_t v) { *p |= v; }
+
+// CHECK-LABEL: define{{.*}} void @test_xor_assign(ptr noundef %p, ptr noundef byval(i256) align 16 %0)
+// CHECK: xor i256
+void test_xor_assign(__int256_t *p, __int256_t v) { *p ^= v; }
+
+// CHECK-LABEL: define{{.*}} void @test_pre_inc(ptr{{.*}}sret(i256){{.*}}, ptr noundef byval(i256) align 16 %0)
+// CHECK: add nsw i256 %{{.*}}, 1
+__int256_t test_pre_inc(__int256_t a) { return ++a; }
+
+// CHECK-LABEL: define{{.*}} void @test_pre_dec(ptr{{.*}}sret(i256){{.*}}, ptr noundef byval(i256) align 16 %0)
+// CHECK: add nsw i256 %{{.*}}, -1
+__int256_t test_pre_dec(__int256_t a) { return --a; }
+
+// CHECK-LABEL: define{{.*}} void @test_post_inc(ptr{{.*}}sret(i256){{.*}}, ptr noundef %p)
+// CHECK: add nsw i256 %{{.*}}, 1
+__int256_t test_post_inc(__int256_t *p) { return (*p)++; }
+
+// CHECK-LABEL: define{{.*}} void @test_post_dec(ptr{{.*}}sret(i256){{.*}}, ptr noundef %p)
+// CHECK: add nsw i256 %{{.*}}, -1
+__int256_t test_post_dec(__int256_t *p) { return (*p)--; }
diff --git a/clang/test/Sema/int256-spir-unsupported.c b/clang/test/Sema/int256-spir-unsupported.c
new file mode 100644
index 0000000000000..c79ba61e46e7a
--- /dev/null
+++ b/clang/test/Sema/int256-spir-unsupported.c
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -triple spirv64-unknown-unknown %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple spirv32-unknown-unknown %s
+
+// Verify __int256 is rejected on SPIR targets (even 64-bit SPIR).
+// On SPIR, the __int256_t typedef is not predefined, so use the keyword.
+
+__int256 x; // expected-error {{__int256 is not supported on this target}}
+unsigned __int256 y; // expected-error {{__int256 is not supported on this target}}
diff --git a/clang/test/Sema/int256-unsupported-target.c b/clang/test/Sema/int256-unsupported-target.c
new file mode 100644
index 0000000000000..2a4203c302c36
--- /dev/null
+++ b/clang/test/Sema/int256-unsupported-target.c
@@ -0,0 +1,13 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -triple i686-linux-gnu %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple armv7-linux-gnueabihf %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple riscv32-unknown-elf %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple mipsel-linux-gnu %s
+
+// Verify __int256 is rejected on 32-bit targets.
+// On 32-bit, the __int256_t typedef is not predefined, so use the keyword.
+
+__int256 x; // expected-error {{__int256 is not supported on this target}}
+unsigned __int256 y; // expected-error {{__int256 is not supported on this target}}
+
+void f(__int256 a) {} // expected-error {{__int256 is not supported on this target}}
+__int256 g(void); // expected-error {{__int256 is not supported on this target}}
diff --git a/compiler-rt/lib/builtins/ashloi3.c b/compiler-rt/lib/builtins/ashloi3.c
index 9d81628403ab7..5b83542eec264 100644
--- a/compiler-rt/lib/builtins/ashloi3.c
+++ b/compiler-rt/lib/builtins/ashloi3.c
@@ -8,6 +8,11 @@
//
// This file implements __ashloi3 for the compiler_rt library.
//
+// NOTE: This builtin is not called by the compiler (shift libcalls are not
+// registered for i256 to avoid sanitizer link failures -- ASan embeds UBSan
+// but does not link against compiler-rt builtins). It exists for direct use
+// by libraries and applications that need 256-bit shift operations.
+//
//===----------------------------------------------------------------------===//
#include "int_lib.h"
diff --git a/compiler-rt/lib/builtins/ashroi3.c b/compiler-rt/lib/builtins/ashroi3.c
index 35b583d47f7cb..1323c4fe12cd8 100644
--- a/compiler-rt/lib/builtins/ashroi3.c
+++ b/compiler-rt/lib/builtins/ashroi3.c
@@ -8,6 +8,11 @@
//
// This file implements __ashroi3 for the compiler_rt library.
//
+// NOTE: This builtin is not called by the compiler (shift libcalls are not
+// registered for i256 to avoid sanitizer link failures -- ASan embeds UBSan
+// but does not link against compiler-rt builtins). It exists for direct use
+// by libraries and applications that need 256-bit shift operations.
+//
//===----------------------------------------------------------------------===//
#include "int_lib.h"
diff --git a/compiler-rt/lib/builtins/lshroi3.c b/compiler-rt/lib/builtins/lshroi3.c
index d4e4920bda0a1..b8268d7725229 100644
--- a/compiler-rt/lib/builtins/lshroi3.c
+++ b/compiler-rt/lib/builtins/lshroi3.c
@@ -8,6 +8,11 @@
//
// This file implements __lshroi3 for the compiler_rt library.
//
+// NOTE: This builtin is not called by the compiler (shift libcalls are not
+// registered for i256 to avoid sanitizer link failures -- ASan embeds UBSan
+// but does not link against compiler-rt builtins). It exists for direct use
+// by libraries and applications that need 256-bit shift operations.
+//
//===----------------------------------------------------------------------===//
#include "int_lib.h"
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 355063a91ec40..d5dbfc5fcce40 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -412,6 +412,9 @@ RTLIB::Libcall RTLIB::getFPTOSINT(EVT OpVT, EVT RetVT) {
return FPTOSINT_PPCF128_I64;
if (RetVT == MVT::i128)
return FPTOSINT_PPCF128_I128;
+ // Note: ppcf128 -> i256 conversion is not yet supported.
+ // ppc_fp128 uses a unique double-double representation that requires
+ // dedicated builtins. Falls back to expansion through smaller types.
}
return UNKNOWN_LIBCALL;
}
@@ -469,6 +472,9 @@ RTLIB::Libcall RTLIB::getFPTOUINT(EVT OpVT, EVT RetVT) {
return FPTOUINT_PPCF128_I64;
if (RetVT == MVT::i128)
return FPTOUINT_PPCF128_I128;
+ // Note: ppcf128 -> i256 conversion is not yet supported.
+ // ppc_fp128 uses a unique double-double representation that requires
+ // dedicated builtins. Falls back to expansion through smaller types.
}
return UNKNOWN_LIBCALL;
}
@@ -526,6 +532,9 @@ RTLIB::Libcall RTLIB::getSINTTOFP(EVT OpVT, EVT RetVT) {
return SINTTOFP_I256_F80;
if (RetVT == MVT::f128)
return SINTTOFP_I256_F128;
+ // Note: i256 -> ppcf128 conversion is not yet supported.
+ // ppc_fp128 uses a unique double-double representation that requires
+ // dedicated builtins. Falls back to expansion through smaller types.
}
return UNKNOWN_LIBCALL;
}
@@ -583,6 +592,9 @@ RTLIB::Libcall RTLIB::getUINTTOFP(EVT OpVT, EVT RetVT) {
return UINTTOFP_I256_F80;
if (RetVT == MVT::f128)
return UINTTOFP_I256_F128;
+ // Note: i256 -> ppcf128 conversion is not yet supported.
+ // ppc_fp128 uses a unique double-double representation that requires
+ // dedicated builtins. Falls back to expansion through smaller types.
}
return UNKNOWN_LIBCALL;
}
>From 171bc3c248ecd61049af7011e8ba8f39bfdc09f5 Mon Sep 17 00:00:00 2001
From: Xavier Roche <xavier.roche at algolia.com>
Date: Thu, 26 Feb 2026 15:08:02 +0100
Subject: [PATCH 16/17] [clang][lldb][test] Add LLDB unit tests and expand
atomic __int256 tests
Add LLDB unit tests for __int256 support:
- ScalarTest: SInt256/UInt256 getter tests using 2^200 (201-bit value)
- TestTypeSystemClang: eBasicTypeInt256/UnsignedInt256 enum-to-type
mapping and name-to-type lookup (__int256_t, __uint256_t, etc.)
Expand atomic-int256.c Sema test with:
- __c11_atomic_exchange (signed/unsigned)
- __c11_atomic_compare_exchange_strong (signed/unsigned)
- __c11_atomic_compare_exchange_weak (signed/unsigned)
- Multiple memory orderings (RELAXED, ACQUIRE, RELEASE, ACQ_REL)
Assisted-by: Claude (Anthropic)
Co-Authored-By: Claude Opus 4.6 <noreply at anthropic.com>
---
clang/test/Sema/atomic-int256.c | 47 +++++++++++++++++++
lldb/unittests/Symbol/TestTypeSystemClang.cpp | 10 ++++
lldb/unittests/Utility/ScalarTest.cpp | 9 ++++
3 files changed, 66 insertions(+)
diff --git a/clang/test/Sema/atomic-int256.c b/clang/test/Sema/atomic-int256.c
index 6257338e50ad4..c9d1051ba2ede 100644
--- a/clang/test/Sema/atomic-int256.c
+++ b/clang/test/Sema/atomic-int256.c
@@ -24,3 +24,50 @@ __uint256_t load_atomic_unsigned(void) {
void store_atomic_unsigned(__uint256_t val) {
__c11_atomic_store(&atomic_u256, val, __ATOMIC_SEQ_CST);
}
+
+// Atomic exchange
+__int256_t exchange_atomic(__int256_t val) {
+ return __c11_atomic_exchange(&atomic_s256, val, __ATOMIC_SEQ_CST);
+}
+
+__uint256_t exchange_atomic_unsigned(__uint256_t val) {
+ return __c11_atomic_exchange(&atomic_u256, val, __ATOMIC_RELAXED);
+}
+
+// Atomic compare-exchange (strong and weak)
+_Bool cas_strong(__int256_t *expected, __int256_t desired) {
+ return __c11_atomic_compare_exchange_strong(
+ &atomic_s256, expected, desired, __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE);
+}
+
+_Bool cas_weak(__int256_t *expected, __int256_t desired) {
+ return __c11_atomic_compare_exchange_weak(
+ &atomic_s256, expected, desired, __ATOMIC_RELEASE, __ATOMIC_RELAXED);
+}
+
+_Bool cas_strong_unsigned(__uint256_t *expected, __uint256_t desired) {
+ return __c11_atomic_compare_exchange_strong(
+ &atomic_u256, expected, desired, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
+}
+
+_Bool cas_weak_unsigned(__uint256_t *expected, __uint256_t desired) {
+ return __c11_atomic_compare_exchange_weak(
+ &atomic_u256, expected, desired, __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE);
+}
+
+// Different memory orderings for load/store
+__int256_t load_relaxed(void) {
+ return __c11_atomic_load(&atomic_s256, __ATOMIC_RELAXED);
+}
+
+__int256_t load_acquire(void) {
+ return __c11_atomic_load(&atomic_s256, __ATOMIC_ACQUIRE);
+}
+
+void store_relaxed(__int256_t val) {
+ __c11_atomic_store(&atomic_s256, val, __ATOMIC_RELAXED);
+}
+
+void store_release(__int256_t val) {
+ __c11_atomic_store(&atomic_s256, val, __ATOMIC_RELEASE);
+}
diff --git a/lldb/unittests/Symbol/TestTypeSystemClang.cpp b/lldb/unittests/Symbol/TestTypeSystemClang.cpp
index 76abdd12d32a6..a18e7b632d80d 100644
--- a/lldb/unittests/Symbol/TestTypeSystemClang.cpp
+++ b/lldb/unittests/Symbol/TestTypeSystemClang.cpp
@@ -89,6 +89,8 @@ TEST_F(TestTypeSystemClang, TestGetBasicTypeFromEnum) {
context.hasSameType(GetBasicQualType(eBasicTypeInt), context.IntTy));
EXPECT_TRUE(context.hasSameType(GetBasicQualType(eBasicTypeInt128),
context.Int128Ty));
+ EXPECT_TRUE(context.hasSameType(GetBasicQualType(eBasicTypeInt256),
+ context.Int256Ty));
EXPECT_TRUE(
context.hasSameType(GetBasicQualType(eBasicTypeLong), context.LongTy));
EXPECT_TRUE(context.hasSameType(GetBasicQualType(eBasicTypeLongDouble),
@@ -116,6 +118,8 @@ TEST_F(TestTypeSystemClang, TestGetBasicTypeFromEnum) {
context.UnsignedIntTy));
EXPECT_TRUE(context.hasSameType(GetBasicQualType(eBasicTypeUnsignedInt128),
context.UnsignedInt128Ty));
+ EXPECT_TRUE(context.hasSameType(GetBasicQualType(eBasicTypeUnsignedInt256),
+ context.UnsignedInt256Ty));
EXPECT_TRUE(context.hasSameType(GetBasicQualType(eBasicTypeUnsignedLong),
context.UnsignedLongTy));
EXPECT_TRUE(context.hasSameType(GetBasicQualType(eBasicTypeUnsignedLongLong),
@@ -171,6 +175,12 @@ TEST_F(TestTypeSystemClang, TestGetBasicTypeFromName) {
EXPECT_EQ(GetBasicQualType(eBasicTypeInt128), GetBasicQualType("__int128"));
EXPECT_EQ(GetBasicQualType(eBasicTypeUnsignedInt128),
GetBasicQualType("unsigned __int128"));
+ EXPECT_EQ(GetBasicQualType(eBasicTypeInt256), GetBasicQualType("__int256_t"));
+ EXPECT_EQ(GetBasicQualType(eBasicTypeUnsignedInt256),
+ GetBasicQualType("__uint256_t"));
+ EXPECT_EQ(GetBasicQualType(eBasicTypeInt256), GetBasicQualType("__int256"));
+ EXPECT_EQ(GetBasicQualType(eBasicTypeUnsignedInt256),
+ GetBasicQualType("unsigned __int256"));
EXPECT_EQ(GetBasicQualType(eBasicTypeVoid), GetBasicQualType("void"));
EXPECT_EQ(GetBasicQualType(eBasicTypeBool), GetBasicQualType("bool"));
EXPECT_EQ(GetBasicQualType(eBasicTypeFloat), GetBasicQualType("float"));
diff --git a/lldb/unittests/Utility/ScalarTest.cpp b/lldb/unittests/Utility/ScalarTest.cpp
index 869a5809e6d14..6b3a1604b61e4 100644
--- a/lldb/unittests/Utility/ScalarTest.cpp
+++ b/lldb/unittests/Utility/ScalarTest.cpp
@@ -112,6 +112,15 @@ TEST(ScalarTest, Getters) {
Scalar(-std::pow(2.0, 70.0)).SInt128(APInt()));
EXPECT_EQ(APInt(128, 1) << 70, Scalar(std::pow(2.0, 70.0)).UInt128(APInt()));
EXPECT_EQ(APInt(128, 0), Scalar(-std::pow(2.0, 70.0)).UInt128(APInt()));
+
+ // Int256: use double (not float) since 2^200 exceeds float range (~3.4e38)
+ EXPECT_EQ(APInt(256, 1) << 200,
+ Scalar(std::pow(2.0, 200.0)).SInt256(APInt()));
+ EXPECT_EQ(APInt(256, -1, true) << 200,
+ Scalar(-std::pow(2.0, 200.0)).SInt256(APInt()));
+ EXPECT_EQ(APInt(256, 1) << 200,
+ Scalar(std::pow(2.0, 200.0)).UInt256(APInt()));
+ EXPECT_EQ(APInt(256, 0), Scalar(-std::pow(2.0, 200.0)).UInt256(APInt()));
}
TEST(ScalarTest, RightShiftOperator) {
>From b0dc0130b2bb35aa0da647f018ae409b6b88e174 Mon Sep 17 00:00:00 2001
From: Xavier Roche <xavier.roche at algolia.com>
Date: Sun, 1 Mar 2026 18:04:59 +0100
Subject: [PATCH 17/17] [RISCV] Reject tail calls when arguments are passed
indirectly
When a function argument is passed indirectly (CCValAssign::Indirect),
the caller allocates stack space for the value and passes a pointer to
the callee. If the call is tail-called, the caller's frame is popped
before the callee executes, leaving the pointer dangling -- a
use-after-free on the stack.
X86 already guards against this case in its
isEligibleForTailCallOptimization (X86ISelLoweringCall.cpp:3070-3071).
The RISC-V implementation was missing this check.
The existing test tail-calls.ll had a comment "Do not tail call optimize
if parameters need to be passed indirectly" but the CHECK lines showed
the call being tail-called anyway (the test was auto-generated from the
buggy codegen). After the fix, the test correctly shows a normal call
with the stack frame remaining live across it.
This affects any type requiring indirect passing on RISC-V, such as
fp128 on RV32, and is not specific to any particular frontend type.
Assisted-by: Claude (Anthropic)
Co-Authored-By: Claude Opus 4.6 <noreply at anthropic.com>
---
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 8 ++++++++
llvm/test/CodeGen/RISCV/tail-calls.ll | 18 ++++++++++++------
2 files changed, 20 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index a8542be937a87..aa1db300f8df3 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -24515,6 +24515,14 @@ bool RISCVTargetLowering::isEligibleForTailCallOptimization(
if (Caller.hasFnAttribute("interrupt"))
return false;
+ // Do not tail call optimize if any parameters need to be passed indirectly.
+ // The caller allocates stack space for the indirect argument and passes a
+ // pointer to the callee. A tail call pops the caller's frame before the
+ // callee executes, invalidating the pointer.
+ for (const auto &ArgLoc : ArgLocs)
+ if (ArgLoc.getLocInfo() == CCValAssign::Indirect)
+ return false;
+
// If the stack arguments for this call do not fit into our own save area then
// the call cannot be made tail.
if (CCInfo.getStackSize() > RVFI->getArgumentStackSize())
diff --git a/llvm/test/CodeGen/RISCV/tail-calls.ll b/llvm/test/CodeGen/RISCV/tail-calls.ll
index 33feba3c6fba1..79855aa03adcf 100644
--- a/llvm/test/CodeGen/RISCV/tail-calls.ll
+++ b/llvm/test/CodeGen/RISCV/tail-calls.ll
@@ -247,20 +247,24 @@ declare i32 @callee_indirect_args(fp128 %a)
define void @caller_indirect_args() nounwind {
; CHECK-LABEL: caller_indirect_args:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: addi sp, sp, -32
+; CHECK-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
; CHECK-NEXT: lui a1, 262128
; CHECK-NEXT: mv a0, sp
; CHECK-NEXT: sw zero, 0(sp)
; CHECK-NEXT: sw zero, 4(sp)
; CHECK-NEXT: sw zero, 8(sp)
; CHECK-NEXT: sw a1, 12(sp)
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: tail callee_indirect_args
+; CHECK-NEXT: call callee_indirect_args
+; CHECK-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; CHECK-NEXT: addi sp, sp, 32
+; CHECK-NEXT: ret
;
; CHECK-LARGE-ZICFILP-LABEL: caller_indirect_args:
; CHECK-LARGE-ZICFILP: # %bb.0: # %entry
; CHECK-LARGE-ZICFILP-NEXT: lpad 0
-; CHECK-LARGE-ZICFILP-NEXT: addi sp, sp, -16
+; CHECK-LARGE-ZICFILP-NEXT: addi sp, sp, -32
+; CHECK-LARGE-ZICFILP-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
; CHECK-LARGE-ZICFILP-NEXT: lui a1, 262128
; CHECK-LARGE-ZICFILP-NEXT: .Lpcrel_hi9:
; CHECK-LARGE-ZICFILP-NEXT: auipc a0, %pcrel_hi(.LCPI7_0)
@@ -270,8 +274,10 @@ define void @caller_indirect_args() nounwind {
; CHECK-LARGE-ZICFILP-NEXT: sw zero, 4(sp)
; CHECK-LARGE-ZICFILP-NEXT: sw zero, 8(sp)
; CHECK-LARGE-ZICFILP-NEXT: sw a1, 12(sp)
-; CHECK-LARGE-ZICFILP-NEXT: addi sp, sp, 16
-; CHECK-LARGE-ZICFILP-NEXT: jr t2
+; CHECK-LARGE-ZICFILP-NEXT: jalr t2
+; CHECK-LARGE-ZICFILP-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; CHECK-LARGE-ZICFILP-NEXT: addi sp, sp, 32
+; CHECK-LARGE-ZICFILP-NEXT: ret
entry:
%call = tail call i32 @callee_indirect_args(fp128 0xL00000000000000003FFF000000000000)
ret void
More information about the cfe-commits
mailing list